001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 */
022package org.biojava.nbio.core.sequence.io;
023
024import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
025import org.biojava.nbio.core.sequence.ProteinSequence;
026import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
027import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
028import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface;
029import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface;
030import org.biojava.nbio.core.sequence.template.Compound;
031import org.biojava.nbio.core.sequence.template.Sequence;
032import org.slf4j.Logger;
033import org.slf4j.LoggerFactory;
034
035import java.io.*;
036import java.util.HashMap;
037import java.util.LinkedHashMap;
038
039/**
040 * Use FastaReaderHelper as an example of how to use this class where FastaReaderHelper should be the
041 * primary class used to read Fasta files
042 * @author Scooter Willis ;lt;willishf at gmail dot com>
043 */
044public class FastaReader<S extends Sequence<?>, C extends Compound> {
045
046        private final static Logger logger = LoggerFactory.getLogger(FastaReader.class);
047
048        SequenceCreatorInterface<C> sequenceCreator;
049        SequenceHeaderParserInterface<S,C> headerParser;
050        BufferedReaderBytesRead br;
051        InputStreamReader isr;
052        FileInputStream fi = null;
053        long fileIndex = 0;
054        long sequenceIndex = 0;
055        String line = "";
056        String header= "";
057
058        /**
059         * If you are going to use FileProxyProteinSequenceCreator then do not use this constructor because we need details about
060         * local file offsets for quick reads. InputStreams does not give you the name of the stream to access quickly via file seek. A seek in
061         * an inputstream is forced to read all the data so you don't gain anything.
062         * @param is inputStream
063         * @param headerParser
064         * @param sequenceCreator
065         */
066        public FastaReader(InputStream is, SequenceHeaderParserInterface<S,C> headerParser,
067                                           SequenceCreatorInterface<C> sequenceCreator) {
068                this.headerParser = headerParser;
069                isr = new InputStreamReader(is);
070                this.br = new BufferedReaderBytesRead(isr);
071                this.sequenceCreator = sequenceCreator;
072        }
073
074        /**
075         * If you are going to use the FileProxyProteinSequenceCreator then you
076         * need to use this constructor because we need details about
077         * the location of the file.
078         * @param file
079         * @param headerParser
080         * @param sequenceCreator
081         * @throws FileNotFoundException if the file does not exist, is a directory
082         *      rather than a regular file, or for some other reason cannot be opened
083         *      for reading.
084         * @throws SecurityException if a security manager exists and its checkRead
085         *      method denies read access to the file.
086         */
087        public FastaReader(File file, SequenceHeaderParserInterface<S,C> headerParser,
088                                           SequenceCreatorInterface<C> sequenceCreator) throws FileNotFoundException {
089                this.headerParser = headerParser;
090                fi = new FileInputStream(file);
091                isr = new InputStreamReader(fi);
092                this.br = new BufferedReaderBytesRead(isr);
093                this.sequenceCreator = sequenceCreator;
094        }
095
096        /**
097         * The parsing is done in this method.<br>
098         * This method tries to process all the available fasta records
099         * in the File or InputStream, closes the underlying resource,
100         * and return the results in {@link LinkedHashMap}.<br>
101         * You don't need to call {@link #close()} after calling this method.
102         * @see #process(int)
103         * @return {@link HashMap} containing all the parsed fasta records
104         * present, starting current fileIndex onwards.
105         * @throws IOException if an error occurs reading the input file
106         */
107        public LinkedHashMap<String,S> process() throws IOException {
108                LinkedHashMap<String,S> sequences = process(-1);
109                close();
110
111                return sequences;
112        }
113
114        /**
115         * This method tries to parse maximum <code>max</code> records from
116         * the open File or InputStream, and leaves the underlying resource open.<br>
117         * Subsequent calls to the same method continue parsing the rest of the file.<br>
118         * This is particularly useful when dealing with very big data files,
119         * (e.g. NCBI nr database), which can't fit into memory and will take long
120         * time before the first result is available.<br>
121         * <b>N.B.</b>
122         * <ul>
123         * <li>This method can't be called after calling its NO-ARGUMENT twin.</li>
124         * <li>remember to close the underlying resource when you are done.</li>
125         * </ul>
126         * @see #process()
127         * @author Amr AL-Hossary
128         * @since 3.0.6
129         * @param max maximum number of records to return, <code>-1</code> for infinity.
130         * @return {@link HashMap} containing maximum <code>max</code> parsed fasta records
131         * present, starting current fileIndex onwards.
132         * @throws IOException if an error occurs reading the input file
133         */
134        public LinkedHashMap<String,S> process(int max) throws IOException {
135
136
137                String line = "";
138                if(this.line != null && this.line.length() > 0){
139                        line=this.line;
140                }
141                String header = "";
142                if(this.header != null && this.header.length() > 0){
143                        header=this.header;
144                }
145
146                StringBuilder sb = new StringBuilder();
147                int processedSequences=0;
148                boolean keepGoing = true;
149
150
151                LinkedHashMap<String,S> sequences = new LinkedHashMap<String,S>();
152
153                do {
154                        line = line.trim(); // nice to have but probably not needed
155                        if (line.length() != 0) {
156                                if (line.startsWith(">")) {//start of new fasta record
157
158                                        if (sb.length() > 0) {
159                                                //i.e. if there is already a sequence before
160                                                //logger.info("Sequence index=" + sequenceIndex);
161
162                                                try {
163                                                        @SuppressWarnings("unchecked")
164                                                        S sequence = (S)sequenceCreator.getSequence(sb.toString(), sequenceIndex);
165                                                        headerParser.parseHeader(header, sequence);
166                                                        sequences.put(sequence.getAccession().getID(),sequence);
167                                                        processedSequences++;
168
169                                                } catch (CompoundNotFoundException e) {
170                                                        logger.warn("Sequence with header '{}' has unrecognised compounds ({}), it will be ignored",
171                                                                        header, e.getMessage());
172                                                }
173
174                                                sb.setLength(0); //this is faster than allocating new buffers, better memory utilization (same buffer)
175                                        }
176                                        header = line.substring(1);
177                                } else if (line.startsWith(";")) {
178                                } else {
179                                        //mark the start of the sequence with the fileIndex before the line was read
180                                        if(sb.length() == 0){
181                                                sequenceIndex = fileIndex;
182                                        }
183                                        sb.append(line);
184                                }
185                        }
186                        fileIndex = br.getBytesRead();
187
188                        line = br.readLine();
189
190                        if (line == null) {
191
192
193                                // Fix for #282
194                                if ( sequences.size() == 0 && max != -1) {
195                                        return null;
196                                }
197
198                                //i.e. EOF
199                                String seq = sb.toString();
200                                if ( seq.length() == 0) {
201                                        logger.warn("Can't parse sequence {}. Got sequence of length 0!", sequenceIndex);
202                                        logger.warn("header: {}", header);
203                                }
204                                //logger.info("Sequence index=" + sequenceIndex + " " + fileIndex );
205                                try {
206                                        @SuppressWarnings("unchecked")
207                                        S sequence = (S)sequenceCreator.getSequence(seq, sequenceIndex);
208                                        headerParser.parseHeader(header, sequence);
209                                        sequences.put(sequence.getAccession().getID(),sequence);
210                                        processedSequences++;
211                                } catch (CompoundNotFoundException e) {
212                                        logger.warn("Sequence with header '{}' has unrecognised compounds ({}), it will be ignored",
213                                                        header, e.getMessage());
214                                }
215                                keepGoing = false;
216                        }
217                        if (max > -1 && processedSequences>=max) {
218                                keepGoing=false;
219                        }
220                        if ( this.line == null)
221                                keepGoing = false;
222                } while (keepGoing);
223
224                this.line  = line;
225                this.header= header;
226
227                return sequences;
228        }
229
230        public void close() throws IOException {
231                br.close();
232                isr.close();
233                //If stream was created from File object then we need to close it
234                if (fi != null) {
235                        fi.close();
236                }
237                this.line=this.header = null;
238        }
239
240        public static void main(String[] args) {
241                try {
242                        String inputFile = "/PF00104_small.fasta";
243                        InputStream is = FastaReader.class.getResourceAsStream(inputFile);
244
245
246                        if ( is == null)
247                                System.err.println("Could not get input file " + inputFile);
248                        FastaReader<ProteinSequence, AminoAcidCompound> fastaReader = new FastaReader<ProteinSequence, AminoAcidCompound>(is, new GenericFastaHeaderParser<ProteinSequence,AminoAcidCompound>(), new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));
249                        LinkedHashMap<String,ProteinSequence> proteinSequences = fastaReader.process();
250                        is.close();
251
252
253                        //logger.info("Protein Sequences: {}", proteinSequences);
254
255                        File file = new File(inputFile);
256                        FastaReader<ProteinSequence,AminoAcidCompound> fastaProxyReader =
257                                        new FastaReader<ProteinSequence,AminoAcidCompound>(
258                                                        file,
259                                                        new GenericFastaHeaderParser<ProteinSequence,AminoAcidCompound>(),
260                                                        new FileProxyProteinSequenceCreator(
261                                                                        file,
262                                                                        AminoAcidCompoundSet.getAminoAcidCompoundSet(),
263                                                                        new FastaSequenceParser()
264                                                        )
265                                        );
266                        LinkedHashMap<String,ProteinSequence> proteinProxySequences = fastaProxyReader.process();
267
268                        for(String key : proteinProxySequences.keySet()){
269                                ProteinSequence proteinSequence = proteinProxySequences.get(key);
270                                logger.info("Protein Proxy Sequence Key: {}", key);
271//                if(key.equals("Q98SJ1_CHICK/15-61")){
272//                    int dummy = 1;
273//                }
274                                logger.info("Protein Sequence: {}", proteinSequence.toString());
275
276                        }
277
278                } catch (Exception e) {
279                        logger.warn("Exception: ", e);
280                }
281        }
282}