001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 */
022package org.biojava.nbio.core.sequence.io;
023
024import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
025import org.biojava.nbio.core.sequence.ProteinSequence;
026import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
027import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
028import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface;
029import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface;
030import org.biojava.nbio.core.sequence.template.Compound;
031import org.biojava.nbio.core.sequence.template.Sequence;
032import org.slf4j.Logger;
033import org.slf4j.LoggerFactory;
034
035import java.io.*;
036import java.util.HashMap;
037import java.util.LinkedHashMap;
038
039/**
040 * Use FastaReaderHelper as an example of how to use this class where FastaReaderHelper should be the
041 * primary class used to read Fasta files
042 * @author Scooter Willis ;lt;willishf at gmail dot com>
043 */
044public class FastaReader<S extends Sequence<?>, C extends Compound> {
045
046        private final static Logger logger = LoggerFactory.getLogger(FastaReader.class);
047
048        SequenceCreatorInterface<C> sequenceCreator;
049        SequenceHeaderParserInterface<S,C> headerParser;
050        BufferedReaderBytesRead br;
051        InputStreamReader isr;
052        FileInputStream fi = null;
053        long fileIndex = 0;
054        long sequenceIndex = 0;
055        String line = "";
056        String header= "";
057
058        /**
059         * If you are going to use FileProxyProteinSequenceCreator then do not use this constructor because we need details about
060         * local file offsets for quick reads. InputStreams does not give you the name of the stream to access quickly via file seek. A seek in
061         * an inputstream is forced to read all the data so you don't gain anything.
062         * @param is inputStream
063         * @param headerParser
064         * @param sequenceCreator
065         */
066        public FastaReader(InputStream is, SequenceHeaderParserInterface<S,C> headerParser,
067                                           SequenceCreatorInterface<C> sequenceCreator) {
068                this.headerParser = headerParser;
069                isr = new InputStreamReader(is);
070                this.br = new BufferedReaderBytesRead(isr);
071                this.sequenceCreator = sequenceCreator;
072        }
073
074        /**
075         * If you are going to use the FileProxyProteinSequenceCreator then you
076         * need to use this constructor because we need details about
077         * the location of the file.
078         * @param file
079         * @param headerParser
080         * @param sequenceCreator
081         * @throws FileNotFoundException if the file does not exist, is a directory
082         *      rather than a regular file, or for some other reason cannot be opened
083         *      for reading.
084         * @throws SecurityException if a security manager exists and its checkRead
085         *      method denies read access to the file.
086         */
087        public FastaReader(File file, SequenceHeaderParserInterface<S,C> headerParser,
088                                           SequenceCreatorInterface<C> sequenceCreator) throws FileNotFoundException {
089                this.headerParser = headerParser;
090                fi = new FileInputStream(file);
091                isr = new InputStreamReader(fi);
092                this.br = new BufferedReaderBytesRead(isr);
093                this.sequenceCreator = sequenceCreator;
094        }
095
096        /**
097         * The parsing is done in this method.<br>
098         * This method tries to process all the available fasta records
099         * in the File or InputStream, closes the underlying resource,
100         * and return the results in {@link LinkedHashMap}.<br>
101         * You don't need to call {@link #close()} after calling this method.
102         * @see #process(int)
103         * @return {@link HashMap} containing all the parsed fasta records
104         * present, starting current fileIndex onwards.
105         * @throws IOException if an error occurs reading the input file
106         */
107        public LinkedHashMap<String,S> process() throws IOException {
108                LinkedHashMap<String,S> sequences = process(-1);
109                close();
110
111                return sequences;
112        }
113
114        /**
115         * This method tries to parse maximum <code>max</code> records from
116         * the open File or InputStream, and leaves the underlying resource open.<br>
117         * Subsequent calls to the same method continue parsing the rest of the file.<br>
118         * This is particularly useful when dealing with very big data files,
119         * (e.g. NCBI nr database), which can't fit into memory and will take long
120         * time before the first result is available.<br>
121         * <b>N.B.</b>
122         * <ul>
123         * <li>This method can't be called after calling its NO-ARGUMENT twin.</li>
124         * <li>remember to close the underlying resource when you are done.</li>
125         * </ul>
126         * @see #process()
127         * @author Amr ALHOSSARY
128         * @since 3.0.6
129         * @param max maximum number of records to return, <code>-1</code> for infinity.
130         * @return {@link HashMap} containing maximum <code>max</code> parsed fasta records
131         * present, starting current fileIndex onwards.
132         * @throws IOException if an error occurs reading the input file
133         */
134        public LinkedHashMap<String,S> process(int max) throws IOException {
135
136
137                String line = "";
138                if(this.line != null && this.line.length() > 0){
139                        line=this.line;
140                }
141                String header = "";
142                if(this.header != null && this.header.length() > 0){
143                        header=this.header;
144                }
145
146                StringBuilder sb = new StringBuilder();
147                int processedSequences=0;
148                boolean keepGoing = true;
149
150
151                LinkedHashMap<String,S> sequences = new LinkedHashMap<String,S>();
152
153                do {
154                        line = line.trim(); // nice to have but probably not needed
155                        if (line.length() != 0) {
156                                if (line.startsWith(">")) {//start of new fasta record
157
158                                        if (sb.length() > 0) {
159                                                //i.e. if there is already a sequence before
160                                                //logger.info("Sequence index=" + sequenceIndex);
161
162                                                try {
163                                                        @SuppressWarnings("unchecked")
164                                                        S sequence = (S)sequenceCreator.getSequence(sb.toString(), sequenceIndex);
165                                                        headerParser.parseHeader(header, sequence);
166                                                        sequences.put(sequence.getAccession().getID(),sequence);
167                                                        processedSequences++;
168
169                                                } catch (CompoundNotFoundException e) {
170                                                        logger.warn("Sequence with header '{}' has unrecognised compounds ({}), it will be ignored",
171                                                                        header, e.getMessage());
172                                                }
173
174                                                sb.setLength(0); //this is faster than allocating new buffers, better memory utilization (same buffer)
175                                        }
176                                        header = line.substring(1);
177                                } else if (line.startsWith(";")) {
178                                } else {
179                                        //mark the start of the sequence with the fileIndex before the line was read
180                                        if(sb.length() == 0){
181                                                sequenceIndex = fileIndex;
182                                        }
183                                        sb.append(line);
184                                }
185                        }
186                        fileIndex = br.getBytesRead();
187
188                        line = br.readLine();
189
190                        if (line == null) {
191                                //i.e. EOF
192                                if ( sb.length() == 0 && header.length() != 0 ) {
193                                        logger.warn("Can't parse sequence {}. Got sequence of length 0!", sequenceIndex);
194                                        logger.warn("header: {}", header);
195                                        header = null;
196                                } else if ( sb.length() > 0 ) {
197                                        //logger.info("Sequence index=" + sequenceIndex + " " + fileIndex );
198                                        try {
199                                                @SuppressWarnings("unchecked")
200                                                S sequence = (S)sequenceCreator.getSequence(sb.toString(), sequenceIndex);
201                                                headerParser.parseHeader(header, sequence);
202                                                sequences.put(sequence.getAccession().getID(),sequence);
203                                                processedSequences++;
204                                                header = null;
205                                        } catch (CompoundNotFoundException e) {
206                                                logger.warn("Sequence with header '{}' has unrecognised compounds ({}), it will be ignored",
207                                                                header, e.getMessage());
208                                        }
209                                }
210                                keepGoing = false;
211                        }
212                        if (max > -1 && processedSequences>=max) {
213                                keepGoing=false;
214                        }
215                } while (keepGoing);
216
217                this.line  = line;
218                this.header= header;
219
220                return max > -1 && sequences.isEmpty() ? null :  sequences;
221        }
222
223        public void close() throws IOException {
224                br.close();
225                isr.close();
226                //If stream was created from File object then we need to close it
227                if (fi != null) {
228                        fi.close();
229                }
230                this.line=this.header = null;
231        }
232        
233}