001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 */
022package org.biojava.nbio.core.sequence.io;
023
024import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
025import org.biojava.nbio.core.sequence.ProteinSequence;
026import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
027import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
028import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface;
029import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface;
030import org.biojava.nbio.core.sequence.template.Compound;
031import org.biojava.nbio.core.sequence.template.Sequence;
032import org.slf4j.Logger;
033import org.slf4j.LoggerFactory;
034
035import java.io.*;
036import java.util.HashMap;
037import java.util.LinkedHashMap;
038import java.util.Map;
039
040/**
041 * Use FastaReaderHelper as an example of how to use this class where FastaReaderHelper should be the
042 * primary class used to read Fasta files
043 * @author Scooter Willis ;lt;willishf at gmail dot com>
044 */
045public class FastaReader<S extends Sequence<?>, C extends Compound> {
046
047        private final static Logger logger = LoggerFactory.getLogger(FastaReader.class);
048
049        SequenceCreatorInterface<C> sequenceCreator;
050        SequenceHeaderParserInterface<S,C> headerParser;
051        BufferedReaderBytesRead br;
052        InputStreamReader isr;
053        FileInputStream fi = null;
054        long fileIndex = 0;
055        long sequenceIndex = 0;
056        String line = "";
057        String header= "";
058
059        /**
060         * If you are going to use FileProxyProteinSequenceCreator then do not use this constructor because we need details about
061         * local file offsets for quick reads. InputStreams does not give you the name of the stream to access quickly via file seek. A seek in
062         * an inputstream is forced to read all the data so you don't gain anything.
063         * @param is inputStream
064         * @param headerParser
065         * @param sequenceCreator
066         */
067        public FastaReader(InputStream is, SequenceHeaderParserInterface<S,C> headerParser,
068                                           SequenceCreatorInterface<C> sequenceCreator) {
069                this.headerParser = headerParser;
070                isr = new InputStreamReader(is);
071                this.br = new BufferedReaderBytesRead(isr);
072                this.sequenceCreator = sequenceCreator;
073        }
074
075        /**
076         * If you are going to use the FileProxyProteinSequenceCreator then you
077         * need to use this constructor because we need details about
078         * the location of the file.
079         * @param file
080         * @param headerParser
081         * @param sequenceCreator
082         * @throws FileNotFoundException if the file does not exist, is a directory
083         *      rather than a regular file, or for some other reason cannot be opened
084         *      for reading.
085         * @throws SecurityException if a security manager exists and its checkRead
086         *      method denies read access to the file.
087         */
088        public FastaReader(File file, SequenceHeaderParserInterface<S,C> headerParser,
089                                           SequenceCreatorInterface<C> sequenceCreator) throws FileNotFoundException {
090                this.headerParser = headerParser;
091                fi = new FileInputStream(file);
092                isr = new InputStreamReader(fi);
093                this.br = new BufferedReaderBytesRead(isr);
094                this.sequenceCreator = sequenceCreator;
095        }
096
097        /**
098         * The parsing is done in this method.<br>
099         * This method tries to process all the available fasta records
100         * in the File or InputStream, closes the underlying resource,
101         * and return the results in {@link LinkedHashMap}.<br>
102         * You don't need to call {@link #close()} after calling this method.
103         * @see #process(int)
104         * @return {@link HashMap} containing all the parsed fasta records
105         * present, starting current fileIndex onwards.
106         * @throws IOException if an error occurs reading the input file
107         */
108        public Map<String, S> process() throws IOException {
109                Map<String, S> sequences = process(-1);
110                close();
111
112                return sequences;
113        }
114
115        /**
116         * This method tries to parse maximum <code>max</code> records from
117         * the open File or InputStream, and leaves the underlying resource open.<br>
118         * Subsequent calls to the same method continue parsing the rest of the file.<br>
119         * This is particularly useful when dealing with very big data files,
120         * (e.g. NCBI nr database), which can't fit into memory and will take long
121         * time before the first result is available.<br>
122         * <b>N.B.</b>
123         * <ul>
124         * <li>This method can't be called after calling its NO-ARGUMENT twin.</li>
125         * <li>remember to close the underlying resource when you are done.</li>
126         * </ul>
127         * @see #process()
128         * @author Amr ALHOSSARY
129         * @since 3.0.6
130         * @param max maximum number of records to return, <code>-1</code> for infinity.
131         * @return {@link HashMap} containing maximum <code>max</code> parsed fasta records
132         * present, starting current fileIndex onwards.
133         * @throws IOException if an error occurs reading the input file
134         */
135        public Map<String, S> process(int max) throws IOException {
136
137
138                String line = "";
139                if(this.line != null && this.line.length() > 0){
140                        line=this.line;
141                }
142                String header = "";
143                if(this.header != null && this.header.length() > 0){
144                        header=this.header;
145                }
146
147                StringBuilder sb = new StringBuilder();
148                int processedSequences=0;
149                boolean keepGoing = true;
150
151
152                Map<String, S> sequences = new LinkedHashMap<>();
153
154                do {
155                        line = line.trim(); // nice to have but probably not needed
156                        if (line.length() != 0) {
157                                if (line.startsWith(">")) {//start of new fasta record
158
159                                        if (sb.length() > 0) {
160                                                //i.e. if there is already a sequence before
161                                                //logger.info("Sequence index=" + sequenceIndex);
162
163                                                try {
164                                                        @SuppressWarnings("unchecked")
165                                                        S sequence = (S)sequenceCreator.getSequence(sb.toString(), sequenceIndex);
166                                                        headerParser.parseHeader(header, sequence);
167                                                        sequences.put(sequence.getAccession().getID(),sequence);
168                                                        processedSequences++;
169
170                                                } catch (CompoundNotFoundException e) {
171                                                        logger.warn("Sequence with header '{}' has unrecognised compounds ({}), it will be ignored",
172                                                                        header, e.getMessage());
173                                                }
174
175                                                sb.setLength(0); //this is faster than allocating new buffers, better memory utilization (same buffer)
176                                        }
177                                        header = line.substring(1);
178                                } else if (line.startsWith(";")) {
179                                } else {
180                                        //mark the start of the sequence with the fileIndex before the line was read
181                                        if(sb.length() == 0){
182                                                sequenceIndex = fileIndex;
183                                        }
184                                        sb.append(line);
185                                }
186                        }
187                        fileIndex = br.getBytesRead();
188
189                        line = br.readLine();
190
191                        if (line == null) {
192                                //i.e. EOF
193                                if ( sb.length() == 0 && header.length() != 0 ) {
194                                        logger.warn("Can't parse sequence {}. Got sequence of length 0!", sequenceIndex);
195                                        logger.warn("header: {}", header);
196                                        header = null;
197                                } else if ( sb.length() > 0 ) {
198                                        //logger.info("Sequence index=" + sequenceIndex + " " + fileIndex );
199                                        try {
200                                                @SuppressWarnings("unchecked")
201                                                S sequence = (S)sequenceCreator.getSequence(sb.toString(), sequenceIndex);
202                                                headerParser.parseHeader(header, sequence);
203                                                sequences.put(sequence.getAccession().getID(),sequence);
204                                                processedSequences++;
205                                                header = null;
206                                        } catch (CompoundNotFoundException e) {
207                                                logger.warn("Sequence with header '{}' has unrecognised compounds ({}), it will be ignored",
208                                                                header, e.getMessage());
209                                        }
210                                }
211                                keepGoing = false;
212                        }
213                        if (max > -1 && processedSequences>=max) {
214                                keepGoing=false;
215                        }
216                } while (keepGoing);
217
218                this.line  = line;
219                this.header= header;
220
221                return max > -1 && sequences.isEmpty() ? null :  sequences;
222        }
223
224        public void close() throws IOException {
225                br.close();
226                isr.close();
227                //If stream was created from File object then we need to close it
228                if (fi != null) {
229                        fi.close();
230                }
231                this.line=this.header = null;
232        }
233        
234}