001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * @author Scooter Willis <willishf at gmail dot com>
015 * @author Karl Nicholas <github:karlnicholas>
016 * @author Paolo Pavan
017 *
018 * For more information on the BioJava project and its aims,
019 * or to join the biojava-l mailing list, visit the home page
020 * at:
021 *
022 *      http://www.biojava.org/
023 *
024 * Created on 01-21-2010
025 */
026package org.biojava.nbio.core.sequence.io;
027
028import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
029import org.biojava.nbio.core.sequence.AccessionID;
030import org.biojava.nbio.core.sequence.DataSource;
031import org.biojava.nbio.core.sequence.TaxonomyID;
032import org.biojava.nbio.core.sequence.features.DBReferenceInfo;
033import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface;
034import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface;
035import org.biojava.nbio.core.sequence.template.AbstractSequence;
036import org.biojava.nbio.core.sequence.template.Compound;
037import org.slf4j.Logger;
038import org.slf4j.LoggerFactory;
039
040import java.io.BufferedReader;
041import java.io.File;
042import java.io.FileNotFoundException;
043import java.io.FileReader;
044import java.io.IOException;
045import java.io.InputStream;
046import java.io.InputStreamReader;
047import java.util.ArrayList;
048import java.util.HashMap;
049import java.util.LinkedHashMap;
050import java.util.List;
051
052/**
053 * Use {@link GenbankReaderHelper} as an example of how to use this class where {@link GenbankReaderHelper} should be the
054 * primary class used to read Genbank files
055 *
056 */
057public class GenbankReader<S extends AbstractSequence<C>, C extends Compound> {
058
059        private SequenceCreatorInterface<C> sequenceCreator;
060        private GenbankSequenceParser<S,C> genbankParser;
061        private BufferedReader bufferedReader;
062        private boolean closed;
063        private final Logger logger = LoggerFactory.getLogger(this.getClass());
064
065        public boolean isClosed() {
066                return closed;
067        }
068
069        /**
070         * If you are going to use {@link FileProxyProteinSequenceCreator} then do not use this constructor because we need details about
071         * local file offsets for quick reads. {@link InputStream} does not give you the name of the stream to access quickly via file seek. A seek in
072         * an {@link InputStream} is forced to read all the data so you don't gain anything.
073         * @param is
074         * @param headerParser
075         * @param sequenceCreator
076         */
077        public GenbankReader(final InputStream is, final SequenceHeaderParserInterface<S,C> headerParser,
078                                                 final SequenceCreatorInterface<C> sequenceCreator) {
079                this.sequenceCreator = sequenceCreator;
080                bufferedReader = new BufferedReader(new InputStreamReader(is));
081                genbankParser = new GenbankSequenceParser<>();
082                closed = false;
083        }
084
085        /**
086         * If you are going to use the FileProxyProteinSequenceCreator then you
087         * need to use this constructor because we need details about
088         * the location of the file.
089         * @param file
090         * @param headerParser
091         * @param sequenceCreator
092         * @throws FileNotFoundException if the file does not exist, is a directory
093         *      rather than a regular file, or for some other reason cannot be opened
094         *      for reading.
095         * @throws SecurityException if a security manager exists and its checkRead
096         *      method denies read access to the file.
097         */
098        public GenbankReader(
099                        final File file,
100                        final SequenceHeaderParserInterface<S,C> headerParser,
101                        final SequenceCreatorInterface<C> sequenceCreator
102                        ) throws FileNotFoundException {
103
104                this.bufferedReader = new BufferedReader(new FileReader(file));
105                this.sequenceCreator = sequenceCreator;
106                genbankParser = new GenbankSequenceParser<>();
107        }
108
109        /**
110         * The parsing is done in this method.<br>
111         * This method will return all the available Genbank records
112         * in the File or InputStream, closes the underlying resource,
113         * and return the results in {@link LinkedHashMap}.<br>
114         * You don't need to call {@link GenbankReader#close()} after calling this method.
115         * @see #process(int)
116         * @return {@link HashMap} containing all the parsed Genbank records
117         * present, starting current fileIndex onwards.
118         * @throws IOException
119         * @throws CompoundNotFoundException
120         * @throws OutOfMemoryError if the input resource is larger than the allocated heap.
121         */
122        public LinkedHashMap<String,S> process() throws IOException, CompoundNotFoundException {
123                LinkedHashMap<String,S> result = process(-1);
124                close();
125                return result;
126        }
127
128        /**
129         * This method tries to parse maximum <code>max</code> records from
130         * the open File or InputStream, and leaves the underlying resource open.<br>
131         *
132         * Subsequent calls to the same method continue parsing the rest of the file.<br>
133         * This is particularly useful when dealing with very big data files,
134         * (e.g. NCBI nr database), which can't fit into memory and will take long
135         * time before the first result is available.<br>
136         * <b>N.B.</b>
137         * <ul>
138         * <li>This method can't be called after calling its NO-ARGUMENT twin.</li>
139         * <li>remember to close the underlying resource when you are done.</li>
140         * </ul>
141         * @see #process()
142         * @author Amr ALHOSSARY
143         * @since 3.0.6
144         * @param max maximum number of records to return.
145         * @return {@link HashMap} containing maximum <code>max</code> parsed Genbank records
146         * present, starting current fileIndex onwards.
147         * @throws IOException
148         * @throws CompoundNotFoundException
149         */
150        public LinkedHashMap<String,S> process(final int max) throws IOException, CompoundNotFoundException {
151
152                if(closed){
153                        throw new IOException("Cannot perform action: resource has been closed.");
154                }
155
156                LinkedHashMap<String,S> sequences = new LinkedHashMap<>();
157                int i=0;
158                while(true) {
159                        if(max>0 && i>=max) break;
160                        i++;
161                        String seqString = genbankParser.getSequence(bufferedReader, 0);
162                        //reached end of file?
163                        if(seqString==null) break;
164                        @SuppressWarnings("unchecked")
165                        S sequence = (S) sequenceCreator.getSequence(seqString, 0);
166                        GenericGenbankHeaderParser<S, C> genbankHeaderParser = genbankParser.getSequenceHeaderParser();                 
167                        genbankHeaderParser.parseHeader(genbankParser.getHeader(), sequence);                   
168                        String id = genbankHeaderParser.getAccession();
169                        int version = genbankHeaderParser.getVersion();
170                        String identifier = genbankHeaderParser.getIdentifier();
171                        AccessionID accession = new AccessionID(id , DataSource.GENBANK, version, identifier);
172                        sequence.setAccession(accession);
173                        
174                        // add features to new sequence
175                        genbankParser.getFeatures().values().stream()
176                        .flatMap(List::stream)
177                        .forEach(sequence::addFeature);
178
179                        // add taxonomy ID to new sequence
180                        List<DBReferenceInfo> dbQualifier = genbankParser.getDatabaseReferences().get("db_xref");
181                        if (dbQualifier != null){
182                                DBReferenceInfo q = dbQualifier.get(0);
183                                sequence.setTaxonomy(new TaxonomyID(q.getDatabase()+":"+q.getId(), DataSource.GENBANK));
184                        }
185
186                        sequences.put(sequence.getAccession().getID(), sequence);
187                }
188
189                return sequences;
190        }
191
192        public void close() {
193                try {
194                        bufferedReader.close();
195                        this.closed = true;
196                } catch (IOException e) {
197                        logger.error("Couldn't close the reader.", e);
198                        this.closed = false;
199                }
200        }
201}
202