001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * @author Scooter Willis <willishf at gmail dot com>
015 * @author Karl Nicholas <github:karlnicholas>
016 * @author Paolo Pavan
017 *
018 * For more information on the BioJava project and its aims,
019 * or to join the biojava-l mailing list, visit the home page
020 * at:
021 *
022 *      http://www.biojava.org/
023 *
024 * Created on 01-21-2010
025 */
026package org.biojava.nbio.core.sequence.io;
027
028import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
029import org.biojava.nbio.core.sequence.DataSource;
030import org.biojava.nbio.core.sequence.TaxonomyID;
031import org.biojava.nbio.core.sequence.features.DBReferenceInfo;
032import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface;
033import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface;
034import org.biojava.nbio.core.sequence.template.AbstractSequence;
035import org.biojava.nbio.core.sequence.template.Compound;
036import org.slf4j.Logger;
037import org.slf4j.LoggerFactory;
038
039import java.io.BufferedReader;
040import java.io.File;
041import java.io.FileNotFoundException;
042import java.io.FileReader;
043import java.io.IOException;
044import java.io.InputStream;
045import java.io.InputStreamReader;
046import java.util.ArrayList;
047import java.util.HashMap;
048import java.util.LinkedHashMap;
049import java.util.List;
050
051/**
052 * Use {@link GenbankReaderHelper} as an example of how to use this class where {@link GenbankReaderHelper} should be the
053 * primary class used to read Genbank files
054 *
055 */
056public class GenbankReader<S extends AbstractSequence<C>, C extends Compound> {
057
058        private SequenceCreatorInterface<C> sequenceCreator;
059        private GenbankSequenceParser<S,C> genbankParser;
060        private BufferedReader bufferedReader;
061        private boolean closed;
062        private final Logger logger = LoggerFactory.getLogger(this.getClass());
063
064        public boolean isClosed() {
065                return closed;
066        }
067
068        /**
069         * If you are going to use {@link FileProxyProteinSequenceCreator} then do not use this constructor because we need details about
070         * local file offsets for quick reads. {@link InputStream} does not give you the name of the stream to access quickly via file seek. A seek in
071         * an {@link InputStream} is forced to read all the data so you don't gain anything.
072         * @param is
073         * @param headerParser
074         * @param sequenceCreator
075         */
076        public GenbankReader(final InputStream is, final SequenceHeaderParserInterface<S,C> headerParser,
077                                                 final SequenceCreatorInterface<C> sequenceCreator) {
078                this.sequenceCreator = sequenceCreator;
079                bufferedReader = new BufferedReader(new InputStreamReader(is));
080                genbankParser = new GenbankSequenceParser<>();
081                closed = false;
082        }
083
084        /**
085         * If you are going to use the FileProxyProteinSequenceCreator then you
086         * need to use this constructor because we need details about
087         * the location of the file.
088         * @param file
089         * @param headerParser
090         * @param sequenceCreator
091         * @throws FileNotFoundException if the file does not exist, is a directory
092         *      rather than a regular file, or for some other reason cannot be opened
093         *      for reading.
094         * @throws SecurityException if a security manager exists and its checkRead
095         *      method denies read access to the file.
096         */
097        public GenbankReader(
098                        final File file,
099                        final SequenceHeaderParserInterface<S,C> headerParser,
100                        final SequenceCreatorInterface<C> sequenceCreator
101                        ) throws FileNotFoundException {
102
103                this.bufferedReader = new BufferedReader(new FileReader(file));
104                this.sequenceCreator = sequenceCreator;
105                genbankParser = new GenbankSequenceParser<>();
106        }
107
108        /**
109         * The parsing is done in this method.<br>
110         * This method will return all the available Genbank records
111         * in the File or InputStream, closes the underlying resource,
112         * and return the results in {@link LinkedHashMap}.<br>
113         * You don't need to call {@link GenbankReader#close()} after calling this method.
114         * @see #process(int)
115         * @return {@link HashMap} containing all the parsed Genbank records
116         * present, starting current fileIndex onwards.
117         * @throws IOException
118         * @throws CompoundNotFoundException
119         * @throws OutOfMemoryError if the input resource is larger than the allocated heap.
120         */
121        public LinkedHashMap<String,S> process() throws IOException, CompoundNotFoundException {
122                LinkedHashMap<String,S> result = process(-1);
123                close();
124                return result;
125        }
126
127        /**
128         * This method tries to parse maximum <code>max</code> records from
129         * the open File or InputStream, and leaves the underlying resource open.<br>
130         *
131         * Subsequent calls to the same method continue parsing the rest of the file.<br>
132         * This is particularly useful when dealing with very big data files,
133         * (e.g. NCBI nr database), which can't fit into memory and will take long
134         * time before the first result is available.<br>
135         * <b>N.B.</b>
136         * <ul>
137         * <li>This method can't be called after calling its NO-ARGUMENT twin.</li>
138         * <li>remember to close the underlying resource when you are done.</li>
139         * </ul>
140         * @see #process()
141         * @author Amr ALHOSSARY
142         * @since 3.0.6
143         * @param max maximum number of records to return.
144         * @return {@link HashMap} containing maximum <code>max</code> parsed Genbank records
145         * present, starting current fileIndex onwards.
146         * @throws IOException
147         * @throws CompoundNotFoundException
148         */
149        public LinkedHashMap<String,S> process(final int max) throws IOException, CompoundNotFoundException {
150
151                if(closed){
152                        throw new IOException("Cannot perform action: resource has been closed.");
153                }
154
155                LinkedHashMap<String,S> sequences = new LinkedHashMap<>();
156                int i=0;
157                while(true) {
158                        if(max>0 && i>=max) break;
159                        i++;
160                        String seqString = genbankParser.getSequence(bufferedReader, 0);
161                        //reached end of file?
162                        if(seqString==null) break;
163                        @SuppressWarnings("unchecked")
164                        S sequence = (S) sequenceCreator.getSequence(seqString, 0);
165                        genbankParser.getSequenceHeaderParser().parseHeader(genbankParser.getHeader(), sequence);
166
167                        // add features to new sequence
168                        genbankParser.getFeatures().values().stream()
169                        .flatMap(List::stream)
170                        .forEach(sequence::addFeature);
171
172                        // add taxonomy ID to new sequence
173                        List<DBReferenceInfo> dbQualifier = genbankParser.getDatabaseReferences().get("db_xref");
174                        if (dbQualifier != null){
175                                DBReferenceInfo q = dbQualifier.get(0);
176                                sequence.setTaxonomy(new TaxonomyID(q.getDatabase()+":"+q.getId(), DataSource.GENBANK));
177                        }
178
179                        sequences.put(sequence.getAccession().getID(), sequence);
180                }
181
182                return sequences;
183        }
184
185        public void close() {
186                try {
187                        bufferedReader.close();
188                        this.closed = true;
189                } catch (IOException e) {
190                        logger.error("Couldn't close the reader.", e);
191                        this.closed = false;
192                }
193        }
194}
195