001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * @author Scooter Willis <willishf at gmail dot com>
015 * @author Karl Nicholas <github:karlnicholas>
016 * @author Paolo Pavan
017 *
018 * For more information on the BioJava project and its aims,
019 * or to join the biojava-l mailing list, visit the home page
020 * at:
021 *
022 *      http://www.biojava.org/
023 *
024 * Created on 01-21-2010
025 */
026package org.biojava.nbio.core.sequence.io;
027
028import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
029import org.biojava.nbio.core.sequence.AccessionID;
030import org.biojava.nbio.core.sequence.DataSource;
031import org.biojava.nbio.core.sequence.TaxonomyID;
032import org.biojava.nbio.core.sequence.features.DBReferenceInfo;
033import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface;
034import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface;
035import org.biojava.nbio.core.sequence.template.AbstractSequence;
036import org.biojava.nbio.core.sequence.template.Compound;
037import org.slf4j.Logger;
038import org.slf4j.LoggerFactory;
039
040import java.io.BufferedReader;
041import java.io.File;
042import java.io.FileNotFoundException;
043import java.io.FileReader;
044import java.io.IOException;
045import java.io.InputStream;
046import java.io.InputStreamReader;
047import java.util.ArrayList;
048import java.util.HashMap;
049import java.util.LinkedHashMap;
050import java.util.List;
051import java.util.Map;
052
053/**
054 * Use {@link GenbankReaderHelper} as an example of how to use this class where {@link GenbankReaderHelper} should be the
055 * primary class used to read Genbank files
056 * @param <S> the sequence type
057 * @param <C> the compound type
058 */
059public class GenbankReader<S extends AbstractSequence<C>, C extends Compound> {
060
061        private SequenceCreatorInterface<C> sequenceCreator;
062        private GenbankSequenceParser<S,C> genbankParser;
063        private BufferedReader bufferedReader;
064        private boolean closed;
065        private final Logger logger = LoggerFactory.getLogger(this.getClass());
066
067        public boolean isClosed() {
068                return closed;
069        }
070
071        /**
072         * If you are going to use {@link FileProxyProteinSequenceCreator} then do not use this constructor because we need details about
073         * local file offsets for quick reads. {@link InputStream} does not give you the name of the stream to access quickly via file seek. A seek in
074         * an {@link InputStream} is forced to read all the data so you don't gain anything.
075         * @param is
076         * @param headerParser
077         * @param sequenceCreator
078         */
079        public GenbankReader(final InputStream is, final SequenceHeaderParserInterface<S,C> headerParser,
080                                                 final SequenceCreatorInterface<C> sequenceCreator) {
081                this.sequenceCreator = sequenceCreator;
082                bufferedReader = new BufferedReader(new InputStreamReader(is));
083                genbankParser = new GenbankSequenceParser<>();
084                closed = false;
085        }
086
087        /**
088         * If you are going to use the FileProxyProteinSequenceCreator then you
089         * need to use this constructor because we need details about
090         * the location of the file.
091         * @param file
092         * @param headerParser
093         * @param sequenceCreator
094         * @throws FileNotFoundException if the file does not exist, is a directory
095         *      rather than a regular file, or for some other reason cannot be opened
096         *      for reading.
097         * @throws SecurityException if a security manager exists and its checkRead
098         *      method denies read access to the file.
099         */
100        public GenbankReader(
101                        final File file,
102                        final SequenceHeaderParserInterface<S,C> headerParser,
103                        final SequenceCreatorInterface<C> sequenceCreator
104                        ) throws FileNotFoundException {
105
106                this.bufferedReader = new BufferedReader(new FileReader(file));
107                this.sequenceCreator = sequenceCreator;
108                genbankParser = new GenbankSequenceParser<>();
109        }
110
111        /**
112         * The parsing is done in this method.<br>
113         * This method will return all the available Genbank records
114         * in the File or InputStream, closes the underlying resource,
115         * and return the results in {@link LinkedHashMap}.<br>
116         * You don't need to call {@link GenbankReader#close()} after calling this method.
117         * @see #process(int)
118         * @return {@link HashMap} containing all the parsed Genbank records
119         * present, starting current fileIndex onwards.
120         * @throws IOException
121         * @throws CompoundNotFoundException
122         * @throws OutOfMemoryError if the input resource is larger than the allocated heap.
123         */
124        public Map<String, S> process() throws IOException, CompoundNotFoundException {
125                Map<String, S> result = process(-1);
126                close();
127                return result;
128        }
129
130        /**
131         * This method tries to parse maximum <code>max</code> records from
132         * the open File or InputStream, and leaves the underlying resource open.<br>
133         *
134         * Subsequent calls to the same method continue parsing the rest of the file.<br>
135         * This is particularly useful when dealing with very big data files,
136         * (e.g. NCBI nr database), which can't fit into memory and will take long
137         * time before the first result is available.<br>
138         * <b>N.B.</b>
139         * <ul>
140         * <li>This method can't be called after calling its NO-ARGUMENT twin.</li>
141         * <li>remember to close the underlying resource when you are done.</li>
142         * </ul>
143         * @see #process()
144         * @author Amr ALHOSSARY
145         * @since 3.0.6
146         * @param max maximum number of records to return.
147         * @return {@link HashMap} containing maximum <code>max</code> parsed Genbank records
148         * present, starting current fileIndex onwards.
149         * @throws IOException
150         * @throws CompoundNotFoundException
151         */
152        public Map<String, S> process(final int max) throws IOException, CompoundNotFoundException {
153
154                if(closed){
155                        throw new IOException("Cannot perform action: resource has been closed.");
156                }
157
158                Map<String, S> sequences = new LinkedHashMap<>();
159                int i=0;
160                while(true) {
161                        if(max>0 && i>=max) break;
162                        i++;
163                        String seqString = genbankParser.getSequence(bufferedReader, 0);
164                        //reached end of file?
165                        if(seqString==null) break;
166                        @SuppressWarnings("unchecked")
167                        S sequence = (S) sequenceCreator.getSequence(seqString, 0);
168                        GenericGenbankHeaderParser<S, C> genbankHeaderParser = genbankParser.getSequenceHeaderParser();                 
169                        genbankHeaderParser.parseHeader(genbankParser.getHeader(), sequence);                   
170                        String id = genbankHeaderParser.getAccession();
171                        int version = genbankHeaderParser.getVersion();
172                        String identifier = genbankHeaderParser.getIdentifier();
173                        AccessionID accession = new AccessionID(id , DataSource.GENBANK, version, identifier);
174                        sequence.setAccession(accession);
175                        
176                        // add features to new sequence
177                        genbankParser.getFeatures().values().stream()
178                        .flatMap(List::stream)
179                        .forEach(sequence::addFeature);
180
181                        // add taxonomy ID to new sequence
182                        List<DBReferenceInfo> dbQualifier = genbankParser.getDatabaseReferences().get("db_xref");
183                        if (dbQualifier != null){
184                                DBReferenceInfo q = dbQualifier.get(0);
185                                sequence.setTaxonomy(new TaxonomyID(q.getDatabase()+":"+q.getId(), DataSource.GENBANK));
186                        }
187
188                        sequences.put(sequence.getAccession().getID(), sequence);
189                }
190
191                return sequences;
192        }
193
194        public void close() {
195                try {
196                        bufferedReader.close();
197                        this.closed = true;
198                } catch (IOException e) {
199                        logger.error("Couldn't close the reader.", e);
200                        this.closed = false;
201                }
202        }
203}
204