/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * @author Scooter Willis <willishf at gmail dot com>
 * @author Karl Nicholas <github:karlnicholas>
 * @author Paolo Pavan
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * Created on 01-21-2010
 */
026package org.biojava.nbio.core.sequence.io;
027
028import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
029import org.biojava.nbio.core.sequence.DNASequence;
030import org.biojava.nbio.core.sequence.DataSource;
031import org.biojava.nbio.core.sequence.ProteinSequence;
032import org.biojava.nbio.core.sequence.TaxonomyID;
033import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
034import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
035import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
036import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
037import org.biojava.nbio.core.sequence.features.AbstractFeature;
038import org.biojava.nbio.core.sequence.features.DBReferenceInfo;
039import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface;
040import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface;
041import org.biojava.nbio.core.sequence.template.AbstractSequence;
042import org.biojava.nbio.core.sequence.template.Compound;
043import org.slf4j.Logger;
044import org.slf4j.LoggerFactory;
045
046import java.io.*;
047import java.util.ArrayList;
048import java.util.HashMap;
049import java.util.LinkedHashMap;
050
051/**
052 * Use GenbankReaderHelper as an example of how to use this class where GenbankReaderHelper should be the
053 * primary class used to read Genbank files
054 *
055 */
056public class GenbankReader<S extends AbstractSequence<C>, C extends Compound> {
057
058        private SequenceCreatorInterface<C> sequenceCreator;
059        private GenbankSequenceParser<S,C> genbankParser;
060        private BufferedReader bufferedReader;
061        private boolean closed;
062        private final Logger logger = LoggerFactory.getLogger(this.getClass());
063
064        public boolean isClosed() {
065                return closed;
066        }
067
068        /**
069         * If you are going to use FileProxyProteinSequenceCreator then do not use this constructor because we need details about
070         * local file offsets for quick reads. InputStreams does not give you the name of the stream to access quickly via file seek. A seek in
071         * an inputstream is forced to read all the data so you don't gain anything.
072         * @param is
073         * @param headerParser
074         * @param sequenceCreator
075         */
076        public GenbankReader(final InputStream is, final SequenceHeaderParserInterface<S,C> headerParser,
077                                                 final SequenceCreatorInterface<C> sequenceCreator) {
078                this.sequenceCreator = sequenceCreator;
079                bufferedReader = new BufferedReader(new InputStreamReader(is));
080                genbankParser = new GenbankSequenceParser<>();
081                closed = false;
082        }
083
084        /**
085         * If you are going to use the FileProxyProteinSequenceCreator then you
086         * need to use this constructor because we need details about
087         * the location of the file.
088         * @param file
089         * @param headerParser
090         * @param sequenceCreator
091         * @throws FileNotFoundException if the file does not exist, is a directory
092         *      rather than a regular file, or for some other reason cannot be opened
093         *      for reading.
094         * @throws SecurityException if a security manager exists and its checkRead
095         *      method denies read access to the file.
096         */
097        public GenbankReader(
098                        final File file,
099                        final SequenceHeaderParserInterface<S,C> headerParser,
100                        final SequenceCreatorInterface<C> sequenceCreator
101                        ) throws FileNotFoundException {
102
103                this.bufferedReader = new BufferedReader(new FileReader(file));
104                this.sequenceCreator = sequenceCreator;
105                genbankParser = new GenbankSequenceParser<>();
106        }
107
108        /**
109         * The parsing is done in this method.<br>
110         * This method tries to process all the available Genbank records
111         * in the File or InputStream, closes the underlying resource,
112         * and return the results in {@link LinkedHashMap}.<br>
113         * You don't need to call {@link #close()} after calling this method.
114         * @see #process(int)
115         * @return {@link HashMap} containing all the parsed Genbank records
116         * present, starting current fileIndex onwards.
117         * @throws IOException
118         * @throws CompoundNotFoundException
119         */
120        public LinkedHashMap<String,S> process() throws IOException, CompoundNotFoundException {
121                return process(-1);
122        }
123
124        /**
125         * This method tries to parse maximum <code>max</code> records from
126         * the open File or InputStream, and leaves the underlying resource open.<br>
127         *
128         * Subsequent calls to the same method continue parsing the rest of the file.<br>
129         * This is particularly useful when dealing with very big data files,
130         * (e.g. NCBI nr database), which can't fit into memory and will take long
131         * time before the first result is available.<br>
132         * <b>N.B.</b>
133         * <ul>
134         * <li>This method can't be called after calling its NO-ARGUMENT twin.</li>
135         * <li>remember to close the underlying resource when you are done.</li>
136         * </ul>
137         * @see #process()
138         * @author Amr AL-Hossary
139         * @since 3.0.6
140         * @param max maximum number of records to return, <code>-1</code> for infinity.
141         * @return {@link HashMap} containing maximum <code>max</code> parsed Genbank records
142         * present, starting current fileIndex onwards.
143         * @throws IOException
144         * @throws CompoundNotFoundException
145         */
146        public LinkedHashMap<String,S> process(final int max) throws IOException, CompoundNotFoundException {
147                LinkedHashMap<String,S> sequences = new LinkedHashMap<>();
148                @SuppressWarnings("unchecked")
149                int i=0;
150                while(true) {
151                        if(max>0 && i>=max) break;
152                        i++;
153                        String seqString = genbankParser.getSequence(bufferedReader, 0);
154                        //reached end of file?
155                        if(seqString==null) break;
156            @SuppressWarnings("unchecked")
157                        S sequence = (S) sequenceCreator.getSequence(seqString, 0);
158                        genbankParser.getSequenceHeaderParser().parseHeader(genbankParser.getHeader(), sequence);
159
160                        // add features to new sequence
161                        for (String k: genbankParser.getFeatures().keySet()){
162                                for (AbstractFeature f: genbankParser.getFeatures(k)){
163                                        //f.getLocations().setSequence(sequence);  // can't set proper sequence source to features. It is actually needed? Don't think so...
164                                        sequence.addFeature(f);
165                                }
166                        }
167
168                        // add taxonomy ID to new sequence
169                        ArrayList<DBReferenceInfo> dbQualifier = genbankParser.getDatabaseReferences().get("db_xref");
170                        if (dbQualifier != null){
171                                DBReferenceInfo q = dbQualifier.get(0);
172                                sequence.setTaxonomy(new TaxonomyID(q.getDatabase()+":"+q.getId(), DataSource.GENBANK));
173                        }
174
175                        sequences.put(sequence.getAccession().getID(), sequence);
176                }
177
178                if (max < 0) {
179                        close();
180                }
181
182                return sequences;
183        }
184
185        public void close() {
186                try {
187                        bufferedReader.close();
188                        this.closed = true;
189                } catch (IOException e) {
190                        logger.error("Couldn't close the reader. {}", e.getMessage());
191                        this.closed = false;
192                }
193        }
194
195        public static void main(String[] args) throws Exception {
196                String proteinFile = "src/test/resources/BondFeature.gb";
197                FileInputStream is = new FileInputStream(proteinFile);
198
199                GenbankReader<ProteinSequence, AminoAcidCompound> proteinReader = new GenbankReader<>(is, new GenericGenbankHeaderParser<>(), new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));
200                LinkedHashMap<String,ProteinSequence> proteinSequences = proteinReader.process();
201                System.out.println(proteinSequences);
202
203                String inputFile = "src/test/resources/NM_000266.gb";
204                is = new FileInputStream(inputFile);
205                GenbankReader<DNASequence, NucleotideCompound> dnaReader = new GenbankReader<>(is, new GenericGenbankHeaderParser<>(), new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()));
206                LinkedHashMap<String,DNASequence> dnaSequences = dnaReader.process();
207                System.out.println(dnaSequences);
208
209                String crazyFile = "src/test/resources/CraftedFeature.gb";
210                is = new FileInputStream(crazyFile);
211                GenbankReader<DNASequence, NucleotideCompound> crazyReader = new GenbankReader<>(is, new GenericGenbankHeaderParser<>(), new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()));
212                LinkedHashMap<String,DNASequence> crazyAnnotatedSequences = crazyReader.process();
213
214                is.close();
215                System.out.println(crazyAnnotatedSequences);
216        }
217
218}
219