001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * @author Scooter Willis <willishf at gmail dot com>
015 * @author Karl Nicholas <github:karlnicholas>
016 * @author Paolo Pavan
017 *
018 * For more information on the BioJava project and its aims,
019 * or to join the biojava-l mailing list, visit the home page
020 * at:
021 *
022 *      http://www.biojava.org/
023 *
024 * Created on 01-21-2010
025 */
026package org.biojava.nbio.core.sequence.io;
027
028import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
029import org.biojava.nbio.core.sequence.DNASequence;
030import org.biojava.nbio.core.sequence.DataSource;
031import org.biojava.nbio.core.sequence.ProteinSequence;
032import org.biojava.nbio.core.sequence.TaxonomyID;
033import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
034import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
035import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
036import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
037import org.biojava.nbio.core.sequence.features.AbstractFeature;
038import org.biojava.nbio.core.sequence.features.DBReferenceInfo;
039import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface;
040import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface;
041import org.biojava.nbio.core.sequence.template.AbstractSequence;
042import org.biojava.nbio.core.sequence.template.Compound;
043
044import java.io.*;
045import java.util.ArrayList;
046import java.util.HashMap;
047import java.util.LinkedHashMap;
048
049/**
050 * Use GenbankReaderHelper as an example of how to use this class where GenbankReaderHelper should be the
051 * primary class used to read Genbank files
052 *
053 */
054public class GenbankReader<S extends AbstractSequence<C>, C extends Compound> {
055
056        private SequenceCreatorInterface<C> sequenceCreator;
057        private GenbankSequenceParser<S,C> genbankParser;
058        private InputStream inputStream;
059
060        /**
061         * If you are going to use FileProxyProteinSequenceCreator then do not use this constructor because we need details about
062         * local file offsets for quick reads. InputStreams does not give you the name of the stream to access quickly via file seek. A seek in
063         * an inputstream is forced to read all the data so you don't gain anything.
064         * @param br
065         * @param headerParser
066         * @param sequenceCreator
067         */
068        public GenbankReader(InputStream is, SequenceHeaderParserInterface<S,C> headerParser, SequenceCreatorInterface<C> sequenceCreator) {
069                this.sequenceCreator = sequenceCreator;
070                this.inputStream = is;
071                genbankParser = new GenbankSequenceParser<S,C>();
072        }
073
074        /**
075         * If you are going to use the FileProxyProteinSequenceCreator then you
076         * need to use this constructor because we need details about
077         * the location of the file.
078         * @param file
079         * @param headerParser
080         * @param sequenceCreator
081         * @throws FileNotFoundException if the file does not exist, is a directory
082         *      rather than a regular file, or for some other reason cannot be opened
083         *      for reading.
084         * @throws SecurityException if a security manager exists and its checkRead
085         *      method denies read access to the file.
086         */
087        public GenbankReader(
088                        File file,
089                        SequenceHeaderParserInterface<S,C> headerParser,
090                        SequenceCreatorInterface<C> sequenceCreator
091                        ) throws FileNotFoundException {
092
093                inputStream = new FileInputStream(file);
094                this.sequenceCreator = sequenceCreator;
095                genbankParser = new GenbankSequenceParser<S,C>();
096        }
097
098        /**
099         * The parsing is done in this method.<br>
100         * This method tries to process all the available Genbank records
101         * in the File or InputStream, closes the underlying resource,
102         * and return the results in {@link LinkedHashMap}.<br>
103         * You don't need to call {@link #close()} after calling this method.
104         * @see #process(int)
105         * @return {@link HashMap} containing all the parsed Genbank records
106         * present, starting current fileIndex onwards.
107         * @throws IOException
108         * @throws CompoundNotFoundException
109         */
110        public LinkedHashMap<String,S> process() throws IOException, CompoundNotFoundException {
111                LinkedHashMap<String,S> sequences = process(-1);
112                return sequences;
113        }
114
115        /**
116         * This method tries to parse maximum <code>max</code> records from
117         * the open File or InputStream, and leaves the underlying resource open.<br>
118         *
119         * Subsequent calls to the same method continue parsing the rest of the file.<br>
120         * This is particularly useful when dealing with very big data files,
121         * (e.g. NCBI nr database), which can't fit into memory and will take long
122         * time before the first result is available.<br>
123         * <b>N.B.</b>
124         * <ul>
125         * <li>This method ca't be called after calling its NO-ARGUMENT twin.</li>
126         * <li>remember to close the underlying resource when you are done.</li>
127         * </ul>
128         * @see #process()
129         * @author Amr AL-Hossary
130         * @since 3.0.6
131         * @param max maximum number of records to return, <code>-1</code> for infinity.
132         * @return {@link HashMap} containing maximum <code>max</code> parsed Genbank records
133         * present, starting current fileIndex onwards.
134         * @throws IOException
135         * @throws CompoundNotFoundException
136         */
137        public LinkedHashMap<String,S> process(int max) throws IOException, CompoundNotFoundException {
138                LinkedHashMap<String,S> sequences = new LinkedHashMap<String,S>();
139                @SuppressWarnings("unchecked")
140                int i=0;
141                BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
142                while(true) {
143                        if(max>0 && i>=max) break;
144                        i++;
145                        String seqString = genbankParser.getSequence(br, 0);
146                        //reached end of file?
147                        if(seqString==null) break;
148                        S sequence = (S) sequenceCreator.getSequence(seqString, 0);
149                        genbankParser.getSequenceHeaderParser().parseHeader(genbankParser.getHeader(), sequence);
150
151                        // add features to new sequence
152                        for (String k: genbankParser.getFeatures().keySet()){
153                                for (AbstractFeature f: genbankParser.getFeatures(k)){
154                                        //f.getLocations().setSequence(sequence);  // can't set proper sequence source to features. It is actually needed? Don't think so...
155                                        sequence.addFeature(f);
156                                }
157                        }
158
159                        // add taxonomy ID to new sequence
160                        ArrayList<DBReferenceInfo> dbQualifier = genbankParser.getDatabaseReferences().get("db_xref");
161                        if (dbQualifier != null){
162                                DBReferenceInfo q = dbQualifier.get(0);
163                                sequence.setTaxonomy(new TaxonomyID(q.getDatabase()+":"+q.getId(), DataSource.GENBANK));
164                        }
165
166                        sequences.put(sequence.getAccession().getID(), sequence);
167                }
168                br.close();
169                close();
170                return sequences;
171        }
172
173        public void close() throws IOException {
174                inputStream.close();
175        }
176
177        public static void main(String[] args) throws Exception {
178                String proteinFile = "src/test/resources/BondFeature.gb";
179                FileInputStream is = new FileInputStream(proteinFile);
180
181                GenbankReader<ProteinSequence, AminoAcidCompound> proteinReader = new GenbankReader<ProteinSequence, AminoAcidCompound>(is, new GenericGenbankHeaderParser<ProteinSequence,AminoAcidCompound>(), new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));
182                LinkedHashMap<String,ProteinSequence> proteinSequences = proteinReader.process();
183                System.out.println(proteinSequences);
184
185                String inputFile = "src/test/resources/NM_000266.gb";
186                is = new FileInputStream(inputFile);
187                GenbankReader<DNASequence, NucleotideCompound> dnaReader = new GenbankReader<DNASequence, NucleotideCompound>(is, new GenericGenbankHeaderParser<DNASequence,NucleotideCompound>(), new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()));
188                LinkedHashMap<String,DNASequence> dnaSequences = dnaReader.process();
189                System.out.println(dnaSequences);
190
191                String crazyFile = "src/test/resources/CraftedFeature.gb";
192                is = new FileInputStream(crazyFile);
193                GenbankReader<DNASequence, NucleotideCompound> crazyReader = new GenbankReader<DNASequence, NucleotideCompound>(is, new GenericGenbankHeaderParser<DNASequence,NucleotideCompound>(), new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()));
194                LinkedHashMap<String,DNASequence> crazyAnnotatedSequences = crazyReader.process();
195
196                is.close();
197                System.out.println(crazyAnnotatedSequences);
198        }
199
200}
201