001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 */
022package org.biojava.nbio.core.sequence.io;
023
024import org.biojava.nbio.core.sequence.DNASequence;
025import org.biojava.nbio.core.sequence.ProteinSequence;
026import org.biojava.nbio.core.sequence.RNASequence;
027import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
028import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
029import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
030import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
031import org.biojava.nbio.core.sequence.compound.RNACompoundSet;
032import org.biojava.nbio.core.sequence.template.AbstractSequence;
033import org.slf4j.Logger;
034import org.slf4j.LoggerFactory;
035
036import java.io.File;
037import java.io.FileInputStream;
038import java.io.InputStream;
039import java.util.LinkedHashMap;
040
041/**
042 *
043 * @author Scooter Willis <willishf at gmail dot com>
044 */
045public class GenbankReaderHelper {
046
047        private final static Logger logger = LoggerFactory.getLogger(GenbankReaderHelper.class);
048
049        /**
050         * Selecting lazySequenceLoad=true will parse the Genbank file and figure out the accessionid and offsets and return sequence objects
051         * that can in the future read the sequence from the disk. This allows the loading of large Genbank files where you are only interested
052         * in one sequence based on accession id.
053         * @param file
054         * @param lazySequenceLoad
055         * @return
056         * @throws Exception
057         */
058        public static LinkedHashMap<String, DNASequence> readGenbankDNASequence(File file, boolean lazySequenceLoad) throws Exception {
059                if (!lazySequenceLoad) {
060                        return readGenbankDNASequence(file);
061                }
062
063                GenbankReader<DNASequence, NucleotideCompound> GenbankProxyReader =
064                                new GenbankReader<DNASequence, NucleotideCompound>(
065                                                file,
066                                                new GenericGenbankHeaderParser<DNASequence, NucleotideCompound>(),
067                                                new FileProxyDNASequenceCreator(
068                                                                file,
069                                                                DNACompoundSet.getDNACompoundSet(),
070                                                                new GenbankSequenceParser<AbstractSequence<NucleotideCompound>, NucleotideCompound>()
071                                                        )
072                                        );
073                return GenbankProxyReader.process();
074
075        }
076
077        /**
078         * Selecting lazySequenceLoad=true will parse the Genbank file and figure out the accessionid and offsets and return sequence objects
079         * that can in the future read the sequence from the disk. This allows the loading of large Genbank files where you are only interested
080         * in one sequence based on accession id.
081         * @param file
082         * @param lazySequenceLoad
083         * @return
084         * @throws Exception
085         */
086        public static LinkedHashMap<String, ProteinSequence> readGenbankProteinSequence(File file, boolean lazySequenceLoad) throws Exception {
087                if (!lazySequenceLoad) {
088                        return readGenbankProteinSequence(file);
089                }
090
091                GenbankReader<ProteinSequence, AminoAcidCompound> GenbankProxyReader =
092                                new GenbankReader<ProteinSequence, AminoAcidCompound>(
093                                                file,
094                                                new GenericGenbankHeaderParser<ProteinSequence, AminoAcidCompound>(),
095                                                new FileProxyProteinSequenceCreator(
096                                                                file,
097                                                                AminoAcidCompoundSet.getAminoAcidCompoundSet(),
098                                                                new GenbankSequenceParser<AbstractSequence<AminoAcidCompound>, AminoAcidCompound>()
099                                                        )
100                                        );
101                return GenbankProxyReader.process();
102
103        }
104
105        /**
106         * Selecting lazySequenceLoad=true will parse the Genbank file and figure out the accessionid and offsets and return sequence objects
107         * that can in the future read the sequence from the disk. This allows the loading of large Genbank files where you are only interested
108         * in one sequence based on accession id.
109         * @param file
110         * @param lazySequenceLoad
111         * @return
112         * @throws Exception
113         */
114        public static LinkedHashMap<String, RNASequence> readGenbankRNASequence(File file, boolean lazySequenceLoad) throws Exception {
115                if (!lazySequenceLoad) {
116                        return readGenbankRNASequence(file);
117                }
118
119                GenbankReader<RNASequence, NucleotideCompound> GenbankProxyReader =
120                                new GenbankReader<RNASequence, NucleotideCompound>(
121                                                file,
122                                                new GenericGenbankHeaderParser<RNASequence, NucleotideCompound>(),
123                                                new FileProxyRNASequenceCreator(
124                                                                file,
125                                                                RNACompoundSet.getRNACompoundSet(),
126                                                                new GenbankSequenceParser<AbstractSequence<NucleotideCompound>, NucleotideCompound>()
127                                                        )
128                                        );
129                return GenbankProxyReader.process();
130
131        }
132
133        /**
134         * Read a Genbank file containing amino acids with setup that would handle most
135         * cases.
136         *
137         * @param file
138         * @return
139         * @throws Exception
140         */
141        public static LinkedHashMap<String, ProteinSequence> readGenbankProteinSequence(
142                        File file) throws Exception {
143                FileInputStream inStream = new FileInputStream(file);
144                LinkedHashMap<String, ProteinSequence> proteinSequences = readGenbankProteinSequence(inStream);
145                inStream.close();
146                return proteinSequences;
147        }
148
149        /**
150         * Read a Genbank file containing amino acids with setup that would handle most
151         * cases. User is responsible for closing InputStream because you opened it
152         *
153         * @param inStream
154         * @return
155         * @throws Exception
156         */
157        public static LinkedHashMap<String, ProteinSequence> readGenbankProteinSequence(
158                        InputStream inStream) throws Exception {
159                GenbankReader<ProteinSequence, AminoAcidCompound> GenbankReader = new GenbankReader<ProteinSequence, AminoAcidCompound>(
160                                inStream,
161                                new GenericGenbankHeaderParser<ProteinSequence, AminoAcidCompound>(),
162                                new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));
163                return GenbankReader.process();
164        }
165
166        /**
167         * Read a Genbank DNA sequence
168         * @param inStream
169         * @return
170         * @throws Exception
171         */
172        public static LinkedHashMap<String, DNASequence> readGenbankDNASequence(
173                        InputStream inStream) throws Exception {
174                GenbankReader<DNASequence, NucleotideCompound> GenbankReader = new GenbankReader<DNASequence, NucleotideCompound>(
175                                inStream,
176                                new GenericGenbankHeaderParser<DNASequence, NucleotideCompound>(),
177                                new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()));
178                return GenbankReader.process();
179        }
180
181        /**
182         *
183         * @param file
184         * @return
185         * @throws Exception
186         */
187        public static LinkedHashMap<String, DNASequence> readGenbankDNASequence(
188                        File file) throws Exception {
189                FileInputStream inStream = new FileInputStream(file);
190                LinkedHashMap<String, DNASequence> dnaSequences = readGenbankDNASequence(inStream);
191                inStream.close();
192                return dnaSequences;
193        }
194        /**
195         * Read a Genbank RNA sequence
196         * @param inStream
197         * @return
198         * @throws Exception
199         */
200        public static LinkedHashMap<String, RNASequence> readGenbankRNASequence(
201                        InputStream inStream) throws Exception {
202                GenbankReader<RNASequence, NucleotideCompound> GenbankReader = new GenbankReader<RNASequence, NucleotideCompound>(
203                                inStream,
204                                new GenericGenbankHeaderParser<RNASequence, NucleotideCompound>(),
205                                new RNASequenceCreator(RNACompoundSet.getRNACompoundSet()));
206                return GenbankReader.process();
207        }
208
209        /**
210         *
211         * @param file
212         * @return
213         * @throws Exception
214         */
215        public static LinkedHashMap<String, RNASequence> readGenbankRNASequence(
216                        File file) throws Exception {
217                FileInputStream inStream = new FileInputStream(file);
218                LinkedHashMap<String, RNASequence> rnaSequences = readGenbankRNASequence(inStream);
219                inStream.close();
220                return rnaSequences;
221        }
222
223        public static void main(String[] args) throws Exception {
224
225                LinkedHashMap<String, DNASequence> dnaSequences = GenbankReaderHelper.readGenbankDNASequence(new File("src/test/resources/NM_000266.gb"), true);
226                for (DNASequence sequence : dnaSequences.values()) {
227                        logger.info("DNA Sequence: {}", sequence.getRNASequence().getProteinSequence().getSequenceAsString());
228                }
229
230                LinkedHashMap<String, ProteinSequence> proteinSequences = GenbankReaderHelper.readGenbankProteinSequence(new File("src/test/resources/BondFeature.gb"), true);
231                for (ProteinSequence sequence : proteinSequences.values()) {
232                        logger.info("Protein Sequence: {}", sequence.getSequenceAsString());
233                }
234        }
235}