001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 */
022package org.biojava.nbio.core.sequence.io;
023
024import org.biojava.nbio.core.sequence.DNASequence;
025import org.biojava.nbio.core.sequence.ProteinSequence;
026import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
027import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
028import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
029import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
030import org.biojava.nbio.core.sequence.template.AbstractSequence;
031import org.slf4j.Logger;
032import org.slf4j.LoggerFactory;
033
034import java.io.File;
035import java.io.FileInputStream;
036import java.io.InputStream;
037import java.util.LinkedHashMap;
038
039/**
040 *
041 * @author Scooter Willis <willishf at gmail dot com>
042 */
043public class GenbankReaderHelper {
044
045        private final static Logger logger = LoggerFactory.getLogger(GenbankReaderHelper.class);
046
047        /**
048         * Selecting lazySequenceLoad=true will parse the Genbank file and figure out the accessionid and offsets and return sequence objects
049         * that can in the future read the sequence from the disk. This allows the loading of large Genbank files where you are only interested
050         * in one sequence based on accession id.
051         * @param file
052         * @param lazySequenceLoad
053         * @return
054         * @throws Exception
055         */
056        public static LinkedHashMap<String, DNASequence> readGenbankDNASequence(File file, boolean lazySequenceLoad) throws Exception {
057                if (!lazySequenceLoad) {
058                        return readGenbankDNASequence(file);
059                }
060
061                GenbankReader<DNASequence, NucleotideCompound> GenbankProxyReader =
062                                new GenbankReader<DNASequence, NucleotideCompound>(
063                                                file,
064                                                new GenericGenbankHeaderParser<DNASequence, NucleotideCompound>(),
065                                                new FileProxyDNASequenceCreator(
066                                                                file,
067                                                                DNACompoundSet.getDNACompoundSet(),
068                                                                new GenbankSequenceParser<AbstractSequence<NucleotideCompound>, NucleotideCompound>()
069                                                        )
070                                        );
071                return GenbankProxyReader.process();
072
073        }
074
075        /**
076         * Selecting lazySequenceLoad=true will parse the Genbank file and figure out the accessionid and offsets and return sequence objects
077         * that can in the future read the sequence from the disk. This allows the loading of large Genbank files where you are only interested
078         * in one sequence based on accession id.
079         * @param file
080         * @param lazySequenceLoad
081         * @return
082         * @throws Exception
083         */
084        public static LinkedHashMap<String, ProteinSequence> readGenbankProteinSequence(File file, boolean lazySequenceLoad) throws Exception {
085                if (!lazySequenceLoad) {
086                        return readGenbankProteinSequence(file);
087                }
088
089                GenbankReader<ProteinSequence, AminoAcidCompound> GenbankProxyReader =
090                                new GenbankReader<ProteinSequence, AminoAcidCompound>(
091                                                file,
092                                                new GenericGenbankHeaderParser<ProteinSequence, AminoAcidCompound>(),
093                                                new FileProxyProteinSequenceCreator(
094                                                                file,
095                                                                AminoAcidCompoundSet.getAminoAcidCompoundSet(),
096                                                                new GenbankSequenceParser<AbstractSequence<AminoAcidCompound>, AminoAcidCompound>()
097                                                        )
098                                        );
099                return GenbankProxyReader.process();
100
101        }
102        /**
103         * Read a Genbank file containing amino acids with setup that would handle most
104         * cases.
105         *
106         * @param file
107         * @return
108         * @throws Exception
109         */
110        public static LinkedHashMap<String, ProteinSequence> readGenbankProteinSequence(
111                        File file) throws Exception {
112                FileInputStream inStream = new FileInputStream(file);
113                LinkedHashMap<String, ProteinSequence> proteinSequences = readGenbankProteinSequence(inStream);
114                inStream.close();
115                return proteinSequences;
116        }
117
118        /**
119         * Read a Genbank file containing amino acids with setup that would handle most
120         * cases. User is responsible for closing InputStream because you opened it
121         *
122         * @param inStream
123         * @return
124         * @throws Exception
125         */
126        public static LinkedHashMap<String, ProteinSequence> readGenbankProteinSequence(
127                        InputStream inStream) throws Exception {
128                GenbankReader<ProteinSequence, AminoAcidCompound> GenbankReader = new GenbankReader<ProteinSequence, AminoAcidCompound>(
129                                inStream,
130                                new GenericGenbankHeaderParser<ProteinSequence, AminoAcidCompound>(),
131                                new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));
132                return GenbankReader.process();
133        }
134
135        /**
136         * Read a Genbank DNA sequence
137         * @param inStream
138         * @return
139         * @throws Exception
140         */
141        public static LinkedHashMap<String, DNASequence> readGenbankDNASequence(
142                        InputStream inStream) throws Exception {
143                GenbankReader<DNASequence, NucleotideCompound> GenbankReader = new GenbankReader<DNASequence, NucleotideCompound>(
144                                inStream,
145                                new GenericGenbankHeaderParser<DNASequence, NucleotideCompound>(),
146                                new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()));
147                return GenbankReader.process();
148        }
149
150        /**
151         *
152         * @param file
153         * @return
154         * @throws Exception
155         */
156        public static LinkedHashMap<String, DNASequence> readGenbankDNASequence(
157                        File file) throws Exception {
158                FileInputStream inStream = new FileInputStream(file);
159                LinkedHashMap<String, DNASequence> dnaSequences = readGenbankDNASequence(inStream);
160                inStream.close();
161                return dnaSequences;
162        }
163
164        public static void main(String args[]) throws Exception {
165
166                LinkedHashMap<String, DNASequence> dnaSequences = GenbankReaderHelper.readGenbankDNASequence(new File("src/test/resources/NM_000266.gb"), true);
167                for (DNASequence sequence : dnaSequences.values()) {
168                        logger.info("DNA Sequence: {}", sequence.getRNASequence().getProteinSequence().getSequenceAsString());
169                }
170
171                LinkedHashMap<String, ProteinSequence> proteinSequences = GenbankReaderHelper.readGenbankProteinSequence(new File("src/test/resources/BondFeature.gb"), true);
172                for (ProteinSequence sequence : proteinSequences.values()) {
173                        logger.info("Protein Sequence: {}", sequence.getSequenceAsString());
174                }
175        }
176}