001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on 01-21-2010 021 */ 022package org.biojava.nbio.core.sequence.io; 023 024import org.biojava.nbio.core.sequence.DNASequence; 025import org.biojava.nbio.core.sequence.ProteinSequence; 026import org.biojava.nbio.core.sequence.RNASequence; 027import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 028import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; 029import org.biojava.nbio.core.sequence.compound.DNACompoundSet; 030import org.biojava.nbio.core.sequence.compound.NucleotideCompound; 031import org.biojava.nbio.core.sequence.compound.RNACompoundSet; 032import org.biojava.nbio.core.sequence.template.AbstractSequence; 033import org.slf4j.Logger; 034import org.slf4j.LoggerFactory; 035 036import java.io.File; 037import java.io.FileInputStream; 038import java.io.InputStream; 039import java.util.LinkedHashMap; 040 041/** 042 * 043 * @author Scooter Willis <willishf at gmail dot com> 044 */ 045public class GenbankReaderHelper { 046 047 private final static Logger logger = LoggerFactory.getLogger(GenbankReaderHelper.class); 048 049 /** 050 * Selecting lazySequenceLoad=true will parse the Genbank file and figure out the accessionid and offsets and return sequence objects 051 * that can in the future read the sequence from the disk. This allows the loading of large Genbank files where you are only interested 052 * in one sequence based on accession id. 053 * @param file 054 * @param lazySequenceLoad 055 * @return 056 * @throws Exception 057 */ 058 public static LinkedHashMap<String, DNASequence> readGenbankDNASequence(File file, boolean lazySequenceLoad) throws Exception { 059 if (!lazySequenceLoad) { 060 return readGenbankDNASequence(file); 061 } 062 063 GenbankReader<DNASequence, NucleotideCompound> GenbankProxyReader = 064 new GenbankReader<DNASequence, NucleotideCompound>( 065 file, 066 new GenericGenbankHeaderParser<DNASequence, NucleotideCompound>(), 067 new FileProxyDNASequenceCreator( 068 file, 069 DNACompoundSet.getDNACompoundSet(), 070 new GenbankSequenceParser<AbstractSequence<NucleotideCompound>, NucleotideCompound>() 071 ) 072 ); 073 return GenbankProxyReader.process(); 074 075 } 076 077 /** 078 * Selecting lazySequenceLoad=true will parse the Genbank file and figure out the accessionid and offsets and return sequence objects 079 * that can in the future read the sequence from the disk. This allows the loading of large Genbank files where you are only interested 080 * in one sequence based on accession id. 081 * @param file 082 * @param lazySequenceLoad 083 * @return 084 * @throws Exception 085 */ 086 public static LinkedHashMap<String, ProteinSequence> readGenbankProteinSequence(File file, boolean lazySequenceLoad) throws Exception { 087 if (!lazySequenceLoad) { 088 return readGenbankProteinSequence(file); 089 } 090 091 GenbankReader<ProteinSequence, AminoAcidCompound> GenbankProxyReader = 092 new GenbankReader<ProteinSequence, AminoAcidCompound>( 093 file, 094 new GenericGenbankHeaderParser<ProteinSequence, AminoAcidCompound>(), 095 new FileProxyProteinSequenceCreator( 096 file, 097 AminoAcidCompoundSet.getAminoAcidCompoundSet(), 098 new GenbankSequenceParser<AbstractSequence<AminoAcidCompound>, AminoAcidCompound>() 099 ) 100 ); 101 return GenbankProxyReader.process(); 102 103 } 104 105 /** 106 * Selecting lazySequenceLoad=true will parse the Genbank file and figure out the accessionid and offsets and return sequence objects 107 * that can in the future read the sequence from the disk. This allows the loading of large Genbank files where you are only interested 108 * in one sequence based on accession id. 109 * @param file 110 * @param lazySequenceLoad 111 * @return 112 * @throws Exception 113 */ 114 public static LinkedHashMap<String, RNASequence> readGenbankRNASequence(File file, boolean lazySequenceLoad) throws Exception { 115 if (!lazySequenceLoad) { 116 return readGenbankRNASequence(file); 117 } 118 119 GenbankReader<RNASequence, NucleotideCompound> GenbankProxyReader = 120 new GenbankReader<RNASequence, NucleotideCompound>( 121 file, 122 new GenericGenbankHeaderParser<RNASequence, NucleotideCompound>(), 123 new FileProxyRNASequenceCreator( 124 file, 125 RNACompoundSet.getRNACompoundSet(), 126 new GenbankSequenceParser<AbstractSequence<NucleotideCompound>, NucleotideCompound>() 127 ) 128 ); 129 return GenbankProxyReader.process(); 130 131 } 132 133 /** 134 * Read a Genbank file containing amino acids with setup that would handle most 135 * cases. 136 * 137 * @param file 138 * @return 139 * @throws Exception 140 */ 141 public static LinkedHashMap<String, ProteinSequence> readGenbankProteinSequence( 142 File file) throws Exception { 143 FileInputStream inStream = new FileInputStream(file); 144 LinkedHashMap<String, ProteinSequence> proteinSequences = readGenbankProteinSequence(inStream); 145 inStream.close(); 146 return proteinSequences; 147 } 148 149 /** 150 * Read a Genbank file containing amino acids with setup that would handle most 151 * cases. User is responsible for closing InputStream because you opened it 152 * 153 * @param inStream 154 * @return 155 * @throws Exception 156 */ 157 public static LinkedHashMap<String, ProteinSequence> readGenbankProteinSequence( 158 InputStream inStream) throws Exception { 159 GenbankReader<ProteinSequence, AminoAcidCompound> GenbankReader = new GenbankReader<ProteinSequence, AminoAcidCompound>( 160 inStream, 161 new GenericGenbankHeaderParser<ProteinSequence, AminoAcidCompound>(), 162 new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet())); 163 return GenbankReader.process(); 164 } 165 166 /** 167 * Read a Genbank DNA sequence 168 * @param inStream 169 * @return 170 * @throws Exception 171 */ 172 public static LinkedHashMap<String, DNASequence> readGenbankDNASequence( 173 InputStream inStream) throws Exception { 174 GenbankReader<DNASequence, NucleotideCompound> GenbankReader = new GenbankReader<DNASequence, NucleotideCompound>( 175 inStream, 176 new GenericGenbankHeaderParser<DNASequence, NucleotideCompound>(), 177 new DNASequenceCreator(DNACompoundSet.getDNACompoundSet())); 178 return GenbankReader.process(); 179 } 180 181 /** 182 * 183 * @param file 184 * @return 185 * @throws Exception 186 */ 187 public static LinkedHashMap<String, DNASequence> readGenbankDNASequence( 188 File file) throws Exception { 189 FileInputStream inStream = new FileInputStream(file); 190 LinkedHashMap<String, DNASequence> dnaSequences = readGenbankDNASequence(inStream); 191 inStream.close(); 192 return dnaSequences; 193 } 194 /** 195 * Read a Genbank RNA sequence 196 * @param inStream 197 * @return 198 * @throws Exception 199 */ 200 public static LinkedHashMap<String, RNASequence> readGenbankRNASequence( 201 InputStream inStream) throws Exception { 202 GenbankReader<RNASequence, NucleotideCompound> GenbankReader = new GenbankReader<RNASequence, NucleotideCompound>( 203 inStream, 204 new GenericGenbankHeaderParser<RNASequence, NucleotideCompound>(), 205 new RNASequenceCreator(RNACompoundSet.getRNACompoundSet())); 206 return GenbankReader.process(); 207 } 208 209 /** 210 * 211 * @param file 212 * @return 213 * @throws Exception 214 */ 215 public static LinkedHashMap<String, RNASequence> readGenbankRNASequence( 216 File file) throws Exception { 217 FileInputStream inStream = new FileInputStream(file); 218 LinkedHashMap<String, RNASequence> rnaSequences = readGenbankRNASequence(inStream); 219 inStream.close(); 220 return rnaSequences; 221 } 222 223 public static void main(String[] args) throws Exception { 224 225 LinkedHashMap<String, DNASequence> dnaSequences = GenbankReaderHelper.readGenbankDNASequence(new File("src/test/resources/NM_000266.gb"), true); 226 for (DNASequence sequence : dnaSequences.values()) { 227 logger.info("DNA Sequence: {}", sequence.getRNASequence().getProteinSequence().getSequenceAsString()); 228 } 229 230 LinkedHashMap<String, ProteinSequence> proteinSequences = GenbankReaderHelper.readGenbankProteinSequence(new File("src/test/resources/BondFeature.gb"), true); 231 for (ProteinSequence sequence : proteinSequences.values()) { 232 logger.info("Protein Sequence: {}", sequence.getSequenceAsString()); 233 } 234 } 235}