/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * Created on 01-21-2010
 */
package org.biojava.nbio.core.sequence.io;

import org.biojava.nbio.core.sequence.DNASequence;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.LinkedHashMap;

/**
 * Static convenience methods for reading DNA and protein sequences from
 * Genbank files or streams using a default {@link GenbankReader} setup.
 *
 * @author Scooter Willis <willishf at gmail dot com>
 */
public class GenbankReaderHelper {

	private final static Logger logger = LoggerFactory.getLogger(GenbankReaderHelper.class);

	/**
	 * Read DNA sequences from a Genbank file, optionally deferring the loading of
	 * the sequence data.
	 * <p>
	 * Selecting lazySequenceLoad=true will parse the Genbank file and figure out the
	 * accession id and offsets and return sequence objects that can in the future read
	 * the sequence from the disk. This allows the loading of large Genbank files where
	 * you are only interested in one sequence based on accession id.
	 * With lazySequenceLoad=false this simply delegates to
	 * {@link #readGenbankDNASequence(File)}.
	 *
	 * @param file the Genbank file to read
	 * @param lazySequenceLoad true to build file-proxy backed sequences, false to read eagerly
	 * @return the sequences returned by {@link GenbankReader#process()}, in insertion order
	 * @throws Exception if the file cannot be opened or parsed
	 */
	public static LinkedHashMap<String, DNASequence> readGenbankDNASequence(File file, boolean lazySequenceLoad) throws Exception {
		if (!lazySequenceLoad) {
			return readGenbankDNASequence(file);
		}

		// The proxy creator is handed the file itself so that the created sequences
		// can go back to it later instead of holding the residues in memory.
		GenbankReader<DNASequence, NucleotideCompound> genbankProxyReader =
				new GenbankReader<DNASequence, NucleotideCompound>(
						file,
						new GenericGenbankHeaderParser<DNASequence, NucleotideCompound>(),
						new FileProxyDNASequenceCreator(
								file,
								DNACompoundSet.getDNACompoundSet(),
								new GenbankSequenceParser<AbstractSequence<NucleotideCompound>, NucleotideCompound>()
						)
				);
		return genbankProxyReader.process();
	}

	/**
	 * Read protein sequences from a Genbank file, optionally deferring the loading of
	 * the sequence data.
	 * <p>
	 * Selecting lazySequenceLoad=true will parse the Genbank file and figure out the
	 * accession id and offsets and return sequence objects that can in the future read
	 * the sequence from the disk. This allows the loading of large Genbank files where
	 * you are only interested in one sequence based on accession id.
	 * With lazySequenceLoad=false this simply delegates to
	 * {@link #readGenbankProteinSequence(File)}.
	 *
	 * @param file the Genbank file to read
	 * @param lazySequenceLoad true to build file-proxy backed sequences, false to read eagerly
	 * @return the sequences returned by {@link GenbankReader#process()}, in insertion order
	 * @throws Exception if the file cannot be opened or parsed
	 */
	public static LinkedHashMap<String, ProteinSequence> readGenbankProteinSequence(File file, boolean lazySequenceLoad) throws Exception {
		if (!lazySequenceLoad) {
			return readGenbankProteinSequence(file);
		}

		// Same arrangement as the DNA variant, using the amino acid compound set.
		GenbankReader<ProteinSequence, AminoAcidCompound> genbankProxyReader =
				new GenbankReader<ProteinSequence, AminoAcidCompound>(
						file,
						new GenericGenbankHeaderParser<ProteinSequence, AminoAcidCompound>(),
						new FileProxyProteinSequenceCreator(
								file,
								AminoAcidCompoundSet.getAminoAcidCompoundSet(),
								new GenbankSequenceParser<AbstractSequence<AminoAcidCompound>, AminoAcidCompound>()
						)
				);
		return genbankProxyReader.process();
	}

	/**
	 * Read a Genbank file containing amino acids with setup that would handle most
	 * cases. The file is opened here and is always closed again before this method
	 * returns, whether or not parsing succeeds.
	 *
	 * @param file the Genbank file to read
	 * @return the sequences returned by {@link #readGenbankProteinSequence(InputStream)}
	 * @throws Exception if the file cannot be opened, parsed or closed
	 */
	public static LinkedHashMap<String, ProteinSequence> readGenbankProteinSequence(
			File file) throws Exception {
		FileInputStream inStream = new FileInputStream(file);
		try {
			return readGenbankProteinSequence(inStream);
		} finally {
			// Release the file handle even when parsing throws.
			inStream.close();
		}
	}

	/**
	 * Read a Genbank file containing amino acids with setup that would handle most
	 * cases. The caller opened the stream and is responsible for closing it.
	 *
	 * @param inStream stream positioned at the start of the Genbank data
	 * @return the sequences returned by {@link GenbankReader#process()}, in insertion order
	 * @throws Exception if the data cannot be read or parsed
	 */
	public static LinkedHashMap<String, ProteinSequence> readGenbankProteinSequence(
			InputStream inStream) throws Exception {
		GenbankReader<ProteinSequence, AminoAcidCompound> genbankReader = new GenbankReader<ProteinSequence, AminoAcidCompound>(
				inStream,
				new GenericGenbankHeaderParser<ProteinSequence, AminoAcidCompound>(),
				new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));
		return genbankReader.process();
	}

	/**
	 * Read Genbank DNA sequences from a stream. This method does not close the
	 * stream itself; that is left to the caller.
	 *
	 * @param inStream stream positioned at the start of the Genbank data
	 * @return the sequences returned by {@link GenbankReader#process()}, in insertion order
	 * @throws Exception if the data cannot be read or parsed
	 */
	public static LinkedHashMap<String, DNASequence> readGenbankDNASequence(
			InputStream inStream) throws Exception {
		GenbankReader<DNASequence, NucleotideCompound> genbankReader = new GenbankReader<DNASequence, NucleotideCompound>(
				inStream,
				new GenericGenbankHeaderParser<DNASequence, NucleotideCompound>(),
				new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()));
		return genbankReader.process();
	}

	/**
	 * Read Genbank DNA sequences from a file. The file is opened here and is always
	 * closed again before this method returns, whether or not parsing succeeds.
	 *
	 * @param file the Genbank file to read
	 * @return the sequences returned by {@link #readGenbankDNASequence(InputStream)}
	 * @throws Exception if the file cannot be opened, parsed or closed
	 */
	public static LinkedHashMap<String, DNASequence> readGenbankDNASequence(
			File file) throws Exception {
		FileInputStream inStream = new FileInputStream(file);
		try {
			return readGenbankDNASequence(inStream);
		} finally {
			// Release the file handle even when parsing throws.
			inStream.close();
		}
	}

	/**
	 * Small manual demo: lazily reads two Genbank files given by relative paths and
	 * logs the resulting protein sequences.
	 *
	 * @param args ignored
	 * @throws Exception if either file cannot be read or parsed
	 */
	public static void main(String[] args) throws Exception {

		LinkedHashMap<String, DNASequence> dnaSequences = GenbankReaderHelper.readGenbankDNASequence(new File("src/test/resources/NM_000266.gb"), true);
		for (DNASequence sequence : dnaSequences.values()) {
			logger.info("DNA Sequence: {}", sequence.getRNASequence().getProteinSequence().getSequenceAsString());
		}

		LinkedHashMap<String, ProteinSequence> proteinSequences = GenbankReaderHelper.readGenbankProteinSequence(new File("src/test/resources/BondFeature.gb"), true);
		for (ProteinSequence sequence : proteinSequences.values()) {
			logger.info("Protein Sequence: {}", sequence.getSequenceAsString());
		}
	}
}