001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on 01-21-2010 021 */ 022package org.biojava.nbio.core.sequence.io; 023 024import org.biojava.nbio.core.sequence.DNASequence; 025import org.biojava.nbio.core.sequence.ProteinSequence; 026import org.biojava.nbio.core.sequence.RNASequence; 027import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 028import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; 029import org.biojava.nbio.core.sequence.compound.DNACompoundSet; 030import org.biojava.nbio.core.sequence.compound.NucleotideCompound; 031import org.biojava.nbio.core.sequence.compound.RNACompoundSet; 032import org.biojava.nbio.core.sequence.template.AbstractSequence; 033import org.slf4j.Logger; 034import org.slf4j.LoggerFactory; 035 036import java.io.File; 037import java.io.FileInputStream; 038import java.io.InputStream; 039import java.util.LinkedHashMap; 040import java.util.Map; 041 042/** 043 * 044 * @author Scooter Willis 045 */ 046public class GenbankReaderHelper { 047 048 private final static Logger logger = LoggerFactory.getLogger(GenbankReaderHelper.class); 049 050 /** 051 * Selecting lazySequenceLoad=true will parse the Genbank file and figure out the accessionid and offsets and return sequence objects 052 * that can in the future read the sequence from the disk. This allows the loading of large Genbank files where you are only interested 053 * in one sequence based on accession id. 054 * @param file 055 * @param lazySequenceLoad 056 * @return 057 * @throws Exception 058 */ 059 public static Map<String, DNASequence> readGenbankDNASequence(File file, boolean lazySequenceLoad) throws Exception { 060 if (!lazySequenceLoad) { 061 return readGenbankDNASequence(file); 062 } 063 064 GenbankReader<DNASequence, NucleotideCompound> GenbankProxyReader = 065 new GenbankReader<>( 066 file, 067 new GenericGenbankHeaderParser<DNASequence, NucleotideCompound>(), 068 new FileProxyDNASequenceCreator( 069 file, 070 DNACompoundSet.getDNACompoundSet(), 071 new GenbankSequenceParser<AbstractSequence<NucleotideCompound>, NucleotideCompound>() 072 ) 073 ); 074 return GenbankProxyReader.process(); 075 076 } 077 078 /** 079 * Selecting lazySequenceLoad=true will parse the Genbank file and figure out the accessionid and offsets and return sequence objects 080 * that can in the future read the sequence from the disk. This allows the loading of large Genbank files where you are only interested 081 * in one sequence based on accession id. 082 * @param file 083 * @param lazySequenceLoad 084 * @return 085 * @throws Exception 086 */ 087 public static Map<String, ProteinSequence> readGenbankProteinSequence(File file, boolean lazySequenceLoad) throws Exception { 088 if (!lazySequenceLoad) { 089 return readGenbankProteinSequence(file); 090 } 091 092 GenbankReader<ProteinSequence, AminoAcidCompound> GenbankProxyReader = 093 new GenbankReader<>( 094 file, 095 new GenericGenbankHeaderParser<ProteinSequence, AminoAcidCompound>(), 096 new FileProxyProteinSequenceCreator( 097 file, 098 AminoAcidCompoundSet.getAminoAcidCompoundSet(), 099 new GenbankSequenceParser<AbstractSequence<AminoAcidCompound>, AminoAcidCompound>() 100 ) 101 ); 102 return GenbankProxyReader.process(); 103 104 } 105 106 /** 107 * Selecting lazySequenceLoad=true will parse the Genbank file and figure out the accessionid and offsets and return sequence objects 108 * that can in the future read the sequence from the disk. This allows the loading of large Genbank files where you are only interested 109 * in one sequence based on accession id. 110 * @param file 111 * @param lazySequenceLoad 112 * @return 113 * @throws Exception 114 */ 115 public static Map<String, RNASequence> readGenbankRNASequence(File file, boolean lazySequenceLoad) throws Exception { 116 if (!lazySequenceLoad) { 117 return readGenbankRNASequence(file); 118 } 119 120 GenbankReader<RNASequence, NucleotideCompound> GenbankProxyReader = 121 new GenbankReader<>( 122 file, 123 new GenericGenbankHeaderParser<RNASequence, NucleotideCompound>(), 124 new FileProxyRNASequenceCreator( 125 file, 126 RNACompoundSet.getRNACompoundSet(), 127 new GenbankSequenceParser<AbstractSequence<NucleotideCompound>, NucleotideCompound>() 128 ) 129 ); 130 return GenbankProxyReader.process(); 131 132 } 133 134 /** 135 * Read a Genbank file containing amino acids with setup that would handle most 136 * cases. 137 * 138 * @param file 139 * @return 140 * @throws Exception 141 */ 142 public static Map<String, ProteinSequence> readGenbankProteinSequence( 143 File file) throws Exception { 144 FileInputStream inStream = new FileInputStream(file); 145 Map<String, ProteinSequence> proteinSequences = readGenbankProteinSequence(inStream); 146 inStream.close(); 147 return proteinSequences; 148 } 149 150 /** 151 * Read a Genbank file containing amino acids with setup that would handle most 152 * cases. User is responsible for closing InputStream because you opened it 153 * 154 * @param inStream 155 * @return 156 * @throws Exception 157 */ 158 public static Map<String, ProteinSequence> readGenbankProteinSequence( 159 InputStream inStream) throws Exception { 160 GenbankReader<ProteinSequence, AminoAcidCompound> GenbankReader = new GenbankReader<>( 161 inStream, 162 new GenericGenbankHeaderParser<ProteinSequence, AminoAcidCompound>(), 163 new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet())); 164 return GenbankReader.process(); 165 } 166 167 /** 168 * Read a Genbank DNA sequence 169 * @param inStream 170 * @return 171 * @throws Exception 172 */ 173 public static Map<String, DNASequence> readGenbankDNASequence( 174 InputStream inStream) throws Exception { 175 GenbankReader<DNASequence, NucleotideCompound> GenbankReader = new GenbankReader<>( 176 inStream, 177 new GenericGenbankHeaderParser<DNASequence, NucleotideCompound>(), 178 new DNASequenceCreator(DNACompoundSet.getDNACompoundSet())); 179 return GenbankReader.process(); 180 } 181 182 /** 183 * 184 * @param file 185 * @return 186 * @throws Exception 187 */ 188 public static Map<String, DNASequence> readGenbankDNASequence( 189 File file) throws Exception { 190 FileInputStream inStream = new FileInputStream(file); 191 Map<String, DNASequence> dnaSequences = readGenbankDNASequence(inStream); 192 inStream.close(); 193 return dnaSequences; 194 } 195 /** 196 * Read a Genbank RNA sequence 197 * @param inStream 198 * @return 199 * @throws Exception 200 */ 201 public static Map<String, RNASequence> readGenbankRNASequence( 202 InputStream inStream) throws Exception { 203 GenbankReader<RNASequence, NucleotideCompound> GenbankReader = new GenbankReader<>( 204 inStream, 205 new GenericGenbankHeaderParser<RNASequence, NucleotideCompound>(), 206 new RNASequenceCreator(RNACompoundSet.getRNACompoundSet())); 207 return GenbankReader.process(); 208 } 209 210 /** 211 * 212 * @param file 213 * @return 214 * @throws Exception 215 */ 216 public static Map<String, RNASequence> readGenbankRNASequence( 217 File file) throws Exception { 218 FileInputStream inStream = new FileInputStream(file); 219 Map<String, RNASequence> rnaSequences = readGenbankRNASequence(inStream); 220 inStream.close(); 221 return rnaSequences; 222 } 223 224}