/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * @author Scooter Willis <willishf at gmail dot com>
 * @author Karl Nicholas <github:karlnicholas>
 * @author Paolo Pavan
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * Created on 01-21-2010
 */
package org.biojava.nbio.core.sequence.io;

import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.DNASequence;
import org.biojava.nbio.core.sequence.DataSource;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.TaxonomyID;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
import org.biojava.nbio.core.sequence.features.AbstractFeature;
import org.biojava.nbio.core.sequence.features.DBReferenceInfo;
import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface;
import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.biojava.nbio.core.sequence.template.Compound;

import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;

/**
 * Reads Genbank records from a file or stream and turns each one into a sequence object.
 * <p>
 * Use GenbankReaderHelper as an example of how to use this class, where GenbankReaderHelper
 * should be the primary class used to read Genbank files.
 * <p>
 * A reader instance wraps exactly one underlying stream. Records are consumed in order, either
 * all at once with {@link #process()} or in chunks with {@link #process(int)}.
 *
 * @param <S> the sequence type produced by this reader
 * @param <C> the compound type of the produced sequences
 */
public class GenbankReader<S extends AbstractSequence<C>, C extends Compound> {

    private final SequenceCreatorInterface<C> sequenceCreator;
    private final GenbankSequenceParser<S,C> genbankParser;
    // One reader for the whole lifetime of this object. BufferedReader reads ahead, so creating
    // a fresh one per process(int) call would drop whatever the previous one had buffered.
    private final BufferedReader bufferedReader;

    /**
     * If you are going to use FileProxyProteinSequenceCreator then do not use this constructor because we need details about
     * local file offsets for quick reads. InputStreams does not give you the name of the stream to access quickly via file seek. A seek in
     * an inputstream is forced to read all the data so you don't gain anything.
     *
     * @param is the stream to read Genbank records from; it is decoded with the platform default charset
     * @param headerParser accepted for API compatibility but not consulted by this class; headers are
     *        parsed with the header parser supplied by the internal {@link GenbankSequenceParser}
     * @param sequenceCreator factory used to build a sequence object from each record's sequence string
     */
    public GenbankReader(InputStream is, SequenceHeaderParserInterface<S,C> headerParser, SequenceCreatorInterface<C> sequenceCreator) {
        this.sequenceCreator = sequenceCreator;
        this.bufferedReader = new BufferedReader(new InputStreamReader(is));
        this.genbankParser = new GenbankSequenceParser<S,C>();
    }

    /**
     * If you are going to use the FileProxyProteinSequenceCreator then you
     * need to use this constructor because we need details about
     * the location of the file.
     *
     * @param file the Genbank file to read
     * @param headerParser accepted for API compatibility but not consulted by this class; headers are
     *        parsed with the header parser supplied by the internal {@link GenbankSequenceParser}
     * @param sequenceCreator factory used to build a sequence object from each record's sequence string
     * @throws FileNotFoundException if the file does not exist, is a directory
     *         rather than a regular file, or for some other reason cannot be opened
     *         for reading.
     * @throws SecurityException if a security manager exists and its checkRead
     *         method denies read access to the file.
     */
    public GenbankReader(
            File file,
            SequenceHeaderParserInterface<S,C> headerParser,
            SequenceCreatorInterface<C> sequenceCreator
            ) throws FileNotFoundException {
        this(new FileInputStream(file), headerParser, sequenceCreator);
    }

    /**
     * The parsing is done in this method.<br>
     * This method tries to process all the available Genbank records
     * in the File or InputStream, closes the underlying resource,
     * and returns the results in a {@link LinkedHashMap}.<br>
     * You don't need to call {@link #close()} after calling this method.
     *
     * @see #process(int)
     * @return {@link LinkedHashMap} (an insertion-ordered {@link HashMap}) containing all the
     *         parsed Genbank records, keyed by accession ID, starting from the current position
     *         in the stream onwards.
     * @throws IOException if reading from or closing the underlying resource fails
     * @throws CompoundNotFoundException propagated from sequence creation or header parsing
     */
    public LinkedHashMap<String,S> process() throws IOException, CompoundNotFoundException {
        return process(-1);
    }

    /**
     * This method tries to parse at most <code>max</code> records from
     * the open File or InputStream, and leaves the underlying resource open.<br>
     *
     * Subsequent calls to the same method continue parsing the rest of the file.<br>
     * This is particularly useful when dealing with very big data files,
     * (e.g. NCBI nr database), which can't fit into memory and will take long
     * time before the first result is available.<br>
     * <b>N.B.</b>
     * <ul>
     * <li>This method can't be called after calling its NO-ARGUMENT twin,
     * because that call closes the underlying resource.</li>
     * <li>When <code>max</code> is positive, remember to close the underlying resource
     * with {@link #close()} when you are done.</li>
     * <li>When <code>max</code> is zero or negative, everything up to the end of the input is
     * read and the underlying resource is closed before returning, even if parsing fails.</li>
     * </ul>
     * Records sharing the same accession ID overwrite each other in the returned map.
     *
     * @see #process()
     * @author Amr AL-Hossary
     * @since 3.0.6
     * @param max maximum number of records to return, <code>-1</code> for infinity.
     * @return {@link LinkedHashMap} containing at most <code>max</code> parsed Genbank records,
     *         keyed by accession ID, starting from the current position in the stream onwards.
     * @throws IOException if reading from or closing the underlying resource fails
     * @throws CompoundNotFoundException propagated from sequence creation or header parsing
     */
    public LinkedHashMap<String,S> process(int max) throws IOException, CompoundNotFoundException {
        LinkedHashMap<String,S> sequences = new LinkedHashMap<String,S>();
        boolean unbounded = max <= 0;
        boolean completed = false;
        try {
            int parsed = 0;
            while (unbounded || parsed < max) {
                String seqString = genbankParser.getSequence(bufferedReader, 0);
                // reached end of file?
                if (seqString == null) {
                    break;
                }
                parsed++;
                S sequence = buildSequence(seqString);
                sequences.put(sequence.getAccession().getID(), sequence);
            }
            completed = true;
        } finally {
            if (unbounded) {
                if (completed) {
                    close();
                } else {
                    // A parse/read failure is already propagating; don't let close() mask it.
                    closeQuietly();
                }
            }
        }
        return sequences;
    }

    /**
     * Builds one sequence from the record most recently read by the parser and decorates it
     * with the header, features and taxonomy information the parser collected for that record.
     */
    private S buildSequence(String seqString) throws CompoundNotFoundException {
        // The creator is only typed by compound; the caller chose S to match it.
        @SuppressWarnings("unchecked")
        S sequence = (S) sequenceCreator.getSequence(seqString, 0);
        genbankParser.getSequenceHeaderParser().parseHeader(genbankParser.getHeader(), sequence);

        // add features to new sequence
        for (String k : genbankParser.getFeatures().keySet()) {
            for (AbstractFeature f : genbankParser.getFeatures(k)) {
                //f.getLocations().setSequence(sequence);  // can't set proper sequence source to features. It is actually needed? Don't think so...
                sequence.addFeature(f);
            }
        }

        // add taxonomy ID to new sequence (first db_xref entry, when there is one)
        ArrayList<DBReferenceInfo> dbQualifier = genbankParser.getDatabaseReferences().get("db_xref");
        if (dbQualifier != null && !dbQualifier.isEmpty()) {
            DBReferenceInfo q = dbQualifier.get(0);
            sequence.setTaxonomy(new TaxonomyID(q.getDatabase() + ":" + q.getId(), DataSource.GENBANK));
        }
        return sequence;
    }

    /**
     * Closes the reader and, through it, the underlying File or InputStream.
     * Calling this on an already closed reader has no effect.
     *
     * @throws IOException if closing the underlying resource fails
     */
    public void close() throws IOException {
        bufferedReader.close();
    }

    /** Closes the underlying resource, discarding any failure to do so. */
    private void closeQuietly() {
        try {
            close();
        } catch (IOException ignored) {
            // Only used while another exception is already propagating; that one matters more.
        }
    }

    /**
     * Small manual demo: parses three Genbank test resources and prints the resulting maps.
     *
     * @param args unused
     * @throws Exception if any of the files cannot be opened or parsed
     */
    public static void main(String[] args) throws Exception {
        String proteinFile = "src/test/resources/BondFeature.gb";
        FileInputStream is = new FileInputStream(proteinFile);

        GenbankReader<ProteinSequence, AminoAcidCompound> proteinReader = new GenbankReader<ProteinSequence, AminoAcidCompound>(is, new GenericGenbankHeaderParser<ProteinSequence,AminoAcidCompound>(), new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));
        LinkedHashMap<String,ProteinSequence> proteinSequences = proteinReader.process();
        System.out.println(proteinSequences);

        String inputFile = "src/test/resources/NM_000266.gb";
        is = new FileInputStream(inputFile);
        GenbankReader<DNASequence, NucleotideCompound> dnaReader = new GenbankReader<DNASequence, NucleotideCompound>(is, new GenericGenbankHeaderParser<DNASequence,NucleotideCompound>(), new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()));
        LinkedHashMap<String,DNASequence> dnaSequences = dnaReader.process();
        System.out.println(dnaSequences);

        String crazyFile = "src/test/resources/CraftedFeature.gb";
        is = new FileInputStream(crazyFile);
        GenbankReader<DNASequence, NucleotideCompound> crazyReader = new GenbankReader<DNASequence, NucleotideCompound>(is, new GenericGenbankHeaderParser<DNASequence,NucleotideCompound>(), new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()));
        LinkedHashMap<String,DNASequence> crazyAnnotatedSequences = crazyReader.process();

        // process() has already closed each stream; this extra close is a harmless no-op.
        is.close();
        System.out.println(crazyAnnotatedSequences);
    }

}