/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * @author Scooter Willis <willishf at gmail dot com>
 * @author Karl Nicholas <github:karlnicholas>
 * @author Paolo Pavan
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * Created on 01-21-2010
 */
package org.biojava.nbio.core.sequence.io;

import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.DNASequence;
import org.biojava.nbio.core.sequence.DataSource;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.TaxonomyID;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
import org.biojava.nbio.core.sequence.features.AbstractFeature;
import org.biojava.nbio.core.sequence.features.DBReferenceInfo;
import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface;
import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.biojava.nbio.core.sequence.template.Compound;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;

/**
 * Reads Genbank records from a file or stream and turns each record into a
 * sequence object keyed by its accession ID.
 * <p>
 * Use GenbankReaderHelper as an example of how to use this class, where
 * GenbankReaderHelper should be the primary class used to read Genbank files.
 *
 * @param <S> the sequence type produced by the reader
 * @param <C> the compound type of the produced sequences
 */
public class GenbankReader<S extends AbstractSequence<C>, C extends Compound> {

    private SequenceCreatorInterface<C> sequenceCreator;
    private GenbankSequenceParser<S,C> genbankParser;
    private BufferedReader bufferedReader;
    // true only after close() has successfully closed the underlying reader
    private boolean closed;
    private final Logger logger = LoggerFactory.getLogger(this.getClass());

    /**
     * Reports whether the underlying reader has been closed by {@link #close()}.
     *
     * @return <code>true</code> if the last call to {@link #close()} succeeded,
     * <code>false</code> otherwise
     */
    public boolean isClosed() {
        return closed;
    }

    /**
     * If you are going to use FileProxyProteinSequenceCreator then do not use this constructor because we need details about
     * local file offsets for quick reads. InputStreams do not give you the name of the stream to access quickly via file seek. A seek in
     * an InputStream is forced to read all the data, so you don't gain anything.
     * <p>
     * The stream is decoded with the platform default charset.
     *
     * @param is the stream to read Genbank records from
     * @param headerParser accepted for API compatibility; this constructor does not
     * store or use it (headers are parsed with the parser supplied by the
     * internal {@link GenbankSequenceParser})
     * @param sequenceCreator factory used to build a sequence from each record's sequence string
     */
    public GenbankReader(final InputStream is, final SequenceHeaderParserInterface<S,C> headerParser,
            final SequenceCreatorInterface<C> sequenceCreator) {
        this.sequenceCreator = sequenceCreator;
        this.bufferedReader = new BufferedReader(new InputStreamReader(is));
        this.genbankParser = new GenbankSequenceParser<>();
        this.closed = false;
    }

    /**
     * If you are going to use the FileProxyProteinSequenceCreator then you
     * need to use this constructor because we need details about
     * the location of the file.
     * <p>
     * The file is decoded with the platform default charset.
     *
     * @param file the file to read Genbank records from
     * @param headerParser accepted for API compatibility; this constructor does not
     * store or use it (headers are parsed with the parser supplied by the
     * internal {@link GenbankSequenceParser})
     * @param sequenceCreator factory used to build a sequence from each record's sequence string
     * @throws FileNotFoundException if the file does not exist, is a directory
     * rather than a regular file, or for some other reason cannot be opened
     * for reading.
     * @throws SecurityException if a security manager exists and its checkRead
     * method denies read access to the file.
     */
    public GenbankReader(
            final File file,
            final SequenceHeaderParserInterface<S,C> headerParser,
            final SequenceCreatorInterface<C> sequenceCreator
            ) throws FileNotFoundException {

        this.bufferedReader = new BufferedReader(new FileReader(file));
        this.sequenceCreator = sequenceCreator;
        this.genbankParser = new GenbankSequenceParser<>();
        this.closed = false;
    }

    /**
     * The parsing is done in this method.<br>
     * This method tries to process all the available Genbank records
     * in the File or InputStream, closes the underlying resource,
     * and returns the results in a {@link LinkedHashMap}.<br>
     * You don't need to call {@link #close()} after calling this method.
     * @see #process(int)
     * @return {@link LinkedHashMap} containing all the parsed Genbank records
     * present, starting current fileIndex onwards, keyed by accession ID.
     * @throws IOException if reading from the underlying resource fails
     * @throws CompoundNotFoundException if a record contains a compound the sequence creator does not know
     */
    public LinkedHashMap<String,S> process() throws IOException, CompoundNotFoundException {
        return process(-1);
    }

    /**
     * This method tries to parse maximum <code>max</code> records from
     * the open File or InputStream, and leaves the underlying resource open.<br>
     *
     * Subsequent calls to the same method continue parsing the rest of the file.<br>
     * This is particularly useful when dealing with very big data files,
     * (e.g. NCBI nr database), which can't fit into memory and will take long
     * time before the first result is available.<br>
     * <b>N.B.</b>
     * <ul>
     * <li>This method can't be called after calling its NO-ARGUMENT twin.</li>
     * <li>remember to close the underlying resource when you are done.</li>
     * <li>A negative <code>max</code> reads every remaining record and then closes
     * the underlying resource, even if parsing fails part-way.</li>
     * <li><code>max == 0</code> also reads every remaining record but leaves the
     * resource open.</li>
     * </ul>
     * @see #process()
     * @author Amr AL-Hossary
     * @since 3.0.6
     * @param max maximum number of records to return, <code>-1</code> for infinity.
     * @return {@link LinkedHashMap} containing maximum <code>max</code> parsed Genbank records
     * present, starting current fileIndex onwards, keyed by accession ID. Records
     * sharing an accession ID overwrite each other.
     * @throws IOException if reading from the underlying resource fails
     * @throws CompoundNotFoundException if a record contains a compound the sequence creator does not know
     */
    public LinkedHashMap<String,S> process(final int max) throws IOException, CompoundNotFoundException {
        LinkedHashMap<String,S> sequences = new LinkedHashMap<>();
        try {
            // A non-positive max means "no limit"; the loop then ends only at end of input.
            for (int parsed = 0; max <= 0 || parsed < max; parsed++) {
                String seqString = genbankParser.getSequence(bufferedReader, 0);
                // reached end of file?
                if (seqString == null) {
                    break;
                }
                @SuppressWarnings("unchecked")
                S sequence = (S) sequenceCreator.getSequence(seqString, 0);
                genbankParser.getSequenceHeaderParser().parseHeader(genbankParser.getHeader(), sequence);

                // add features to new sequence
                for (String k : genbankParser.getFeatures().keySet()) {
                    for (AbstractFeature f : genbankParser.getFeatures(k)) {
                        // The feature locations are not re-pointed at the new sequence here.
                        sequence.addFeature(f);
                    }
                }

                // add taxonomy ID to new sequence, taken from the first db_xref entry
                ArrayList<DBReferenceInfo> dbQualifier = genbankParser.getDatabaseReferences().get("db_xref");
                if (dbQualifier != null && !dbQualifier.isEmpty()) {
                    DBReferenceInfo q = dbQualifier.get(0);
                    sequence.setTaxonomy(new TaxonomyID(q.getDatabase() + ":" + q.getId(), DataSource.GENBANK));
                }

                sequences.put(sequence.getAccession().getID(), sequence);
            }
        } finally {
            // "Read everything" mode owns the resource: release it even when parsing throws,
            // since callers of process() are told they do not need to call close().
            if (max < 0) {
                close();
            }
        }

        return sequences;
    }

    /**
     * Closes the underlying reader. A failure to close is logged rather than
     * thrown, and leaves {@link #isClosed()} returning <code>false</code>.
     */
    public void close() {
        try {
            bufferedReader.close();
            this.closed = true;
        } catch (IOException e) {
            // Trailing Throwable keeps the stack trace in the log; the message format is unchanged.
            logger.error("Couldn't close the reader. {}", e.getMessage(), e);
            this.closed = false;
        }
    }

    /**
     * Small manual demo: parses three Genbank test resources (one protein, two DNA)
     * relative to the working directory and prints the resulting maps.
     *
     * @param args ignored
     * @throws Exception if a file cannot be opened or parsed
     */
    public static void main(String[] args) throws Exception {
        String proteinFile = "src/test/resources/BondFeature.gb";
        try (FileInputStream is = new FileInputStream(proteinFile)) {
            GenbankReader<ProteinSequence, AminoAcidCompound> proteinReader = new GenbankReader<>(is, new GenericGenbankHeaderParser<>(), new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));
            LinkedHashMap<String,ProteinSequence> proteinSequences = proteinReader.process();
            System.out.println(proteinSequences);
        }

        String inputFile = "src/test/resources/NM_000266.gb";
        try (FileInputStream is = new FileInputStream(inputFile)) {
            GenbankReader<DNASequence, NucleotideCompound> dnaReader = new GenbankReader<>(is, new GenericGenbankHeaderParser<>(), new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()));
            LinkedHashMap<String,DNASequence> dnaSequences = dnaReader.process();
            System.out.println(dnaSequences);
        }

        String crazyFile = "src/test/resources/CraftedFeature.gb";
        try (FileInputStream is = new FileInputStream(crazyFile)) {
            GenbankReader<DNASequence, NucleotideCompound> crazyReader = new GenbankReader<>(is, new GenericGenbankHeaderParser<>(), new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()));
            LinkedHashMap<String,DNASequence> crazyAnnotatedSequences = crazyReader.process();
            System.out.println(crazyAnnotatedSequences);
        }
    }

}