001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * @author Scooter Willis ;lt;willishf at gmail dot com> 015 * @author Karl Nicholas <github:karlnicholas> 016 * @author Paolo Pavan 017 * 018 * For more information on the BioJava project and its aims, 019 * or to join the biojava-l mailing list, visit the home page 020 * at: 021 * 022 * http://www.biojava.org/ 023 * 024 * Created on 01-21-2010 025 */ 026package org.biojava.nbio.core.sequence.io; 027 028import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 029import org.biojava.nbio.core.sequence.AccessionID; 030import org.biojava.nbio.core.sequence.DataSource; 031import org.biojava.nbio.core.sequence.TaxonomyID; 032import org.biojava.nbio.core.sequence.features.DBReferenceInfo; 033import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface; 034import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface; 035import org.biojava.nbio.core.sequence.template.AbstractSequence; 036import org.biojava.nbio.core.sequence.template.Compound; 037import org.slf4j.Logger; 038import org.slf4j.LoggerFactory; 039 040import java.io.BufferedReader; 041import java.io.File; 042import java.io.FileNotFoundException; 043import java.io.FileReader; 044import java.io.IOException; 045import java.io.InputStream; 046import java.io.InputStreamReader; 047import java.util.ArrayList; 048import java.util.HashMap; 049import java.util.LinkedHashMap; 050import java.util.List; 051import java.util.Map; 052 053/** 054 * Use {@link GenbankReaderHelper} as an example of how to use this class where {@link GenbankReaderHelper} should be the 055 * primary class used to read Genbank files 056 * @param <S> the sequence type 057 * @param <C> the compound type 058 */ 059public class GenbankReader<S extends AbstractSequence<C>, C extends Compound> { 060 061 private SequenceCreatorInterface<C> sequenceCreator; 062 private GenbankSequenceParser<S,C> genbankParser; 063 private BufferedReader bufferedReader; 064 private boolean closed; 065 private final Logger logger = LoggerFactory.getLogger(this.getClass()); 066 067 public boolean isClosed() { 068 return closed; 069 } 070 071 /** 072 * If you are going to use {@link FileProxyProteinSequenceCreator} then do not use this constructor because we need details about 073 * local file offsets for quick reads. {@link InputStream} does not give you the name of the stream to access quickly via file seek. A seek in 074 * an {@link InputStream} is forced to read all the data so you don't gain anything. 075 * @param is 076 * @param headerParser 077 * @param sequenceCreator 078 */ 079 public GenbankReader(final InputStream is, final SequenceHeaderParserInterface<S,C> headerParser, 080 final SequenceCreatorInterface<C> sequenceCreator) { 081 this.sequenceCreator = sequenceCreator; 082 bufferedReader = new BufferedReader(new InputStreamReader(is)); 083 genbankParser = new GenbankSequenceParser<>(); 084 closed = false; 085 } 086 087 /** 088 * If you are going to use the FileProxyProteinSequenceCreator then you 089 * need to use this constructor because we need details about 090 * the location of the file. 091 * @param file 092 * @param headerParser 093 * @param sequenceCreator 094 * @throws FileNotFoundException if the file does not exist, is a directory 095 * rather than a regular file, or for some other reason cannot be opened 096 * for reading. 097 * @throws SecurityException if a security manager exists and its checkRead 098 * method denies read access to the file. 099 */ 100 public GenbankReader( 101 final File file, 102 final SequenceHeaderParserInterface<S,C> headerParser, 103 final SequenceCreatorInterface<C> sequenceCreator 104 ) throws FileNotFoundException { 105 106 this.bufferedReader = new BufferedReader(new FileReader(file)); 107 this.sequenceCreator = sequenceCreator; 108 genbankParser = new GenbankSequenceParser<>(); 109 } 110 111 /** 112 * The parsing is done in this method.<br> 113 * This method will return all the available Genbank records 114 * in the File or InputStream, closes the underlying resource, 115 * and return the results in {@link LinkedHashMap}.<br> 116 * You don't need to call {@link GenbankReader#close()} after calling this method. 117 * @see #process(int) 118 * @return {@link HashMap} containing all the parsed Genbank records 119 * present, starting current fileIndex onwards. 120 * @throws IOException 121 * @throws CompoundNotFoundException 122 * @throws OutOfMemoryError if the input resource is larger than the allocated heap. 123 */ 124 public Map<String, S> process() throws IOException, CompoundNotFoundException { 125 Map<String, S> result = process(-1); 126 close(); 127 return result; 128 } 129 130 /** 131 * This method tries to parse maximum <code>max</code> records from 132 * the open File or InputStream, and leaves the underlying resource open.<br> 133 * 134 * Subsequent calls to the same method continue parsing the rest of the file.<br> 135 * This is particularly useful when dealing with very big data files, 136 * (e.g. NCBI nr database), which can't fit into memory and will take long 137 * time before the first result is available.<br> 138 * <b>N.B.</b> 139 * <ul> 140 * <li>This method can't be called after calling its NO-ARGUMENT twin.</li> 141 * <li>remember to close the underlying resource when you are done.</li> 142 * </ul> 143 * @see #process() 144 * @author Amr ALHOSSARY 145 * @since 3.0.6 146 * @param max maximum number of records to return. 147 * @return {@link HashMap} containing maximum <code>max</code> parsed Genbank records 148 * present, starting current fileIndex onwards. 149 * @throws IOException 150 * @throws CompoundNotFoundException 151 */ 152 public Map<String, S> process(final int max) throws IOException, CompoundNotFoundException { 153 154 if(closed){ 155 throw new IOException("Cannot perform action: resource has been closed."); 156 } 157 158 Map<String, S> sequences = new LinkedHashMap<>(); 159 int i=0; 160 while(true) { 161 if(max>0 && i>=max) break; 162 i++; 163 String seqString = genbankParser.getSequence(bufferedReader, 0); 164 //reached end of file? 165 if(seqString==null) break; 166 @SuppressWarnings("unchecked") 167 S sequence = (S) sequenceCreator.getSequence(seqString, 0); 168 GenericGenbankHeaderParser<S, C> genbankHeaderParser = genbankParser.getSequenceHeaderParser(); 169 genbankHeaderParser.parseHeader(genbankParser.getHeader(), sequence); 170 String id = genbankHeaderParser.getAccession(); 171 int version = genbankHeaderParser.getVersion(); 172 String identifier = genbankHeaderParser.getIdentifier(); 173 AccessionID accession = new AccessionID(id , DataSource.GENBANK, version, identifier); 174 sequence.setAccession(accession); 175 176 // add features to new sequence 177 genbankParser.getFeatures().values().stream() 178 .flatMap(List::stream) 179 .forEach(sequence::addFeature); 180 181 // add taxonomy ID to new sequence 182 List<DBReferenceInfo> dbQualifier = genbankParser.getDatabaseReferences().get("db_xref"); 183 if (dbQualifier != null){ 184 DBReferenceInfo q = dbQualifier.get(0); 185 sequence.setTaxonomy(new TaxonomyID(q.getDatabase()+":"+q.getId(), DataSource.GENBANK)); 186 } 187 188 sequences.put(sequence.getAccession().getID(), sequence); 189 } 190 191 return sequences; 192 } 193 194 public void close() { 195 try { 196 bufferedReader.close(); 197 this.closed = true; 198 } catch (IOException e) { 199 logger.error("Couldn't close the reader.", e); 200 this.closed = false; 201 } 202 } 203} 204