001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * @author Scooter Willis ;lt;willishf at gmail dot com> 015 * @author Karl Nicholas <github:karlnicholas> 016 * @author Paolo Pavan 017 * 018 * For more information on the BioJava project and its aims, 019 * or to join the biojava-l mailing list, visit the home page 020 * at: 021 * 022 * http://www.biojava.org/ 023 * 024 * Created on 01-21-2010 025 */ 026package org.biojava.nbio.core.sequence.io; 027 028import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 029import org.biojava.nbio.core.sequence.AccessionID; 030import org.biojava.nbio.core.sequence.DataSource; 031import org.biojava.nbio.core.sequence.TaxonomyID; 032import org.biojava.nbio.core.sequence.features.DBReferenceInfo; 033import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface; 034import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface; 035import org.biojava.nbio.core.sequence.template.AbstractSequence; 036import org.biojava.nbio.core.sequence.template.Compound; 037import org.slf4j.Logger; 038import org.slf4j.LoggerFactory; 039 040import java.io.BufferedReader; 041import java.io.File; 042import java.io.FileNotFoundException; 043import java.io.FileReader; 044import java.io.IOException; 045import java.io.InputStream; 046import java.io.InputStreamReader; 047import java.util.ArrayList; 048import java.util.HashMap; 049import java.util.LinkedHashMap; 050import java.util.List; 051 052/** 053 * Use {@link GenbankReaderHelper} as an example of how to use this class where {@link GenbankReaderHelper} should be the 054 * primary class used to read Genbank files 055 * 056 */ 057public class GenbankReader<S extends AbstractSequence<C>, C extends Compound> { 058 059 private SequenceCreatorInterface<C> sequenceCreator; 060 private GenbankSequenceParser<S,C> genbankParser; 061 private BufferedReader bufferedReader; 062 private boolean closed; 063 private final Logger logger = LoggerFactory.getLogger(this.getClass()); 064 065 public boolean isClosed() { 066 return closed; 067 } 068 069 /** 070 * If you are going to use {@link FileProxyProteinSequenceCreator} then do not use this constructor because we need details about 071 * local file offsets for quick reads. {@link InputStream} does not give you the name of the stream to access quickly via file seek. A seek in 072 * an {@link InputStream} is forced to read all the data so you don't gain anything. 073 * @param is 074 * @param headerParser 075 * @param sequenceCreator 076 */ 077 public GenbankReader(final InputStream is, final SequenceHeaderParserInterface<S,C> headerParser, 078 final SequenceCreatorInterface<C> sequenceCreator) { 079 this.sequenceCreator = sequenceCreator; 080 bufferedReader = new BufferedReader(new InputStreamReader(is)); 081 genbankParser = new GenbankSequenceParser<>(); 082 closed = false; 083 } 084 085 /** 086 * If you are going to use the FileProxyProteinSequenceCreator then you 087 * need to use this constructor because we need details about 088 * the location of the file. 089 * @param file 090 * @param headerParser 091 * @param sequenceCreator 092 * @throws FileNotFoundException if the file does not exist, is a directory 093 * rather than a regular file, or for some other reason cannot be opened 094 * for reading. 095 * @throws SecurityException if a security manager exists and its checkRead 096 * method denies read access to the file. 097 */ 098 public GenbankReader( 099 final File file, 100 final SequenceHeaderParserInterface<S,C> headerParser, 101 final SequenceCreatorInterface<C> sequenceCreator 102 ) throws FileNotFoundException { 103 104 this.bufferedReader = new BufferedReader(new FileReader(file)); 105 this.sequenceCreator = sequenceCreator; 106 genbankParser = new GenbankSequenceParser<>(); 107 } 108 109 /** 110 * The parsing is done in this method.<br> 111 * This method will return all the available Genbank records 112 * in the File or InputStream, closes the underlying resource, 113 * and return the results in {@link LinkedHashMap}.<br> 114 * You don't need to call {@link GenbankReader#close()} after calling this method. 115 * @see #process(int) 116 * @return {@link HashMap} containing all the parsed Genbank records 117 * present, starting current fileIndex onwards. 118 * @throws IOException 119 * @throws CompoundNotFoundException 120 * @throws OutOfMemoryError if the input resource is larger than the allocated heap. 121 */ 122 public LinkedHashMap<String,S> process() throws IOException, CompoundNotFoundException { 123 LinkedHashMap<String,S> result = process(-1); 124 close(); 125 return result; 126 } 127 128 /** 129 * This method tries to parse maximum <code>max</code> records from 130 * the open File or InputStream, and leaves the underlying resource open.<br> 131 * 132 * Subsequent calls to the same method continue parsing the rest of the file.<br> 133 * This is particularly useful when dealing with very big data files, 134 * (e.g. NCBI nr database), which can't fit into memory and will take long 135 * time before the first result is available.<br> 136 * <b>N.B.</b> 137 * <ul> 138 * <li>This method can't be called after calling its NO-ARGUMENT twin.</li> 139 * <li>remember to close the underlying resource when you are done.</li> 140 * </ul> 141 * @see #process() 142 * @author Amr ALHOSSARY 143 * @since 3.0.6 144 * @param max maximum number of records to return. 145 * @return {@link HashMap} containing maximum <code>max</code> parsed Genbank records 146 * present, starting current fileIndex onwards. 147 * @throws IOException 148 * @throws CompoundNotFoundException 149 */ 150 public LinkedHashMap<String,S> process(final int max) throws IOException, CompoundNotFoundException { 151 152 if(closed){ 153 throw new IOException("Cannot perform action: resource has been closed."); 154 } 155 156 LinkedHashMap<String,S> sequences = new LinkedHashMap<>(); 157 int i=0; 158 while(true) { 159 if(max>0 && i>=max) break; 160 i++; 161 String seqString = genbankParser.getSequence(bufferedReader, 0); 162 //reached end of file? 163 if(seqString==null) break; 164 @SuppressWarnings("unchecked") 165 S sequence = (S) sequenceCreator.getSequence(seqString, 0); 166 GenericGenbankHeaderParser<S, C> genbankHeaderParser = genbankParser.getSequenceHeaderParser(); 167 genbankHeaderParser.parseHeader(genbankParser.getHeader(), sequence); 168 String id = genbankHeaderParser.getAccession(); 169 int version = genbankHeaderParser.getVersion(); 170 String identifier = genbankHeaderParser.getIdentifier(); 171 AccessionID accession = new AccessionID(id , DataSource.GENBANK, version, identifier); 172 sequence.setAccession(accession); 173 174 // add features to new sequence 175 genbankParser.getFeatures().values().stream() 176 .flatMap(List::stream) 177 .forEach(sequence::addFeature); 178 179 // add taxonomy ID to new sequence 180 List<DBReferenceInfo> dbQualifier = genbankParser.getDatabaseReferences().get("db_xref"); 181 if (dbQualifier != null){ 182 DBReferenceInfo q = dbQualifier.get(0); 183 sequence.setTaxonomy(new TaxonomyID(q.getDatabase()+":"+q.getId(), DataSource.GENBANK)); 184 } 185 186 sequences.put(sequence.getAccession().getID(), sequence); 187 } 188 189 return sequences; 190 } 191 192 public void close() { 193 try { 194 bufferedReader.close(); 195 this.closed = true; 196 } catch (IOException e) { 197 logger.error("Couldn't close the reader.", e); 198 this.closed = false; 199 } 200 } 201} 202