001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * @author Scooter Willis <willishf at gmail dot com> 015 * @author Karl Nicholas <github:karlnicholas> 016 * @author Paolo Pavan 017 * 018 * For more information on the BioJava project and its aims, 019 * or to join the biojava-l mailing list, visit the home page 020 * at: 021 * 022 * http://www.biojava.org/ 023 * 024 * Created on 01-21-2010 025 */ 026package org.biojava.nbio.core.sequence.io; 027 028import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 029import org.biojava.nbio.core.sequence.DataSource; 030import org.biojava.nbio.core.sequence.TaxonomyID; 031import org.biojava.nbio.core.sequence.features.DBReferenceInfo; 032import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface; 033import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface; 034import org.biojava.nbio.core.sequence.template.AbstractSequence; 035import org.biojava.nbio.core.sequence.template.Compound; 036import org.slf4j.Logger; 037import org.slf4j.LoggerFactory; 038 039import java.io.BufferedReader; 040import java.io.File; 041import java.io.FileNotFoundException; 042import java.io.FileReader; 043import java.io.IOException; 044import java.io.InputStream; 045import java.io.InputStreamReader; 046import java.util.ArrayList; 047import java.util.HashMap; 048import java.util.LinkedHashMap; 049import java.util.List; 050 051/** 052 * Use {@link GenbankReaderHelper} as an example of how to use this class where {@link GenbankReaderHelper} should be the 053 * primary class used to read Genbank files 054 * 055 */ 
056public class GenbankReader<S extends AbstractSequence<C>, C extends Compound> { 057 058 private SequenceCreatorInterface<C> sequenceCreator; 059 private GenbankSequenceParser<S,C> genbankParser; 060 private BufferedReader bufferedReader; 061 private boolean closed; 062 private final Logger logger = LoggerFactory.getLogger(this.getClass()); 063 064 public boolean isClosed() { 065 return closed; 066 } 067 068 /** 069 * If you are going to use {@link FileProxyProteinSequenceCreator} then do not use this constructor because we need details about 070 * local file offsets for quick reads. {@link InputStream} does not give you the name of the stream to access quickly via file seek. A seek in 071 * an {@link InputStream} is forced to read all the data so you don't gain anything. 072 * @param is 073 * @param headerParser 074 * @param sequenceCreator 075 */ 076 public GenbankReader(final InputStream is, final SequenceHeaderParserInterface<S,C> headerParser, 077 final SequenceCreatorInterface<C> sequenceCreator) { 078 this.sequenceCreator = sequenceCreator; 079 bufferedReader = new BufferedReader(new InputStreamReader(is)); 080 genbankParser = new GenbankSequenceParser<>(); 081 closed = false; 082 } 083 084 /** 085 * If you are going to use the FileProxyProteinSequenceCreator then you 086 * need to use this constructor because we need details about 087 * the location of the file. 088 * @param file 089 * @param headerParser 090 * @param sequenceCreator 091 * @throws FileNotFoundException if the file does not exist, is a directory 092 * rather than a regular file, or for some other reason cannot be opened 093 * for reading. 094 * @throws SecurityException if a security manager exists and its checkRead 095 * method denies read access to the file. 
096 */ 097 public GenbankReader( 098 final File file, 099 final SequenceHeaderParserInterface<S,C> headerParser, 100 final SequenceCreatorInterface<C> sequenceCreator 101 ) throws FileNotFoundException { 102 103 this.bufferedReader = new BufferedReader(new FileReader(file)); 104 this.sequenceCreator = sequenceCreator; 105 genbankParser = new GenbankSequenceParser<>(); 106 } 107 108 /** 109 * The parsing is done in this method.<br> 110 * This method will return all the available Genbank records 111 * in the File or InputStream, closes the underlying resource, 112 * and return the results in {@link LinkedHashMap}.<br> 113 * You don't need to call {@link GenbankReader#close()} after calling this method. 114 * @see #process(int) 115 * @return {@link HashMap} containing all the parsed Genbank records 116 * present, starting current fileIndex onwards. 117 * @throws IOException 118 * @throws CompoundNotFoundException 119 * @throws OutOfMemoryError if the input resource is larger than the allocated heap. 120 */ 121 public LinkedHashMap<String,S> process() throws IOException, CompoundNotFoundException { 122 LinkedHashMap<String,S> result = process(-1); 123 close(); 124 return result; 125 } 126 127 /** 128 * This method tries to parse maximum <code>max</code> records from 129 * the open File or InputStream, and leaves the underlying resource open.<br> 130 * 131 * Subsequent calls to the same method continue parsing the rest of the file.<br> 132 * This is particularly useful when dealing with very big data files, 133 * (e.g. NCBI nr database), which can't fit into memory and will take long 134 * time before the first result is available.<br> 135 * <b>N.B.</b> 136 * <ul> 137 * <li>This method can't be called after calling its NO-ARGUMENT twin.</li> 138 * <li>remember to close the underlying resource when you are done.</li> 139 * </ul> 140 * @see #process() 141 * @author Amr AL-Hossary 142 * @since 3.0.6 143 * @param max maximum number of records to return. 
144 * @return {@link HashMap} containing maximum <code>max</code> parsed Genbank records 145 * present, starting current fileIndex onwards. 146 * @throws IOException 147 * @throws CompoundNotFoundException 148 */ 149 public LinkedHashMap<String,S> process(final int max) throws IOException, CompoundNotFoundException { 150 151 if(closed){ 152 throw new IOException("Cannot perform action: resource has been closed."); 153 } 154 155 LinkedHashMap<String,S> sequences = new LinkedHashMap<>(); 156 @SuppressWarnings("unchecked") 157 int i=0; 158 while(true) { 159 if(max>0 && i>=max) break; 160 i++; 161 String seqString = genbankParser.getSequence(bufferedReader, 0); 162 //reached end of file? 163 if(seqString==null) break; 164 @SuppressWarnings("unchecked") 165 S sequence = (S) sequenceCreator.getSequence(seqString, 0); 166 genbankParser.getSequenceHeaderParser().parseHeader(genbankParser.getHeader(), sequence); 167 168 // add features to new sequence 169 genbankParser.getFeatures().values().stream() 170 .flatMap(List::stream) 171 .forEach(sequence::addFeature); 172 173 // add taxonomy ID to new sequence 174 ArrayList<DBReferenceInfo> dbQualifier = genbankParser.getDatabaseReferences().get("db_xref"); 175 if (dbQualifier != null){ 176 DBReferenceInfo q = dbQualifier.get(0); 177 sequence.setTaxonomy(new TaxonomyID(q.getDatabase()+":"+q.getId(), DataSource.GENBANK)); 178 } 179 180 sequences.put(sequence.getAccession().getID(), sequence); 181 } 182 183 return sequences; 184 } 185 186 public void close() { 187 try { 188 bufferedReader.close(); 189 this.closed = true; 190 } catch (IOException e) { 191 logger.error("Couldn't close the reader.", e); 192 this.closed = false; 193 } 194 } 195} 196