001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on 01-21-2010 021 */ 022package org.biojava.nbio.core.sequence.io; 023 024import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 025import org.biojava.nbio.core.sequence.ProteinSequence; 026import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 027import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; 028import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface; 029import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface; 030import org.biojava.nbio.core.sequence.template.Compound; 031import org.biojava.nbio.core.sequence.template.Sequence; 032import org.slf4j.Logger; 033import org.slf4j.LoggerFactory; 034 035import java.io.*; 036import java.util.HashMap; 037import java.util.LinkedHashMap; 038import java.util.Map; 039 040/** 041 * Use FastaReaderHelper as an example of how to use this class where FastaReaderHelper should be the 042 * primary class used to read Fasta files 043 * @author Scooter Willis ;lt;willishf at gmail dot com> 044 */ 045public class FastaReader<S extends Sequence<?>, C extends Compound> { 046 047 private final static Logger logger = LoggerFactory.getLogger(FastaReader.class); 048 049 SequenceCreatorInterface<C> sequenceCreator; 050 SequenceHeaderParserInterface<S,C> headerParser; 051 BufferedReaderBytesRead br; 052 InputStreamReader isr; 053 FileInputStream fi = null; 054 long fileIndex = 0; 055 long sequenceIndex = 0; 056 String line = ""; 057 String header= ""; 058 059 /** 060 * If you are going to use FileProxyProteinSequenceCreator then do not use this constructor because we need details about 061 * local file offsets for quick reads. InputStreams does not give you the name of the stream to access quickly via file seek. A seek in 062 * an inputstream is forced to read all the data so you don't gain anything. 063 * @param is inputStream 064 * @param headerParser 065 * @param sequenceCreator 066 */ 067 public FastaReader(InputStream is, SequenceHeaderParserInterface<S,C> headerParser, 068 SequenceCreatorInterface<C> sequenceCreator) { 069 this.headerParser = headerParser; 070 isr = new InputStreamReader(is); 071 this.br = new BufferedReaderBytesRead(isr); 072 this.sequenceCreator = sequenceCreator; 073 } 074 075 /** 076 * If you are going to use the FileProxyProteinSequenceCreator then you 077 * need to use this constructor because we need details about 078 * the location of the file. 079 * @param file 080 * @param headerParser 081 * @param sequenceCreator 082 * @throws FileNotFoundException if the file does not exist, is a directory 083 * rather than a regular file, or for some other reason cannot be opened 084 * for reading. 085 * @throws SecurityException if a security manager exists and its checkRead 086 * method denies read access to the file. 087 */ 088 public FastaReader(File file, SequenceHeaderParserInterface<S,C> headerParser, 089 SequenceCreatorInterface<C> sequenceCreator) throws FileNotFoundException { 090 this.headerParser = headerParser; 091 fi = new FileInputStream(file); 092 isr = new InputStreamReader(fi); 093 this.br = new BufferedReaderBytesRead(isr); 094 this.sequenceCreator = sequenceCreator; 095 } 096 097 /** 098 * The parsing is done in this method.<br> 099 * This method tries to process all the available fasta records 100 * in the File or InputStream, closes the underlying resource, 101 * and return the results in {@link LinkedHashMap}.<br> 102 * You don't need to call {@link #close()} after calling this method. 103 * @see #process(int) 104 * @return {@link HashMap} containing all the parsed fasta records 105 * present, starting current fileIndex onwards. 106 * @throws IOException if an error occurs reading the input file 107 */ 108 public Map<String, S> process() throws IOException { 109 Map<String, S> sequences = process(-1); 110 close(); 111 112 return sequences; 113 } 114 115 /** 116 * This method tries to parse maximum <code>max</code> records from 117 * the open File or InputStream, and leaves the underlying resource open.<br> 118 * Subsequent calls to the same method continue parsing the rest of the file.<br> 119 * This is particularly useful when dealing with very big data files, 120 * (e.g. NCBI nr database), which can't fit into memory and will take long 121 * time before the first result is available.<br> 122 * <b>N.B.</b> 123 * <ul> 124 * <li>This method can't be called after calling its NO-ARGUMENT twin.</li> 125 * <li>remember to close the underlying resource when you are done.</li> 126 * </ul> 127 * @see #process() 128 * @author Amr ALHOSSARY 129 * @since 3.0.6 130 * @param max maximum number of records to return, <code>-1</code> for infinity. 131 * @return {@link HashMap} containing maximum <code>max</code> parsed fasta records 132 * present, starting current fileIndex onwards. 133 * @throws IOException if an error occurs reading the input file 134 */ 135 public Map<String, S> process(int max) throws IOException { 136 137 138 String line = ""; 139 if(this.line != null && this.line.length() > 0){ 140 line=this.line; 141 } 142 String header = ""; 143 if(this.header != null && this.header.length() > 0){ 144 header=this.header; 145 } 146 147 StringBuilder sb = new StringBuilder(); 148 int processedSequences=0; 149 boolean keepGoing = true; 150 151 152 Map<String, S> sequences = new LinkedHashMap<>(); 153 154 do { 155 line = line.trim(); // nice to have but probably not needed 156 if (line.length() != 0) { 157 if (line.startsWith(">")) {//start of new fasta record 158 159 if (sb.length() > 0) { 160 //i.e. if there is already a sequence before 161 //logger.info("Sequence index=" + sequenceIndex); 162 163 try { 164 @SuppressWarnings("unchecked") 165 S sequence = (S)sequenceCreator.getSequence(sb.toString(), sequenceIndex); 166 headerParser.parseHeader(header, sequence); 167 sequences.put(sequence.getAccession().getID(),sequence); 168 processedSequences++; 169 170 } catch (CompoundNotFoundException e) { 171 logger.warn("Sequence with header '{}' has unrecognised compounds ({}), it will be ignored", 172 header, e.getMessage()); 173 } 174 175 sb.setLength(0); //this is faster than allocating new buffers, better memory utilization (same buffer) 176 } 177 header = line.substring(1); 178 } else if (line.startsWith(";")) { 179 } else { 180 //mark the start of the sequence with the fileIndex before the line was read 181 if(sb.length() == 0){ 182 sequenceIndex = fileIndex; 183 } 184 sb.append(line); 185 } 186 } 187 fileIndex = br.getBytesRead(); 188 189 line = br.readLine(); 190 191 if (line == null) { 192 //i.e. EOF 193 if ( sb.length() == 0 && header.length() != 0 ) { 194 logger.warn("Can't parse sequence {}. Got sequence of length 0!", sequenceIndex); 195 logger.warn("header: {}", header); 196 header = null; 197 } else if ( sb.length() > 0 ) { 198 //logger.info("Sequence index=" + sequenceIndex + " " + fileIndex ); 199 try { 200 @SuppressWarnings("unchecked") 201 S sequence = (S)sequenceCreator.getSequence(sb.toString(), sequenceIndex); 202 headerParser.parseHeader(header, sequence); 203 sequences.put(sequence.getAccession().getID(),sequence); 204 processedSequences++; 205 header = null; 206 } catch (CompoundNotFoundException e) { 207 logger.warn("Sequence with header '{}' has unrecognised compounds ({}), it will be ignored", 208 header, e.getMessage()); 209 } 210 } 211 keepGoing = false; 212 } 213 if (max > -1 && processedSequences>=max) { 214 keepGoing=false; 215 } 216 } while (keepGoing); 217 218 this.line = line; 219 this.header= header; 220 221 return max > -1 && sequences.isEmpty() ? null : sequences; 222 } 223 224 public void close() throws IOException { 225 br.close(); 226 isr.close(); 227 //If stream was created from File object then we need to close it 228 if (fi != null) { 229 fi.close(); 230 } 231 this.line=this.header = null; 232 } 233 234}