/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * Created on 01-21-2010
 */
package org.biojava.nbio.core.sequence.io;

import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface;
import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface;
import org.biojava.nbio.core.sequence.template.Compound;
import org.biojava.nbio.core.sequence.template.Sequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.util.HashMap;
import java.util.LinkedHashMap;

/**
 * Reads FASTA records from a file or stream and turns each one into a sequence object.
 * Use FastaReaderHelper as an example of how to use this class where FastaReaderHelper should be the
 * primary class used to read Fasta files.
 *
 * @param <S> the sequence type stored in the returned maps
 * @param <C> the compound type handled by the sequence creator and header parser
 * @author Scooter Willis &lt;willishf at gmail dot com&gt;
 */
public class FastaReader<S extends Sequence<?>, C extends Compound> {

    private final static Logger logger = LoggerFactory.getLogger(FastaReader.class);

    SequenceCreatorInterface<C> sequenceCreator;
    SequenceHeaderParserInterface<S,C> headerParser;
    BufferedReaderBytesRead br;
    InputStreamReader isr;
    /** Only set when the reader was built from a {@link File}; closed in {@link #close()}. */
    FileInputStream fi = null;
    /** Value of {@code br.getBytesRead()} taken just before the most recent line was read. */
    long fileIndex = 0;
    /** {@link #fileIndex} captured at the first residue line of the record being assembled. */
    long sequenceIndex = 0;
    /** Look-ahead line carried between successive {@link #process(int)} calls. */
    String line = "";
    /** Header of the record being assembled, carried between successive {@link #process(int)} calls. */
    String header = "";

    /**
     * If you are going to use FileProxyProteinSequenceCreator then do not use this constructor because we need details about
     * local file offsets for quick reads. InputStreams does not give you the name of the stream to access quickly via file seek. A seek in
     * an inputstream is forced to read all the data so you don't gain anything.
     * @param is inputStream
     * @param headerParser parser applied to each record's header line (without the leading '&gt;')
     * @param sequenceCreator factory that builds a sequence from the concatenated residue lines
     */
    public FastaReader(InputStream is, SequenceHeaderParserInterface<S,C> headerParser,
            SequenceCreatorInterface<C> sequenceCreator) {
        this.headerParser = headerParser;
        isr = new InputStreamReader(is);
        this.br = new BufferedReaderBytesRead(isr);
        this.sequenceCreator = sequenceCreator;
    }

    /**
     * If you are going to use the FileProxyProteinSequenceCreator then you
     * need to use this constructor because we need details about
     * the location of the file.
     * @param file the FASTA file to read
     * @param headerParser parser applied to each record's header line (without the leading '&gt;')
     * @param sequenceCreator factory that builds a sequence from the concatenated residue lines
     * @throws FileNotFoundException if the file does not exist, is a directory
     * rather than a regular file, or for some other reason cannot be opened
     * for reading.
     * @throws SecurityException if a security manager exists and its checkRead
     * method denies read access to the file.
     */
    public FastaReader(File file, SequenceHeaderParserInterface<S,C> headerParser,
            SequenceCreatorInterface<C> sequenceCreator) throws FileNotFoundException {
        this.headerParser = headerParser;
        fi = new FileInputStream(file);
        isr = new InputStreamReader(fi);
        this.br = new BufferedReaderBytesRead(isr);
        this.sequenceCreator = sequenceCreator;
    }

    /**
     * The parsing is done in this method.<br>
     * This method tries to process all the available fasta records
     * in the File or InputStream, closes the underlying resource,
     * and return the results in {@link LinkedHashMap}.<br>
     * The resource is closed even when parsing fails with an exception.
     * You don't need to call {@link #close()} after calling this method.
     * @see #process(int)
     * @return {@link LinkedHashMap} containing all the parsed fasta records
     * present, starting current fileIndex onwards.
     * @throws IOException if an error occurs reading the input file
     */
    public LinkedHashMap<String,S> process() throws IOException {
        try {
            return process(-1);
        } finally {
            // Always release the streams, including on a read failure.
            close();
        }
    }

    /**
     * This method tries to parse maximum <code>max</code> records from
     * the open File or InputStream, and leaves the underlying resource open.<br>
     * Subsequent calls to the same method continue parsing the rest of the file.<br>
     * This is particularly useful when dealing with very big data files,
     * (e.g. NCBI nr database), which can't fit into memory and will take long
     * time before the first result is available.<br>
     * <b>N.B.</b>
     * <ul>
     * <li>This method can't be called after calling its NO-ARGUMENT twin.</li>
     * <li>remember to close the underlying resource when you are done.</li>
     * </ul>
     * Records whose residues are rejected by the sequence creator with a
     * {@link CompoundNotFoundException} are logged and skipped.
     * @see #process()
     * @author Amr AL-Hossary
     * @since 3.0.6
     * @param max maximum number of records to return, <code>-1</code> for infinity.
     * @return {@link LinkedHashMap} containing maximum <code>max</code> parsed fasta records
     * present, starting current fileIndex onwards; <code>null</code> when <code>max</code>
     * is not <code>-1</code> and the end of input is reached with no record parsed in this
     * call and no residues pending.
     * @throws IOException if an error occurs reading the input file
     */
    public LinkedHashMap<String,S> process(int max) throws IOException {

        // Resume from the look-ahead line and header left behind by a previous call, if any.
        String line = "";
        if (this.line != null && this.line.length() > 0) {
            line = this.line;
        }
        String header = "";
        if (this.header != null && this.header.length() > 0) {
            header = this.header;
        }

        StringBuilder sb = new StringBuilder();
        int processedSequences = 0;
        boolean keepGoing = true;

        LinkedHashMap<String,S> sequences = new LinkedHashMap<String,S>();

        do {
            line = line.trim(); // nice to have but probably not needed
            if (line.length() != 0) {
                if (line.startsWith(">")) {
                    // Start of a new fasta record: flush the previous one, if it had residues.
                    if (sb.length() > 0) {
                        if (addSequence(sequences, header, sb.toString())) {
                            processedSequences++;
                        }
                        // Reuse the same buffer rather than allocating a new one.
                        sb.setLength(0);
                    }
                    header = line.substring(1);
                } else if (line.startsWith(";")) {
                    // Lines starting with ';' are skipped.
                } else {
                    // Mark the start of the sequence with the fileIndex before the line was read.
                    if (sb.length() == 0) {
                        sequenceIndex = fileIndex;
                    }
                    sb.append(line);
                }
            }
            fileIndex = br.getBytesRead();

            line = br.readLine();

            if (line == null) {
                // End of input.
                // Fix for #282: tell chunked callers that nothing is left. Only do so when
                // there are no buffered residues, otherwise a final chunk holding exactly
                // one record would be dropped.
                if (sequences.size() == 0 && max != -1 && sb.length() == 0) {
                    return null;
                }

                String seq = sb.toString();
                if (seq.length() == 0) {
                    logger.warn("Can't parse sequence {}. Got sequence of length 0!", sequenceIndex);
                    logger.warn("header: {}", header);
                }
                if (addSequence(sequences, header, seq)) {
                    processedSequences++;
                }
                keepGoing = false;
            }
            if (max > -1 && processedSequences >= max) {
                keepGoing = false;
            }
            // The field is null after close() or after a previous call hit end of input.
            if (this.line == null) {
                keepGoing = false;
            }
        } while (keepGoing);

        // Remember where we stopped so the next call can continue from here.
        this.line = line;
        this.header = header;

        return sequences;
    }

    /**
     * Builds one sequence from the given residues, parses its header and stores it under
     * its accession ID.
     * @param sequences map receiving the new sequence
     * @param header header line of the record, without the leading '&gt;'
     * @param seq concatenated residue lines of the record
     * @return <code>true</code> if the sequence was stored, <code>false</code> if it was
     * skipped because of a {@link CompoundNotFoundException}
     */
    private boolean addSequence(LinkedHashMap<String,S> sequences, String header, String seq) {
        try {
            @SuppressWarnings("unchecked")
            S sequence = (S) sequenceCreator.getSequence(seq, sequenceIndex);
            headerParser.parseHeader(header, sequence);
            sequences.put(sequence.getAccession().getID(), sequence);
            return true;
        } catch (CompoundNotFoundException e) {
            logger.warn("Sequence with header '{}' has unrecognised compounds ({}), it will be ignored",
                    header, e.getMessage());
            return false;
        }
    }

    /**
     * Closes the readers and, when the reader was created from a {@link File}, the file
     * stream as well. Every stream gets a close attempt even if an earlier one fails.
     * @throws IOException if closing any of the underlying streams fails
     */
    public void close() throws IOException {
        try {
            br.close();
        } finally {
            try {
                isr.close();
            } finally {
                // If stream was created from File object then we need to close it
                if (fi != null) {
                    fi.close();
                }
            }
        }
        this.line = this.header = null;
    }

    /**
     * Small demo: reads a bundled FASTA resource through an InputStream, then reads the
     * same path through the File constructor with a file-proxy sequence creator and logs
     * every record.
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            String inputFile = "/PF00104_small.fasta";
            InputStream is = FastaReader.class.getResourceAsStream(inputFile);

            if (is == null) {
                System.err.println("Could not get input file " + inputFile);
                // Nothing to read; continuing would only fail with a NullPointerException.
                return;
            }
            FastaReader<ProteinSequence, AminoAcidCompound> fastaReader =
                    new FastaReader<ProteinSequence, AminoAcidCompound>(
                            is,
                            new GenericFastaHeaderParser<ProteinSequence,AminoAcidCompound>(),
                            new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));
            fastaReader.process();
            is.close();

            File file = new File(inputFile);
            FastaReader<ProteinSequence,AminoAcidCompound> fastaProxyReader =
                    new FastaReader<ProteinSequence,AminoAcidCompound>(
                            file,
                            new GenericFastaHeaderParser<ProteinSequence,AminoAcidCompound>(),
                            new FileProxyProteinSequenceCreator(
                                    file,
                                    AminoAcidCompoundSet.getAminoAcidCompoundSet(),
                                    new FastaSequenceParser()
                            )
                    );
            LinkedHashMap<String,ProteinSequence> proteinProxySequences = fastaProxyReader.process();

            for (String key : proteinProxySequences.keySet()) {
                ProteinSequence proteinSequence = proteinProxySequences.get(key);
                logger.info("Protein Proxy Sequence Key: {}", key);
                logger.info("Protein Sequence: {}", proteinSequence.toString());
            }

        } catch (Exception e) {
            logger.warn("Exception: ", e);
        }
    }
}