001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022 023package org.biojava.bio.seq.io; 024 025import java.io.BufferedReader; 026import java.io.InputStream; 027import java.io.InputStreamReader; 028import java.util.NoSuchElementException; 029 030import org.biojava.bio.BioException; 031import org.biojava.bio.seq.Sequence; 032import org.biojava.bio.seq.SequenceIterator; 033 034/** 035 * Parses a stream into sequences. 036 * <p> 037 * This object implements SequenceIterator, so you can loop over each sequence 038 * produced. It consumes a stream, and uses a SequenceFormat to extract each 039 * sequence from the stream. 040 * <p> 041 * It is assumed that the stream contains sequences that can be handled by the 042 * one format, and that they are not seperated other than by delimiters that the 043 * format can handle. 044 * <p> 045 * Sequences are instantiated when they are requested by nextSequence, not 046 * before, so it is safe to use this object to parse a gigabyte fasta file, and 047 * do sequence-by-sequence processing, while being guaranteed that StreamReader 048 * will not require you to keep any of the sequences in memory. 049 * 050 * <p>More functionality is offered by {@link org.biojavax.bio.seq.io.RichStreamReader RichStreamReader}, 051 * Use of this interface is prefered.</p> 052 * 053 * @author Matthew Pocock 054 * @author Thomas Down 055 * @see org.biojavax.bio.seq.io.RichStreamReader 056 */ 057 058public class StreamReader implements SequenceIterator, org.biojava.utils.ParseErrorListener 059{ 060 /** 061 * The symbol parser. 062 */ 063 private SymbolTokenization symParser; 064 065 /** 066 * The sequence format. 067 */ 068 private SequenceFormat format; 069 070 /** 071 * The sequence-builder factory. 072 */ 073 private SequenceBuilderFactory sf; 074 075 /** 076 * The stream of data to parse. 077 */ 078 079 private BufferedReader reader; 080 081 /** 082 * Flag indicating if more sequences are available. 083 */ 084 085 private boolean moreSequenceAvailable = true; 086 087 /** 088 * Pull the next sequence out of the stream. 089 * <p> 090 * This method will delegate parsing from the stream to a SequenceFormat 091 * object, and then return the resulting sequence. 092 * 093 * @return the next Sequence 094 * @throws NoSuchElementException if the end of the stream has been hit 095 * @throws BioException if for any reason the next sequence could not be read 096 */ 097 098 public Sequence nextSequence() 099 throws NoSuchElementException, BioException 100 { 101 if(!moreSequenceAvailable) 102 throw new NoSuchElementException("Stream is empty"); 103 try { 104 SequenceBuilder builder = sf.makeSequenceBuilder(); 105 moreSequenceAvailable = format.readSequence(reader, symParser, builder); 106 return builder.makeSequence(); 107 } catch (Exception e) { 108 throw new BioException("Could not read sequence",e); 109 } 110 } 111 112 public boolean hasNext() { 113 return moreSequenceAvailable; 114 } 115 116 public StreamReader(InputStream is, 117 SequenceFormat format, 118 SymbolTokenization symParser, 119 SequenceBuilderFactory sf) { 120 this.reader = new BufferedReader(new InputStreamReader(is)); 121 this.format = format; 122 this.symParser = symParser; 123 this.sf = sf; 124 } 125 126 public StreamReader(BufferedReader reader, 127 SequenceFormat format, 128 SymbolTokenization symParser, 129 SequenceBuilderFactory sf) { 130 this.reader = reader; 131 this.format = format; 132 this.symParser = symParser; 133 this.sf = sf; 134 ((org.biojava.utils.ParseErrorSource)(this.format)).addParseErrorListener(this); 135 } 136 137 /** 138 * This method determines the behaviour when a bad line is processed. 139 * Some options are to log the error, throw an exception, ignore it 140 * completely, or pass the event through. 141 * <p> 142 * This method should be overwritten when different behavior is desired. 143 * 144 * @param theEvent The event that contains the bad line and token. 145 */ 146 public void BadLineParsed(org.biojava.utils.ParseErrorEvent theEvent) 147 { 148 System.err.println(theEvent.getMessage()); 149 } 150}