001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022
023package org.biojava.bio.seq.io;
024
025import java.io.BufferedReader;
026import java.io.InputStream;
027import java.io.InputStreamReader;
028import java.util.NoSuchElementException;
029
030import org.biojava.bio.BioException;
031import org.biojava.bio.seq.Sequence;
032import org.biojava.bio.seq.SequenceIterator;
033
034/**
035 * Parses a stream into sequences.
036 * <p>
037 * This object implements SequenceIterator, so you can loop over each sequence
038 * produced. It consumes a stream, and uses a SequenceFormat to extract each
039 * sequence from the stream.
040 * <p>
041 * It is assumed that the stream contains sequences that can be handled by the
042 * one format, and that they are not seperated other than by delimiters that the
043 * format can handle.
044 * <p>
045 * Sequences are instantiated when they are requested by nextSequence, not
046 * before, so it is safe to use this object to parse a gigabyte fasta file, and
047 * do sequence-by-sequence processing, while being guaranteed that StreamReader
048 * will not require you to keep any of the sequences in memory.
049 * 
050 * <p>More functionality is offered by {@link org.biojavax.bio.seq.io.RichStreamReader RichStreamReader},
051 * Use of this interface is prefered.</p>
052 *
053 * @author Matthew Pocock
054 * @author Thomas Down
055 * @see org.biojavax.bio.seq.io.RichStreamReader
056 */
057
058public class StreamReader implements SequenceIterator, org.biojava.utils.ParseErrorListener
059{
060    /**
061     * The symbol parser.
062     */
063    private SymbolTokenization symParser;
064
065    /**
066     * The sequence format.
067     */
068    private SequenceFormat format;
069
070    /**
071     * The sequence-builder factory.
072     */
073    private SequenceBuilderFactory sf;
074
075    /**
076     * The stream of data to parse.
077     */
078
079    private BufferedReader reader;
080
081    /**
082     * Flag indicating if more sequences are available.
083     */
084
085    private boolean moreSequenceAvailable = true;
086
087    /**
088     * Pull the next sequence out of the stream.
089     * <p>
090     * This method will delegate parsing from the stream to a SequenceFormat
091     * object, and then return the resulting sequence.
092     *
093     * @return the next Sequence
094     * @throws NoSuchElementException if the end of the stream has been hit
095     * @throws BioException if for any reason the next sequence could not be read
096     */
097
098    public Sequence nextSequence()
099        throws NoSuchElementException, BioException
100    {
101        if(!moreSequenceAvailable)
102            throw new NoSuchElementException("Stream is empty");
103        try {
104            SequenceBuilder builder = sf.makeSequenceBuilder();
105            moreSequenceAvailable = format.readSequence(reader, symParser, builder);
106            return builder.makeSequence();
107        } catch (Exception e) {
108            throw new BioException("Could not read sequence",e);
109        }
110    }
111
112    public boolean hasNext() {
113        return moreSequenceAvailable;
114    }
115
116    public StreamReader(InputStream is,
117                        SequenceFormat format,
118                        SymbolTokenization symParser,
119                        SequenceBuilderFactory sf)  {
120        this.reader = new BufferedReader(new InputStreamReader(is));
121        this.format = format;
122        this.symParser = symParser;
123        this.sf = sf;
124    }
125
126    public StreamReader(BufferedReader reader,
127                        SequenceFormat format,
128                        SymbolTokenization symParser,
129                        SequenceBuilderFactory sf)  {
130        this.reader = reader;
131        this.format = format;
132        this.symParser = symParser;
133        this.sf = sf;
134        ((org.biojava.utils.ParseErrorSource)(this.format)).addParseErrorListener(this);
135    }
136
137        /**
138         * This method determines the behaviour when a bad line is processed.
139         * Some options are to log the error, throw an exception, ignore it
140         * completely, or pass the event through.
141         * <p>
142         * This method should be overwritten when different behavior is desired.
143         *
144         * @param theEvent The event that contains the bad line and token.
145         */
146        public void BadLineParsed(org.biojava.utils.ParseErrorEvent theEvent)
147        {
148                System.err.println(theEvent.getMessage());
149        }
150}