001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojavax.bio.seq.io;
023
024import java.io.BufferedReader;
025import java.io.InputStream;
026import java.io.InputStreamReader;
027import java.util.NoSuchElementException;
028
029import org.biojava.bio.BioException;
030import org.biojava.bio.seq.Sequence;
031import org.biojava.bio.seq.io.SymbolTokenization;
032import org.biojavax.Namespace;
033import org.biojavax.bio.BioEntry;
034import org.biojavax.bio.seq.RichSequence;
035import org.biojavax.bio.seq.RichSequenceIterator;
036
037/**
038 * Parses a stream into sequences.
039 * This object implements SequenceIterator, so you can loop over each sequence
040 * produced. It consumes a stream, and uses a SequenceFormat to extract each
041 * sequence from the stream.
042 * It is assumed that the stream contains sequences that can be handled by the
043 * one format, and that they are not seperated other than by delimiters that the
044 * format can handle.
045 * Sequences are instantiated when they are requested by nextSequence, not
046 * before, so it is safe to use this object to parse a gigabyte fasta file, and
047 * do sequence-by-sequence processing, while being guaranteed that RichStreamReader
048 * will not require you to keep any of the sequences in memory.
049 * @author Matthew Pocock
050 * @author Thomas Down
051 * @author Richard Holland
052 * @since 1.5
053 */
054
055public class RichStreamReader implements RichSequenceIterator {
056    
057    /**
058     * The symbol parser.
059     */
060    private Namespace ns;
061    
062    /**
063     * The symbol parser.
064     */
065    private SymbolTokenization symParser;
066    
067    /**
068     * The sequence format.
069     */
070    private RichSequenceFormat format;
071    
072    /**
073     * The sequence-builder factory.
074     */
075    private RichSequenceBuilderFactory sf;
076    
077    /**
078     * The stream of data to parse.
079     */
080    
081    private BufferedReader reader;
082    
083    /**
084     * Flag indicating if more sequences are available.
085     */
086    private boolean moreSequenceAvailable = true;
087    
088    /**
089     * {@inheritDoc}
090     */
091    public Sequence nextSequence() throws NoSuchElementException, BioException {
092        return this.nextRichSequence();
093    }
094    
095    /**
096     * {@inheritDoc}
097     */
098    public BioEntry nextBioEntry() throws NoSuchElementException, BioException {
099        return this.nextRichSequence();
100    }
101    
102    /**
103     * {@inheritDoc}
104     */
105    public RichSequence nextRichSequence() throws NoSuchElementException, BioException {
106        if(!moreSequenceAvailable)
107            throw new NoSuchElementException("Stream is empty");
108        try {
109            RichSequenceBuilder builder = (RichSequenceBuilder)sf.makeSequenceBuilder();
110            moreSequenceAvailable = format.readRichSequence(reader, symParser, builder, ns);
111            return builder.makeRichSequence();
112        } catch (Exception e) {
113            throw new BioException("Could not read sequence",e);
114        }
115    }
116    
117    /**
118     * {@inheritDoc}
119     */
120    public boolean hasNext() {
121        return moreSequenceAvailable;
122    }
123    
124    /**
125     * Creates a new stream reader on the given input stream, which will attempt to read
126     * sequences in the given format, having symbols from the given tokenization, and
127     * pass them to the given factory to be transformed into RichSequence objects in
128     * the given namespace.
129     * @param is the input stream to read from
130     * @param format the input file format
131     * @param symParser the tokenizer that understands the sequence symbols in the file
132     * @param sf the factory that will build the sequences
133     * @param ns the namespace the sequences will be loaded into.
134     */
135    public RichStreamReader(InputStream is,
136            RichSequenceFormat format,
137            SymbolTokenization symParser,
138            RichSequenceBuilderFactory sf,
139            Namespace ns)  {
140        this(new BufferedReader(new InputStreamReader(is)), format,symParser,sf,ns);
141    }
142    
143    /**
144     * Creates a new stream reader on the given reader, which will attempt to read
145     * sequences in the given format, having symbols from the given tokenization, and
146     * pass them to the given factory to be transformed into RichSequence objects in
147     * the given namespace.
148     * @param reader the reader to read from
149     * @param format the input file format
150     * @param symParser the tokenizer that understands the sequence symbols in the file
151     * @param sf the factory that will build the sequences
152     * @param ns the namespace the sequences will be loaded into.
153     */
154    public RichStreamReader(BufferedReader reader,
155            RichSequenceFormat format,
156            SymbolTokenization symParser,
157            RichSequenceBuilderFactory sf,
158            Namespace ns)  {
159        this.reader = reader;
160        this.format = format;
161        this.symParser = symParser;
162        this.sf = sf;
163        this.ns = ns;
164    }
165}