/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */

package org.biojavax.bio.seq.io;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.NoSuchElementException;

import org.biojava.bio.BioException;
import org.biojava.bio.seq.Sequence;
import org.biojava.bio.seq.io.SymbolTokenization;
import org.biojavax.Namespace;
import org.biojavax.bio.BioEntry;
import org.biojavax.bio.seq.RichSequence;
import org.biojavax.bio.seq.RichSequenceIterator;

/**
 * Parses a stream into sequences.
 * <p>
 * This object implements RichSequenceIterator, so you can loop over each
 * sequence produced. It consumes a stream, and uses a RichSequenceFormat to
 * extract each sequence from the stream.
 * <p>
 * It is assumed that the stream contains sequences that can be handled by the
 * one format, and that they are not separated other than by delimiters that
 * the format can handle.
 * <p>
 * Sequences are instantiated when they are requested by nextSequence, not
 * before, so it is safe to use this object to parse a gigabyte fasta file, and
 * do sequence-by-sequence processing, while being guaranteed that
 * RichStreamReader will not require you to keep any of the sequences in
 * memory.
 *
 * @author Matthew Pocock
 * @author Thomas Down
 * @author Richard Holland
 * @since 1.5
 */

public class RichStreamReader implements RichSequenceIterator {

    /**
     * The namespace the sequences will be loaded into.
     */
    private final Namespace ns;

    /**
     * The symbol parser.
     */
    private final SymbolTokenization symParser;

    /**
     * The sequence format.
     */
    private final RichSequenceFormat format;

    /**
     * The sequence-builder factory.
     */
    private final RichSequenceBuilderFactory sf;

    /**
     * The stream of data to parse.
     */
    private final BufferedReader reader;

    /**
     * Flag indicating if more sequences are available. It starts out
     * <code>true</code> and is afterwards set to whatever the format returns
     * from each read.
     */
    private boolean moreSequenceAvailable = true;

    /**
     * {@inheritDoc}
     * <p>
     * Delegates to {@link #nextRichSequence()}.
     */
    public Sequence nextSequence() throws NoSuchElementException, BioException {
        return this.nextRichSequence();
    }

    /**
     * {@inheritDoc}
     * <p>
     * Delegates to {@link #nextRichSequence()}.
     */
    public BioEntry nextBioEntry() throws NoSuchElementException, BioException {
        return this.nextRichSequence();
    }

    /**
     * {@inheritDoc}
     * <p>
     * A fresh builder is obtained from the factory for every call, the format
     * is asked to read one sequence from the reader into it, and the built
     * sequence is returned. The value returned by the format becomes the new
     * answer of {@link #hasNext()}.
     *
     * @return the sequence produced by the builder
     * @throws NoSuchElementException if the previous read reported that no
     * more sequences are available
     * @throws BioException wrapping any exception raised while creating the
     * builder, reading from the stream or building the sequence. If the format
     * itself throws, the availability flag is left unchanged.
     */
    public RichSequence nextRichSequence() throws NoSuchElementException, BioException {
        if (!moreSequenceAvailable) {
            throw new NoSuchElementException("Stream is empty");
        }
        try {
            RichSequenceBuilder builder = (RichSequenceBuilder) sf.makeSequenceBuilder();
            // The format tells us whether anything is left after this record.
            moreSequenceAvailable = format.readRichSequence(reader, symParser, builder, ns);
            return builder.makeRichSequence();
        } catch (Exception e) {
            throw new BioException("Could not read sequence", e);
        }
    }

    /**
     * {@inheritDoc}
     * <p>
     * Nothing is read from the stream by this call. Before the first sequence
     * has been read this returns <code>true</code> regardless of what the
     * stream contains.
     */
    public boolean hasNext() {
        return moreSequenceAvailable;
    }

    /**
     * Creates a new stream reader on the given input stream, which will attempt to read
     * sequences in the given format, having symbols from the given tokenization, and
     * pass them to the given factory to be transformed into RichSequence objects in
     * the given namespace.
     * <p>
     * The stream is wrapped in an InputStreamReader without an explicit
     * charset, so bytes are decoded using the platform default charset.
     *
     * @param is the input stream to read from
     * @param format the input file format
     * @param symParser the tokenizer that understands the sequence symbols in the file
     * @param sf the factory that will build the sequences
     * @param ns the namespace the sequences will be loaded into.
     */
    public RichStreamReader(InputStream is,
            RichSequenceFormat format,
            SymbolTokenization symParser,
            RichSequenceBuilderFactory sf,
            Namespace ns) {
        this(new BufferedReader(new InputStreamReader(is)), format, symParser, sf, ns);
    }

    /**
     * Creates a new stream reader on the given reader, which will attempt to read
     * sequences in the given format, having symbols from the given tokenization, and
     * pass them to the given factory to be transformed into RichSequence objects in
     * the given namespace.
     *
     * @param reader the reader to read from
     * @param format the input file format
     * @param symParser the tokenizer that understands the sequence symbols in the file
     * @param sf the factory that will build the sequences
     * @param ns the namespace the sequences will be loaded into.
     */
    public RichStreamReader(BufferedReader reader,
            RichSequenceFormat format,
            SymbolTokenization symParser,
            RichSequenceBuilderFactory sf,
            Namespace ns) {
        this.reader = reader;
        this.format = format;
        this.symParser = symParser;
        this.sf = sf;
        this.ns = ns;
    }
}