001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.program.phred;
023
024import java.io.BufferedReader;
025import java.io.IOException;
026import java.io.PrintStream;
027import java.io.Serializable;
028import java.util.NoSuchElementException;
029import java.util.Vector;
030
031import org.biojava.bio.seq.Sequence;
032import org.biojava.bio.seq.io.ParseException;
033import org.biojava.bio.seq.io.SeqIOListener;
034import org.biojava.bio.seq.io.SequenceFormat;
035import org.biojava.bio.seq.io.StreamParser;
036import org.biojava.bio.seq.io.SymbolTokenization;
037import org.biojava.bio.symbol.IllegalSymbolException;
038import org.biojava.bio.symbol.IntegerAlphabet;
039import org.biojava.utils.ParseErrorEvent;
040import org.biojava.utils.ParseErrorListener;
041import org.biojava.utils.ParseErrorSource;
042
043/**
044 * Format object representing Phred Quality files.
045 * The only `sequence property' reported by this parser
046 * is PROPERTY_DESCRIPTIONLINE, which is the contents of the
047 * sequence's description line (the line starting with a '>'
048 * character).
049 *
050 * Essentially a rework of FastaFormat to cope with the quirks of Phred Quality data.<p>
051 * Copyright (c) 2001<p>
052 * Company:      AgResearch<p>
053 *
054 * @author Mark Schreiber
055 * @author Greg Cox
056 * @author Frans Verhoef
057 * @since 1.1
058 */
059
060public class PhredFormat implements SequenceFormat, ParseErrorSource, ParseErrorListener, Serializable {
061
062  public static final String DEFAULT = "PHRED";
063
064  private Vector mListeners = new Vector();
065
066  /**
067   * Constant string which is the property key used to notify
068   * listeners of the description lines of Phred sequences.
069   */
070
071  public final static String PROPERTY_DESCRIPTIONLINE = "description_line";
072
073  /**
074   * The line width for output.
075   */
076  private int lineWidth = 60;
077
078  /**
079   * Retrive the current line width.
080   *
081   * @return the line width
082   */
083
084  public int getLineWidth() {
085    return lineWidth;
086  }
087
088  /**
089   * Set the line width.
090   * <p>
091   * When writing, the lines of sequence will never be longer than the line
092   * width.
093   *
094   * @param width the new line width
095   */
096
097  public void setLineWidth(int width) {
098    this.lineWidth = width;
099  }
100
101  public boolean readSequence(BufferedReader reader,
102  SymbolTokenization symParser,
103  SeqIOListener siol)
104  throws IllegalSymbolException, IOException, ParseException {
105    String line = reader.readLine();
106    if (line == null) {
107      throw new IOException("Premature stream end");
108    }
109    if (!line.startsWith(">")) {
110      throw new IOException("Stream does not appear to contain Phred formatted data: " + line);
111    }
112
113    siol.startSequence();
114
115    String description = line.substring(1).trim();
116    siol.addSequenceProperty(PROPERTY_DESCRIPTIONLINE, description);
117
118    boolean seenEOF = readSequenceData(reader, symParser, siol);
119    siol.endSequence();
120
121    return !seenEOF;
122  }
123
124  private boolean readSequenceData(BufferedReader br,
125  SymbolTokenization parser,
126  SeqIOListener listener)
127  throws IOException, IllegalSymbolException {
128    char[] buffer = new char[256];
129    StreamParser sparser = parser.parseStream(listener);
130    boolean seenEOF = false; //reached the end of the file
131    boolean reachedEnd = false; //reached the end of this sequence
132
133    while(reachedEnd == false){// while more sequence
134      br.mark(buffer.length); // mark the read ahead limit
135      int bytesRead = br.read(buffer,0,buffer.length); // read into the buffer
136      while(Character.isDigit(buffer[buffer.length -1])){// may have ended halfway through a number
137        br.reset();// if so reset
138        buffer = new char[buffer.length+64]; //make the buffer a little bigger
139        br.mark(buffer.length); //mark the new read ahead limit
140        bytesRead = br.read(buffer,0,buffer.length); //read into buffer
141      }
142      if(bytesRead < 0){ //ie -1 indicates end of file
143        seenEOF = reachedEnd = true;
144      }else{ // otherwise
145
146        int parseEnd = 0;
147
148        // while more sequence and more chars in the buffer and not a new sequence
149        while(!reachedEnd && parseEnd < bytesRead && buffer[parseEnd] != '>'){
150          ++parseEnd;
151        }
152        sparser.characters(buffer,0,parseEnd);
153
154        //If found the start of a new sequence
155        if(parseEnd < bytesRead && buffer[parseEnd] == '>'){
156          br.reset(); // reset the reader
157          // then skip the file reading pointer to the start of the new sequence ready for the
158          //next read (if required).
159          if(br.skip(parseEnd) != parseEnd) throw new IOException("Couldn't reset to start of next sequence");
160          reachedEnd = true; //found the end of this sequence.
161        }
162      }
163    }
164
165    sparser.close();
166    return seenEOF;
167  }
168
169  /**
170   * Return a suitable description line for a Sequence. If the
171   * sequence's annotation bundle contains PROPERTY_DESCRIPTIONLINE,
172   * this is used verbatim.  Otherwise, the sequence's name is used.
173   */
174
175  protected String describeSequence(Sequence seq) {
176    String description = null;
177    try {
178      description = seq.getAnnotation().getProperty(PROPERTY_DESCRIPTIONLINE).toString();
179    } catch (NoSuchElementException ex) {
180      description = seq.getName();
181    }
182    return description;
183  }
184
185  /**
186   * This method will print symbols to the line width followed by a
187   * new line etc.  NOTE that an integer symbol does not always
188   * correspond to one character therefore a line width of sixty
189   * will print sixty characters followed by a new line. Not
190   * necessarily sixty integers.
191   */
192  public void writeSequence(Sequence seq, PrintStream os)
193  throws IOException {
194    os.print(">");
195    os.println(describeSequence(seq));
196
197    StringBuffer line = new StringBuffer();
198    int seqLen = seq.length();
199
200    for (int i = 1; i <= seqLen; i++) {
201      int val = ((IntegerAlphabet.IntegerSymbol)seq.symbolAt(i)).intValue();
202      String s = Integer.toString(val);
203      if ((line.length() + s.length()) > lineWidth) {
204        os.println(line.substring(0));
205        line = new StringBuffer();
206      }
207      line.append(s + " ");
208    }
209  }
210
211  /**
212   * <code>writeSequence</code> writes a sequence to the specified
213   * <code>PrintStream</code>, using the specified format.
214   *
215   * @param seq a <code>Sequence</code> to write out.
216   * @param format a <code>String</code> indicating which sub-format
217   * of those available from a particular
218   * <code>SequenceFormat</code> implemention to use when
219   * writing.
220   * @param os a <code>PrintStream</code> object.
221   *
222   * @exception IOException if an error occurs.
223   * @deprecated use writeSequence(Sequence seq, PrintStream os)
224   */
225  public void writeSequence(Sequence seq, String format, PrintStream os)
226  throws IOException {
227    if (! format.equalsIgnoreCase(getDefaultFormat()))
228      throw new IllegalArgumentException("Unknown format '"
229      + format
230      + "'");
231    writeSequence(seq, os);
232  }
233
234  /**
235   * <code>getDefaultFormat</code> returns the String identifier for
236   * the default format.
237   *
238   * @return a <code>String</code>.
239   * @deprecated
240   */
241  public String getDefaultFormat() {
242    return DEFAULT;
243  }
244
245  /**
246   * Adds a parse error listener to the list of listeners if it isn't already
247   * included.
248   *
249   * @param theListener Listener to be added.
250   */
251  public synchronized void addParseErrorListener(ParseErrorListener theListener) {
252    if (mListeners.contains(theListener) == false) {
253      mListeners.addElement(theListener);
254    }
255  }
256
257  /**
258   * Removes a parse error listener from the list of listeners if it is
259   * included.
260   *
261   * @param theListener Listener to be removed.
262   */
263  public synchronized void removeParseErrorListener(ParseErrorListener theListener) {
264    if (mListeners.contains(theListener) == true) {
265      mListeners.removeElement(theListener);
266    }
267  }
268
269  /**
270   * This method determines the behaviour when a bad line is processed.
271   * Some options are to log the error, throw an exception, ignore it
272   * completely, or pass the event through.
273   * <p>
274   * This method should be overwritten when different behavior is desired.
275   *
276   * @param theEvent The event that contains the bad line and token.
277   */
278  public void BadLineParsed(org.biojava.utils.ParseErrorEvent theEvent) {
279    notifyParseErrorEvent(theEvent);
280  }
281
282  // Protected methods
283  /**
284   * Passes the event on to all the listeners registered for ParseErrorEvents.
285   *
286   * @param theEvent The event to be handed to the listeners.
287   */
288  protected void notifyParseErrorEvent(ParseErrorEvent theEvent) {
289    Vector listeners;
290    synchronized(this) {
291      listeners = (Vector)mListeners.clone();
292    }
293
294    for (int index = 0; index < listeners.size(); index++) {
295      ParseErrorListener client = (ParseErrorListener)listeners.elementAt(index);
296      client.BadLineParsed(theEvent);
297    }
298  }
299
300}