001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.program.phred; 023 024import java.io.BufferedReader; 025import java.io.IOException; 026import java.io.PrintStream; 027import java.io.Serializable; 028import java.util.NoSuchElementException; 029import java.util.Vector; 030 031import org.biojava.bio.seq.Sequence; 032import org.biojava.bio.seq.io.ParseException; 033import org.biojava.bio.seq.io.SeqIOListener; 034import org.biojava.bio.seq.io.SequenceFormat; 035import org.biojava.bio.seq.io.StreamParser; 036import org.biojava.bio.seq.io.SymbolTokenization; 037import org.biojava.bio.symbol.IllegalSymbolException; 038import org.biojava.bio.symbol.IntegerAlphabet; 039import org.biojava.utils.ParseErrorEvent; 040import org.biojava.utils.ParseErrorListener; 041import org.biojava.utils.ParseErrorSource; 042 043/** 044 * Format object representing Phred Quality files. 045 * The only `sequence property' reported by this parser 046 * is PROPERTY_DESCRIPTIONLINE, which is the contents of the 047 * sequence's description line (the line starting with a '>' 048 * character). 049 * 050 * Essentially a rework of FastaFormat to cope with the quirks of Phred Quality data.<p> 051 * Copyright (c) 2001<p> 052 * Company: AgResearch<p> 053 * 054 * @author Mark Schreiber 055 * @author Greg Cox 056 * @author Frans Verhoef 057 * @since 1.1 058 */ 059 060public class PhredFormat implements SequenceFormat, ParseErrorSource, ParseErrorListener, Serializable { 061 062 public static final String DEFAULT = "PHRED"; 063 064 private Vector mListeners = new Vector(); 065 066 /** 067 * Constant string which is the property key used to notify 068 * listeners of the description lines of Phred sequences. 069 */ 070 071 public final static String PROPERTY_DESCRIPTIONLINE = "description_line"; 072 073 /** 074 * The line width for output. 075 */ 076 private int lineWidth = 60; 077 078 /** 079 * Retrive the current line width. 080 * 081 * @return the line width 082 */ 083 084 public int getLineWidth() { 085 return lineWidth; 086 } 087 088 /** 089 * Set the line width. 090 * <p> 091 * When writing, the lines of sequence will never be longer than the line 092 * width. 093 * 094 * @param width the new line width 095 */ 096 097 public void setLineWidth(int width) { 098 this.lineWidth = width; 099 } 100 101 public boolean readSequence(BufferedReader reader, 102 SymbolTokenization symParser, 103 SeqIOListener siol) 104 throws IllegalSymbolException, IOException, ParseException { 105 String line = reader.readLine(); 106 if (line == null) { 107 throw new IOException("Premature stream end"); 108 } 109 if (!line.startsWith(">")) { 110 throw new IOException("Stream does not appear to contain Phred formatted data: " + line); 111 } 112 113 siol.startSequence(); 114 115 String description = line.substring(1).trim(); 116 siol.addSequenceProperty(PROPERTY_DESCRIPTIONLINE, description); 117 118 boolean seenEOF = readSequenceData(reader, symParser, siol); 119 siol.endSequence(); 120 121 return !seenEOF; 122 } 123 124 private boolean readSequenceData(BufferedReader br, 125 SymbolTokenization parser, 126 SeqIOListener listener) 127 throws IOException, IllegalSymbolException { 128 char[] buffer = new char[256]; 129 StreamParser sparser = parser.parseStream(listener); 130 boolean seenEOF = false; //reached the end of the file 131 boolean reachedEnd = false; //reached the end of this sequence 132 133 while(reachedEnd == false){// while more sequence 134 br.mark(buffer.length); // mark the read ahead limit 135 int bytesRead = br.read(buffer,0,buffer.length); // read into the buffer 136 while(Character.isDigit(buffer[buffer.length -1])){// may have ended halfway through a number 137 br.reset();// if so reset 138 buffer = new char[buffer.length+64]; //make the buffer a little bigger 139 br.mark(buffer.length); //mark the new read ahead limit 140 bytesRead = br.read(buffer,0,buffer.length); //read into buffer 141 } 142 if(bytesRead < 0){ //ie -1 indicates end of file 143 seenEOF = reachedEnd = true; 144 }else{ // otherwise 145 146 int parseEnd = 0; 147 148 // while more sequence and more chars in the buffer and not a new sequence 149 while(!reachedEnd && parseEnd < bytesRead && buffer[parseEnd] != '>'){ 150 ++parseEnd; 151 } 152 sparser.characters(buffer,0,parseEnd); 153 154 //If found the start of a new sequence 155 if(parseEnd < bytesRead && buffer[parseEnd] == '>'){ 156 br.reset(); // reset the reader 157 // then skip the file reading pointer to the start of the new sequence ready for the 158 //next read (if required). 159 if(br.skip(parseEnd) != parseEnd) throw new IOException("Couldn't reset to start of next sequence"); 160 reachedEnd = true; //found the end of this sequence. 161 } 162 } 163 } 164 165 sparser.close(); 166 return seenEOF; 167 } 168 169 /** 170 * Return a suitable description line for a Sequence. If the 171 * sequence's annotation bundle contains PROPERTY_DESCRIPTIONLINE, 172 * this is used verbatim. Otherwise, the sequence's name is used. 173 */ 174 175 protected String describeSequence(Sequence seq) { 176 String description = null; 177 try { 178 description = seq.getAnnotation().getProperty(PROPERTY_DESCRIPTIONLINE).toString(); 179 } catch (NoSuchElementException ex) { 180 description = seq.getName(); 181 } 182 return description; 183 } 184 185 /** 186 * This method will print symbols to the line width followed by a 187 * new line etc. NOTE that an integer symbol does not always 188 * correspond to one character therefore a line width of sixty 189 * will print sixty characters followed by a new line. Not 190 * necessarily sixty integers. 191 */ 192 public void writeSequence(Sequence seq, PrintStream os) 193 throws IOException { 194 os.print(">"); 195 os.println(describeSequence(seq)); 196 197 StringBuffer line = new StringBuffer(); 198 int seqLen = seq.length(); 199 200 for (int i = 1; i <= seqLen; i++) { 201 int val = ((IntegerAlphabet.IntegerSymbol)seq.symbolAt(i)).intValue(); 202 String s = Integer.toString(val); 203 if ((line.length() + s.length()) > lineWidth) { 204 os.println(line.substring(0)); 205 line = new StringBuffer(); 206 } 207 line.append(s + " "); 208 } 209 } 210 211 /** 212 * <code>writeSequence</code> writes a sequence to the specified 213 * <code>PrintStream</code>, using the specified format. 214 * 215 * @param seq a <code>Sequence</code> to write out. 216 * @param format a <code>String</code> indicating which sub-format 217 * of those available from a particular 218 * <code>SequenceFormat</code> implemention to use when 219 * writing. 220 * @param os a <code>PrintStream</code> object. 221 * 222 * @exception IOException if an error occurs. 223 * @deprecated use writeSequence(Sequence seq, PrintStream os) 224 */ 225 public void writeSequence(Sequence seq, String format, PrintStream os) 226 throws IOException { 227 if (! format.equalsIgnoreCase(getDefaultFormat())) 228 throw new IllegalArgumentException("Unknown format '" 229 + format 230 + "'"); 231 writeSequence(seq, os); 232 } 233 234 /** 235 * <code>getDefaultFormat</code> returns the String identifier for 236 * the default format. 237 * 238 * @return a <code>String</code>. 239 * @deprecated 240 */ 241 public String getDefaultFormat() { 242 return DEFAULT; 243 } 244 245 /** 246 * Adds a parse error listener to the list of listeners if it isn't already 247 * included. 248 * 249 * @param theListener Listener to be added. 250 */ 251 public synchronized void addParseErrorListener(ParseErrorListener theListener) { 252 if (mListeners.contains(theListener) == false) { 253 mListeners.addElement(theListener); 254 } 255 } 256 257 /** 258 * Removes a parse error listener from the list of listeners if it is 259 * included. 260 * 261 * @param theListener Listener to be removed. 262 */ 263 public synchronized void removeParseErrorListener(ParseErrorListener theListener) { 264 if (mListeners.contains(theListener) == true) { 265 mListeners.removeElement(theListener); 266 } 267 } 268 269 /** 270 * This method determines the behaviour when a bad line is processed. 271 * Some options are to log the error, throw an exception, ignore it 272 * completely, or pass the event through. 273 * <p> 274 * This method should be overwritten when different behavior is desired. 275 * 276 * @param theEvent The event that contains the bad line and token. 277 */ 278 public void BadLineParsed(org.biojava.utils.ParseErrorEvent theEvent) { 279 notifyParseErrorEvent(theEvent); 280 } 281 282 // Protected methods 283 /** 284 * Passes the event on to all the listeners registered for ParseErrorEvents. 285 * 286 * @param theEvent The event to be handed to the listeners. 287 */ 288 protected void notifyParseErrorEvent(ParseErrorEvent theEvent) { 289 Vector listeners; 290 synchronized(this) { 291 listeners = (Vector)mListeners.clone(); 292 } 293 294 for (int index = 0; index < listeners.size(); index++) { 295 ParseErrorListener client = (ParseErrorListener)listeners.elementAt(index); 296 client.BadLineParsed(theEvent); 297 } 298 } 299 300}