001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.seq.io; 023 024import java.io.BufferedReader; 025import java.io.IOException; 026import java.io.PrintStream; 027import java.io.Serializable; 028import java.util.Vector; 029import java.util.regex.Matcher; 030import java.util.regex.Pattern; 031 032import org.biojava.bio.Annotation; 033import org.biojava.bio.seq.Sequence; 034import org.biojava.bio.symbol.IllegalSymbolException; 035import org.biojava.utils.ParseErrorEvent; 036import org.biojava.utils.ParseErrorListener; 037 038 039/** 040 * Format object representing FASTA files. These files are almost pure 041 * sequence data. The only `sequence property' reported by this parser 042 * is PROPERTY_DESCRIPTIONLINE, which is the contents of the 043 * sequence's description line (the line starting with a '>' 044 * character). Normally, the first word of this is a sequence ID. If 045 * you wish it to be interpreted as such, you should use 046 * FastaDescriptionLineParser as a SeqIO filter. 047 * 048 * If you pass it a RichSeqIOListener, you'll get RichSequence objects 049 * in return. Likewise, if you write RichSequence objects, you'll get 050 * absolutely correct FASTA formatted output. 051 * 052 * @author Thomas Down 053 * @author Matthew Pocock 054 * @author Greg Cox 055 * @author Lukas Kall 056 * @author Richard Holland 057 * @author Mark Schreiber 058 * @deprecated Use org.biojavax.bio.seq.io.FastaFormat 059 */ 060 061public class FastaFormat implements SequenceFormat, 062 Serializable, 063 org.biojava.utils.ParseErrorListener, 064 org.biojava.utils.ParseErrorSource { 065 public static final String DEFAULT = "FASTA"; 066 067 /** 068 * Constant string which is the property key used to notify 069 * listeners of the description lines of FASTA sequences. 070 */ 071 public final static String PROPERTY_DESCRIPTIONLINE = "description_line"; 072 073 protected Vector mListeners = new Vector(); 074 075 /** 076 * The line width for output. 077 */ 078 protected int lineWidth = 60; 079 080 /** 081 * Retrive the current line width. 082 * 083 * @return the line width 084 */ 085 public int getLineWidth() { 086 return lineWidth; 087 } 088 089 /** 090 * Set the line width. 091 * <p> 092 * When writing, the lines of sequence will never be longer than the line 093 * width. 094 * 095 * @param width the new line width 096 */ 097 public void setLineWidth(int width) { 098 this.lineWidth = width; 099 } 100 101 /** 102 * Reads information from a flatfile to a <code>SeqIOListener</code> 103 * using a <code>SymbolTokenizer</code> to convert sequence strings 104 * to <code>Symbol</code> objects. 105 * @param reader The reader that is the source of the information 106 * @param symParser converts text seqeunce to biojava objects 107 * @param siol The listener that listens for event callbacks from this class. 108 * The listener can be a <code>RichSeqIOListener</code>. 109 * @throws org.biojava.bio.symbol.IllegalSymbolException if <code>symParser</code> 110 * doesn't know how to convert the text sequence into biojava <code>Symbol</code>s 111 * @throws java.io.IOException if there is a problem reading. 112 * @throws org.biojava.bio.seq.io.ParseException if the source cannot be parsed. 113 * @return true if there is another unread sequence in the source. 114 */ 115 public boolean readSequence( 116 BufferedReader reader, 117 SymbolTokenization symParser, 118 SeqIOListener siol 119 ) throws 120 IllegalSymbolException, 121 IOException, 122 ParseException { 123 String line = reader.readLine(); 124 if (line == null) { 125 throw new IOException("Premature stream end"); 126 } 127 while(line.length() == 0) { 128 line = reader.readLine(); 129 if (line == null) { 130 throw new IOException("Premature stream end"); 131 } 132 } 133 if (!line.startsWith(">")) { 134 throw new IOException("Stream does not appear to contain FASTA formatted data: " + line); 135 } 136 137 siol.startSequence(); 138 139 String description = line.substring(1).trim(); 140 141 String regex = "(\\S+)(\\s+(.*))*"; 142 Pattern p = Pattern.compile(regex); 143 Matcher m = p.matcher(description); 144 if (!m.matches()) { 145 throw new IOException("Stream does not appear to contain FASTA formatted data: " + line); 146 } 147 148 String name = m.group(1); 149 150 siol.setName(name); 151 siol.addSequenceProperty(PROPERTY_DESCRIPTIONLINE, description); 152 153 boolean seenEOF = readSequenceData(reader, symParser, siol); 154 siol.endSequence(); 155 156 return !seenEOF; 157 } 158 159 private boolean readSequenceData( 160 BufferedReader r, 161 SymbolTokenization parser, 162 SeqIOListener listener 163 ) throws 164 IOException, 165 IllegalSymbolException { 166 char[] cache = new char[512]; 167 boolean reachedEnd = false, seenEOF = false; 168 StreamParser sparser = parser.parseStream(listener); 169 170 while (!reachedEnd) { 171 r.mark(cache.length + 1); 172 int bytesRead = r.read(cache, 0, cache.length); 173 if (bytesRead < 0) { 174 reachedEnd = seenEOF = true; 175 } else { 176 int parseStart = 0; 177 int parseEnd = 0; 178 while (!reachedEnd && parseStart < bytesRead && cache[parseStart] != '>') { 179 parseEnd = parseStart; 180 181 while (parseEnd < bytesRead && 182 cache[parseEnd] != '\n' && 183 cache[parseEnd] != '\r' 184 ) { 185 ++parseEnd; 186 } 187 188 sparser.characters(cache, parseStart, parseEnd - parseStart); 189 190 parseStart = parseEnd + 1; 191 while (parseStart < bytesRead && 192 (cache[parseStart] == '\n' || 193 cache[parseStart] == '\r') ) { 194 ++parseStart; 195 } 196 } 197 if (parseStart < bytesRead && cache[parseStart] == '>') { 198 try { 199 r.reset(); 200 } catch (IOException ioe) { 201 throw new IOException( 202 "Can't reset: " + 203 ioe.getMessage() + 204 " parseStart=" + parseStart + 205 " bytesRead=" + bytesRead 206 ); 207 } 208 if (r.skip(parseStart) != parseStart) { 209 throw new IOException("Couldn't reset to start of next sequence"); 210 } 211 reachedEnd = true; 212 } 213 } 214 } 215 216 sparser.close(); 217 return seenEOF; 218 } 219 220 /** 221 * Return a suitable description line for a Sequence. If the 222 * sequence's annotation bundle contains PROPERTY_DESCRIPTIONLINE, 223 * this is used verbatim. Otherwise, the sequence's name is used. 224 */ 225 protected String describeSequence(Sequence seq) { 226 String description = null; 227 228 Annotation seqAnn = seq.getAnnotation(); 229 230 if(seqAnn.containsProperty(PROPERTY_DESCRIPTIONLINE)) { 231 description = (String) seqAnn.getProperty(PROPERTY_DESCRIPTIONLINE); 232 } else { 233 description = seq.getName(); 234 } 235 236 return description; 237 } 238 239 /** 240 * Writes a <code>Sequence</code> or <code>RichSequence</code> to a 241 * <code>PrintStream</code> in FASTA format. If the sequence is a 242 * <code>RichSequence</code> the format of the header will be in line with 243 * the NCBI standard. 244 * @param seq the sequence to format 245 * @param os the stream to write the sequence to. To print to screen use 246 * <code>System.out</code> 247 * @throws java.io.IOException if data cannot be written to <code>os</code> 248 */ 249 public void writeSequence(Sequence seq, PrintStream os) 250 throws IOException { 251 os.print(">"); 252 os.println(describeSequence(seq)); 253 254 int length = seq.length(); 255 256 for (int pos = 1; pos <= length; pos += lineWidth) { 257 int end = Math.min(pos + lineWidth - 1, length); 258 os.println(seq.subStr(pos, end)); 259 } 260 } 261 262 /** 263 * <code>writeSequence</code> writes a sequence to the specified 264 * <code>PrintStream</code>, using the specified format. 265 * 266 * @param seq a <code>Sequence</code> to write out. 267 * @param format a <code>String</code> indicating which sub-format 268 * of those available from a particular 269 * <code>SequenceFormat</code> implemention to use when 270 * writing. 271 * @param os a <code>PrintStream</code> object. 272 * 273 * @exception IOException if an error occurs. 274 * @deprecated use writeSequence(Sequence seq, PrintStream os) 275 */ 276 public void writeSequence(Sequence seq, String format, PrintStream os) 277 throws IOException { 278 if (! format.equalsIgnoreCase(getDefaultFormat())) 279 throw new IllegalArgumentException("Unknown format '" 280 + format 281 + "'"); 282 writeSequence(seq, os); 283 } 284 285 /** 286 * <code>getDefaultFormat</code> returns the String identifier for 287 * the default format. 288 * 289 * @return a <code>String</code>. 290 * @deprecated 291 */ 292 public String getDefaultFormat() { 293 return DEFAULT; 294 } 295 296 /** 297 * Adds a parse error listener to the list of listeners if it isn't already 298 * included. 299 * 300 * @param theListener Listener to be added. 301 */ 302 public synchronized void addParseErrorListener(ParseErrorListener theListener) { 303 if (mListeners.contains(theListener) == false) { 304 mListeners.addElement(theListener); 305 } 306 } 307 308 /** 309 * Removes a parse error listener from the list of listeners if it is 310 * included. 311 * 312 * @param theListener Listener to be removed. 313 */ 314 public synchronized void removeParseErrorListener(ParseErrorListener theListener) { 315 if (mListeners.contains(theListener) == true) { 316 mListeners.removeElement(theListener); 317 } 318 } 319 320 /** 321 * This method determines the behaviour when a bad line is processed. 322 * Some options are to log the error, throw an exception, ignore it 323 * completely, or pass the event through. 324 * <p> 325 * This method should be overwritten when different behavior is desired. 326 * 327 * @param theEvent The event that contains the bad line and token. 328 */ 329 public void BadLineParsed(org.biojava.utils.ParseErrorEvent theEvent) { 330 notifyParseErrorEvent(theEvent); 331 } 332 333 // Protected methods 334 /** 335 * Passes the event on to all the listeners registered for ParseErrorEvents. 336 * 337 * @param theEvent The event to be handed to the listeners. 338 */ 339 protected void notifyParseErrorEvent(ParseErrorEvent theEvent) { 340 Vector listeners; 341 synchronized(this) { 342 listeners = (Vector)mListeners.clone(); 343 } 344 345 for (int index = 0; index < listeners.size(); index++) { 346 ParseErrorListener client = (ParseErrorListener)listeners.elementAt(index); 347 client.BadLineParsed(theEvent); 348 } 349 } 350}