001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.seq.io; 023 024import java.io.BufferedReader; 025import java.io.IOException; 026import java.io.PrintStream; 027import java.io.Serializable; 028import java.util.ArrayList; 029import java.util.Vector; 030 031import org.biojava.bio.seq.Sequence; 032import org.biojava.bio.symbol.IllegalSymbolException; 033import org.biojava.utils.ChangeVetoException; 034import org.biojava.utils.ParseErrorEvent; 035import org.biojava.utils.ParseErrorListener; 036import org.biojava.utils.ParseErrorSource; 037 038/** 039 * <p> 040 * Format processor for handling EMBL records and similar files. This 041 * takes a very simple approach: all `normal' attribute lines are 042 * passed to the listener as a tag (first two characters) and a value 043 * (the rest of the line from the 6th character onwards). Any data 044 * between the special `SQ' line and the "//" entry terminator is 045 * passed as a SymbolReader. 046 * </p> 047 * 048 * <p> 049 * This low-level format processor should normally be used in 050 * conjunction with one or more `filter' objects, such as 051 * EmblProcessor. 052 * </p> 053 * 054 * <p> 055 * Many ideas borrowed from the old EmblFormat processor by Thomas 056 * Down and Thad Welch. 057 * </p> 058 * 059 * @author Thomas Down 060 * @author Greg Cox 061 * @author Keith James 062 * @author Len Trigg 063 * @author Lorna Morris 064 * @since 1.1 065 * @deprecated Use org.biojavax.bio.seq.io.EMBLFormat instead 066 */ 067 068public class EmblLikeFormat implements 069 SequenceFormat, 070 Serializable, 071 ParseErrorSource, 072 ParseErrorListener { 073 public static final String DEFAULT = "EMBL"; 074 075 protected static final String ID_TAG = "ID"; 076 protected static final String SIZE_TAG = "SIZE"; 077 protected static final String STRAND_NUMBER_TAG = "STRANDS"; 078 protected static final String TYPE_TAG = "TYPE"; 079 protected static final String CIRCULAR_TAG = "CIRCULAR"; 080 protected static final String DIVISION_TAG = "DIVISION"; 081 protected static final String DR_TAG = "DR"; //Lorna: new tag 082 083 protected static final String ACCESSION_TAG = "AC"; 084 protected static final String VERSION_TAG = "SV"; 085 protected static final String DATE_TAG = "DT"; 086 protected static final String DEFINITION_TAG = "DE"; 087 protected static final String KEYWORDS_TAG = "KW"; 088 protected static final String SOURCE_TAG = "OS"; 089 protected static final String ORGANISM_TAG = "OC"; 090 protected static final String ORGANISM_XREF_TAG = "OX"; 091 protected static final String REFERENCE_TAG = "RN"; 092 protected static final String COORDINATE_TAG = "RP"; 093 protected static final String REF_ACCESSION_TAG = "RX"; 094 protected static final String AUTHORS_TAG = "RA"; 095 protected static final String REF_XREF_TAG = "RX"; 096 protected static final String TITLE_TAG = "RT"; 097 protected static final String JOURNAL_TAG = "RL"; 098 protected static final String COMMENT_TAG = "CC"; 099 protected static final String FEATURE_TAG = "FH"; 100 protected static final String SEPARATOR_TAG = "XX"; 101 protected static final String FEATURE_TABLE_TAG = "FT"; 102 protected static final String START_SEQUENCE_TAG = "SQ"; 103 protected static final String END_SEQUENCE_TAG = "//"; 104 105 private boolean elideSymbols = false; 106 private Vector mListeners = new Vector(); 107 108 /** 109 * <p>Specifies whether the symbols (SQ) part of the entry should 110 * be ignored. If this property is set to <code>true</code>, the 111 * parser will never call addSymbols on the 112 * <code>SeqIOListener</code>, but parsing will be faster if 113 * you're only interested in header information.</p> 114 * 115 * <p> This property also allows the header to be parsed for files 116 * which have invalid sequence data.</p> 117 */ 118 public void setElideSymbols(boolean b) { 119 elideSymbols = b; 120 } 121 122 /** 123 * Return a flag indicating if symbol data will be skipped 124 * when parsing streams. 125 */ 126 public boolean getElideSymbols() { 127 return elideSymbols; 128 } 129 130 public boolean readSequence(BufferedReader reader, 131 SymbolTokenization symParser, 132 SeqIOListener listener) throws 133 IllegalSymbolException, IOException, ParseException { 134 135 //EmblReferenceProperty reference = null; //lorna 136 137 if (listener instanceof ParseErrorSource) { 138 ( (ParseErrorSource) (listener)).addParseErrorListener(this); 139 } 140 141 String line; 142 StreamParser sparser = null; 143 boolean hasMoreSequence = true; 144 boolean hasInternalWhitespace = false; 145 146 listener.startSequence(); 147 148 while ( (line = reader.readLine()) != null) { 149 if (line.startsWith(END_SEQUENCE_TAG)) { 150 if (sparser != null) { 151 // End of symbol data 152 sparser.close(); 153 sparser = null; 154 } 155 156 // Allows us to tolerate trailing whitespace without 157 // thinking that there is another Sequence to follow 158 while (true) { 159 reader.mark(1); 160 int c = reader.read(); 161 162 if (c == -1) { 163 hasMoreSequence = false; 164 break; 165 } 166 167 if (Character.isWhitespace( (char) c)) { 168 hasInternalWhitespace = true; 169 continue; 170 } 171 172 if (hasInternalWhitespace) 173 System.err.println( 174 "Warning: whitespace found between sequence entries"); 175 176 reader.reset(); 177 break; 178 } 179 180 listener.endSequence(); 181 return hasMoreSequence; 182 } 183 else if (line.startsWith(START_SEQUENCE_TAG)) { 184 // Adding a null property to flush the last feature; 185 // Needed for Swissprot files because there is no gap 186 // between the feature table and the sequence data 187 listener.addSequenceProperty(SEPARATOR_TAG, ""); 188 189 sparser = symParser.parseStream(listener); 190 } 191 else { 192 if (sparser == null) { 193 // Normal attribute line 194 String tag = line.substring(0, 2); 195 String rest = null; 196 if (line.length() > 5) { 197 rest = line.substring(5); 198 } 199 200 if (tag.equals(REFERENCE_TAG)) { //only 1 reference_tag! 201 202 try { 203 //lorna added, tags read in order, when a complete set goes through, 204 //spit out a single annotation event 205 ReferenceAnnotation refAnnot = new ReferenceAnnotation(); 206 207 refAnnot.setProperty(tag, rest); 208 while (! (tag.equals(SEPARATOR_TAG))) { 209 // Normal attribute line 210 211 line = reader.readLine(); 212 213 tag = line.substring(0, 2); 214 215 if (line.length() > 5) { 216 rest = line.substring(5); 217 } 218 else { 219 rest = null; //for XX lines 220 } 221 222 if (refAnnot.containsProperty(tag)) { 223 224 Object property = refAnnot.getProperty(tag); 225 ArrayList properties; 226 227 if (property instanceof String) { 228 properties = new ArrayList(); 229 properties.add(property); 230 properties.add(rest); 231 refAnnot.setProperty(tag, properties); 232 } 233 if (property instanceof ArrayList) { 234 ( (ArrayList) property).add(rest); 235 } 236 } 237 else { 238 refAnnot.setProperty(tag, rest); 239 } 240 //mark_s: required for parsing swissprot 241 //fixme: it is actually possible to have more than one JOURNAL_TAG 242 //so should really only break after the last one. 243 if(tag.equals(JOURNAL_TAG)) 244 break; 245 } 246 listener.addSequenceProperty(ReferenceAnnotation.class, refAnnot); 247 248 } catch (ChangeVetoException cve) { 249 cve.printStackTrace(); 250 } 251 252 } 253 // lorna, end 254 else { //lorna 255 listener.addSequenceProperty(tag, rest); 256 } //lorna 257 } 258 else { 259 // Sequence line 260 if (!elideSymbols) 261 processSequenceLine(line, sparser); 262 } 263 } 264 } 265 266 if (sparser != null) 267 sparser.close(); 268 269 throw new IOException( 270 "Premature end of stream or missing end tag '//' for EMBL"); 271 } 272 273 /** 274 * Dispatch symbol data from SQ-block line of an EMBL-like file. 275 */ 276 protected void processSequenceLine(String line, StreamParser parser) throws 277 IllegalSymbolException, ParseException { 278 char[] cline = line.toCharArray(); 279 int parseStart = 0; 280 int parseEnd = 0; 281 282 while (parseStart < cline.length) { 283 while (parseStart < cline.length && cline[parseStart] == ' ') 284 ++ 285 parseStart; 286 if (parseStart >= cline.length) 287 break; 288 289 if (Character.isDigit(cline[parseStart])) 290 return; 291 292 parseEnd = parseStart + 1; 293 while (parseEnd < cline.length && cline[parseEnd] != ' ') { 294 if (cline[parseEnd] == '.' || cline[parseEnd] == '~') { 295 cline[parseEnd] = '-'; 296 } 297 ++parseEnd; 298 } 299 300 // Got a segment of read sequence data 301 parser.characters(cline, parseStart, parseEnd - parseStart); 302 303 parseStart = parseEnd; 304 } 305 } 306 307 public void writeSequence(Sequence seq, PrintStream os) throws IOException { 308 writeSequence(seq, getDefaultFormat(), os); 309 } 310 311 /** 312 * <code>writeSequence</code> writes a sequence to the specified 313 * <code>PrintStream</code>, using the specified format. 314 * 315 * @param seq a <code>Sequence</code> to write out. 316 * @param format a <code>String</code> indicating which sub-format 317 * of those available from a particular 318 * <code>SequenceFormat</code> implemention to use when 319 * writing. 320 * @param os a <code>PrintStream</code> object. 321 * 322 * @exception IOException if an error occurs. 323 * @deprecated use writeSequence(Sequence seq, PrintStream os) 324 */ 325 public void writeSequence(Sequence seq, String format, PrintStream os) throws 326 IOException { 327 SeqFileFormer former; 328 329 if (format.equalsIgnoreCase("EMBL")) 330 former = new EmblFileFormer(); 331 else if (format.equalsIgnoreCase("SWISSPROT")) 332 former = new SwissprotFileFormer(); 333 else 334 throw new IllegalArgumentException("Unknown format '" 335 + format 336 + "'"); 337 former.setPrintStream(os); 338 339 SeqIOEventEmitter emitter = 340 new SeqIOEventEmitter(GenEmblPropertyComparator.INSTANCE, 341 GenEmblFeatureComparator.INSTANCE); 342 343 emitter.getSeqIOEvents(seq, former); 344 } 345 346 /** 347 * <code>getDefaultFormat</code> returns the String identifier for 348 * the default format written by a <code>SequenceFormat</code> 349 * implementation. 350 * 351 * @return a <code>String</code>. 352 * @deprecated 353 */ 354 public String getDefaultFormat() { 355 return DEFAULT; 356 } 357 358 /** 359 * <p> 360 * This method determines the behaviour when a bad line is processed. 361 * Some options are to log the error, throw an exception, ignore it 362 * completely, or pass the event through. 363 * </p> 364 * 365 * <p> 366 * This method should be overwritten when different behavior is desired. 367 * </p> 368 * 369 * @param theEvent The event that contains the bad line and token. 370 */ 371 public void BadLineParsed(ParseErrorEvent theEvent) { 372 notifyParseErrorEvent(theEvent); 373 } 374 375 /** 376 * Adds a parse error listener to the list of listeners if it isn't already 377 * included. 378 * 379 * @param theListener Listener to be added. 380 */ 381 public synchronized void addParseErrorListener(ParseErrorListener theListener) { 382 if (mListeners.contains(theListener) == false) { 383 mListeners.addElement(theListener); 384 } 385 } 386 387 /** 388 * Removes a parse error listener from the list of listeners if it is 389 * included. 390 * 391 * @param theListener Listener to be removed. 392 */ 393 public synchronized void removeParseErrorListener(ParseErrorListener 394 theListener) { 395 if (mListeners.contains(theListener) == true) { 396 mListeners.removeElement(theListener); 397 } 398 } 399 400 // Protected methods 401 /** 402 * Passes the event on to all the listeners registered for ParseErrorEvents. 403 * 404 * @param theEvent The event to be handed to the listeners. 405 */ 406 protected void notifyParseErrorEvent(ParseErrorEvent theEvent) { 407 Vector listeners; 408 synchronized (this) { 409 listeners = (Vector) mListeners.clone(); 410 } 411 412 for (int index = 0; index < listeners.size(); index++) { 413 ParseErrorListener client = (ParseErrorListener) listeners.elementAt( 414 index); 415 client.BadLineParsed(theEvent); 416 } 417 } 418}