001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojavax.bio.seq.io; 023 024import java.io.BufferedInputStream; 025import java.io.BufferedReader; 026import java.io.File; 027import java.io.FileReader; 028import java.io.IOException; 029import java.io.InputStreamReader; 030import java.io.PrintStream; 031import java.io.PrintWriter; 032import java.util.ArrayList; 033import java.util.Collection; 034import java.util.Iterator; 035import java.util.LinkedHashMap; 036import java.util.List; 037import java.util.Map; 038import java.util.Set; 039import java.util.TreeMap; 040import java.util.TreeSet; 041import java.util.regex.Pattern; 042 043import javax.xml.parsers.ParserConfigurationException; 044 045import org.biojava.bio.seq.Sequence; 046import org.biojava.bio.seq.io.ParseException; 047import org.biojava.bio.seq.io.SeqIOListener; 048import org.biojava.bio.seq.io.SymbolTokenization; 049import org.biojava.bio.symbol.IllegalSymbolException; 050import org.biojava.bio.symbol.SimpleSymbolList; 051import org.biojava.bio.symbol.Symbol; 052import org.biojava.bio.symbol.SymbolList; 053import org.biojava.utils.ChangeVetoException; 054import org.biojava.utils.xml.PrettyXMLWriter; 055import org.biojava.utils.xml.XMLWriter; 056import org.biojavax.Comment; 057import org.biojavax.CrossRef; 058import org.biojavax.DocRef; 059import org.biojavax.DocRefAuthor; 060import org.biojavax.Namespace; 061import org.biojavax.Note; 062import org.biojavax.RankedCrossRef; 063import org.biojavax.RankedDocRef; 064import org.biojavax.RichAnnotation; 065import org.biojavax.RichObjectFactory; 066import org.biojavax.SimpleCrossRef; 067import org.biojavax.SimpleDocRef; 068import org.biojavax.SimpleDocRefAuthor; 069import org.biojavax.SimpleNote; 070import org.biojavax.SimpleRankedCrossRef; 071import org.biojavax.SimpleRankedDocRef; 072import org.biojavax.SimpleRichAnnotation; 073import org.biojavax.bio.seq.Position; 074import org.biojavax.bio.seq.RichFeature; 075import org.biojavax.bio.seq.RichLocation; 076import org.biojavax.bio.seq.RichSequence; 077import org.biojavax.bio.seq.RichLocation.Strand; 078import org.biojavax.bio.taxa.NCBITaxon; 079import org.biojavax.bio.taxa.SimpleNCBITaxon; 080import org.biojavax.ontology.ComparableTerm; 081import org.biojavax.utils.StringTools; 082import org.biojavax.utils.XMLTools; 083import org.xml.sax.Attributes; 084import org.xml.sax.SAXException; 085import org.xml.sax.helpers.DefaultHandler; 086 087/** 088 * Format reader for EMBLxml files. This version of EMBLxml format will generate 089 * and write RichSequence objects. Loosely Based on code from the old, deprecated, 090 * org.biojava.bio.seq.io.GenbankXmlFormat object. 091 * 092 * Understands http://www.ebi.ac.uk/embl/dtd/EMBL_Services_V1.1.dtd 093 * 094 * @author Alan Li (code based on his work) 095 * @author Richard Holland 096 * @author Mark Schreiber 097 * @since 1.5 098 */ 099public class EMBLxmlFormat extends RichSequenceFormat.BasicFormat { 100 101 // Register this format with the format auto-guesser. 102 static { 103 RichSequence.IOTools.registerFormat(EMBLxmlFormat.class); 104 } 105 106 /** 107 * The name of this format 108 */ 109 public static final String EMBLXML_FORMAT = "EMBLxml"; 110 111 protected static final String ENTRY_GROUP_TAG = "EMBL_Services"; 112 protected static final String ENTRY_TAG = "entry"; 113 protected static final String ENTRY_ACCESSION_ATTR = "accession"; 114 protected static final String ENTRY_TAX_DIVISION_ATTR = "taxonomicDivision"; 115 protected static final String ENTRY_DATACLASS_ATTR = "dataClass"; 116 protected static final String ENTRY_CREATED_ATTR = "created"; 117 protected static final String ENTRY_RELCREATED_ATTR = "releaseCreated"; 118 protected static final String ENTRY_UPDATED_ATTR = "lastUpdated"; 119 protected static final String ENTRY_RELUPDATED_ATTR = "releaseLastUpdated"; 120 protected static final String ENTRY_VER_ATTR = "version"; 121 protected static final String ENTRY_SUBACC_ATTR = "submitterAccession"; 122 protected static final String ENTRY_SUBVER_ATTR = "submitterVersion"; 123 protected static final String ENTRY_SUBWGSVER_ATTR = "submitterWgsVersion"; 124 protected static final String ENTRY_STATUS_ATTR = "status"; 125 protected static final String ENTRY_STATUS_DATE_ATTR = "statusDate"; 126 127 protected static final String SEC_ACC_TAG = "secondaryAccession"; 128 protected static final String PROJ_ACC_TAG = "projectAccession"; 129 protected static final String DESC_TAG = "description"; 130 protected static final String KEYWORD_TAG = "keyword"; 131 protected static final String REFERENCE_TAG = "reference"; 132 133 protected static final String CITATION_TAG = "citation"; 134 protected static final String CITATION_ID_ATTR = "id"; 135 protected static final String CITATION_TYPE_ATTR = "type"; 136 protected static final String CITATION_DATE_ATTR = "date"; 137 protected static final String CITATION_NAME_ATTR = "name"; 138 protected static final String CITATION_VOL_ATTR = "volume"; 139 protected static final String CITATION_ISSUE_ATTR = "issue"; 140 protected static final String CITATION_FIRST_ATTR = "first"; 141 protected static final String CITATION_LAST_ATTR = "last"; 142 protected static final String CITATION_PUB_ATTR = "publisher"; 143 protected static final String CITATION_PATENT_ATTR = "patentNumber"; 144 protected static final String CITATION_INSTITUTE_ATTR = "institute"; 145 protected static final String CITATION_YEAR_ATTR = "year"; 146 147 protected static final String DBREFERENCE_TAG = "dbreference"; 148 protected static final String DBREF_DB_ATTR = "db"; 149 protected static final String DBREF_PRIMARY_ATTR = "primary"; 150 protected static final String DBREF_SEC_ATTR = "secondary"; 151 152 protected static final String CONSORTIUM_TAG = "consortium"; 153 protected static final String TITLE_TAG = "title"; 154 protected static final String EDITOR_TAG = "editor"; 155 protected static final String AUTHOR_TAG = "author"; 156 protected static final String PATENT_TAG = "patentApplicant"; 157 protected static final String LOCATOR_TAG = "locator"; 158 159 protected static final String CITATION_LOCATION_TAG = "citationLocation"; 160 protected static final String REF_POS_BEGIN_ATTR = "begin"; 161 protected static final String REF_POS_END_ATTR = "end"; 162 163 protected static final String COMMENT_TAG = "comment"; 164 165 protected static final String FEATURE_TAG = "feature"; 166 protected static final String FEATURE_NAME_ATTR = "name"; 167 168 protected static final String ORGANISM_TAG = "organism"; 169 protected static final String SCINAME_TAG = "scientificName"; 170 protected static final String COMNAME_TAG = "preferredCommonName"; 171 protected static final String TAXID_TAG = "taxId"; 172 protected static final String LINEAGE_TAG = "lineage"; 173 protected static final String TAXON_TAG = "taxon"; 174 protected static final String ORGANELLE_TAG = "organelle"; 175 176 protected static final String QUALIFIER_TAG = "qualifier"; 177 protected static final String QUALIFIER_NAME_ATTR = "name"; 178 179 protected static final String LOCATION_TAG = "location"; 180 protected static final String LOCATION_TYPE_ATTR = "type"; 181 protected static final String LOCATION_COMPL_ATTR = "complement"; 182 183 protected static final String LOCATION_ELEMENT_TAG = "locationElement"; 184 protected static final String LOC_ELEMENT_TYPE_ATTR = "type"; 185 protected static final String LOC_ELEMENT_ACC_ATTR = "accession"; 186 protected static final String LOC_ELEMENT_VER_ATTR = "version"; 187 protected static final String LOC_ELEMENT_COMPL_ATTR = "complement"; 188 189 protected static final String BASEPOSITION_TAG = "basePosition"; 190 protected static final String BASEPOSITION_TYPE_ATTR = "type"; 191 192 protected static final String CONTIG_TAG = "contig"; 193 protected static final String SEQUENCE_TAG = "sequence"; 194 protected static final String SEQUENCE_TYPE_ATTR = "type"; 195 protected static final String SEQUENCE_LENGTH_ATTR = "length"; 196 protected static final String SEQUENCE_TOPOLOGY_ATTR = "topology"; 197 protected static final String SEQUENCE_VER_ATTR = "version"; 198 199 protected static final Pattern xmlSchema = Pattern.compile(".*http://www\\.ebi\\.ac\\.uk/schema/EMBL_schema\\.xsd.*"); 200 201 /** 202 * Implements some EMBLxml-specific terms. 203 */ 204 public static class Terms extends RichSequence.Terms { 205 /** 206 * Getter for the SubmitterAccession term 207 * @return The SubmitterAccession Term 208 */ 209 public static ComparableTerm getSubmitterAccessionTerm() { 210 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("SubmitterAccession"); 211 } 212 213 /** 214 * Getter for the SubmitterVersion term 215 * @return The SubmitterVersion Term 216 */ 217 public static ComparableTerm getSubmitterVersionTerm() { 218 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("SubmitterVersion"); 219 } 220 221 /** 222 * Getter for the SubmitterWgsVersion term 223 * @return The SubmitterWgsVersion Term 224 */ 225 public static ComparableTerm getSubmitterWgsVersionTerm() { 226 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("SubmitterWgsVersion"); 227 } 228 229 /** 230 * Getter for the Status term 231 * @return The Status Term 232 */ 233 public static ComparableTerm getStatusTerm() { 234 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("Status"); 235 } 236 237 /** 238 * Getter for the StatusDate term 239 * @return The StatusDate Term 240 */ 241 public static ComparableTerm getStatusDateTerm() { 242 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("StatusDate"); 243 } 244 245 /** 246 * Getter for the ProjectAccession term 247 * @return The ProjectAccession Term 248 */ 249 public static ComparableTerm getProjectAccessionTerm() { 250 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("ProjectAccession"); 251 } 252 253 /** 254 * Getter for the EMBLxml term 255 * @return The EMBLxml Term 256 */ 257 public static ComparableTerm getEMBLxmlTerm() { 258 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("EMBLxml"); 259 } 260 261 /** 262 * Getter for the Ensembl-specific 'dataClass' term 263 * @return The data class Term 264 */ 265 public static ComparableTerm getDataClassTerm() { 266 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("dataClass"); 267 } 268 } 269 270 /** 271 * {@inheritDoc} 272 * A file is in EMBLxml format if the second XML line contains the phrase "http://www.ebi.ac.uk/schema/EMBL_schema.xsd". 273 */ 274 @Override 275 public boolean canRead(File file) throws IOException { 276 BufferedReader br = new BufferedReader(new FileReader(file)); 277 br.readLine(); // skip first line 278 String secondLine = br.readLine(); 279 boolean readable = secondLine!=null && xmlSchema.matcher(secondLine).matches(); // check on second line 280 br.close(); 281 return readable; 282 } 283 284 /** 285 * {@inheritDoc} 286 * Always returns a DNA tokenizer. 287 */ 288 @Override 289 public SymbolTokenization guessSymbolTokenization(File file) throws IOException { 290 return RichSequence.IOTools.getDNAParser(); 291 } 292 293 /** 294 * {@inheritDoc} 295 * A stream is in EMBLxml format if the second XML line contains the phrase "http://www.ebi.ac.uk/schema/EMBL_schema.xsd". 296 */ 297 public boolean canRead(BufferedInputStream stream) throws IOException { 298 stream.mark(2000); // some streams may not support this 299 BufferedReader br = new BufferedReader(new InputStreamReader(stream)); 300 br.readLine(); // skip first line 301 String secondLine = br.readLine(); 302 boolean readable = secondLine!=null && xmlSchema.matcher(secondLine).matches(); // check on second line 303 // don't close the reader as it'll close the stream too. 304 // br.close(); 305 stream.reset(); 306 return readable; 307 } 308 309 /** 310 * {@inheritDoc} 311 * Always returns a DNA tokenizer. 312 */ 313 public SymbolTokenization guessSymbolTokenization(BufferedInputStream stream) throws IOException { 314 return RichSequence.IOTools.getDNAParser(); 315 } 316 317 /** 318 * {@inheritDoc} 319 */ 320 public boolean readSequence(BufferedReader reader, 321 SymbolTokenization symParser, 322 SeqIOListener listener) 323 throws IllegalSymbolException, IOException, ParseException { 324 if (!(listener instanceof RichSeqIOListener)) throw new IllegalArgumentException("Only accepting RichSeqIOListeners today"); 325 return this.readRichSequence(reader,symParser,(RichSeqIOListener)listener,null); 326 } 327 328 /** 329 * {@inheritDoc} 330 */ 331 public boolean readRichSequence(BufferedReader reader, 332 SymbolTokenization symParser, 333 RichSeqIOListener rlistener, 334 Namespace ns) 335 throws IllegalSymbolException, IOException, ParseException { 336 337 try { 338 DefaultHandler m_handler = new EMBLxmlHandler(this,symParser,rlistener,ns); 339 return XMLTools.readXMLChunk(reader, m_handler, ENTRY_TAG); 340 } catch (ParserConfigurationException e) { 341 throw new ParseException(e); 342 } catch (SAXException e) { 343 throw new ParseException(e); 344 } 345 } 346 347 private PrintWriter pw; 348 private XMLWriter xml; 349 350 /** 351 * {@inheritDoc} 352 */ 353 public void beginWriting() throws IOException { 354 // make an XML writer 355 pw = new PrintWriter(this.getPrintStream()); 356 xml = new PrettyXMLWriter(pw); 357 xml.printRaw("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"); 358 xml.openTag(ENTRY_GROUP_TAG); 359 xml.attribute("xmlns:ebi", "http://www.ebi.ac.uk/embl/schema"); 360 xml.attribute("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance"); 361 xml.attribute("xsi:noNamespaceSchemaLocation","http://www.ebi.ac.uk/embl/schema/EMBL_Services_V1.1.xsd"); 362 } 363 364 /** 365 * {@inheritDoc} 366 */ 367 public void finishWriting() throws IOException { 368 xml.closeTag(ENTRY_GROUP_TAG); 369 pw.flush(); 370 } 371 372 /** 373 * {@inheritDoc} 374 */ 375 public void writeSequence(Sequence seq, PrintStream os) throws IOException { 376 if (this.getPrintStream()==null) this.setPrintStream(this.getPrintStream()); 377 this.writeSequence(seq, RichObjectFactory.getDefaultNamespace()); 378 } 379 380 /** 381 * {@inheritDoc} 382 */ 383 public void writeSequence(Sequence seq, String format, PrintStream os) throws IOException { 384 if (this.getPrintStream()==null) this.setPrintStream(this.getPrintStream()); 385 if (!format.equals(this.getDefaultFormat())) throw new IllegalArgumentException("Unknown format: "+format); 386 this.writeSequence(seq, RichObjectFactory.getDefaultNamespace()); 387 } 388 389 /** 390 * {@inheritDoc} 391 * Namespace is ignored as EMBLxml has no concept of it. 392 */ 393 public void writeSequence(Sequence seq, Namespace ns) throws IOException { 394 RichSequence rs; 395 try { 396 if (seq instanceof RichSequence) rs = (RichSequence)seq; 397 else rs = RichSequence.Tools.enrich(seq); 398 } catch (ChangeVetoException e) { 399 IOException e2 = new IOException("Unable to enrich sequence"); 400 e2.initCause(e); 401 throw e2; 402 } 403 404 Set<Note> notes = rs.getNoteSet(); 405 List accessions = new ArrayList(); 406 List projAccessions = new ArrayList(); 407 List kws = new ArrayList(); 408 List organelles = new ArrayList(); 409 String cdat = null; 410 String udat = null; 411 String crel = null; 412 String urel = null; 413 String dataClass = null; 414 String moltype = rs.getAlphabet().getName(); 415 String subWgsVer = null; 416 String subAcc = null; 417 String subVer = null; 418 String status = null; 419 String statusDate = null; 420 for (Iterator<Note> i = notes.iterator(); i.hasNext();) { 421 Note n = i.next(); 422 if (n.getTerm().equals(Terms.getDateCreatedTerm())) cdat=n.getValue(); 423 else if (n.getTerm().equals(Terms.getDateUpdatedTerm())) udat=n.getValue(); 424 else if (n.getTerm().equals(Terms.getRelCreatedTerm())) crel=n.getValue(); 425 else if (n.getTerm().equals(Terms.getRelUpdatedTerm())) urel=n.getValue(); 426 else if (n.getTerm().equals(Terms.getMolTypeTerm())) moltype=n.getValue(); 427 else if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) accessions.add(n.getValue()); 428 else if (n.getTerm().equals(Terms.getProjectAccessionTerm())) projAccessions.add(n.getValue()); 429 else if (n.getTerm().equals(Terms.getOrganelleTerm())) organelles.add(n.getValue()); 430 else if (n.getTerm().equals(Terms.getKeywordTerm())) kws.add(n.getValue()); 431 else if (n.getTerm().equals(Terms.getDataClassTerm())) dataClass = n.getValue(); 432 else if (n.getTerm().equals(Terms.getSubmitterAccessionTerm())) subAcc = n.getValue(); 433 else if (n.getTerm().equals(Terms.getSubmitterVersionTerm())) subVer = n.getValue(); 434 else if (n.getTerm().equals(Terms.getSubmitterWgsVersionTerm())) subWgsVer = n.getValue(); 435 else if (n.getTerm().equals(Terms.getStatusTerm())) status = n.getValue(); 436 else if (n.getTerm().equals(Terms.getStatusDateTerm())) statusDate = n.getValue(); 437 } 438 439 xml.openTag(ENTRY_TAG); 440 xml.attribute(ENTRY_ACCESSION_ATTR,rs.getAccession()); 441 xml.attribute(ENTRY_TAX_DIVISION_ATTR,rs.getDivision()); 442 xml.attribute(ENTRY_DATACLASS_ATTR,dataClass); 443 xml.attribute(ENTRY_CREATED_ATTR,cdat==null?udat:cdat); 444 xml.attribute(ENTRY_RELCREATED_ATTR,crel==null?"0":crel); 445 xml.attribute(ENTRY_UPDATED_ATTR,udat); 446 xml.attribute(ENTRY_RELUPDATED_ATTR,urel==null?"0":urel); 447 xml.attribute(ENTRY_VER_ATTR,""+rs.getVersion()); 448 if (subAcc!=null) 449 xml.attribute(ENTRY_SUBACC_ATTR,subAcc); 450 if (subVer!=null) 451 xml.attribute(ENTRY_SUBVER_ATTR,subVer); 452 if (subWgsVer!=null) 453 xml.attribute(ENTRY_SUBWGSVER_ATTR,subWgsVer); 454 if (status!=null) 455 xml.attribute(ENTRY_STATUS_ATTR,status); 456 if (statusDate!=null) 457 xml.attribute(ENTRY_STATUS_DATE_ATTR,statusDate); 458 459 for (Iterator i = accessions.iterator(); i.hasNext(); ) { 460 xml.openTag(SEC_ACC_TAG); 461 xml.print((String)i.next()); 462 xml.closeTag(SEC_ACC_TAG); 463 } 464 465 for (Iterator i = projAccessions.iterator(); i.hasNext(); ) { 466 xml.openTag(PROJ_ACC_TAG); 467 xml.print((String)i.next()); 468 xml.closeTag(PROJ_ACC_TAG); 469 } 470 471 xml.openTag(DESC_TAG); 472 xml.print(rs.getDescription()); 473 xml.closeTag(DESC_TAG); 474 475 for (Iterator i = kws.iterator(); i.hasNext(); ) { 476 xml.openTag(KEYWORD_TAG); 477 xml.print((String)i.next()); 478 xml.closeTag(KEYWORD_TAG); 479 } 480 481 for (Iterator i = rs.getRankedDocRefs().iterator(); i.hasNext(); ) { 482 RankedDocRef rdr = (RankedDocRef)i.next(); 483 DocRef dr = rdr.getDocumentReference(); 484 485 xml.openTag(REFERENCE_TAG); 486 487 xml.openTag(CITATION_TAG); 488 xml.attribute(CITATION_ID_ATTR,""+rdr.getRank()); 489 xml.attribute(CITATION_TYPE_ATTR,"journal article"); 490 491 CrossRef cr = dr.getCrossref(); 492 if (cr!=null) { 493 xml.openTag(DBREFERENCE_TAG); 494 xml.attribute(DBREF_DB_ATTR,cr.getDbname()); 495 xml.attribute(DBREF_PRIMARY_ATTR,cr.getAccession()); 496 if (!cr.getNoteSet().isEmpty()) { 497 for (Iterator<Note> j = cr.getNoteSet().iterator(); j.hasNext(); ) { 498 Note n = j.next(); 499 if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) { 500 xml.attribute(DBREF_SEC_ATTR,n.getValue()); 501 break; 502 } 503 } 504 } 505 xml.closeTag(DBREFERENCE_TAG); 506 } 507 508 List<DocRefAuthor> auths = dr.getAuthorList(); 509 510 for (Iterator<DocRefAuthor> j = auths.iterator(); j.hasNext(); ) { 511 DocRefAuthor a = j.next(); 512 if (a.isConsortium()) { 513 xml.openTag(CONSORTIUM_TAG); 514 xml.print(a.getName()); 515 xml.closeTag(CONSORTIUM_TAG); 516 j.remove(); 517 } 518 } 519 520 if (dr.getTitle()!=null) { 521 xml.openTag(TITLE_TAG); 522 xml.print(dr.getTitle()); 523 xml.closeTag(TITLE_TAG); 524 } 525 526 for (Iterator<DocRefAuthor> j = auths.iterator(); j.hasNext(); ) { 527 DocRefAuthor a = j.next(); 528 if (a.isEditor()) { 529 xml.openTag(EDITOR_TAG); 530 xml.print(a.getName()); 531 xml.closeTag(EDITOR_TAG); 532 } else { 533 xml.openTag(AUTHOR_TAG); 534 xml.print(a.getName()); 535 xml.closeTag(AUTHOR_TAG); 536 } 537 } 538 539 xml.openTag(LOCATOR_TAG); 540 xml.print(dr.getLocation()); 541 xml.closeTag(LOCATOR_TAG); 542 xml.closeTag(CITATION_TAG); 543 544 xml.openTag(CITATION_LOCATION_TAG); 545 Integer rstart = rdr.getStart(); 546 if (rstart==null) rstart = new Integer(1); 547 Integer rend = rdr.getEnd(); 548 if (rend==null) rend = new Integer(rs.length()); 549 xml.attribute(REF_POS_BEGIN_ATTR,""+rstart); 550 xml.attribute(REF_POS_END_ATTR,""+rend); 551 if (dr.getRemark()!=null) { 552 xml.openTag(COMMENT_TAG); 553 xml.print(dr.getRemark()); 554 xml.closeTag(COMMENT_TAG); 555 } 556 xml.closeTag(CITATION_LOCATION_TAG); 557 558 xml.closeTag(REFERENCE_TAG); 559 } 560 561 for (Iterator<RankedCrossRef> i = rs.getRankedCrossRefs().iterator(); i.hasNext(); ) { 562 RankedCrossRef rcr = i.next(); 563 CrossRef cr = rcr.getCrossRef(); 564 565 xml.openTag(DBREFERENCE_TAG); 566 xml.attribute(DBREF_DB_ATTR,cr.getDbname()); 567 xml.attribute(DBREF_PRIMARY_ATTR,cr.getAccession()); 568 569 if (!cr.getNoteSet().isEmpty()) { 570 for (Iterator<Note> j = cr.getNoteSet().iterator(); j.hasNext(); ) { 571 Note n = j.next(); 572 if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) { 573 xml.attribute(DBREF_SEC_ATTR,n.getValue()); 574 break; 575 } 576 } 577 } 578 579 xml.closeTag(DBREFERENCE_TAG); 580 } 581 582 for (Iterator<Comment> i = rs.getComments().iterator(); i.hasNext(); ) { 583 xml.openTag(COMMENT_TAG); 584 xml.println(i.next().getComment()); 585 xml.closeTag(COMMENT_TAG); 586 } 587 588 NCBITaxon tax = rs.getTaxon(); 589 for (Iterator i = rs.getFeatureSet().iterator(); i.hasNext(); ) { 590 RichFeature f = (RichFeature)i.next(); 591 xml.openTag(FEATURE_TAG); 592 xml.attribute(FEATURE_NAME_ATTR,f.getTypeTerm().getName()); 593 594 // display organism on source feature only 595 if (f.getTypeTerm().getName().equals("source") && tax!=null) { 596 xml.openTag(ORGANISM_TAG); 597 598 String[] parts = tax.getDisplayName().split("(\\(|\\))"); 599 xml.openTag(SCINAME_TAG); 600 xml.print(parts[0].trim()); 601 xml.closeTag(SCINAME_TAG); 602 if (parts.length>1) { 603 xml.openTag(COMNAME_TAG); 604 xml.print(parts[1].trim()); 605 xml.closeTag(COMNAME_TAG); 606 } 607 608 xml.openTag(TAXID_TAG); 609 xml.print(""+tax.getNCBITaxID()); 610 xml.closeTag(TAXID_TAG); 611 612 String hierarchy = tax.getNameHierarchy(); 613 hierarchy = hierarchy.substring(0,hierarchy.length()-1); // chomp "." 614 if (hierarchy.length()>0) { 615 parts = hierarchy.split(";"); 616 xml.openTag(LINEAGE_TAG); 617 for (int j = 0; j < parts.length; j++) { 618 xml.openTag(TAXON_TAG); 619 xml.print(parts[j].trim()); 620 xml.closeTag(TAXON_TAG); 621 } 622 xml.closeTag(LINEAGE_TAG); 623 } 624 625 for (final Iterator j = organelles.iterator(); j.hasNext(); ) { 626 final String organelle = (String)j.next(); 627 xml.openTag(ORGANELLE_TAG); 628 xml.print(organelle); 629 xml.closeTag(ORGANELLE_TAG); 630 } 631 632 xml.closeTag(ORGANISM_TAG); 633 } 634 635 for (Iterator<RankedCrossRef> j = f.getRankedCrossRefs().iterator(); j.hasNext(); ) { 636 RankedCrossRef rcr = j.next(); 637 CrossRef cr = rcr.getCrossRef(); 638 639 xml.openTag(DBREFERENCE_TAG); 640 xml.attribute(DBREF_DB_ATTR,cr.getDbname()); 641 xml.attribute(DBREF_PRIMARY_ATTR,cr.getAccession()); 642 643 if (!cr.getNoteSet().isEmpty()) { 644 for (Iterator<Note> k = cr.getNoteSet().iterator(); k.hasNext(); ) { 645 Note n = k.next(); 646 if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) { 647 xml.attribute(DBREF_SEC_ATTR,n.getValue()); 648 break; 649 } 650 } 651 } 652 653 xml.closeTag(DBREFERENCE_TAG); 654 } 655 656 for (Iterator<Note> j = f.getNoteSet().iterator(); j.hasNext();) { 657 Note n = j.next(); 658 xml.openTag(QUALIFIER_TAG); 659 xml.attribute(QUALIFIER_NAME_ATTR,n.getTerm().getName()); 660 if (n.getValue()!=null && !n.getValue().equals("")) { 661 if (n.getTerm().getName().equalsIgnoreCase("translation")) { 662 String[] lines = StringTools.wordWrap(n.getValue(), "\\s+", this.getLineWidth()); 663 for (int k = 0; k < lines.length; k++) xml.println(lines[k]); 664 } else { 665 xml.print(n.getValue()); 666 } 667 } 668 xml.closeTag(QUALIFIER_TAG); 669 } 670 671 // make it easy for ourselves by flattening into a single compound location 672 RichLocation rle = (RichLocation)f.getLocation(); 673 Collection locElements = RichLocation.Tools.flatten(rle); 674 xml.openTag(LOCATION_TAG); 675 xml.attribute(LOCATION_TYPE_ATTR,(locElements.size()>1?rle.getTerm().getName():"single")); 676 xml.attribute(LOCATION_COMPL_ATTR,"false"); 677 for (Iterator j = locElements.iterator(); j.hasNext(); ) { 678 RichLocation rl = (RichLocation)j.next(); 679 xml.openTag(LOCATION_ELEMENT_TAG); 680 681 if (rl.getStrand().equals(Strand.NEGATIVE_STRAND)) { 682 xml.attribute(LOC_ELEMENT_COMPL_ATTR,"true"); 683 } else { 684 xml.attribute(LOC_ELEMENT_COMPL_ATTR,"false"); 685 } 686 687 if (rl.getCrossRef()!=null) { 688 xml.attribute(LOC_ELEMENT_ACC_ATTR,rl.getCrossRef().getAccession()); 689 xml.attribute(LOC_ELEMENT_VER_ATTR,""+rl.getCrossRef().getVersion()); 690 } 691 692 Position start = rl.getMinPosition(); 693 // EMBLxml does not support fuzzy locations so we only ever 694 // use the start coordinate. 695 696 // output first base only 697 xml.attribute(LOC_ELEMENT_TYPE_ATTR,"site"); 698 699 xml.openTag(BASEPOSITION_TAG); 700 if (start.getFuzzyStart()) xml.attribute(BASEPOSITION_TYPE_ATTR,"<"); 701 else if (start.getFuzzyEnd()) xml.attribute(BASEPOSITION_TYPE_ATTR,"<"); 702 else xml.attribute(BASEPOSITION_TYPE_ATTR,"simple"); 703 xml.print(""+start.getStart()); 704 xml.closeTag(BASEPOSITION_TAG); 705 706 xml.closeTag(LOCATION_ELEMENT_TAG); 707 } 708 709 xml.closeTag(LOCATION_TAG); 710 711 xml.closeTag(FEATURE_TAG); 712 } 713 714 xml.openTag(SEQUENCE_TAG); 715 xml.attribute(SEQUENCE_TYPE_ATTR,moltype); 716 xml.attribute(SEQUENCE_LENGTH_ATTR,""+rs.length()); 717 xml.attribute(SEQUENCE_TOPOLOGY_ATTR,rs.getCircular()?"circular":"linear"); 718 xml.attribute(SEQUENCE_VER_ATTR,""+rs.getSeqVersion().intValue()); 719 String[] lines = StringTools.wordWrap(rs.seqString(), "\\s+", this.getLineWidth()); 720 for (int i = 0; i < lines.length; i ++) xml.println(lines[i]); 721 xml.closeTag(SEQUENCE_TAG); 722 723 xml.closeTag(ENTRY_TAG); 724 725 pw.flush(); 726 } 727 728 /** 729 * {@inheritDoc} 730 */ 731 public String getDefaultFormat() { 732 return EMBLXML_FORMAT; 733 } 734 735 // SAX event handler for parsing http://www.ebi.ac.uk/embl/Documentation/DTD/EMBL_dtd.txt 736 private class EMBLxmlHandler extends DefaultHandler { 737 738 private RichSequenceFormat parent; 739 private SymbolTokenization symParser; 740 private RichSeqIOListener rlistener; 741 private Namespace ns; 742 private StringBuffer m_currentString; 743 744 private NCBITaxon tax; 745 private String accession; 746 private RichFeature.Template templ; 747 private String currFeatQual; 748 private String currRefLocation; 749 private List currRefAuthors; 750 private String currRefTitle; 751 private Map currNames = new TreeMap(); 752 private int currRefStart; 753 private int currRefEnd; 754 private int currRefRank; 755 private int currLocBrackets; 756 private int currLocElemBrackets; 757 private StringBuffer currLocStr; 758 private String currBaseType; 759 private boolean firstBase; // oooh err! 760 private boolean firstLocationElement; 761 private List currDBXrefs = new ArrayList(); 762 private List currComments = new ArrayList(); 763 private Map currQuals = new LinkedHashMap(); 764 765 // construct a new handler that will populate the given list of sequences 766 private EMBLxmlHandler(RichSequenceFormat parent, 767 SymbolTokenization symParser, 768 RichSeqIOListener rlistener, 769 Namespace ns) { 770 this.parent = parent; 771 this.symParser = symParser; 772 this.rlistener = rlistener; 773 this.ns = ns; 774 this.m_currentString = new StringBuffer(); 775 } 776 777 // process an opening tag 778 @Override 779 public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { 780 if (qName.equals(ENTRY_TAG)) { 781 try { 782 rlistener.startSequence(); 783 if (ns==null) ns=RichObjectFactory.getDefaultNamespace(); 784 rlistener.setNamespace(ns); 785 for (int i = 0; i < attributes.getLength(); i++) { 786 String name = attributes.getQName(i); 787 String val = attributes.getValue(i); 788 if (name.equals(ENTRY_ACCESSION_ATTR)) { 789 accession = val; 790 rlistener.setAccession(accession); 791 rlistener.setName(accession); 792 } else if (name.equals(ENTRY_TAX_DIVISION_ATTR)) rlistener.setDivision(val); 793 else if (name.equals(ENTRY_DATACLASS_ATTR)) rlistener.addSequenceProperty(Terms.getDataClassTerm(),val); 794 else if (name.equals(ENTRY_CREATED_ATTR)) rlistener.addSequenceProperty(Terms.getDateCreatedTerm(),val); 795 else if (name.equals(ENTRY_UPDATED_ATTR)) rlistener.addSequenceProperty(Terms.getDateUpdatedTerm(),val); 796 else if (name.equals(ENTRY_RELCREATED_ATTR)) rlistener.addSequenceProperty(Terms.getRelCreatedTerm(),val); 797 else if (name.equals(ENTRY_RELUPDATED_ATTR)) rlistener.addSequenceProperty(Terms.getRelUpdatedTerm(),val); 798 else if (name.equals(ENTRY_VER_ATTR)) rlistener.setVersion(Integer.parseInt(val)); 799 else if (name.equals(ENTRY_SUBACC_ATTR)) rlistener.addSequenceProperty(Terms.getSubmitterAccessionTerm(),val); 800 else if (name.equals(ENTRY_SUBVER_ATTR)) rlistener.addSequenceProperty(Terms.getSubmitterVersionTerm(),val); 801 else if (name.equals(ENTRY_SUBWGSVER_ATTR)) rlistener.addSequenceProperty(Terms.getSubmitterWgsVersionTerm(),val); 802 else if (name.equals(ENTRY_STATUS_ATTR)) rlistener.addSequenceProperty(Terms.getStatusTerm(),val); 803 else if (name.equals(ENTRY_STATUS_DATE_ATTR)) rlistener.addSequenceProperty(Terms.getStatusDateTerm(),val); 804 } 805 currNames.clear(); 806 currComments.clear(); 807 currDBXrefs.clear(); 808 } catch (ParseException e) { 809 throw new SAXException(e); 810 } 811 } 812 813 else if (qName.equals(REFERENCE_TAG) && !this.parent.getElideReferences()) { 814 currRefLocation = null; 815 currRefAuthors = new ArrayList(); 816 currRefTitle = null; 817 currRefStart = -999; 818 currRefEnd = -999; 819 currRefRank = 0; 820 currDBXrefs.clear(); 821 currComments.clear(); 822 } else if (qName.equals(CITATION_LOCATION_TAG) && !this.parent.getElideReferences()) { 823 for (int i = 0; i < attributes.getLength(); i++) { 824 String name = attributes.getQName(i); 825 String val = attributes.getValue(i); 826 if (name.equals(REF_POS_BEGIN_ATTR)) currRefStart = Integer.parseInt(val); 827 else if (name.equals(REF_POS_END_ATTR)) currRefEnd = Integer.parseInt(val); 828 } 829 } else if (qName.equals(CITATION_TAG) && !this.parent.getElideReferences()) { 830 StringBuffer currRef = new StringBuffer(); 831 for (int i = 0; i < attributes.getLength(); i++) { 832 String name = attributes.getQName(i); 833 String val = attributes.getValue(i); 834 if (name.equals(CITATION_ID_ATTR)) currRefRank = Integer.parseInt(val); 835 // combine everything else into a fake reference to use if locator is a no-show 836 else if (!name.equals(CITATION_TYPE_ATTR)) { 837 if (currRef.length()>0) currRef.append(" "); 838 currRef.append(val); 839 } 840 } 841 currRefLocation = currRef.toString(); 842 } 843 844 else if (qName.equals(DBREFERENCE_TAG)) { 845 String db = null; 846 String primary = null; 847 String secondary = null; 848 for (int i = 0; i < attributes.getLength(); i++) { 849 String name = attributes.getQName(i); 850 String val = attributes.getValue(i); 851 if (name.equals(DBREF_DB_ATTR)) db = val; 852 else if (name.equals(DBREF_PRIMARY_ATTR)) primary = val; 853 else if (name.equals(DBREF_SEC_ATTR)) secondary = val; 854 } 855 CrossRef dbx = (CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{db, primary, new Integer(0)}); 856 if (secondary!=null) { 857 Note note = new SimpleNote(Terms.getAdditionalAccessionTerm(),secondary,0); 858 try { 859 dbx.getRichAnnotation().addNote(note); 860 } catch (ChangeVetoException ce) { 861 SAXException pe = new SAXException("Could not annotate identifier terms"); 862 pe.initCause(ce); 863 throw pe; 864 } 865 } 866 currDBXrefs.add(dbx); 867 } 868 869 else if (qName.equals(FEATURE_TAG) && !this.parent.getElideFeatures()) { 870 templ = new RichFeature.Template(); 871 templ.annotation = new SimpleRichAnnotation(); 872 templ.sourceTerm = Terms.getEMBLxmlTerm(); 873 templ.featureRelationshipSet = new TreeSet(); 874 templ.rankedCrossRefs = new TreeSet(); 875 for (int i = 0; i < attributes.getLength(); i++) { 876 String name = attributes.getQName(i); 877 String val = attributes.getValue(i); 878 if (name.equals(FEATURE_NAME_ATTR)) templ.typeTerm = RichObjectFactory.getDefaultOntology().getOrCreateTerm(val); 879 } 880 currLocStr = new StringBuffer(); 881 currDBXrefs.clear(); 882 currQuals.clear(); 883 } else if (qName.equals(QUALIFIER_TAG) && !this.parent.getElideFeatures()) { 884 for (int i = 0; i < attributes.getLength(); i++) { 885 String name = attributes.getQName(i); 886 String val = attributes.getValue(i); 887 if (name.equals(QUALIFIER_NAME_ATTR)) currFeatQual = val; 888 } 889 } else if (qName.equals(LOCATION_TAG) && !this.parent.getElideFeatures()) { 890 currLocBrackets = 0; 891 for (int i = 0; i < attributes.getLength(); i++) { 892 String name = attributes.getQName(i); 893 String val = attributes.getValue(i); 894 if (name.equals(LOCATION_TYPE_ATTR) && !val.equalsIgnoreCase("single")) { 895 // open a bracket just in case 896 currLocStr.append(val); 897 currLocStr.append("("); 898 currLocBrackets++; 899 } else if (name.equals(LOCATION_COMPL_ATTR) && val.equalsIgnoreCase("true")) { 900 currLocStr.append("complement"); 901 currLocStr.append("("); 902 currLocBrackets++; 903 } 904 } 905 firstLocationElement = true; 906 } else if (qName.equals(LOCATION_ELEMENT_TAG) && !this.parent.getElideFeatures()) { 907 String currAcc = null; 908 String currVer = null; 909 if (!firstLocationElement) currLocStr.append(","); 910 for (int i = 0; i < attributes.getLength(); i++) { 911 String name = attributes.getQName(i); 912 String val = attributes.getValue(i); 913 if (name.equals(LOCATION_COMPL_ATTR) && val.equalsIgnoreCase("true")) { 914 currLocStr.append("complement"); 915 currLocStr.append("("); 916 currLocElemBrackets++; 917 } else if (name.equals(LOC_ELEMENT_ACC_ATTR)) currAcc = val; 918 else if (name.equals(LOC_ELEMENT_VER_ATTR)) currVer = val; 919 } 920 if (currAcc!=null) { 921 currLocStr.append(currAcc); 922 if (currVer!=null) { 923 currLocStr.append("."); 924 currLocStr.append(currVer); 925 } 926 currLocStr.append(":"); 927 } 928 firstBase = true; 929 } else if (qName.equals(BASEPOSITION_TAG) && !this.parent.getElideFeatures()) { 930 for (int i = 0; i < attributes.getLength(); i++) { 931 String name = attributes.getQName(i); 932 String val = attributes.getValue(i); 933 if (name.equals(BASEPOSITION_TYPE_ATTR)) currBaseType = val; 934 } 935 } 936 937 else if (qName.equals(CONTIG_TAG)) { 938 String message = ParseException.newMessage(this.getClass(),accession,"not set", "Unable to handle contig assemblies just yet", qName); 939 ParseException e = new ParseException(message); 940 SAXException pe = new SAXException("Could not set contig properties"); 941 pe.initCause(e); 942 throw pe; 943 } 944 945 else if (qName.equals(SEQUENCE_TAG)) { 946 try { 947 for (int i = 0; i < attributes.getLength(); i++) { 948 String name = attributes.getQName(i); 949 String val = attributes.getValue(i); 950 if (name.equals(SEQUENCE_TYPE_ATTR)) rlistener.addSequenceProperty(Terms.getMolTypeTerm(),val); 951 else if (name.equals(SEQUENCE_VER_ATTR)) rlistener.setSeqVersion(val); 952 else if (name.equals(SEQUENCE_TOPOLOGY_ATTR) && val.equalsIgnoreCase("circular")) rlistener.setCircular(true); 953 } 954 } catch (ParseException e) { 955 SAXException pe = new SAXException("Could not set sequence properties"); 956 pe.initCause(e); 957 throw pe; 958 } 959 } 960 } 961 962 // process a closing tag - we will have read the text already 963 @Override 964 public void endElement(String uri, String localName, String qName) throws SAXException { 965 String val = this.m_currentString.toString().trim(); 966 967 try { 968 if (qName.equals(SEC_ACC_TAG)) { 969 rlistener.addSequenceProperty(Terms.getAdditionalAccessionTerm(),val); 970 } else if (qName.equals(PROJ_ACC_TAG)) { 971 rlistener.addSequenceProperty(Terms.getProjectAccessionTerm(),val); 972 } else if (qName.equals(ORGANELLE_TAG)) { 973 rlistener.addSequenceProperty(Terms.getOrganelleTerm(),val); 974 } else if (qName.equals(DESC_TAG)) { 975 rlistener.setDescription(val); 976 } else if (qName.equals(KEYWORD_TAG)) { 977 rlistener.addSequenceProperty(Terms.getKeywordTerm(), val); 978 } else if (qName.equals(COMMENT_TAG)) { 979 currComments.add(val); 980 } 981 982 else if (qName.equals(TITLE_TAG)) { 983 currRefTitle = val; 984 } else if (qName.equals(AUTHOR_TAG)) { 985 currRefAuthors.add(new SimpleDocRefAuthor(val,false,false)); 986 } else if (qName.equals(EDITOR_TAG)) { 987 currRefAuthors.add(new SimpleDocRefAuthor(val,false,true)); 988 } else if (qName.equals(CONSORTIUM_TAG)) { 989 currRefAuthors.add(new SimpleDocRefAuthor(val,true,false)); 990 } else if (qName.equals(LOCATOR_TAG)) { 991 currRefLocation = val; 992 } else if (qName.equals(REFERENCE_TAG) && !this.parent.getElideReferences()) { 993 // do the crossrefs 994 CrossRef useForDocRef = null; 995 for (Iterator j = currDBXrefs.iterator(); j.hasNext();) { 996 CrossRef dbx = (CrossRef)j.next(); 997 RankedCrossRef rdbx = new SimpleRankedCrossRef(dbx,0); 998 rlistener.setRankedCrossRef(rdbx); 999 if (useForDocRef==null) useForDocRef = dbx; 1000 else { 1001 // medline gets priority, then pubmed - if multiple, use last 1002 if (dbx.getDbname().equalsIgnoreCase(Terms.MEDLINE_KEY) || 1003 (dbx.getDbname().equalsIgnoreCase(Terms.PUBMED_KEY) && 1004 !useForDocRef.getDbname().equalsIgnoreCase(Terms.MEDLINE_KEY))) { 1005 useForDocRef = dbx; 1006 } 1007 } 1008 } 1009 // do the comment - will only be one, if any 1010 String currRefRemark = null; 1011 if (currComments.size()>0) currRefRemark = (String)currComments.iterator().next(); 1012 // create the docref object 1013 try { 1014 DocRef dr = (DocRef)RichObjectFactory.getObject(SimpleDocRef.class,new Object[]{currRefAuthors,currRefLocation,currRefTitle}); 1015 // assign the pubmed or medline to the docref - medline gets priority 1016 if (useForDocRef!=null) dr.setCrossref(useForDocRef); 1017 // assign the remarks 1018 dr.setRemark(currRefRemark); 1019 // assign the docref to the bioentry 1020 RankedDocRef rdr = new SimpleRankedDocRef(dr, 1021 (currRefStart != -999 ? new Integer(currRefStart) : null), 1022 (currRefEnd != -999 ? new Integer(currRefEnd) : null), 1023 currRefRank); 1024 rlistener.setRankedDocRef(rdr); 1025 } catch (ChangeVetoException e) { 1026 throw new ParseException(e); 1027 } 1028 currDBXrefs.clear(); 1029 currComments.clear(); 1030 } 1031 1032 else if (qName.equals(LOCATION_TAG) && !this.parent.getElideFeatures()) { 1033 while (currLocBrackets-->0) currLocStr.append(")"); // close the location groups 1034 String tidyLocStr = currLocStr.toString().replaceAll("\\s+",""); 1035 templ.location = GenbankLocationParser.parseLocation(ns, accession, tidyLocStr); 1036 } else if (qName.equals(LOCATION_ELEMENT_TAG) && !this.parent.getElideFeatures()) { 1037 while (currLocElemBrackets-->0) currLocStr.append(")"); // close the location groups 1038 firstLocationElement = false; 1039 } else if (qName.equals(BASEPOSITION_TAG) && !this.parent.getElideFeatures()) { 1040 if (!firstBase) currLocStr.append(".."); 1041 // left angle bracket, right angle bracket, simple, fuzzy 1042 if (currBaseType.equals("<")) { 1043 currLocStr.append("<"); 1044 currLocStr.append(val); 1045 } else if (currBaseType.equals(">")) { 1046 currLocStr.append(val); 1047 currLocStr.append(">"); 1048 } else if (currBaseType.equalsIgnoreCase("simple")) { 1049 currLocStr.append(val); 1050 } 1051 firstBase = false; 1052 } else if (qName.equals(QUALIFIER_TAG) && !this.parent.getElideFeatures()) { 1053 currQuals.put(currFeatQual,val); 1054 } else if (qName.equals(FEATURE_TAG) && !this.parent.getElideFeatures()) { 1055 // start the feature 1056 rlistener.startFeature(templ); 1057 // assign qualifiers 1058 for (Iterator j = currQuals.keySet().iterator(); j.hasNext(); ) { 1059 String qualName = (String)j.next(); 1060 String qualVal = (String)currQuals.get(qualName); 1061 if (qualName.equalsIgnoreCase("translation")) { 1062 // strip spaces from sequence 1063 qualVal = qualVal.replaceAll("\\s+",""); 1064 } 1065 rlistener.addFeatureProperty(RichObjectFactory.getDefaultOntology().getOrCreateTerm(qualName),qualVal); 1066 } 1067 // do the crossrefs 1068 int rcrossrefCount = 0; 1069 for (Iterator j = currDBXrefs.iterator(); j.hasNext();) { 1070 CrossRef dbx = (CrossRef)j.next(); 1071 RankedCrossRef rdbx = new SimpleRankedCrossRef(dbx, ++rcrossrefCount); 1072 try { 1073 rlistener.getCurrentFeature().addRankedCrossRef(rdbx); 1074 } catch (ChangeVetoException ce) { 1075 throw new ParseException(ce); 1076 } 1077 } 1078 // end the feature 1079 rlistener.endFeature(); 1080 currDBXrefs.clear(); 1081 } 1082 1083 else if (qName.equals(TAXID_TAG)) { 1084 tax = (NCBITaxon)RichObjectFactory.getObject(SimpleNCBITaxon.class, new Object[]{Integer.valueOf(val)}); 1085 rlistener.setTaxon(tax); 1086 for (Iterator j = currNames.keySet().iterator(); j.hasNext(); ) { 1087 String nameClass = (String)j.next(); 1088 Set nameSet = (Set)currNames.get(nameClass); 1089 try { 1090 for (Iterator k = nameSet.iterator(); k.hasNext(); ) { 1091 String name = (String)k.next(); 1092 tax.addName(nameClass,name); 1093 } 1094 } catch (ChangeVetoException ce) { 1095 throw new ParseException(ce); 1096 } 1097 } 1098 currNames.clear(); 1099 } else if (qName.equals(SCINAME_TAG)) { 1100 try { 1101 if (tax==null) { 1102 if (!currNames.containsKey(NCBITaxon.SCIENTIFIC)) currNames.put(NCBITaxon.SCIENTIFIC,new TreeSet()); 1103 ((Set)currNames.get(NCBITaxon.SCIENTIFIC)).add(val); 1104 } else { 1105 tax.addName(NCBITaxon.SCIENTIFIC,val); 1106 } 1107 } catch (ChangeVetoException ce) { 1108 throw new ParseException(ce); 1109 } 1110 } else if (qName.equals(COMNAME_TAG)) { 1111 try { 1112 if (tax==null) { 1113 if (!currNames.containsKey(NCBITaxon.COMMON)) currNames.put(NCBITaxon.COMMON,new TreeSet()); 1114 ((Set)currNames.get(NCBITaxon.COMMON)).add(val); 1115 } else { 1116 tax.addName(NCBITaxon.COMMON,val); 1117 } 1118 } catch (ChangeVetoException ce) { 1119 throw new ParseException(ce); 1120 } 1121 } 1122 1123 else if (qName.equals(SEQUENCE_TAG) && !this.parent.getElideSymbols()) { 1124 try { 1125 SymbolList sl = new SimpleSymbolList(symParser, 1126 val.replaceAll("\\s+","").replaceAll("[\\.|~]","-")); 1127 rlistener.addSymbols(symParser.getAlphabet(), 1128 (Symbol[])(sl.toList().toArray(new Symbol[0])), 1129 0, sl.length()); 1130 } catch (Exception e) { 1131 throw new ParseException(e); 1132 } 1133 } 1134 1135 else if (qName.equals(ENTRY_TAG)) { 1136 // do the comments 1137 for (Iterator j = currComments.iterator(); j.hasNext();) { 1138 rlistener.setComment((String)j.next()); 1139 } 1140 // do the crossrefs 1141 for (Iterator j = currDBXrefs.iterator(); j.hasNext();) { 1142 CrossRef dbx = (CrossRef)j.next(); 1143 RankedCrossRef rdbx = new SimpleRankedCrossRef(dbx, 0); 1144 rlistener.setRankedCrossRef(rdbx); 1145 } 1146 // end the sequence 1147 rlistener.endSequence(); 1148 currComments.clear(); 1149 currDBXrefs.clear(); 1150 } 1151 1152 } catch (ParseException e) { 1153 throw new SAXException(e); 1154 } 1155 1156 // drop old string 1157 this.m_currentString.setLength(0); 1158 } 1159 1160 // process text inside tags 1161 @Override 1162 public void characters(char[] ch, int start, int length) { 1163 this.m_currentString.append(ch, start, length); 1164 } 1165 } 1166} 1167