001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojavax.bio.seq.io; 023 024import java.io.BufferedInputStream; 025import java.io.BufferedReader; 026import java.io.File; 027import java.io.FileReader; 028import java.io.IOException; 029import java.io.InputStreamReader; 030import java.io.PrintStream; 031import java.io.PrintWriter; 032import java.util.ArrayList; 033import java.util.Iterator; 034import java.util.List; 035import java.util.Map; 036import java.util.Set; 037import java.util.TreeMap; 038import java.util.TreeSet; 039import java.util.regex.Matcher; 040import java.util.regex.Pattern; 041 042import javax.xml.parsers.ParserConfigurationException; 043 044import org.biojava.bio.proteomics.MassCalc; 045import org.biojava.bio.seq.Sequence; 046import org.biojava.bio.seq.io.ParseException; 047import org.biojava.bio.seq.io.SeqIOListener; 048import org.biojava.bio.seq.io.SymbolTokenization; 049import org.biojava.bio.symbol.IllegalSymbolException; 050import org.biojava.bio.symbol.Location; 051import org.biojava.bio.symbol.SimpleSymbolList; 052import org.biojava.bio.symbol.Symbol; 053import org.biojava.bio.symbol.SymbolList; 054import org.biojava.utils.ChangeVetoException; 055import org.biojava.utils.xml.PrettyXMLWriter; 056import org.biojava.utils.xml.XMLWriter; 057import org.biojavax.Comment; 058import org.biojavax.CrossRef; 059import org.biojavax.DocRef; 060import 
org.biojavax.DocRefAuthor; 061import org.biojavax.Namespace; 062import org.biojavax.Note; 063import org.biojavax.RankedCrossRef; 064import org.biojavax.RankedDocRef; 065import org.biojavax.RichAnnotation; 066import org.biojavax.RichObjectFactory; 067import org.biojavax.SimpleCrossRef; 068import org.biojavax.SimpleDocRef; 069import org.biojavax.SimpleDocRefAuthor; 070import org.biojavax.SimpleNamespace; 071import org.biojavax.SimpleNote; 072import org.biojavax.SimpleRankedCrossRef; 073import org.biojavax.SimpleRankedDocRef; 074import org.biojavax.SimpleRichAnnotation; 075import org.biojavax.bio.seq.Position; 076import org.biojavax.bio.seq.RichFeature; 077import org.biojavax.bio.seq.RichLocation; 078import org.biojavax.bio.seq.RichSequence; 079import org.biojavax.bio.seq.io.UniProtCommentParser.Event; 080import org.biojavax.bio.seq.io.UniProtCommentParser.Interaction; 081import org.biojavax.bio.seq.io.UniProtCommentParser.Isoform; 082import org.biojavax.bio.taxa.NCBITaxon; 083import org.biojavax.bio.taxa.SimpleNCBITaxon; 084import org.biojavax.ontology.ComparableOntology; 085import org.biojavax.ontology.ComparableTerm; 086import org.biojavax.ontology.SimpleComparableOntology; 087import org.biojavax.utils.CRC64Checksum; 088import org.biojavax.utils.StringTools; 089import org.biojavax.utils.XMLTools; 090import org.xml.sax.Attributes; 091import org.xml.sax.SAXException; 092import org.xml.sax.helpers.DefaultHandler; 093 094/** 095 * Format reader for UniProtXML files. This version of UniProtXML format will generate 096 * and write RichSequence objects. Loosely Based on code from the old, deprecated, 097 * org.biojava.bio.seq.io.GenbankXmlFormat object. 098 * 099 * Understands http://www.ebi.uniprot.org/support/docs/uniprot.xsd 100 * 101 * @author Alan Li (code based on his work) 102 * @author Richard Holland 103 * @since 1.5 104 */ 105public class UniProtXMLFormat extends RichSequenceFormat.BasicFormat { 106 107 // Register this format with the format auto-guesser. 
static {
        RichSequence.IOTools.registerFormat(UniProtXMLFormat.class);
    }

    /**
     * The name of this format
     */
    public static final String UNIPROTXML_FORMAT = "UniProtXML";

    // Names ending in _TAG are XML element names; names ending in _ATTR are
    // XML attribute names.

    // Document root and per-entry element/attributes.
    protected static final String ENTRY_GROUP_TAG = "uniprot";
    protected static final String ENTRY_TAG = "entry";
    protected static final String ENTRY_VERSION_ATTR = "version";
    protected static final String ENTRY_NAMESPACE_ATTR = "dataset";
    protected static final String ENTRY_CREATED_ATTR = "created";
    protected static final String ENTRY_UPDATED_ATTR = "modified";
    protected static final String COPYRIGHT_TAG = "copyright";

    protected static final String ACCESSION_TAG = "accession";
    protected static final String NAME_TAG = "name";
    protected static final String TEXT_TAG = "text";

    // Attributes shared by several elements.
    protected static final String REF_ATTR = "ref";
    protected static final String TYPE_ATTR = "type";
    protected static final String KEY_ATTR = "key";
    protected static final String ID_ATTR = "id";
    protected static final String EVIDENCE_ATTR = "evidence";
    protected static final String VALUE_ATTR = "value";
    // Was "value", which merely duplicated VALUE_ATTR. This constant is written
    // on gene-location name elements, where the UniProt schema calls the
    // attribute "status".
    protected static final String STATUS_ATTR = "status";
    protected static final String NAME_ATTR = "name";

    protected static final String PROTEIN_TAG = "protein";
    protected static final String PROTEIN_TYPE_ATTR = "type";

    protected static final String DOMAIN_TAG = "domain";
    protected static final String COMPONENT_TAG = "component";
    protected static final String GENE_TAG = "gene";
    protected static final String ORGANISM_TAG = "organism";
    protected static final String DBXREF_TAG = "dbReference";
    protected static final String PROPERTY_TAG = "property";
    protected static final String LINEAGE_TAG = "lineage";
    protected static final String TAXON_TAG = "taxon";
    protected static final String GENELOCATION_TAG = "geneLocation";
    protected static final String GENELOCATION_NAME_TAG = "name";

    // Literature references.
    protected static final String REFERENCE_TAG = "reference";
    protected static final String CITATION_TAG = "citation";
    protected static final String TITLE_TAG = "title";
    protected static final String EDITOR_LIST_TAG = "editorList";
    protected static final String AUTHOR_LIST_TAG = "authorList";
    protected static final String PERSON_TAG = "person";
    protected static final String CONSORTIUM_TAG = "consortium";
    protected static final String LOCATOR_TAG = "locator";
    protected static final String RP_LINE_TAG = "scope";
    protected static final String RC_LINE_TAG = "source";
    protected static final String RC_SPECIES_TAG = "species";
    protected static final String RC_TISSUE_TAG = "tissue";
    protected static final String RC_TRANSP_TAG = "transposon";
    protected static final String RC_STRAIN_TAG = "strain";
    protected static final String RC_PLASMID_TAG = "plasmid";

    // Comments.
    protected static final String COMMENT_TAG = "comment";
    protected static final String COMMENT_MASS_ATTR = "mass";
    protected static final String COMMENT_ERROR_ATTR = "error";
    protected static final String COMMENT_METHOD_ATTR = "method";
    protected static final String COMMENT_LOCTYPE_ATTR = "locationType";

    protected static final String COMMENT_ABSORPTION_TAG = "absorption";
    protected static final String COMMENT_ABS_MAX_TAG = "max";
    protected static final String COMMENT_KINETICS_TAG = "kinetics";
    protected static final String COMMENT_KIN_KM_TAG = "KM";
    protected static final String COMMENT_KIN_VMAX_TAG = "VMax";
    protected static final String COMMENT_PH_TAG = "phDependence";
    protected static final String COMMENT_REDOX_TAG = "redoxPotential";
    protected static final String COMMENT_TEMPERATURE_TAG = "temperatureDependence";
    protected static final String COMMENT_LINK_TAG = "link";
    protected static final String COMMENT_LINK_URI_ATTR = "uri";
    protected static final String COMMENT_EVENT_TAG = "event";
protected static final String COMMENT_ISOFORM_TAG = "isoform"; 186 protected static final String COMMENT_INTERACTANT_TAG = "interactant"; 187 protected static final String COMMENT_INTERACT_INTACT_ATTR = "intactId"; 188 protected static final String COMMENT_INTERACT_LABEL_TAG = "label"; 189 protected static final String COMMENT_ORGANISMS_TAG = "organismsDiffer"; 190 protected static final String COMMENT_EXPERIMENTS_TAG = "experiments"; 191 192 protected static final String NOTE_TAG = "note"; 193 protected static final String KEYWORD_TAG = "keyword"; 194 protected static final String PROTEIN_EXISTS_TAG = "proteinExistence"; 195 protected static final String ID_TAG = "id"; 196 197 protected static final String FEATURE_TAG = "feature"; 198 protected static final String FEATURE_DESC_ATTR = "description"; 199 protected static final String FEATURE_ORIGINAL_TAG = "original"; 200 protected static final String FEATURE_VARIATION_TAG = "variation"; 201 202 protected static final String EVIDENCE_TAG = "evidence"; 203 protected static final String EVIDENCE_CATEGORY_ATTR = "category"; 204 protected static final String EVIDENCE_ATTRIBUTE_ATTR = "attribute"; 205 protected static final String EVIDENCE_DATE_ATTR = "date"; 206 207 protected static final String LOCATION_TAG = "location"; 208 protected static final String LOCATION_SEQ_ATTR = "sequence"; 209 protected static final String LOCATION_BEGIN_TAG = "begin"; 210 protected static final String LOCATION_END_TAG = "end"; 211 protected static final String LOCATION_POSITION_ATTR = "position"; 212 protected static final String LOCATION_POSITION_TAG = "position"; 213 214 protected static final String SEQUENCE_TAG = "sequence"; 215 protected static final String SEQUENCE_VERSION_ATTR = "version"; 216 protected static final String SEQUENCE_LENGTH_ATTR = "length"; 217 protected static final String SEQUENCE_MASS_ATTR = "mass"; 218 protected static final String SEQUENCE_CHECKSUM_ATTR = "checksum"; 219 protected static final String 
SEQUENCE_MODIFIED_ATTR = "modified"; 220 221 // RP line parser 222 protected static final Pattern rppat = Pattern.compile("SEQUENCE OF (\\d+)-(\\d+)"); 223 224 protected static final Pattern xmlSchema = Pattern.compile(".*http://www\\.uniprot\\.org/support/docs/uniprot\\.xsd.*"); 225 226 /** 227 * Implements some UniProtXML-specific terms. 228 */ 229 public static class Terms extends RichSequence.Terms { 230 public static final String CONTAINS_PREFIX = "Contains:"; 231 public static final String INCLUDES_PREFIX = "Includes:"; 232 233 public static final String GENENAME_KEY = "primary"; 234 public static final String GENESYNONYM_KEY = "synonym"; 235 public static final String ORDLOCNAME_KEY = "ordered locus"; 236 public static final String ORFNAME_KEY = "ORF"; 237 238 public static final String NCBI_TAXON_KEY = "NCBI Taxonomy"; 239 public static final String COMMON_NAME_KEY = "common"; 240 public static final String FULL_NAME_KEY = "full"; 241 public static final String SCIENTIFIC_NAME_KEY = "scientific"; 242 public static final String SYNONYM_NAME_KEY = "synonym"; 243 public static final String ABBREV_NAME_KEY = "abbreviation"; 244 245 public static final String LOC_FUZZY_START_KEY = "less than"; 246 public static final String LOC_FUZZY_END_KEY = "greater than"; 247 248 // Ontology for uniprot keywords (because they have identifiers, aaargh...) 249 private static ComparableOntology uniprotKWOnto = null; 250 251 /** 252 * Getter for the protein exists term 253 * @return The protein exists Term 254 */ 255 public static ComparableTerm getProteinExistsTerm() { 256 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("UniProt protein exists"); 257 } 258 259 /** 260 * Getter for the private uniprot ontology. 261 * @return the ontology. 
262 */ 263 public static ComparableOntology getUniprotKWOnto() { 264 return (ComparableOntology)RichObjectFactory.getObject(SimpleComparableOntology.class, new Object[]{"uniprot_kw"}); 265 } 266 267 /** 268 * Getter for the UniProtXML term 269 * @return The UniProtXML Term 270 */ 271 public static ComparableTerm getUniProtXMLTerm() { 272 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("UniProtXML"); 273 } 274 275 /** 276 * Getter for the protein type term 277 * @return The protein type Term 278 */ 279 public static ComparableTerm getProteinTypeTerm() { 280 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("protein_type"); 281 } 282 283 /** 284 * Getter for the evidence category term 285 * @return The evidence category Term 286 */ 287 public static ComparableTerm getEvidenceCategoryTerm() { 288 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("evidence_category"); 289 } 290 291 /** 292 * Getter for the evidence type term 293 * @return The evidence type Term 294 */ 295 public static ComparableTerm getEvidenceTypeTerm() { 296 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("evidence_type"); 297 } 298 299 /** 300 * Getter for the evidence date term 301 * @return The evidence date Term 302 */ 303 public static ComparableTerm getEvidenceDateTerm() { 304 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("evidence_date"); 305 } 306 307 /** 308 * Getter for the evidence attr term 309 * @return The evidence attr Term 310 */ 311 public static ComparableTerm getEvidenceAttrTerm() { 312 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("evidence_attr"); 313 } 314 315 /** 316 * Getter for the feature ref term 317 * @return The feature ref Term 318 */ 319 public static ComparableTerm getFeatureRefTerm() { 320 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("feature_ref"); 321 } 322 323 /** 324 * Getter for the feature status term 325 * @return The feature status Term 326 */ 327 public 
static ComparableTerm getFeatureStatusTerm() { 328 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("feature_status"); 329 } 330 331 /** 332 * Getter for the feature original term 333 * @return The feature original Term 334 */ 335 public static ComparableTerm getFeatureOriginalTerm() { 336 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("feature_original"); 337 } 338 339 /** 340 * Getter for the feature variation term 341 * @return The feature variation Term 342 */ 343 public static ComparableTerm getFeatureVariationTerm() { 344 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("feature_variation"); 345 } 346 347 /** 348 * Getter for the location seq term 349 * @return The location seq Term 350 */ 351 public static ComparableTerm getLocationSequenceTerm() { 352 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("locseq"); 353 } 354 } 355 356 /** 357 * {@inheritDoc} 358 * A file is in UniProtXML format if the second XML line contains the phrase "http://www.uniprot.org/support/docs/uniprot.xsd". 359 */ 360 @Override 361 public boolean canRead(File file) throws IOException { 362 BufferedReader br = new BufferedReader(new FileReader(file)); 363 br.readLine(); // skip first line 364 String secondLine = br.readLine(); 365 boolean readable = secondLine!=null && xmlSchema.matcher(secondLine).matches(); // check on second line 366 br.close(); 367 return readable; 368 } 369 370 /** 371 * {@inheritDoc} 372 * Always returns a protein tokenizer. 373 */ 374 @Override 375 public SymbolTokenization guessSymbolTokenization(File file) throws IOException { 376 return RichSequence.IOTools.getProteinParser(); 377 } 378 379 /** 380 * {@inheritDoc} 381 * A stream is in UniProtXML format if the second XML line contains the phrase "http://www.uniprot.org/support/docs/uniprot.xsd". 
382 */ 383 public boolean canRead(BufferedInputStream stream) throws IOException { 384 stream.mark(2000); // some streams may not support this 385 BufferedReader br = new BufferedReader(new InputStreamReader(stream)); 386 br.readLine(); // skip first line 387 String secondLine = br.readLine(); 388 boolean readable = secondLine!=null && xmlSchema.matcher(secondLine).matches(); // check on second line 389 // don't close the reader as it'll close the stream too. 390 // br.close(); 391 stream.reset(); 392 return readable; 393 } 394 395 /** 396 * {@inheritDoc} 397 * Always returns a protein tokenizer. 398 */ 399 public SymbolTokenization guessSymbolTokenization(BufferedInputStream stream) throws IOException { 400 return RichSequence.IOTools.getProteinParser(); 401 } 402 403 /** 404 * {@inheritDoc} 405 */ 406 public boolean readSequence(BufferedReader reader, 407 SymbolTokenization symParser, 408 SeqIOListener listener) 409 throws IllegalSymbolException, IOException, ParseException { 410 if (!(listener instanceof RichSeqIOListener)) throw new IllegalArgumentException("Only accepting RichSeqIOListeners today"); 411 return this.readRichSequence(reader,symParser,(RichSeqIOListener)listener,null); 412 } 413 414 /** 415 * {@inheritDoc} 416 * If namespace is null, then the namespace of the sequence in the fasta is used. 417 * If the namespace is null and so is the namespace of the sequence in the fasta, 418 * then the default namespace is used. 
419 */ 420 public boolean readRichSequence(BufferedReader reader, 421 SymbolTokenization symParser, 422 RichSeqIOListener rlistener, 423 Namespace ns) 424 throws IllegalSymbolException, IOException, ParseException { 425 426 Pattern copyright = Pattern.compile(".*<"+COPYRIGHT_TAG+".*"); 427 428 try { 429 rlistener.startSequence(); 430 DefaultHandler m_handler = new UniProtXMLHandler(this,symParser,rlistener,ns); 431 boolean hasMore=XMLTools.readXMLChunk(reader, m_handler, ENTRY_TAG); 432 // deal with copyright chunk 433 reader.mark(10000); 434 String line = reader.readLine(); 435 reader.reset(); 436 if (copyright.matcher(line).matches()) XMLTools.readXMLChunk(reader, m_handler, COPYRIGHT_TAG); 437 // all done! 438 rlistener.endSequence(); 439 return hasMore; 440 } catch (ParserConfigurationException e) { 441 throw new ParseException(e); 442 } catch (SAXException e) { 443 throw new ParseException(e); 444 } 445 } 446 447 private PrintWriter pw; 448 private XMLWriter xml; 449 450 /** 451 * {@inheritDoc} 452 */ 453 public void beginWriting() throws IOException { 454 // make an XML writer 455 pw = new PrintWriter(this.getPrintStream()); 456 xml = new PrettyXMLWriter(pw); 457 xml.printRaw("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"); 458 xml.openTag(ENTRY_GROUP_TAG); 459 xml.attribute("xmlns","http://uniprot.org/uniprot"); 460 xml.attribute("xmlns:xsi","http://www.w3.org/2001/XMLSchema-instance"); 461 xml.attribute("xsi:schemaLocation","http://uniprot.org/uniprot http://www.uniprot.org/support/docs/uniprot.xsd"); 462 } 463 464 /** 465 * {@inheritDoc} 466 */ 467 public void finishWriting() throws IOException { 468 xml.closeTag(ENTRY_GROUP_TAG); 469 pw.flush(); 470 } 471 472 /** 473 * {@inheritDoc} 474 */ 475 public void writeSequence(Sequence seq, PrintStream os) throws IOException { 476 if (this.getPrintStream()==null) this.setPrintStream(this.getPrintStream()); 477 this.writeSequence(seq, RichObjectFactory.getDefaultNamespace()); 478 } 479 480 /** 481 * {@inheritDoc} 
482 */ 483 public void writeSequence(Sequence seq, String format, PrintStream os) throws IOException { 484 if (this.getPrintStream()==null) this.setPrintStream(this.getPrintStream()); 485 if (!format.equals(this.getDefaultFormat())) throw new IllegalArgumentException("Unknown format: "+format); 486 this.writeSequence(seq, RichObjectFactory.getDefaultNamespace()); 487 } 488 489 /** 490 * {@inheritDoc} 491 * If namespace is null, then the sequence's own namespace is used. 492 */ 493 public void writeSequence(Sequence seq, Namespace ns) throws IOException { 494 RichSequence rs; 495 try { 496 if (seq instanceof RichSequence) rs = (RichSequence)seq; 497 else rs = RichSequence.Tools.enrich(seq); 498 } catch (ChangeVetoException e) { 499 IOException e2 = new IOException("Unable to enrich sequence"); 500 e2.initCause(e); 501 throw e2; 502 } 503 504 int key = 1; 505 506 Set<Note> notes = rs.getNoteSet(); 507 List accessions = new ArrayList(); 508 List kws = new ArrayList(); 509 String cdat = null; 510 String udat = null; 511 String arel = null; 512 String adat = null; 513 String copyright = null; 514 String proteinType = null; 515 String proteinExists = null; 516 Map genenames = new TreeMap(); 517 Map genesynonyms = new TreeMap(); 518 Map orfnames = new TreeMap(); 519 Map ordlocnames = new TreeMap(); 520 Set evidenceIDs = new TreeSet(); 521 Set organelles = new TreeSet(); 522 Map evcats = new TreeMap(); 523 Map evtypes = new TreeMap(); 524 Map evdates = new TreeMap(); 525 Map evattrs = new TreeMap(); 526 Map speciesRecs = new TreeMap(); 527 Map strainRecs = new TreeMap(); 528 Map tissueRecs = new TreeMap(); 529 Map transpRecs = new TreeMap(); 530 Map plasmidRecs = new TreeMap(); 531 for (Iterator<Note> i = notes.iterator(); i.hasNext();) { 532 Note n = i.next(); 533 if (n.getTerm().equals(Terms.getDateCreatedTerm())) cdat=n.getValue(); 534 else if (n.getTerm().equals(Terms.getDateUpdatedTerm())) udat=n.getValue(); 535 else if 
(n.getTerm().equals(Terms.getRelAnnotatedTerm())) arel=n.getValue(); 536 else if (n.getTerm().equals(Terms.getDateAnnotatedTerm())) adat=n.getValue(); 537 else if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) accessions.add(n.getValue()); 538 else if (n.getTerm().equals(Terms.getOrganelleTerm())) organelles.add(n.getValue()); 539 else if (n.getTerm().equals(Terms.getKeywordTerm())) { 540 ComparableTerm t = Terms.getUniprotKWOnto().getOrCreateTerm(n.getValue()); 541 try { 542 if (t.getIdentifier()==null || t.getIdentifier().length()==0) t.setIdentifier("UNKNOWN"); 543 } catch (ChangeVetoException ce) { 544 IOException e = new IOException("Failed to assign keyword identifier"); 545 e.initCause(ce); 546 throw e; 547 } 548 kws.add(t); 549 } else if (n.getTerm().equals(Terms.getCopyrightTerm())) copyright=n.getValue(); 550 else if (n.getTerm().equals(Terms.getProteinTypeTerm())) proteinType=n.getValue(); 551 else if (n.getTerm().equals(Terms.getProteinExistsTerm())) proteinExists=n.getValue(); 552 // use the nasty hack to split the reference rank away from the actual value in this field 553 else if (n.getTerm().equals(Terms.getGeneNameTerm())) { 554 String ref = n.getValue(); 555 int colon = ref.indexOf(':'); 556 Integer refID = new Integer(0); 557 if (colon>=1) refID = new Integer(ref.substring(0,colon)); 558 genenames.put(refID, ref.substring(colon+1)); // map of id -> string as only one name per gene 559 } else if (n.getTerm().equals(Terms.getGeneSynonymTerm())) { 560 String ref = n.getValue(); 561 int colon = ref.indexOf(':'); 562 Integer refID = new Integer(0); 563 if (colon>=1) refID = new Integer(ref.substring(0,colon)); 564 if (genesynonyms.get(refID)==null) genesynonyms.put(refID, new ArrayList()); 565 ((List)genesynonyms.get(refID)).add(ref.substring(colon+1)); 566 } else if (n.getTerm().equals(Terms.getOrderedLocusNameTerm())) { 567 String ref = n.getValue(); 568 int colon = ref.indexOf(':'); 569 Integer refID = new Integer(0); 570 if (colon>=1) 
refID = new Integer(ref.substring(0,colon)); 571 if (ordlocnames.get(refID)==null) ordlocnames.put(refID, new ArrayList()); 572 ((List)ordlocnames.get(refID)).add(ref.substring(colon+1)); 573 } else if (n.getTerm().equals(Terms.getORFNameTerm())) { 574 String ref = n.getValue(); 575 int colon = ref.indexOf(':'); 576 Integer refID = new Integer(0); 577 if (colon>=1) refID = new Integer(ref.substring(0,colon)); 578 if (orfnames.get(refID)==null) orfnames.put(refID, new ArrayList()); 579 ((List)orfnames.get(refID)).add(ref.substring(colon+1)); 580 } 581 // use the nasty hack to split the reference rank away from the actual value in this field 582 else if (n.getTerm().equals(Terms.getEvidenceCategoryTerm())) { 583 String ref = n.getValue(); 584 int colon = ref.indexOf(':'); 585 Integer refID = new Integer(0); 586 if (colon>=1) refID = new Integer(ref.substring(0,colon)); 587 evcats.put(refID, ref.substring(colon+1)); // map of id -> string as only one name per gene 588 evidenceIDs.add(refID); 589 } else if (n.getTerm().equals(Terms.getEvidenceTypeTerm())) { 590 String ref = n.getValue(); 591 int colon = ref.indexOf(':'); 592 Integer refID = new Integer(0); 593 if (colon>=1) refID = new Integer(ref.substring(0,colon)); 594 evtypes.put(refID, ref.substring(colon+1)); // map of id -> string as only one name per gene 595 evidenceIDs.add(refID); 596 } else if (n.getTerm().equals(Terms.getEvidenceDateTerm())) { 597 String ref = n.getValue(); 598 int colon = ref.indexOf(':'); 599 Integer refID = new Integer(0); 600 if (colon>=1) refID = new Integer(ref.substring(0,colon)); 601 evdates.put(refID, ref.substring(colon+1)); // map of id -> string as only one name per gene 602 evidenceIDs.add(refID); 603 } else if (n.getTerm().equals(Terms.getEvidenceAttrTerm())) { 604 String ref = n.getValue(); 605 int colon = ref.indexOf(':'); 606 Integer refID = new Integer(0); 607 if (colon>=1) refID = new Integer(ref.substring(0,colon)); 608 evattrs.put(refID, ref.substring(colon+1)); // map 
of id -> string as only one name per gene 609 evidenceIDs.add(refID); 610 } 611 // use the nasty hack to split the reference rank away from the actual value in this field 612 // we'll end up with a bunch in key 0 for those which did not come from us. We ignore these for now. 613 else if (n.getTerm().equals(Terms.getSpeciesTerm())) { 614 String ref = n.getValue(); 615 int colon = ref.indexOf(':'); 616 Integer refID = new Integer(0); 617 if (colon>=1) refID = new Integer(ref.substring(0,colon)); 618 if (speciesRecs.get(refID)==null) speciesRecs.put(refID, new ArrayList()); 619 ((List)speciesRecs.get(refID)).add(ref.substring(colon+1)); 620 } else if (n.getTerm().equals(Terms.getStrainTerm())) { 621 String ref = n.getValue(); 622 int colon = ref.indexOf(':'); 623 Integer refID = new Integer(0); 624 if (colon>=1) refID = new Integer(ref.substring(0,colon)); 625 if (strainRecs.get(refID)==null) strainRecs.put(refID, new ArrayList()); 626 ((List)strainRecs.get(refID)).add(ref.substring(colon+1)); 627 } else if (n.getTerm().equals(Terms.getTissueTerm())) { 628 String ref = n.getValue(); 629 int colon = ref.indexOf(':'); 630 Integer refID = new Integer(0); 631 if (colon>=1) refID = new Integer(ref.substring(0,colon)); 632 if (tissueRecs.get(refID)==null) tissueRecs.put(refID, new ArrayList()); 633 ((List)tissueRecs.get(refID)).add(ref.substring(colon+1)); 634 } else if (n.getTerm().equals(Terms.getTransposonTerm())) { 635 String ref = n.getValue(); 636 int colon = ref.indexOf(':'); 637 Integer refID = new Integer(0); 638 if (colon>=1) refID = new Integer(ref.substring(0,colon)); 639 if (transpRecs.get(refID)==null) transpRecs.put(refID, new ArrayList()); 640 ((List)transpRecs.get(refID)).add(ref.substring(colon+1)); 641 } else if (n.getTerm().equals(Terms.getPlasmidTerm())) { 642 String ref = n.getValue(); 643 int colon = ref.indexOf(':'); 644 Integer refID = new Integer(0); 645 if (colon>=1) refID = new Integer(ref.substring(0,colon)); 646 if 
(plasmidRecs.get(refID)==null) plasmidRecs.put(refID, new ArrayList()); 647 ((List)plasmidRecs.get(refID)).add(ref.substring(colon+1)); 648 } 649 } 650 651 xml.openTag(ENTRY_TAG); 652 xml.attribute(ENTRY_VERSION_ATTR,""+(arel==null?""+rs.getVersion():arel)); 653 xml.attribute(ENTRY_NAMESPACE_ATTR,(ns==null?rs.getNamespace().getName():ns.getName())); 654 xml.attribute(ENTRY_CREATED_ATTR,cdat); 655 xml.attribute(ENTRY_UPDATED_ATTR,(adat==null?cdat:adat)); // annotation update 656 657 xml.openTag(ACCESSION_TAG); 658 xml.print(rs.getAccession()); 659 xml.closeTag(ACCESSION_TAG); 660 661 xml.openTag(NAME_TAG); 662 xml.print(rs.getName()); 663 xml.closeTag(NAME_TAG); 664 665 xml.openTag(PROTEIN_TAG); 666 if (proteinType!=null) xml.attribute(TYPE_ATTR,proteinType); 667 String desc = rs.getDescription().trim(); // this is only going to make sense if it was a UniProt seq to start with 668 if (desc.endsWith(".")) desc = desc.substring(0, desc.length()-1); // chomp trailing dot 669 String[] parts = desc.split("\\["); 670 for (int j = 0 ; j < parts.length; j++) { 671 if (parts[j].startsWith(Terms.CONTAINS_PREFIX)) { 672 // contains section 673 String chunk = parts[j].substring(Terms.CONTAINS_PREFIX.length()+1).trim(); 674 if (chunk.endsWith("]")) chunk = chunk.substring(0, chunk.length()-1); // chomp trailing ] 675 String[] moreparts = chunk.split(";"); 676 for (int k = 0; k < moreparts.length; k++) { 677 xml.openTag(DOMAIN_TAG); 678 String[] names = moreparts[k].split("\\("); 679 for (int l = 0; l < names.length; l++) { 680 String name = names[l].trim(); 681 if (name.endsWith(")")) name = name.substring(0,name.length()-1); // chomp trailing ) 682 xml.openTag(NAME_TAG); 683 xml.print(name); 684 xml.closeTag(NAME_TAG); 685 } 686 xml.closeTag(DOMAIN_TAG); 687 } 688 } else if (parts[j].startsWith(Terms.INCLUDES_PREFIX)) { 689 // includes section 690 String chunk = parts[j].substring(Terms.INCLUDES_PREFIX.length()+1).trim(); 691 if (chunk.endsWith("]")) chunk = chunk.substring(0, 
chunk.length()-1); // chomp trailing ] 692 String[] moreparts = chunk.split(";"); 693 for (int k = 0; k < moreparts.length; k++) { 694 xml.openTag(COMPONENT_TAG); 695 String[] names = moreparts[k].split("\\("); 696 for (int l = 0; l < names.length; l++) { 697 String name = names[l].trim(); 698 if (name.endsWith(")")) name = name.substring(0,name.length()-1); // chomp trailing ) 699 xml.openTag(NAME_TAG); 700 xml.print(name); 701 xml.closeTag(NAME_TAG); 702 } 703 xml.closeTag(COMPONENT_TAG); 704 } 705 } else { 706 // plain names 707 String[] names = parts[j].split("\\("); 708 for (int l = 0; l < names.length; l++) { 709 String name = names[l].trim(); 710 if (name.endsWith(")")) name = name.substring(0,name.length()-1); // chomp trailing ) 711 xml.openTag(NAME_TAG); 712 xml.print(name); 713 xml.closeTag(NAME_TAG); 714 } 715 } 716 } 717 xml.closeTag(PROTEIN_TAG); 718 719 // gene line 720 for (Iterator i = genenames.keySet().iterator(); i.hasNext(); ) { 721 Integer geneid = (Integer)i.next(); 722 String genename = (String)genenames.get(geneid); 723 List synonyms = (List)genesynonyms.get(geneid); 724 List orfs = (List)orfnames.get(geneid); 725 List ordlocs = (List)ordlocnames.get(geneid); 726 727 xml.openTag(GENE_TAG); 728 729 xml.openTag(NAME_TAG); 730 xml.attribute(TYPE_ATTR,Terms.GENENAME_KEY); 731 xml.print(genename); 732 xml.closeTag(NAME_TAG); 733 734 if (synonyms!=null) { 735 for (Iterator j = synonyms.iterator(); j.hasNext(); ) { 736 xml.openTag(NAME_TAG); 737 xml.attribute(TYPE_ATTR,Terms.GENESYNONYM_KEY); 738 xml.print((String)j.next()); 739 xml.closeTag(NAME_TAG); 740 } 741 } 742 if (ordlocs!=null) { 743 for (Iterator j = synonyms.iterator(); j.hasNext(); ) { 744 xml.openTag(NAME_TAG); 745 xml.attribute(TYPE_ATTR,Terms.ORDLOCNAME_KEY); 746 xml.print((String)j.next()); 747 xml.closeTag(NAME_TAG); 748 } 749 } 750 if (orfs!=null) { 751 for (Iterator j = synonyms.iterator(); j.hasNext(); ) { 752 xml.openTag(NAME_TAG); 753 
xml.attribute(TYPE_ATTR,Terms.ORFNAME_KEY); 754 xml.print((String)j.next()); 755 xml.closeTag(NAME_TAG); 756 } 757 } 758 759 xml.closeTag(GENE_TAG); 760 } 761 762 // source line (from taxon) 763 // organism line 764 NCBITaxon tax = rs.getTaxon(); 765 if (tax!=null) { 766 xml.openTag(ORGANISM_TAG); 767 xml.attribute(KEY_ATTR,""+(key++)); 768 769 for (Iterator i = tax.getNameClasses().iterator(); i.hasNext(); ) { 770 String nameclass = (String)i.next(); 771 String ournameclass = Terms.COMMON_NAME_KEY; 772 if (nameclass.equalsIgnoreCase(Terms.FULL_NAME_KEY)) ournameclass = NCBITaxon.EQUIVALENT; 773 else if (nameclass.equalsIgnoreCase(Terms.SCIENTIFIC_NAME_KEY)) ournameclass = NCBITaxon.SCIENTIFIC; 774 else if (nameclass.equalsIgnoreCase(Terms.SYNONYM_NAME_KEY)) ournameclass = NCBITaxon.SYNONYM; 775 else if (nameclass.equalsIgnoreCase(Terms.ABBREV_NAME_KEY)) ournameclass = NCBITaxon.ACRONYM; 776 for (Iterator j = tax.getNames(nameclass).iterator(); j.hasNext(); ) { 777 xml.openTag(NAME_TAG); 778 xml.attribute(TYPE_ATTR,ournameclass); 779 xml.print((String)j.next()); 780 xml.closeTag(NAME_TAG); 781 } 782 } 783 784 xml.openTag(DBXREF_TAG); 785 xml.attribute(KEY_ATTR,""+(key++)); 786 xml.attribute(TYPE_ATTR,Terms.NCBI_TAXON_KEY); 787 xml.attribute(ID_ATTR,""+tax.getNCBITaxID()); 788 xml.closeTag(DBXREF_TAG); 789 790 String h = tax.getNameHierarchy(); 791 h = h.substring(0, h.length()-1); // chomp dot 792 String[] hierarch = h.split(";"); 793 xml.openTag(LINEAGE_TAG); 794 for (int j = 0; j < hierarch.length; j++) { 795 xml.openTag(TAXON_TAG); 796 xml.print(hierarch[j].trim()); 797 xml.closeTag(TAXON_TAG); 798 } 799 xml.closeTag(LINEAGE_TAG); 800 801 xml.closeTag(ORGANISM_TAG); 802 } 803 804 // gene location line (organelle) 805 for (Iterator i = organelles.iterator(); i.hasNext(); ) { 806 String org = (String)i.next(); 807 xml.openTag(GENELOCATION_TAG); 808 if (org.startsWith("Plasmid")) { 809 xml.attribute(TYPE_ATTR,"plasmid"); 810 String[] subparts = org.split(","); 811 
for (int j = 0; j < parts.length; j++) { 812 org = subparts[j].trim(); 813 if (org.startsWith("and")) org = org.substring(3).trim(); 814 org = org.substring("Plasmid".length()).trim(); 815 xml.openTag(GENELOCATION_NAME_TAG); 816 xml.attribute(STATUS_ATTR,"known"); 817 xml.print(org); 818 xml.closeTag(GENELOCATION_NAME_TAG); 819 } 820 } else { 821 xml.attribute(TYPE_ATTR,org.toLowerCase()); // uniprotxml must have lower case 822 } 823 xml.closeTag(GENELOCATION_TAG); 824 } 825 826 // docrefs 827 for (Iterator<RankedDocRef> i = rs.getRankedDocRefs().iterator(); i.hasNext(); ) { 828 RankedDocRef rdr = i.next(); 829 DocRef dr = rdr.getDocumentReference(); 830 831 xml.openTag(REFERENCE_TAG); 832 xml.attribute(KEY_ATTR,""+(key++)); 833 834 xml.openTag(CITATION_TAG); 835 xml.attribute(TYPE_ATTR,"journal article"); // faking it i know 836 837 if (dr.getTitle()!=null) { 838 xml.openTag(TITLE_TAG); 839 xml.print(dr.getTitle()); 840 xml.closeTag(TITLE_TAG); 841 } 842 843 List<DocRefAuthor> auths = new ArrayList(dr.getAuthorList()); 844 List<DocRefAuthor> editors = new ArrayList<DocRefAuthor>(auths); 845 for (final Iterator<DocRefAuthor> j = editors.iterator(); j.hasNext(); ) { 846 DocRefAuthor a = j.next(); 847 if (!a.isEditor()) 848 j.remove(); 849 else 850 auths.remove(a); 851 } 852 if (!editors.isEmpty()) { 853 xml.openTag(EDITOR_LIST_TAG); 854 for (Iterator<DocRefAuthor> j = editors.iterator(); j.hasNext(); ) { 855 DocRefAuthor a = j.next(); 856 if (a.isEditor()) { 857 if (a.isConsortium()) { 858 xml.openTag(CONSORTIUM_TAG); 859 xml.attribute(NAME_ATTR,a.getName()); 860 xml.closeTag(CONSORTIUM_TAG); 861 } else { 862 xml.openTag(PERSON_TAG); 863 xml.attribute(NAME_ATTR,a.getName()); 864 xml.closeTag(PERSON_TAG); 865 } 866 } 867 } 868 xml.closeTag(EDITOR_LIST_TAG); 869 } 870 if (!auths.isEmpty()) { 871 xml.openTag(AUTHOR_LIST_TAG); 872 for (Iterator j = auths.iterator(); j.hasNext(); ) { 873 DocRefAuthor a = (DocRefAuthor)j.next(); 874 if (a.isConsortium()) { 875 
xml.openTag(CONSORTIUM_TAG); 876 xml.attribute(NAME_ATTR,a.getName()); 877 xml.closeTag(CONSORTIUM_TAG); 878 } else { 879 xml.openTag(PERSON_TAG); 880 xml.attribute(NAME_ATTR,a.getName()); 881 xml.closeTag(PERSON_TAG); 882 } 883 } 884 xml.closeTag(AUTHOR_LIST_TAG); 885 } 886 887 xml.openTag(LOCATOR_TAG); 888 xml.print(dr.getLocation()); 889 xml.closeTag(LOCATOR_TAG); 890 891 CrossRef cr = dr.getCrossref(); 892 if (cr!=null) { 893 xml.openTag(DBXREF_TAG); 894 xml.attribute(TYPE_ATTR,cr.getDbname()); 895 xml.attribute(ID_ATTR,cr.getAccession()); 896 xml.attribute(KEY_ATTR,""+(key++)); 897 if (!cr.getNoteSet().isEmpty()) { 898 for (Iterator<Note> j = cr.getNoteSet().iterator(); j.hasNext(); ) { 899 Note n = j.next(); 900 xml.openTag(PROPERTY_TAG); 901 xml.attribute(TYPE_ATTR,n.getTerm().getName()); 902 xml.attribute(VALUE_ATTR,n.getValue()); 903 xml.closeTag(PROPERTY_TAG); 904 } 905 } 906 xml.closeTag(DBXREF_TAG); 907 } 908 909 xml.closeTag(CITATION_TAG); 910 911 // RP 912 xml.openTag(RP_LINE_TAG); 913 xml.print(dr.getRemark()); 914 xml.closeTag(RP_LINE_TAG); 915 // Print out ref position if present 916 if (rdr.getStart()!=null && rdr.getEnd()!=null && !rppat.matcher(dr.getRemark()).matches()) { 917 xml.openTag(RP_LINE_TAG); 918 xml.print("SEQUENCE OF "+rdr.getStart()+"-"+rdr.getEnd()+"."); 919 xml.closeTag(RP_LINE_TAG); 920 } 921 922 // RC 923 boolean rcOpened = false; 924 Integer rank = new Integer(rdr.getRank()); 925 if (speciesRecs.get(rank)!=null) { 926 if (!rcOpened) { 927 xml.openTag(RC_LINE_TAG); 928 rcOpened = true; 929 } 930 for (Iterator j = ((List)speciesRecs.get(rank)).iterator(); j.hasNext(); ) { 931 xml.openTag(RC_SPECIES_TAG); 932 xml.print((String)j.next()); 933 xml.closeTag(RC_SPECIES_TAG); 934 } 935 } 936 if (strainRecs.get(rank)!=null) { 937 if (!rcOpened) { 938 xml.openTag(RC_LINE_TAG); 939 rcOpened = true; 940 } 941 for (Iterator j = ((List)strainRecs.get(rank)).iterator(); j.hasNext(); ) { 942 xml.openTag(RC_STRAIN_TAG); 943 
xml.print((String)j.next()); 944 xml.closeTag(RC_STRAIN_TAG); 945 } 946 } 947 if (tissueRecs.get(rank)!=null) { 948 if (!rcOpened) { 949 xml.openTag(RC_LINE_TAG); 950 rcOpened = true; 951 } 952 for (Iterator j = ((List)tissueRecs.get(rank)).iterator(); j.hasNext(); ) { 953 xml.openTag(RC_TISSUE_TAG); 954 xml.print((String)j.next()); 955 xml.closeTag(RC_TISSUE_TAG); 956 } 957 } 958 if (transpRecs.get(rank)!=null) { 959 if (!rcOpened) { 960 xml.openTag(RC_LINE_TAG); 961 rcOpened = true; 962 } 963 for (Iterator j = ((List)transpRecs.get(rank)).iterator(); j.hasNext(); ) { 964 xml.openTag(RC_TRANSP_TAG); 965 xml.print((String)j.next()); 966 xml.closeTag(RC_TRANSP_TAG); 967 } 968 } 969 if (plasmidRecs.get(rank)!=null) { 970 if (!rcOpened) { 971 xml.openTag(RC_LINE_TAG); 972 rcOpened = true; 973 } 974 for (Iterator j = ((List)plasmidRecs.get(rank)).iterator(); j.hasNext(); ) { 975 xml.openTag(RC_PLASMID_TAG); 976 xml.print((String)j.next()); 977 xml.closeTag(RC_PLASMID_TAG); 978 } 979 } 980 if (rcOpened) 981 xml.closeTag(RC_LINE_TAG); 982 983 xml.closeTag(REFERENCE_TAG); 984 } 985 986 // comments 987 for (Iterator<Comment> i = rs.getComments().iterator(); i.hasNext(); ) { 988 // use UniProtCommentParser to convert each text comment from string to object 989 // do not print unconvertible ones (eg. 
no -!- on text) 990 Comment c = i.next(); 991 if (UniProtCommentParser.isParseable(c)) { 992 // otherwise parse and display appropriately 993 UniProtCommentParser ucp = new UniProtCommentParser(); 994 try { 995 ucp.parseComment(c); 996 } catch (ParseException ce) { 997 IOException e = new IOException("Failed to parse comment when outputting"); 998 e.initCause(ce); 999 throw e; 1000 } 1001 String type = ucp.getCommentType(); 1002 String xtype = type.toLowerCase(); // uniprotxml requires lower case 1003 if (type.equalsIgnoreCase(UniProtCommentParser.PTM)) xtype = "posttranslational modification"; 1004 else if (type.equalsIgnoreCase(UniProtCommentParser.DATABASE)) xtype = "online information"; 1005 1006 xml.openTag(COMMENT_TAG); 1007 xml.attribute(TYPE_ATTR,xtype); 1008 1009 // database comment 1010 if (type.equalsIgnoreCase(UniProtCommentParser.DATABASE)) { 1011 xml.attribute(NAME_ATTR,ucp.getDatabaseName()); 1012 1013 xml.openTag(COMMENT_LINK_TAG); 1014 xml.attribute(COMMENT_LINK_URI_ATTR,ucp.getUri()); 1015 xml.closeTag(COMMENT_LINK_TAG); 1016 } 1017 // mass spec 1018 else if (type.equalsIgnoreCase(UniProtCommentParser.MASS_SPECTROMETRY)) { 1019 xml.attribute(COMMENT_MASS_ATTR,""+ucp.getMolecularWeight()); 1020 if (ucp.getMolWeightError()!=null) xml.attribute(COMMENT_ERROR_ATTR,""+ucp.getMolWeightError()); 1021 xml.attribute(COMMENT_METHOD_ATTR,""+ucp.getMolWeightMethod()); 1022 1023 xml.openTag(LOCATION_TAG); 1024 xml.openTag(LOCATION_BEGIN_TAG); 1025 xml.attribute(LOCATION_POSITION_ATTR,""+ucp.getMolWeightRangeStart()); 1026 xml.closeTag(LOCATION_BEGIN_TAG); 1027 xml.openTag(LOCATION_END_TAG); 1028 xml.attribute(LOCATION_POSITION_ATTR,""+ucp.getMolWeightRangeEnd()); 1029 xml.closeTag(LOCATION_END_TAG); 1030 xml.closeTag(LOCATION_TAG); 1031 } 1032 // interaction 1033 else if (type.equalsIgnoreCase(UniProtCommentParser.INTERACTION)) { 1034 // UniProt flat allows for multiple interactions per comment, but 1035 // UniProtXML only allows for a single one. 
So, we have to open/close 1036 // and write additional comments as necessary. 1037 for (Iterator j = ucp.getInteractions().iterator(); j.hasNext(); ) { 1038 // process comment 1039 Interaction interact = (Interaction)j.next(); 1040 1041 xml.openTag(COMMENT_INTERACTANT_TAG); 1042 xml.attribute(COMMENT_INTERACT_INTACT_ATTR,interact.getFirstIntActID()); 1043 xml.closeTag(COMMENT_INTERACTANT_TAG); 1044 1045 xml.openTag(COMMENT_INTERACTANT_TAG); 1046 xml.attribute(COMMENT_INTERACT_INTACT_ATTR,interact.getSecondIntActID()); 1047 xml.openTag(ID_TAG); 1048 xml.print(interact.getID()); 1049 xml.closeTag(ID_TAG); 1050 if (interact.getLabel()!=null) { 1051 xml.openTag(COMMENT_INTERACT_LABEL_TAG); 1052 xml.print(interact.getLabel()); 1053 xml.closeTag(COMMENT_INTERACT_LABEL_TAG); 1054 } 1055 xml.closeTag(COMMENT_INTERACTANT_TAG); 1056 1057 xml.openTag(COMMENT_ORGANISMS_TAG); 1058 xml.print(interact.isOrganismsDiffer()?"true":"false"); 1059 xml.closeTag(COMMENT_ORGANISMS_TAG); 1060 1061 xml.openTag(COMMENT_EXPERIMENTS_TAG); 1062 xml.print(""+interact.getNumberExperiments()); 1063 xml.closeTag(COMMENT_EXPERIMENTS_TAG); 1064 1065 // if has next, close and open next comment tag 1066 if (j.hasNext()) { 1067 xml.closeTag(COMMENT_TAG); 1068 xml.openTag(COMMENT_TAG); 1069 xml.attribute(TYPE_ATTR,xtype); 1070 } 1071 } 1072 } 1073 // alternative products 1074 else if (type.equalsIgnoreCase(UniProtCommentParser.ALTERNATIVE_PRODUCTS)) { 1075 for (Iterator j = ucp.getEvents().iterator(); j.hasNext(); ) { 1076 Event event = (Event)j.next(); 1077 xml.openTag(COMMENT_EVENT_TAG); 1078 xml.attribute(TYPE_ATTR,event.getType().toLowerCase()); // uniprotxml requires lowercase 1079 xml.closeTag(COMMENT_EVENT_TAG); 1080 } 1081 for (Iterator j = ucp.getIsoforms().iterator(); j.hasNext(); ) { 1082 Isoform isoform = (Isoform)j.next(); 1083 xml.openTag(COMMENT_ISOFORM_TAG); 1084 for (Iterator k = isoform.getIsoIDs().iterator(); k.hasNext(); ) { 1085 xml.openTag(ID_TAG); 1086 xml.print((String)k.next()); 
1087 xml.closeTag(ID_TAG); 1088 } 1089 for (Iterator k = isoform.getNames().iterator(); k.hasNext(); ) { 1090 xml.openTag(NAME_TAG); 1091 xml.print((String)k.next()); 1092 xml.closeTag(NAME_TAG); 1093 } 1094 xml.openTag(SEQUENCE_TAG); 1095 xml.attribute(TYPE_ATTR,isoform.getSequenceType().toLowerCase()); 1096 if (isoform.getSequenceType().equalsIgnoreCase("Described")) { 1097 xml.attribute(REF_ATTR,isoform.getSequenceRef()); 1098 } 1099 xml.closeTag(SEQUENCE_TAG); 1100 xml.openTag(NOTE_TAG); 1101 xml.print(isoform.getNote()); 1102 xml.closeTag(NOTE_TAG); 1103 xml.closeTag(COMMENT_ISOFORM_TAG); 1104 } 1105 } 1106 // biophysicoblahblah stuff 1107 else if (type.equalsIgnoreCase(UniProtCommentParser.BIOPHYSICOCHEMICAL_PROPERTIES)) { 1108 if (ucp.getAbsorptionNote()!=null) { 1109 xml.openTag(COMMENT_ABSORPTION_TAG); 1110 xml.openTag(COMMENT_ABS_MAX_TAG); 1111 xml.print(ucp.getAbsorptionMax()); 1112 xml.closeTag(COMMENT_ABS_MAX_TAG); 1113 xml.openTag(TEXT_TAG); 1114 xml.print(ucp.getAbsorptionNote()); 1115 xml.closeTag(TEXT_TAG); 1116 xml.closeTag(COMMENT_ABSORPTION_TAG); 1117 } 1118 if (ucp.getKineticsNote()!=null) { 1119 xml.openTag(COMMENT_KINETICS_TAG); 1120 for (Iterator j = ucp.getKMs().iterator(); j.hasNext(); ) { 1121 xml.openTag(COMMENT_KIN_KM_TAG); 1122 xml.print((String)j.next()); 1123 xml.closeTag(COMMENT_KIN_KM_TAG); 1124 } 1125 for (Iterator j = ucp.getVMaxes().iterator(); j.hasNext(); ) { 1126 xml.openTag(COMMENT_KIN_VMAX_TAG); 1127 xml.print((String)j.next()); 1128 xml.closeTag(COMMENT_KIN_VMAX_TAG); 1129 } 1130 xml.openTag(TEXT_TAG); 1131 xml.print(ucp.getKineticsNote()); 1132 xml.closeTag(TEXT_TAG); 1133 xml.closeTag(COMMENT_KINETICS_TAG); 1134 } 1135 if (ucp.getPHDependence()!=null) { 1136 xml.openTag(COMMENT_PH_TAG); 1137 xml.print(ucp.getPHDependence()); 1138 xml.closeTag(COMMENT_PH_TAG); 1139 } 1140 if (ucp.getRedoxPotential()!=null) { 1141 xml.openTag(COMMENT_REDOX_TAG); 1142 xml.print(ucp.getRedoxPotential()); 1143 xml.closeTag(COMMENT_REDOX_TAG); 
1144 } 1145 if (ucp.getTemperatureDependence()!=null) { 1146 xml.openTag(COMMENT_TEMPERATURE_TAG); 1147 xml.print(ucp.getTemperatureDependence()); 1148 xml.closeTag(COMMENT_TEMPERATURE_TAG); 1149 } 1150 } 1151 // all other comments 1152 else { 1153 xml.openTag(TEXT_TAG); 1154 xml.print(ucp.getText()); 1155 xml.closeTag(TEXT_TAG); 1156 } 1157 1158 // finish comment up 1159 if (ucp.getNote()!=null) { 1160 xml.openTag(NOTE_TAG); 1161 xml.print(ucp.getNote()); 1162 xml.closeTag(NOTE_TAG); 1163 } 1164 1165 xml.closeTag(COMMENT_TAG); 1166 } 1167 } 1168 1169 // xrefs 1170 for (Iterator<RankedCrossRef> i = rs.getRankedCrossRefs().iterator(); i.hasNext(); ) { 1171 RankedCrossRef rcr = i.next(); 1172 CrossRef cr = rcr.getCrossRef(); 1173 1174 xml.openTag(DBXREF_TAG); 1175 String dbname = cr.getDbname(); 1176 xml.attribute(TYPE_ATTR,dbname); 1177 xml.attribute(ID_ATTR,cr.getAccession()); 1178 xml.attribute(KEY_ATTR,""+(key++)); 1179 if (!cr.getNoteSet().isEmpty()) { 1180 int acccount = 2; 1181 for (Iterator<Note> j = cr.getNoteSet().iterator(); j.hasNext(); ) { 1182 Note n = j.next(); 1183 if (n.getTerm().equals(Terms.getAdditionalAccessionTerm()) && !n.getValue().equals("-")) { 1184 xml.openTag(PROPERTY_TAG); 1185 String name = n.getTerm().getName(); 1186 if (acccount==2) { 1187 // SECONDARY IDENTIFIER 1188 if (dbname.equalsIgnoreCase("HIV") || 1189 dbname.equalsIgnoreCase("INTERPRO") || 1190 dbname.equalsIgnoreCase("PANTHER") || 1191 dbname.equalsIgnoreCase("PFAM") || 1192 dbname.equalsIgnoreCase("PIR") || 1193 dbname.equalsIgnoreCase("PRINTS") || 1194 dbname.equalsIgnoreCase("PRODOM") || 1195 dbname.equalsIgnoreCase("REBASE") || 1196 dbname.equalsIgnoreCase("SMART") || 1197 dbname.equalsIgnoreCase("TIGRFAMS")) { 1198 // the secondary identifier is the entry name. 
1199 name = "entry name"; 1200 } else if (dbname.equalsIgnoreCase("PDB")) { 1201 // the secondary identifier is the structure determination method, which is controlled vocabulary that currently includes: X-ray(for X-ray crystallography), NMR(for NMR spectroscopy), EM(for electron microscopy and cryo-electron diffraction), Fiber(for fiber diffraction), IR(for infrared spectroscopy), Model(for predicted models) and Neutron(for neutron diffraction). 1202 name = "structure determination method"; 1203 } else if (dbname.equalsIgnoreCase("DICTYBASE") || 1204 dbname.equalsIgnoreCase("ECOGENE") || 1205 dbname.equalsIgnoreCase("FLYBASE") || 1206 dbname.equalsIgnoreCase("HGNC") || 1207 dbname.equalsIgnoreCase("MGI") || 1208 dbname.equalsIgnoreCase("RGD") || 1209 dbname.equalsIgnoreCase("SGD") || 1210 dbname.equalsIgnoreCase("STYGENE") || 1211 dbname.equalsIgnoreCase("SUBTILIST") || 1212 dbname.equalsIgnoreCase("WORMBASE") || 1213 dbname.equalsIgnoreCase("ZFIN")) { 1214 // the secondary identifier is the gene designation. If the gene designation is not available, a dash('-') is used. 1215 name = "gene designation"; 1216 } else if (dbname.equalsIgnoreCase("GO")) { 1217 // the second identifier is a 1-letter abbreviation for one of the 3 ontology aspects, separated from the GO term by a column. If the term is longer than 46 characters, the first 43 characters are indicated followed by 3 dots('...'). The abbreviations for the 3 distinct aspects of the ontology are P(biological Process), F(molecular Function), and C(cellular Component). 1218 name = "term"; 1219 } else if (dbname.equalsIgnoreCase("HAMAP")) { 1220 // the secondary identifier indicates if a domain is 'atypical' and/or 'fused', otherwise the field is empty('-'). 1221 name = "domain"; 1222 } else if (dbname.equalsIgnoreCase("ECO2DBASE")) { 1223 // the secondary identifier is the latest release number or edition of the database that has been used to derive the cross-reference. 
1224 name = "release number"; 1225 } else if (dbname.equalsIgnoreCase("SWISS-2DPAGE") || 1226 dbname.equalsIgnoreCase("HSC-2DPAGE")) { 1227 // the secondary identifier is the species or tissue of origin. 1228 name = "organism name"; 1229 } else if (dbname.equalsIgnoreCase("ENSEMBL")) { 1230 // the secondary identifier is the species of origin. 1231 name = "organism name"; 1232 } else if (dbname.equalsIgnoreCase("PIRSF")) { 1233 // the secondary identifier is the protein family name. 1234 name = "protein family name"; 1235 } else if (dbname.equalsIgnoreCase("AARHUS") || 1236 dbname.equalsIgnoreCase("GHENT-2DPAGE")) { 1237 // the secondary identifier is either 'IEF' (for isoelectric focusing) or 'NEPHGE' (for non-equilibrium pH gradient electrophoresis). 1238 name = "secondary identifier"; 1239 } else if (dbname.equalsIgnoreCase("WORMPEP")) { 1240 // the secondary identifier is a number attributed by the C.elegans genome-sequencing project to that protein. 1241 name = "C.elegans number"; 1242 } else if (dbname.equalsIgnoreCase("AGD") || 1243 dbname.equalsIgnoreCase("ANU-2DPAGE") || 1244 dbname.equalsIgnoreCase("COMPLUYEAST-2DPAGE") || 1245 dbname.equalsIgnoreCase("ECHOBASE") || 1246 dbname.equalsIgnoreCase("GENEDB_SPOMBE") || 1247 dbname.equalsIgnoreCase("GERMONLINE") || 1248 dbname.equalsIgnoreCase("GLYCOSUITEDB") || 1249 dbname.equalsIgnoreCase("GRAMENE") || 1250 dbname.equalsIgnoreCase("H-INVDB") || 1251 dbname.equalsIgnoreCase("INTACT") || 1252 dbname.equalsIgnoreCase("LEGIOLIST") || 1253 dbname.equalsIgnoreCase("LEPROMA") || 1254 dbname.equalsIgnoreCase("LISTILIST") || 1255 dbname.equalsIgnoreCase("MAIZEDB") || 1256 dbname.equalsIgnoreCase("MEROPS") || 1257 dbname.equalsIgnoreCase("MIM") || 1258 dbname.equalsIgnoreCase("MYPULIST") || 1259 dbname.equalsIgnoreCase("OGP") || 1260 dbname.equalsIgnoreCase("PHCI-2DPAGE") || 1261 dbname.equalsIgnoreCase("PHOSSITE") || 1262 dbname.equalsIgnoreCase("PHOTOLIST") || 1263 dbname.equalsIgnoreCase("PMMA-2DPAGE") || 1264 
dbname.equalsIgnoreCase("RAT-HEART-2DPAGE") || 1265 dbname.equalsIgnoreCase("REACTOME") || 1266 dbname.equalsIgnoreCase("SAGALIST") || 1267 dbname.equalsIgnoreCase("SIENA-2DPAGE") || 1268 dbname.equalsIgnoreCase("TAIR") || 1269 dbname.equalsIgnoreCase("TIGR") || 1270 dbname.equalsIgnoreCase("TRANSFAC") || 1271 dbname.equalsIgnoreCase("TUBERCULIST")) { 1272 // the secondary identifier is not used and a dash('-') is stored in that field. 1273 // should never get here - I hope! 1274 } else if (dbname.equalsIgnoreCase("HSSP")) { 1275 // the secondary identifier is the entry name of the PDB structure related to that of the entry in which the HSSP cross-reference is present. 1276 name = "entry name"; 1277 } else if (dbname.equalsIgnoreCase("GENEFARM")) { 1278 // the secondary identifier is the gene family identifier. If the gene family identifier is not available, a dash('-') is used. 1279 name = "gene family"; 1280 } else if (dbname.equalsIgnoreCase("SMR")) { 1281 // the secondary identifier indicates the range(s) relevant to the structure model(s). 1282 name = "range"; 1283 } else if (dbname.equalsIgnoreCase("EMBL") || 1284 dbname.equalsIgnoreCase("DDBJ") || 1285 dbname.equalsIgnoreCase("GENBANK")) { 1286 // PROTEIN_ID; STATUS_IDENTIFIER; MOLECULE_TYPE 1287 name = "protein id"; 1288 } else if (dbname.equalsIgnoreCase("PROSITE")) { 1289 // ENTRY_NAME; STATUS. 1290 name = "entry name"; 1291 } 1292 } else if (acccount==3) { 1293 // TERTIARY IDENTIFIER 1294 if (dbname.equalsIgnoreCase("HAMAP") || 1295 dbname.equalsIgnoreCase("PANTHER") || 1296 dbname.equalsIgnoreCase("PFAM") || 1297 dbname.equalsIgnoreCase("PIRSF") || 1298 dbname.equalsIgnoreCase("PRODOM") || 1299 dbname.equalsIgnoreCase("SMART") || 1300 dbname.equalsIgnoreCase("TIGRFAMS")) { 1301 // the tertiary identifier is the number of hits found in the sequence. 1302 name = "number of hits"; 1303 } else if (dbname.equalsIgnoreCase("GO")) { 1304 // the tertiary identifier is a 3-character GO evidence code. 
The meaning of the evidence codes is: IDA=inferred from direct assay, IMP=inferred from mutant phenotype, IGI=inferred from genetic interaction, IPI=inferred from physical interaction, IEP=inferred from expression pattern, TAS=traceable author statement, NAS=non-traceable author statement, IC=inferred by curator, ISS=inferred from sequence or structural similarity. 1305 name = "evidence"; 1306 } else if (dbname.equalsIgnoreCase("PDB")) { 1307 // the tertiary identifier indicates the chain(s) and the corresponding range, of which the structure has been determined. If the range is unknown, a dash is given rather than the range positions(e.g. 'A/B=-.'), if the chains and the range is unknown, a dash is used. 1308 name = "chains"; 1309 } else if (dbname.equalsIgnoreCase("EMBL") || 1310 dbname.equalsIgnoreCase("DDBJ") || 1311 dbname.equalsIgnoreCase("GENBANK")) { 1312 // PROTEIN_ID; STATUS_IDENTIFIER; MOLECULE_TYPE 1313 name = "status identifier"; 1314 } else if (dbname.equalsIgnoreCase("PROSITE")) { 1315 // ENTRY_NAME; STATUS. 
1316 name = "status"; 1317 } 1318 } else { 1319 // QUATERNARY AND ADDITIONAL 1320 if (dbname.equalsIgnoreCase("EMBL") || 1321 dbname.equalsIgnoreCase("DDBJ") || 1322 dbname.equalsIgnoreCase("GENBANK")) { 1323 // PROTEIN_ID; STATUS_IDENTIFIER; MOLECULE_TYPE 1324 name = "molecule type"; 1325 } 1326 } 1327 xml.attribute(TYPE_ATTR,name); 1328 xml.attribute(VALUE_ATTR,n.getValue()); 1329 xml.closeTag(PROPERTY_TAG); 1330 acccount++; 1331 } 1332 } 1333 } 1334 xml.closeTag(DBXREF_TAG); 1335 } 1336 1337 // protein exists 1338 xml.openTag(PROTEIN_EXISTS_TAG); 1339 xml.attribute(TYPE_ATTR,proteinExists); 1340 xml.closeTag(PROTEIN_EXISTS_TAG); 1341 1342 // keywords 1343 for (Iterator j = kws.iterator(); j.hasNext(); ) { 1344 ComparableTerm t = (ComparableTerm)j.next(); 1345 xml.openTag(KEYWORD_TAG); 1346 xml.attribute(ID_ATTR,t.getIdentifier()); 1347 xml.print(t.getName()); 1348 xml.closeTag(KEYWORD_TAG); 1349 } 1350 1351 // features 1352 for (Iterator i = rs.getFeatureSet().iterator(); i.hasNext(); ) { 1353 RichFeature f = (RichFeature)i.next(); 1354 String descr = null; 1355 String ftid = null; 1356 String ref = null; 1357 String status = null; 1358 String original = null; 1359 String locseq = null; 1360 List variation = new ArrayList(); 1361 for (Iterator<Note> j = f.getNoteSet().iterator(); j.hasNext(); ) { 1362 Note n = j.next(); 1363 if (n.getTerm().equals(Terms.getFTIdTerm())) ftid = n.getValue(); 1364 else if (n.getTerm().equals(Terms.getFeatureDescTerm())) descr = n.getValue(); 1365 else if (n.getTerm().equals(Terms.getFeatureStatusTerm())) status = n.getValue(); 1366 else if (n.getTerm().equals(Terms.getFeatureRefTerm())) ref = n.getValue(); 1367 else if (n.getTerm().equals(Terms.getFeatureOriginalTerm())) original = n.getValue(); 1368 else if (n.getTerm().equals(Terms.getFeatureVariationTerm())) variation.add(n.getValue()); 1369 else if (n.getTerm().equals(Terms.getLocationSequenceTerm())) locseq = n.getValue(); 1370 } 1371 1372 xml.openTag(FEATURE_TAG); 1373 1374 
xml.attribute(TYPE_ATTR,f.getTypeTerm().getName()); // TODO : need to translate from UniProt flatfile format names? 1375 if (ftid!=null) xml.attribute(ID_ATTR,ftid); 1376 if (descr!=null) xml.attribute(FEATURE_DESC_ATTR,descr); 1377 if (ref!=null) xml.attribute(REF_ATTR,ref); 1378 if (status!=null) xml.attribute(STATUS_ATTR,status); 1379 if (original!=null) { 1380 xml.openTag(FEATURE_ORIGINAL_TAG); 1381 xml.print(original.trim()); 1382 xml.closeTag(FEATURE_ORIGINAL_TAG); 1383 } 1384 for (Iterator j = variation.iterator(); j.hasNext(); ) { 1385 xml.openTag(FEATURE_VARIATION_TAG); 1386 xml.print(((String)j.next()).trim()); 1387 xml.closeTag(FEATURE_VARIATION_TAG); 1388 } 1389 1390 xml.openTag(LOCATION_TAG); 1391 if (locseq!=null) xml.attribute(LOCATION_SEQ_ATTR,locseq.trim()); 1392 RichLocation rl = (RichLocation)f.getLocation(); 1393 if (rl.getMinPosition().equals(rl.getMaxPosition())) { 1394 // point position 1395 xml.openTag(LOCATION_POSITION_TAG); 1396 if (rl.getMinPosition().getFuzzyStart() || rl.getMaxPosition().getFuzzyStart()) xml.attribute(STATUS_ATTR,"less than"); 1397 else if (rl.getMinPosition().getFuzzyEnd() || rl.getMaxPosition().getFuzzyEnd()) xml.attribute(STATUS_ATTR,"greater than"); 1398 xml.attribute(LOCATION_POSITION_ATTR,""+rl.getMin()); 1399 xml.closeTag(LOCATION_POSITION_TAG); 1400 } else { 1401 // range position 1402 // begin 1403 xml.openTag(LOCATION_BEGIN_TAG); 1404 Position begin = rl.getMinPosition(); 1405 if (begin.getFuzzyStart()) xml.attribute(STATUS_ATTR,"less than"); 1406 else if (begin.getFuzzyEnd()) xml.attribute(STATUS_ATTR,"greater than"); 1407 xml.attribute(LOCATION_POSITION_ATTR,""+begin.getStart()); 1408 xml.closeTag(LOCATION_BEGIN_TAG); 1409 // end 1410 xml.openTag(LOCATION_END_TAG); 1411 Position end = rl.getMaxPosition(); 1412 if (end.getFuzzyStart()) xml.attribute(STATUS_ATTR,"less than"); 1413 else if (end.getFuzzyEnd()) xml.attribute(STATUS_ATTR,"greater than"); 1414 xml.attribute(LOCATION_POSITION_ATTR,""+end.getEnd()); 
1415 xml.closeTag(LOCATION_END_TAG); 1416 } 1417 xml.closeTag(LOCATION_TAG); 1418 1419 xml.closeTag(FEATURE_TAG); 1420 } 1421 1422 // evidence 1423 for (Iterator i = evidenceIDs.iterator(); i.hasNext(); ) { 1424 Integer evidenceID = (Integer)i.next(); 1425 String cat = (String)evcats.get(evidenceID); 1426 String type = (String)evtypes.get(evidenceID); 1427 String date = (String)evdates.get(evidenceID); 1428 String attr = (String)evattrs.get(evidenceID); 1429 1430 xml.openTag(EVIDENCE_TAG); 1431 xml.attribute(KEY_ATTR,""+(key++)); 1432 xml.attribute(EVIDENCE_CATEGORY_ATTR,cat); 1433 xml.attribute(EVIDENCE_DATE_ATTR,date); 1434 xml.attribute(TYPE_ATTR,type); 1435 if (attr!=null) xml.attribute(EVIDENCE_ATTRIBUTE_ATTR,attr); 1436 xml.closeTag(EVIDENCE_TAG); 1437 } 1438 1439 // sequence 1440 int mw = 0; 1441 try { 1442 mw = (int)MassCalc.getMolecularWeight(rs); 1443 } catch (IllegalSymbolException e) { 1444 throw new RuntimeException("Found illegal symbol", e); 1445 } 1446 CRC64Checksum crc = new CRC64Checksum(); 1447 String seqstr = rs.seqString(); 1448 crc.update(seqstr.getBytes(),0,seqstr.length()); 1449 xml.openTag(SEQUENCE_TAG); 1450 xml.attribute(SEQUENCE_VERSION_ATTR,""+rs.getVersion()); 1451 xml.attribute(SEQUENCE_LENGTH_ATTR,""+rs.length()); 1452 xml.attribute(SEQUENCE_MASS_ATTR,""+mw); 1453 xml.attribute(SEQUENCE_CHECKSUM_ATTR,""+crc); 1454 xml.attribute(SEQUENCE_MODIFIED_ATTR,(udat==null?cdat:udat)); // sequence update 1455 String[] lines = StringTools.wordWrap(rs.seqString(), "\\s+", this.getLineWidth()); 1456 for (int i = 0; i < lines.length; i ++) xml.println(lines[i]); 1457 xml.closeTag(SEQUENCE_TAG); 1458 1459 // close entry 1460 xml.closeTag(ENTRY_TAG); 1461 1462 // copyright (if present) 1463 if (copyright!=null) { 1464 xml.openTag(COPYRIGHT_TAG); 1465 xml.println(copyright); 1466 xml.closeTag(COPYRIGHT_TAG); 1467 } 1468 1469 pw.flush(); 1470 } 1471 1472 /** 1473 * {@inheritDoc} 1474 */ 1475 public String getDefaultFormat() { 1476 return 
UNIPROTXML_FORMAT; 1477 } 1478 1479 // SAX event handler for parsing http://www.ebi.uniprot.org/support/docs/uniprot.xsd 1480 private class UniProtXMLHandler extends DefaultHandler { 1481 1482 private RichSequenceFormat parent; 1483 private SymbolTokenization symParser; 1484 private RichSeqIOListener rlistener; 1485 private Namespace ns; 1486 private StringBuffer m_currentString; 1487 1488 private NCBITaxon tax; 1489 private RichFeature.Template templ; 1490 private StringBuffer proteinDesc; 1491 private boolean firstNameInProteinGroup; 1492 private boolean firstDomainInProteinGroup; 1493 private boolean firstComponentInProteinGroup; 1494 private int currGene; 1495 private String geneNameClass; 1496 private String organismNameClass; 1497 private Map currNames = new TreeMap(); 1498 private StringBuffer organelleDesc; 1499 private List currDBXrefs = new ArrayList(); 1500 private List currComments = new ArrayList(); 1501 private String currRefLocation; 1502 private List currRefAuthors; 1503 private String currRefTitle; 1504 private int currRefStart; 1505 private int currRefEnd; 1506 private int currRefRank; 1507 private String currPersonIs; 1508 private int currRCID; 1509 private int currEvID; 1510 private String currKWID; 1511 private UniProtCommentParser currUCParser; 1512 private Interaction currUCParserInteract; 1513 private Event currUCParserEvent; 1514 private Isoform currUCParserIsoform; 1515 private String currLocIsFor; 1516 private String currTextIsFor; 1517 private String currNoteIsFor; 1518 private String currSeqIsFor; 1519 private String currIDIsFor; 1520 private String currNameIsFor; 1521 private int interactantCount; 1522 private StringBuffer currLocStr; 1523 private int featNoteRank; 1524 1525 // construct a new handler that will populate the given list of sequences 1526 private UniProtXMLHandler(RichSequenceFormat parent, 1527 SymbolTokenization symParser, 1528 RichSeqIOListener rlistener, 1529 Namespace ns) { 1530 this.parent = parent; 1531 
this.symParser = symParser; 1532 this.rlistener = rlistener; 1533 this.ns = ns; 1534 this.m_currentString = new StringBuffer(); 1535 } 1536 1537 1538 // process an opening tag 1539 @Override 1540 public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { 1541 1542 if (qName.equals(ENTRY_TAG)) { 1543 try { 1544 for (int i = 0; i < attributes.getLength(); i++) { 1545 String name = attributes.getQName(i); 1546 String val = attributes.getValue(i); 1547 if (name.equals(ENTRY_NAMESPACE_ATTR) && this.ns==null) ns=(Namespace)RichObjectFactory.getObject(SimpleNamespace.class,new Object[]{val}); 1548 else if (name.equals(ENTRY_VERSION_ATTR)) rlistener.addSequenceProperty(Terms.getRelAnnotatedTerm(), val); 1549 else if (name.equals(ENTRY_CREATED_ATTR)) rlistener.addSequenceProperty(Terms.getDateCreatedTerm(), val); 1550 else if (name.equals(ENTRY_UPDATED_ATTR)) rlistener.addSequenceProperty(Terms.getDateAnnotatedTerm(), val); 1551 } 1552 if (this.ns==null) ns=RichObjectFactory.getDefaultNamespace(); 1553 rlistener.setNamespace(ns); 1554 } catch (ParseException e) { 1555 throw new SAXException(e); 1556 } 1557 this.currNameIsFor = "ENTRY"; 1558 this.currSeqIsFor = "ENTRY"; 1559 this.currGene = 0; 1560 this.currNames.clear(); 1561 this.currRefRank = 0; 1562 this.currRCID = 0; 1563 this.currEvID = 0; 1564 } 1565 1566 else if (qName.equals(PROTEIN_TAG)) { 1567 for (int i = 0; i < attributes.getLength(); i++) { 1568 String name = attributes.getQName(i).trim(); 1569 String val = attributes.getValue(i).trim(); 1570 try { 1571 if (name.equals(TYPE_ATTR)) rlistener.addSequenceProperty(Terms.getProteinTypeTerm(),val); 1572 } catch (ParseException e) { 1573 throw new SAXException(e); 1574 } 1575 } 1576 this.proteinDesc = new StringBuffer(); 1577 this.currNameIsFor = "PROTEIN"; 1578 this.firstNameInProteinGroup = true; 1579 this.firstDomainInProteinGroup = true; 1580 this.firstComponentInProteinGroup = true; 1581 } else if 
(qName.equals(DOMAIN_TAG)) { 1582 if (!this.firstComponentInProteinGroup) proteinDesc.append("]"); 1583 if (this.firstDomainInProteinGroup) proteinDesc.append(" ["+Terms.CONTAINS_PREFIX); 1584 else proteinDesc.append(";"); 1585 this.firstDomainInProteinGroup = false; 1586 this.firstNameInProteinGroup = true; 1587 } else if (qName.equals(COMPONENT_TAG)) { 1588 if (!this.firstDomainInProteinGroup) proteinDesc.append("]"); 1589 if (this.firstComponentInProteinGroup) proteinDesc.append(" ["+Terms.INCLUDES_PREFIX); 1590 else proteinDesc.append(";"); 1591 this.firstComponentInProteinGroup = false; 1592 this.firstNameInProteinGroup = true; 1593 } 1594 1595 else if (qName.equals(GENE_TAG)) { 1596 this.currGene++; 1597 this.currNameIsFor="GENE"; 1598 } 1599 1600 else if (qName.equals(NAME_TAG)) { 1601 if (this.currNameIsFor.equals("GENE")) { 1602 for (int i = 0; i < attributes.getLength(); i++) { 1603 String name = attributes.getQName(i); 1604 String val = attributes.getValue(i); 1605 if (name.equals(TYPE_ATTR)) this.geneNameClass=val; 1606 } 1607 } 1608 1609 else if (this.currNameIsFor.equals("ORGANISM")) { 1610 for (int i = 0; i < attributes.getLength(); i++) { 1611 String name = attributes.getQName(i); 1612 String val = attributes.getValue(i); 1613 if (name.equals(TYPE_ATTR)) this.organismNameClass=val; 1614 } 1615 } 1616 } 1617 1618 else if (qName.equals(ORGANISM_TAG)) { 1619 this.currNameIsFor="ORGANISM"; 1620 } 1621 1622 else if (qName.equals(DBXREF_TAG)) { 1623 if (this.currNameIsFor.equals("ORGANISM")) { 1624 Integer taxID = null; 1625 for (int i = 0; i < attributes.getLength(); i++) { 1626 String name = attributes.getQName(i); 1627 String val = attributes.getValue(i); 1628 if (name.equals(ID_ATTR)) taxID = Integer.valueOf(val); 1629 } 1630 try { 1631 tax = (NCBITaxon)RichObjectFactory.getObject(SimpleNCBITaxon.class, new Object[]{taxID}); 1632 rlistener.setTaxon(tax); 1633 for (Iterator j = currNames.keySet().iterator(); j.hasNext(); ) { 1634 String nameClass = 
(String)j.next(); 1635 Set nameSet = (Set)this.currNames.get(nameClass); 1636 try { 1637 for (Iterator k = nameSet.iterator(); k.hasNext(); ) { 1638 String name = (String)k.next(); 1639 tax.addName(nameClass,name); 1640 } 1641 } catch (ChangeVetoException ce) { 1642 throw new ParseException(ce); 1643 } 1644 } 1645 } catch (ParseException e) { 1646 throw new SAXException(e); 1647 } 1648 this.currNames.clear(); 1649 } 1650 1651 else { 1652 String type = null; 1653 String id = null; 1654 for (int i = 0; i < attributes.getLength(); i++) { 1655 String name = attributes.getQName(i); 1656 String val = attributes.getValue(i); 1657 if (name.equals(ID_ATTR)) id = val; 1658 else if (name.equals(TYPE_ATTR)) type = val; 1659 } 1660 CrossRef dbx = (CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{type, id, new Integer(0)}); 1661 this.currDBXrefs.add(dbx); 1662 } 1663 } else if (qName.equals(PROPERTY_TAG)) { 1664 String id = null; 1665 for (int i = 0; i < attributes.getLength(); i++) { 1666 String name = attributes.getQName(i); 1667 String val = attributes.getValue(i); 1668 if (name.equals(VALUE_ATTR)) id = val; 1669 } 1670 Note note = new SimpleNote(Terms.getAdditionalAccessionTerm(),id,1); 1671 try { 1672 int last = this.currDBXrefs.size(); 1673 ((CrossRef)this.currDBXrefs.get(last-1)).getRichAnnotation().addNote(note); 1674 } catch (ChangeVetoException ce) { 1675 SAXException pe = new SAXException("Could not annotate identifier terms"); 1676 pe.initCause(ce); 1677 throw pe; 1678 } 1679 } 1680 1681 else if (qName.equals(GENELOCATION_TAG)) { 1682 this.currNameIsFor = "ORGANELLE"; 1683 this.organelleDesc = new StringBuffer(); 1684 for (int i = 0; i < attributes.getLength(); i++) { 1685 String name = attributes.getQName(i); 1686 String val = attributes.getValue(i); 1687 if (name.equals(TYPE_ATTR)) { 1688 val = val.toUpperCase().charAt(0)+val.substring(1); // init caps for flat format 1689 if (!val.equals("Plasmid")) this.organelleDesc.append(val); 1690 } 1691 
} 1692 } 1693 1694 else if (qName.equals(REFERENCE_TAG) && !this.parent.getElideReferences()) { 1695 this.currRefLocation = null; 1696 this.currRefAuthors = new ArrayList(); 1697 this.currRefTitle = null; 1698 this.currDBXrefs.clear(); 1699 this.currComments.clear(); 1700 this.currRefRank++; 1701 this.currRefStart = -999; 1702 this.currRefEnd = -999; 1703 } else if (qName.equals(CITATION_TAG) && !this.parent.getElideReferences()) { 1704 StringBuffer currRef = new StringBuffer(); 1705 for (int i = 0; i < attributes.getLength(); i++) { 1706 String name = attributes.getQName(i); 1707 String val = attributes.getValue(i); 1708 // combine everything except type into a fake reference to use if locator is a no-show 1709 if (!name.equals(TYPE_ATTR)) { 1710 if (currRef.length()>0) currRef.append(" "); 1711 currRef.append(val); 1712 } 1713 } 1714 this.currRefLocation = currRef.toString(); 1715 } else if (qName.equals(EDITOR_LIST_TAG)) { 1716 this.currPersonIs = "EDITOR"; 1717 } else if (qName.equals(AUTHOR_LIST_TAG)) { 1718 this.currPersonIs = "AUTHOR"; 1719 } else if (qName.equals(PERSON_TAG)) { 1720 for (int i = 0; i < attributes.getLength(); i++) { 1721 String name = attributes.getQName(i); 1722 String val = attributes.getValue(i); 1723 if (name.equals(NAME_ATTR)) { 1724 if (this.currPersonIs.equals("AUTHOR")) currRefAuthors.add(new SimpleDocRefAuthor(val, false, false)); 1725 else if (this.currPersonIs.equals("EDITOR")) currRefAuthors.add(new SimpleDocRefAuthor(val, false, true)); 1726 } 1727 } 1728 } else if (qName.equals(CONSORTIUM_TAG)) { 1729 for (int i = 0; i < attributes.getLength(); i++) { 1730 String name = attributes.getQName(i); 1731 String val = attributes.getValue(i); 1732 if (name.equals(NAME_ATTR)) { 1733 if (this.currPersonIs.equals("AUTHOR")) currRefAuthors.add(new SimpleDocRefAuthor(val, true, false)); 1734 else if (this.currPersonIs.equals("EDITOR")) currRefAuthors.add(new SimpleDocRefAuthor(val, true, true)); 1735 } 1736 } 1737 } else if 
(qName.equals(RC_LINE_TAG)) { 1738 this.currRCID++; 1739 } 1740 1741 else if (qName.equals(PROTEIN_EXISTS_TAG)) { 1742 try { 1743 for (int i = 0; i < attributes.getLength(); i++) { 1744 String name = attributes.getQName(i); 1745 String val = attributes.getValue(i); 1746 if (name.equals(TYPE_ATTR)) rlistener.addSequenceProperty(Terms.getProteinExistsTerm(),val); 1747 } 1748 } catch (ParseException e) { 1749 SAXException pe = new SAXException("Could not annotate protein exists terms"); 1750 pe.initCause(e); 1751 throw pe; 1752 } 1753 } 1754 1755 else if (qName.equals(KEYWORD_TAG)) { 1756 for (int i = 0; i < attributes.getLength(); i++) { 1757 String name = attributes.getQName(i); 1758 String val = attributes.getValue(i); 1759 if (name.equals(ID_ATTR)) this.currKWID = val; 1760 } 1761 } 1762 1763 else if (qName.equals(EVIDENCE_TAG)) { 1764 this.currEvID++; 1765 try { 1766 for (int i = 0; i < attributes.getLength(); i++) { 1767 String name = attributes.getQName(i); 1768 String val = attributes.getValue(i); 1769 if (name.equals(EVIDENCE_CATEGORY_ATTR)) rlistener.addSequenceProperty(Terms.getEvidenceCategoryTerm(),val); 1770 else if (name.equals(EVIDENCE_DATE_ATTR)) rlistener.addSequenceProperty(Terms.getEvidenceDateTerm(),val); 1771 else if (name.equals(TYPE_ATTR)) rlistener.addSequenceProperty(Terms.getEvidenceTypeTerm(),val); 1772 else if (name.equals(EVIDENCE_ATTRIBUTE_ATTR)) rlistener.addSequenceProperty(Terms.getEvidenceAttrTerm(),val); 1773 } 1774 } catch (ParseException e) { 1775 SAXException pe = new SAXException("Could not annotate evidence terms"); 1776 pe.initCause(e); 1777 throw pe; 1778 } 1779 } 1780 1781 else if (qName.equals(LOCATION_TAG)) { 1782 this.currLocStr = new StringBuffer(); 1783 if (this.currLocIsFor.equals("FEATURE")) { 1784 try { 1785 for (int i = 0; i < attributes.getLength(); i++) { 1786 String name = attributes.getQName(i); 1787 String val = attributes.getValue(i); 1788 if (name.equals(LOCATION_SEQ_ATTR)) { 1789 Note note = new 
SimpleNote(Terms.getLocationSequenceTerm(), val, this.featNoteRank++); 1790 ((RichAnnotation)templ.annotation).addNote(note); 1791 } 1792 } 1793 } catch (ChangeVetoException e) { 1794 SAXException pe = new SAXException("Could not create location terms"); 1795 pe.initCause(e); 1796 throw pe; 1797 } 1798 } 1799 } else if (qName.equals(LOCATION_BEGIN_TAG) || qName.equals(LOCATION_END_TAG) || qName.equals(LOCATION_POSITION_TAG)) { 1800 StringBuffer pos = new StringBuffer(); 1801 pos.append(" "); // space between start and end 1802 for (int i = 0; i < attributes.getLength(); i++) { 1803 String name = attributes.getQName(i); 1804 String val = attributes.getValue(i); 1805 if (name.equals(STATUS_ATTR)) { 1806 if (val.equals("less than")) pos.append("<"); 1807 else if (val.equals("greater than")) pos.append(">"); 1808 } else if (name.equals(LOCATION_POSITION_ATTR)) { 1809 pos.append(val); 1810 } 1811 } 1812 this.currLocStr.append(pos.toString()); 1813 if (qName.equals(LOCATION_POSITION_TAG)) currLocStr.append(pos.toString()); // fake it as begin=end 1814 } 1815 1816 else if (qName.equals(FEATURE_TAG) && !this.parent.getElideFeatures()) { 1817 this.featNoteRank = 1; 1818 templ = new RichFeature.Template(); 1819 templ.annotation = new SimpleRichAnnotation(); 1820 templ.sourceTerm = Terms.getUniProtXMLTerm(); 1821 templ.featureRelationshipSet = new TreeSet(); 1822 templ.rankedCrossRefs = new TreeSet(); 1823 try { 1824 for (int i = 0; i < attributes.getLength(); i++) { 1825 String name = attributes.getQName(i); 1826 String val = attributes.getValue(i); 1827 if (name.equals(TYPE_ATTR)) { 1828 templ.typeTerm = RichObjectFactory.getDefaultOntology().getOrCreateTerm(val); 1829 } else if (name.equals(ID_ATTR)) { 1830 Note note = new SimpleNote(Terms.getFTIdTerm(), val, this.featNoteRank++); 1831 ((RichAnnotation)templ.annotation).addNote(note); 1832 } else if (name.equals(FEATURE_DESC_ATTR)) { 1833 Note note = new SimpleNote(Terms.getFeatureDescTerm(), val, this.featNoteRank++); 
1834 ((RichAnnotation)templ.annotation).addNote(note); 1835 } else if (name.equals(STATUS_ATTR)) { 1836 Note note = new SimpleNote(Terms.getFeatureStatusTerm(), val, this.featNoteRank++); 1837 ((RichAnnotation)templ.annotation).addNote(note); 1838 } else if (name.equals(REF_ATTR)) { 1839 Note note = new SimpleNote(Terms.getFeatureRefTerm(), val, this.featNoteRank++); 1840 ((RichAnnotation)templ.annotation).addNote(note); 1841 } 1842 } 1843 } catch (ChangeVetoException e) { 1844 SAXException pe = new SAXException("Could not create location terms"); 1845 pe.initCause(e); 1846 throw pe; 1847 } 1848 this.currLocStr = new StringBuffer(); 1849 this.currLocIsFor = "FEATURE"; 1850 } 1851 1852 else if (qName.equals(COMMENT_TAG)) { 1853 this.currUCParser = new UniProtCommentParser(); 1854 this.currUCParser.setInteractions(new ArrayList()); 1855 this.currUCParser.setEvents(new ArrayList()); 1856 this.currUCParser.setIsoforms(new ArrayList()); 1857 this.currUCParser.setKMs(new ArrayList()); 1858 this.currUCParser.setVMaxes(new ArrayList()); 1859 for (int i = 0; i < attributes.getLength(); i++) { 1860 String name = attributes.getQName(i).trim(); 1861 String val = attributes.getValue(i).trim(); 1862 if (name.equals(TYPE_ATTR)) { 1863 String type = val.toUpperCase(); // easier to check this way, plus flat uniprot requires it 1864 if (type.equals("POSTTRANSLATIONAL MODIFICATION")) type="PTM"; 1865 else if (type.equals("ONLINE INFORMATION")) type="DATABASE"; 1866 currUCParser.setCommentType(type); 1867 } else if (name.equals(COMMENT_MASS_ATTR)) this.currUCParser.setMolecularWeight(Integer.parseInt(val)); 1868 else if (name.equals(COMMENT_ERROR_ATTR)) this.currUCParser.setMolWeightError(Integer.valueOf(val)); 1869 else if (name.equals(COMMENT_METHOD_ATTR)) this.currUCParser.setMolWeightMethod(val); 1870 else if (name.equals(NAME_ATTR)) this.currUCParser.setDatabaseName(val); 1871 } 1872 this.currLocIsFor="COMMENT"; 1873 this.currTextIsFor="COMMENT"; 1874 
this.currNoteIsFor="COMMENT"; 1875 this.interactantCount = 0; 1876 } else if (qName.equals(COMMENT_ABSORPTION_TAG)) { 1877 this.currTextIsFor="ABSORPTION"; 1878 } else if (qName.equals(COMMENT_KINETICS_TAG)) { 1879 this.currTextIsFor="KINETICS"; 1880 } else if (qName.equals(COMMENT_LINK_TAG)) { 1881 this.currTextIsFor="KINETICS"; 1882 for (int i = 0; i < attributes.getLength(); i++) { 1883 String name = attributes.getQName(i); 1884 String val = attributes.getValue(i); 1885 if (name.equals(COMMENT_LINK_URI_ATTR)) this.currUCParser.setUri(val); 1886 } 1887 } else if (qName.equals(COMMENT_EVENT_TAG)) { 1888 this.currUCParserEvent = new Event(); 1889 for (int i = 0; i < attributes.getLength(); i++) { 1890 String name = attributes.getQName(i); 1891 String val = attributes.getValue(i); 1892 if (name.equals(TYPE_ATTR)) { 1893 val = val.toUpperCase().charAt(0)+val.substring(1); // make first letter upper case for flat uniprot 1894 this.currUCParserEvent.setType(val); 1895 } 1896 } 1897 currUCParser.getEvents().add(currUCParserEvent); 1898 } else if (qName.equals(COMMENT_ISOFORM_TAG)) { 1899 this.currUCParserIsoform = new Isoform(); 1900 this.currUCParser.getIsoforms().add(currUCParserIsoform); 1901 this.currUCParserEvent.setNamedIsoforms(this.currUCParser.getIsoforms().size()); 1902 this.currNameIsFor="ISOFORM"; 1903 this.currNoteIsFor="ISOFORM"; 1904 this.currSeqIsFor="ISOFORM"; 1905 this.currIDIsFor="ISOFORM"; 1906 } else if (qName.equals(COMMENT_INTERACTANT_TAG)) { 1907 this.currIDIsFor="INTERACTION"; 1908 this.interactantCount++; 1909 for (int i = 0; i < attributes.getLength(); i++) { 1910 String name = attributes.getQName(i); 1911 String val = attributes.getValue(i); 1912 if (name.equals(COMMENT_INTERACT_INTACT_ATTR)) { 1913 if (this.interactantCount%2==1) { 1914 this.currUCParserInteract = new Interaction(); 1915 this.currUCParserInteract.setFirstIntActID(val); 1916 this.currUCParser.getInteractions().add(this.currUCParserInteract); 1917 } 1918 else 
this.currUCParserInteract.setSecondIntActID(val); 1919 } 1920 } 1921 } 1922 1923 else if (qName.equals(SEQUENCE_TAG)) { 1924 if (this.currSeqIsFor.equals("ENTRY")) { 1925 try { 1926 for (int i = 0; i < attributes.getLength(); i++) { 1927 String name = attributes.getQName(i); 1928 String val = attributes.getValue(i); 1929 if (name.equals(SEQUENCE_MODIFIED_ATTR)) { 1930 rlistener.addSequenceProperty(Terms.getDateUpdatedTerm(),val); 1931 } 1932 else if (name.equals(SEQUENCE_VERSION_ATTR)) 1933 rlistener.setVersion(Integer.parseInt(val)); 1934 } 1935 } catch (ParseException e) { 1936 SAXException pe = new SAXException("Could not set sequence properties"); 1937 pe.initCause(e); 1938 throw pe; 1939 } 1940 } 1941 1942 else if (this.currSeqIsFor.equals("ISOFORM")) { 1943 for (int i = 0; i < attributes.getLength(); i++) { 1944 String name = attributes.getQName(i); 1945 String val = attributes.getValue(i); 1946 if (name.equals(TYPE_ATTR)) { 1947 val = val.toUpperCase().charAt(0)+val.substring(1); // init caps for flat uniprot 1948 this.currUCParserIsoform.setSequenceType(val); 1949 } else if (name.equals(REF_ATTR)) { 1950 this.currUCParserIsoform.setSequenceRef(val); 1951 } 1952 } 1953 } 1954 } 1955 } 1956 1957 // process a closing tag - we will have read the text already 1958 @Override 1959 public void endElement(String uri, String localName, String qName) throws SAXException { 1960 String val = this.m_currentString.toString().trim(); 1961 1962 try { 1963 if (qName.equals(COPYRIGHT_TAG)) { 1964 rlistener.addSequenceProperty(Terms.getCopyrightTerm(),val); 1965 } 1966 1967 else if (qName.equals(ACCESSION_TAG)) { 1968 rlistener.setAccession(val); 1969 } else if (qName.equals(NAME_TAG)) { 1970 if (this.currNameIsFor.equals("ENTRY")) rlistener.setName(val); 1971 1972 else if (this.currNameIsFor.equals("PROTEIN")) { 1973 if (this.firstNameInProteinGroup) { 1974 proteinDesc.append(" "); 1975 proteinDesc.append(val); 1976 } else { 1977 proteinDesc.append(" ("); 1978 
proteinDesc.append(val); 1979 proteinDesc.append(")"); 1980 } 1981 this.firstNameInProteinGroup = false; 1982 } 1983 1984 else if (this.currNameIsFor.equals("GENE")) { 1985 if (this.geneNameClass.equals(Terms.GENENAME_KEY)) rlistener.addSequenceProperty(Terms.getGeneNameTerm(), this.currGene+":"+val); 1986 else if (this.geneNameClass.equals(Terms.GENESYNONYM_KEY)) rlistener.addSequenceProperty(Terms.getGeneSynonymTerm(), this.currGene+":"+val); 1987 else if (this.geneNameClass.equals(Terms.ORDLOCNAME_KEY)) rlistener.addSequenceProperty(Terms.getOrderedLocusNameTerm(), this.currGene+":"+val); 1988 else if (this.geneNameClass.equals(Terms.ORFNAME_KEY)) rlistener.addSequenceProperty(Terms.getORFNameTerm(), this.currGene+":"+val); 1989 } 1990 1991 else if (this.currNameIsFor.equals("ORGANISM")) { 1992 String ournameclass = NCBITaxon.COMMON; 1993 if (this.organismNameClass.equals(Terms.ABBREV_NAME_KEY)) ournameclass = NCBITaxon.ACRONYM; 1994 else if (this.organismNameClass.equals(Terms.FULL_NAME_KEY)) ournameclass = NCBITaxon.EQUIVALENT; 1995 else if (this.organismNameClass.equals(Terms.SCIENTIFIC_NAME_KEY)) ournameclass = NCBITaxon.SCIENTIFIC; 1996 else if (this.organismNameClass.equals(Terms.SYNONYM_NAME_KEY)) ournameclass = NCBITaxon.SYNONYM; 1997 if (!this.currNames.containsKey(ournameclass)) this.currNames.put(ournameclass,new TreeSet()); 1998 ((Set)this.currNames.get(ournameclass)).add(val); 1999 } 2000 2001 else if (this.currNameIsFor.equals("ORGANELLE")) { 2002 this.organelleDesc.append(", Plasmid "); 2003 this.organelleDesc.append(val); 2004 } 2005 2006 else if (this.currNameIsFor.equals("ISOFORM")) { 2007 this.currUCParserIsoform.getNames().add(val); 2008 } 2009 } 2010 2011 else if (qName.equals(PROTEIN_TAG)) { 2012 if (!this.firstDomainInProteinGroup || !this.firstComponentInProteinGroup) this.proteinDesc.append("]"); 2013 this.proteinDesc.append("."); 2014 rlistener.setDescription(this.proteinDesc.toString()); 2015 } 2016 2017 else if 
(qName.equals(ORGANISM_TAG)) { 2018 this.currNameIsFor=""; 2019 } 2020 2021 else if (qName.equals(GENELOCATION_TAG)) { 2022 String total = this.organelleDesc.toString().substring(3); // chomp leading ", " 2023 int lastComma = total.lastIndexOf(','); 2024 if (lastComma>-1) { 2025 this.organelleDesc.insert(lastComma+1," and"); 2026 total = this.organelleDesc.toString(); 2027 } 2028 rlistener.addSequenceProperty(Terms.getOrganelleTerm(), total); 2029 } 2030 2031 else if (qName.equals(RC_SPECIES_TAG)) { 2032 rlistener.addSequenceProperty(Terms.getSpeciesTerm(), this.currRCID+":"+val); 2033 } else if (qName.equals(RC_TISSUE_TAG)) { 2034 rlistener.addSequenceProperty(Terms.getTissueTerm(), this.currRCID+":"+val); 2035 } else if (qName.equals(RC_TRANSP_TAG)) { 2036 rlistener.addSequenceProperty(Terms.getTransposonTerm(), this.currRCID+":"+val); 2037 } else if (qName.equals(RC_PLASMID_TAG)) { 2038 rlistener.addSequenceProperty(Terms.getPlasmidTerm(), this.currRCID+":"+val); 2039 } 2040 2041 else if (qName.equals(TITLE_TAG)) { 2042 this.currRefTitle = val; 2043 } else if (qName.equals(LOCATOR_TAG)) { 2044 this.currRefLocation = val; 2045 } else if (qName.equals(RP_LINE_TAG)) { 2046 this.currComments.add(val); 2047 // Try to use it to find the location of the reference, if we have one. 
2048 Matcher m = rppat.matcher(val); 2049 if (m.matches()) { 2050 this.currRefStart = Integer.parseInt(m.group(1)); 2051 this.currRefEnd = Integer.parseInt(m.group(2)); 2052 } 2053 } else if (qName.equals(REFERENCE_TAG) && !this.parent.getElideReferences()) { 2054 // do the crossrefs 2055 CrossRef useForDocRef = null; 2056 for (Iterator j = this.currDBXrefs.iterator(); j.hasNext();) { 2057 CrossRef dbx = (CrossRef)j.next(); 2058 RankedCrossRef rdbx = new SimpleRankedCrossRef(dbx,0); 2059 rlistener.setRankedCrossRef(rdbx); 2060 if (useForDocRef==null) useForDocRef = dbx; 2061 else { 2062 // medline gets priority, then pubmed - if multiple, use last 2063 if (dbx.getDbname().equalsIgnoreCase(Terms.MEDLINE_KEY) || 2064 (dbx.getDbname().equalsIgnoreCase(Terms.PUBMED_KEY) && 2065 !useForDocRef.getDbname().equalsIgnoreCase(Terms.MEDLINE_KEY))) { 2066 useForDocRef = dbx; 2067 } 2068 } 2069 } 2070 // do the comment - can only be one in this object model 2071 String currRefRemark = null; 2072 if (currComments.size()>0) currRefRemark = (String)currComments.iterator().next(); 2073 // create the docref object 2074 try { 2075 DocRef dr = (DocRef)RichObjectFactory.getObject(SimpleDocRef.class,new Object[]{currRefAuthors,currRefLocation,currRefTitle}); 2076 // assign the pubmed or medline to the docref - medline gets priority 2077 if (useForDocRef!=null) dr.setCrossref(useForDocRef); 2078 // assign the remarks 2079 dr.setRemark(currRefRemark); 2080 // assign the docref to the bioentry 2081 RankedDocRef rdr = new SimpleRankedDocRef(dr, 2082 (currRefStart != -999 ? new Integer(currRefStart) : null), 2083 (currRefEnd != -999 ? 
new Integer(currRefEnd) : null), 2084 currRefRank); 2085 rlistener.setRankedDocRef(rdr); 2086 } catch (ChangeVetoException e) { 2087 throw new ParseException(e); 2088 } 2089 currDBXrefs.clear(); 2090 currComments.clear(); 2091 } 2092 2093 // keywords 2094 else if (qName.equals(KEYWORD_TAG)) { 2095 // create and persist term 2096 ComparableTerm t = Terms.getUniprotKWOnto().getOrCreateTerm(val); 2097 try { 2098 t.setIdentifier(currKWID); 2099 } catch (ChangeVetoException e) { 2100 throw new ParseException(e); 2101 } 2102 rlistener.addSequenceProperty(Terms.getKeywordTerm(), val); 2103 } 2104 2105 else if (qName.equals(LOCATION_TAG)) { 2106 if (currLocIsFor.equals("FEATURE")) { 2107 templ.location = UniProtLocationParser.parseLocation(currLocStr.toString()); 2108 } else if (currLocIsFor.equals("COMMENT")) { 2109 Location l = UniProtLocationParser.parseLocation(currLocStr.toString()); 2110 this.currUCParser.setMolWeightRangeStart(l.getMin()); 2111 this.currUCParser.setMolWeightRangeEnd(l.getMax()); 2112 } 2113 } 2114 2115 else if (qName.equals(FEATURE_TAG)) { 2116 // start the feature from the template we built 2117 rlistener.startFeature(templ); 2118 // end the feature 2119 rlistener.endFeature(); 2120 } else if (qName.equals(FEATURE_ORIGINAL_TAG)) { 2121 try { 2122 Note note = new SimpleNote(Terms.getFeatureOriginalTerm(), val, featNoteRank++); 2123 ((RichAnnotation)templ.annotation).addNote(note); 2124 } catch (ChangeVetoException e) { 2125 SAXException pe = new SAXException("Could not create location terms"); 2126 pe.initCause(e); 2127 throw pe; 2128 } 2129 } else if (qName.equals(FEATURE_VARIATION_TAG)) { 2130 try { 2131 Note note = new SimpleNote(Terms.getFeatureVariationTerm(), val, featNoteRank++); 2132 ((RichAnnotation)templ.annotation).addNote(note); 2133 } catch (ChangeVetoException e) { 2134 SAXException pe = new SAXException("Could not create location terms"); 2135 pe.initCause(e); 2136 throw pe; 2137 } 2138 } 2139 2140 else if (qName.equals(COMMENT_TAG)) 
{ 2141 rlistener.setComment(currUCParser.generate()); 2142 } else if (qName.equals(TEXT_TAG)) { 2143 if (this.currTextIsFor.equals("COMMENT")) currUCParser.setText(val); 2144 else if (this.currTextIsFor.equals("ABSORPTION")) currUCParser.setAbsorptionNote(val); 2145 else if (this.currTextIsFor.equals("KINETICS")) currUCParser.setKineticsNote(val); 2146 } else if (qName.equals(COMMENT_ABS_MAX_TAG)) { 2147 currUCParser.setAbsorptionMax(val); 2148 } else if (qName.equals(COMMENT_KIN_KM_TAG)) { 2149 currUCParser.getKMs().add(val); 2150 } else if (qName.equals(COMMENT_KIN_VMAX_TAG)) { 2151 currUCParser.getVMaxes().add(val); 2152 } else if (qName.equals(COMMENT_PH_TAG)) { 2153 currUCParser.setPHDependence(val); 2154 } else if (qName.equals(COMMENT_REDOX_TAG)) { 2155 currUCParser.setRedoxPotential(val); 2156 } else if (qName.equals(COMMENT_TEMPERATURE_TAG)) { 2157 currUCParser.setTemperatureDependence(val); 2158 } else if (qName.equals(COMMENT_ORGANISMS_TAG)) { 2159 if (val.equalsIgnoreCase("true")) currUCParserInteract.setOrganismsDiffer(true); 2160 else currUCParserInteract.setOrganismsDiffer(false); 2161 } else if (qName.equals(COMMENT_EXPERIMENTS_TAG)) { 2162 currUCParserInteract.setNumberExperiments(Integer.parseInt(val)); 2163 } else if (qName.equals(NOTE_TAG)) { 2164 if (currNoteIsFor.equals("COMMENT")) currUCParser.setNote(val); 2165 else if (currNoteIsFor.equals("ISOFORM")) currUCParser.setNote(val); 2166 } else if (qName.equals(COMMENT_EVENT_TAG)) { 2167 currUCParserEvent.setComment(val); 2168 } else if (qName.equals(COMMENT_ISOFORM_TAG)) { 2169 this.currSeqIsFor = "ENTRY"; 2170 this.currNoteIsFor = "COMMENT"; 2171 } else if (qName.equals(ID_TAG)) { 2172 if (currIDIsFor.equals("ISOFORM")) currUCParserIsoform.getIsoIDs().add(val); 2173 else if (currIDIsFor.equals("INTERACTION")) currUCParserInteract.setID(val); 2174 } else if (qName.equals(COMMENT_INTERACT_LABEL_TAG)) { 2175 currUCParserInteract.setLabel(val); 2176 } 2177 2178 else if (qName.equals(SEQUENCE_TAG)) 
{ 2179 if (this.currSeqIsFor.equals("ENTRY") && !this.parent.getElideSymbols()) { 2180 try { 2181 SymbolList sl = new SimpleSymbolList(symParser, 2182 val.replaceAll("\\s+","").replaceAll("[\\.|~]","-")); 2183 rlistener.addSymbols(symParser.getAlphabet(), 2184 (Symbol[])(sl.toList().toArray(new Symbol[0])), 2185 0, sl.length()); 2186 } catch (Exception e) { 2187 throw new ParseException(e); 2188 } 2189 } 2190 } 2191 2192 else if (qName.equals(ENTRY_TAG)) { 2193 // do the comments 2194 for (Iterator j = currComments.iterator(); j.hasNext();) { 2195 rlistener.setComment((String)j.next()); 2196 } 2197 // do the crossrefs 2198 for (Iterator j = currDBXrefs.iterator(); j.hasNext();) { 2199 CrossRef dbx = (CrossRef)j.next(); 2200 RankedCrossRef rdbx = new SimpleRankedCrossRef(dbx, 0); 2201 rlistener.setRankedCrossRef(rdbx); 2202 } 2203 // end the sequence 2204 currComments.clear(); 2205 currDBXrefs.clear(); 2206 } 2207 2208 } catch (ParseException e) { 2209 throw new SAXException(e); 2210 } 2211 2212 // drop old string 2213 this.m_currentString.setLength(0); 2214 } 2215 2216 // process text inside tags 2217 @Override 2218 public void characters(char[] ch, int start, int length) { 2219 this.m_currentString.append(ch, start, length); 2220 } 2221 } 2222} 2223