001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojavax.bio.seq.io; 023 024import java.io.BufferedInputStream; 025import java.io.BufferedReader; 026import java.io.File; 027import java.io.FileReader; 028import java.io.IOException; 029import java.io.InputStreamReader; 030import java.io.PrintStream; 031import java.io.PrintWriter; 032import java.util.ArrayList; 033import java.util.Iterator; 034import java.util.List; 035import java.util.Set; 036import java.util.TreeSet; 037import java.util.regex.Matcher; 038import java.util.regex.Pattern; 039 040import javax.xml.parsers.ParserConfigurationException; 041 042import org.biojava.bio.seq.Sequence; 043import org.biojava.bio.seq.io.ParseException; 044import org.biojava.bio.seq.io.SeqIOListener; 045import org.biojava.bio.seq.io.SymbolTokenization; 046import org.biojava.bio.symbol.IllegalSymbolException; 047import org.biojava.bio.symbol.SimpleSymbolList; 048import org.biojava.bio.symbol.Symbol; 049import org.biojava.bio.symbol.SymbolList; 050import org.biojava.ontology.Term; 051import org.biojava.utils.ChangeVetoException; 052import org.biojava.utils.xml.PrettyXMLWriter; 053import org.biojava.utils.xml.XMLWriter; 054import org.biojavax.Comment; 055import org.biojavax.CrossRef; 056import org.biojavax.DocRef; 057import org.biojavax.DocRefAuthor; 058import org.biojavax.Namespace; 059import org.biojavax.Note; 060import org.biojavax.RankedCrossRef; 061import org.biojavax.RankedDocRef; 062import org.biojavax.RichAnnotation; 063import org.biojavax.RichObjectFactory; 064import org.biojavax.SimpleCrossRef; 065import org.biojavax.SimpleDocRef; 066import org.biojavax.SimpleDocRefAuthor; 067import org.biojavax.SimpleNote; 068import org.biojavax.SimpleRankedCrossRef; 069import org.biojavax.SimpleRankedDocRef; 070import org.biojavax.SimpleRichAnnotation; 071import org.biojavax.bio.seq.Position; 072import org.biojavax.bio.seq.RichFeature; 073import org.biojavax.bio.seq.RichLocation; 074import org.biojavax.bio.seq.RichSequence; 075import org.biojavax.bio.seq.SimplePosition; 076import org.biojavax.bio.seq.SimpleRichLocation; 077import org.biojavax.bio.taxa.NCBITaxon; 078import org.biojavax.bio.taxa.SimpleNCBITaxon; 079import org.biojavax.ontology.ComparableTerm; 080import org.biojavax.utils.StringTools; 081import org.biojavax.utils.XMLTools; 082import org.xml.sax.Attributes; 083import org.xml.sax.SAXException; 084import org.xml.sax.helpers.DefaultHandler; 085 086/** 087 * Format reader for INSDseq files. This version of INSDseq format will generate 088 * and write RichSequence objects. Loosely Based on code from the old, deprecated, 089 * org.biojava.bio.seq.io.GenbankXmlFormat object. 090 * 091 * Understands http://www.insdc.org/files/documents/INSD_V1.4.dtd 092 * 093 * Does NOT understand the "sites" keyword in INSDReference_position. Interprets 094 * this instead as an empty location. This is because 095 * there is no obvious way of representing the "sites" keyword in BioSQL. 096 * 097 * Note also that the INSDInterval tags and associate stuff are not read, as 098 * this is duplicate information to the INSDFeature_location tag which is 099 * already fully parsed. However, they are written on output, although there is 100 * no guarantee that the INSDInterval tags will exactly match the 101 * INSDFeature_location tag as it is not possible to exactly reflect its contents 102 * using these. 103 * 104 * @author Alan Li (code based on his work) 105 * @author Richard Holland 106 * @author George Waldon 107 * @since 1.5 108 */ 109public class INSDseqFormat extends RichSequenceFormat.BasicFormat { 110 111 // Register this format with the format auto-guesser. 112 static { 113 RichSequence.IOTools.registerFormat(INSDseqFormat.class); 114 } 115 116 /** 117 * The name of this format 118 */ 119 public static final String INSDSEQ_FORMAT = "INSDseq"; 120 121 protected static final String INSDSEQS_GROUP_TAG = "INSDSet"; 122 protected static final String INSDSEQ_TAG = "INSDSeq"; 123 124 protected static final String LOCUS_TAG = "INSDSeq_locus"; 125 protected static final String LENGTH_TAG = "INSDSeq_length"; 126 protected static final String TOPOLOGY_TAG = "INSDSeq_topology"; 127 protected static final String STRANDED_TAG = "INSDSeq_strandedness"; 128 protected static final String MOLTYPE_TAG = "INSDSeq_moltype"; 129 protected static final String DIVISION_TAG = "INSDSeq_division"; 130 protected static final String UPDATE_DATE_TAG = "INSDSeq_update-date"; 131 protected static final String CREATE_DATE_TAG = "INSDSeq_create-date"; 132 protected static final String UPDATE_REL_TAG = "INSDSeq_update-release"; 133 protected static final String CREATE_REL_TAG = "INSDSeq_create-release"; 134 protected static final String DEFINITION_TAG = "INSDSeq_definition"; 135 protected static final String DATABASE_XREF_TAG = "INSDSeq_database-reference"; 136 protected static final String XREF_TAG = "INSDXref"; 137 138 protected static final String ACCESSION_TAG = "INSDSeq_primary-accession"; 139 protected static final String ACC_VERSION_TAG = "INSDSeq_accession-version"; 140 protected static final String SECONDARY_ACCESSIONS_GROUP_TAG = "INSDSeq_secondary-accessions"; 141 protected static final String SECONDARY_ACCESSION_TAG = "INSDSecondary-accn"; 142 protected static final String OTHER_SEQIDS_GROUP_TAG = "INSDSeq_other-seqids"; 143 protected static final String OTHER_SEQID_TAG = "INSDSeqid"; 144 145 protected static final String KEYWORDS_GROUP_TAG = "INSDSeq_keywords"; 146 protected static final String KEYWORD_TAG = "INSDKeyword"; 147 148 protected static final String SOURCE_TAG = "INSDSeq_source"; 149 protected static final String ORGANISM_TAG = "INSDSeq_organism"; 150 protected static final String TAXONOMY_TAG = "INSDSeq_taxonomy"; 151 152 protected static final String REFERENCES_GROUP_TAG = "INSDSeq_references"; 153 protected static final String REFERENCE_TAG = "INSDReference"; 154 protected static final String REFERENCE_LOCATION_TAG = "INSDReference_reference"; 155 protected static final String REFERENCE_POSITION_TAG = "INSDReference_position"; 156 protected static final String TITLE_TAG = "INSDReference_title"; 157 protected static final String JOURNAL_TAG = "INSDReference_journal"; 158 protected static final String PUBMED_TAG = "INSDReference_pubmed"; 159 protected static final String XREF_DBNAME_TAG = "INSDXref_dbname"; 160 protected static final String XREF_ID_TAG = "INSDXref_id"; 161 protected static final String REMARK_TAG = "INSDReference_remark"; 162 protected static final String AUTHORS_GROUP_TAG = "INSDReference_authors"; 163 protected static final String AUTHOR_TAG = "INSDAuthor"; 164 protected static final String CONSORTIUM_TAG = "INSDReference_consortium"; 165 166 protected static final String COMMENT_TAG = "INSDSeq_comment"; 167 168 protected static final String FEATURES_GROUP_TAG = "INSDSeq_feature-table"; 169 protected static final String FEATURE_TAG = "INSDFeature"; 170 protected static final String FEATURE_KEY_TAG = "INSDFeature_key"; 171 protected static final String FEATURE_LOC_TAG = "INSDFeature_location"; 172 protected static final String FEATURE_INTERVALS_GROUP_TAG = "INSDFeature_intervals"; 173 protected static final String FEATURE_INTERVAL_TAG = "INSDInterval"; 174 protected static final String FEATURE_FROM_TAG = "INSDInterval_from"; 175 protected static final String FEATURE_TO_TAG = "INSDInterval_to"; 176 protected static final String FEATURE_POINT_TAG = "INSDInterval_point"; 177 protected static final String FEATURE_ISCOMP_TAG = "INSDInterval_iscomp"; 178 protected static final String FEATURE_INTERBP_TAG = "INSDInterval_interbp"; 179 protected static final String FEATURE_ACCESSION_TAG = "INSDInterval_accession"; 180 protected static final String FEATURE_OPERATOR_TAG = "INSDFeature_operator"; 181 protected static final String FEATURE_PARTIAL5_TAG = "INSDFeature_partial5"; 182 protected static final String FEATURE_PARTIAL3_TAG = "INSDFeature_partial3"; 183 protected static final String FEATUREQUALS_GROUP_TAG = "INSDFeature_quals"; 184 protected static final String FEATUREQUAL_TAG = "INSDQualifier"; 185 protected static final String FEATUREQUAL_NAME_TAG = "INSDQualifier_name"; 186 protected static final String FEATUREQUAL_VALUE_TAG = "INSDQualifier_value"; 187 188 protected static final String SEQUENCE_TAG = "INSDSeq_sequence"; 189 protected static final String CONTIG_TAG = "INSDSeq_contig"; 190 191 // dbxref line 192 protected static final Pattern dbxp = Pattern.compile("^([^:]+):(\\S+)$"); 193 194 protected static final Pattern xmlSchema = Pattern.compile(".*http://www\\.ebi\\.ac\\.uk/dtd/INSD_INSDSeq\\.dtd.*"); 195 196 /** 197 * Implements some INSDseq-specific terms. 198 */ 199 public static class Terms extends RichSequence.Terms { 200 /** 201 * Getter for the INSDseq term 202 * @return The INSDseq Term 203 */ 204 public static ComparableTerm getOtherSeqIdTerm() { 205 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("OtherSeqID"); 206 } 207 208 /** 209 * Getter for the INSDseq term 210 * @return The INSDseq Term 211 */ 212 public static ComparableTerm getINSDseqTerm() { 213 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("INSDseq"); 214 } 215 } 216 217 /** 218 * {@inheritDoc} 219 * A file is in INSDseq format if the second XML line contains the phrase "http://www.ebi.ac.uk/dtd/INSD_INSDSeq.dtd". 220 */ 221 public boolean canRead(File file) throws IOException { 222 BufferedReader br = new BufferedReader(new FileReader(file)); 223 br.readLine(); // skip first line 224 String secondLine = br.readLine(); 225 boolean readable = secondLine!=null && xmlSchema.matcher(secondLine).matches(); // check on second line 226 br.close(); 227 return readable; 228 } 229 230 /** 231 * {@inheritDoc} 232 * Always returns a DNA tokenizer. 233 */ 234 public SymbolTokenization guessSymbolTokenization(File file) throws IOException { 235 return RichSequence.IOTools.getDNAParser(); 236 } 237 238 /** 239 * {@inheritDoc} 240 * A stream is in INSDseq format if the second XML line contains the phrase "http://www.ebi.ac.uk/dtd/INSD_INSDSeq.dtd". 241 */ 242 public boolean canRead(BufferedInputStream stream) throws IOException { 243 stream.mark(2000); // some streams may not support this 244 BufferedReader br = new BufferedReader(new InputStreamReader(stream)); 245 br.readLine(); // skip first line 246 String secondLine = br.readLine(); 247 boolean readable = secondLine!=null && xmlSchema.matcher(secondLine).matches(); // check on second line 248 // don't close the reader as it'll close the stream too. 249 // br.close(); 250 stream.reset(); 251 return readable; 252 } 253 254 /** 255 * {@inheritDoc} 256 * Always returns a DNA tokenizer. 257 */ 258 public SymbolTokenization guessSymbolTokenization(BufferedInputStream stream) throws IOException { 259 return RichSequence.IOTools.getDNAParser(); 260 } 261 262 /** 263 * {@inheritDoc} 264 */ 265 public boolean readSequence(BufferedReader reader, 266 SymbolTokenization symParser, 267 SeqIOListener listener) 268 throws IllegalSymbolException, IOException, ParseException { 269 if (!(listener instanceof RichSeqIOListener)) throw new IllegalArgumentException("Only accepting RichSeqIOListeners today"); 270 return this.readRichSequence(reader,symParser,(RichSeqIOListener)listener,null); 271 } 272 273 /** 274 * {@inheritDoc} 275 */ 276 public boolean readRichSequence(BufferedReader reader, 277 SymbolTokenization symParser, 278 RichSeqIOListener rlistener, 279 Namespace ns) 280 throws IllegalSymbolException, IOException, ParseException { 281 282 try { 283 DefaultHandler m_handler = new INSDseqHandler(this,symParser,rlistener,ns); 284 return XMLTools.readXMLChunk(reader, m_handler, INSDSEQ_TAG); 285 } catch (ParserConfigurationException e) { 286 throw new ParseException(e); 287 } catch (SAXException e) { 288 throw new ParseException(e); 289 } 290 } 291 292 private PrintWriter pw; 293 private XMLWriter xmlWriter; 294 295 private XMLWriter getXMLWriter() { 296 if(xmlWriter==null) { 297 // make an XML writer 298 pw = new PrintWriter(this.getPrintStream()); 299 xmlWriter = new PrettyXMLWriter(pw); 300 } 301 return xmlWriter; 302 } 303 304 /** 305 * {@inheritDoc} 306 */ 307 public void beginWriting() throws IOException { 308 XMLWriter xml = getXMLWriter(); 309 xml.printRaw("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"); 310 xml.printRaw("<!DOCTYPE INSDSeq PUBLIC \"-//EMBL-EBI//INSD INSDSeq/EN\" \"http://www.insdc.org/files/documents/INSD_V1.4.dtd\">"); 311 xml.openTag(INSDSEQS_GROUP_TAG); 312 } 313 314 /** 315 * {@inheritDoc} 316 */ 317 public void finishWriting() throws IOException { 318 XMLWriter xml = getXMLWriter(); 319 xml.closeTag(INSDSEQS_GROUP_TAG); 320 pw.flush(); 321 } 322 323 /** 324 * {@inheritDoc} 325 */ 326 public void writeSequence(Sequence seq, PrintStream os) throws IOException { 327 if (this.getPrintStream()==null) this.setPrintStream(this.getPrintStream()); 328 this.writeSequence(seq, RichObjectFactory.getDefaultNamespace()); 329 } 330 331 /** 332 * {@inheritDoc} 333 */ 334 public void writeSequence(Sequence seq, String format, PrintStream os) throws IOException { 335 if (this.getPrintStream()==null) this.setPrintStream(this.getPrintStream()); 336 if (!format.equals(this.getDefaultFormat())) throw new IllegalArgumentException("Unknown format: "+format); 337 this.writeSequence(seq, RichObjectFactory.getDefaultNamespace()); 338 } 339 340 /** 341 * {@inheritDoc} 342 * Namespace is ignored as INSDseq has no concept of it. 343 */ 344 public void writeSequence(Sequence seq, Namespace ns) throws IOException { 345 RichSequence rs; 346 try { 347 if (seq instanceof RichSequence) rs = (RichSequence)seq; 348 else rs = RichSequence.Tools.enrich(seq); 349 } catch (ChangeVetoException e) { 350 throw new IOException("Unable to enrich sequence", e); 351 } 352 353 Set<Note> notes = rs.getNoteSet(); 354 List accessions = new ArrayList(); 355 List otherSeqIDs = new ArrayList(); 356 List kws = new ArrayList(); 357 String stranded = null; 358 String udat = null; 359 String cdat = null; 360 String urel = null; 361 String crel = null; 362 String moltype = rs.getAlphabet().getName(); 363 for (Iterator<Note> i = notes.iterator(); i.hasNext();) { 364 Note n = i.next(); 365 if (n.getTerm().equals(Terms.getStrandedTerm())) stranded=n.getValue(); 366 else if (n.getTerm().equals(Terms.getDateUpdatedTerm())) udat=n.getValue(); 367 else if (n.getTerm().equals(Terms.getDateCreatedTerm())) cdat=n.getValue(); 368 else if (n.getTerm().equals(Terms.getRelUpdatedTerm())) urel=n.getValue(); 369 else if (n.getTerm().equals(Terms.getRelCreatedTerm())) crel=n.getValue(); 370 else if (n.getTerm().equals(Terms.getMolTypeTerm())) moltype=n.getValue(); 371 else if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) accessions.add(n.getValue()); 372 else if (n.getTerm().equals(Terms.getOtherSeqIdTerm())) otherSeqIDs.add(n.getValue()); 373 else if (n.getTerm().equals(Terms.getKeywordTerm())) kws.add(n.getValue()); 374 } 375 376 XMLWriter xml = getXMLWriter(); 377 xml.openTag(INSDSEQ_TAG); 378 379 xml.openTag(LOCUS_TAG); 380 xml.print(rs.getName()); 381 xml.closeTag(LOCUS_TAG); 382 383 xml.openTag(LENGTH_TAG); 384 xml.print(""+rs.length()); 385 xml.closeTag(LENGTH_TAG); 386 387 if (stranded!=null) { 388 xml.openTag(STRANDED_TAG); 389 xml.print(stranded); 390 xml.closeTag(STRANDED_TAG); 391 } 392 393 if (moltype!=null) { 394 xml.openTag(MOLTYPE_TAG); 395 xml.print(moltype); 396 xml.closeTag(MOLTYPE_TAG); 397 } 398 399 xml.openTag(TOPOLOGY_TAG); 400 if (rs.getCircular()) xml.print("circular"); 401 else xml.print("linear"); 402 xml.closeTag(TOPOLOGY_TAG); 403 404 if (rs.getDivision()!=null) { 405 xml.openTag(DIVISION_TAG); 406 xml.print(rs.getDivision()); 407 xml.closeTag(DIVISION_TAG); 408 } 409 410 xml.openTag(UPDATE_DATE_TAG); 411 xml.print(udat); 412 xml.closeTag(UPDATE_DATE_TAG); 413 414 if(cdat!=null) { 415 xml.openTag(CREATE_DATE_TAG); 416 xml.print(cdat); 417 xml.closeTag(CREATE_DATE_TAG); 418 } 419 420 if (urel!=null) { 421 xml.openTag(UPDATE_REL_TAG); 422 xml.print(urel); 423 xml.closeTag(UPDATE_REL_TAG); 424 } 425 426 if (crel!=null) { 427 xml.openTag(CREATE_REL_TAG); 428 xml.print(crel); 429 xml.closeTag(CREATE_REL_TAG); 430 } 431 432 if (rs.getDescription()!=null) { 433 xml.openTag(DEFINITION_TAG); 434 xml.print(rs.getDescription()); 435 xml.closeTag(DEFINITION_TAG); 436 } 437 438 xml.openTag(ACC_VERSION_TAG); 439 xml.print(rs.getAccession()+"."+rs.getVersion()); 440 xml.closeTag(ACC_VERSION_TAG); 441 442 if (!otherSeqIDs.isEmpty()) { 443 xml.openTag(OTHER_SEQIDS_GROUP_TAG); 444 for (Iterator i = otherSeqIDs.iterator(); i.hasNext(); ) { 445 446 xml.openTag(OTHER_SEQID_TAG); 447 xml.print((String)i.next()); 448 xml.closeTag(OTHER_SEQID_TAG); 449 450 } 451 xml.closeTag(OTHER_SEQIDS_GROUP_TAG); 452 } 453 454 if (!accessions.isEmpty()) { 455 xml.openTag(SECONDARY_ACCESSIONS_GROUP_TAG); 456 for (Iterator i = accessions.iterator(); i.hasNext(); ) { 457 458 xml.openTag(SECONDARY_ACCESSION_TAG); 459 xml.print((String)i.next()); 460 xml.closeTag(SECONDARY_ACCESSION_TAG); 461 462 } 463 xml.closeTag(SECONDARY_ACCESSIONS_GROUP_TAG); 464 } 465 466 if (!kws.isEmpty()) { 467 xml.openTag(KEYWORDS_GROUP_TAG); 468 for (Iterator i = kws.iterator(); i.hasNext(); ) { 469 xml.openTag(KEYWORD_TAG); 470 xml.print((String)i.next()); 471 xml.closeTag(KEYWORD_TAG); 472 } 473 xml.closeTag(KEYWORDS_GROUP_TAG); 474 } 475 476 NCBITaxon tax = rs.getTaxon(); 477 if (tax!=null) { 478 xml.openTag(SOURCE_TAG); 479 xml.print(tax.getDisplayName()); 480 xml.closeTag(SOURCE_TAG); 481 482 xml.openTag(ORGANISM_TAG); 483 xml.print(tax.getDisplayName().split("\\(")[0].trim()); 484 xml.closeTag(ORGANISM_TAG); 485 486 xml.openTag(TAXONOMY_TAG); 487 String h = tax.getNameHierarchy(); 488 xml.print(h.substring(0, h.length()-1)); // chomp dot 489 xml.closeTag(TAXONOMY_TAG); 490 } 491 492 // references - rank (bases x to y) 493 if (!rs.getRankedDocRefs().isEmpty()) { 494 xml.openTag(REFERENCES_GROUP_TAG); 495 for (Iterator<RankedDocRef> r = rs.getRankedDocRefs().iterator(); r.hasNext();) { 496 xml.openTag(REFERENCE_TAG); 497 498 RankedDocRef rdr = r.next(); 499 DocRef d = rdr.getDocumentReference(); 500 501 xml.openTag(REFERENCE_LOCATION_TAG); 502 xml.print(Integer.toString(rdr.getRank())); 503 xml.closeTag(REFERENCE_LOCATION_TAG); 504 505 RichLocation rdrl = rdr.getLocation(); 506 if(!rdrl.equals(RichLocation.EMPTY_LOCATION)) { 507 xml.openTag(REFERENCE_POSITION_TAG); 508 for (Iterator i = rdrl.blockIterator(); i.hasNext(); ) { 509 RichLocation l = (RichLocation)i.next(); 510 xml.print(l.getMin()+".."+l.getMax()); 511 if (i.hasNext()) xml.print("; "); 512 } 513 xml.closeTag(REFERENCE_POSITION_TAG); 514 } 515 516 xml.openTag(AUTHORS_GROUP_TAG); 517 List<DocRefAuthor> auths = d.getAuthorList(); 518 for (Iterator<DocRefAuthor> i = auths.iterator(); i.hasNext(); ) { 519 DocRefAuthor a = i.next(); 520 if (!a.isConsortium()) { 521 xml.openTag(AUTHOR_TAG); 522 xml.print(a.getName()); 523 xml.closeTag(AUTHOR_TAG); 524 i.remove(); 525 } 526 } 527 xml.closeTag(AUTHORS_GROUP_TAG); 528 if (!auths.isEmpty()) { // only consortia left in the set now 529 DocRefAuthor a = auths.iterator().next(); // take the first one only 530 xml.openTag(CONSORTIUM_TAG); 531 xml.print(a.getName()); 532 xml.closeTag(CONSORTIUM_TAG); 533 } 534 535 if (d.getTitle()!=null) { 536 xml.openTag(TITLE_TAG); 537 xml.print(d.getTitle()); 538 xml.closeTag(TITLE_TAG); 539 } 540 541 xml.openTag(JOURNAL_TAG); 542 xml.print(d.getLocation()); 543 xml.closeTag(JOURNAL_TAG); 544 545 CrossRef c = d.getCrossref(); 546 if (c!=null) { 547 if (c.getDbname().equals(Terms.PUBMED_KEY)) { 548 xml.openTag(PUBMED_TAG); 549 xml.print(c.getAccession()); 550 xml.closeTag(PUBMED_TAG); 551 } else { 552 xml.openTag(XREF_TAG); 553 xml.openTag(XREF_DBNAME_TAG); 554 xml.print(c.getDbname()); 555 xml.closeTag(XREF_DBNAME_TAG); 556 xml.openTag(XREF_ID_TAG); 557 xml.print(c.getAccession()); 558 xml.closeTag(XREF_ID_TAG); 559 xml.closeTag(XREF_TAG); 560 } 561 } 562 563 if (d.getRemark()!=null) { 564 xml.openTag(REMARK_TAG); 565 xml.print(d.getRemark()); 566 xml.closeTag(REMARK_TAG); 567 } 568 569 xml.closeTag(REFERENCE_TAG); 570 } 571 xml.closeTag(REFERENCES_GROUP_TAG); 572 } 573 574 if (!rs.getComments().isEmpty()) { 575 xml.openTag(COMMENT_TAG); 576 for (Iterator<Comment> i = rs.getComments().iterator(); i.hasNext(); ) xml.println(((Comment)i.next()).getComment()); 577 xml.closeTag(COMMENT_TAG); 578 } 579 580 581 // db references - only first one is output 582 if (!rs.getRankedCrossRefs().isEmpty()) { 583 Iterator<RankedCrossRef> r = rs.getRankedCrossRefs().iterator(); 584 RankedCrossRef rcr = r.next(); 585 CrossRef c = rcr.getCrossRef(); 586 Set<Note> noteset = c.getNoteSet(); 587 StringBuffer sb = new StringBuffer(); 588 sb.append(c.getDbname().toUpperCase()); 589 sb.append("; "); 590 sb.append(c.getAccession()); 591 boolean hasSecondary = false; 592 for (Iterator<Note> i = noteset.iterator(); i.hasNext(); ) { 593 Note n = i.next(); 594 if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) { 595 sb.append("; "); 596 sb.append(n.getValue()); 597 hasSecondary = true; 598 } 599 } 600 //create unnecessary event firing 601 //if (!hasSecondary) sb.append("; -"); 602 603 xml.openTag(DATABASE_XREF_TAG); 604 xml.print(sb.toString()); 605 xml.closeTag(DATABASE_XREF_TAG); 606 } 607 608 if (!rs.getFeatureSet().isEmpty()) { 609 xml.openTag(FEATURES_GROUP_TAG); 610 for (Iterator i = rs.getFeatureSet().iterator(); i.hasNext(); ) { 611 RichFeature f = (RichFeature)i.next(); 612 xml.openTag(FEATURE_TAG); 613 614 xml.openTag(FEATURE_KEY_TAG); 615 xml.print(f.getTypeTerm().getName()); 616 xml.closeTag(FEATURE_KEY_TAG); 617 618 xml.openTag(FEATURE_LOC_TAG); 619 xml.print(GenbankLocationParser.writeLocation((RichLocation)f.getLocation())); 620 xml.closeTag(FEATURE_LOC_TAG); 621 622 // New in 1.4 - duplicate the location as a 623 // tree of XML tags. 624 xml.openTag(FEATURE_INTERVALS_GROUP_TAG); 625 626 RichLocation loc = (RichLocation)f.getLocation(); 627 boolean first = true; 628 boolean partial5 = false; 629 boolean partial3 = false; 630 Term operator = loc.getTerm(); 631 for (Iterator j = loc.blockIterator(); j.hasNext(); ) { 632 xml.openTag(FEATURE_INTERVAL_TAG); 633 634 RichLocation rl = (RichLocation)j.next(); 635 if (rl.getMin()==rl.getMax()) { 636 xml.openTag(FEATURE_POINT_TAG); 637 xml.print(""+rl.getMin()); 638 xml.closeTag(FEATURE_POINT_TAG); 639 } else { 640 xml.openTag(FEATURE_FROM_TAG); 641 xml.print(""+rl.getMin()); 642 xml.closeTag(FEATURE_FROM_TAG); 643 xml.openTag(FEATURE_TO_TAG); 644 xml.print(""+rl.getMax()); 645 xml.closeTag(FEATURE_TO_TAG); 646 } 647 boolean iscomp = rl.getStrand().equals(RichLocation.Strand.NEGATIVE_STRAND); 648 boolean interbp = 649 (rl.getMinPosition().getType()!=null && rl.getMinPosition().getType().equals(Position.BETWEEN_BASES)) || 650 (rl.getMaxPosition().getType()!=null && rl.getMaxPosition().getType().equals(Position.BETWEEN_BASES)); 651 if (first && rl.getMinPosition().getFuzzyStart()) partial5 = true; 652 if (!j.hasNext() && rl.getMaxPosition().getFuzzyEnd()) partial3 = true; 653 first = false; 654 655 xml.openTag(FEATURE_ISCOMP_TAG); 656 xml.print(""+iscomp); 657 xml.closeTag(FEATURE_ISCOMP_TAG); 658 659 xml.openTag(FEATURE_INTERBP_TAG); 660 xml.print(""+interbp); 661 xml.closeTag(FEATURE_INTERBP_TAG); 662 663 xml.openTag(FEATURE_ACCESSION_TAG); 664 xml.print(((RichSequence)f.getSequence()).getAccession()); 665 xml.closeTag(FEATURE_ACCESSION_TAG); 666 667 xml.closeTag(FEATURE_INTERVAL_TAG); 668 } 669 670 if (operator!=null) { 671 xml.openTag(FEATURE_OPERATOR_TAG); 672 xml.print(operator.getName()); 673 xml.closeTag(FEATURE_OPERATOR_TAG); 674 } 675 676 xml.openTag(FEATURE_PARTIAL5_TAG); 677 xml.print(""+partial5); 678 xml.closeTag(FEATURE_PARTIAL5_TAG); 679 680 xml.openTag(FEATURE_PARTIAL3_TAG); 681 xml.print(""+partial3); 682 xml.closeTag(FEATURE_PARTIAL3_TAG); 683 684 xml.closeTag(FEATURE_INTERVALS_GROUP_TAG); 685 686 xml.openTag(FEATUREQUALS_GROUP_TAG); 687 688 for (Iterator<Note> j = f.getNoteSet().iterator(); j.hasNext();) { 689 Note n = j.next(); 690 xml.openTag(FEATUREQUAL_TAG); 691 692 xml.openTag(FEATUREQUAL_NAME_TAG); 693 xml.print(""+n.getTerm().getName()); 694 xml.closeTag(FEATUREQUAL_NAME_TAG); 695 696 xml.openTag(FEATUREQUAL_VALUE_TAG); 697 if (n.getValue()!=null && !n.getValue().equals("")) { 698 if (n.getTerm().getName().equalsIgnoreCase("translation")) { 699 String[] lines = StringTools.wordWrap(n.getValue(), "\\s+", this.getLineWidth()); 700 for (int k = 0; k < lines.length; k++) xml.println(lines[k]); 701 } else { 702 xml.print(n.getValue()); 703 } 704 } 705 xml.closeTag(FEATUREQUAL_VALUE_TAG); 706 707 xml.closeTag(FEATUREQUAL_TAG); 708 } 709 // add-in to source feature only organism and db_xref="taxon:xyz" where present 710 if (f.getType().equalsIgnoreCase("source") && tax!=null) { 711 xml.openTag(FEATUREQUAL_TAG); 712 713 xml.openTag(FEATUREQUAL_NAME_TAG); 714 xml.print("db_xref"); 715 xml.closeTag(FEATUREQUAL_NAME_TAG); 716 717 xml.openTag(FEATUREQUAL_VALUE_TAG); 718 xml.print("taxon:"+tax.getNCBITaxID()); 719 xml.closeTag(FEATUREQUAL_VALUE_TAG); 720 721 xml.closeTag(FEATUREQUAL_TAG); 722 723 String displayName = tax.getDisplayName(); 724 if (displayName.indexOf('(')>-1) displayName = displayName.substring(0, displayName.indexOf('(')).trim(); 725 726 xml.openTag(FEATUREQUAL_TAG); 727 728 xml.openTag(FEATUREQUAL_NAME_TAG); 729 xml.print("organism"); 730 xml.closeTag(FEATUREQUAL_NAME_TAG); 731 732 xml.openTag(FEATUREQUAL_VALUE_TAG); 733 xml.print(displayName); 734 xml.closeTag(FEATUREQUAL_VALUE_TAG); 735 736 xml.closeTag(FEATUREQUAL_TAG); 737 } 738 // add-in other dbxrefs where present 739 for (Iterator<RankedCrossRef> j = f.getRankedCrossRefs().iterator(); j.hasNext();) { 740 RankedCrossRef rcr = j.next(); 741 CrossRef cr = rcr.getCrossRef(); 742 xml.openTag(FEATUREQUAL_TAG); 743 744 xml.openTag(FEATUREQUAL_NAME_TAG); 745 xml.print("db_xref"); 746 xml.closeTag(FEATUREQUAL_NAME_TAG); 747 748 xml.openTag(FEATUREQUAL_VALUE_TAG); 749 xml.print(cr.getDbname()+":"+cr.getAccession()); 750 xml.closeTag(FEATUREQUAL_VALUE_TAG); 751 752 xml.closeTag(FEATUREQUAL_TAG); 753 } 754 xml.closeTag(FEATUREQUALS_GROUP_TAG); 755 756 xml.closeTag(FEATURE_TAG); 757 } 758 xml.closeTag(FEATURES_GROUP_TAG); 759 } 760 761 xml.openTag(SEQUENCE_TAG); 762 String[] lines = StringTools.wordWrap(rs.seqString(), "\\s+", this.getLineWidth()); 763 for (int i = 0; i < lines.length; i ++) xml.println(lines[i]); 764 xml.closeTag(SEQUENCE_TAG); 765 766 xml.closeTag(INSDSEQ_TAG); 767 768 pw.flush(); 769 } 770 771 /** 772 * {@inheritDoc} 773 */ 774 public String getDefaultFormat() { 775 return INSDSEQ_FORMAT; 776 } 777 778 // SAX event handler for parsing http://www.ebi.ac.uk/embl/Documentation/DTD/INSDSeq_v1.3.dtd.txt 779 private class INSDseqHandler extends DefaultHandler { 780 781 private RichSequenceFormat parent; 782 private SymbolTokenization symParser; 783 private RichSeqIOListener rlistener; 784 private Namespace ns; 785 private StringBuffer m_currentString; 786 787 private NCBITaxon tax; 788 private String organism; 789 private String accession; 790 private RichFeature.Template templ; 791 private String currFeatQual; 792 private String currRefLocation; 793 private List currRefAuthors; 794 private String currRefTitle; 795 private String currRefJournal; 796 private String currRefPubmed; 797 private String currRefRemark; 798 private String currRefPosition; 799 private String currRefXrefDBName; 800 private String currRefXrefID; 801 private List currRefXrefs; 802 private int rcrossrefCount; 803 804 // construct a new handler that will populate the given list of sequences 805 private INSDseqHandler(RichSequenceFormat parent, 806 SymbolTokenization symParser, 807 RichSeqIOListener rlistener, 808 Namespace ns) { 809 this.parent = parent; 810 this.symParser = symParser; 811 this.rlistener = rlistener; 812 this.ns = ns; 813 this.m_currentString = new StringBuffer(); 814 } 815 816 // process an opening tag 817 public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { 818 if (qName.equals(INSDSEQ_TAG)) { 819 try { 820 rlistener.startSequence(); 821 if (ns==null) ns=RichObjectFactory.getDefaultNamespace(); 822 rlistener.setNamespace(ns); 823 } catch (ParseException e) { 824 throw new SAXException(e); 825 } 826 } else if (qName.equals(REFERENCE_TAG) && !this.parent.getElideReferences()) { 827 currRefLocation = null; 828 currRefPosition = null; 829 currRefAuthors = new ArrayList(); 830 currRefTitle = null; 831 currRefJournal = null; 832 currRefPubmed = null; 833 currRefRemark = null; 834 currRefXrefs = new ArrayList(); 835 } else if (qName.equals(XREF_TAG) && !this.parent.getElideReferences()) { 836 currRefXrefDBName = null; 837 currRefXrefID = null; 838 } else if (qName.equals(FEATURE_TAG) && !this.parent.getElideFeatures()) { 839 templ = new RichFeature.Template(); 840 templ.annotation = new SimpleRichAnnotation(); 841 templ.sourceTerm = Terms.getINSDseqTerm(); 842 templ.featureRelationshipSet = new TreeSet(); 843 templ.rankedCrossRefs = new TreeSet(); 844 } 845 } 846 847 // process a closing tag - we will have read the text already 848 public void endElement(String uri, String localName, String qName) throws SAXException { 849 String val = this.m_currentString.toString().trim(); 850 851 try { 852 if (qName.equals(LOCUS_TAG)) 853 rlistener.setName(val); 854 else if (qName.equals(ACCESSION_TAG)) { 855 accession = val; 856 rlistener.setAccession(accession); 857 } else if (qName.equals(ACC_VERSION_TAG)) { 858 String parts[] = val.split("\\."); 859 accession = parts[0]; 860 rlistener.setAccession(accession); 861 if (parts.length>1) rlistener.setVersion(Integer.parseInt(parts[1])); 862 } else if (qName.equals(SECONDARY_ACCESSION_TAG)) { 863 rlistener.addSequenceProperty(Terms.getAdditionalAccessionTerm(),val); 864 } else if (qName.equals(OTHER_SEQID_TAG)) { 865 rlistener.addSequenceProperty(Terms.getOtherSeqIdTerm(),val); 866 } else if (qName.equals(DIVISION_TAG)) { 867 rlistener.setDivision(val); 868 } else if (qName.equals(MOLTYPE_TAG)) { 869 rlistener.addSequenceProperty(Terms.getMolTypeTerm(),val); 870 } else if (qName.equals(UPDATE_DATE_TAG)) { 871 rlistener.addSequenceProperty(Terms.getDateUpdatedTerm(),val); 872 } else if (qName.equals(UPDATE_REL_TAG)) { 873 rlistener.addSequenceProperty(Terms.getRelUpdatedTerm(),val); 874 } else if (qName.equals(CREATE_DATE_TAG)) { 875 rlistener.addSequenceProperty(Terms.getDateCreatedTerm(),val); 876 } else if (qName.equals(CREATE_REL_TAG)) { 877 rlistener.addSequenceProperty(Terms.getRelCreatedTerm(),val); 878 } else if (qName.equals(STRANDED_TAG)) { 879 rlistener.addSequenceProperty(Terms.getStrandedTerm(),val); 880 } else if (qName.equals(TOPOLOGY_TAG)) { 881 if ("circular".equals(val)) rlistener.setCircular(true); 882 } else if (qName.equals(DEFINITION_TAG)) { 883 rlistener.setDescription(val); 884 } else if (qName.equals(KEYWORD_TAG)) { 885 rlistener.addSequenceProperty(Terms.getKeywordTerm(), val); 886 } else if (qName.equals(COMMENT_TAG) && !this.parent.getElideComments()) { 887 rlistener.setComment(val); 888 } else if (qName.equals(DATABASE_XREF_TAG)) { 889 // database_identifier; primary_identifier; secondary_identifier.... 890 String[] parts = val.split(";"); 891 // construct a DBXREF out of the dbname part[0] and accession part[1] 892 CrossRef crossRef = (CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{parts[0].trim(),parts[1].trim(), new Integer(0)}); 893 // assign remaining bits of info as annotations 894 for (int j = 2; j < parts.length; j++) { 895 Note note = new SimpleNote(Terms.getAdditionalAccessionTerm(),parts[j].trim(),j-1); 896 try { 897 crossRef.getRichAnnotation().addNote(note); 898 } catch (ChangeVetoException ce) { 899 ParseException pe = new ParseException("Could not annotate identifier terms"); 900 pe.initCause(ce); 901 throw pe; 902 } 903 } 904 RankedCrossRef rcrossRef = new SimpleRankedCrossRef(crossRef, 0); 905 rlistener.setRankedCrossRef(rcrossRef); 906 } else if (qName.equals(SEQUENCE_TAG) && !this.parent.getElideSymbols()) { 907 try { 908 SymbolList sl = new SimpleSymbolList(symParser, 909 val.replaceAll("\\s+","").replaceAll("[\\.|~]","-")); 910 rlistener.addSymbols(symParser.getAlphabet(), 911 (Symbol[])(sl.toList().toArray(new Symbol[0])), 912 0, sl.length()); 913 } catch (Exception e) { 914 throw new ParseException(e); 915 } 916 } else if (qName.equals(CONTIG_TAG)) 917 throw new SAXException("Cannot handle contigs yet"); 918 else if (qName.equals(REFERENCE_LOCATION_TAG) && !this.parent.getElideReferences()) { 919 currRefLocation = val; 920 } else if (qName.equals(REFERENCE_POSITION_TAG) && !this.parent.getElideReferences()) { 921 currRefPosition = val; 922 } else if (qName.equals(AUTHOR_TAG) && !this.parent.getElideReferences()) { 923 currRefAuthors.add(new SimpleDocRefAuthor(val,false,false)); 924 } else if (qName.equals(CONSORTIUM_TAG) && !this.parent.getElideReferences()) { 925 currRefAuthors.add(new SimpleDocRefAuthor(val,true,false)); 926 } else if (qName.equals(TITLE_TAG) && !this.parent.getElideReferences()) { 927 currRefTitle = val; 928 } else if (qName.equals(JOURNAL_TAG) && !this.parent.getElideReferences()) { 929 currRefJournal = val; 930 } else if (qName.equals(XREF_DBNAME_TAG) && !this.parent.getElideReferences()) { 931 currRefXrefDBName = val; 932 } else if (qName.equals(XREF_ID_TAG) && !this.parent.getElideReferences()) { 933 currRefXrefID = val; 934 } else if (qName.equals(XREF_TAG) && !this.parent.getElideReferences()) { 935 CrossRef xr = (CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{ 936 currRefXrefDBName,currRefXrefID, new Integer(0)}); 937 currRefXrefs.add(xr); 938 } else if (qName.equals(PUBMED_TAG) && !this.parent.getElideReferences()) { 939 currRefPubmed = val; 940 } else if (qName.equals(REMARK_TAG) && !this.parent.getElideReferences() && !this.parent.getElideComments()) { 941 currRefRemark = val; 942 } else if (qName.equals(REFERENCE_TAG) && !this.parent.getElideReferences()) { 943 // create the crossref - medline gets priority, then pubmed, then doi 944 CrossRef dcr = null; 945 if (currRefPubmed!=null) { 946 dcr = (CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{Terms.PUBMED_KEY, currRefPubmed, new Integer(0)}); 947 } else { 948 CrossRef pubmed = null; 949 CrossRef doi = null; 950 CrossRef other = null; 951 for (int i = 0; i < currRefXrefs.size(); i++) { 952 CrossRef cr = (CrossRef)currRefXrefs.get(i); 953 if(cr.getDbname().equals("pubmed")) pubmed = cr; 954 else if(cr.getDbname().equals("doi")) doi = cr; 955 else other = cr; 956 } 957 if(pubmed!=null) dcr = pubmed; 958 else if(doi!=null) dcr = doi; 959 else dcr = other; 960 } 961 // create the docref object 962 try { 963 DocRef dr = (DocRef)RichObjectFactory.getObject(SimpleDocRef.class,new Object[]{currRefAuthors,currRefJournal,currRefTitle}); 964 // assign the crossref to the docref 965 if (dcr!=null) dr.setCrossref(dcr); 966 // assign the remarks 967 dr.setRemark(currRefRemark); 968 // assign the docref to the bioentry 969 if (currRefPosition!=null) { 970 // Use the actual location specified. 971 RichLocation loc; 972 if (currRefPosition.equals("") || currRefPosition.equals("sites")) loc = RichLocation.EMPTY_LOCATION; 973 else { 974 List members = new ArrayList(); 975 String[] parts = currRefPosition.split(";\\s+"); 976 for (int i = 0; i < parts.length; i++) { 977 String[] parts2 = parts[i].split("\\.\\."); 978 if (parts2.length>1) { 979 RichLocation newLoc = new SimpleRichLocation( 980 new SimplePosition(Integer.parseInt(parts2[0])), 981 new SimplePosition(Integer.parseInt(parts2[1])), 982 i); 983 members.add(newLoc); 984 } else { 985 RichLocation newLoc = new SimpleRichLocation( 986 new SimplePosition(Integer.parseInt(parts2[0])), i); 987 members.add(newLoc); 988 } 989 } 990 loc = RichLocation.Tools.construct(members); 991 } 992 RankedDocRef rdr = new SimpleRankedDocRef(dr,loc,0); //rank set in listener 993 rlistener.setRankedDocRef(rdr); 994 } else { 995 //by default location on first position, full span would be better 996 RankedDocRef rdr = new SimpleRankedDocRef(dr,new Integer(1),new Integer(1),0); 997 rlistener.setRankedDocRef(rdr); 998 } 999 } catch (ChangeVetoException e) { 1000 throw new ParseException(e); 1001 } 1002 } 1003 else if (qName.equals(FEATURE_KEY_TAG) && !this.parent.getElideFeatures()) { 1004 templ.typeTerm = RichObjectFactory.getDefaultOntology().getOrCreateTerm(val); 1005 } else if (qName.equals(FEATURE_LOC_TAG) && !this.parent.getElideFeatures()) { 1006 String tidyLocStr = val.replaceAll("\\s+",""); 1007 templ.location = GenbankLocationParser.parseLocation(ns, accession, tidyLocStr); 1008 rlistener.startFeature(templ); 1009 rcrossrefCount = 0; 1010 // We don't read the hierarchy of tags for location as they 1011 // should contain the same information. 1012 } else if (qName.equals(FEATUREQUAL_NAME_TAG) && !this.parent.getElideFeatures()) { 1013 if (currFeatQual!=null) { 1014 rlistener.addFeatureProperty(RichObjectFactory.getDefaultOntology().getOrCreateTerm(currFeatQual),null); 1015 } 1016 currFeatQual = val; 1017 } else if (qName.equals(FEATUREQUAL_VALUE_TAG) && !this.parent.getElideFeatures()) { 1018 if (currFeatQual.equalsIgnoreCase("db_xref")) { 1019 Matcher m = dbxp.matcher(val); 1020 if (m.matches()) { 1021 String dbname = m.group(1); 1022 String raccession = m.group(2); 1023 if (dbname.equalsIgnoreCase("taxon")) { 1024 // Set the Taxon instead of a dbxref 1025 tax = (NCBITaxon)RichObjectFactory.getObject(SimpleNCBITaxon.class, new Object[]{Integer.valueOf(raccession)}); 1026 rlistener.setTaxon(tax); 1027 try { 1028 if (organism!=null) tax.addName(NCBITaxon.SCIENTIFIC,organism); 1029 } catch (ChangeVetoException e) { 1030 throw new ParseException(e); 1031 } 1032 } else { 1033 try { 1034 CrossRef cr = (CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{dbname, raccession, new Integer(0)}); 1035 RankedCrossRef rcr = new SimpleRankedCrossRef(cr, ++rcrossrefCount); 1036 rlistener.getCurrentFeature().addRankedCrossRef(rcr); 1037 } catch (ChangeVetoException e) { 1038 throw new ParseException(e); 1039 } 1040 } 1041 } else { 1042 throw new ParseException("Bad dbxref found: "+val); 1043 } 1044 } else if (currFeatQual.equalsIgnoreCase("organism")) { 1045 try { 1046 organism = val; 1047 if (tax!=null) tax.addName(NCBITaxon.SCIENTIFIC,organism); 1048 } catch (ChangeVetoException e) { 1049 throw new ParseException(e); 1050 } 1051 } else { 1052 if (currFeatQual.equalsIgnoreCase("translation")) { 1053 // strip spaces from sequence 1054 val = val.replaceAll("\\s+",""); 1055 } 1056 rlistener.addFeatureProperty(RichObjectFactory.getDefaultOntology().getOrCreateTerm(currFeatQual),val); 1057 } 1058 currFeatQual = null; 1059 } else if (qName.equals(FEATURE_TAG) && !this.parent.getElideFeatures()) { 1060 rlistener.endFeature(); 1061 } 1062 1063 1064 else if (qName.equals(INSDSEQ_TAG)) 1065 rlistener.endSequence(); 1066 } catch (ParseException e) { 1067 throw new SAXException(e); 1068 } 1069 1070 // drop old string 1071 this.m_currentString.setLength(0); 1072 } 1073 1074 // process text inside tags 1075 public void characters(char[] ch, int start, int length) { 1076 this.m_currentString.append(ch, start, length); 1077 } 1078 } 1079} 1080