001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojavax.bio.seq; 023 024import java.io.BufferedInputStream; 025import java.io.BufferedReader; 026import java.io.File; 027import java.io.FileReader; 028import java.io.IOException; 029import java.io.InputStreamReader; 030import java.io.OutputStream; 031import java.util.ArrayList; 032import java.util.Iterator; 033import java.util.List; 034import java.util.Set; 035 036import java.util.TreeSet; 037import org.biojava.bio.BioError; 038import org.biojava.bio.BioException; 039import org.biojava.bio.seq.DNATools; 040import org.biojava.bio.seq.Feature; 041import org.biojava.bio.seq.NucleotideTools; 042import org.biojava.bio.seq.ProteinTools; 043import org.biojava.bio.seq.RNATools; 044import org.biojava.bio.seq.Sequence; 045import org.biojava.bio.seq.SequenceIterator; 046import org.biojava.bio.seq.StrandedFeature; 047import org.biojava.bio.seq.io.SymbolTokenization; 048import org.biojava.bio.symbol.Alphabet; 049import org.biojava.bio.symbol.AlphabetManager; 050import org.biojava.bio.symbol.SimpleSymbolList; 051import org.biojava.bio.symbol.SymbolList; 052import org.biojava.utils.ChangeType; 053import org.biojava.utils.ChangeVetoException; 054import org.biojavax.Namespace; 055import org.biojavax.Note; 056import org.biojavax.RichObjectFactory; 057import org.biojavax.SimpleNamespace; 058import org.biojavax.SimpleNote; 059import org.biojavax.bio.BioEntry; 060import org.biojavax.bio.seq.io.EMBLFormat; 061import org.biojavax.bio.seq.io.EMBLxmlFormat; 062import org.biojavax.bio.seq.io.FastaFormat; 063import org.biojavax.bio.seq.io.FastaHeader; 064import org.biojavax.bio.seq.io.GenbankFormat; 065import org.biojavax.bio.seq.io.HashedFastaIterator; 066import org.biojavax.bio.seq.io.INSDseqFormat; 067import org.biojavax.bio.seq.io.RichSequenceBuilderFactory; 068import org.biojavax.bio.seq.io.RichSequenceFormat; 069import org.biojavax.bio.seq.io.RichStreamReader; 070import org.biojavax.bio.seq.io.RichStreamWriter; 071import org.biojavax.bio.seq.io.UniProtFormat; 072import org.biojavax.bio.seq.io.UniProtXMLFormat; 073import org.biojavax.ontology.ComparableTerm; 074 075/** 076 * A rich sequence is a combination of a org.biojavax.bio.Bioentry and a 077 * Sequence. It inherits and merges the methods of both. The RichSequence is 078 * based on the BioSQL model and provides a richer array of methods to access 079 * information than Sequence does. Whenever possible RichSequence should be used 080 * in preference to Sequence. 081 * 082 * @author Mark Schreiber 083 * @author Richard Holland 084 * @author George Waldon 085 * @since 1.5 086 */ 087public interface RichSequence extends BioEntry, Sequence { 088 089 public static final ChangeType SYMLISTVERSION = new ChangeType( 090 "This sequences's symbollist version has changed", 091 "org.biojavax.bio.seq.RichSequence", "SYMLISTVERSION"); 092 093 public static final ChangeType CIRCULAR = new ChangeType( 094 "This sequences's circularity has changed", 095 "org.biojavax.bio.seq.RichSequence", "CIRCULAR"); 096 097 /** 098 * The version of the associated symbol list. Note the use of an object for 099 * the value means that it can be nulled. 100 * 101 * @return the version 102 */ 103 public Double getSeqVersion(); 104 105 /** 106 * Sets the version of the associated symbol list. Note the use of an object 107 * for the value means that it can be nulled. 108 * 109 * @param seqVersion 110 * the version to set. 111 * @throws ChangeVetoException 112 * if it doesn't want to change. 113 */ 114 public void setSeqVersion(Double seqVersion) throws ChangeVetoException; 115 116 /** 117 * The features for this sequence. 118 * 119 * @return a set of RichFeature objects. 120 */ 121 public Set<Feature> getFeatureSet(); 122 123 /** 124 * Sets the features of this sequence. Note that it is not checked to see if 125 * the features actually belong to this sequence, you'd best check that 126 * yourself and make changes using feature.setParent() if necessary. 127 * 128 * @param features 129 * the features to assign to this sequence, replacing all others. 130 * Must be a set of RichFeature objects. 131 * @throws ChangeVetoException 132 * if they could not be assigned. 133 */ 134 public void setFeatureSet(Set<Feature> features) throws ChangeVetoException; 135 136 /** 137 * Circularises the <code>Sequence</code>. The circular length can then be 138 * said to be the length of the sequence itself. 139 * 140 * @param circular 141 * set to true if you want it to be circular 142 * @throws ChangeVetoException 143 * if the change is blocked. Some implementations may choose not 144 * to support circularisation and should throw an exception 145 * here. Some implementations may only support this method for 146 * certain Alphabets. 147 */ 148 public void setCircular(boolean circular) throws ChangeVetoException; 149 150 /** 151 * Is the sequence circular? Circularity has implications for work with 152 * locations and any coordinate work eg symbolAt(int i). Classes that allow 153 * it should test this method when working with coordinates or locations / 154 * features. 155 * 156 * @return true if the this is circular else false. 157 */ 158 public boolean getCircular(); 159 160 /** 161 * A special function that returns the SymbolList that this RichSequence is 162 * based around. This should _not_ be the RichSequence object itself, as 163 * this function is used to perform actions on the symbol list without 164 * referring to the RichSequence object directly. 165 * 166 * @return the internal SymbolList of the RichSequence, NOT the RichSequence 167 * object itself. 168 */ 169 public SymbolList getInternalSymbolList(); 170 171 /** 172 * Stores a number of useful terms used across many sequence formats for 173 * consistency's sake. 174 */ 175 public static class Terms { 176 public static String SPECIES_KEY = "SPECIES"; 177 public static String STRAIN_KEY = "STRAIN"; 178 public static String TISSUE_KEY = "TISSUE"; 179 public static String TRANSPOSON_KEY = "TRANSPOSON"; 180 public static String PLASMID_KEY = "PLASMID"; 181 182 /** 183 * Holds a reference to the key that must be used to store PubMed 184 * references. 185 */ 186 public static final String PUBMED_KEY = "PUBMED"; 187 188 /** 189 * Holds a reference to the key that must be used to store Medline 190 * references. 191 */ 192 public static final String MEDLINE_KEY = "MEDLINE"; 193 194 /** 195 * Holds a reference to the key that must be used to store DOI 196 * references. 197 */ 198 public static final String DOI_KEY = "DOI"; 199 200 /** 201 * Getter for the secondary/tertiary/additional accession term 202 * 203 * @return A Term that represents the secondary accession tag 204 */ 205 public static ComparableTerm getAdditionalAccessionTerm() { 206 return RichObjectFactory.getDefaultOntology() 207 .getOrCreateTerm("acc"); 208 } 209 210 /** 211 * Getter for the keyword term 212 * 213 * @return a Term that represents the Keyword tag 214 */ 215 public static ComparableTerm getKeywordTerm() { 216 return RichObjectFactory.getDefaultOntology().getOrCreateTerm("kw"); 217 } 218 219 /** 220 * Getter for the date created term 221 * 222 * @return a Term 223 */ 224 public static ComparableTerm getDateCreatedTerm() { 225 return RichObjectFactory.getDefaultOntology().getOrCreateTerm( 226 "cdat"); 227 } 228 229 /** 230 * Getter for the date updated term 231 * 232 * @return a Term 233 */ 234 public static ComparableTerm getDateUpdatedTerm() { 235 return RichObjectFactory.getDefaultOntology().getOrCreateTerm( 236 "udat"); 237 } 238 239 /** 240 * Getter for the date annotated term 241 * 242 * @return a Term 243 */ 244 public static ComparableTerm getDateAnnotatedTerm() { 245 return RichObjectFactory.getDefaultOntology().getOrCreateTerm( 246 "adat"); 247 } 248 249 /** 250 * Getter for the release created term 251 * 252 * @return a Term 253 */ 254 public static ComparableTerm getRelCreatedTerm() { 255 return RichObjectFactory.getDefaultOntology().getOrCreateTerm( 256 "crel"); 257 } 258 259 /** 260 * Getter for the release updated term 261 * 262 * @return a Term 263 */ 264 public static ComparableTerm getRelUpdatedTerm() { 265 return RichObjectFactory.getDefaultOntology().getOrCreateTerm( 266 "urel"); 267 } 268 269 /** 270 * Getter for the release annotated term 271 * 272 * @return a Term 273 */ 274 public static ComparableTerm getRelAnnotatedTerm() { 275 return RichObjectFactory.getDefaultOntology().getOrCreateTerm( 276 "arel"); 277 } 278 279 /** 280 * getter for the MolType term 281 * 282 * @return a Term that represents the molecule type 283 */ 284 public static ComparableTerm getMolTypeTerm() { 285 return RichObjectFactory.getDefaultOntology().getOrCreateTerm( 286 "moltype"); 287 } 288 289 /** 290 * Getter for the Strand term; legal values are "single", "double", and 291 * "mixed". 292 * 293 * @return a Term that represents the Strand tag 294 */ 295 public static ComparableTerm getStrandedTerm() { 296 return RichObjectFactory.getDefaultOntology().getOrCreateTerm( 297 "stranded"); 298 } 299 300 /** 301 * Getter for the Organelle term 302 * 303 * @return a Term that represents the Organelle tag 304 */ 305 public static ComparableTerm getOrganelleTerm() { 306 return RichObjectFactory.getDefaultOntology().getOrCreateTerm( 307 "organelle"); 308 } 309 310 /** 311 * Getter for the GeneName term 312 * 313 * @return The GeneName Term 314 */ 315 public static ComparableTerm getGeneNameTerm() { 316 return RichObjectFactory.getDefaultOntology().getOrCreateTerm( 317 "gene_name"); 318 } 319 320 /** 321 * Getter for the GeneSynonym term 322 * 323 * @return The GeneSynonym Term 324 */ 325 public static ComparableTerm getGeneSynonymTerm() { 326 return RichObjectFactory.getDefaultOntology().getOrCreateTerm( 327 "gene_synonym"); 328 } 329 330 /** 331 * Getter for the OrderedLocusName term 332 * 333 * @return The OrderedLocusName Term 334 */ 335 public static ComparableTerm getOrderedLocusNameTerm() { 336 return RichObjectFactory.getDefaultOntology().getOrCreateTerm( 337 "gene_ordloc"); 338 } 339 340 /** 341 * Getter for the ORFName term 342 * 343 * @return The ORFName Term 344 */ 345 public static ComparableTerm getORFNameTerm() { 346 return RichObjectFactory.getDefaultOntology().getOrCreateTerm( 347 "gene_orf"); 348 } 349 350 /** 351 * Getter for the Strain term 352 * 353 * @return The Strain Term 354 */ 355 public static ComparableTerm getStrainTerm() { 356 return RichObjectFactory.getDefaultOntology().getOrCreateTerm( 357 "strain"); 358 } 359 360 /** 361 * Getter for the Species term 362 * 363 * @return The Species Term 364 */ 365 public static ComparableTerm getSpeciesTerm() { 366 return RichObjectFactory.getDefaultOntology().getOrCreateTerm( 367 "species"); 368 } 369 370 /** 371 * Getter for the Tissue term 372 * 373 * @return The Tissue Term 374 */ 375 public static ComparableTerm getTissueTerm() { 376 return RichObjectFactory.getDefaultOntology().getOrCreateTerm( 377 "tissue"); 378 } 379 380 /** 381 * Getter for the Transposon term 382 * 383 * @return The Transposon Term 384 */ 385 public static ComparableTerm getTransposonTerm() { 386 return RichObjectFactory.getDefaultOntology().getOrCreateTerm( 387 "transposon"); 388 } 389 390 /** 391 * Getter for the Plasmid term 392 * 393 * @return The plasmid Term 394 */ 395 public static ComparableTerm getPlasmidTerm() { 396 return RichObjectFactory.getDefaultOntology().getOrCreateTerm( 397 "plasmid"); 398 } 399 400 /** 401 * Getter for the DataClass term 402 * 403 * @return The DataClass Term 404 */ 405 public static ComparableTerm getDataClassTerm() { 406 return RichObjectFactory.getDefaultOntology().getOrCreateTerm( 407 "dataclass"); 408 } 409 410 /** 411 * Getter for the FTId term 412 * 413 * @return The FTId Term 414 */ 415 public static ComparableTerm getFTIdTerm() { 416 return RichObjectFactory.getDefaultOntology().getOrCreateTerm( 417 "feature_id"); 418 } 419 420 /** 421 * Getter for the FeatureDesc term 422 * 423 * @return The FeatureDesc Term 424 */ 425 public static ComparableTerm getFeatureDescTerm() { 426 return RichObjectFactory.getDefaultOntology().getOrCreateTerm( 427 "feature_desc"); 428 } 429 430 /** 431 * Getter for the copyright term 432 * 433 * @return The copyright Term 434 */ 435 public static ComparableTerm getCopyrightTerm() { 436 return RichObjectFactory.getDefaultOntology().getOrCreateTerm( 437 "copyright"); 438 } 439 } 440 441 /** 442 * Some useful tools for working with RichSequence objects. 443 * 444 * @since 1.5 445 */ 446 public static class Tools { 447 448 // because we are static we don't want any instances 449 private Tools() { 450 } 451 452 /** 453 * Create a new RichSequence in the default namespace. 454 * 455 * @param name 456 * The name for the sequence. Will also be used for the 457 * accession. 458 * @param seqString 459 * The sequence string 460 * @param alpha 461 * The <CODE>Alphabet</CODE> for the sequence 462 * @throws org.biojava.bio.BioException 463 * If the symbols in <CODE>seqString</CODE> are not valid in 464 * <CODE>alpha</CODE> 465 * @return A new <CODE>RichSequence</CODE>. All versions are 1 or 1.0 466 */ 467 public static RichSequence createRichSequence(String name, 468 String seqString, Alphabet alpha) throws BioException { 469 SymbolList syms = new SimpleSymbolList(alpha 470 .getTokenization("token"), seqString); 471 return createRichSequence(name, syms); 472 } 473 474 /** 475 * Create a new RichSequence in the specified namespace. 476 * 477 * @return A new <CODE>RichSequence</CODE>. All versions are 1 or 1.0 478 * @param namespace 479 * the namespace to create the sequence in. A singleton 480 * <CODE>Namespace</CODE> will be created or retrieved as 481 * appropriate. 482 * @param name 483 * The name for the sequence. Will also be used for the 484 * accession. 485 * @param seqString 486 * The sequence string 487 * @param alpha 488 * The <CODE>Alphabet</CODE> for the sequence 489 * @throws org.biojava.bio.BioException 490 * If the symbols in <CODE>seqString</CODE> are not valid in 491 * <CODE>alpha</CODE> 492 */ 493 public static RichSequence createRichSequence(String namespace, 494 String name, String seqString, Alphabet alpha) 495 throws BioException { 496 SymbolList syms = new SimpleSymbolList(alpha 497 .getTokenization("token"), seqString); 498 Namespace ns = (Namespace) RichObjectFactory.getObject( 499 SimpleNamespace.class, new Object[] { namespace }); 500 return createRichSequence(ns, name, syms); 501 } 502 503 /** 504 * Create a new RichSequence in the specified namespace. 505 * 506 * @return A new <CODE>RichSequence</CODE>. All versions are 1 or 1.0 507 * @param ns 508 * The namespace to create the sequence in. 509 * @param name 510 * The name for the sequence. Will also be used for the 511 * accession. 512 * @param seqString 513 * The sequence string 514 * @param alpha 515 * The <CODE>Alphabet</CODE> for the sequence 516 * @throws org.biojava.bio.BioException 517 * If the symbols in <CODE>seqString</CODE> are not valid in 518 * <CODE>alpha</CODE> 519 */ 520 public static RichSequence createRichSequence(Namespace ns, 521 String name, String seqString, Alphabet alpha) 522 throws BioException { 523 SymbolList syms = new SimpleSymbolList(alpha 524 .getTokenization("token"), seqString); 525 return createRichSequence(ns, name, syms); 526 } 527 528 /** 529 * Create a new RichSequence in the default namespace. 530 * 531 * @return A new <CODE>RichSequence</CODE>. All versions are 1 or 1.0 532 * @param syms 533 * The symbols to add to the sequence. 534 * @param name 535 * The name for the sequence. Will also be used for the 536 * accession. 537 */ 538 public static RichSequence createRichSequence(String name, 539 SymbolList syms) { 540 Namespace ns = RichObjectFactory.getDefaultNamespace(); 541 return createRichSequence(ns, name, syms); 542 } 543 544 /** 545 * Create a new RichSequence in the specified namespace. 546 * 547 * @return A new <CODE>RichSequence</CODE>. All versions are 1 or 1.0 548 * @param ns 549 * the namespace to create the sequence in. 550 * @param syms 551 * The symbols to add to the sequence. 552 * @param name 553 * The name for the sequence. Will also be used for the 554 * accession. 555 */ 556 public static RichSequence createRichSequence(Namespace ns, 557 String name, SymbolList syms) { 558 return new SimpleRichSequence(ns, name, name, 1, syms, new Double( 559 1.0)); 560 } 561 562 /** 563 * Boldly attempts to convert a <CODE>Sequence</CODE> into a 564 * <CODE>RichSequence</CODE>. <CODE>Sequence</CODE>s will be assigned to 565 * the default namespace. The accession will be assumed to be the name 566 * of the old sequence. The version of the sequence will be set to 0 and 567 * the seqversion set to 0.0. <CODE>Feature</CODE>s are converted to 568 * <CODE>RichFeature</CODE>s. The old <CODE>Annotation</CODE> bundle is 569 * converted to a <CODE>RichAnnotation</CODE> 570 * 571 * @param s 572 * The <CODE>Sequence</CODE> to enrich 573 * @throws ChangeVetoException 574 * if <CODE>s</CODE> is locked or the conversion fails. 575 * @return a new <CODE>RichSequence</CODE> 576 */ 577 public static RichSequence enrich(Sequence s) 578 throws ChangeVetoException { 579 if (s instanceof RichSequence) 580 return (RichSequence) s; 581 String name = s.getName(); 582 RichSequence rs = new SimpleRichSequence(RichObjectFactory 583 .getDefaultNamespace(), 584 name == null ? "UnknownName" : name, 585 name == null ? "UnknownAccession" : name, 0, s, new Double( 586 0.0)); 587 // Transfer features 588 for (Iterator i = s.features(); i.hasNext();) { 589 Feature f = (Feature) i.next(); 590 try { 591 rs.createFeature(f.makeTemplate()); 592 } catch (BioException e) { 593 throw new ChangeVetoException("They hates us!", e); 594 } 595 } 596 // Transfer annotations 597 for (Iterator<Object> i = s.getAnnotation().keys().iterator(); i.hasNext();) { 598 Object key = i.next(); 599 Object value = s.getAnnotation().getProperty(key); 600 rs.getAnnotation().setProperty(key, value); 601 } 602 return rs; 603 } 604 605 /** 606 * <p> 607 * Creates a new sequence from a subregion of another sequence. The 608 * sequence is not a view. The sequence can be given a new Namespace, 609 * Accession, Name, Identifier etc. or you can copy over the old values. 610 * For unique identification in databases we recommend you change at 611 * least the name and identifier. 612 * </p> 613 * <p> 614 * The new sequence will retain all features that are fully contained by 615 * the new subsequence, the note set (annotation), Taxon, and 616 * description, modified to reflect the subsequence as follows: 617 * 618 * <pre> 619 * seq.setDescription("subsequence (" + from + ":" + to + ") of " 620 * + s.getDescription()); 621 * </pre> 622 * 623 * No other properties are copied. 624 * 625 * @param newVersion 626 * the new version number 627 * @param seqVersion 628 * the new sequence version 629 * @param s 630 * the original <code>RichSequence</code>. 631 * @param from 632 * the 1st subsequence coordinate (inclusive) 633 * @param to 634 * the last subsequence coordinate (inclusive) 635 * @param newNamespace 636 * the new <code>Namespace</code> 637 * @param newName 638 * the new name 639 * @param newAccession 640 * the new accession number 641 * @param newIdentifier 642 * the new identifier 643 * @throws java.lang.IndexOutOfBoundsException 644 * if <CODE>from</CODE> or <CODE>to</CODE> lie outside of 645 * the bounds of <CODE>s</CODE>. 646 * @return A new <CODE>RichSequence</CODE> 647 */ 648 public static RichSequence subSequence(RichSequence s, int from, 649 int to, Namespace newNamespace, String newName, 650 String newAccession, String newIdentifier, int newVersion, 651 Double seqVersion) throws IndexOutOfBoundsException { 652 SymbolList symList = s.subList(from, to); 653 SimpleRichSequence seq = new SimpleRichSequence(newNamespace, 654 newName, newAccession, newVersion, symList, seqVersion); 655 RichLocation subLoc = new SimpleRichLocation(new SimplePosition( 656 from), new SimplePosition(to), 0); 657 RichLocation subLocComplement = new SimpleRichLocation( 658 new SimplePosition(from), new SimplePosition(to), 0, 659 RichLocation.Strand.NEGATIVE_STRAND); 660 try { 661 // copy features if appropriate 662 for (Iterator<Feature> i = s.features(); i.hasNext();) { 663 RichFeature f = (RichFeature) i.next(); 664 665 if (f.getStrand().equals(StrandedFeature.POSITIVE)) { 666 if (subLoc.contains(f.getLocation())) { 667 RichFeature.Template templ = (RichFeature.Template) f.makeTemplate(); 668 669 // change the location 670 Position min = new SimplePosition(templ.location.getMin() 671 - from + 1); 672 673 // System.out.println("getMin " + 674 // templ.location.getMin()); 675 676 Position max = new SimplePosition(templ.location.getMax() 677 - from + 1); 678 679 // System.out.println("getMax " + 680 // templ.location.getMax()); 681 682 templ.location = new SimpleRichLocation(min, max, 0); 683 seq.createFeature(templ); 684 } 685 } else { 686 if (subLocComplement.contains(f.getLocation())) { 687 RichFeature.Template templ = (RichFeature.Template) f.makeTemplate(); 688 689 // change the location 690 Position min = new SimplePosition(templ.location.getMin() 691 - from + 1); 692 693 // System.out.println("getMin " + 694 // templ.location.getMin()); 695 696 Position max = new SimplePosition(templ.location.getMax() 697 - from + 1); 698 699 // System.out.println("getMax " + 700 // templ.location.getMax()); 701 702 templ.location = new SimpleRichLocation(min, max, 703 0, RichLocation.Strand.NEGATIVE_STRAND); 704 seq.createFeature(templ); 705 } 706 } 707 708 } 709 710 // clone Notes 711 if (s.getNoteSet() != null) { 712 Set<Note> notes = s.getNoteSet(); 713 Iterator<Note> it = notes.iterator(); 714 Set ns = new TreeSet(); 715 while (it.hasNext()) { 716 Note note = it.next(); 717 ns.add(new SimpleNote( 718 note.getTerm(), 719 note.getValue(), 720 note.getRank())); 721 } 722 seq.setNoteSet(ns); 723 } 724 725 // copy other cruft 726 if (s.getTaxon() != null) { 727 seq.setTaxon(s.getTaxon()); 728 } 729 if (s.getDescription() != null) { 730 seq.setDescription("subsequence (" + from + ":" + to 731 + ") of " + s.getDescription()); 732 } 733 if (s.getDivision() != null) { 734 seq.setDivision(s.getDivision()); 735 } 736 } catch (ChangeVetoException ex) { 737 throw new BioError(ex); // something is rotten in Denmark! 738 } catch (BioException ex) { 739 throw new BioError(ex); // something is rotten in Denmark! 740 } 741 return seq; 742 } 743 } 744 745 /** 746 * A set of convenience methods for handling common file formats. 747 * 748 * @author Mark Schreiber 749 * @author Richard Holland 750 * @since 1.5 751 */ 752 public final class IOTools { 753 754 private static RichSequenceBuilderFactory factory = RichSequenceBuilderFactory.FACTORY; 755 756 // This can't be instantiated. 757 private IOTools() { 758 } 759 760 /** 761 * Register a new format with IOTools for auto-guessing. 762 * 763 * @param formatClass 764 * the <code>RichSequenceFormat</code> object to register. 765 */ 766 public static void registerFormat(Class formatClass) { 767 Object o; 768 try { 769 o = formatClass.newInstance(); 770 } catch (Exception e) { 771 throw new BioError(e); 772 } 773 if (!(o instanceof RichSequenceFormat)) 774 throw new BioError("Class " + formatClass 775 + " is not an implementation of RichSequenceFormat!"); 776 formatClasses.add(formatClass); 777 } 778 779 // Private reference to the formats we know about. 780 private static List<Class> formatClasses = new ArrayList<Class>(); 781 782 /** 783 * Guess which format a stream is then attempt to read it. 784 * 785 * @param stream 786 * the <code>BufferedInputStream</code> to attempt to read. 787 * @param seqFactory 788 * a factory used to build a <code>RichSequence</code> 789 * @param ns 790 * a <code>Namespace</code> to load the sequences into. Null 791 * implies that it should use the namespace specified in the 792 * file. If no namespace is specified in the file, then 793 * <code>RichObjectFactory.getDefaultNamespace()</code> is 794 * used. 795 * @return a <code>RichSequenceIterator</code> over each sequence in the 796 * file 797 * @throws IOException 798 * in case the stream is unrecognisable or problems occur in 799 * reading it. 800 */ 801 public static RichSequenceIterator readStream( 802 BufferedInputStream stream, 803 RichSequenceBuilderFactory seqFactory, Namespace ns) 804 throws IOException { 805 for (Iterator<Class> i = formatClasses.iterator(); i.hasNext();) { 806 Class formatClass = i.next(); 807 RichSequenceFormat format; 808 try { 809 format = (RichSequenceFormat) formatClass.newInstance(); 810 } catch (Exception e) { 811 throw new BioError(e); 812 } 813 if (format.canRead(stream)) { 814 SymbolTokenization sTok = format 815 .guessSymbolTokenization(stream); 816 BufferedReader br = new BufferedReader( 817 new InputStreamReader(stream)); 818 return new RichStreamReader(br, format, sTok, seqFactory, 819 ns); 820 } 821 } 822 throw new IOException("Could not recognise format of stream."); 823 } 824 825 /** 826 * Guess which format a stream is then attempt to read it. 827 * 828 * @return a <code>RichSequenceIterator</code> over each sequence in the 829 * file 830 * @param stream 831 * the <code>BufferedInputStream</code> to attempt to read. 832 * @param ns 833 * a <code>Namespace</code> to load the sequences into. Null 834 * implies that it should use the namespace specified in the 835 * file. If no namespace is specified in the file, then 836 * <code>RichObjectFactory.getDefaultNamespace()</code> is 837 * used. 838 * @throws java.io.IOException 839 * If the file cannot be read. 840 */ 841 public static RichSequenceIterator readStream( 842 BufferedInputStream stream, Namespace ns) throws IOException { 843 return readStream(stream, factory, ns); 844 } 845 846 /** 847 * Guess which format a file is then attempt to read it. 848 * 849 * @param file 850 * the <code>File</code> to attempt to read. 851 * @param seqFactory 852 * a factory used to build a <code>RichSequence</code> 853 * @param ns 854 * a <code>Namespace</code> to load the sequences into. Null 855 * implies that it should use the namespace specified in the 856 * file. If no namespace is specified in the file, then 857 * <code>RichObjectFactory.getDefaultNamespace()</code> is 858 * used. 859 * @return a <code>RichSequenceIterator</code> over each sequence in the 860 * file 861 * @throws IOException 862 * in case the file is unrecognisable or problems occur in 863 * reading it. 864 */ 865 public static RichSequenceIterator readFile(File file, 866 RichSequenceBuilderFactory seqFactory, Namespace ns) 867 throws IOException { 868 for (Iterator<Class> i = formatClasses.iterator(); i.hasNext();) { 869 Class formatClass = i.next(); 870 RichSequenceFormat format; 871 try { 872 format = (RichSequenceFormat) formatClass.newInstance(); 873 } catch (Exception e) { 874 throw new BioError(e); 875 } 876 if (format.canRead(file)) { 877 SymbolTokenization sTok = format 878 .guessSymbolTokenization(file); 879 BufferedReader br = new BufferedReader(new FileReader(file)); 880 return new RichStreamReader(br, format, sTok, seqFactory, 881 ns); 882 } 883 } 884 throw new IOException("Could not recognise format of file: " 885 + file.getName()); 886 } 887 888 /** 889 * Guess which format a file is then attempt to read it. 890 * 891 * @return a <code>RichSequenceIterator</code> over each sequence in the 892 * file 893 * @param file 894 * the <code>File</code> to attempt to read. 895 * @param ns 896 * a <code>Namespace</code> to load the sequences into. Null 897 * implies that it should use the namespace specified in the 898 * file. If no namespace is specified in the file, then 899 * <code>RichObjectFactory.getDefaultNamespace()</code> is 900 * used. 901 * @throws java.io.IOException 902 * If the file cannot be read. 903 */ 904 public static RichSequenceIterator readFile(File file, Namespace ns) 905 throws IOException { 906 return readFile(file, factory, ns); 907 } 908 909 /** 910 * Read a fasta file. 911 * 912 * @param br 913 * the <code>BufferedReader<code> to read data from 914 * @param sTok 915 * a <code>SymbolTokenization</code> that understands the 916 * sequences 917 * @param ns 918 * a <code>Namespace</code> to load the sequences into. Null 919 * implies that it should use the namespace specified in the 920 * file. If no namespace is specified in the file, then 921 * <code>RichObjectFactory.getDefaultNamespace()</code> is 922 * used. 923 * @return a <code>RichSequenceIterator</code> over each sequence in the 924 * fasta file 925 */ 926 public static RichSequenceIterator readFasta(BufferedReader br, 927 SymbolTokenization sTok, Namespace ns) { 928 return new RichStreamReader(br, new FastaFormat(), sTok, factory, 929 ns); 930 } 931 932 /** 933 * Read a fasta file building a custom type of <code>RichSequence</code> 934 * . For example, use <code>RichSequenceBuilderFactory.FACTORY</code> to 935 * emulate <code>readFasta(BufferedReader, SymbolTokenization)</code> 936 * and <code>RichSequenceBuilderFactory.PACKED</code> to force all 937 * symbols to be encoded using bit-packing. 938 * 939 * @param br 940 * the <code>BufferedReader</code> to read data from 941 * @param sTok 942 * a <code>SymbolTokenization</code> that understands the 943 * sequences 944 * @param seqFactory 945 * a factory used to build a <code>RichSequence</code> 946 * @param ns 947 * a <code>Namespace</code> to load the sequences into. Null 948 * implies that it should use the namespace specified in the 949 * file. If no namespace is specified in the file, then 950 * <code>RichObjectFactory.getDefaultNamespace()</code> is 951 * used. 952 * @return a <code>RichSequenceIterator</code> over each sequence in the 953 * fasta file 954 */ 955 public static RichSequenceIterator readFasta(BufferedReader br, 956 SymbolTokenization sTok, RichSequenceBuilderFactory seqFactory, 957 Namespace ns) { 958 return new RichStreamReader(br, new FastaFormat(), sTok, 959 seqFactory, ns); 960 } 961 962 /** 963 * Iterate over the sequences in an FASTA-format stream of DNA 964 * sequences. 965 * 966 * @param br 967 * the <code>BufferedReader</code> to read data from 968 * @param ns 969 * a <code>Namespace</code> to load the sequences into. Null 970 * implies that it should use the namespace specified in the 971 * file. If no namespace is specified in the file, then 972 * <code>RichObjectFactory.getDefaultNamespace()</code> is 973 * used. 974 * @return a <code>RichSequenceIterator</code> over each sequence in the 975 * fasta file 976 * @see #readHashedFastaDNA(BufferedInputStream, Namespace) for a 977 * speeded up version that can access sequences from memory. 978 */ 979 public static RichSequenceIterator readFastaDNA(BufferedReader br, 980 Namespace ns) { 981 return new RichStreamReader(br, new FastaFormat(), getDNAParser(), 982 factory, ns); 983 } 984 985 /** 986 * Iterate over the sequences in an FASTA-format stream of DNA 987 * sequences. In contrast to readFastaDNA, this provides a speeded up 988 * implementation where all sequences are accessed from memory. 989 * 990 * @param is 991 * the <code>BufferedInputStream</code> to read data from 992 * @param ns 993 * a <code>Namespace</code> to load the sequences into. Null 994 * implies that it should use the namespace specified in the 995 * file. If no namespace is specified in the file, then 996 * <code>RichObjectFactory.getDefaultNamespace()</code> is 997 * used. 998 * @return a <code>RichSequenceIterator</code> over each sequence in the 999 * fasta file 1000 * @throws BioException 1001 * if somethings goes wrong while reading the file. 1002 * @see #readFastaDNA 1003 */ 1004 public static RichSequenceIterator readHashedFastaDNA( 1005 BufferedInputStream is, Namespace ns) throws BioException { 1006 1007 Alphabet alpha = AlphabetManager.alphabetForName("DNA"); 1008 return new HashedFastaIterator(is, alpha, ns); 1009 1010 } 1011 1012 /** 1013 * Iterate over the sequences in an FASTA-format stream of RNA 1014 * sequences. 1015 * 1016 * @param br 1017 * the <code>BufferedReader</code> to read data from 1018 * @param ns 1019 * a <code>Namespace</code> to load the sequences into. Null 1020 * implies that it should use the namespace specified in the 1021 * file. If no namespace is specified in the file, then 1022 * <code>RichObjectFactory.getDefaultNamespace()</code> is 1023 * used. 1024 * @return a <code>RichSequenceIterator</code> over each sequence in the 1025 * fasta file 1026 */ 1027 public static RichSequenceIterator readFastaRNA(BufferedReader br, 1028 Namespace ns) { 1029 return new RichStreamReader(br, new FastaFormat(), getRNAParser(), 1030 factory, ns); 1031 } 1032 1033 /** 1034 * Iterate over the sequences in an FASTA-format stream of Protein 1035 * sequences. 1036 * 1037 * @param br 1038 * the <code>BufferedReader</code> to read data from 1039 * @param ns 1040 * a <code>Namespace</code> to load the sequences into. Null 1041 * implies that it should use the namespace specified in the 1042 * file. If no namespace is specified in the file, then 1043 * <code>RichObjectFactory.getDefaultNamespace()</code> is 1044 * used. 1045 * @return a <code>RichSequenceIterator</code> over each sequence in the 1046 * fasta file 1047 */ 1048 public static RichSequenceIterator readFastaProtein(BufferedReader br, 1049 Namespace ns) { 1050 return new RichStreamReader(br, new FastaFormat(), 1051 getProteinParser(), factory, ns); 1052 } 1053 1054 /** 1055 * Read a GenBank file using a custom type of SymbolList. For example, 1056 * use RichSequenceBuilderFactory.FACTORY to emulate 1057 * readFasta(BufferedReader, SymbolTokenization) and 1058 * RichSequenceBuilderFactory.PACKED to force all symbols to be encoded 1059 * using bit-packing. 1060 * 1061 * @param br 1062 * the <code>BufferedReader</code> to read data from 1063 * @param sTok 1064 * a <code>SymbolTokenization</code> that understands the 1065 * sequences 1066 * @param seqFactory 1067 * a factory used to build a <code>SymbolList</code> 1068 * @param ns 1069 * a <code>Namespace</code> to load the sequences into. Null 1070 * implies that it should use the namespace specified in the 1071 * file. If no namespace is specified in the file, then 1072 * <code>RichObjectFactory.getDefaultNamespace()</code> is 1073 * used. 1074 * @return a <code>RichSequenceIterator</code> over each sequence in the 1075 * fasta file 1076 */ 1077 public static RichSequenceIterator readGenbank(BufferedReader br, 1078 SymbolTokenization sTok, RichSequenceBuilderFactory seqFactory, 1079 Namespace ns) { 1080 return new RichStreamReader(br, new GenbankFormat(), sTok, 1081 seqFactory, ns); 1082 } 1083 1084 /** 1085 * Iterate over the sequences in an GenBank-format stream of DNA 1086 * sequences. 1087 * 1088 * @param br 1089 * the <code>BufferedReader</code> to read data from 1090 * @param ns 1091 * a <code>Namespace</code> to load the sequences into. Null 1092 * implies that it should use the namespace specified in the 1093 * file. If no namespace is specified in the file, then 1094 * <code>RichObjectFactory.getDefaultNamespace()</code> is 1095 * used. 1096 * @return a <code>RichSequenceIterator</code> over each sequence in the 1097 * fasta file 1098 */ 1099 public static RichSequenceIterator readGenbankDNA(BufferedReader br, 1100 Namespace ns) { 1101 return new RichStreamReader(br, new GenbankFormat(), 1102 getDNAParser(), factory, ns); 1103 } 1104 1105 /** 1106 * Iterate over the sequences in an GenBank-format stream of RNA 1107 * sequences. 1108 * 1109 * @param br 1110 * the <code>BufferedReader</code> to read data from 1111 * @param ns 1112 * a <code>Namespace</code> to load the sequences into. Null 1113 * implies that it should use the namespace specified in the 1114 * file. If no namespace is specified in the file, then 1115 * <code>RichObjectFactory.getDefaultNamespace()</code> is 1116 * used. 1117 * @return a <code>RichSequenceIterator</code> over each sequence in the 1118 * fasta file 1119 */ 1120 public static RichSequenceIterator readGenbankRNA(BufferedReader br, 1121 Namespace ns) { 1122 return new RichStreamReader(br, new GenbankFormat(), 1123 getRNAParser(), factory, ns); 1124 } 1125 1126 /** 1127 * Iterate over the sequences in an GenBank-format stream of Protein 1128 * sequences. 1129 * 1130 * @param br 1131 * the <code>BufferedReader</code> to read data from 1132 * @param ns 1133 * a <code>Namespace</code> to load the sequences into. Null 1134 * implies that it should use the namespace specified in the 1135 * file. If no namespace is specified in the file, then 1136 * <code>RichObjectFactory.getDefaultNamespace()</code> is 1137 * used. 1138 * @return a <code>RichSequenceIterator</code> over each sequence in the 1139 * fasta file 1140 */ 1141 public static RichSequenceIterator readGenbankProtein( 1142 BufferedReader br, Namespace ns) { 1143 return new RichStreamReader(br, new GenbankFormat(), 1144 getProteinParser(), factory, ns); 1145 } 1146 1147 /** 1148 * Read a INSDseq file using a custom type of SymbolList. For example, 1149 * use RichSequenceBuilderFactory.FACTORY to emulate 1150 * readFasta(BufferedReader, SymbolTokenization) and 1151 * RichSequenceBuilderFactory.PACKED to force all symbols to be encoded 1152 * using bit-packing. 1153 * 1154 * @param br 1155 * the <code>BufferedReader</code> to read data from 1156 * @param sTok 1157 * a <code>SymbolTokenization</code> that understands the 1158 * sequences 1159 * @param seqFactory 1160 * a factory used to build a <code>SymbolList</code> 1161 * @param ns 1162 * a <code>Namespace</code> to load the sequences into. Null 1163 * implies that it should use the namespace specified in the 1164 * file. If no namespace is specified in the file, then 1165 * <code>RichObjectFactory.getDefaultNamespace()</code> is 1166 * used. 1167 * @return a <code>RichSequenceIterator</code> over each sequence in the 1168 * fasta file 1169 */ 1170 public static RichSequenceIterator readINSDseq(BufferedReader br, 1171 SymbolTokenization sTok, RichSequenceBuilderFactory seqFactory, 1172 Namespace ns) { 1173 return new RichStreamReader(br, new INSDseqFormat(), sTok, 1174 seqFactory, ns); 1175 } 1176 1177 /** 1178 * Iterate over the sequences in an INSDseq-format stream of DNA 1179 * sequences. 1180 * 1181 * @param br 1182 * the <code>BufferedReader</code> to read data from 1183 * @param ns 1184 * a <code>Namespace</code> to load the sequences into. Null 1185 * implies that it should use the namespace specified in the 1186 * file. If no namespace is specified in the file, then 1187 * <code>RichObjectFactory.getDefaultNamespace()</code> is 1188 * used. 1189 * @return a <code>RichSequenceIterator</code> over each sequence in the 1190 * fasta file 1191 */ 1192 public static RichSequenceIterator readINSDseqDNA(BufferedReader br, 1193 Namespace ns) { 1194 return new RichStreamReader(br, new INSDseqFormat(), 1195 getDNAParser(), factory, ns); 1196 } 1197 1198 /** 1199 * Iterate over the sequences in an INSDseq-format stream of RNA 1200 * sequences. 1201 * 1202 * @param br 1203 * the <code>BufferedReader</code> to read data from 1204 * @param ns 1205 * a <code>Namespace</code> to load the sequences into. Null 1206 * implies that it should use the namespace specified in the 1207 * file. If no namespace is specified in the file, then 1208 * <code>RichObjectFactory.getDefaultNamespace()</code> is 1209 * used. 1210 * @return a <code>RichSequenceIterator</code> over each sequence in the 1211 * fasta file 1212 */ 1213 public static RichSequenceIterator readINSDseqRNA(BufferedReader br, 1214 Namespace ns) { 1215 return new RichStreamReader(br, new INSDseqFormat(), 1216 getRNAParser(), factory, ns); 1217 } 1218 1219 /** 1220 * Iterate over the sequences in an INSDseq-format stream of Protein 1221 * sequences. 1222 * 1223 * @param br 1224 * the <code>BufferedReader</code> to read data from 1225 * @param ns 1226 * a <code>Namespace</code> to load the sequences into. Null 1227 * implies that it should use the namespace specified in the 1228 * file. If no namespace is specified in the file, then 1229 * <code>RichObjectFactory.getDefaultNamespace()</code> is 1230 * used. 1231 * @return a <code>RichSequenceIterator</code> over each sequence in the 1232 * fasta file 1233 */ 1234 public static RichSequenceIterator readINSDseqProtein( 1235 BufferedReader br, Namespace ns) { 1236 return new RichStreamReader(br, new INSDseqFormat(), 1237 getProteinParser(), factory, ns); 1238 } 1239 1240 /** 1241 * Read a EMBLxml file using a custom type of SymbolList. For example, 1242 * use RichSequenceBuilderFactory.FACTORY to emulate 1243 * readFasta(BufferedReader, SymbolTokenization) and 1244 * RichSequenceBuilderFactory.PACKED to force all symbols to be encoded 1245 * using bit-packing. 1246 * 1247 * @param br 1248 * the <code>BufferedReader</code> to read data from 1249 * @param sTok 1250 * a <code>SymbolTokenization</code> that understands the 1251 * sequences 1252 * @param seqFactory 1253 * a factory used to build a <code>SymbolList</code> 1254 * @param ns 1255 * a <code>Namespace</code> to load the sequences into. Null 1256 * implies that it should use the namespace specified in the 1257 * file. If no namespace is specified in the file, then 1258 * <code>RichObjectFactory.getDefaultNamespace()</code> is 1259 * used. 1260 * @return a <code>RichSequenceIterator</code> over each sequence in the 1261 * fasta file 1262 */ 1263 public static RichSequenceIterator readEMBLxml(BufferedReader br, 1264 SymbolTokenization sTok, RichSequenceBuilderFactory seqFactory, 1265 Namespace ns) { 1266 return new RichStreamReader(br, new EMBLxmlFormat(), sTok, 1267 seqFactory, ns); 1268 } 1269 1270 /** 1271 * Iterate over the sequences in an EMBLxml-format stream of DNA 1272 * sequences. 1273 * 1274 * @param br 1275 * the <code>BufferedReader</code> to read data from 1276 * @param ns 1277 * a <code>Namespace</code> to load the sequences into. Null 1278 * implies that it should use the namespace specified in the 1279 * file. If no namespace is specified in the file, then 1280 * <code>RichObjectFactory.getDefaultNamespace()</code> is 1281 * used. 1282 * @return a <code>RichSequenceIterator</code> over each sequence in the 1283 * fasta file 1284 */ 1285 public static RichSequenceIterator readEMBLxmlDNA(BufferedReader br, 1286 Namespace ns) { 1287 return new RichStreamReader(br, new EMBLxmlFormat(), 1288 getDNAParser(), factory, ns); 1289 } 1290 1291 /** 1292 * Iterate over the sequences in an EMBLxml-format stream of RNA 1293 * sequences. 1294 * 1295 * @param br 1296 * the <code>BufferedReader</code> to read data from 1297 * @param ns 1298 * a <code>Namespace</code> to load the sequences into. Null 1299 * implies that it should use the namespace specified in the 1300 * file. If no namespace is specified in the file, then 1301 * <code>RichObjectFactory.getDefaultNamespace()</code> is 1302 * used. 1303 * @return a <code>RichSequenceIterator</code> over each sequence in the 1304 * fasta file 1305 */ 1306 public static RichSequenceIterator readEMBLxmlRNA(BufferedReader br, 1307 Namespace ns) { 1308 return new RichStreamReader(br, new EMBLxmlFormat(), 1309 getRNAParser(), factory, ns); 1310 } 1311 1312 /** 1313 * Iterate over the sequences in an EMBLxml-format stream of Protein 1314 * sequences. 1315 * 1316 * @param br 1317 * the <code>BufferedReader</code> to read data from 1318 * @param ns 1319 * a <code>Namespace</code> to load the sequences into. Null 1320 * implies that it should use the namespace specified in the 1321 * file. If no namespace is specified in the file, then 1322 * <code>RichObjectFactory.getDefaultNamespace()</code> is 1323 * used. 1324 * @return a <code>RichSequenceIterator</code> over each sequence in the 1325 * fasta file 1326 */ 1327 public static RichSequenceIterator readEMBLxmlProtein( 1328 BufferedReader br, Namespace ns) { 1329 return new RichStreamReader(br, new EMBLxmlFormat(), 1330 getProteinParser(), factory, ns); 1331 } 1332 1333 /** 1334 * Read a EMBL file using a custom type of SymbolList. For example, use 1335 * RichSequenceBuilderFactory.FACTORY to emulate 1336 * readFasta(BufferedReader, SymbolTokenization) and 1337 * RichSequenceBuilderFactory.PACKED to force all symbols to be encoded 1338 * using bit-packing. 1339 * 1340 * @param br 1341 * the <code>BufferedReader</code> to read data from 1342 * @param sTok 1343 * a <code>SymbolTokenization</code> that understands the 1344 * sequences 1345 * @param seqFactory 1346 * a factory used to build a <code>SymbolList</code> 1347 * @param ns 1348 * a <code>Namespace</code> to load the sequences into. Null 1349 * implies that it should use the namespace specified in the 1350 * file. If no namespace is specified in the file, then 1351 * <code>RichObjectFactory.getDefaultNamespace()</code> is 1352 * used. 1353 * @return a <code>RichSequenceIterator</code> over each sequence in the 1354 * fasta file 1355 */ 1356 public static RichSequenceIterator readEMBL(BufferedReader br, 1357 SymbolTokenization sTok, RichSequenceBuilderFactory seqFactory, 1358 Namespace ns) { 1359 return new RichStreamReader(br, new EMBLFormat(), sTok, seqFactory, 1360 ns); 1361 } 1362 1363 /** 1364 * Iterate over the sequences in an EMBL-format stream of DNA sequences. 1365 * 1366 * @param br 1367 * the <code>BufferedReader</code> to read data from 1368 * @param ns 1369 * a <code>Namespace</code> to load the sequences into. Null 1370 * implies that it should use the namespace specified in the 1371 * file. If no namespace is specified in the file, then 1372 * <code>RichObjectFactory.getDefaultNamespace()</code> is 1373 * used. 1374 * @return a <code>RichSequenceIterator</code> over each sequence in the 1375 * fasta file 1376 */ 1377 public static RichSequenceIterator readEMBLDNA(BufferedReader br, 1378 Namespace ns) { 1379 return new RichStreamReader(br, new EMBLFormat(), getDNAParser(), 1380 factory, ns); 1381 } 1382 1383 /** 1384 * Iterate over the sequences in an EMBL-format stream of RNA sequences. 1385 * 1386 * @param br 1387 * the <code>BufferedReader</code> to read data from 1388 * @param ns 1389 * a <code>Namespace</code> to load the sequences into. Null 1390 * implies that it should use the namespace specified in the 1391 * file. If no namespace is specified in the file, then 1392 * <code>RichObjectFactory.getDefaultNamespace()</code> is 1393 * used. 1394 * @return a <code>RichSequenceIterator</code> over each sequence in the 1395 * fasta file 1396 */ 1397 public static RichSequenceIterator readEMBLRNA(BufferedReader br, 1398 Namespace ns) { 1399 return new RichStreamReader(br, new EMBLFormat(), getRNAParser(), 1400 factory, ns); 1401 } 1402 1403 /** 1404 * Iterate over the sequences in an EMBL-format stream of Protein 1405 * sequences. 1406 * 1407 * @param br 1408 * the <code>BufferedReader</code> to read data from 1409 * @param ns 1410 * a <code>Namespace</code> to load the sequences into. Null 1411 * implies that it should use the namespace specified in the 1412 * file. If no namespace is specified in the file, then 1413 * <code>RichObjectFactory.getDefaultNamespace()</code> is 1414 * used. 1415 * @return a <code>RichSequenceIterator</code> over each sequence in the 1416 * fasta file 1417 */ 1418 public static RichSequenceIterator readEMBLProtein(BufferedReader br, 1419 Namespace ns) { 1420 return new RichStreamReader(br, new EMBLFormat(), 1421 getProteinParser(), factory, ns); 1422 } 1423 1424 /** 1425 * Read a UniProt file using a custom type of SymbolList. For example, 1426 * use RichSequenceBuilderFactory.FACTORY to emulate 1427 * readFasta(BufferedReader, SymbolTokenization) and 1428 * RichSequenceBuilderFactory.PACKED to force all symbols to be encoded 1429 * using bit-packing. 1430 * 1431 * @param br 1432 * the <code>BufferedReader</code> to read data from 1433 * @param sTok 1434 * a <code>SymbolTokenization</code> that understands the 1435 * sequences 1436 * @param seqFactory 1437 * a factory used to build a <code>SymbolList</code> 1438 * @param ns 1439 * a <code>Namespace</code> to load the sequences into. Null 1440 * implies that it should use the namespace specified in the 1441 * file. If no namespace is specified in the file, then 1442 * <code>RichObjectFactory.getDefaultNamespace()</code> is 1443 * used. 1444 * @return a <code>RichSequenceIterator</code> over each sequence in the 1445 * fasta file 1446 */ 1447 public static RichSequenceIterator readUniProt(BufferedReader br, 1448 SymbolTokenization sTok, RichSequenceBuilderFactory seqFactory, 1449 Namespace ns) { 1450 return new RichStreamReader(br, new UniProtFormat(), sTok, 1451 seqFactory, ns); 1452 } 1453 1454 /** 1455 * Iterate over the sequences in an UniProt-format stream of RNA 1456 * sequences. 1457 * 1458 * @param br 1459 * the <code>BufferedReader</code> to read data from 1460 * @param ns 1461 * a <code>Namespace</code> to load the sequences into. Null 1462 * implies that it should use the namespace specified in the 1463 * file. If no namespace is specified in the file, then 1464 * <code>RichObjectFactory.getDefaultNamespace()</code> is 1465 * used. 1466 * @return a <code>RichSequenceIterator</code> over each sequence in the 1467 * fasta file 1468 */ 1469 public static RichSequenceIterator readUniProt(BufferedReader br, 1470 Namespace ns) { 1471 return new RichStreamReader(br, new UniProtFormat(), 1472 getProteinParser(), factory, ns); 1473 } 1474 1475 /** 1476 * Read a UniProt XML file using a custom type of SymbolList. For 1477 * example, use RichSequenceBuilderFactory.FACTORY to emulate 1478 * readFasta(BufferedReader, SymbolTokenization) and 1479 * RichSequenceBuilderFactory.PACKED to force all symbols to be encoded 1480 * using bit-packing. 1481 * 1482 * @param br 1483 * the <code>BufferedReader</code> to read data from 1484 * @param sTok 1485 * a <code>SymbolTokenization</code> that understands the 1486 * sequences 1487 * @param seqFactory 1488 * a factory used to build a <code>SymbolList</code> 1489 * @param ns 1490 * a <code>Namespace</code> to load the sequences into. Null 1491 * implies that it should use the namespace specified in the 1492 * file. If no namespace is specified in the file, then 1493 * <code>RichObjectFactory.getDefaultNamespace()</code> is 1494 * used. 1495 * @return a <code>RichSequenceIterator</code> over each sequence in the 1496 * fasta file 1497 */ 1498 public static RichSequenceIterator readUniProtXML(BufferedReader br, 1499 SymbolTokenization sTok, RichSequenceBuilderFactory seqFactory, 1500 Namespace ns) { 1501 return new RichStreamReader(br, new UniProtXMLFormat(), sTok, 1502 seqFactory, ns); 1503 } 1504 1505 /** 1506 * Iterate over the sequences in an UniProt XML-format stream of RNA 1507 * sequences. 1508 * 1509 * @param br 1510 * the <code>BufferedReader</code> to read data from 1511 * @param ns 1512 * a <code>Namespace</code> to load the sequences into. Null 1513 * implies that it should use the namespace specified in the 1514 * file. If no namespace is specified in the file, then 1515 * <code>RichObjectFactory.getDefaultNamespace()</code> is 1516 * used. 1517 * @return a <code>RichSequenceIterator</code> over each sequence in the 1518 * fasta file 1519 */ 1520 public static RichSequenceIterator readUniProtXML(BufferedReader br, 1521 Namespace ns) { 1522 return new RichStreamReader(br, new UniProtXMLFormat(), 1523 getProteinParser(), factory, ns); 1524 } 1525 1526 /** 1527 * Writes <CODE>Sequence</CODE>s from a <code>SequenceIterator</code> to 1528 * an <code>OutputStream </code>in Fasta Format. This makes for a useful 1529 * format filter where a <code>StreamReader</code> can be sent to the 1530 * <code>RichStreamWriter</code> after formatting. 1531 * 1532 * @param os 1533 * The stream to write fasta formatted data to 1534 * @param in 1535 * The source of input <CODE>RichSequence</CODE>s 1536 * @param ns 1537 * a <code>Namespace</code> to write the 1538 * <CODE>RichSequence</CODE>s to. <CODE>Null</CODE> implies 1539 * that it should use the namespace specified in the 1540 * individual sequence. 1541 * @param header 1542 * the FastaHeader 1543 * @throws java.io.IOException 1544 * if there is an IO problem 1545 */ 1546 public static void writeFasta(OutputStream os, SequenceIterator in, 1547 Namespace ns, FastaHeader header) throws IOException { 1548 FastaFormat fastaFormat = new FastaFormat(); 1549 if (header != null) { 1550 fastaFormat.setHeader(header); 1551 } 1552 RichStreamWriter sw = new RichStreamWriter(os, fastaFormat); 1553 sw.writeStream(in, ns); 1554 } 1555 1556 /** 1557 * Writes <CODE>Sequence</CODE>s from a <code>SequenceIterator</code> to 1558 * an <code>OutputStream </code>in Fasta Format. This makes for a useful 1559 * format filter where a <code>StreamReader</code> can be sent to the 1560 * <code>RichStreamWriter</code> after formatting. 1561 * 1562 * @param os 1563 * The stream to write fasta formatted data to 1564 * @param in 1565 * The source of input <CODE>RichSequence</CODE>s 1566 * @param ns 1567 * a <code>Namespace</code> to write the 1568 * <CODE>RichSequence</CODE>s to. <CODE>Null</CODE> implies 1569 * that it should use the namespace specified in the 1570 * individual sequence. 1571 * @throws java.io.IOException 1572 * if there is an IO problem 1573 */ 1574 public static void writeFasta(OutputStream os, SequenceIterator in, 1575 Namespace ns) throws IOException { 1576 writeFasta(os, in, ns, null); 1577 } 1578 1579 /** 1580 * Writes a single <code>Sequence</code> to an <code>OutputStream</code> 1581 * in Fasta format. 1582 * 1583 * @param os 1584 * the <code>OutputStream</code>. 1585 * @param seq 1586 * the <code>Sequence</code>. 1587 * @param ns 1588 * a <code>Namespace</code> to write the sequences to. Null 1589 * implies that it should use the namespace specified in the 1590 * individual sequence. 1591 * @throws java.io.IOException 1592 * if there is an IO problem 1593 */ 1594 public static void writeFasta(OutputStream os, Sequence seq, 1595 Namespace ns) throws IOException { 1596 writeFasta(os, new SingleRichSeqIterator(seq), ns, null); 1597 } 1598 1599 /** 1600 * Writes a single <code>Sequence</code> to an <code>OutputStream</code> 1601 * in Fasta format. 1602 * 1603 * @param os 1604 * the <code>OutputStream</code>. 1605 * @param seq 1606 * the <code>Sequence</code>. 1607 * @param ns 1608 * a <code>Namespace</code> to write the sequences to. Null 1609 * implies that it should use the namespace specified in the 1610 * individual sequence. 1611 * @param header 1612 * a <code>FastaHeader</code> that controls the fields in the 1613 * header. 1614 * @throws java.io.IOException 1615 * if there is an IO problem 1616 */ 1617 public static void writeFasta(OutputStream os, Sequence seq, 1618 Namespace ns, FastaHeader header) throws IOException { 1619 writeFasta(os, new SingleRichSeqIterator(seq), ns, header); 1620 } 1621 1622 /** 1623 * Writes sequences from a <code>SequenceIterator</code> to an 1624 * <code>OutputStream </code>in GenBank Format. This makes for a useful 1625 * format filter where a <code>StreamReader</code> can be sent to the 1626 * <code>RichStreamWriter</code> after formatting. 1627 * 1628 * @param os 1629 * The stream to write fasta formatted data to 1630 * @param in 1631 * The source of input Sequences 1632 * @param ns 1633 * a <code>Namespace</code> to write the sequences to. Null 1634 * implies that it should use the namespace specified in the 1635 * individual sequence. 1636 * @throws java.io.IOException 1637 * if there is an IO problem 1638 */ 1639 public static void writeGenbank(OutputStream os, SequenceIterator in, 1640 Namespace ns) throws IOException { 1641 RichStreamWriter sw = new RichStreamWriter(os, new GenbankFormat()); 1642 sw.writeStream(in, ns); 1643 } 1644 1645 /** 1646 * Writes a single <code>Sequence</code> to an <code>OutputStream</code> 1647 * in GenBank format. 1648 * 1649 * @param os 1650 * the <code>OutputStream</code>. 1651 * @param seq 1652 * the <code>Sequence</code>. 1653 * @param ns 1654 * a <code>Namespace</code> to write the sequences to. Null 1655 * implies that it should use the namespace specified in the 1656 * individual sequence. 1657 * @throws java.io.IOException 1658 * if there is an IO problem 1659 */ 1660 public static void writeGenbank(OutputStream os, Sequence seq, 1661 Namespace ns) throws IOException { 1662 writeGenbank(os, new SingleRichSeqIterator(seq), ns); 1663 } 1664 1665 /** 1666 * Writes sequences from a <code>SequenceIterator</code> to an 1667 * <code>OutputStream </code>in INSDseq Format. This makes for a useful 1668 * format filter where a <code>StreamReader</code> can be sent to the 1669 * <code>RichStreamWriter</code> after formatting. 1670 * 1671 * @param os 1672 * The stream to write fasta formatted data to 1673 * @param in 1674 * The source of input Sequences 1675 * @param ns 1676 * a <code>Namespace</code> to write the sequences to. Null 1677 * implies that it should use the namespace specified in the 1678 * individual sequence. 1679 * @throws java.io.IOException 1680 * if there is an IO problem 1681 */ 1682 public static void writeINSDseq(OutputStream os, SequenceIterator in, 1683 Namespace ns) throws IOException { 1684 RichStreamWriter sw = new RichStreamWriter(os, new INSDseqFormat()); 1685 sw.writeStream(in, ns); 1686 } 1687 1688 /** 1689 * Writes a single <code>Sequence</code> to an <code>OutputStream</code> 1690 * in INSDseq format. 1691 * 1692 * @param os 1693 * the <code>OutputStream</code>. 1694 * @param seq 1695 * the <code>Sequence</code>. 1696 * @param ns 1697 * a <code>Namespace</code> to write the sequences to. Null 1698 * implies that it should use the namespace specified in the 1699 * individual sequence. 1700 * @throws java.io.IOException 1701 * if there is an IO problem 1702 */ 1703 public static void writeINSDseq(OutputStream os, Sequence seq, 1704 Namespace ns) throws IOException { 1705 writeINSDseq(os, new SingleRichSeqIterator(seq), ns); 1706 } 1707 1708 /** 1709 * Writes sequences from a <code>SequenceIterator</code> to an 1710 * <code>OutputStream </code>in EMBLxml Format. This makes for a useful 1711 * format filter where a <code>StreamReader</code> can be sent to the 1712 * <code>RichStreamWriter</code> after formatting. 1713 * 1714 * @param os 1715 * The stream to write fasta formatted data to 1716 * @param in 1717 * The source of input Sequences 1718 * @param ns 1719 * a <code>Namespace</code> to write the sequences to. Null 1720 * implies that it should use the namespace specified in the 1721 * individual sequence. 1722 * @throws java.io.IOException 1723 * if there is an IO problem 1724 */ 1725 public static void writeEMBLxml(OutputStream os, SequenceIterator in, 1726 Namespace ns) throws IOException { 1727 RichStreamWriter sw = new RichStreamWriter(os, new EMBLxmlFormat()); 1728 sw.writeStream(in, ns); 1729 } 1730 1731 /** 1732 * Writes a single <code>Sequence</code> to an <code>OutputStream</code> 1733 * in EMBLxml format. 1734 * 1735 * @param os 1736 * the <code>OutputStream</code>. 1737 * @param seq 1738 * the <code>Sequence</code>. 1739 * @param ns 1740 * a <code>Namespace</code> to write the sequences to. Null 1741 * implies that it should use the namespace specified in the 1742 * individual sequence. 1743 * @throws java.io.IOException 1744 * if there is an IO problem 1745 */ 1746 public static void writeEMBLxml(OutputStream os, Sequence seq, 1747 Namespace ns) throws IOException { 1748 writeEMBLxml(os, new SingleRichSeqIterator(seq), ns); 1749 } 1750 1751 /** 1752 * Writes sequences from a <code>SequenceIterator</code> to an 1753 * <code>OutputStream </code>in EMBL Format. This makes for a useful 1754 * format filter where a <code>StreamReader</code> can be sent to the 1755 * <code>RichStreamWriter</code> after formatting. 1756 * 1757 * @param os 1758 * The stream to write fasta formatted data to 1759 * @param in 1760 * The source of input Sequences 1761 * @param ns 1762 * a <code>Namespace</code> to write the sequences to. Null 1763 * implies that it should use the namespace specified in the 1764 * individual sequence. 1765 * @throws java.io.IOException 1766 * if there is an IO problem 1767 */ 1768 public static void writeEMBL(OutputStream os, SequenceIterator in, 1769 Namespace ns) throws IOException { 1770 RichStreamWriter sw = new RichStreamWriter(os, new EMBLFormat()); 1771 sw.writeStream(in, ns); 1772 } 1773 1774 /** 1775 * Writes a single <code>Sequence</code> to an <code>OutputStream</code> 1776 * in EMBL format. 1777 * 1778 * @param os 1779 * the <code>OutputStream</code>. 1780 * @param seq 1781 * the <code>Sequence</code>. 1782 * @param ns 1783 * a <code>Namespace</code> to write the sequences to. Null 1784 * implies that it should use the namespace specified in the 1785 * individual sequence. 1786 * @throws java.io.IOException 1787 * if there is an IO problem 1788 */ 1789 public static void writeEMBL(OutputStream os, Sequence seq, Namespace ns) 1790 throws IOException { 1791 writeEMBL(os, new SingleRichSeqIterator(seq), ns); 1792 } 1793 1794 /** 1795 * Writes sequences from a <code>SequenceIterator</code> to an 1796 * <code>OutputStream </code>in UniProt Format. This makes for a useful 1797 * format filter where a <code>StreamReader</code> can be sent to the 1798 * <code>RichStreamWriter</code> after formatting. 1799 * 1800 * @param os 1801 * The stream to write fasta formatted data to 1802 * @param in 1803 * The source of input Sequences 1804 * @param ns 1805 * a <code>Namespace</code> to write the sequences to. Null 1806 * implies that it should use the namespace specified in the 1807 * individual sequence. 1808 * @throws java.io.IOException 1809 * if there is an IO problem 1810 */ 1811 public static void writeUniProt(OutputStream os, SequenceIterator in, 1812 Namespace ns) throws IOException { 1813 RichStreamWriter sw = new RichStreamWriter(os, new UniProtFormat()); 1814 sw.writeStream(in, ns); 1815 } 1816 1817 /** 1818 * Writes a single <code>Sequence</code> to an <code>OutputStream</code> 1819 * in UniProt format. 1820 * 1821 * @param os 1822 * the <code>OutputStream</code>. 1823 * @param seq 1824 * the <code>Sequence</code>. 1825 * @param ns 1826 * a <code>Namespace</code> to write the sequences to. Null 1827 * implies that it should use the namespace specified in the 1828 * individual sequence. 1829 * @throws java.io.IOException 1830 * if there is an IO problem 1831 */ 1832 public static void writeUniProt(OutputStream os, Sequence seq, 1833 Namespace ns) throws IOException { 1834 writeUniProt(os, new SingleRichSeqIterator(seq), ns); 1835 } 1836 1837 /** 1838 * Writes sequences from a <code>SequenceIterator</code> to an 1839 * <code>OutputStream </code>in UniProt XML Format. This makes for a 1840 * useful format filter where a <code>StreamReader</code> can be sent to 1841 * the <code>RichStreamWriter</code> after formatting. 1842 * 1843 * @param os 1844 * The stream to write fasta formatted data to 1845 * @param in 1846 * The source of input Sequences 1847 * @param ns 1848 * a <code>Namespace</code> to write the sequences to. Null 1849 * implies that it should use the namespace specified in the 1850 * individual sequence. 1851 * @throws java.io.IOException 1852 * if there is an IO problem 1853 */ 1854 public static void writeUniProtXML(OutputStream os, 1855 SequenceIterator in, Namespace ns) throws IOException { 1856 RichStreamWriter sw = new RichStreamWriter(os, 1857 new UniProtXMLFormat()); 1858 sw.writeStream(in, ns); 1859 } 1860 1861 /** 1862 * Writes a single <code>Sequence</code> to an <code>OutputStream</code> 1863 * in UniProt XML format. 1864 * 1865 * @param os 1866 * the <code>OutputStream</code>. 1867 * @param seq 1868 * the <code>Sequence</code>. 1869 * @param ns 1870 * a <code>Namespace</code> to write the sequences to. Null 1871 * implies that it should use the namespace specified in the 1872 * individual sequence. 1873 * @throws java.io.IOException 1874 * if there is an IO problem 1875 */ 1876 public static void writeUniProtXML(OutputStream os, Sequence seq, 1877 Namespace ns) throws IOException { 1878 writeUniProtXML(os, new SingleRichSeqIterator(seq), ns); 1879 } 1880 1881 /** 1882 * Creates a DNA symbol tokenizer. 1883 * 1884 * @return a <code>SymbolTokenization</code> for parsing DNA. 1885 */ 1886 public static SymbolTokenization getDNAParser() { 1887 try { 1888 return DNATools.getDNA().getTokenization("token"); 1889 } catch (BioException ex) { 1890 throw new BioError("Assertion failing:" 1891 + " Couldn't get DNA token parser", ex); 1892 } 1893 } 1894 1895 /** 1896 * Creates a RNA symbol tokenizer. 1897 * 1898 * @return a <code>SymbolTokenization</code> for parsing RNA. 1899 */ 1900 public static SymbolTokenization getRNAParser() { 1901 try { 1902 return RNATools.getRNA().getTokenization("token"); 1903 } catch (BioException ex) { 1904 throw new BioError("Assertion failing:" 1905 + " Couldn't get RNA token parser", ex); 1906 } 1907 } 1908 1909 /** 1910 * Creates a nucleotide symbol tokenizer. 1911 * 1912 * @return a <code>SymbolTokenization</code> for parsing nucleotides. 1913 */ 1914 public static SymbolTokenization getNucleotideParser() { 1915 try { 1916 return NucleotideTools.getNucleotide().getTokenization("token"); 1917 } catch (BioException ex) { 1918 throw new BioError("Assertion failing:" 1919 + " Couldn't get nucleotide token parser", ex); 1920 } 1921 } 1922 1923 /** 1924 * Creates a protein symbol tokenizer. 1925 * 1926 * @return a <code>SymbolTokenization</code> for parsing protein. 1927 */ 1928 public static SymbolTokenization getProteinParser() { 1929 try { 1930 return ProteinTools.getTAlphabet().getTokenization("token"); 1931 } catch (BioException ex) { 1932 throw new BioError("Assertion failing:" 1933 + " Couldn't get PROTEIN token parser", ex); 1934 } 1935 } 1936 1937 /** 1938 * Used to iterate over a single rich sequence 1939 */ 1940 public static final class SingleRichSeqIterator implements 1941 RichSequenceIterator { 1942 1943 private RichSequence seq; 1944 1945 /** 1946 * Creates an iterator over a single sequence. 1947 * 1948 * @param seq 1949 * the sequence to iterate over. 1950 */ 1951 public SingleRichSeqIterator(Sequence seq) { 1952 try { 1953 if (seq instanceof RichSequence) 1954 this.seq = (RichSequence) seq; 1955 else 1956 this.seq = RichSequence.Tools.enrich(seq); 1957 } catch (ChangeVetoException e) { 1958 throw new RuntimeException("Unable to enrich sequence", e); 1959 } 1960 } 1961 1962 /** 1963 * {@inheritDoc} 1964 * 1965 * @return true if another <CODE>RichSequence</CODE> is available 1966 */ 1967 public boolean hasNext() { 1968 return seq != null; 1969 } 1970 1971 /** 1972 * {@inheritDoc} 1973 * 1974 * @return a <CODE>RichSequence</CODE> 1975 */ 1976 public Sequence nextSequence() { 1977 return this.nextRichSequence(); 1978 } 1979 1980 /** 1981 * {@inheritDoc} 1982 * 1983 * @return a <CODE>RichSequence</CODE> 1984 */ 1985 public BioEntry nextBioEntry() { 1986 return this.nextRichSequence(); 1987 } 1988 1989 /** 1990 * {@inheritDoc} 1991 * 1992 * @return a <CODE>RichSequence</CODE> 1993 */ 1994 public RichSequence nextRichSequence() { 1995 RichSequence seq = this.seq; 1996 this.seq = null; 1997 return seq; 1998 } 1999 } 2000 } 2001}