001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.seq.io; 023 024import java.io.BufferedReader; 025import java.io.File; 026import java.io.FileNotFoundException; 027import java.io.FileReader; 028import java.io.IOException; 029import java.io.InputStream; 030import java.io.OutputStream; 031import java.io.PrintStream; 032import java.util.regex.Pattern; 033import java.util.regex.PatternSyntaxException; 034 035import org.biojava.bio.BioError; 036import org.biojava.bio.BioException; 037import org.biojava.bio.alignment.Alignment; 038import org.biojava.bio.seq.DNATools; 039import org.biojava.bio.seq.NucleotideTools; 040import org.biojava.bio.seq.ProteinTools; 041import org.biojava.bio.seq.RNATools; 042import org.biojava.bio.seq.Sequence; 043import org.biojava.bio.seq.SequenceIterator; 044import org.biojava.bio.seq.db.HashSequenceDB; 045import org.biojava.bio.seq.db.IDMaker; 046import org.biojava.bio.seq.db.SequenceDB; 047import org.biojava.bio.symbol.Alphabet; 048import org.biojava.bio.symbol.FiniteAlphabet; 049import org.biojava.bio.symbol.IllegalSymbolException; 050import org.biojava.utils.AssertionFailure; 051import org.biojava.utils.ChangeVetoException; 052 053/** 054 * A set of convenience methods for handling common file formats. 055 * 056 * @author Thomas Down 057 * @author Mark Schreiber 058 * @author Nimesh Singh 059 * @author Matthew Pocock 060 * @author Keith James 061 * @since 1.1 062 * @deprecated use org.biojavax.bio.seq.RichSequence.IOTools 063 */ 064public final class SeqIOTools { 065 private static SequenceBuilderFactory _emblBuilderFactory; 066 private static SequenceBuilderFactory _genbankBuilderFactory; 067 private static SequenceBuilderFactory _genpeptBuilderFactory; 068 private static SequenceBuilderFactory _swissprotBuilderFactory; 069 private static SequenceBuilderFactory _fastaBuilderFactory; 070 071 /** 072 * This can't be instantiated. 073 */ 074 private SeqIOTools() { 075 } 076 077 /** 078 * Get a default SequenceBuilderFactory for handling EMBL 079 * files. 080 * @return a <CODE>SmartSequenceBuilder.FACTORY</CODE> 081 */ 082 public static SequenceBuilderFactory getEmblBuilderFactory() { 083 if (_emblBuilderFactory == null) { 084 _emblBuilderFactory = 085 new EmblProcessor.Factory(SmartSequenceBuilder.FACTORY); 086 } 087 return _emblBuilderFactory; 088 } 089 090 /** 091 * Iterate over the sequences in an EMBL-format stream. 092 * @param br A reader for the EMBL source or file 093 * @return a <CODE>SequenceIterator</CODE> that iterates over each 094 * <CODE>Sequence</CODE> in the file 095 */ 096 public static SequenceIterator readEmbl(BufferedReader br) { 097 return new StreamReader(br, 098 new EmblLikeFormat(), 099 getDNAParser(), 100 getEmblBuilderFactory()); 101 } 102 103 /** 104 * Iterate over the sequences in an EMBL-format stream, but for RNA. 105 * @param br A reader for the EMBL source or file 106 * @return a <CODE>SequenceIterator</CODE> that iterates over each 107 * <CODE>Sequence</CODE> in the file 108 */ 109 public static SequenceIterator readEmblRNA(BufferedReader br) { 110 return new StreamReader(br, 111 new EmblLikeFormat(), 112 getRNAParser(), 113 getEmblBuilderFactory()); 114 } 115 116 /** 117 * Iterate over the sequences in an EMBL-format stream. 118 * @param br A reader for the EMBL source or file 119 * @return a <CODE>SequenceIterator</CODE> that iterates over each 120 * <CODE>Sequence</CODE> in the file 121 */ 122 public static SequenceIterator readEmblNucleotide(BufferedReader br) { 123 return new StreamReader(br, 124 new EmblLikeFormat(), 125 getNucleotideParser(), 126 getEmblBuilderFactory()); 127 } 128 129 /** 130 * Get a default SequenceBuilderFactory for handling GenBank 131 * files. 132 * @return a <code>SmartSequenceBuilder.FACTORY</code> 133 */ 134 public static SequenceBuilderFactory getGenbankBuilderFactory() { 135 if (_genbankBuilderFactory == null) { 136 _genbankBuilderFactory = 137 new GenbankProcessor.Factory(SmartSequenceBuilder.FACTORY); 138 } 139 return _genbankBuilderFactory; 140 } 141 142 /** 143 * Iterate over the sequences in an Genbank-format stream. 144 * @param br A reader for the Genbank source or file 145 * @return a <CODE>SequenceIterator</CODE> that iterates over each 146 * <CODE>Sequence</CODE> in the file 147 */ 148 public static SequenceIterator readGenbank(BufferedReader br) { 149 return new StreamReader(br, 150 new GenbankFormat(), 151 getDNAParser(), 152 getGenbankBuilderFactory()); 153 } 154 155 /** 156 * Iterate over the sequences in an GenbankXML-format stream. 157 * @param br A reader for the GenbanXML source or file 158 * @return a <CODE>SequenceIterator</CODE> that iterates over each 159 * <CODE>Sequence</CODE> in the file 160 */ 161 public static SequenceIterator readGenbankXml( BufferedReader br ) 162 { 163 return new StreamReader( br, 164 new GenbankXmlFormat(), 165 getDNAParser(), 166 getGenbankBuilderFactory() ); 167 } 168 169 /** 170 * Get a default SequenceBuilderFactory for handling Genpept 171 * files. 172 * @return a <code>SmartSequenceBuilder.FACTORY</code> 173 */ 174 public static SequenceBuilderFactory getGenpeptBuilderFactory() { 175 if (_genpeptBuilderFactory == null) { 176 _genpeptBuilderFactory = 177 new GenbankProcessor.Factory(SmartSequenceBuilder.FACTORY); 178 } 179 return _genpeptBuilderFactory; 180 } 181 182 /** 183 * Iterate over the sequences in an Genpept-format stream. 184 * @param br A reader for the Genpept source or file 185 * @return a <CODE>SequenceIterator</CODE> that iterates over each 186 * <CODE>Sequence</CODE> in the file 187 */ 188 public static SequenceIterator readGenpept(BufferedReader br) { 189 return new StreamReader(br, 190 new GenbankFormat(), 191 getProteinParser(), 192 getGenpeptBuilderFactory()); 193 } 194 195 /** 196 * Get a default SequenceBuilderFactory for handling Swissprot 197 * files. 198 * @return a <code>SmartSequenceBuilder.FACTORY</code> 199 */ 200 public static SequenceBuilderFactory getSwissprotBuilderFactory() { 201 if (_swissprotBuilderFactory == null) { 202 _swissprotBuilderFactory = 203 new SwissprotProcessor.Factory(SmartSequenceBuilder.FACTORY); 204 } 205 return _swissprotBuilderFactory; 206 } 207 208 /** 209 * Iterate over the sequences in an Swissprot-format stream. 210 * @param br A reader for the Swissprot source or file 211 * @return a <CODE>SequenceIterator</CODE> that iterates over each 212 * <CODE>Sequence</CODE> in the file 213 */ 214 public static SequenceIterator readSwissprot(BufferedReader br) { 215 return new StreamReader(br, 216 new EmblLikeFormat(), 217 getProteinParser(), 218 getSwissprotBuilderFactory()); 219 } 220 221 /** 222 * Get a default SequenceBuilderFactory for handling FASTA 223 * files. 224 * @return a <code>SmartSequenceBuilder.FACTORY</code> 225 */ 226 public static SequenceBuilderFactory getFastaBuilderFactory() { 227 if (_fastaBuilderFactory == null) { 228 _fastaBuilderFactory = new FastaDescriptionLineParser.Factory( 229 SmartSequenceBuilder.FACTORY); 230 } 231 return _fastaBuilderFactory; 232 } 233 234 /** 235 * Read a fasta file. 236 * 237 * @param br the BufferedReader to read data from 238 * @param sTok a SymbolTokenization that understands the sequences 239 * @return a SequenceIterator over each sequence in the fasta file 240 */ 241 public static SequenceIterator readFasta( 242 BufferedReader br, SymbolTokenization sTok) 243 { 244 return new StreamReader(br, 245 new FastaFormat(), 246 sTok, 247 getFastaBuilderFactory()); 248 } 249 250 /** 251 * Read a fasta file using a custom type of SymbolList. For example, 252 * use SmartSequenceBuilder.FACTORY to emulate readFasta(BufferedReader, 253 * SymbolTokenization) and SmartSequenceBuilder.BIT_PACKED to force all 254 * symbols to be encoded using bit-packing. 255 * @param br the BufferedReader to read data from 256 * @param sTok a SymbolTokenization that understands the sequences 257 * @param seqFactory a factory used to build a SymbolList 258 * @return a <CODE>SequenceIterator</CODE> that iterates over each 259 * <CODE>Sequence</CODE> in the file 260 */ 261 public static SequenceIterator readFasta( 262 BufferedReader br, 263 SymbolTokenization sTok, 264 SequenceBuilderFactory seqFactory) 265 { 266 return new StreamReader( 267 br, 268 new FastaFormat(), 269 sTok, 270 new FastaDescriptionLineParser.Factory(seqFactory)); 271 } 272 273 /** 274 * Iterate over the sequences in an FASTA-format stream of DNA sequences. 275 * @param br the BufferedReader to read data from 276 * @return a <CODE>SequenceIterator</CODE> that iterates over each 277 * <CODE>Sequence</CODE> in the file 278 */ 279 public static SequenceIterator readFastaDNA(BufferedReader br) { 280 return new StreamReader(br, 281 new FastaFormat(), 282 getDNAParser(), 283 getFastaBuilderFactory()); 284 } 285 286 /** 287 * Iterate over the sequences in an FASTA-format stream of RNA sequences. 288 * @param br the BufferedReader to read data from 289 * @return a <CODE>SequenceIterator</CODE> that iterates over each 290 * <CODE>Sequence</CODE> in the file 291 */ 292 public static SequenceIterator readFastaRNA(BufferedReader br) { 293 return new StreamReader(br, 294 new FastaFormat(), 295 getRNAParser(), 296 getFastaBuilderFactory()); 297 } 298 299 /** 300 * Iterate over the sequences in an FASTA-format stream of Protein sequences. 301 * @param br the BufferedReader to read data from 302 * @return a <CODE>SequenceIterator</CODE> that iterates over each 303 * <CODE>Sequence</CODE> in the file 304 */ 305 public static SequenceIterator readFastaProtein(BufferedReader br) { 306 return new StreamReader(br, 307 new FastaFormat(), 308 getProteinParser(), 309 getFastaBuilderFactory()); 310 } 311 312 /** 313 * Create a sequence database from a fasta file provided as an 314 * input stream. Note this somewhat duplicates functionality in 315 * the readFastaDNA and readFastaProtein methods but uses a stream 316 * rather than a reader and returns a SequenceDB rather than a 317 * SequenceIterator. If the returned DB is likely to be large then 318 * the above mentioned methods should be used. 319 * @return a <code>SequenceDB</code> containing all the <code>Sequences</code> 320 * in the file. 321 * @since 1.2 322 * @param seqFile The file containg the fasta formatted sequences 323 * @param alpha The <code>Alphabet</code> of the sequence, ie DNA, RNA etc 324 * @throws BioException if problems occur during reading of the 325 * stream. 326 */ 327 public static SequenceDB readFasta(InputStream seqFile, Alphabet alpha) 328 throws BioException { 329 HashSequenceDB db = new HashSequenceDB(IDMaker.byName); 330 SequenceBuilderFactory sbFact = 331 new FastaDescriptionLineParser.Factory(SmartSequenceBuilder.FACTORY); 332 FastaFormat fFormat = new FastaFormat(); 333 for (SequenceIterator seqI = new StreamReader(seqFile, 334 fFormat, 335 alpha.getTokenization("token"), 336 sbFact);seqI.hasNext();) { 337 Sequence seq = seqI.nextSequence(); 338 try { 339 db.addSequence(seq); 340 } catch (ChangeVetoException cve) { 341 throw new AssertionFailure( 342 "Could not successfully add sequence " 343 + seq.getName() 344 + " to sequence database", 345 cve); 346 } 347 } 348 return db; 349 } 350 351 /** 352 * Write a sequenceDB to an output stream in fasta format. 353 * @since 1.2 354 * @param os the stream to write the fasta formatted data to. 355 * @param db the database of <code>Sequence</code>s to write 356 * @throws IOException if there was an error while writing. 357 */ 358 public static void writeFasta(OutputStream os, SequenceDB db) 359 throws IOException { 360 StreamWriter sw = new StreamWriter(os,new FastaFormat()); 361 sw.writeStream(db.sequenceIterator()); 362 } 363 364 /** 365 * Writes sequences from a SequenceIterator to an OutputStream in 366 * Fasta Format. This makes for a useful format filter where a 367 * StreamReader can be sent to the StreamWriter after formatting. 368 * 369 * @since 1.2 370 * @param os The stream to write fasta formatted data to 371 * @param in The source of input <code>Sequences</code> 372 * @throws IOException if there was an error while writing. 373 */ 374 public static void writeFasta(OutputStream os, SequenceIterator in) 375 throws IOException { 376 StreamWriter sw = new StreamWriter(os,new FastaFormat()); 377 sw.writeStream(in); 378 } 379 380 /** 381 * Writes a single Sequence to an OutputStream in Fasta format. 382 * 383 * @param os the OutputStream. 384 * @param seq the Sequence. 385 * @throws IOException if there was an error while writing. 386 */ 387 public static void writeFasta(OutputStream os, Sequence seq) 388 throws IOException { 389 writeFasta(os, new SingleSeqIterator(seq)); 390 } 391 392 /** 393 * Writes a stream of Sequences to an OutputStream in EMBL format. 394 * 395 * @param os the OutputStream. 396 * @param in a SequenceIterator. 397 * @exception IOException if there was an error while writing. 398 */ 399 public static void writeEmbl(OutputStream os, SequenceIterator in) 400 throws IOException { 401 StreamWriter sw = new StreamWriter(os, new EmblLikeFormat()); 402 sw.writeStream(in); 403 } 404 405 /** 406 * Writes a single Sequence to an OutputStream in EMBL format. 407 * 408 * @param os the OutputStream. 409 * @param seq the Sequence. 410 * @throws IOException if there was an error while writing. 411 */ 412 public static void writeEmbl(OutputStream os, Sequence seq) throws IOException { 413 writeEmbl(os, new SingleSeqIterator(seq)); 414 } 415 416 /** 417 * Writes a stream of Sequences to an OutputStream in SwissProt 418 * format. 419 * @param os the OutputStream. 420 * @param in a SequenceIterator. 421 * @throws org.biojava.bio.BioException if the <CODE>Sequence</CODE> cannot be converted to SwissProt 422 * format 423 * @exception IOException if there was an error while writing. 424 */ 425 public static void writeSwissprot(OutputStream os, SequenceIterator in) 426 throws IOException, BioException { 427 SequenceFormat former = new EmblLikeFormat(); 428 PrintStream ps = new PrintStream(os); 429 while (in.hasNext()) { 430 former.writeSequence(in.nextSequence(), ps); 431 } 432 } 433 434 /** 435 * Writes a single Sequence to an OutputStream in SwissProt format. 436 * @param os the OutputStream. 437 * @param seq the Sequence. 438 * @throws org.biojava.bio.BioException if the <CODE>Sequence</CODE> cannot be written to SwissProt format 439 * @throws IOException if there was an error while writing. 440 */ 441 public static void writeSwissprot(OutputStream os, Sequence seq) 442 throws IOException, BioException { 443 writeSwissprot(os, new SingleSeqIterator(seq)); 444 } 445 446 /** 447 * Writes a stream of Sequences to an OutputStream in Genpept 448 * format. 449 * @param os the OutputStream. 450 * @param in a SequenceIterator. 451 * @throws org.biojava.bio.BioException if the <CODE>Sequence</CODE> cannot be written to Genpept format 452 * @exception IOException if there was an error while writing. 453 */ 454 public static void writeGenpept(OutputStream os, SequenceIterator in) 455 throws IOException, BioException { 456 SequenceFormat former = new GenpeptFormat(); 457 PrintStream ps = new PrintStream(os); 458 while (in.hasNext()) { 459 former.writeSequence(in.nextSequence(), ps); 460 } 461 } 462 463 /** 464 * Writes a single Sequence to an OutputStream in Genpept format. 465 * @param os the OutputStream. 466 * @param seq the Sequence. 467 * @throws org.biojava.bio.BioException if the <CODE>Sequence</CODE> cannot be written to Genpept format 468 * @throws IOException if there was an error while writing. 469 */ 470 public static void writeGenpept(OutputStream os, Sequence seq) 471 throws IOException, BioException { 472 writeGenpept(os, new SingleSeqIterator(seq)); 473 } 474 475 /** 476 * Writes a stream of Sequences to an OutputStream in Genbank 477 * format. 478 * 479 * @param os the OutputStream. 480 * @param in a SequenceIterator. 481 * @exception IOException if there was an error while writing. 482 */ 483 public static void writeGenbank(OutputStream os, SequenceIterator in) 484 throws IOException { 485 StreamWriter sw = new StreamWriter(os, new GenbankFormat()); 486 sw.writeStream(in); 487 } 488 489 /** 490 * Writes a single Sequence to an OutputStream in Genbank format. 491 * 492 * @param os the OutputStream. 493 * @param seq the Sequence. 494 * @throws IOException if there was an error while writing. 495 */ 496 public static void writeGenbank(OutputStream os, Sequence seq) 497 throws IOException { 498 writeGenbank(os, new SingleSeqIterator(seq)); 499 } 500 501 /** 502 * <code>identifyFormat</code> performs a case-insensitive mapping 503 * of a pair of common sequence format name (such as 'embl', 504 * 'genbank' or 'fasta') and alphabet name (such as 'dna', 'rna', 505 * 'protein', 'aa') to an integer. The value returned will be one 506 * of the public static final fields in 507 * <code>SeqIOConstants</code>, or a bitwise-or combination of 508 * them. The method will reject known illegal combinations of 509 * format and alphabet (such as swissprot + dna) by throwing an 510 * <code>IllegalArgumentException</code>. It will return the 511 * <code>SeqIOConstants.UNKNOWN</code> value when either format or 512 * alphabet are unknown. 513 * 514 * @param formatName a <code>String</code>. 515 * @param alphabetName a <code>String</code>. 516 * 517 * @return an <code>int</code>. 518 */ 519 public static int identifyFormat(String formatName, String alphabetName) { 520 int format, alpha; 521 if (formatName.equalsIgnoreCase("raw")) { 522 format = SeqIOConstants.RAW; 523 } 524 else if (formatName.equalsIgnoreCase("fasta")) { 525 format = SeqIOConstants.FASTA; 526 } 527 else if (formatName.equalsIgnoreCase("nbrf")) { 528 format = SeqIOConstants.NBRF; 529 } 530 else if (formatName.equalsIgnoreCase("ig")) { 531 format = SeqIOConstants.IG; 532 } 533 else if (formatName.equalsIgnoreCase("embl")) { 534 format = SeqIOConstants.EMBL; 535 } 536 else if (formatName.equalsIgnoreCase("swissprot") || 537 formatName.equalsIgnoreCase("swiss")) { 538 if (alphabetName.equalsIgnoreCase("aa") || 539 alphabetName.equalsIgnoreCase("protein")) { 540 return SeqIOConstants.SWISSPROT; 541 } else { 542 throw new IllegalArgumentException("Illegal format and alphabet " 543 + "combination " 544 + formatName 545 + " + " 546 + alphabetName); 547 } 548 } else if (formatName.equalsIgnoreCase("genbank")) { 549 format = SeqIOConstants.GENBANK; 550 } else if (formatName.equalsIgnoreCase("genpept")) { 551 if (alphabetName.equalsIgnoreCase("aa") || 552 alphabetName.equalsIgnoreCase("protein")) { 553 return SeqIOConstants.GENPEPT; 554 } else { 555 throw new IllegalArgumentException("Illegal format and alphabet " 556 + "combination " 557 + formatName 558 + " + " 559 + alphabetName); 560 } 561 } else if (formatName.equalsIgnoreCase("refseq")) { 562 format = SeqIOConstants.REFSEQ; 563 } else if (formatName.equalsIgnoreCase("gcg")) { 564 format = SeqIOConstants.GCG; 565 } else if (formatName.equalsIgnoreCase("gff")) { 566 format = SeqIOConstants.GFF; 567 } 568 else if (formatName.equalsIgnoreCase("pdb")) { 569 if (alphabetName.equalsIgnoreCase("aa") || 570 alphabetName.equalsIgnoreCase("protein")) { 571 return SeqIOConstants.PDB; 572 } else { 573 throw new IllegalArgumentException("Illegal format and alphabet " 574 + "combination " 575 + formatName 576 + " + " 577 + alphabetName); 578 } 579 } else if (formatName.equalsIgnoreCase("phred")) { 580 if (alphabetName.equalsIgnoreCase("dna")) { 581 return SeqIOConstants.PHRED; 582 } else { 583 throw new IllegalArgumentException("Illegal format and alphabet " 584 + "combination " 585 + formatName 586 + " + " 587 + alphabetName); 588 } 589 } else if (formatName.equalsIgnoreCase("clustal")) { 590 format = AlignIOConstants.CLUSTAL; 591 } else if (formatName.equalsIgnoreCase("msf")) { 592 format = AlignIOConstants.MSF; 593 } 594 else { 595 return SeqIOConstants.UNKNOWN; 596 } 597 598 if (alphabetName.equalsIgnoreCase("dna")) { 599 alpha = SeqIOConstants.DNA; 600 } else if (alphabetName.equalsIgnoreCase("rna")) { 601 alpha = SeqIOConstants.RNA; 602 } else if (alphabetName.equalsIgnoreCase("aa") || 603 alphabetName.equalsIgnoreCase("protein")) { 604 alpha = SeqIOConstants.AA; 605 } else { 606 return SeqIOConstants.UNKNOWN; 607 } 608 609 return (format | alpha); 610 } 611 612 /** 613 * <code>getSequenceFormat</code> accepts a value which represents 614 * a sequence format and returns the relevant 615 * <code>SequenceFormat</code> object. 616 * 617 * @param identifier an <code>int</code> which represents a binary 618 * value with bits set according to the scheme described in 619 * <code>SeqIOConstants</code>. 620 * 621 * @return a <code>SequenceFormat</code>. 622 * 623 * @exception BioException if an error occurs. 624 */ 625 public static SequenceFormat getSequenceFormat(int identifier) 626 throws BioException { 627 628 // Mask the sequence format bytes 629 int alphaType = identifier & (~ 0xffff); 630 if (alphaType == 0) 631 throw new IllegalArgumentException("No alphabet was set in the identifier"); 632 633 // Mask alphabet bytes 634 int formatType = identifier & (~ 0xffff0000); 635 if (formatType == 0) 636 throw new IllegalArgumentException("No format was set in the identifier"); 637 638 switch (identifier) { 639 case SeqIOConstants.FASTA_DNA: 640 case SeqIOConstants.FASTA_RNA: 641 case SeqIOConstants.FASTA_AA: 642 return new FastaFormat(); 643 case SeqIOConstants.EMBL_DNA: 644 case SeqIOConstants.EMBL_RNA: 645 return new EmblLikeFormat(); 646 case SeqIOConstants.GENBANK_DNA: 647 case SeqIOConstants.GENBANK_RNA: 648 return new GenbankFormat(); 649 case SeqIOConstants.SWISSPROT: 650 return new EmblLikeFormat(); 651 default: 652 throw new BioException("No SequenceFormat available for " 653 + "format/alphabet identifier '" 654 + identifier 655 + "'"); 656 } 657 } 658 659 /** 660 * <code>getBuilderFactory</code> accepts a value which represents 661 * a sequence format and returns the relevant 662 * <code>SequenceBuilderFactory</code> object. 663 * 664 * @param identifier an <code>int</code> which represents a binary 665 * value with bits set according to the scheme described in 666 * <code>SeqIOConstants</code>. 667 * 668 * @return a <code>SequenceBuilderFactory</code>. 669 * 670 * @exception BioException if an error occurs. 671 */ 672 public static SequenceBuilderFactory getBuilderFactory(int identifier) 673 throws BioException { 674 675 // Mask the sequence format bytes 676 int alphaType = identifier & (~ 0xffff); 677 if (alphaType == 0) 678 throw new IllegalArgumentException("No alphabet was set in the identifier"); 679 680 // Mask alphabet bytes 681 int formatType = identifier & (~ 0xffff0000); 682 if (formatType == 0) 683 throw new IllegalArgumentException("No format was set in the identifier"); 684 685 switch (identifier) { 686 case SeqIOConstants.FASTA_DNA: 687 case SeqIOConstants.FASTA_RNA: 688 case SeqIOConstants.FASTA_AA: 689 return getFastaBuilderFactory(); 690 case SeqIOConstants.EMBL_DNA: 691 return getEmblBuilderFactory(); 692 case SeqIOConstants.GENBANK_DNA: 693 return getGenbankBuilderFactory(); 694 case SeqIOConstants.SWISSPROT: 695 return getSwissprotBuilderFactory(); 696 case SeqIOConstants.GENPEPT: 697 return getGenpeptBuilderFactory(); 698 default: 699 throw new BioException("No SequenceBuilderFactory available for " 700 + "format/alphabet identifier '" 701 + identifier 702 + "'"); 703 } 704 } 705 706 /** 707 * <code>getAlphabet</code> accepts a value which represents a 708 * sequence format and returns the relevant 709 * <code>FiniteAlphabet</code> object. 710 * 711 * @param identifier an <code>int</code> which represents a binary 712 * value with bits set according to the scheme described in 713 * <code>SeqIOConstants</code>. 714 * 715 * @return a <code>FiniteAlphabet</code>. 716 * 717 * @exception BioException if an error occurs. 718 */ 719 public static FiniteAlphabet getAlphabet(int identifier) 720 throws BioException { 721 722 // Mask the sequence format bytes 723 int alphaType = identifier & (~ 0xffff); 724 if (alphaType == 0) 725 throw new IllegalArgumentException("No alphabet was set in the identifier"); 726 727 switch (alphaType) { 728 case SeqIOConstants.DNA: 729 return DNATools.getDNA(); 730 case SeqIOConstants.RNA: 731 return RNATools.getRNA(); 732 case SeqIOConstants.AA: 733 return ProteinTools.getTAlphabet(); 734 default: 735 throw new BioException("No FiniteAlphabet available for " 736 + "alphabet identifier '" 737 + identifier 738 + "'"); 739 } 740 } 741 742 // 743 // The following methods provide an alternate interface for 744 // reading and writing sequences and alignments. (Nimesh Singh). 745 // 746 // 747 748 /** 749 * Attempts to guess the filetype of a file given the name. For 750 * use with the functions below that take an int fileType as a 751 * parameter. EMBL and Genbank files are assumed to contain DNA 752 * sequence. 753 * @deprecated because there is no standard file naming convention 754 * and guessing by file name is inherantly error prone and bad. 755 * @param seqFile the <CODE>File</CODE> to read from. 756 * @throws java.io.IOException if <CODE>seqFile</CODE> cannot be read 757 * @throws java.io.FileNotFoundException if <CODE>seqFile</CODE> cannot be found 758 * @return a value that describes the file type. 759 */ 760 public static int guessFileType(File seqFile) 761 throws IOException, FileNotFoundException { 762 //First tries by matching an extension 763 String fileName = seqFile.getName(); 764 try { 765 if (Pattern.matches(".*\\u002eem.*", fileName)) { 766 return SeqIOConstants.EMBL_DNA; 767 } 768 else if (Pattern.matches(".*\\u002edat.*", fileName)) { 769 return SeqIOConstants.EMBL_DNA; 770 } 771 else if (Pattern.matches(".*\\u002egb.*", fileName)) { 772 return SeqIOConstants.GENBANK_DNA; 773 } 774 else if (Pattern.matches(".*\\u002esp.*", fileName)) { 775 return SeqIOConstants.SWISSPROT; 776 } 777 else if (Pattern.matches(".*\\u002egp.*", fileName)) { 778 return SeqIOConstants.GENPEPT; 779 } 780 else if (Pattern.matches(".*\\u002efa.*", fileName)) { 781 return guessFastaType(seqFile); 782 } 783 else if (Pattern.matches(".*\\u002emsf.*", fileName)) { 784 return guessMsfType(seqFile); 785 } 786 } catch (PatternSyntaxException e) { 787 throw new BioError("Internal error in SeqIOTools", e); 788 } 789 790 //Reads the file to guess based on content 791 BufferedReader br = new BufferedReader(new FileReader(seqFile)); 792 String line1 = br.readLine(); 793 br.close(); 794 795 if (line1.startsWith(">")) { 796 return guessFastaType(seqFile); 797 } 798 else if (line1.startsWith("PileUp")) { 799 return guessMsfType(seqFile); 800 } 801 else if (line1.startsWith("!!AA_MULTIPLE_ALIGNMENT")) { 802 return AlignIOConstants.MSF_AA; 803 } 804 else if (line1.startsWith("!!NA_MULTIPLE_ALIGNMENT")) { 805 return AlignIOConstants.MSF_DNA; 806 } 807 else if (line1.startsWith("ID")) { 808 for (int i = 0; i < line1.length(); i++) { 809 if (Character.toUpperCase(line1.charAt(i)) == 'P' && 810 Character.toUpperCase(line1.charAt(i+1)) == 'R' && 811 Character.toUpperCase(line1.charAt(i+2)) == 'T') { 812 return SeqIOConstants.SWISSPROT; 813 } 814 } 815 return SeqIOConstants.EMBL_DNA; 816 } 817 else if (line1.toUpperCase().startsWith("LOCUS")) { 818 for (int i = 0; i < line1.length(); i++) { 819 if (Character.toUpperCase(line1.charAt(i)) == 'A' && 820 Character.toUpperCase(line1.charAt(i+1)) == 'A') { 821 return SeqIOConstants.GENPEPT; 822 } 823 } 824 return SeqIOConstants.GENBANK_DNA; 825 } 826 else if (line1.length() >= 45 && 827 line1.substring(19, 45).equalsIgnoreCase("GENETIC SEQUENCE DATA BANK")) { 828 return guessGenType(fileName); 829 } 830 else { 831 return SeqIOConstants.UNKNOWN; 832 } 833 } 834 835 /** 836 * Attempts to retrieve the most appropriate 837 * <code>SequenceBuilder</code> object for some combination of 838 * <code>Alphabet</code> and <code>SequenceFormat</code> 839 * 840 * @param format currently supports <code>FastaFormat</code>, 841 * <code>GenbankFormat</code>, <code>EmblLikeFormat</code> 842 * @param alpha currently only supports the DNA and Protein 843 * alphabets 844 * 845 * @return the <code>SequenceBuilderFactory</code> 846 * 847 * @throws BioException if the combination of alpha and format is 848 * unrecognized. 849 * 850 * @deprecated as this essentially duplicates the operation 851 * available in the method <code>identifyBuilderFactory</code>. 852 */ 853 public static SequenceBuilderFactory formatToFactory(SequenceFormat format, 854 Alphabet alpha) 855 throws BioException { 856 857 if ((format instanceof FastaFormat) && 858 (alpha == DNATools.getDNA() || 859 alpha == ProteinTools.getAlphabet())) { 860 861 return getFastaBuilderFactory(); 862 } 863 else if (format instanceof GenbankFormat && 864 alpha == DNATools.getDNA()) { 865 866 return getGenbankBuilderFactory(); 867 } 868 else if (format instanceof GenbankFormat && 869 alpha == ProteinTools.getAlphabet()) { 870 return getGenpeptBuilderFactory(); 871 } 872 else if (format instanceof EmblLikeFormat && 873 alpha == DNATools.getDNA()){ 874 return getEmblBuilderFactory(); 875 } 876 else if (format instanceof EmblLikeFormat && 877 alpha == ProteinTools.getAlphabet()) { 878 return getSwissprotBuilderFactory(); 879 } 880 else { 881 throw new BioException("Unknown combination of" 882 + " Alphabet and Format"); 883 } 884 } 885 886 /** 887 * Reads a file with the specified format and alphabet 888 * @param formatName the name of the format eg genbank or 889 * swissprot (case insensitive) 890 * @param alphabetName the name of the alphabet eg dna or rna or 891 * protein (case insensitive) 892 * @param br a BufferedReader for the input 893 * @return either an Alignment object or a SequenceIterator 894 * (depending on the format read) 895 * @throws BioException if an error occurs while reading or a 896 * unrecognized format, alphabet combination is used (eg swissprot 897 * and DNA). 898 * 899 * @since 1.3 900 */ 901 public static Object fileToBiojava(String formatName, 902 String alphabetName, 903 BufferedReader br) 904 throws BioException { 905 906 int fileType = identifyFormat(formatName, alphabetName); 907 908 return fileToBiojava(fileType, br); 909 } 910 911 /** 912 * Reads a file and returns the corresponding Biojava object. You 913 * need to cast it as an Alignment or a SequenceIterator as 914 * appropriate. 915 * @param fileType a value that describes the file type 916 * @param br the reader for the input 917 * @throws org.biojava.bio.BioException if the file cannot be parsed 918 * @return either a <code>SequenceIterator</code> if the file type is a 919 * sequence file, or a <code>Alignment</code> if the file is a sequence 920 * alignment. 921 */ 922 public static Object fileToBiojava(int fileType, BufferedReader br) 923 throws BioException { 924 925 // Mask the sequence format bytes 926 int alphaType = fileType & (~ 0xffff); 927 if (alphaType == 0) 928 throw new IllegalArgumentException("No alphabet was set in the identifier"); 929 930 // Mask alphabet bytes 931 int formatType = fileType & (~ 0xffff0000); 932 if (formatType == 0) 933 throw new IllegalArgumentException("No format was set in the identifier"); 934 935 switch (fileType) { 936 case AlignIOConstants.MSF_DNA: 937 case AlignIOConstants.MSF_AA: 938 case AlignIOConstants.FASTA_DNA: 939 case AlignIOConstants.FASTA_AA: 940 return fileToAlign(fileType, br); 941 case SeqIOConstants.FASTA_DNA: 942 case SeqIOConstants.FASTA_AA: 943 case SeqIOConstants.EMBL_DNA: 944 case SeqIOConstants.GENBANK_DNA: 945 case SeqIOConstants.SWISSPROT: 946 case SeqIOConstants.GENPEPT: 947 return fileToSeq(fileType, br); 948 default: 949 throw new BioException("Unknown file type '" 950 + fileType 951 + "'"); 952 } 953 } 954 955 /** 956 * Writes a Biojava <code>SequenceIterator</code>, 957 * <code>SequenceDB</code>, <code>Sequence</code> or <code>Aligment</code> 958 * to an <code>OutputStream</code> 959 * 960 * @param formatName eg fasta, GenBank (case insensitive) 961 * @param alphabetName eg DNA, RNA (case insensititve) 962 * @param os where to write to 963 * @param biojava the object to write 964 * @throws BioException problems getting data from the biojava object. 965 * @throws IOException if there are IO problems 966 * @throws IllegalSymbolException a Symbol cannot be parsed 967 */ 968 public static void biojavaToFile(String formatName, String alphabetName, 969 OutputStream os, Object biojava) 970 throws BioException, IOException, IllegalSymbolException{ 971 int fileType = identifyFormat(formatName,alphabetName); 972 biojavaToFile(fileType, os, biojava); 973 } 974 975 /** 976 * Converts a Biojava object to the given filetype. 977 * @param fileType a value that describes the type of sequence file 978 * @param os the stream to write the formatted results to 979 * @param biojava a <code>SequenceIterator</code>, <code>SequenceDB</code>, 980 * <code>Sequence</code>, or <code>Alignment</code> 981 * @throws org.biojava.bio.BioException if <code>biojava</code> cannot be 982 * converted to that format. 983 * @throws java.io.IOException if the output cannot be written to 984 * <code>os</code> 985 * @throws org.biojava.bio.symbol.IllegalSymbolException if <code>biojava 986 * </code> contains a <code>Symbol</code> that cannot be understood by the 987 * parser. 988 */ 989 public static void biojavaToFile(int fileType, OutputStream os, 990 Object biojava) 991 throws BioException, IOException, IllegalSymbolException { 992 switch (fileType) { 993 case AlignIOConstants.MSF_DNA: 994 case AlignIOConstants.MSF_AA: 995 case AlignIOConstants.FASTA_DNA: 996 case AlignIOConstants.FASTA_AA: 997 alignToFile(fileType, os, (Alignment) biojava); 998 break; 999 case SeqIOConstants.FASTA_DNA: 1000 case SeqIOConstants.FASTA_AA: 1001 case SeqIOConstants.EMBL_DNA: 1002 case SeqIOConstants.GENBANK_DNA: 1003 case SeqIOConstants.SWISSPROT: 1004 case SeqIOConstants.GENPEPT: 1005 if(biojava instanceof SequenceDB){ 1006 seqToFile(fileType, os, ((SequenceDB)biojava).sequenceIterator()); 1007 }else if(biojava instanceof Sequence){ 1008 seqToFile(fileType, os, new SingleSeqIterator((Sequence)biojava)); 1009 }else{ 1010 seqToFile(fileType, os, (SequenceIterator) biojava); 1011 } 1012 break; 1013 default: 1014 throw new BioException("Unknown file type '" 1015 + fileType 1016 + "'"); 1017 } 1018 } 1019 1020 /** 1021 * Helper function for guessFileName. 1022 */ 1023 private static int guessFastaType(File seqFile) 1024 throws IOException, FileNotFoundException { 1025 BufferedReader br = new BufferedReader(new FileReader(seqFile)); 1026 String line = br.readLine(); 1027 line = br.readLine(); 1028 br.close(); 1029 for (int i = 0; i < line.length(); i++) { 1030 if (Character.toUpperCase(line.charAt(i)) == 'F' || 1031 Character.toUpperCase(line.charAt(i)) == 'L' || 1032 Character.toUpperCase(line.charAt(i)) == 'I' || 1033 Character.toUpperCase(line.charAt(i)) == 'P' || 1034 Character.toUpperCase(line.charAt(i)) == 'Q' || 1035 Character.toUpperCase(line.charAt(i)) == 'E') { 1036 return SeqIOConstants.FASTA_AA; 1037 } 1038 } 1039 1040 return SeqIOConstants.FASTA_DNA; 1041 } 1042 1043 private static SymbolTokenization getDNAParser() { 1044 try { 1045 return DNATools.getDNA().getTokenization("token"); 1046 } catch (BioException ex) { 1047 throw new BioError("Assertion failing:" 1048 + " Couldn't get DNA token parser",ex); 1049 } 1050 } 1051 1052 private static SymbolTokenization getRNAParser() { 1053 try { 1054 return RNATools.getRNA().getTokenization("token"); 1055 } catch (BioException ex) { 1056 throw new BioError("Assertion failing:" 1057 + " Couldn't get RNA token parser",ex); 1058 } 1059 } 1060 1061 private static SymbolTokenization getNucleotideParser() { 1062 try { 1063 return NucleotideTools.getNucleotide().getTokenization("token"); 1064 } catch (BioException ex) { 1065 throw new BioError("Assertion failing:" 1066 + " Couldn't get nucleotide token parser",ex); 1067 } 1068 } 1069 1070 private static SymbolTokenization getProteinParser() { 1071 try { 1072 return ProteinTools.getTAlphabet().getTokenization("token"); 1073 } catch (BioException ex) { 1074 throw new BioError("Assertion failing:" 1075 + " Couldn't get PROTEIN token parser",ex); 1076 } 1077 } 1078 1079 /** 1080 * Helper function for guessFileName. 1081 */ 1082 private static int guessMsfType(File seqFile) 1083 throws IOException, FileNotFoundException { 1084 BufferedReader br = new BufferedReader(new FileReader(seqFile)); 1085 String line = br.readLine(); 1086 if (line.startsWith("!!NA_MULTIPLE_ALIGNMENT")) { 1087 return AlignIOConstants.MSF_DNA; 1088 } 1089 else if (line.startsWith("!!AA_MULTIPLE_ALIGNMENT")) { 1090 return AlignIOConstants.MSF_AA; 1091 } 1092 else { 1093 while (line.indexOf("Type: ") == -1) { 1094 line = br.readLine(); 1095 } 1096 br.close(); 1097 int typeIndex = line.indexOf("Type: ") + 6; 1098 if (line.substring(typeIndex).startsWith("N")) { 1099 return AlignIOConstants.MSF_DNA; 1100 } 1101 else if (line.substring(typeIndex).startsWith("P")) { 1102 return AlignIOConstants.MSF_AA; 1103 } 1104 else { 1105 return AlignIOConstants.UNKNOWN; 1106 } 1107 } 1108 } 1109 1110 /** 1111 * Helper function for guessFileName. 1112 */ 1113 private static int guessGenType(String fileName) 1114 throws IOException, FileNotFoundException { 1115 BufferedReader br = new BufferedReader(new FileReader(fileName)); 1116 String line = br.readLine(); 1117 while (line.indexOf("LOCUS") == -1) { 1118 line = br.readLine(); 1119 } 1120 br.close(); 1121 for (int i = 0; i < line.length(); i++) { 1122 if (Character.toUpperCase(line.charAt(i)) == 'A' && 1123 Character.toUpperCase(line.charAt(i+1)) == 'A') { 1124 return SeqIOConstants.GENPEPT; 1125 } 1126 } 1127 return SeqIOConstants.GENBANK_DNA; 1128 } 1129 1130 /** 1131 * Converts a file to an Biojava alignment. 1132 */ 1133 private static Alignment fileToAlign(int fileType, BufferedReader br) 1134 throws BioException { 1135 switch(fileType) { 1136 case AlignIOConstants.MSF_DNA: 1137 case AlignIOConstants.MSF_AA: 1138 return (new MSFAlignmentFormat()).read(br); 1139 case AlignIOConstants.FASTA_DNA: 1140 case AlignIOConstants.FASTA_AA: 1141 return (new FastaAlignmentFormat()).read(br); 1142 default: 1143 throw new BioException("Unknown file type '" 1144 + fileType 1145 + "'"); 1146 } 1147 } 1148 1149 /** 1150 * Converts a file to a Biojava sequence. 1151 */ 1152 private static SequenceIterator fileToSeq(int fileType, 1153 BufferedReader br) 1154 throws BioException { 1155 switch (fileType) { 1156 case SeqIOConstants.FASTA_DNA: 1157 return SeqIOTools.readFastaDNA(br); 1158 case SeqIOConstants.FASTA_AA: 1159 return SeqIOTools.readFastaProtein(br); 1160 case SeqIOConstants.EMBL_DNA: 1161 return SeqIOTools.readEmbl(br); 1162 case SeqIOConstants.GENBANK_DNA: 1163 return SeqIOTools.readGenbank(br); 1164 case SeqIOConstants.SWISSPROT: 1165 return SeqIOTools.readSwissprot(br); 1166 case SeqIOConstants.GENPEPT: 1167 return SeqIOTools.readGenpept(br); 1168 default: 1169 throw new BioException("Unknown file type '" 1170 + fileType 1171 + "'"); 1172 } 1173 } 1174 1175 /** 1176 * Converts a Biojava alignment to the given filetype. 1177 */ 1178 private static void alignToFile(int fileType, OutputStream os, 1179 Alignment align) 1180 throws BioException, IllegalSymbolException { 1181 switch(fileType) { 1182 case AlignIOConstants.MSF_DNA: 1183 (new MSFAlignmentFormat()).writeDna(os, align); 1184 break; 1185 case AlignIOConstants.MSF_AA: 1186 (new MSFAlignmentFormat()).writeProtein(os, align); 1187 break; 1188 case AlignIOConstants.FASTA_DNA: 1189 (new FastaAlignmentFormat()).writeDna(os, align); 1190 break; 1191 case AlignIOConstants.FASTA_AA: 1192 (new FastaAlignmentFormat()).writeProtein(os, align); 1193 break; 1194 default: 1195 throw new BioException("Unknown file type '" 1196 + fileType 1197 + "'"); 1198 } 1199 } 1200 1201 /** 1202 * Converts a Biojava sequence to the given filetype. 1203 */ 1204 private static void seqToFile(int fileType, OutputStream os, 1205 SequenceIterator seq) 1206 throws IOException, BioException { 1207 switch (fileType) { 1208 case SeqIOConstants.FASTA_DNA: 1209 case SeqIOConstants.FASTA_AA: 1210 SeqIOTools.writeFasta(os, seq); 1211 break; 1212 case SeqIOConstants.EMBL_DNA: 1213 SeqIOTools.writeEmbl(os, seq); 1214 break; 1215 case SeqIOConstants.SWISSPROT: 1216 SeqIOTools.writeSwissprot(os, seq); 1217 break; 1218 case SeqIOConstants.GENBANK_DNA: 1219 SeqIOTools.writeGenbank(os, seq); 1220 break; 1221 case SeqIOConstants.GENPEPT: 1222 SeqIOTools.writeGenpept(os, seq); 1223 break; 1224 default: 1225 throw new BioException("Unknown file type '" 1226 + fileType 1227 + "'"); 1228 } 1229 } 1230 1231 private static final class SingleSeqIterator 1232 implements SequenceIterator { 1233 private Sequence seq; 1234 SingleSeqIterator(Sequence seq) { 1235 this.seq = seq; 1236 } 1237 1238 public boolean hasNext() { 1239 return seq != null; 1240 } 1241 1242 public Sequence nextSequence() { 1243 Sequence seq = this.seq; 1244 this.seq = null; 1245 return seq; 1246 } 1247 } 1248}