001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.seq.io;
023
024import java.io.BufferedReader;
025import java.io.File;
026import java.io.FileNotFoundException;
027import java.io.FileReader;
028import java.io.IOException;
029import java.io.InputStream;
030import java.io.OutputStream;
031import java.io.PrintStream;
032import java.util.regex.Pattern;
033import java.util.regex.PatternSyntaxException;
034
035import org.biojava.bio.BioError;
036import org.biojava.bio.BioException;
037import org.biojava.bio.alignment.Alignment;
038import org.biojava.bio.seq.DNATools;
039import org.biojava.bio.seq.NucleotideTools;
040import org.biojava.bio.seq.ProteinTools;
041import org.biojava.bio.seq.RNATools;
042import org.biojava.bio.seq.Sequence;
043import org.biojava.bio.seq.SequenceIterator;
044import org.biojava.bio.seq.db.HashSequenceDB;
045import org.biojava.bio.seq.db.IDMaker;
046import org.biojava.bio.seq.db.SequenceDB;
047import org.biojava.bio.symbol.Alphabet;
048import org.biojava.bio.symbol.FiniteAlphabet;
049import org.biojava.bio.symbol.IllegalSymbolException;
050import org.biojava.utils.AssertionFailure;
051import org.biojava.utils.ChangeVetoException;
052
053/**
054 * A set of convenience methods for handling common file formats.
055 *
056 * @author Thomas Down
057 * @author Mark Schreiber
058 * @author Nimesh Singh
059 * @author Matthew Pocock
060 * @author Keith James
061 * @since 1.1
062 * @deprecated use org.biojavax.bio.seq.RichSequence.IOTools
063 */
064public final class SeqIOTools  {
    // Cached builder factories, one per supported file format. Each starts
    // out null and is assigned on first use by the matching
    // get...BuilderFactory() method of this class.
    private static SequenceBuilderFactory _emblBuilderFactory;
    private static SequenceBuilderFactory _genbankBuilderFactory;
    private static SequenceBuilderFactory _genpeptBuilderFactory;
    private static SequenceBuilderFactory _swissprotBuilderFactory;
    private static SequenceBuilderFactory _fastaBuilderFactory;
070
    /**
     * Private constructor: this class is a holder for static convenience
     * methods and can't be instantiated.
     */
    private SeqIOTools() {
    }
076
077    /**
078     * Get a default SequenceBuilderFactory for handling EMBL
079     * files.
080     * @return a <CODE>SmartSequenceBuilder.FACTORY</CODE>
081     */
082    public static SequenceBuilderFactory getEmblBuilderFactory() {
083        if (_emblBuilderFactory == null) {
084            _emblBuilderFactory =
085                new EmblProcessor.Factory(SmartSequenceBuilder.FACTORY);
086        }
087        return _emblBuilderFactory;
088    }
089
090    /**
091     * Iterate over the sequences in an EMBL-format stream.
092     * @param br A reader for the EMBL source or file
093     * @return a <CODE>SequenceIterator</CODE> that iterates over each
094     * <CODE>Sequence</CODE> in the file
095     */
096    public static SequenceIterator readEmbl(BufferedReader br) {
097        return new StreamReader(br,
098                                new EmblLikeFormat(),
099                                getDNAParser(),
100                                getEmblBuilderFactory());
101    }
102
103    /**
104     * Iterate over the sequences in an EMBL-format stream, but for RNA.
105     * @param br A reader for the EMBL source or file
106     * @return a <CODE>SequenceIterator</CODE> that iterates over each
107     * <CODE>Sequence</CODE> in the file
108     */
109    public static SequenceIterator readEmblRNA(BufferedReader br) {
110        return new StreamReader(br,
111                                new EmblLikeFormat(),
112                                getRNAParser(),
113                                getEmblBuilderFactory());
114    }
115
116    /**
117     * Iterate over the sequences in an EMBL-format stream.
118     * @param br A reader for the EMBL source or file
119     * @return a <CODE>SequenceIterator</CODE> that iterates over each
120     * <CODE>Sequence</CODE> in the file
121     */
122    public static SequenceIterator readEmblNucleotide(BufferedReader br) {
123        return new StreamReader(br,
124                                new EmblLikeFormat(),
125                                getNucleotideParser(),
126                                getEmblBuilderFactory());
127    }
128
129    /**
130     * Get a default SequenceBuilderFactory for handling GenBank
131     * files.
132     * @return a <code>SmartSequenceBuilder.FACTORY</code>
133     */
134    public static SequenceBuilderFactory getGenbankBuilderFactory() {
135        if (_genbankBuilderFactory == null) {
136            _genbankBuilderFactory =
137                new GenbankProcessor.Factory(SmartSequenceBuilder.FACTORY);
138        }
139        return _genbankBuilderFactory;
140    }
141
142    /**
143     * Iterate over the sequences in an Genbank-format stream.
144     * @param br A reader for the Genbank source or file
145     * @return a <CODE>SequenceIterator</CODE> that iterates over each
146     * <CODE>Sequence</CODE> in the file
147     */
148    public static SequenceIterator readGenbank(BufferedReader br) {
149        return new StreamReader(br,
150                                new GenbankFormat(),
151                                getDNAParser(),
152                                getGenbankBuilderFactory());
153    }
154    
155    /**
156     * Iterate over the sequences in an GenbankXML-format stream.
157     * @param br A reader for the GenbanXML source or file
158     * @return a <CODE>SequenceIterator</CODE> that iterates over each
159     * <CODE>Sequence</CODE> in the file
160     */    
161    public static SequenceIterator readGenbankXml( BufferedReader br )
162    {
163        return new StreamReader( br,
164                                 new GenbankXmlFormat(),
165                                 getDNAParser(),
166                                 getGenbankBuilderFactory() );
167    }
168
169    /**
170    * Get a default SequenceBuilderFactory for handling Genpept
171    * files.
172    * @return a <code>SmartSequenceBuilder.FACTORY</code>
173    */
174    public static SequenceBuilderFactory getGenpeptBuilderFactory() {
175        if (_genpeptBuilderFactory == null) {
176            _genpeptBuilderFactory =
177                new GenbankProcessor.Factory(SmartSequenceBuilder.FACTORY);
178        }
179        return _genpeptBuilderFactory;
180    }
181
182    /**
183    * Iterate over the sequences in an Genpept-format stream.
184    * @param br A reader for the Genpept source or file
185    * @return a <CODE>SequenceIterator</CODE> that iterates over each
186    * <CODE>Sequence</CODE> in the file
187    */
188    public static SequenceIterator readGenpept(BufferedReader br) {
189        return new StreamReader(br,
190                                new GenbankFormat(),
191                                getProteinParser(),
192                                getGenpeptBuilderFactory());
193    }
194
195    /**
196     * Get a default SequenceBuilderFactory for handling Swissprot
197     * files.
198     * @return a <code>SmartSequenceBuilder.FACTORY</code>
199     */
200    public static SequenceBuilderFactory getSwissprotBuilderFactory() {
201        if (_swissprotBuilderFactory == null) {
202            _swissprotBuilderFactory =
203                new SwissprotProcessor.Factory(SmartSequenceBuilder.FACTORY);
204        }
205        return _swissprotBuilderFactory;
206    }
207
208    /**
209     * Iterate over the sequences in an Swissprot-format stream.
210    * @param br A reader for the Swissprot source or file
211    * @return a <CODE>SequenceIterator</CODE> that iterates over each
212    * <CODE>Sequence</CODE> in the file     
213     */
214    public static SequenceIterator readSwissprot(BufferedReader br) {
215        return new StreamReader(br,
216                                new EmblLikeFormat(),
217                                getProteinParser(),
218                                getSwissprotBuilderFactory());
219    }
220
221    /**
222     * Get a default SequenceBuilderFactory for handling FASTA
223     * files.
224     * @return a <code>SmartSequenceBuilder.FACTORY</code>
225     */
226    public static SequenceBuilderFactory getFastaBuilderFactory() {
227        if (_fastaBuilderFactory == null) {
228            _fastaBuilderFactory = new FastaDescriptionLineParser.Factory(
229                        SmartSequenceBuilder.FACTORY);
230        }
231        return _fastaBuilderFactory;
232    }
233
234    /**
235     * Read a fasta file.
236     *
237     * @param br    the BufferedReader to read data from
238     * @param sTok  a SymbolTokenization that understands the sequences
239     * @return      a SequenceIterator over each sequence in the fasta file
240     */
241    public static SequenceIterator readFasta(
242            BufferedReader br, SymbolTokenization sTok)
243    {
244      return new StreamReader(br,
245                              new FastaFormat(),
246                              sTok,
247                              getFastaBuilderFactory());
248    }
249
250    /**
251     * Read a fasta file using a custom type of SymbolList. For example,
252     * use SmartSequenceBuilder.FACTORY to emulate readFasta(BufferedReader,
253     * SymbolTokenization) and SmartSequenceBuilder.BIT_PACKED to force all
254     * symbols to be encoded using bit-packing.
255     * @param br the BufferedReader to read data from
256     * @param sTok a SymbolTokenization that understands the sequences
257     * @param seqFactory a factory used to build a SymbolList
258     * @return a <CODE>SequenceIterator</CODE> that iterates over each
259     * <CODE>Sequence</CODE> in the file
260     */
261    public static SequenceIterator readFasta(
262            BufferedReader br,
263            SymbolTokenization sTok,
264            SequenceBuilderFactory seqFactory)
265    {
266      return new StreamReader(
267              br,
268              new FastaFormat(),
269              sTok,
270              new FastaDescriptionLineParser.Factory(seqFactory));
271    }
272
273    /**
274     * Iterate over the sequences in an FASTA-format stream of DNA sequences.
275     * @param br the BufferedReader to read data from
276     * @return a <CODE>SequenceIterator</CODE> that iterates over each
277     * <CODE>Sequence</CODE> in the file
278     */
279    public static SequenceIterator readFastaDNA(BufferedReader br) {
280        return new StreamReader(br,
281                                new FastaFormat(),
282                                getDNAParser(),
283                                getFastaBuilderFactory());
284    }
285
286    /**
287     * Iterate over the sequences in an FASTA-format stream of RNA sequences.
288     * @param br the BufferedReader to read data from
289     * @return a <CODE>SequenceIterator</CODE> that iterates over each
290     * <CODE>Sequence</CODE> in the file
291     */
292    public static SequenceIterator readFastaRNA(BufferedReader br) {
293        return new StreamReader(br,
294                                new FastaFormat(),
295                                getRNAParser(),
296                                getFastaBuilderFactory());
297    }
298
299    /**
300     * Iterate over the sequences in an FASTA-format stream of Protein sequences.
301     * @param br the BufferedReader to read data from
302     * @return a <CODE>SequenceIterator</CODE> that iterates over each
303     * <CODE>Sequence</CODE> in the file
304     */
305    public static SequenceIterator readFastaProtein(BufferedReader br) {
306        return new StreamReader(br,
307                                new FastaFormat(),
308                                getProteinParser(),
309                                getFastaBuilderFactory());
310    }
311
312    /**
313     * Create a sequence database from a fasta file provided as an
314     * input stream.  Note this somewhat duplicates functionality in
315     * the readFastaDNA and readFastaProtein methods but uses a stream
316     * rather than a reader and returns a SequenceDB rather than a
317     * SequenceIterator. If the returned DB is likely to be large then
318     * the above mentioned methods should be used.
319     * @return a <code>SequenceDB</code> containing all the <code>Sequences</code>
320     * in the file.
321     * @since 1.2
322     * @param seqFile The file containg the fasta formatted sequences
323     * @param alpha The <code>Alphabet</code> of the sequence, ie DNA, RNA etc
324     * @throws BioException if problems occur during reading of the
325     * stream.
326     */
327    public static SequenceDB readFasta(InputStream seqFile, Alphabet alpha)
328        throws BioException {
329        HashSequenceDB db = new HashSequenceDB(IDMaker.byName);
330        SequenceBuilderFactory sbFact =
331            new FastaDescriptionLineParser.Factory(SmartSequenceBuilder.FACTORY);
332        FastaFormat fFormat = new FastaFormat();
333        for (SequenceIterator seqI = new StreamReader(seqFile,
334                                                      fFormat,
335                                                      alpha.getTokenization("token"),
336                                                      sbFact);seqI.hasNext();) {
337            Sequence seq = seqI.nextSequence();
338            try {
339                db.addSequence(seq);
340            } catch (ChangeVetoException cve) {
341                throw new AssertionFailure(
342                  "Could not successfully add sequence "
343                  + seq.getName()
344                  + " to sequence database",
345                  cve);
346            }
347        }
348        return db;
349    }
350
351    /**
352     * Write a sequenceDB to an output stream in fasta format.
353     * @since 1.2
354     * @param os the stream to write the fasta formatted data to.
355     * @param db the database of <code>Sequence</code>s to write
356     * @throws IOException if there was an error while writing.
357     */
358    public static void writeFasta(OutputStream os, SequenceDB db)
359        throws IOException {
360        StreamWriter sw = new StreamWriter(os,new FastaFormat());
361        sw.writeStream(db.sequenceIterator());
362    }
363
364    /**
365     * Writes sequences from a SequenceIterator to an OutputStream in
366     * Fasta Format.  This makes for a useful format filter where a
367     * StreamReader can be sent to the StreamWriter after formatting.
368     * 
369     * @since 1.2
370     * @param os The stream to write fasta formatted data to
371     * @param in The source of input <code>Sequences</code>
372     * @throws IOException if there was an error while writing.
373     */
374    public static void writeFasta(OutputStream os, SequenceIterator in)
375        throws IOException {
376        StreamWriter sw = new StreamWriter(os,new FastaFormat());
377        sw.writeStream(in);
378    }
379
380    /**
381     * Writes a single Sequence to an OutputStream in Fasta format.
382     *
383     * @param os  the OutputStream.
384     * @param seq  the Sequence.
385     * @throws IOException if there was an error while writing.
386     */
387    public static void writeFasta(OutputStream os, Sequence seq)
388        throws IOException {
389        writeFasta(os, new SingleSeqIterator(seq));
390    }
391
392    /**
393     * Writes a stream of Sequences to an OutputStream in EMBL format.
394     *
395     * @param os the OutputStream.
396     * @param in a SequenceIterator.
397     * @exception IOException if there was an error while writing.
398     */
399    public static void writeEmbl(OutputStream os, SequenceIterator in)
400        throws IOException {
401        StreamWriter sw = new StreamWriter(os, new EmblLikeFormat());
402        sw.writeStream(in);
403    }
404
405    /**
406     * Writes a single Sequence to an OutputStream in EMBL format.
407     *
408     * @param os  the OutputStream.
409     * @param seq  the Sequence.
410     * @throws IOException if there was an error while writing.
411     */
412    public static void writeEmbl(OutputStream os, Sequence seq) throws IOException {
413        writeEmbl(os, new SingleSeqIterator(seq));
414    }
415
416    /**
417     * Writes a stream of Sequences to an OutputStream in SwissProt
418     * format.
419     * @param os the OutputStream.
420     * @param in a SequenceIterator.
421     * @throws org.biojava.bio.BioException if the <CODE>Sequence</CODE> cannot be converted to SwissProt
422     * format
423     * @exception IOException if there was an error while writing.
424     */
425    public static void writeSwissprot(OutputStream os, SequenceIterator in)
426        throws IOException, BioException {
427        SequenceFormat former = new EmblLikeFormat();
428        PrintStream ps = new PrintStream(os);
429        while (in.hasNext()) {
430            former.writeSequence(in.nextSequence(), ps);
431        }
432    }
433
434    /**
435     * Writes a single Sequence to an OutputStream in SwissProt format.
436     * @param os the OutputStream.
437     * @param seq the Sequence.
438     * @throws org.biojava.bio.BioException if the <CODE>Sequence</CODE> cannot be written to SwissProt format
439     * @throws IOException if there was an error while writing.
440     */
441    public static void writeSwissprot(OutputStream os, Sequence seq)
442        throws IOException, BioException {
443        writeSwissprot(os, new SingleSeqIterator(seq));
444    }
445
446    /**
447     * Writes a stream of Sequences to an OutputStream in Genpept
448     * format.
449     * @param os the OutputStream.
450     * @param in a SequenceIterator.
451     * @throws org.biojava.bio.BioException if the <CODE>Sequence</CODE> cannot be written to Genpept format
452     * @exception IOException if there was an error while writing.
453     */
454    public static void writeGenpept(OutputStream os, SequenceIterator in)
455        throws IOException, BioException {
456        SequenceFormat former = new GenpeptFormat();
457        PrintStream ps = new PrintStream(os);
458        while (in.hasNext()) {
459            former.writeSequence(in.nextSequence(), ps);
460        }
461    }
462
463    /**
464     * Writes a single Sequence to an OutputStream in Genpept format.
465     * @param os the OutputStream.
466     * @param seq the Sequence.
467     * @throws org.biojava.bio.BioException if the <CODE>Sequence</CODE> cannot be written to Genpept format
468     * @throws IOException if there was an error while writing.
469     */
470    public static void writeGenpept(OutputStream os, Sequence seq)
471    throws IOException, BioException {
472      writeGenpept(os, new SingleSeqIterator(seq));
473    }
474
475    /**
476     * Writes a stream of Sequences to an OutputStream in Genbank
477     * format.
478     *
479     * @param os the OutputStream.
480     * @param in a SequenceIterator.
481     * @exception IOException if there was an error while writing.
482     */
483    public static void writeGenbank(OutputStream os, SequenceIterator in)
484        throws IOException {
485        StreamWriter sw = new StreamWriter(os, new GenbankFormat());
486        sw.writeStream(in);
487    }
488
489    /**
490     * Writes a single Sequence to an OutputStream in Genbank format.
491     *
492     * @param os  the OutputStream.
493     * @param seq  the Sequence.
494     * @throws IOException if there was an error while writing.
495     */
496    public static void writeGenbank(OutputStream os, Sequence seq)
497        throws IOException {
498        writeGenbank(os, new SingleSeqIterator(seq));
499    }
500
501   /**
502     * <code>identifyFormat</code> performs a case-insensitive mapping
503     * of a pair of common sequence format name (such as 'embl',
504     * 'genbank' or 'fasta') and alphabet name (such as 'dna', 'rna',
505     * 'protein', 'aa') to an integer. The value returned will be one
506     * of the public static final fields in
507     * <code>SeqIOConstants</code>, or a bitwise-or combination of
508     * them. The method will reject known illegal combinations of
509     * format and alphabet (such as swissprot + dna) by throwing an
510     * <code>IllegalArgumentException</code>. It will return the
511     * <code>SeqIOConstants.UNKNOWN</code> value when either format or
512     * alphabet are unknown.
513     *
514     * @param formatName a <code>String</code>.
515     * @param alphabetName a <code>String</code>.
516     *
517     * @return an <code>int</code>.
518     */
519    public static int identifyFormat(String formatName, String alphabetName) {
520        int format, alpha;
521        if (formatName.equalsIgnoreCase("raw")) {
522            format = SeqIOConstants.RAW;
523        }
524        else if (formatName.equalsIgnoreCase("fasta")) {
525            format = SeqIOConstants.FASTA;
526        }
527        else if (formatName.equalsIgnoreCase("nbrf")) {
528            format = SeqIOConstants.NBRF;
529        }
530        else if (formatName.equalsIgnoreCase("ig")) {
531            format = SeqIOConstants.IG;
532        }
533        else if (formatName.equalsIgnoreCase("embl")) {
534            format = SeqIOConstants.EMBL;
535        }
536        else if (formatName.equalsIgnoreCase("swissprot") ||
537                 formatName.equalsIgnoreCase("swiss")) {
538            if (alphabetName.equalsIgnoreCase("aa") ||
539                alphabetName.equalsIgnoreCase("protein")) {
540                return SeqIOConstants.SWISSPROT;
541            } else {
542                throw new IllegalArgumentException("Illegal format and alphabet "
543                                                   + "combination "
544                                                   + formatName
545                                                   + " + "
546                                                   + alphabetName);
547            }
548        } else if (formatName.equalsIgnoreCase("genbank")) {
549            format = SeqIOConstants.GENBANK;
550        } else if (formatName.equalsIgnoreCase("genpept")) {
551            if (alphabetName.equalsIgnoreCase("aa") ||
552                alphabetName.equalsIgnoreCase("protein")) {
553                return SeqIOConstants.GENPEPT;
554            } else {
555                throw new IllegalArgumentException("Illegal format and alphabet "
556                                                   + "combination "
557                                                   + formatName
558                                                   + " + "
559                                                   + alphabetName);
560            }
561        } else if (formatName.equalsIgnoreCase("refseq")) {
562            format = SeqIOConstants.REFSEQ;
563        } else if (formatName.equalsIgnoreCase("gcg")) {
564            format = SeqIOConstants.GCG;
565        } else if (formatName.equalsIgnoreCase("gff")) {
566            format = SeqIOConstants.GFF;
567        }
568        else if (formatName.equalsIgnoreCase("pdb")) {
569            if (alphabetName.equalsIgnoreCase("aa") ||
570                alphabetName.equalsIgnoreCase("protein")) {
571                return SeqIOConstants.PDB;
572            } else {
573                throw new IllegalArgumentException("Illegal format and alphabet "
574                                                   + "combination "
575                                                   + formatName
576                                                   + " + "
577                                                   + alphabetName);
578            }
579        } else if (formatName.equalsIgnoreCase("phred")) {
580            if (alphabetName.equalsIgnoreCase("dna")) {
581                return SeqIOConstants.PHRED;
582            } else {
583                throw new IllegalArgumentException("Illegal format and alphabet "
584                                                   + "combination "
585                                                   + formatName
586                                                   + " + "
587                                                   + alphabetName);
588            }
589        } else if (formatName.equalsIgnoreCase("clustal")) {
590            format = AlignIOConstants.CLUSTAL;
591        } else if (formatName.equalsIgnoreCase("msf")) {
592            format = AlignIOConstants.MSF;
593        }
594        else {
595            return SeqIOConstants.UNKNOWN;
596        }
597
598        if (alphabetName.equalsIgnoreCase("dna")) {
599            alpha = SeqIOConstants.DNA;
600        } else if (alphabetName.equalsIgnoreCase("rna")) {
601            alpha = SeqIOConstants.RNA;
602        } else if (alphabetName.equalsIgnoreCase("aa") ||
603                 alphabetName.equalsIgnoreCase("protein")) {
604            alpha = SeqIOConstants.AA;
605        } else {
606            return SeqIOConstants.UNKNOWN;
607        }
608
609        return (format | alpha);
610    }
611
612    /**
613     * <code>getSequenceFormat</code> accepts a value which represents
614     * a sequence format and returns the relevant
615     * <code>SequenceFormat</code> object.
616     *
617     * @param identifier an <code>int</code> which represents a binary
618     * value with bits set according to the scheme described in
619     * <code>SeqIOConstants</code>.
620     *
621     * @return a <code>SequenceFormat</code>.
622     *
623     * @exception BioException if an error occurs.
624     */
625    public static SequenceFormat getSequenceFormat(int identifier)
626        throws BioException {
627
628        // Mask the sequence format bytes
629        int alphaType = identifier & (~ 0xffff);
630        if (alphaType == 0)
631            throw new IllegalArgumentException("No alphabet was set in the identifier");
632
633        // Mask alphabet bytes
634        int formatType = identifier & (~ 0xffff0000);
635        if (formatType == 0)
636            throw new IllegalArgumentException("No format was set in the identifier");
637
638        switch (identifier) {
639            case SeqIOConstants.FASTA_DNA:
640            case SeqIOConstants.FASTA_RNA:
641            case SeqIOConstants.FASTA_AA:
642                return new FastaFormat();
643            case SeqIOConstants.EMBL_DNA:
644            case SeqIOConstants.EMBL_RNA:
645                return new EmblLikeFormat();
646            case SeqIOConstants.GENBANK_DNA:
647            case SeqIOConstants.GENBANK_RNA:
648                return new GenbankFormat();
649            case SeqIOConstants.SWISSPROT:
650                return new EmblLikeFormat();
651            default:
652                throw new BioException("No SequenceFormat available for "
653                                       + "format/alphabet identifier '"
654                                       + identifier
655                                       + "'");
656        }
657    }
658
659    /**
660     * <code>getBuilderFactory</code> accepts a value which represents
661     * a sequence format and returns the relevant
662     * <code>SequenceBuilderFactory</code> object.
663     *
664     * @param identifier an <code>int</code> which represents a binary
665     * value with bits set according to the scheme described in
666     * <code>SeqIOConstants</code>.
667     *
668     * @return a <code>SequenceBuilderFactory</code>.
669     *
670     * @exception BioException if an error occurs.
671     */
672    public static SequenceBuilderFactory getBuilderFactory(int identifier)
673        throws BioException {
674
675        // Mask the sequence format bytes
676        int alphaType = identifier & (~ 0xffff);
677        if (alphaType == 0)
678            throw new IllegalArgumentException("No alphabet was set in the identifier");
679
680        // Mask alphabet bytes
681        int formatType = identifier & (~ 0xffff0000);
682        if (formatType == 0)
683            throw new IllegalArgumentException("No format was set in the identifier");
684
685        switch (identifier) {
686            case SeqIOConstants.FASTA_DNA:
687            case SeqIOConstants.FASTA_RNA:
688            case SeqIOConstants.FASTA_AA:
689                return getFastaBuilderFactory();
690            case SeqIOConstants.EMBL_DNA:
691                    return getEmblBuilderFactory();
692            case SeqIOConstants.GENBANK_DNA:
693                return getGenbankBuilderFactory();
694            case SeqIOConstants.SWISSPROT:
695                return getSwissprotBuilderFactory();
696            case SeqIOConstants.GENPEPT:
697                return getGenpeptBuilderFactory();
698            default:
699                throw new BioException("No SequenceBuilderFactory available for "
700                                       + "format/alphabet identifier '"
701                                       + identifier
702                                       + "'");
703        }
704    }
705
706    /**
707     * <code>getAlphabet</code> accepts a value which represents a
708     * sequence format and returns the relevant
709     * <code>FiniteAlphabet</code> object.
710     *
711     * @param identifier an <code>int</code> which represents a binary
712     * value with bits set according to the scheme described in
713     * <code>SeqIOConstants</code>.
714     *
715     * @return a <code>FiniteAlphabet</code>.
716     *
717     * @exception BioException if an error occurs.
718     */
719    public static FiniteAlphabet getAlphabet(int identifier)
720        throws BioException {
721
722        // Mask the sequence format bytes
723        int alphaType = identifier & (~ 0xffff);
724        if (alphaType == 0)
725            throw new IllegalArgumentException("No alphabet was set in the identifier");
726
727        switch (alphaType) {
728            case SeqIOConstants.DNA:
729                return DNATools.getDNA();
730            case SeqIOConstants.RNA:
731                return RNATools.getRNA();
732            case SeqIOConstants.AA:
733                return ProteinTools.getTAlphabet();
734            default:
735                throw new BioException("No FiniteAlphabet available for "
736                                       + "alphabet identifier '"
737                                       + identifier
738                                       + "'");
739        }
740    }
741
742    //
743    // The following methods provide an alternate interface for
744    // reading and writing sequences and alignments. (Nimesh Singh).
745    //
746    //
747
748    /**
749     * Attempts to guess the filetype of a file given the name.  For
750     * use with the functions below that take an int fileType as a
751     * parameter. EMBL and Genbank files are assumed to contain DNA
752     * sequence.
753     * @deprecated because there is no standard file naming convention
754     * and guessing by file name is inherantly error prone and bad.
755     * @param seqFile the <CODE>File</CODE> to read from.
756     * @throws java.io.IOException if <CODE>seqFile</CODE> cannot be read
757     * @throws java.io.FileNotFoundException if <CODE>seqFile</CODE> cannot be found
758     * @return a value that describes the file type.
759     */
760    public static int guessFileType(File seqFile)
761        throws IOException, FileNotFoundException {
762        //First tries by matching an extension
763        String fileName = seqFile.getName();
764        try {
765            if (Pattern.matches(".*\\u002eem.*", fileName)) {
766                return SeqIOConstants.EMBL_DNA;
767            }
768            else if (Pattern.matches(".*\\u002edat.*", fileName)) {
769                return SeqIOConstants.EMBL_DNA;
770            }
771            else if (Pattern.matches(".*\\u002egb.*", fileName)) {
772                return SeqIOConstants.GENBANK_DNA;
773            }
774            else if (Pattern.matches(".*\\u002esp.*", fileName)) {
775                return SeqIOConstants.SWISSPROT;
776            }
777            else if (Pattern.matches(".*\\u002egp.*", fileName)) {
778                return SeqIOConstants.GENPEPT;
779            }
780            else if (Pattern.matches(".*\\u002efa.*", fileName)) {
781                return guessFastaType(seqFile);
782            }
783            else if (Pattern.matches(".*\\u002emsf.*", fileName)) {
784                return guessMsfType(seqFile);
785            }
786        } catch (PatternSyntaxException e) {
787            throw new BioError("Internal error in SeqIOTools", e);
788        }
789
790        //Reads the file to guess based on content
791        BufferedReader br = new BufferedReader(new FileReader(seqFile));
792        String line1 = br.readLine();
793        br.close();
794
795        if (line1.startsWith(">")) {
796            return guessFastaType(seqFile);
797        }
798        else if (line1.startsWith("PileUp")) {
799            return guessMsfType(seqFile);
800        }
801        else if (line1.startsWith("!!AA_MULTIPLE_ALIGNMENT")) {
802            return AlignIOConstants.MSF_AA;
803        }
804        else if (line1.startsWith("!!NA_MULTIPLE_ALIGNMENT")) {
805            return AlignIOConstants.MSF_DNA;
806        }
807        else if (line1.startsWith("ID")) {
808            for (int i = 0; i < line1.length(); i++) {
809                if (Character.toUpperCase(line1.charAt(i)) == 'P' &&
810                    Character.toUpperCase(line1.charAt(i+1)) == 'R' &&
811                    Character.toUpperCase(line1.charAt(i+2)) == 'T') {
812                    return SeqIOConstants.SWISSPROT;
813                }
814            }
815            return SeqIOConstants.EMBL_DNA;
816        }
817        else if (line1.toUpperCase().startsWith("LOCUS")) {
818            for (int i = 0; i < line1.length(); i++) {
819                if (Character.toUpperCase(line1.charAt(i)) == 'A' &&
820                    Character.toUpperCase(line1.charAt(i+1)) == 'A') {
821                    return SeqIOConstants.GENPEPT;
822                }
823            }
824            return SeqIOConstants.GENBANK_DNA;
825        }
826        else if (line1.length() >= 45 &&
827                 line1.substring(19, 45).equalsIgnoreCase("GENETIC SEQUENCE DATA BANK")) {
828            return guessGenType(fileName);
829        }
830        else {
831            return SeqIOConstants.UNKNOWN;
832        }
833    }
834
835    /**
836     * Attempts to retrieve the most appropriate
837     * <code>SequenceBuilder</code> object for some combination of
838     * <code>Alphabet</code> and <code>SequenceFormat</code>
839     *
840     * @param format currently supports <code>FastaFormat</code>,
841     * <code>GenbankFormat</code>, <code>EmblLikeFormat</code>
842     * @param alpha currently only supports the DNA and Protein
843     * alphabets
844     *
845     * @return the <code>SequenceBuilderFactory</code>
846     *
847     * @throws BioException if the combination of alpha and format is
848     * unrecognized.
849     *
850     * @deprecated as this essentially duplicates the operation
851     * available in the method <code>identifyBuilderFactory</code>.
852     */
853    public static SequenceBuilderFactory formatToFactory(SequenceFormat format,
854                                                         Alphabet alpha)
855        throws BioException {
856
857        if ((format instanceof FastaFormat) &&
858           (alpha == DNATools.getDNA() ||
859            alpha == ProteinTools.getAlphabet())) {
860
861            return getFastaBuilderFactory();
862        }
863        else if (format instanceof GenbankFormat &&
864                alpha == DNATools.getDNA()) {
865
866            return getGenbankBuilderFactory();
867        }
868        else if (format instanceof GenbankFormat &&
869                 alpha == ProteinTools.getAlphabet()) {
870            return getGenpeptBuilderFactory();
871        }
872        else if (format instanceof EmblLikeFormat &&
873                 alpha == DNATools.getDNA()){
874            return getEmblBuilderFactory();
875        }
876        else if (format instanceof EmblLikeFormat &&
877                 alpha == ProteinTools.getAlphabet()) {
878            return getSwissprotBuilderFactory();
879        }
880        else {
881            throw new BioException("Unknown combination of"
882                                   + " Alphabet and Format");
883      }
884    }
885
886    /**
887     * Reads a file with the specified format and alphabet
888     * @param formatName the name of the format eg genbank or
889     * swissprot (case insensitive)
890     * @param alphabetName the name of the alphabet eg dna or rna or
891     * protein (case insensitive)
892     * @param br a BufferedReader for the input
893     * @return either an Alignment object or a SequenceIterator
894     * (depending on the format read)
     * @throws BioException if an error occurs while reading or an
     * unrecognized format/alphabet combination is used (eg swissprot
     * and DNA).
898     *
899     * @since 1.3
900     */
901    public static Object fileToBiojava(String formatName,
902                                       String alphabetName,
903                                       BufferedReader br)
904        throws BioException {
905
906        int fileType = identifyFormat(formatName, alphabetName);
907
908        return fileToBiojava(fileType, br);
909    }
910
911    /**
912     * Reads a file and returns the corresponding Biojava object. You
913     * need to cast it as an Alignment or a SequenceIterator as
914     * appropriate.
915     * @param fileType a value that describes the file type
916     * @param br the reader for the input
917     * @throws org.biojava.bio.BioException if the file cannot be parsed
     * @return either a <code>SequenceIterator</code> if the file type is a
     * sequence file, or an <code>Alignment</code> if the file is a sequence
     * alignment.
921     */
922    public static Object fileToBiojava(int fileType, BufferedReader br)
923        throws BioException {
924
925        // Mask the sequence format bytes
926        int alphaType = fileType & (~ 0xffff);
927        if (alphaType == 0)
928            throw new IllegalArgumentException("No alphabet was set in the identifier");
929
930        // Mask alphabet bytes
931        int formatType = fileType & (~ 0xffff0000);
932        if (formatType == 0)
933            throw new IllegalArgumentException("No format was set in the identifier");
934
935        switch (fileType) {
936            case AlignIOConstants.MSF_DNA:
937            case AlignIOConstants.MSF_AA:
938            case AlignIOConstants.FASTA_DNA:
939            case AlignIOConstants.FASTA_AA:
940                return fileToAlign(fileType, br);
941            case SeqIOConstants.FASTA_DNA:
942            case SeqIOConstants.FASTA_AA:
943            case SeqIOConstants.EMBL_DNA:
944            case SeqIOConstants.GENBANK_DNA:
945            case SeqIOConstants.SWISSPROT:
946            case SeqIOConstants.GENPEPT:
947                return fileToSeq(fileType, br);
948            default:
949                throw new BioException("Unknown file type '"
950                                       + fileType
951                                       + "'");
952        }
953    }
954
955    /**
956     * Writes a Biojava <code>SequenceIterator</code>,
     * <code>SequenceDB</code>, <code>Sequence</code> or <code>Alignment</code>
958     * to an <code>OutputStream</code>
959     *
960     * @param formatName eg fasta, GenBank (case insensitive)
     * @param alphabetName eg DNA, RNA (case insensitive)
962     * @param os where to write to
963     * @param biojava the object to write
964     * @throws BioException problems getting data from the biojava object.
965     * @throws IOException if there are IO problems
966     * @throws IllegalSymbolException a Symbol cannot be parsed
967     */
968    public static void biojavaToFile(String formatName, String alphabetName,
969                                     OutputStream os, Object biojava)
970    throws BioException, IOException, IllegalSymbolException{
971      int fileType = identifyFormat(formatName,alphabetName);
972      biojavaToFile(fileType, os, biojava);
973    }
974
975    /**
976     * Converts a Biojava object to the given filetype.
977     * @param fileType a value that describes the type of sequence file
978     * @param os the stream to write the formatted results to
979     * @param biojava a <code>SequenceIterator</code>, <code>SequenceDB</code>, 
980     * <code>Sequence</code>, or <code>Alignment</code>
981     * @throws org.biojava.bio.BioException if <code>biojava</code> cannot be 
982     * converted to that format.
983     * @throws java.io.IOException if the output cannot be written to 
984     * <code>os</code>
985     * @throws org.biojava.bio.symbol.IllegalSymbolException if <code>biojava
986     * </code> contains a <code>Symbol</code> that cannot be understood by the
987     * parser.
988     */
989    public static void biojavaToFile(int fileType, OutputStream os,
990                                     Object biojava)
991        throws BioException, IOException, IllegalSymbolException {
992        switch (fileType) {
993            case AlignIOConstants.MSF_DNA:
994            case AlignIOConstants.MSF_AA:
995            case AlignIOConstants.FASTA_DNA:
996            case AlignIOConstants.FASTA_AA:
997                alignToFile(fileType, os, (Alignment) biojava);
998                break;
999            case SeqIOConstants.FASTA_DNA:
1000            case SeqIOConstants.FASTA_AA:
1001            case SeqIOConstants.EMBL_DNA:
1002            case SeqIOConstants.GENBANK_DNA:
1003            case SeqIOConstants.SWISSPROT:
1004            case SeqIOConstants.GENPEPT:
1005                if(biojava instanceof SequenceDB){
1006                  seqToFile(fileType, os, ((SequenceDB)biojava).sequenceIterator());
1007                }else if(biojava instanceof Sequence){
1008                  seqToFile(fileType, os, new SingleSeqIterator((Sequence)biojava));
1009                }else{
1010                  seqToFile(fileType, os, (SequenceIterator) biojava);
1011                }
1012                break;
1013            default:
1014                throw new BioException("Unknown file type '"
1015                                       + fileType
1016                                       + "'");
1017        }
1018    }
1019
1020    /**
     * Helper function for guessFileType: inspects the second line of the
     * file to choose between FASTA_AA and FASTA_DNA.
1022     */
1023    private static int guessFastaType(File seqFile)
1024        throws IOException, FileNotFoundException {
1025        BufferedReader br = new BufferedReader(new FileReader(seqFile));
1026        String line = br.readLine();
1027        line = br.readLine();
1028        br.close();
1029        for (int i = 0; i < line.length(); i++) {
1030            if (Character.toUpperCase(line.charAt(i)) == 'F' ||
1031                Character.toUpperCase(line.charAt(i)) == 'L' ||
1032                Character.toUpperCase(line.charAt(i)) == 'I' ||
1033                Character.toUpperCase(line.charAt(i)) == 'P' ||
1034                Character.toUpperCase(line.charAt(i)) == 'Q' ||
1035                Character.toUpperCase(line.charAt(i)) == 'E') {
1036                return SeqIOConstants.FASTA_AA;
1037            }
1038        }
1039
1040        return SeqIOConstants.FASTA_DNA;
1041    }
1042
1043    private static SymbolTokenization getDNAParser() {
1044        try {
1045            return DNATools.getDNA().getTokenization("token");
1046        } catch (BioException ex) {
1047            throw new BioError("Assertion failing:"
1048                               + " Couldn't get DNA token parser",ex);
1049        }
1050    }
1051
1052    private static SymbolTokenization getRNAParser() {
1053        try {
1054            return RNATools.getRNA().getTokenization("token");
1055        } catch (BioException ex) {
1056            throw new BioError("Assertion failing:"
1057                               + " Couldn't get RNA token parser",ex);
1058        }
1059    }
1060
1061    private static SymbolTokenization getNucleotideParser() {
1062        try {
1063            return NucleotideTools.getNucleotide().getTokenization("token");
1064        } catch (BioException ex) {
1065            throw new BioError("Assertion failing:"
1066                               + " Couldn't get nucleotide token parser",ex);
1067        }
1068    }
1069
1070    private static SymbolTokenization getProteinParser() {
1071        try {
1072            return ProteinTools.getTAlphabet().getTokenization("token");
1073        } catch (BioException ex) {
1074            throw new BioError("Assertion failing:"
1075                               + " Couldn't get PROTEIN token parser",ex);
1076        }
1077    }
1078
1079    /**
     * Helper function for guessFileType: decides between MSF_DNA and
     * MSF_AA from the first line or from the first "Type: " line.
1081     */
1082    private static int guessMsfType(File seqFile)
1083        throws IOException, FileNotFoundException {
1084        BufferedReader br = new BufferedReader(new FileReader(seqFile));
1085        String line = br.readLine();
1086        if (line.startsWith("!!NA_MULTIPLE_ALIGNMENT")) {
1087            return AlignIOConstants.MSF_DNA;
1088        }
1089        else if (line.startsWith("!!AA_MULTIPLE_ALIGNMENT")) {
1090            return AlignIOConstants.MSF_AA;
1091        }
1092        else {
1093            while (line.indexOf("Type: ") == -1) {
1094                line = br.readLine();
1095            }
1096            br.close();
1097            int typeIndex = line.indexOf("Type: ") + 6;
1098            if (line.substring(typeIndex).startsWith("N")) {
1099                return AlignIOConstants.MSF_DNA;
1100            }
1101            else if (line.substring(typeIndex).startsWith("P")) {
1102                return AlignIOConstants.MSF_AA;
1103            }
1104            else {
1105                return AlignIOConstants.UNKNOWN;
1106            }
1107        }
1108    }
1109
1110    /**
     * Helper function for guessFileType: finds the first line containing
     * LOCUS to choose between GENPEPT and GENBANK_DNA.
1112     */
1113    private static int guessGenType(String fileName)
1114        throws IOException, FileNotFoundException {
1115        BufferedReader br = new BufferedReader(new FileReader(fileName));
1116        String line = br.readLine();
1117        while (line.indexOf("LOCUS") == -1) {
1118            line = br.readLine();
1119        }
1120        br.close();
1121        for (int i = 0; i < line.length(); i++) {
1122            if (Character.toUpperCase(line.charAt(i)) == 'A' &&
1123                Character.toUpperCase(line.charAt(i+1)) == 'A') {
1124                    return SeqIOConstants.GENPEPT;
1125            }
1126        }
1127        return SeqIOConstants.GENBANK_DNA;
1128    }
1129
1130    /**
     * Converts a file to a Biojava alignment.
1132     */
1133    private static Alignment fileToAlign(int fileType, BufferedReader br)
1134        throws BioException {
1135        switch(fileType) {
1136            case AlignIOConstants.MSF_DNA:
1137            case AlignIOConstants.MSF_AA:
1138                return (new MSFAlignmentFormat()).read(br);
1139            case AlignIOConstants.FASTA_DNA:
1140            case AlignIOConstants.FASTA_AA:
1141                return (new FastaAlignmentFormat()).read(br);
1142            default:
1143                throw new BioException("Unknown file type '"
1144                                       + fileType
1145                                       + "'");
1146        }
1147    }
1148
1149    /**
1150     * Converts a file to a Biojava sequence.
1151     */
1152    private static SequenceIterator fileToSeq(int fileType,
1153                                              BufferedReader br)
1154        throws BioException {
1155        switch (fileType) {
1156            case SeqIOConstants.FASTA_DNA:
1157                return SeqIOTools.readFastaDNA(br);
1158            case SeqIOConstants.FASTA_AA:
1159                return SeqIOTools.readFastaProtein(br);
1160            case SeqIOConstants.EMBL_DNA:
1161                return SeqIOTools.readEmbl(br);
1162            case SeqIOConstants.GENBANK_DNA:
1163                return SeqIOTools.readGenbank(br);
1164            case SeqIOConstants.SWISSPROT:
1165                return SeqIOTools.readSwissprot(br);
1166            case SeqIOConstants.GENPEPT:
1167                return SeqIOTools.readGenpept(br);
1168            default:
1169                throw new BioException("Unknown file type '"
1170                                       + fileType
1171                                       + "'");
1172        }
1173    }
1174
1175    /**
1176     * Converts a Biojava alignment to the given filetype.
1177     */
1178    private static void alignToFile(int fileType, OutputStream os,
1179                                    Alignment align)
1180        throws BioException, IllegalSymbolException {
1181        switch(fileType) {
1182            case AlignIOConstants.MSF_DNA:
1183                (new MSFAlignmentFormat()).writeDna(os, align);
1184                break;
1185            case AlignIOConstants.MSF_AA:
1186                (new MSFAlignmentFormat()).writeProtein(os, align);
1187                break;
1188            case AlignIOConstants.FASTA_DNA:
1189                (new FastaAlignmentFormat()).writeDna(os, align);
1190                break;
1191            case AlignIOConstants.FASTA_AA:
1192                (new FastaAlignmentFormat()).writeProtein(os, align);
1193                break;
1194            default:
1195                throw new BioException("Unknown file type '"
1196                                       + fileType
1197                                       + "'");
1198        }
1199    }
1200
1201    /**
1202     * Converts a Biojava sequence to the given filetype.
1203     */
1204    private static void seqToFile(int fileType, OutputStream os,
1205                                  SequenceIterator seq)
1206        throws IOException, BioException {
1207        switch (fileType) {
1208            case SeqIOConstants.FASTA_DNA:
1209            case SeqIOConstants.FASTA_AA:
1210                SeqIOTools.writeFasta(os, seq);
1211                break;
1212            case SeqIOConstants.EMBL_DNA:
1213                SeqIOTools.writeEmbl(os, seq);
1214                break;
1215            case SeqIOConstants.SWISSPROT:
1216                SeqIOTools.writeSwissprot(os, seq);
1217                break;
1218            case SeqIOConstants.GENBANK_DNA:
1219                SeqIOTools.writeGenbank(os, seq);
1220                break;
1221            case SeqIOConstants.GENPEPT:
1222                SeqIOTools.writeGenpept(os, seq);
1223                break;
1224            default:
1225                throw new BioException("Unknown file type '"
1226                                       + fileType
1227                                       + "'");
1228        }
1229    }
1230
1231    private static final class SingleSeqIterator
1232        implements SequenceIterator {
1233        private Sequence seq;
1234        SingleSeqIterator(Sequence seq) {
1235            this.seq = seq;
1236        }
1237
1238        public boolean hasNext() {
1239            return seq != null;
1240        }
1241
1242        public Sequence nextSequence() {
1243            Sequence seq = this.seq;
1244            this.seq = null;
1245            return seq;
1246        }
1247    }
1248}