001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.program.sax.blastxml;
023
024import org.biojava.bio.seq.io.game.ElementRecognizer;
025import org.biojava.utils.stax.DelegationManager;
026import org.biojava.utils.stax.StAXContentHandler;
027import org.xml.sax.Attributes;
028import org.xml.sax.SAXException;
029import org.xml.sax.helpers.AttributesImpl;
030import org.xml.sax.helpers.DefaultHandler;
031
032/**
033 * This class parses NCBI Blast XML output.
034 * <p>
035 * It has two modes:-
036 * i) single output document mode: this takes a document
037 * containing a single BlastOutput element and parses it.
038 * This is generated when a single query is searched against
039 * a sequence database.
040 * <p>
041 * ii) multiple query document mode: unfortunately, NCBI
042 * BLAST concatenates the results of multiple searches in
043 * one file.  This leads to an ill-formed document that violates
044 * every XML format known to the human race and other nearby 
045 * civilisations.  This parser will take a bowdlerised version of
046 * this output that is wrapped in a blast_aggregate element.
047 * <p>
048 * The massaged form is generated by stripping the XML element and
049 * DOCTYPE elements and wrapping all the classes in a single
050 * blast_aggregate element.  In Linux, this can be done with:-
051 * <pre>
052 * #!/bin/sh
053 * # Converts a Blast XML output to something vaguely well-formed
054 * # for parsing.
055 * # Use: blast_aggregate <XML output> <editted file>
056 *
057 * # strips all &lt;?xml&gt; and &lt;!DOCTYPE&gt; tags
058 * # encapsulates the multiple &lt;BlastOutput&gt; elements into &lt;blast_aggregator&gt;
059 *
060 * sed '/&gt;?xml/d' $1 | sed '/&lt;!DOCTYPE/d' | sed '1i\
061 * &lt;blast_aggregate&gt;
062 * $a\
063 * &lt;/blast_aggregate&gt;' > $2
064 *</pre>
065
066 * @author David Huen
067 */
068public class BlastXMLParser
069    extends StAXFeatureHandler
070{
071    boolean firstTime = true;
072
073    // constructor
074    public BlastXMLParser()
075    {
076        // this is the base element class
077        this.staxenv = this;
078//        System.out.println("staxenv " + staxenv);
079        // just set a DefaultHandler: does nothing worthwhile.
080        this.listener = new DefaultHandler();
081    }
082
083    /**
084     * sets the ContentHandler for this object
085     */
086    public void setContentHandler(org.xml.sax.ContentHandler listener)
087    {
088        this.listener = listener;
089    }
090
091    /**
092     * we override the superclass startElement method so we can determine the
093     * the start tag type and use it to set up delegation for the superclass.
094     */
095    public void startElement(
096            String nsURI,
097            String localName,
098            String qName,
099            Attributes attrs,
100            DelegationManager dm)
101        throws SAXException
102    {
103//        System.out.println("localName is " + localName);
104        if (firstTime) {
105            // what kind of tag do we have?
106            if (localName.equals("BlastOutput")) {
107                // this is a well-formed XML document from NCBI BLAST
108                // pertaining to one search result
109                super.addHandler(
110                    new ElementRecognizer.ByLocalName("BlastOutput"),
111                    new StAXHandlerFactory() {
112                        public StAXContentHandler getHandler(StAXFeatureHandler staxenv) {
113                            return new BlastOutputHandler(staxenv);
114                        }
115                    }
116                );
117            }
118            else if (localName.equals("blast_aggregate")) {
119                // this is my phony aggregate document that exists to
120                // legitimise otherwise ill-formed output from NCBI Blast
121                super.addHandler(new ElementRecognizer.ByLocalName("blast_aggregate"),
122                    new StAXHandlerFactory() {
123                        public StAXContentHandler getHandler(StAXFeatureHandler staxenv) {
124                            return new BlastAggregator(staxenv);
125                        }
126                    }
127                );
128            }
129            else {
130                throw new SAXException("illegal element " + localName);
131            }
132
133            firstTime = false;
134
135            // setup the root element of the output
136            AttributesImpl bldscAttrs = new AttributesImpl();
137            bldscAttrs.addAttribute("", "xmlns", "xmlns", CDATA, "");
138            bldscAttrs.addAttribute(biojavaUri, "biojava", "xmlns:biojava", CDATA, "http://www.biojava.org");
139            listener.startElement(biojavaUri, "BlastLikeDataSetCollection", biojavaUri + ":BlastLikeDataSetCollection", bldscAttrs);
140        }
141
142        // now invoke delegation
143//        super.startElement(nsURI, localName, qName, attrs, dm);
144
145        level++;
146
147        // perform delegation
148        // we must delegate only on features that are directly attached.
149        // if I do not check that that's so, any element of a kind I delegate
150        // on will be detected any depth within unrecognized tags.
151        if (level == 1) {
152//        System.out.println("StaxFeaturehandler.startElement starting. localName: " + localName + " " + level);
153            for (int i = handlers.size() - 1; i >= 0; --i) {
154                Binding b = (Binding) handlers.get(i);
155                if (b.recognizer.filterStartElement(nsURI, localName, qName, attrs)) {
156                    dm.delegate(b.handlerFactory.getHandler(this));
157                    return;
158                }
159            }
160        }
161
162        // call the element specific handler now.
163        // remember that if we we have a delegation failure we pass here too!
164        if (level == 1) {
165            startElementHandler(nsURI, localName, qName, attrs);
166        }
167    }
168
169    public void endElementHandler(
170            String nsURI,
171            String localName,
172            String qName,
173            StAXContentHandler handler)
174             throws SAXException
175    {
176        listener.endElement(biojavaUri, "BlastLikeDataSetCollection", biojavaUri + ":BlastLikeDataSetCollection");
177    }
178}