001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.program.sax.blastxml; 023 024import org.biojava.bio.seq.io.game.ElementRecognizer; 025import org.biojava.utils.stax.DelegationManager; 026import org.biojava.utils.stax.StAXContentHandler; 027import org.xml.sax.Attributes; 028import org.xml.sax.SAXException; 029import org.xml.sax.helpers.AttributesImpl; 030import org.xml.sax.helpers.DefaultHandler; 031 032/** 033 * This class parses NCBI Blast XML output. 034 * <p> 035 * It has two modes:- 036 * i) single output document mode: this takes a document 037 * containing a single BlastOutput element and parses it. 038 * This is generated when a single query is searched against 039 * a sequence database. 040 * <p> 041 * ii) multiple query document mode: unfortunately, NCBI 042 * BLAST concatenates the results of multiple searches in 043 * one file. This leads to an ill-formed document that violates 044 * every XML format known to the human race and other nearby 045 * civilisations. This parser will take a bowdlerised version of 046 * this output that is wrapped in a blast_aggregate element. 047 * <p> 048 * The massaged form is generated by stripping the XML element and 049 * DOCTYPE elements and wrapping all the classes in a single 050 * blast_aggregate element. In Linux, this can be done with:- 051 * <pre> 052 * #!/bin/sh 053 * # Converts a Blast XML output to something vaguely well-formed 054 * # for parsing. 055 * # Use: blast_aggregate <XML output> <editted file> 056 * 057 * # strips all <?xml> and <!DOCTYPE> tags 058 * # encapsulates the multiple <BlastOutput> elements into <blast_aggregator> 059 * 060 * sed '/>?xml/d' $1 | sed '/<!DOCTYPE/d' | sed '1i\ 061 * <blast_aggregate> 062 * $a\ 063 * </blast_aggregate>' > $2 064 *</pre> 065 066 * @author David Huen 067 */ 068public class BlastXMLParser 069 extends StAXFeatureHandler 070{ 071 boolean firstTime = true; 072 073 // constructor 074 public BlastXMLParser() 075 { 076 // this is the base element class 077 this.staxenv = this; 078// System.out.println("staxenv " + staxenv); 079 // just set a DefaultHandler: does nothing worthwhile. 080 this.listener = new DefaultHandler(); 081 } 082 083 /** 084 * sets the ContentHandler for this object 085 */ 086 public void setContentHandler(org.xml.sax.ContentHandler listener) 087 { 088 this.listener = listener; 089 } 090 091 /** 092 * we override the superclass startElement method so we can determine the 093 * the start tag type and use it to set up delegation for the superclass. 094 */ 095 public void startElement( 096 String nsURI, 097 String localName, 098 String qName, 099 Attributes attrs, 100 DelegationManager dm) 101 throws SAXException 102 { 103// System.out.println("localName is " + localName); 104 if (firstTime) { 105 // what kind of tag do we have? 106 if (localName.equals("BlastOutput")) { 107 // this is a well-formed XML document from NCBI BLAST 108 // pertaining to one search result 109 super.addHandler( 110 new ElementRecognizer.ByLocalName("BlastOutput"), 111 new StAXHandlerFactory() { 112 public StAXContentHandler getHandler(StAXFeatureHandler staxenv) { 113 return new BlastOutputHandler(staxenv); 114 } 115 } 116 ); 117 } 118 else if (localName.equals("blast_aggregate")) { 119 // this is my phony aggregate document that exists to 120 // legitimise otherwise ill-formed output from NCBI Blast 121 super.addHandler(new ElementRecognizer.ByLocalName("blast_aggregate"), 122 new StAXHandlerFactory() { 123 public StAXContentHandler getHandler(StAXFeatureHandler staxenv) { 124 return new BlastAggregator(staxenv); 125 } 126 } 127 ); 128 } 129 else { 130 throw new SAXException("illegal element " + localName); 131 } 132 133 firstTime = false; 134 135 // setup the root element of the output 136 AttributesImpl bldscAttrs = new AttributesImpl(); 137 bldscAttrs.addAttribute("", "xmlns", "xmlns", CDATA, ""); 138 bldscAttrs.addAttribute(biojavaUri, "biojava", "xmlns:biojava", CDATA, "http://www.biojava.org"); 139 listener.startElement(biojavaUri, "BlastLikeDataSetCollection", biojavaUri + ":BlastLikeDataSetCollection", bldscAttrs); 140 } 141 142 // now invoke delegation 143// super.startElement(nsURI, localName, qName, attrs, dm); 144 145 level++; 146 147 // perform delegation 148 // we must delegate only on features that are directly attached. 149 // if I do not check that that's so, any element of a kind I delegate 150 // on will be detected any depth within unrecognized tags. 151 if (level == 1) { 152// System.out.println("StaxFeaturehandler.startElement starting. localName: " + localName + " " + level); 153 for (int i = handlers.size() - 1; i >= 0; --i) { 154 Binding b = (Binding) handlers.get(i); 155 if (b.recognizer.filterStartElement(nsURI, localName, qName, attrs)) { 156 dm.delegate(b.handlerFactory.getHandler(this)); 157 return; 158 } 159 } 160 } 161 162 // call the element specific handler now. 163 // remember that if we we have a delegation failure we pass here too! 164 if (level == 1) { 165 startElementHandler(nsURI, localName, qName, attrs); 166 } 167 } 168 169 public void endElementHandler( 170 String nsURI, 171 String localName, 172 String qName, 173 StAXContentHandler handler) 174 throws SAXException 175 { 176 listener.endElement(biojavaUri, "BlastLikeDataSetCollection", biojavaUri + ":BlastLikeDataSetCollection"); 177 } 178}