001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.bio.program.sax; 022 023import java.io.BufferedReader; 024import java.io.IOException; 025 026import org.xml.sax.Attributes; 027import org.xml.sax.InputSource; 028import org.xml.sax.SAXException; 029import org.xml.sax.helpers.AttributesImpl; 030 031/** 032 * A facade class allowing for direct SAX2-like parsing of the native 033 * output from Blast-like bioinformatics software. Because the parser is SAX2 034 * compliant, application writers can simply pass XML ContentHandlers 035 * to the parser in order to receive notifcation of SAX2 events. 036 * <p> 037 * The SAX2 events produced are as if the input to the parser was 038 * an XML file validating against the biojava BlastLikeDataSetCollection DTD. 039 * There is no requirement for an intermediate conversion of native output to 040 * XML format. An application of the parsing framework, however, is to 041 * create XML format files from native output files. 042 * <p> 043 * The biojava Blast-like parsing framework is designed to uses minimal 044 * memory,so that in principle, extremely large native outputs can be 045 * parsed and XML ContentHandlers can listen only for small amounts of 046 * information. 047 * <p> 048 * The framework currently supports parsing of native output from 049 * the following bioinformatics programs. Please note that if 050 * you are using different versions of NCBI or WU Blast to those 051 * listed below, it is worth considering trying setting the parsing 052 * mode to Lazy, which means parsing will be attempted if the program 053 * is recognised, regardless of version. 054 * <ul> 055 * <li>NCBI Blast version 2.0.11 056 * <li>NCBI Blast version 2.2.2 057 * <li>NCBI Blast version 2.2.3 058 * <li>WU-Blast version 2.0a19mp-washu 059 * <li>HMMER 2.1.1 hmmsearch 060 * </ul> 061 * Planned addition support 062 * <ul> 063 * <li> Support for HMMER hmmpfam almost there but not fully tested 064 * </ul> 065 * <p> 066 * <p> 067 * <b>Notes to SAX driver writers</b> 068 * <p> 069 * The framework that this parser is built on is designed to be 070 * extensible with support for both different pieces of software 071 * (<i>i.e.</i> not just software that produces Blast-like output), 072 * and multiple versions of programs. 073 * <p> 074 * This class inherits from the 075 * org.biojava.bio.program.sax.AbstractNativeAppSAXParser 076 * abstract base class. The abstract base class is a good place to 077 * start looking if you want to write new native application SAX2 parsers. 078 * This and releated classes have only package-level visibility. 079 * Typically, application writers are expected to provide a facade class 080 * in this package (similar to the current class) to allow 081 * users access to functionality. 082 * <p> 083 * NB Support for InputSource is not complete due to the fact 084 * that URLs are not resolved and cannot, therefore, be used 085 * as an InputSource. System pathnames, ByteStreams and CharacterStreams, 086 * however, are all supported. 087 * <p> 088 * 089 * Copyright © 2000 Cambridge Antibody Technology. 090 * 091 * <p> 092 * Primary author -<ul> 093 * <li>Simon Brocklehurst (CAT) 094 * </ul> 095 * Other authors -<ul> 096 * <li>Tim Dilks (CAT) 097 * <li>Colin Hardman (CAT) 098 * <li>Stuart Johnston (CAT) 099 * <li>Mathieu Wiepert (Mayo Foundation) 100 * <li>Travis Banks 101 *</ul> 102 * 103 * @author Cambridge Antibody Technology (CAT) 104 * @author Travis Banks 105 * @version 1.0 106 * 107 * @see org.biojava.bio.program.BlastLikeToXMLConverter 108 */ 109public class BlastLikeSAXParser extends AbstractNativeAppSAXParser { 110 111 private BlastLikeVersionSupport oVersion = new BlastLikeVersionSupport(); 112 private BlastSAXParser oBlast; 113 114 private AttributesImpl oAtts = new AttributesImpl(); 115 private QName oAttQName = new QName(this); 116 private boolean tValidFormat = false; 117 118 private static final int STARTUP = 0; 119 private static final int INSIDE_FILE = 1; 120 121 private String oStoredLine = null; 122 123 /** 124 * Initialises SAXParser, and sets default namespace prefix 125 * to "biojava". 126 */ 127 public BlastLikeSAXParser() { 128 this.changeState(STARTUP); 129 130 //centralised setting of namespace prefix 131 //the setting is cascaded everywhere else 132 this.setNamespacePrefix("biojava"); 133 this.addPrefixMapping("biojava","http://www.biojava.org"); 134 135 oVersion.setMode(BlastLikeVersionSupport.LAZY); 136 } 137 138 /** 139 * <code>parse</code> initiates the parsing operation. 140 * 141 * @param poSource an <code>InputSource</code>. 142 * @exception IOException if an error occurs. 143 * @exception SAXException if an error occurs. 144 */ 145 public void parse(InputSource poSource ) 146 throws IOException, SAXException { 147 148 BufferedReader oContents; 149 String oLine; 150 151 this.changeState(STARTUP); 152 153 //Use method form superclass 154 oContents = this.getContentStream(poSource); 155 //This sets contentHandler document for XSLT 156 this.getContentHandler().startDocument(); 157 158 try { 159 // loop over file 160 oLine = oContents.readLine(); 161 while (oLine != null) { 162 //interpret line and send messages accordingly 163 this.interpret(oContents,oLine); 164 //do extra interpretation of lines reached by subparser 165 //objects 166 if (iState == INSIDE_FILE) { 167 oLine = oStoredLine; 168 if (oStoredLine != null) { 169 this.interpret(oContents,oLine); 170 } 171 } else { 172 oLine = oContents.readLine(); 173 } 174 175 } // end while 176 } catch (IOException x) { 177 System.out.println(x.getMessage()); 178 System.out.println("File read interrupted"); 179 } // end try/catch 180 181 //at end of file... 182 oContents.close(); 183 184 if (!tValidFormat) { 185 throw (new SAXException("Could not recognise the format " + 186 "of this file as one supported by the framework.")); 187 } 188 189 this.endElement(new QName(this, 190 this.prefix("BlastLikeDataSetCollection"))); 191 } 192 193 /** 194 * This is the default, parsing will be attempted only if both 195 * the program e.g. NCBI BlastP, and a particular version 196 * are recognised as bsing supported. 197 * 198 */ 199 public void setModeStrict() { 200 oVersion.setMode(BlastLikeVersionSupport.STRICT); 201 } 202 203 /** 204 * Setting the mode to lazy means that, if the program is recognised, 205 * e.g. WU-TBlastX, then parsing will be attempted even if 206 * the particular version is not recognised. Using this option 207 * is more likely to result in erroneous parsing than if the 208 * strict mode is used. 209 * 210 */ 211 public void setModeLazy() { 212 oVersion.setMode(BlastLikeVersionSupport.LAZY); 213 } 214 215 /** 216 * Deal with line according to state parser is in. 217 * 218 * @param poLine A line of Blast output 219 */ 220 private void interpret(BufferedReader poContents, String poLine) 221 throws SAXException { 222 //For a brand new collection, 223 //check for the start of a new BlastDataSet 224 if (iState == STARTUP) { 225 //look for characteristic of start of dataset 226 if (oVersion.isStartOfDataSet(poLine)) { 227 //For GCG, oVersion is set as an indicator to get 228 //program info from the second line 229 if (oVersion.getProgram() == BlastLikeVersionSupport.GCG) { 230 //if GCG, skip to next line to get program info 231 try { 232 poLine = poContents.readLine (); 233 } catch (java.io.IOException x) { 234 System.out.println(x.getMessage()); 235 System.out.println("File read interrupted"); 236 throw (new SAXException("Error parsing GCG File")); 237 } // end try/catch 238 } 239 240 tValidFormat = oVersion.assignProgramAndVersion(poLine); 241 242 if (!oVersion.isSupported()) { 243 throw (new SAXException("Program " 244 + oVersion.getProgramString() 245 + " Version " 246 + oVersion.getVersionString() 247 + " is not supported by the biojava blast-like " 248 + "parsing framework")); 249 } 250 251 oAtts.clear(); 252 oAttQName.setQName("xmlns"); 253 //check if namespace configuration means attribute 254 //should not be reported. 255 if (!oAttQName.getLocalName().equals("")) { 256 oAtts.addAttribute(oAttQName.getURI(), 257 oAttQName.getLocalName(), 258 oAttQName.getQName(), 259 "CDATA",""); 260 } 261 262 oAttQName.setQName("xmlns:biojava"); 263 //check if namespace configuration means attribute 264 //should not be reported. 265 if (!oAttQName.getLocalName().equals("")) { 266 oAtts.addAttribute(oAttQName.getURI(), 267 oAttQName.getLocalName(), 268 oAttQName.getQName(), 269 "CDATA","http://www.biojava.org"); 270 } 271 272 this.startElement(new QName(this, 273 this.prefix("BlastLikeDataSetCollection")), 274 (Attributes)oAtts); 275 276 this.onNewDataSet(poContents,poLine); 277 return; 278 } 279 } //End check for the start of a new BlastDataSet 280 281 if (iState == INSIDE_FILE) { 282 //look for characteristic of start of dataset 283 if (oVersion.isStartOfDataSet(poLine)) { 284 285 tValidFormat = oVersion.assignProgramAndVersion(poLine); 286 287 this.onNewDataSet(poContents,poLine); 288 289 return; 290 } 291 } //End check for the start of a new BlastDataSet 292 } 293 294 /** 295 * 296 * When this method is called, the line will look something line: 297 * 298 * BLASTN 2.0.11 [Jan-20-2000] 299 * 300 * The above would be parsed to program blastn, and version number. 301 * 302 * @param poLine - 303 */ 304 private void onNewDataSet(BufferedReader poContents, String poLine) 305 throws SAXException { 306 307 //choose according to version... 308 oBlast = new BlastSAXParser(oVersion,this.getNamespacePrefix()); 309 String oLine=""; 310 311 //Parse Contents stream up to end of a single BlastDataSet. 312 oBlast.setContentHandler(oHandler); 313 while(oLine!=null) { 314 oLine = oBlast.parse(poContents,poLine); 315 } 316 317 this.changeState(INSIDE_FILE); 318 319 //now make sure to interpret the line the BlastSAXParser returned from 320 //in top-level parse method. 321 if (oLine != null) { 322 oStoredLine = oLine; 323 return; 324 // this.interpret(poContents,oLine); 325 } else { 326 //here if at the EOF 327 oStoredLine = null; 328 return; 329 } 330 } 331} 332 333