001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.bio.program.sax;
022
023import java.io.BufferedReader;
024import java.io.IOException;
025
026import org.xml.sax.Attributes;
027import org.xml.sax.InputSource;
028import org.xml.sax.SAXException;
029import org.xml.sax.helpers.AttributesImpl;
030
031/**
032 * A facade class allowing for direct SAX2-like parsing of the native
033 * output from Blast-like bioinformatics software.  Because the parser is SAX2
034 * compliant, application writers can simply pass XML ContentHandlers
035 * to the parser in order to receive notification of SAX2 events.
036 * <p>
037 * The SAX2 events produced are as if the input to the parser was
038 * an XML file validating against the biojava BlastLikeDataSetCollection DTD.
039 * There is no requirement for an intermediate conversion of native output to
040 * XML format. An application of the parsing framework, however, is to
041 * create XML format files from native output files.
042 * <p>
043 * The biojava Blast-like parsing framework is designed to use minimal
044 * memory, so that in principle, extremely large native outputs can be
045 * parsed and XML ContentHandlers can listen only for small amounts of
046 * information.
047 * <p>
048 * The framework currently supports parsing of native output from
049 * the following bioinformatics programs. Please note that if
050 * you are using different versions of NCBI or WU Blast to those
051 * listed below, it is worth considering setting the parsing
052 * mode to Lazy, which means parsing will be attempted if the program
053 * is recognised, regardless of version.
054 * <ul>
055 * <li>NCBI Blast version 2.0.11
056 * <li>NCBI Blast version 2.2.2
057 * <li>NCBI Blast version 2.2.3
058 * <li>WU-Blast version 2.0a19mp-washu
059 * <li>HMMER 2.1.1 hmmsearch 
060 * </ul>
061 * Planned additional support
062 * <ul>
063 * <li> Support for HMMER hmmpfam almost there but not fully tested
064 * </ul>
065 * <p>
066 * <p>
067 * <b>Notes to SAX driver writers</b>
068 * <p>
069 * The framework that this parser is built on is designed to be
070 * extensible with support for both different pieces of software
071 * (<i>i.e.</i> not just software that produces Blast-like output),
072 * and multiple versions of programs.
073 * <p>
074 * This class inherits from the 
075 * org.biojava.bio.program.sax.AbstractNativeAppSAXParser
076 * abstract base class.  The abstract base class is a good place to
077 * start looking if you want to write new native application SAX2 parsers.
078 * This and related classes have only package-level visibility.
079 * Typically, application writers are expected to provide a facade class
080 * in this package (similar to the current class) to allow
081 * users access to functionality.
082 * <p>
083 * NB Support for InputSource is not complete due to the fact
084 * that URLs are not resolved and cannot, therefore, be used
085 * as an InputSource.  System pathnames, ByteStreams and CharacterStreams,
086 * however, are all supported.
087 * <p>
088 *
089 * Copyright &copy; 2000 Cambridge Antibody Technology.
090 * 
091 * <p>
092 * Primary author -<ul>
093 * <li>Simon Brocklehurst (CAT)
094 * </ul>
095 * Other authors  -<ul>
096 * <li>Tim Dilks          (CAT)
097 * <li>Colin Hardman      (CAT)
098 * <li>Stuart Johnston    (CAT)
099 * <li>Mathieu Wiepert    (Mayo Foundation)
100 * <li>Travis Banks
101 *</ul>
102 *
103 * @author Cambridge Antibody Technology (CAT)
104 * @author Travis Banks
105 * @version 1.0
106 *
107 * @see org.biojava.bio.program.BlastLikeToXMLConverter
108 */
109public class BlastLikeSAXParser extends AbstractNativeAppSAXParser {
110
111    private BlastLikeVersionSupport oVersion  = new BlastLikeVersionSupport();
112    private BlastSAXParser          oBlast; 
113
114    private AttributesImpl          oAtts     = new AttributesImpl();
115    private QName                   oAttQName = new QName(this);     
116    private boolean                 tValidFormat  = false;
117
118    private static final int        STARTUP                   = 0;
119    private static final int        INSIDE_FILE               = 1;
120
121    private String                  oStoredLine = null;
122
123    /**
124     * Initialises SAXParser, and sets default namespace prefix
125     * to "biojava".
126     */
127    public BlastLikeSAXParser() {
128        this.changeState(STARTUP);
129
130        //centralised setting of namespace prefix
131        //the setting is cascaded everywhere else
132        this.setNamespacePrefix("biojava");
133        this.addPrefixMapping("biojava","http://www.biojava.org");
134        
135        oVersion.setMode(BlastLikeVersionSupport.LAZY);
136    }
137
138    /**
139     * <code>parse</code> initiates the parsing operation.
140     *
141     * @param poSource an <code>InputSource</code>.
142     * @exception IOException if an error occurs.
143     * @exception SAXException if an error occurs.
144     */
145    public void parse(InputSource poSource ) 
146    throws IOException, SAXException {
147
148        BufferedReader            oContents;
149        String                    oLine;
150
151        this.changeState(STARTUP);
152
153        //Use method form superclass
154        oContents = this.getContentStream(poSource);
155        //This sets contentHandler document for XSLT
156        this.getContentHandler().startDocument();
157        
158        try {
159            // loop over file
160            oLine = oContents.readLine();
161            while (oLine != null) {
162                //interpret line and send messages accordingly        
163                this.interpret(oContents,oLine);
164                //do extra interpretation of lines reached by subparser
165                //objects
166                if (iState == INSIDE_FILE) {
167                    oLine = oStoredLine;
168                    if (oStoredLine != null) {
169                        this.interpret(oContents,oLine);
170                    }
171                } else {
172                    oLine = oContents.readLine();
173                }
174
175            } // end while
176        } catch (IOException x) {
177            System.out.println(x.getMessage());
178            System.out.println("File read interrupted");
179        } // end try/catch
180
181        //at end of file...
182        oContents.close();
183
184        if (!tValidFormat) {
185            throw (new SAXException("Could not recognise the format " +
186            "of this file as one supported by the framework."));
187        }
188
189        this.endElement(new QName(this,
190                this.prefix("BlastLikeDataSetCollection")));
191    }
192
193    /**
194     * This is the default, parsing will be attempted only if both
195     * the program e.g. NCBI BlastP, and a particular version 
196     * are recognised as bsing supported.
197     *
198     */
199    public void setModeStrict() {
200        oVersion.setMode(BlastLikeVersionSupport.STRICT);
201    }
202
203    /**
204     * Setting the mode to lazy means that, if the program is recognised,
205     * e.g. WU-TBlastX, then parsing will be attempted even if 
206     * the particular version is not recognised. Using this option
207     * is more likely to result in erroneous parsing than if the
208     * strict mode is used.
209     *
210     */
211    public void setModeLazy() {
212        oVersion.setMode(BlastLikeVersionSupport.LAZY);
213    }
214
215    /**
216     * Deal with line according to state parser is in.
217     *
218     * @param poLine     A line of Blast output
219     */
220    private void interpret(BufferedReader poContents, String poLine)
221    throws SAXException {
222        //For a brand new collection,
223        //check for the start of a new BlastDataSet
224        if (iState == STARTUP) {
225            //look for characteristic of start of dataset
226            if (oVersion.isStartOfDataSet(poLine)) {
227                //For GCG, oVersion is set as an indicator to get 
228                //program info from the second line
229                if (oVersion.getProgram() == BlastLikeVersionSupport.GCG) { 
230                    //if GCG, skip to next line to get program info
231                    try {
232                        poLine = poContents.readLine ();
233                    } catch (java.io.IOException x) {
234                        System.out.println(x.getMessage());
235                        System.out.println("File read interrupted");
236                        throw (new SAXException("Error parsing GCG File"));
237                    } // end try/catch
238                }
239
240                tValidFormat = oVersion.assignProgramAndVersion(poLine);
241
242                if (!oVersion.isSupported()) {
243                    throw (new SAXException("Program "
244                            + oVersion.getProgramString()
245                            + " Version "
246                            + oVersion.getVersionString()
247                            + " is not supported by the biojava blast-like "
248                            + "parsing framework"));
249                }
250
251                oAtts.clear();
252                oAttQName.setQName("xmlns");
253                //check if namespace configuration means attribute
254                //should not be reported.
255                if (!oAttQName.getLocalName().equals("")) {
256                    oAtts.addAttribute(oAttQName.getURI(),
257                            oAttQName.getLocalName(),
258                            oAttQName.getQName(),
259                            "CDATA","");
260                }
261
262                oAttQName.setQName("xmlns:biojava");
263                //check if namespace configuration means attribute
264                //should not be reported.
265                if (!oAttQName.getLocalName().equals("")) {
266                    oAtts.addAttribute(oAttQName.getURI(),
267                            oAttQName.getLocalName(),
268                            oAttQName.getQName(),
269                            "CDATA","http://www.biojava.org");
270                }
271
272                this.startElement(new QName(this,
273                        this.prefix("BlastLikeDataSetCollection")),
274                        (Attributes)oAtts);
275
276                this.onNewDataSet(poContents,poLine);
277                return;
278            }
279        }   //End check for the start of a new BlastDataSet
280
281        if (iState == INSIDE_FILE) {
282            //look for characteristic of start of dataset
283            if (oVersion.isStartOfDataSet(poLine)) {
284
285                tValidFormat = oVersion.assignProgramAndVersion(poLine);
286
287                this.onNewDataSet(poContents,poLine);
288
289                return;
290            }
291        }   //End check for the start of a new BlastDataSet
292    }
293
294    /**
295     *
296     * When this method is called, the line will look something line:
297     *
298     * BLASTN 2.0.11 [Jan-20-2000]
299     *
300     * The above would be parsed to program blastn, and version number.
301     *
302     * @param poLine     -
303     */
304    private void onNewDataSet(BufferedReader poContents, String poLine)
305    throws SAXException {
306
307        //choose according to version...
308        oBlast = new BlastSAXParser(oVersion,this.getNamespacePrefix());
309        String oLine="";
310
311        //Parse Contents stream up to end of a single BlastDataSet.
312        oBlast.setContentHandler(oHandler);
313        while(oLine!=null) {
314                oLine = oBlast.parse(poContents,poLine);
315        }
316
317        this.changeState(INSIDE_FILE);
318
319        //now make sure to interpret the line the BlastSAXParser returned from
320        //in top-level parse method.
321        if (oLine != null) {
322            oStoredLine = oLine;
323            return;
324            //           this.interpret(poContents,oLine);
325        } else {
326            //here if at the EOF
327            oStoredLine = null;
328            return;
329        }
330    }
331}
332
333