001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.bio.program.sax;
022
023import java.io.BufferedReader;
024import java.io.IOException;
025import java.util.StringTokenizer;
026
027import org.xml.sax.Attributes;
028import org.xml.sax.InputSource;
029import org.xml.sax.SAXException;
030import org.xml.sax.helpers.AttributesImpl;
031
/**
 * A SAX2 parser for dealing with multiple sequences in
 * FASTA format.
 *
 * For example:
 * <pre>
 * &gt;Seq1
 * GATCGATCGTAGCTAGATGCTAGCATGCTAGCTGACTGATCGATCGTAGCTAGCTAGCTGACTG
 * &gt;Seq2
 * GATCGATCGTAGCTAGATGCTAGCATGCTAGCTGACTGATCGATCGTAGCTAGCTAGCTGACTG
 * </pre>
 * <p>
 *
 * Copyright &copy; 2000,2001 Cambridge Antibody Technology.
 *
 * <p>
 * Primary author -<ul>
 * <li>Simon Brocklehurst (CAT)
 * </ul>
 * Other authors  -<ul>
 * <li>Neil Benn          (CAT)
 * <li>Lawrence Bower     (CAT)
 * <li>Derek Crockford    (CAT)
 * <li>Tim Dilks          (CAT)
 * <li>Colin Hardman      (CAT)
 * <li>Stuart Johnston    (CAT)
 * </ul>
 *
 * @author Cambridge Antibody Technology (CAT)
 * @author Greg Cox
 * @version 1.0
 *
 */
065public class FastaSequenceSAXParser extends AbstractNativeAppSAXParser {
066
067
068
069    private AttributesImpl          oAtts     = new AttributesImpl();
070    private QName                   oAttQName = new QName(this);
071    private char[]                  aoChars;
072
073    private StringBuffer            oSeqName  = new StringBuffer();
074    private StringBuffer            oSeq      = new StringBuffer();
075    private boolean                 tOnFirst  = true;
076
077    private static final int        STARTUP            = 0;
078    private static final int        IN_STREAM          = 1;
079
080
081    /**
082     * Initialises internal state
083     * Sets namespace prefix to "biojava"
084     */
085    public FastaSequenceSAXParser() {
086        iState = STARTUP;
087        this.setNamespacePrefix("biojava");
088    }
089
090    /**
091     * Describe 'parse' method here.
092     *
093     * @param poSource   -
094     */
095    public void parse(InputSource poSource )
096        throws IOException,SAXException {
097
098        BufferedReader            oContents;
099        String                    oLine = null;
100
101        //Use method form superclass
102        oContents = this.getContentStream(poSource);
103
104        // loop over file
105        try {
106            // loop over file
107            oLine = oContents.readLine();
108            while (oLine != null) {
109                //System.out.println(oLine);
110                this.interpret(oContents,oLine);
111                oLine = oContents.readLine();
112            } // end while
113        } catch (java.io.IOException x) {
114            System.out.println(x.getMessage());
115            System.out.println("Stream read interupted");
116        } // end try/catch
117
118        //at end of stream...
119        //do final sequence
120        this.emitSequence();
121
122        this.endElement(new QName(this,
123                                  this.prefix("SequenceCollection")));
124        oContents.close();
125
126    }
127
128    /**
129     * Describe <code>interpret</code> method here.
130     *
131     * @param poContents a <code>BufferedReader</code> value
132     * @param poLine a <code>String</code> value
133     * @exception SAXException if an error occurs
134     */
135    private void interpret(BufferedReader poContents, String poLine)
136        throws SAXException {
137
138
139        if (iState == STARTUP) {
140            oAtts.clear();
141            this.startElement(
142              new QName(this,
143                        this.prefix("SequenceCollection")),
144                                  (Attributes)oAtts);
145            this.changeState(IN_STREAM);
146        }
147
148        if (iState == IN_STREAM) {
149            //look for the start of first record i.e.a header
150            if ( poLine.startsWith(">") ) {
151                if (!tOnFirst) {
152                    this.emitSequence();
153                }
154                this.parseHeaderLine(poLine);
155                oSeq.setLength(0);
156                return;
157            } else {
158                this.appendSequence(poLine);
159            }
160
161        }
162    }
163    /**
164     * Parse the header part of a record i.e. >myName, and
165     * emit messages.
166     *
167     * @param poLine a <code>String</code> value
168     */
169    private void parseHeaderLine(String poLine) {
170        oSeqName.setLength(0);
171        oSeqName.append(poLine.substring(1));
172
173        //flip flag to begin emitting sequence elements
174        tOnFirst = false;
175        //System.out.println(oSeqName);
176    }
177    /**
178     * Builds up sequence data - NB white space is
179     * removed.
180     *
181     * @param poLine a <code>String</code> value
182     */
183    private void appendSequence(String poLine) {
184        StringTokenizer oSt = new StringTokenizer(poLine,"\n\t\r ");
185        while (oSt.hasMoreTokens()) {
186          oSeq.append(oSt.nextToken());
187        }
188    }
189    /**
190     * Describe <code>emitSequence</code> method here.
191     *
192     */
193    private void emitSequence() throws SAXException {
194            oAtts.clear();
195
196            oAttQName.setQName("sequenceName");
197            oAtts.addAttribute(oAttQName.getURI(),
198                           oAttQName.getLocalName(),
199                           oAttQName.getQName(),
200                           "CDATA",oSeqName.substring(0));
201
202            this.startElement(
203              new QName(this,
204                        this.prefix("Sequence")),
205                                  (Attributes)oAtts);
206
207            aoChars = oSeq.substring(0).toCharArray();
208            this.characters(aoChars,0,aoChars.length);
209            this.endElement(new QName(this,this.prefix("Sequence")));
210
211    }
212}