001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.bio.program.sax;
022
023import java.io.BufferedReader;
024import java.io.IOException;
025import java.util.ArrayList;
026import java.util.HashMap;
027import java.util.StringTokenizer;
028
029import org.xml.sax.Attributes;
030import org.xml.sax.InputSource;
031import org.xml.sax.SAXException;
032import org.xml.sax.helpers.AttributesImpl;
033
034/**
035 * A SAX2 parser for dealing with a multiple sequence
 * alignment as produced by ClustalW outputting .aln format.
037 * For example,
038 * <pre>
039  K1C0_XENLA/125-441      DKVHALETANTELERKIKEWYEKQRPGSSSGDGAKDYSKYYT
040  K1C4_XENLA/81-396       EKVRALEAANADLELKIREWYEKQK-GSGIGAGSKDFSKYFE
041  K1C5_XENLA/73-384       DRVRSLEQANHELELKIREYLDKK-----AAVGSLDYSGYYN
042  keratin15               DKVRALEEANADLEVKIHDWYQKQTP----ASPECDYSQYFK
043
044  K1C0_XENLA/125-441      -----AKFLLQNDNARLAADDFKMKFEN--------------
045  K1C4_XENLA/81-396       -----SRVVLQIDNAKLAADDFRLKFEN--------------
046  K1C5_XENLA/73-384       -----TRLVLSIDNAKLAADDFKIKYES--------------
047  keratin15               -----SRVILEIDNARLAADDFRLKYEN--------------
048 * </pre>
049 * <p>
 * Please note, this parser reads the whole alignment into
051 * core memory and thus does not scale to work with very large
052 * alignments on low-end hardware.
053 * <p>
054 * Please also note that this class has not been tested with
 * many versions of CLUSTAL W.
056 *
057 * Copyright &copy; 2000,2001 Cambridge Antibody Technology.
058 * 
059 * <p>
060 * Primary author -<ul>
061 * <li>Simon Brocklehurst (CAT)
062 * </ul>
063 * Other authors  -<ul>
064 * <li>Neil Benn          (CAT)
065 * <li>Lawrence Bower     (CAT)
066 * <li>Derek Crockford    (CAT)
067 * <li>Tim Dilks          (CAT)
068 * <li>Colin Hardman      (CAT)
069 * <li>Stuart Johnston    (CAT)
070 *</ul>
071 *
072 * @author Cambridge Antibody Technology (CAT)
073 * @author Greg Cox
074 * @version 1.0
075 *
076 */
077public class ClustalWAlignmentSAXParser extends AbstractNativeAppSAXParser {
078
079
080
081    private AttributesImpl          oAtts      = new AttributesImpl();
082    private QName                   oAttQName  = new QName(this);
083    private char[]                  aoChars;
084
085    private String                  oSeqName;
086    private String                  oTmpSeq;
087    private StringBuffer            oSeq         = new StringBuffer();
088    private HashMap                 oAlignment   = new HashMap();
089    private ArrayList               oSeqNameList = new ArrayList();
090
091    private static final int        STARTUP            = 0;
092    private static final int        IN_STREAM          = 1;
093
094
095    /**
096     * Initialises internal state
097     * Sets namespace prefix to "biojava"
098     */
099    public ClustalWAlignmentSAXParser() {
100        iState = STARTUP;
101        this.setNamespacePrefix("biojava");
102    }
103
104    /**
105     * Describe 'parse' method here.
106     *
107     * @param poSource   -
108     */
109    public void parse(InputSource poSource )
110        throws IOException,SAXException {
111
112        BufferedReader            oContents;
113        String                    oLine = null;
114
115        //Use method form superclass
116        oContents = this.getContentStream(poSource);
117
118        // loop over file
119        try {
120            // loop over file
121            oLine = oContents.readLine();
122            while (oLine != null) {
123                //System.out.println(oLine);
124                this.interpret(oContents,oLine);
125                oLine = oContents.readLine();
126            } // end while
127        } catch (java.io.IOException x) {
128            System.out.println(x.getMessage());
129            System.out.println("Stream read interrupted");
130        } // end try/catch
131
132        //at end of stream...
133
134        //at this point, alignment is parsed, now cycle through
135        //and emit elements
136        for (int i = 0; i < oSeqNameList.size(); i++) {
137            oSeqName = (String) oSeqNameList.get(i);
138            this.emitSequence(oSeqName,
139                              (String) oAlignment.get(oSeqName));
140
141
142        }
143
144        this.endElement(new QName(this,
145                                  this.prefix("SequenceCollection")));
146        oContents.close();
147
148    }
149
150    /**
151     * Describe <code>interpret</code> method here.
152     *
153     * @param poContents a <code>BufferedReader</code> value
154     * @param poLine a <code>String</code> value
155     * @exception SAXException if an error occurs
156     */
157    private void interpret(BufferedReader poContents, String poLine)
158        throws SAXException {
159
160
161        if (iState == STARTUP) {
162            oAtts.clear();
163            this.startElement(
164              new QName(this,
165                        this.prefix("SequenceCollection")),
166                                  (Attributes)oAtts);
167            this.changeState(IN_STREAM);
168        }
169
170        if (iState == IN_STREAM) {
171
172            if (this.lineIsRelevant(poLine)) {
173                //build aligment in memory
174                this.appendToAlignment(poLine);
175            }
176
177        }
178    }
179    /**
180     * Parse a relevant line, and add to alignment
181     *
182     * @param poLine a <code>String</code> value
183     */
184    private void appendToAlignment(String poLine) {
185        //System.out.println(poLine);
186        StringTokenizer oSt = new StringTokenizer(poLine,"\n\t\r ");
187
188        //First token is sequence name
189        oSeqName = oSt.nextToken();
190        //System.out.println(oSeqName);
191
192        oSeq.setLength(0);
193        while (oSt.hasMoreTokens()) {
194          oSeq.append(oSt.nextToken());
195        }
196        //System.out.println(oSeq);
197
198        //At this point, have name of sequence, and a segment of the sequence
199
200        //Update object...
201
202        if (oAlignment.get(oSeqName) == null) {
203            //Here if on first occurence of this sequence
204            //Add to alignment
205            oAlignment.put(oSeqName,oSeq.substring(0));
206            //maintain ordered list of sequence names
207            oSeqNameList.add(oSeqName);
208        } else {
209            //Here if building up an existing sequence
210            oTmpSeq = (String) oAlignment.get(oSeqName);
211            oAlignment.put(oSeqName,oTmpSeq.concat(oSeq.substring(0)));
212        }
213    }
214
215    /**
216     * Only interested in lines that are part of the alignment.
217     * Returns true if line is in alignment, false if not.
218     *
219     * @param poLine a <code>String</code> value
220     * @return a <code>boolean</code> value
221     */
222    private boolean lineIsRelevant(String poLine) {
223
224        //blank lines not relevant
225        //lines that starts with a space  not relevant (consensus line)
226        //lines that start with "CLUSTAL W (" not relevant (title)
227
228        if ( (poLine.trim().equals("")) ||
229             (poLine.startsWith(" ")) ||
230             (poLine.startsWith("CLUSTAL W (")) ) {
231
232            //System.out.println("Irrelevant|"+poLine+"|");
233            return false;
234        }
235
236        //if here,line is part of alignment, so return true
237        return true;
238    }
239    /**
240     * Emit a sequence element
241     *
242     * @param poSequenceName a <code>String</code> value
243     * @param poSequence a <code>String</code> value
244     * @exception SAXException if an error occurs
245     */
246    private void emitSequence(String poSequenceName, String poSequence) throws SAXException {
247            oAtts.clear();
248
249            oAttQName.setQName("sequenceName");
250            oAtts.addAttribute(oAttQName.getURI(),
251                           oAttQName.getLocalName(),
252                           oAttQName.getQName(),
253                           "CDATA",poSequenceName);
254
255            this.startElement(
256              new QName(this,
257                        this.prefix("Sequence")),
258                                  (Attributes)oAtts);
259
260            aoChars = poSequence.toCharArray();
261            this.characters(aoChars,0,aoChars.length);
262            this.endElement(new QName(this,this.prefix("Sequence")));
263
264    }
265}