001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.bio.program.sax; 022 023import java.io.BufferedReader; 024import java.io.IOException; 025import java.util.ArrayList; 026import java.util.HashMap; 027import java.util.StringTokenizer; 028 029import org.xml.sax.Attributes; 030import org.xml.sax.InputSource; 031import org.xml.sax.SAXException; 032import org.xml.sax.helpers.AttributesImpl; 033 034/** 035 * A SAX2 parser for dealing with a multiple sequence 036 * alignment as produced by ClustalW outputing .aln format. 037 * For example, 038 * <pre> 039 K1C0_XENLA/125-441 DKVHALETANTELERKIKEWYEKQRPGSSSGDGAKDYSKYYT 040 K1C4_XENLA/81-396 EKVRALEAANADLELKIREWYEKQK-GSGIGAGSKDFSKYFE 041 K1C5_XENLA/73-384 DRVRSLEQANHELELKIREYLDKK-----AAVGSLDYSGYYN 042 keratin15 DKVRALEEANADLEVKIHDWYQKQTP----ASPECDYSQYFK 043 044 K1C0_XENLA/125-441 -----AKFLLQNDNARLAADDFKMKFEN-------------- 045 K1C4_XENLA/81-396 -----SRVVLQIDNAKLAADDFRLKFEN-------------- 046 K1C5_XENLA/73-384 -----TRLVLSIDNAKLAADDFKIKYES-------------- 047 keratin15 -----SRVILEIDNARLAADDFRLKYEN-------------- 048 * </pre> 049 * <p> 050 * Please note, this parser reads the whole alignment in to 051 * core memory and thus does not scale to work with very large 052 * alignments on low-end hardware. 053 * <p> 054 * Please also note that this class has not been tested with 055 * many version of CLUSTAL W. 056 * 057 * Copyright © 2000,2001 Cambridge Antibody Technology. 058 * 059 * <p> 060 * Primary author -<ul> 061 * <li>Simon Brocklehurst (CAT) 062 * </ul> 063 * Other authors -<ul> 064 * <li>Neil Benn (CAT) 065 * <li>Lawrence Bower (CAT) 066 * <li>Derek Crockford (CAT) 067 * <li>Tim Dilks (CAT) 068 * <li>Colin Hardman (CAT) 069 * <li>Stuart Johnston (CAT) 070 *</ul> 071 * 072 * @author Cambridge Antibody Technology (CAT) 073 * @author Greg Cox 074 * @version 1.0 075 * 076 */ 077public class ClustalWAlignmentSAXParser extends AbstractNativeAppSAXParser { 078 079 080 081 private AttributesImpl oAtts = new AttributesImpl(); 082 private QName oAttQName = new QName(this); 083 private char[] aoChars; 084 085 private String oSeqName; 086 private String oTmpSeq; 087 private StringBuffer oSeq = new StringBuffer(); 088 private HashMap oAlignment = new HashMap(); 089 private ArrayList oSeqNameList = new ArrayList(); 090 091 private static final int STARTUP = 0; 092 private static final int IN_STREAM = 1; 093 094 095 /** 096 * Initialises internal state 097 * Sets namespace prefix to "biojava" 098 */ 099 public ClustalWAlignmentSAXParser() { 100 iState = STARTUP; 101 this.setNamespacePrefix("biojava"); 102 } 103 104 /** 105 * Describe 'parse' method here. 106 * 107 * @param poSource - 108 */ 109 public void parse(InputSource poSource ) 110 throws IOException,SAXException { 111 112 BufferedReader oContents; 113 String oLine = null; 114 115 //Use method form superclass 116 oContents = this.getContentStream(poSource); 117 118 // loop over file 119 try { 120 // loop over file 121 oLine = oContents.readLine(); 122 while (oLine != null) { 123 //System.out.println(oLine); 124 this.interpret(oContents,oLine); 125 oLine = oContents.readLine(); 126 } // end while 127 } catch (java.io.IOException x) { 128 System.out.println(x.getMessage()); 129 System.out.println("Stream read interrupted"); 130 } // end try/catch 131 132 //at end of stream... 133 134 //at this point, alignment is parsed, now cycle through 135 //and emit elements 136 for (int i = 0; i < oSeqNameList.size(); i++) { 137 oSeqName = (String) oSeqNameList.get(i); 138 this.emitSequence(oSeqName, 139 (String) oAlignment.get(oSeqName)); 140 141 142 } 143 144 this.endElement(new QName(this, 145 this.prefix("SequenceCollection"))); 146 oContents.close(); 147 148 } 149 150 /** 151 * Describe <code>interpret</code> method here. 152 * 153 * @param poContents a <code>BufferedReader</code> value 154 * @param poLine a <code>String</code> value 155 * @exception SAXException if an error occurs 156 */ 157 private void interpret(BufferedReader poContents, String poLine) 158 throws SAXException { 159 160 161 if (iState == STARTUP) { 162 oAtts.clear(); 163 this.startElement( 164 new QName(this, 165 this.prefix("SequenceCollection")), 166 (Attributes)oAtts); 167 this.changeState(IN_STREAM); 168 } 169 170 if (iState == IN_STREAM) { 171 172 if (this.lineIsRelevant(poLine)) { 173 //build aligment in memory 174 this.appendToAlignment(poLine); 175 } 176 177 } 178 } 179 /** 180 * Parse a relevant line, and add to alignment 181 * 182 * @param poLine a <code>String</code> value 183 */ 184 private void appendToAlignment(String poLine) { 185 //System.out.println(poLine); 186 StringTokenizer oSt = new StringTokenizer(poLine,"\n\t\r "); 187 188 //First token is sequence name 189 oSeqName = oSt.nextToken(); 190 //System.out.println(oSeqName); 191 192 oSeq.setLength(0); 193 while (oSt.hasMoreTokens()) { 194 oSeq.append(oSt.nextToken()); 195 } 196 //System.out.println(oSeq); 197 198 //At this point, have name of sequence, and a segment of the sequence 199 200 //Update object... 201 202 if (oAlignment.get(oSeqName) == null) { 203 //Here if on first occurence of this sequence 204 //Add to alignment 205 oAlignment.put(oSeqName,oSeq.substring(0)); 206 //maintain ordered list of sequence names 207 oSeqNameList.add(oSeqName); 208 } else { 209 //Here if building up an existing sequence 210 oTmpSeq = (String) oAlignment.get(oSeqName); 211 oAlignment.put(oSeqName,oTmpSeq.concat(oSeq.substring(0))); 212 } 213 } 214 215 /** 216 * Only interested in lines that are part of the alignment. 217 * Returns true if line is in alignment, false if not. 218 * 219 * @param poLine a <code>String</code> value 220 * @return a <code>boolean</code> value 221 */ 222 private boolean lineIsRelevant(String poLine) { 223 224 //blank lines not relevant 225 //lines that starts with a space not relevant (consensus line) 226 //lines that start with "CLUSTAL W (" not relevant (title) 227 228 if ( (poLine.trim().equals("")) || 229 (poLine.startsWith(" ")) || 230 (poLine.startsWith("CLUSTAL W (")) ) { 231 232 //System.out.println("Irrelevant|"+poLine+"|"); 233 return false; 234 } 235 236 //if here,line is part of alignment, so return true 237 return true; 238 } 239 /** 240 * Emit a sequence element 241 * 242 * @param poSequenceName a <code>String</code> value 243 * @param poSequence a <code>String</code> value 244 * @exception SAXException if an error occurs 245 */ 246 private void emitSequence(String poSequenceName, String poSequence) throws SAXException { 247 oAtts.clear(); 248 249 oAttQName.setQName("sequenceName"); 250 oAtts.addAttribute(oAttQName.getURI(), 251 oAttQName.getLocalName(), 252 oAttQName.getQName(), 253 "CDATA",poSequenceName); 254 255 this.startElement( 256 new QName(this, 257 this.prefix("Sequence")), 258 (Attributes)oAtts); 259 260 aoChars = poSequence.toCharArray(); 261 this.characters(aoChars,0,aoChars.length); 262 this.endElement(new QName(this,this.prefix("Sequence"))); 263 264 } 265}