001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.bio.program.sax; 022 023import java.io.BufferedReader; 024import java.io.IOException; 025import java.util.StringTokenizer; 026 027import org.xml.sax.Attributes; 028import org.xml.sax.InputSource; 029import org.xml.sax.SAXException; 030import org.xml.sax.helpers.AttributesImpl; 031 032/** 033 * A SAX2 parser for dealing with multiple sequences in 034 * FASTA format. 035 * 036 * For example: 037 * <pre> 038 * >Seq1 039 * GATCGATCGTAGCTAGATGCTAGCATGCTAGCTGACTGATCGATCGTAGCTAGCTAGCTGACTG 040 * >Seq2 041 * GATCGATCGTAGCTAGATGCTAGCATGCTAGCTGACTGATCGATCGTAGCTAGCTAGCTGACTG 042 * </pre> 043 * <p> 044 * 045 * Copyright © 2000,2001 Cambridge Antibody Technology. 046 047 * <p> 048 * Primary author -<ul> 049 * <li>Simon Brocklehurst (CAT) 050 * </ul> 051 * Other authors -<ul> 052 * <li>Neil Benn (CAT) 053 * <li>Lawrence Bower (CAT) 054 * <li>Derek Crockford (CAT) 055 * <li>Tim Dilks (CAT) 056 * <li>Colin Hardman (CAT) 057 * <li>Stuart Johnston (CAT) 058 *</ul> 059 * 060 * @author Cambridge Antibody Technology (CAT) 061 * @author Greg Cox 062 * @version 1.0 063 * 064 */ 065public class FastaSequenceSAXParser extends AbstractNativeAppSAXParser { 066 067 068 069 private AttributesImpl oAtts = new AttributesImpl(); 070 private QName oAttQName = new QName(this); 071 private char[] aoChars; 072 073 private StringBuffer oSeqName = new StringBuffer(); 074 private StringBuffer oSeq = new StringBuffer(); 075 private boolean tOnFirst = true; 076 077 private static final int STARTUP = 0; 078 private static final int IN_STREAM = 1; 079 080 081 /** 082 * Initialises internal state 083 * Sets namespace prefix to "biojava" 084 */ 085 public FastaSequenceSAXParser() { 086 iState = STARTUP; 087 this.setNamespacePrefix("biojava"); 088 } 089 090 /** 091 * Describe 'parse' method here. 092 * 093 * @param poSource - 094 */ 095 public void parse(InputSource poSource ) 096 throws IOException,SAXException { 097 098 BufferedReader oContents; 099 String oLine = null; 100 101 //Use method form superclass 102 oContents = this.getContentStream(poSource); 103 104 // loop over file 105 try { 106 // loop over file 107 oLine = oContents.readLine(); 108 while (oLine != null) { 109 //System.out.println(oLine); 110 this.interpret(oContents,oLine); 111 oLine = oContents.readLine(); 112 } // end while 113 } catch (java.io.IOException x) { 114 System.out.println(x.getMessage()); 115 System.out.println("Stream read interupted"); 116 } // end try/catch 117 118 //at end of stream... 119 //do final sequence 120 this.emitSequence(); 121 122 this.endElement(new QName(this, 123 this.prefix("SequenceCollection"))); 124 oContents.close(); 125 126 } 127 128 /** 129 * Describe <code>interpret</code> method here. 130 * 131 * @param poContents a <code>BufferedReader</code> value 132 * @param poLine a <code>String</code> value 133 * @exception SAXException if an error occurs 134 */ 135 private void interpret(BufferedReader poContents, String poLine) 136 throws SAXException { 137 138 139 if (iState == STARTUP) { 140 oAtts.clear(); 141 this.startElement( 142 new QName(this, 143 this.prefix("SequenceCollection")), 144 (Attributes)oAtts); 145 this.changeState(IN_STREAM); 146 } 147 148 if (iState == IN_STREAM) { 149 //look for the start of first record i.e.a header 150 if ( poLine.startsWith(">") ) { 151 if (!tOnFirst) { 152 this.emitSequence(); 153 } 154 this.parseHeaderLine(poLine); 155 oSeq.setLength(0); 156 return; 157 } else { 158 this.appendSequence(poLine); 159 } 160 161 } 162 } 163 /** 164 * Parse the header part of a record i.e. >myName, and 165 * emit messages. 166 * 167 * @param poLine a <code>String</code> value 168 */ 169 private void parseHeaderLine(String poLine) { 170 oSeqName.setLength(0); 171 oSeqName.append(poLine.substring(1)); 172 173 //flip flag to begin emitting sequence elements 174 tOnFirst = false; 175 //System.out.println(oSeqName); 176 } 177 /** 178 * Builds up sequence data - NB white space is 179 * removed. 180 * 181 * @param poLine a <code>String</code> value 182 */ 183 private void appendSequence(String poLine) { 184 StringTokenizer oSt = new StringTokenizer(poLine,"\n\t\r "); 185 while (oSt.hasMoreTokens()) { 186 oSeq.append(oSt.nextToken()); 187 } 188 } 189 /** 190 * Describe <code>emitSequence</code> method here. 191 * 192 */ 193 private void emitSequence() throws SAXException { 194 oAtts.clear(); 195 196 oAttQName.setQName("sequenceName"); 197 oAtts.addAttribute(oAttQName.getURI(), 198 oAttQName.getLocalName(), 199 oAttQName.getQName(), 200 "CDATA",oSeqName.substring(0)); 201 202 this.startElement( 203 new QName(this, 204 this.prefix("Sequence")), 205 (Attributes)oAtts); 206 207 aoChars = oSeq.substring(0).toCharArray(); 208 this.characters(aoChars,0,aoChars.length); 209 this.endElement(new QName(this,this.prefix("Sequence"))); 210 211 } 212}