001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.program.gff; 023 024import java.io.BufferedReader; 025import java.io.IOException; 026import java.util.ArrayList; 027import java.util.List; 028import java.util.Map; 029import java.util.NoSuchElementException; 030import java.util.StringTokenizer; 031 032import org.biojava.bio.BioException; 033import org.biojava.bio.seq.StrandedFeature; 034import org.biojava.utils.ParserException; 035import org.biojava.utils.SmallMap; 036 037/** 038 * Parse a stream of GFF text into a stream of records and comments. 039 * 040 * @author Matthew Pocock 041 * @author Thomas Down 042 * @author Keith James (docs) 043 */ 044public class GFFParser { 045 private GFFErrorHandler errors = GFFErrorHandler.ABORT_PARSING; 046 047 /** 048 * Set the error handler used by this parser. 049 */ 050 051 public void setErrorHandler(GFFErrorHandler errors) { 052 this.errors = errors; 053 } 054 055 /** 056 * Find the error handler used by this parser. 057 */ 058 059 public GFFErrorHandler getErrorHandler() { 060 return errors; 061 } 062 063 /** 064 * Informs <span class="arg">handler</span> of each line of 065 * gff read from <span class="arg">bReader</span>. This form 066 * of the method should only be used if no locator string is 067 * available for the resource being parsed. 068 * 069 * @param bReader the <span class="type">BufferedReader</span> to parse 070 * @param handler the <span class="type">GFFDocumentHandler</span> that will 071 * listen for 'stuff' 072 * 073 * @throws <span class="type">IOException</span> if for any reason 074 * <span class="arg">bReader</span> throws one 075 * @throws <span class="type">BioException</span> if 076 * <span class="arg">handler</span> can not correct a parse error 077 */ 078 079 public void parse(BufferedReader bReader, GFFDocumentHandler handler) 080 throws IOException, BioException, ParserException 081 { 082 parse(bReader, handler, "unknown:"); 083 } 084 085 /** 086 * Informs <span class="arg">handler</span> of each line of 087 * GFF read from <span class="arg">bReader</span> 088 * 089 * @param bReader the <span class="type">BufferedReader</span> to parse 090 * @param handler the <span class="type">GFFDocumentHandler</span> that will 091 * listen for 'stuff' 092 * 093 * @throws <span class="type">IOException</span> if for any reason 094 * <span class="arg">bReader</span> throws one 095 * @throws <span class="type">BioException</span> if 096 * <span class="arg">handler</span> can not correct a parse error 097 */ 098 099 public void parse(BufferedReader bReader, GFFDocumentHandler handler, String locator) 100 throws IOException, BioException, ParserException 101 { 102 handler.startDocument(locator); 103 ArrayList aList = new ArrayList(); 104 int lineNum = 0; 105 for(String line = bReader.readLine(); line != null; line = bReader.readLine()) { 106 ++lineNum; 107 108 try { 109 aList.clear(); 110 if(line.startsWith("#")) { 111 handler.commentLine(line.substring(1)); 112 } else if (line.length() == 0) { 113 } else { 114 StringTokenizer st = new StringTokenizer(line, "\t", false); 115 while(st.hasMoreTokens() && aList.size() < 8) { 116 String token = st.nextToken(); 117 aList.add(token); 118 } 119 120 if(aList.size() < 7) { 121 throw new ParserException( 122 "Line doesn't look like GFF", 123 locator, 124 lineNum, 125 line ); 126 } 127 128 String rest = null; 129 String comment = null; 130 if(st.hasMoreTokens()) { 131 try { 132 rest = st.nextToken(((char) 0) + ""); 133 } catch (NoSuchElementException nsee) { 134 } 135 } 136 if(rest != null) { 137 int ci = rest.indexOf("#"); 138 if (ci != -1) { 139 comment = rest.substring(ci); 140 rest = rest.substring(0, ci); 141 } 142 } 143 144 GFFRecord record = createRecord(handler, aList, rest, comment); 145 handler.recordLine(record); 146 } 147 } catch (ParserException ex) { 148 throw new ParserException(ex.getMessage(), 149 locator, 150 lineNum, 151 line); 152 } catch (IgnoreRecordException ex) { 153 // Silently skip any more work on this record 154 } 155 } 156 handler.endDocument(); 157 } 158 159 /** 160 * Actually turns a list of tokens, some value string and a comment into a 161 * <span class="type">GFFRecord</span> and informs 162 * <span class="arg">handler</span>. 163 * 164 * @param handler a <span class="type">GFFDocumentHandler</span> to inform of 165 * any parse errors, and the completed <span class="type">GFFRecord</span> 166 * @param aList a <span class="type">List</span> containing the 8 mandatory GFF columns 167 * @param rest a <span class="type">String</span> representing the unparsed 168 * attribute-value text, or <span class="kw">null</span> if there is none 169 * @param comment a <span class="type">String</span> containing the comment (without the 170 * leading '<code>#</code>' character. 171 * @throws <span class="type">BioException</span> if <span class="arg">handler</span> 172 * could not correct a parse error 173 */ 174 protected GFFRecord createRecord(GFFDocumentHandler handler, 175 List aList, 176 String rest, 177 String comment) 178 throws BioException, ParserException, IgnoreRecordException 179 { 180 SimpleGFFRecord record = new SimpleGFFRecord(); 181 182 record.setSeqName((String) aList.get(0)); 183 record.setSource((String) aList.get(1)); 184 record.setFeature((String) aList.get(2)); 185 186 int start = -1; 187 try { 188 start = Integer.parseInt( (String) aList.get(3)); 189 } catch (NumberFormatException nfe) { 190 start = errors.invalidStart((String) aList.get(3)); 191 } 192 record.setStart(start); 193 194 int end = -1; 195 try { 196 end = Integer.parseInt( (String) aList.get(4)); 197 } catch (NumberFormatException nfe) { 198 end = errors.invalidEnd((String) aList.get(3)); 199 } 200 record.setEnd(end); 201 202 String score = (String) aList.get(5); 203 if(score == null || 204 score.equals("") || 205 score.equals(".") || 206 score.equals("0") 207 ) 208 { 209 record.setScore(GFFTools.NO_SCORE); 210 } else { 211 double sc = 0.0; 212 try { 213 sc = Double.parseDouble(score); 214 } catch (NumberFormatException nfe) { 215 sc = errors.invalidScore(score); 216 } 217 record.setScore(sc); 218 } 219 220 String strand = (String) aList.get(6); 221 if(strand == null || strand.equals("") || strand.equals(".")) { 222 record.setStrand(StrandedFeature.UNKNOWN); 223 } else { 224 if(strand.equals("+")) { 225 record.setStrand(StrandedFeature.POSITIVE); 226 } else if(strand.equals("-")) { 227 record.setStrand(StrandedFeature.NEGATIVE); 228 } else { 229 record.setStrand(errors.invalidStrand(strand)); 230 } 231 } 232 233 String frame = (String) aList.get(7); 234 if(frame.equals(".")) { 235 record.setFrame(GFFTools.NO_FRAME); 236 } else { 237 int fr = 0; 238 try { 239 fr = Integer.parseInt(frame); 240 } catch (NumberFormatException nfe) { 241 fr = errors.invalidFrame(frame); 242 } 243 record.setFrame(fr); 244 } 245 246 if (rest != null) 247 record.setGroupAttributes(parseAttribute(rest)); 248 else 249 record.setGroupAttributes(new SmallMap()); 250 record.setComment(comment); 251 252 return record; 253 } 254 255 /** 256 * Parse <span class="arg">attValList</span> into a 257 * <span class="type">Map</span> of attributes and value lists. 258 * <p> 259 * The resulting <span class="type">Map</span> will have 260 * <span class="type">String</span> keys, with 261 * <span class="type">List</span> values. If there are no values 262 * associated with a key, then it will have an empty 263 * <span class="type">List</span>, not <span class="kw">null</span> as 264 * its value. 265 * 266 * @param attValList the <span class="type">String</span> to parse 267 * @return a <span class="type">Map</span> of parsed attributes and value lists 268 */ 269 270 protected Map parseAttribute(String attValList) { 271 Map attMap = new SmallMap(); 272 273 StringTokenizer sTok = new StringTokenizer(attValList, ";", false); 274 while(sTok.hasMoreTokens()) { 275 String attVal = sTok.nextToken().trim(); 276 String attName; 277 List valList = new ArrayList(); 278 int spaceIndx = attVal.indexOf(" "); 279 if(spaceIndx == -1) { 280 attName = attVal; 281 } else { 282 attName = attVal.substring(0, spaceIndx); 283 attValList = attVal.substring(spaceIndx).trim(); 284 while(attValList.length() > 0) { 285 if(attValList.startsWith("\"")) { 286 // System.out.println("Quoted"); 287 int quoteIndx = 0; 288 do { 289 quoteIndx++; 290 quoteIndx = attValList.indexOf("\"", quoteIndx); 291 } while(quoteIndx != -1 && attValList.charAt(quoteIndx-1) == '\\'); 292 if(quoteIndx > 0){ 293 valList.add(attValList.substring(1, quoteIndx)); 294 attValList = attValList.substring(quoteIndx+1).trim(); 295 }else{ 296 valList.add(attValList); 297 attValList = ""; 298 } 299 } else { 300 spaceIndx = attValList.indexOf(" "); 301 if(spaceIndx == -1) { 302 valList.add(attValList); 303 attValList = ""; 304 } else { 305 valList.add(attValList.substring(0, spaceIndx)); 306 attValList = attValList.substring(spaceIndx).trim(); 307 } 308 } 309 } 310 } 311 attMap.put(attName, valList); 312 } 313 314 return attMap; 315 } 316}