/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */

package org.biojava.bio.program.gff3;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.StringTokenizer;

import org.biojava.bio.Annotation;
import org.biojava.bio.BioError;
import org.biojava.bio.BioException;
import org.biojava.bio.program.gff.GFFErrorHandler;
import org.biojava.bio.program.gff.GFFTools;
import org.biojava.bio.program.gff.IgnoreRecordException;
import org.biojava.bio.seq.StrandedFeature;
import org.biojava.ontology.AlreadyExistsException;
import org.biojava.ontology.OntoTools;
import org.biojava.ontology.Ontology;
import org.biojava.ontology.OntologyException;
import org.biojava.ontology.Term;
import org.biojava.utils.ChangeVetoException;
import org.biojava.utils.ParserException;

/**
 * Parse a stream of GFF text into a stream of records and comments.
 *
 * <p>
 * Developed from {@link org.biojava.bio.program.gff.GFFParser GFFParser}.
 * </p>
 *
 * <p>
 * Each input line is classified as a comment line (starts with
 * '<code>#</code>'), an empty line (skipped) or a record line. Record lines
 * are split on tab characters into the mandatory columns plus an optional
 * attribute string, turned into a <code>GFF3Record</code> and passed to the
 * <code>GFF3DocumentHandler</code>.
 * </p>
 *
 * @author Matthew Pocock
 */
public class GFF3Parser {
  /** Number of mandatory tab-separated columns that precede the attributes. */
  private static final int MANDATORY_COLUMNS = 8;

  /** Message used when a source/type term unexpectedly already exists. */
  private static final String TERM_EXISTS_MESSAGE
    = "Assertion Failure: Term should not yet exist";

  /** Handler consulted whenever a column can not be parsed. */
  private GFFErrorHandler errors = GFFErrorHandler.ABORT_PARSING;

  /**
   * Set the error handler used by this parser.
   *
   * @param errors the <code>GFFErrorHandler</code> to consult when a column
   *        can not be parsed
   */
  public void setErrorHandler(GFFErrorHandler errors) {
    this.errors = errors;
  }

  /**
   * Find the error handler used by this parser.
   *
   * @return the current <code>GFFErrorHandler</code>; this is
   *         <code>GFFErrorHandler.ABORT_PARSING</code> unless it was replaced
   *         through {@link #setErrorHandler(GFFErrorHandler)}
   */
  public GFFErrorHandler getErrorHandler() {
    return errors;
  }

  /**
   * Informs <code>handler</code> of each line of GFF read from
   * <code>bReader</code>. This form of the method should only be used if no
   * locator string is available for the resource being parsed; the locator
   * "unknown:" is used instead.
   *
   * @param bReader the <code>BufferedReader</code> to parse
   * @param handler the <code>GFF3DocumentHandler</code> that will
   *        listen for 'stuff'
   * @param ontology an Ontology that all terms should come from
   *
   * @throws IOException if for any reason <code>bReader</code> throws one
   * @throws BioException if <code>handler</code> can not correct a parse error
   * @throws ParserException if a line can not be parsed
   */
  public void parse(
    BufferedReader bReader,
    GFF3DocumentHandler handler,
    Ontology ontology
  )
  throws IOException, BioException, ParserException
  {
    parse(bReader, handler, ontology, "unknown:");
  }

  /**
   * Informs <code>handler</code> of each line of GFF read from
   * <code>bReader</code>.
   *
   * <p>
   * Terms that can not be found in <code>ontology</code> are looked up in, or
   * added to, a fallback ontology that is created for this call and named
   * after <code>locator</code>.
   * </p>
   *
   * @param bReader the <code>BufferedReader</code> to parse
   * @param handler the <code>GFF3DocumentHandler</code> that will
   *        listen for 'stuff'
   * @param ontology an Ontology that all terms should come from
   * @param locator a string identifying the resource being parsed; passed to
   *        <code>handler.startDocument</code> and used in error reports
   *
   * @throws IOException if for any reason <code>bReader</code> throws one
   * @throws BioException if <code>handler</code> can not correct a parse error
   * @throws ParserException if the fallback ontology can not be created, or
   *         if a line can not be parsed (in which case the exception carries
   *         the locator, line number and offending line)
   */
  public void parse(
    BufferedReader bReader,
    GFF3DocumentHandler handler,
    Ontology ontology,
    String locator
  )
  throws IOException, BioException, ParserException
  {
    Ontology fallBack;
    try {
      fallBack = OntoTools.getDefaultFactory().createOntology(
        "Unknown in " + locator,
        ""
      );
    } catch (OntologyException ex) {
      throw new ParserException("Couldn't create fallback ontology", ex);
    }

    handler.startDocument(locator);
    List aList = new ArrayList();
    int lineNum = 0;
    for (String line = bReader.readLine(); line != null; line = bReader.readLine()) {
      ++lineNum;

      try {
        aList.clear();
        if (line.startsWith("#")) {
          handler.commentLine(line.substring(1));
        } else if (line.length() > 0) {
          // Empty lines are skipped; everything else is treated as a record.
          StringTokenizer st = new StringTokenizer(line, "\t", false);
          while (st.hasMoreTokens() && aList.size() < MANDATORY_COLUMNS) {
            aList.add(st.nextToken());
          }

          String rest = null;
          String comment = null;
          if (st.hasMoreTokens()) {
            try {
              // Switching the delimiter to NUL makes the tokenizer return the
              // remainder of the line as a single token.
              rest = st.nextToken(String.valueOf((char) 0));
            } catch (NoSuchElementException nsee) {
              // Nothing usable after the mandatory columns: no attributes.
            }
          }
          if (rest != null) {
            // Anything from the first '#' onwards is a trailing comment.
            int ci = rest.indexOf("#");
            if (ci != -1) {
              comment = rest.substring(ci);
              rest = rest.substring(0, ci);
            }
          }

          GFF3Record record
            = createRecord(handler, aList, rest, comment, ontology, fallBack);
          handler.recordLine(record);
        }
      } catch (ParserException ex) {
        // Re-throw with the position of the offending line attached.
        throw new ParserException(ex, "",
                                  locator,
                                  lineNum,
                                  line);
      } catch (IgnoreRecordException ex) {
        // Silently skip any more work on this record
      }
    }
    handler.endDocument();
  }

  /**
   * Actually turns a list of tokens, some value string and a comment into a
   * <code>GFF3Record</code>.
   *
   * <p>
   * Columns that can not be parsed (start, end, score, strand, phase) are
   * reported to the current error handler, whose return value is used in
   * place of the unparsable text.
   * </p>
   *
   * @param handler a <code>GFF3DocumentHandler</code>; not used by this
   *        implementation
   * @param aList a <code>List</code> containing the 8 mandatory GFF columns
   *        as <code>String</code>s
   * @param rest a <code>String</code> representing the unparsed
   *        attribute-value text, or <code>null</code> if there is none
   * @param comment a <code>String</code> containing the trailing comment as
   *        split off by the caller (when called from <code>parse</code> this
   *        still includes the leading '<code>#</code>' character), or
   *        <code>null</code>; not used by this implementation
   * @param ontology the Ontology to resolve Terms in
   * @param fallBack the Ontology in which Terms missing from
   *        <code>ontology</code> are looked up or created
   * @return the populated record
   * @throws BioException if the annotation of the record can not be populated
   * @throws ParserException if <code>aList</code> holds fewer than 8 columns,
   *         or if the error handler decides to abort
   * @throws IgnoreRecordException if the error handler decides that this
   *         record should be skipped
   */
  protected GFF3Record createRecord(
    GFF3DocumentHandler handler,
    List aList,
    String rest,
    String comment,
    Ontology ontology,
    Ontology fallBack
  )
  throws BioException, ParserException, IgnoreRecordException
  {
    // Fail with a meaningful parse error rather than an
    // IndexOutOfBoundsException when the line is truncated.
    if (aList.size() < MANDATORY_COLUMNS) {
      throw new ParserException(
        "Expected " + MANDATORY_COLUMNS
        + " tab-separated columns but found " + aList.size()
      );
    }

    GFF3Record.Impl record = new GFF3Record.Impl();

    record.setSequenceID((String) aList.get(0));
    record.setSource(
      resolveTerm((String) aList.get(1), ontology, fallBack, TERM_EXISTS_MESSAGE)
    );
    record.setType(
      resolveTerm((String) aList.get(2), ontology, fallBack, TERM_EXISTS_MESSAGE)
    );

    String startText = (String) aList.get(3);
    int start;
    try {
      start = Integer.parseInt(startText);
    } catch (NumberFormatException nfe) {
      start = errors.invalidStart(startText);
    }
    record.setStart(start);

    String endText = (String) aList.get(4);
    int end;
    try {
      end = Integer.parseInt(endText);
    } catch (NumberFormatException nfe) {
      // Report the END column itself (previously the START text was passed).
      end = errors.invalidEnd(endText);
    }
    record.setEnd(end);

    String score = (String) aList.get(5);
    // NOTE(review): a literal "0" is treated as "no score", which looks like
    // legacy behaviour; kept as-is because callers may rely on it.
    if (
      score == null ||
      score.equals("") ||
      score.equals(".") ||
      score.equals("0")
    ) {
      record.setScore(GFFTools.NO_SCORE);
    } else {
      double sc;
      try {
        sc = Double.parseDouble(score);
      } catch (NumberFormatException nfe) {
        sc = errors.invalidScore(score);
      }
      record.setScore(sc);
    }

    String strand = (String) aList.get(6);
    if (strand == null || strand.equals("") || strand.equals(".")) {
      record.setStrand(StrandedFeature.UNKNOWN);
    } else if (strand.equals("+")) {
      record.setStrand(StrandedFeature.POSITIVE);
    } else if (strand.equals("-")) {
      record.setStrand(StrandedFeature.NEGATIVE);
    } else {
      record.setStrand(errors.invalidStrand(strand));
    }

    String frame = (String) aList.get(7);
    if (frame.equals(".")) {
      record.setPhase(GFFTools.NO_FRAME);
    } else {
      int fr;
      try {
        fr = Integer.parseInt(frame);
      } catch (NumberFormatException nfe) {
        fr = errors.invalidFrame(frame);
      }
      record.setPhase(fr);
    }

    if (rest != null) {
      try {
        parseAttribute(rest, record.getAnnotation(), ontology, fallBack);
      } catch (ChangeVetoException cve) {
        throw new BioException("Unable to populate annotations", cve);
      }
    }

    return record;
  }

  /**
   * Parse <code>attValList</code> and store each attribute in
   * <code>anno</code>.
   *
   * <p>
   * The text is split on '<code>;</code>' into attributes. Each attribute is
   * either a bare name or <code>name=values</code>, where the values are
   * separated by '<code>,</code>' and may be enclosed in double quotes (a
   * quote preceded by a backslash does not end the value). The attribute is
   * stored under an Ontology Term key with a <code>List</code> of
   * <code>String</code> values, which is empty for a bare name. Attributes
   * that are empty after trimming are ignored.
   * </p>
   *
   * @param attValList the <code>String</code> to parse
   * @param anno the <code>Annotation</code> to populate
   * @param onto the Ontology to resolve attribute names in
   * @param fallBack the Ontology in which names missing from
   *        <code>onto</code> are looked up or created
   * @throws ChangeVetoException if <code>anno</code> refuses a property
   */
  protected void parseAttribute(String attValList, Annotation anno, Ontology onto, Ontology fallBack)
  throws ChangeVetoException {
    StringTokenizer sTok = new StringTokenizer(attValList, ";", false);
    while (sTok.hasMoreTokens()) {
      String attVal = sTok.nextToken().trim();
      if (attVal.length() == 0) {
        // Whitespace-only token (for example the tab left in front of a
        // trailing comment): there is no attribute to record.
        continue;
      }

      String attName;
      List valList = new ArrayList();
      int equalsIndx = attVal.indexOf("=");
      if (equalsIndx == -1) {
        attName = attVal;
      } else {
        attName = attVal.substring(0, equalsIndx);
        parseValues(attVal.substring(equalsIndx + 1).trim(), valList);
      }

      Term key = resolveTerm(
        attName, onto, fallBack,
        "Assertion Failure: Term should not be there yet"
      );
      anno.setProperty(key, valList);
    }
  }

  /**
   * Split the comma-separated, optionally double-quoted value text of one
   * attribute and append each value to <code>valList</code>.
   */
  private static void parseValues(String values, List valList) {
    while (values.length() > 0) {
      if (values.startsWith("\"")) {
        // Find the closing quote, stepping over backslash-escaped quotes.
        int quoteIndx = 0;
        do {
          quoteIndx++;
          quoteIndx = values.indexOf("\"", quoteIndx);
        } while (quoteIndx != -1 && values.charAt(quoteIndx - 1) == '\\');

        if (quoteIndx > 0) {
          valList.add(values.substring(1, quoteIndx));
          values = values.substring(quoteIndx + 1).trim();
          // Consume the separator that follows the quoted value so that it
          // is not mistaken for an empty unquoted value.
          if (values.startsWith(",")) {
            values = values.substring(1).trim();
          }
        } else {
          // Unterminated quote: keep the remaining text verbatim.
          valList.add(values);
          values = "";
        }
      } else {
        int commaIndx = values.indexOf(",");
        if (commaIndx == -1) {
          valList.add(values);
          values = "";
        } else {
          valList.add(values.substring(0, commaIndx));
          values = values.substring(commaIndx + 1).trim();
        }
      }
    }
  }

  /**
   * Look <code>name</code> up in <code>ontology</code>, then in
   * <code>fallBack</code>, creating it in <code>fallBack</code> (with an
   * empty description) if neither contains it.
   */
  private static Term resolveTerm(
    String name,
    Ontology ontology,
    Ontology fallBack,
    String alreadyExistsMessage
  ) {
    if (ontology.containsTerm(name)) {
      return ontology.getTerm(name);
    }
    if (fallBack.containsTerm(name)) {
      return fallBack.getTerm(name);
    }
    try {
      return fallBack.createTerm(name, "");
    } catch (AlreadyExistsException te) {
      throw new BioError(alreadyExistsMessage, te);
    } catch (ChangeVetoException cve) {
      throw new BioError("Assertion Failure: Unable to create term", cve);
    }
  }
}