/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * Created on Jan 18, 2008
 *
 */

package org.biojava.ontology.obo;

import java.io.BufferedReader;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.Vector;

import org.biojava.bio.seq.io.ParseException;
import org.biojava.ontology.Synonym;


/** A class to parse the content of an OBO file. It delegates handling of the
 * content to the OboFileEventListener implementations registered with it.
 *
 * This file contains parts of the OBO-Edit file OBOParseEngine, (particularly the encoding and decoding part)
 *
 * http://geneontology.cvs.sourceforge.net/geneontology/go-dev/java/oboedit/sources/org/geneontology/oboedit/dataadapter/OBOParseEngine.java?revision=1.10&view=markup
 * Thanks to the OboEdit developers for giving permission to release this in BioJava.
 *
 * <p>The parser keeps the line currently being parsed, its number and a
 * scratch buffer in instance fields, so one instance should not be used
 * for several parses at the same time.
 *
 * @author Andreas Prlic
 * @author John Day Richter
 * @since 1.6
 */
public class OboFileParser {

    /** Listeners that are notified about stanzas, keys and synonyms. */
    List<OboFileEventListener> listeners;

    /** The line currently being parsed; used to build error messages. */
    protected String line;
    /** 1-based number of the line currently being parsed (0 before parsing). */
    protected int linenum = 0;
    protected int totalSize = 0;
    protected int bytesRead = 0;
    /** Scratch buffer handed out (emptied) by {@link #getTempBuffer()}. */
    protected StringBuffer tempBuffer = new StringBuffer();
    protected SimpleDateFormat dateFormat = new SimpleDateFormat("dd:MM:yyyy HH:mm", Locale.US);


    /** Maps the character following a backslash to the character it stands for. */
    protected static final Map<Character, Character> escapeChars =
        new HashMap<Character, Character>();

    /** Inverse of {@link #escapeChars}: maps a literal character to its escape code. */
    protected static final Map<Character, Character> unescapeChars =
        new HashMap<Character, Character>();

    static {
        escapeChars.put('n', '\n');
        escapeChars.put('W', ' ');
        escapeChars.put('t', '\t');
        escapeChars.put(':', ':');
        escapeChars.put(',', ',');
        escapeChars.put('"', '"');
        escapeChars.put('\'', '\'');
        escapeChars.put('\\', '\\');
        escapeChars.put('{', '{');
        escapeChars.put('}', '}');
        escapeChars.put('(', '(');
        escapeChars.put(')', ')');
        escapeChars.put('[', '[');
        escapeChars.put(']', ']');
        escapeChars.put('!', '!');
        // build the reverse lookup used by escape()
        for (Map.Entry<Character, Character> entry : escapeChars.entrySet()) {
            unescapeChars.put(entry.getValue(), entry.getKey());
        }
    }

    /** A string together with the position(s) in the source text it was read from.
     * An index of -1 means "not set" / "not found".
     */
    public static class SOPair {
        public String str = null;

        public int index = -1;

        public int endIndex = -1;

        public SOPair(String str, int index) {
            this(str, index, -1);
        }

        public SOPair(String str, int index, int endIndex) {
            this.str = str;
            this.index = index;
            this.endIndex = endIndex;
        }

    }

    /** Creates a parser without any registered listeners. */
    public OboFileParser() {
        listeners = new ArrayList<OboFileEventListener>();
    }

    /** Registers a listener that will receive the parse events.
     *
     * @param listener the listener to add
     */
    public void addOboFileEventListener(OboFileEventListener listener) {
        listeners.add(listener);
    }

    /** Returns the list of registered listeners.
     *
     * @return the internal (live) listener list
     */
    public List<OboFileEventListener> getOboFileEventListener() {
        return listeners;
    }

    /** Parses an ontology file line by line and notifies the registered listeners.
     *
     * Lines of the form <code>[name]</code> start a new stanza, every other
     * non-blank line is treated as a <code>tag: value</code> line. Blank lines
     * and lines that consist only of a <code>!</code> comment are skipped.
     *
     * @param oboFile reader providing the OBO content; it is not closed by this method
     * @throws IOException if reading from <code>oboFile</code> fails
     * @throws ParseException if a line is not well formed
     */
    public void parseOBO(BufferedReader oboFile) throws IOException, ParseException {

        // Track the current line and its number in the fields, so that the
        // error messages built by the helper methods can report them.
        linenum = 0;

        while ((line = oboFile.readLine()) != null) {
            linenum++;

            String trimmed = line.trim();
            // nothing to parse on blank or comment-only lines
            if (trimmed.length() == 0 || trimmed.charAt(0) == '!') {
                continue;
            }

            if (line.charAt(0) == '[') {
                if (line.charAt(line.length() - 1) != ']') {
                    throw new ParseException("Unclosed stanza: \"" + line + "\"" );
                }
                String stanzaname = line.substring(1, line.length() - 1);
                if (stanzaname.length() < 1) {
                    throw new ParseException("Empty stanza: \"" +line+"\"");
                }
                triggerNewStanza(stanzaname);
            } else {
                parseContentLine(line);
            }
        }
    }

    /** Parses one <code>tag: value {trailing modifier} ! comment</code> line and fires the matching event. */
    private void parseContentLine(String contentLine) throws ParseException {
        // the tag name is everything up to the first unescaped colon
        SOPair pair = unescape(contentLine, ':', 0, true);
        String name = pair.str;

        // an unescaped, unquoted '!' starts a comment that ends the content
        int lineEnd = findUnescaped(contentLine, '!', 0, contentLine.length(), true);
        if (lineEnd == -1) {
            lineEnd = contentLine.length();
        }

        int trailingStartIndex = -1;
        int trailingEndIndex = -1;

        // If the last non-whitespace character of the content is an
        // unescaped closing brace, the line carries a trailing modifier.
        for (int i = lineEnd - 1; i >= 0; i--) {
            char c = contentLine.charAt(i);
            if (Character.isWhitespace(c)) {
                continue;
            }
            if (c == '}') {
                if (i >= 1 && contentLine.charAt(i - 1) == '\\') {
                    continue;
                }
                trailingEndIndex = i;
            }
            break;
        }

        if (trailingEndIndex != -1) {
            // NOTE(review): the scan does not stop at the first hit, so the
            // leftmost unescaped '{' is taken as the start of the modifier.
            for (int i = trailingEndIndex - 1; i >= 0; i--) {
                if (contentLine.charAt(i) == '{') {
                    if (i >= 1 && contentLine.charAt(i - 1) == '\\') {
                        continue;
                    }
                    trailingStartIndex = i + 1;
                }
            }
        }

        int valueStopIndex;
        if (trailingStartIndex == -1 && trailingEndIndex != -1) {
            throw new ParseException("Unterminated trailing modifier. " + contentLine);
        } else if (trailingStartIndex != -1) {
            valueStopIndex = trailingStartIndex - 1;
            String trailing = contentLine.substring(trailingStartIndex,
                    trailingEndIndex).trim();
            // The modifier is parsed so that malformed input is reported;
            // its content is currently not forwarded to the listeners.
            NestedValue nv = new NestedValue();
            getNestedValue(nv, trailing, 0);
        } else {
            valueStopIndex = lineEnd;
        }

        int valueStartIndex = pair.index + 1;
        // the colon may lie behind the end of the value (e.g. inside the
        // comment); report this as a parse error instead of failing in substring
        if (valueStartIndex > valueStopIndex) {
            throw new ParseException("Tag found with no value "+ contentLine);
        }

        String value = contentLine.substring(valueStartIndex, valueStopIndex).trim();
        if (value.length() == 0) {
            throw new ParseException("Tag found with no value "+ contentLine);
        }

        if (isSynonym(name)) {
            Synonym synonym = parseSynonym(name, value);
            triggerNewSynonym(synonym);
        } else {
            triggerNewKey(name, value);
        }
    }

    /** Tells whether a tag is reported as a synonym event rather than a plain key.
     *
     * NOTE(review): only SYNONYM and EXACT_SYNONYM are recognised here, although
     * parseSynonym also knows the BROAD and NARROW keys - confirm this is intended.
     */
    private boolean isSynonym(String key) {
        return key.equals(OboFileHandler.SYNONYM)
            || key.equals(OboFileHandler.EXACT_SYNONYM);
    }

    /** Parses the synonym value of a term.
     * value can be:
     * <pre>"ca_bind" RELATED [uniprot:curation]</pre>
     *
     * @param key the tag the value belongs to; it determines the default scope
     * @param value the text after the tag, consisting of the quoted name, an
     *        optional scope, an optional category and a dbxref list
     * @return a Synonym carrying the name, scope and category found in value
     * @throws ParseException if the value is not formatted as described
     */
    private Synonym parseSynonym(String key, String value) throws ParseException {

        int startIndex = findUnescaped(value, '"', 0, value.length());
        if (startIndex == -1) {
            throw new ParseException("Expected \"" + line + " " + linenum);
        }
        SOPair p = unescape(value, '"', startIndex + 1, value.length(),
                true);
        int defIndex = findUnescaped(value, '[', p.index, value.length());
        if (defIndex == -1) {
            throw new ParseException("Badly formatted synonym. "
                    + "No dbxref list found." + line + " " + linenum );
        }
        // what is between the closing quote and the dbxref list: scope and category
        String leftovers = value.substring(p.index + 1, defIndex).trim();
        StringTokenizer tokenizer = new StringTokenizer(leftovers, " \t");

        // the key gives the default scope, an explicit scope token overrides it
        int scope = Synonym.RELATED_SYNONYM;
        if (key.equals(OboFileHandler.EXACT_SYNONYM)) {
            scope = Synonym.EXACT_SYNONYM;
        } else if (key.equals(OboFileHandler.BROAD_SYNONYM)) {
            scope = Synonym.BROAD_SYNONYM;
        } else if (key.equals(OboFileHandler.NARROW_SYNONYM)) {
            scope = Synonym.NARROW_SYNONYM;
        }

        String catID = null;
        for (int i = 0; tokenizer.hasMoreTokens(); i++) {
            String token = tokenizer.nextToken();
            if (i == 0) {
                if (token.equals("RELATED")) {
                    scope = Synonym.RELATED_SYNONYM;
                } else if (token.equals("UNSPECIFIED")) {
                    scope = Synonym.RELATED_SYNONYM;
                } else if (token.equals("EXACT")) {
                    scope = Synonym.EXACT_SYNONYM;
                } else if (token.equals("BROAD")) {
                    scope = Synonym.BROAD_SYNONYM;
                } else if (token.equals("NARROW")) {
                    scope = Synonym.NARROW_SYNONYM;
                } else {
                    throw new ParseException("Found unexpected scope "
                            + "identifier " + token + line);
                }
            } else if (i == 1) {
                catID = token;
            } else {
                throw new ParseException("Expected dbxref list,"
                        + " instead found " + token + line );
            }
        }

        Synonym synonym = new Synonym();
        synonym.setScope(scope);
        synonym.setCategory(catID);
        synonym.setName(p.str);

        // Parse the dbxref list so that malformed lists are reported.
        // TODO: the parsed references ("xref", "desc", "nv") are not yet
        // stored in the synonym.
        getDbxrefList(value, defIndex + 1, value.length());

        return synonym;
    }

    /** Parses a comma separated dbxref list that is terminated by <code>]</code>.
     *
     * @param line the text containing the list
     * @param startoffset index of the first character after the opening bracket
     * @param endoffset index at which to stop looking (exclusive)
     * @return one map per non-empty reference, with the keys "xref", "desc"
     *         and, if the reference has trailing properties, "nv"
     * @throws ParseException if the list is not terminated or a reference is malformed
     */
    protected Map<String,Object>[] getDbxrefList(String line, int startoffset, int endoffset) throws ParseException {
        List<Map<String,Object>> temp = new ArrayList<Map<String,Object>>();
        boolean stop = false;
        while (!stop) {
            int braceIndex = findUnescaped(line, '{', startoffset, endoffset);
            int endIndex = findUnescaped(line, ',', startoffset, endoffset,
                    true);
            boolean trailing = false;
            if (endIndex == -1) {
                // no further comma: this is the last reference of the list
                endIndex = findUnescaped(line, ']', startoffset, endoffset,
                        true);
                if (endIndex == -1) {
                    throw new ParseException("Unterminated xref list " + line);
                }
                stop = true;
            }
            if (braceIndex != -1 && braceIndex < endIndex) {
                // the reference is followed by trailing properties in braces
                endIndex = braceIndex;
                trailing = true;
            }

            Map<String, Object> pair = parseXref(line,
                    startoffset,
                    endIndex);
            if (pair == null) {
                // empty reference, move on by one character
                startoffset++;
                continue;
            }
            if (trailing) {
                NestedValue nv = new NestedValue();
                endIndex = getNestedValue(nv, line, endIndex + 1);
                if (endIndex == -1) {
                    throw new ParseException("Badly formatted "
                            + "trailing properties " + line);
                }
                pair.put("nv",nv);
            }

            temp.add(pair);
            startoffset = endIndex + 1;
        }
        // generic arrays cannot be created directly; the array only ever
        // holds the Map<String,Object> instances collected above
        @SuppressWarnings({"unchecked", "rawtypes"})
        Map<String,Object>[] out = new HashMap[temp.size()];
        return temp.toArray(out);
    }

    /** Parses a single reference of a dbxref list.
     *
     * @param line the text containing the reference
     * @param startoffset index of the first character of the reference
     * @param endoffset index of the first character after the reference
     * @return a map with the keys "xref" and "desc" (the description may be
     *         null), or null if the reference is empty
     * @throws ParseException if a description is started but not closed by a quote
     */
    protected Map<String,Object> parseXref(String line,
            int startoffset, int endoffset) throws ParseException {
        String desc_str = null;

        // the reference id ends at the opening quote of the optional description
        SOPair xref = unescape(line, '"', startoffset, endoffset, false);
        String xref_str = xref.str.trim();
        if (xref_str.length() == 0) {
            return null;
        }

        if (xref.index != -1) {
            SOPair desc = unescape(line, '"', xref.index + 1, endoffset, true);
            desc_str = desc.str.trim();
        }

        Map<String, Object> m = new HashMap<String, Object>();
        m.put("xref",xref_str);
        m.put("desc",desc_str);
        return m;
    }

    /** Notifies all listeners about the start of a new stanza. */
    private void triggerNewStanza(String stanza) {
        for (OboFileEventListener li : listeners) {
            li.newStanza(stanza);
        }
    }

    /** Notifies all listeners about a key/value pair. */
    private void triggerNewKey(String key, String value) {
        for (OboFileEventListener li : listeners) {
            li.newKey(key, value);
        }
    }

    /** Notifies all listeners about a parsed synonym. */
    private void triggerNewSynonym(Synonym synonym) {
        for (OboFileEventListener li : listeners) {
            li.newSynonym(synonym);
        }
    }

    /** Replaces every character that has an escape code by a backslash
     * followed by that code.
     *
     * @param str the text to escape
     * @param escapespaces if false, spaces and tabs are copied unchanged
     * @return the escaped text
     */
    public static String escape(String str, boolean escapespaces) {
        StringBuilder out = new StringBuilder();
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            Character code = unescapeChars.get(c);
            boolean keepLiteral = code == null
                || (!escapespaces && (c == ' ' || c == '\t'));
            if (keepLiteral) {
                out.append(c);
            } else {
                out.append('\\').append(code.charValue());
            }
        }
        return out.toString();
    }

    /** Resolves the escape sequences of a complete string.
     *
     * @param str the text to unescape
     * @return the unescaped text
     * @throws ParseException if an unknown or incomplete escape sequence is found
     */
    public String unescape(String str) throws ParseException {
        return unescape(str, '\0', 0, str.length(), false).str;
    }

    /** Same as {@link #unescape(String, char, int, int, boolean)} with the end
     * of the string as end index.
     */
    public SOPair unescape(String str, char toChar, int startindex,
            boolean mustFindChar) throws ParseException {
        return unescape(str, toChar, startindex, str.length(), mustFindChar);
    }

    /** Resolves escape sequences up to the first unescaped occurrence of a character.
     *
     * @param str the text to read from
     * @param toChar the character at which to stop
     * @param startindex index of the first character to read
     * @param endindex index at which to stop reading (exclusive)
     * @param mustFindChar if true, not finding toChar is an error
     * @return the unescaped text before toChar, with the index of toChar
     *         (-1 if it was not found)
     * @throws ParseException if an escape sequence is unknown or incomplete, or
     *         if toChar is required but missing
     */
    public SOPair unescape(String str, char toChar, int startindex,
            int endindex, boolean mustFindChar) throws ParseException {
        StringBuilder out = new StringBuilder();
        int endValue = -1;
        for (int i = startindex; i < endindex; i++) {
            char c = str.charAt(i);
            if (c == '\\') {
                i++;
                // a backslash as last character has nothing to escape
                if (i >= str.length()) {
                    throw new ParseException("Incomplete escape sequence. " + str);
                }
                c = str.charAt(i);
                Character mapchar = escapeChars.get(c);
                if (mapchar == null) {
                    throw new ParseException("Unrecognized escape"
                            + " character " + c + " found.");
                }
                out.append(mapchar.charValue());
            } else if (c == toChar) {
                endValue = i;
                break;
            } else {
                out.append(c);
            }
        }
        if (endValue == -1 && mustFindChar) {
            throw new ParseException("Expected " + toChar + "." + str);
        }
        return new SOPair(out.toString(), endValue);
    }


    /** Finds the first unescaped occurrence of a character in the whole string.
     *
     * @return the index of the character or -1
     */
    public static int findUnescaped(String str, char toChar) {
        return findUnescaped(str, toChar, 0, str.length());
    }

    /** Finds the first unescaped occurrence of a character in a range,
     * without treating quoted sections specially.
     *
     * @return the index of the character or -1
     */
    public static int findUnescaped(String str, char toChar, int startIndex,
            int endIndex) {
        return findUnescaped(str, toChar, startIndex, endIndex, false);
    }

    /** Finds the first occurrence of a character that is not preceded by a backslash.
     *
     * @param str the text to search
     * @param toChar the character to look for
     * @param startindex index of the first character to inspect
     * @param endindex index at which to stop (exclusive)
     * @param honorQuotes if true, occurrences inside double quotes are ignored
     * @return the index of the character or -1 if it was not found
     */
    public static int findUnescaped(String str, char toChar, int startindex,
            int endindex, boolean honorQuotes) {
        boolean inQuotes = false;
        char quoteChar = '\0';
        for (int i = startindex; i < endindex; i++) {
            char c = str.charAt(i);
            if (c == '\\') {
                // skip the escaped character
                i++;
            } else if (inQuotes) {
                if (c == quoteChar) {
                    inQuotes = false;
                }
            } else if (c == toChar) {
                return i;
            } else if (honorQuotes && isQuote(c)) {
                inQuotes = true;
                quoteChar = c;
            }
        }
        return -1;
    }

    /** @return true if c is the backslash that starts an escape sequence */
    public static boolean isEscapeStarter(char c) {
        return c == '\\';
    }

    /** @return true if c is a double quote */
    public static boolean isQuote(char c) {
        return c == '"';
    }

    /** Empties the shared scratch buffer and returns it.
     *
     * @return the (now empty) temp buffer of this parser
     */
    protected StringBuffer getTempBuffer() {
        tempBuffer.delete(0, tempBuffer.length());
        return tempBuffer;
    }

    /** Reads a value that is either enclosed in double quotes or ends at a
     * terminating character.
     *
     * @param value the text to read from
     * @param startIndex index at which to start; leading whitespace is skipped
     * @param stopIndex index at which to stop reading (exclusive)
     * @param terminatingChar character that ends an unquoted value
     * @param requireQuotes if true, an unquoted value is an error
     * @param legalEndOfLine if true, an unquoted value may also end at stopIndex
     * @return the value (escaped characters are copied without the backslash);
     *         <code>index</code> is startIndex and <code>endIndex</code> is the
     *         position of the closing quote, or the position before the
     *         terminating character, or the position where reading stopped
     * @throws ParseException if the value is not terminated or quotes are missing
     */
    protected SOPair readQuotedString(String value, int startIndex,
            int stopIndex, char terminatingChar, boolean requireQuotes,
            boolean legalEndOfLine) throws ParseException {

        char quoteChar = '\0';
        StringBuffer out = getTempBuffer();
        int i = startIndex;
        boolean useQuotes = false;

        for (; i < stopIndex; i++) {
            char c = value.charAt(i);
            // burn through any leading whitespace
            if (Character.isWhitespace(c)) {
                continue;
            }
            if (isQuote(c)) {
                useQuotes = true;
                quoteChar = c;
                // continue behind the opening quote
                i++;
                break;
            }
            // if the first non-whitespace character is not a quote,
            // proceed in non-quoted mode
            if (requireQuotes) {
                throw new ParseException(
                        "Expected start of quoted string. " +
                        line + " " + value+ " at linenr " + linenum);
            }
            break;
        }

        // look for a closing quote or final delimiter
        for (; i < stopIndex; i++) {
            char c = value.charAt(i);
            if (isEscapeStarter(c)) {
                i++;
                if (i >= value.length()) {
                    throw new ParseException("Incomplete escape sequence. " + line);
                }
                out.append(value.charAt(i));
            } else if (useQuotes && c == quoteChar) {
                return new SOPair(out.toString(), startIndex, i);
            } else if (!useQuotes && c == terminatingChar) {
                return new SOPair(out.toString().trim(), startIndex, i - 1);
            } else {
                out.append(c);
            }
        }
        if (!useQuotes && legalEndOfLine) {
            return new SOPair(out.toString().trim(), startIndex, i);
        }
        throw new ParseException("Unterminated quoted string. " +line);
    }

    /** Parses comma separated <code>name=value</code> pairs and adds them to a NestedValue.
     *
     * @param nv receives the parsed properties
     * @param str the text containing the pairs; it is read up to its end
     * @param startIndex index of the first character to read
     * @return the length of str
     * @throws ParseException if a pair has no '=' or the pairs are not separated by commas
     */
    protected int getNestedValue(NestedValue nv, String str, int startIndex)
    throws ParseException {
        while (startIndex < str.length()) {
            int equalsIndex = findUnescaped(str, '=', startIndex, str.length());
            if (equalsIndex == -1) {
                throw new ParseException("Expected = in trailing modifier " +line);
            }
            String name = str.substring(startIndex, equalsIndex).trim();
            SOPair value = readQuotedString(str, equalsIndex + 1, str.length(),
                    ',', false, true);

            Properties pv = new Properties();
            pv.setProperty(unescape(name),value.str);

            nv.addPropertyValue(pv);

            // skip whitespace and the separating comma before the next pair
            startIndex = value.endIndex + 1;
            for (; startIndex < str.length(); startIndex++) {
                char c = str.charAt(startIndex);
                if (Character.isWhitespace(c)) {
                    continue;
                }
                if (c == ',') {
                    startIndex++;
                    break;
                }
                System.err.println("found character |"
                        + str.charAt(startIndex) + "|");
                throw new ParseException("Expected comma in trailing modifier. " +
                        line + " linenr: " + linenum);
            }
        }
        return str.length();
    }

}

/** Holds the name/value properties of a trailing modifier or of the
 * trailing properties of a dbxref.
 */
class NestedValue implements Cloneable {

    /**
     *
     */
    private static final long serialVersionUID = -7529450225162773796L;
    protected Properties propertyValues = new Properties();
    protected String name;
    protected String suggestedComment;

    public NestedValue() {
    }

    /** Lists all properties as <code>[key:value]</code>. */
    @Override
    public String toString() {
        StringBuilder txt = new StringBuilder("NestedValue: ");
        for (Map.Entry<Object, Object> entry : propertyValues.entrySet()) {
            txt.append(" [").append(entry.getKey().toString())
               .append(":").append(entry.getValue().toString()).append("]");
        }
        return txt.toString();
    }

    public String getName() {
        return name;
    }

    /** @return the internal (live) properties of this value */
    public Properties getPropertyValues() {
        return propertyValues;
    }

    /** Copies all entries of pv into this value; existing keys are overwritten.
     *
     * @param pv the properties to add
     */
    public void addPropertyValue(Properties pv) {
        for (Map.Entry<Object, Object> entry : pv.entrySet()) {
            propertyValues.setProperty(entry.getKey().toString(),
                    entry.getValue().toString());
        }
    }

    /** Creates a copy of this value that has its own set of properties.
     *
     * @return the copy
     */
    @Override
    public Object clone() {
        try {
            NestedValue copy = (NestedValue) super.clone();
            // do not share the mutable properties with the copy
            copy.propertyValues = (Properties) propertyValues.clone();
            return copy;
        } catch (CloneNotSupportedException ex) {
            // cannot happen, the class implements Cloneable
            return null;
        }
    }

    public String getSuggestedComment() {
        return suggestedComment;
    }

    public void setSuggestedComment(String suggestedComment) {
        this.suggestedComment = suggestedComment;
    }
}