/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * Created on Jan 18, 2008
 *
 */

package org.biojava.nbio.ontology.obo;

import org.biojava.nbio.ontology.Synonym;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.*;


/** A class to parse the content of an OBO file. It delegates handling of the
 * content to the OBOFileEventListener implementation.
 *
 * This file contains parts of the OBO-Edit file OBOParseEngine, (particularly the encoding and decoding part)
 *
 * http://geneontology.cvs.sourceforge.net/geneontology/go-dev/java/oboedit/sources/org/geneontology/oboedit/dataadapter/OBOParseEngine.java?revision=1.10&view=markup
 * Thanks to the OboEdit developers for giving permission to release this in BioJava.
 *
 * <p>Instances keep per-parse state (the current line, the line number and a
 * shared scratch buffer) in fields, so one instance must not be used by
 * several threads at once.
 *
 * @author Andreas Prlic
 * @author John Day Richter
 * @since 1.6
 */
public class OboFileParser {

    private static final Logger logger = LoggerFactory.getLogger(OboFileParser.class);

    List<OboFileEventListener> listeners;

    /** The line currently being parsed by {@link #parseOBO(BufferedReader)}; used in error messages. */
    protected String line;
    /** 1-based number of {@link #line} within the current input; used in error messages. */
    protected int linenum = 0;
    protected int totalSize = 0;
    protected int bytesRead = 0;
    /** Scratch buffer handed out (emptied) by {@link #getTempBuffer()}. */
    protected StringBuffer tempBuffer = new StringBuffer();
    protected SimpleDateFormat dateFormat = new SimpleDateFormat("dd:MM:yyyy HH:mm", Locale.US);


    /** Maps the character following a backslash to the character it stands for. */
    protected static final Map<Character, Character> escapeChars =
        new HashMap<Character, Character>();

    /** Inverse of {@link #escapeChars}: maps a literal character to its escape code. */
    protected static final Map<Character, Character> unescapeChars =
        new HashMap<Character, Character>();

    static {
        escapeChars.put('n', '\n');
        escapeChars.put('W', ' ');
        escapeChars.put('t', '\t');
        escapeChars.put(':', ':');
        escapeChars.put(',', ',');
        escapeChars.put('"', '"');
        escapeChars.put('\'', '\'');
        escapeChars.put('\\', '\\');
        escapeChars.put('{', '{');
        escapeChars.put('}', '}');
        escapeChars.put('(', '(');
        escapeChars.put(')', ')');
        escapeChars.put('[', '[');
        escapeChars.put(']', ']');
        escapeChars.put('!', '!');
        // every decoded character is unique, so the table can be inverted directly
        for (Map.Entry<Character, Character> entry : escapeChars.entrySet()) {
            unescapeChars.put(entry.getValue(), entry.getKey());
        }
    }

    /**
     * A decoded string together with the position(s) in the source text that
     * delimit it. Which positions are filled in depends on the producing
     * method; unset positions are {@code -1}.
     */
    public static class SOPair {
        public String str = null;

        public int index = -1;

        public int endIndex = -1;

        public SOPair(String str, int index) {
            this(str, index, -1);
        }

        public SOPair(String str, int index, int endIndex) {
            this.str = str;
            this.index = index;
            this.endIndex = endIndex;
        }
    }


    public OboFileParser() {
        listeners = new ArrayList<OboFileEventListener>();
    }


    /** Registers a listener that is notified of stanzas, keys and synonyms.
     *
     * @param listener the listener to add
     */
    public void addOboFileEventListener(OboFileEventListener listener) {
        listeners.add(listener);
    }

    /** Returns the live list of registered listeners (not a copy).
     *
     * @return the registered listeners
     */
    public List<OboFileEventListener> getOboFileEventListener() {
        return listeners;
    }

    /** Parses an ontology file line by line and notifies the registered
     * listeners of every stanza header, synonym and other tag/value pair.
     *
     * <p>Empty lines, whitespace-only lines and lines that contain nothing
     * but a {@code !} comment are skipped.
     *
     * @param oboFile reader positioned at the start of the OBO content; it is
     *        read until end of input and is not closed by this method
     * @throws IOException if reading fails or a line is malformed
     */
    public void parseOBO(BufferedReader oboFile) throws IOException {

        // Use the fields (not shadowing locals) so that the error messages
        // produced by the helper methods can report the offending line.
        linenum = 0;

        while ((line = oboFile.readLine()) != null) {
            linenum++;
            if (line.length() == 0) {
                continue;
            }

            if (line.charAt(0) == '[') {
                if (line.charAt(line.length() - 1) != ']') {
                    throw new IOException("Unclosed stanza: \"" + line + "\"");
                }
                String stanzaname = line.substring(1, line.length() - 1);
                if (stanzaname.length() < 1) {
                    throw new IOException("Empty stanza: \"" + line + "\"");
                }
                triggerNewStanza(stanzaname);
            } else {
                parseTagValueLine(line);
            }
        }
    }

    /** Parses one non-stanza line of the form {@code tag: value {modifiers} ! comment}
     * and fires the matching listener event. Lines without content before the
     * comment are ignored.
     */
    private void parseTagValueLine(String content) throws IOException {
        // everything from the first unescaped, unquoted '!' onwards is a comment
        int lineEnd = findUnescaped(content, '!', 0, content.length(), true);
        if (lineEnd == -1) {
            lineEnd = content.length();
        }
        if (content.substring(0, lineEnd).trim().length() == 0) {
            // blank or comment-only line: nothing to report
            return;
        }

        SOPair pair = unescape(content, ':', 0, true);
        String name = pair.str;

        // Scan backwards from the end of the content: if the last
        // non-whitespace character is an unescaped '}', the line carries a
        // trailing modifier.
        int trailingStartIndex = -1;
        int trailingEndIndex = -1;
        for (int i = lineEnd - 1; i >= 0; i--) {
            char c = content.charAt(i);
            if (Character.isWhitespace(c)) {
                continue;
            }
            if (c == '}') {
                if (i >= 1 && content.charAt(i - 1) == '\\') {
                    continue;
                }
                trailingEndIndex = i;
            }
            break;
        }

        if (trailingEndIndex != -1) {
            // NOTE(review): there is deliberately no break here, so the
            // LEFTMOST unescaped '{' wins. That mis-parses values that
            // themselves contain an unescaped '{' - confirm intended.
            for (int i = trailingEndIndex - 1; i >= 0; i--) {
                if (content.charAt(i) == '{') {
                    if (i >= 1 && content.charAt(i - 1) == '\\') {
                        continue;
                    }
                    trailingStartIndex = i + 1;
                }
            }
        }

        int valueStopIndex;
        if (trailingStartIndex == -1 && trailingEndIndex != -1) {
            throw new IOException("Unterminated trailing modifier. " + content);
        } else if (trailingStartIndex != -1) {
            valueStopIndex = trailingStartIndex - 1;
            String trailing = content.substring(trailingStartIndex,
                    trailingEndIndex).trim();
            // The modifier is parsed (and thereby validated) but is currently
            // not forwarded to the listeners.
            getNestedValue(new NestedValue(), trailing, 0);
        } else {
            valueStopIndex = lineEnd;
        }

        int valueStartIndex = pair.index + 1;
        if (valueStopIndex < valueStartIndex) {
            // the only ':' sits inside the comment or the trailing modifier
            throw new IOException("Expected : before comment or trailing modifier. " + content);
        }
        String value = content.substring(valueStartIndex, valueStopIndex).trim();
        if (value.length() == 0) {
            throw new IOException("Tag found with no value " + content);
        }

        if (isSynonym(name)) {
            triggerNewSynonym(parseSynonym(name, value));
        } else {
            triggerNewKey(name, value);
        }
    }

    /** Tells whether the tag is reported through {@code newSynonym} rather than {@code newKey}. */
    private boolean isSynonym(String key) {
        // NOTE(review): parseSynonym also understands the BROAD_SYNONYM and
        // NARROW_SYNONYM tags, but they are not routed there - confirm whether
        // that is intended before widening this check.
        return key.equals(OboFileHandler.SYNONYM) || key.equals(OboFileHandler.EXACT_SYNONYM);
    }

    /** parse the Synonym String from the Term.
     * value can be:
     * <pre>"ca_bind" RELATED [uniprot:curation]</pre>
     * @param key the tag name; selects the default scope
     * @param value the text after the tag's colon
     * @return the parsed synonym (name, scope and optional category)
     * @throws IOException if the quoted name, the scope or the dbxref list is malformed
     */
    private Synonym parseSynonym(String key, String value) throws IOException {

        int startIndex = findUnescaped(value, '"', 0, value.length());
        if (startIndex == -1) {
            throw new IOException("Expected \"" + line + " " + linenum);
        }
        SOPair p = unescape(value, '"', startIndex + 1, value.length(),
                true);
        int defIndex = findUnescaped(value, '[', p.index, value.length());
        if (defIndex == -1) {
            throw new IOException("Badly formatted synonym. "
                    + "No dbxref list found." + line + " " + linenum);
        }
        // between the closing quote and '[': optional scope, then optional category
        String leftovers = value.substring(p.index + 1, defIndex).trim();
        StringTokenizer tokenizer = new StringTokenizer(leftovers, " \t");

        // default scope comes from the tag name; an explicit scope token overrides it
        int scope = Synonym.RELATED_SYNONYM;
        if (key.equals(OboFileHandler.EXACT_SYNONYM)) {
            scope = Synonym.EXACT_SYNONYM;
        } else if (key.equals(OboFileHandler.BROAD_SYNONYM)) {
            scope = Synonym.BROAD_SYNONYM;
        } else if (key.equals(OboFileHandler.NARROW_SYNONYM)) {
            scope = Synonym.NARROW_SYNONYM;
        }

        String catID = null;
        for (int i = 0; tokenizer.hasMoreTokens(); i++) {
            String token = tokenizer.nextToken();
            if (i == 0) {
                if (token.equals("RELATED")) {
                    scope = Synonym.RELATED_SYNONYM;
                } else if (token.equals("UNSPECIFIED")) {
                    scope = Synonym.RELATED_SYNONYM;
                } else if (token.equals("EXACT")) {
                    scope = Synonym.EXACT_SYNONYM;
                } else if (token.equals("BROAD")) {
                    scope = Synonym.BROAD_SYNONYM;
                } else if (token.equals("NARROW")) {
                    scope = Synonym.NARROW_SYNONYM;
                } else {
                    throw new IOException("Found unexpected scope "
                            + "identifier " + token + line);
                }
            } else if (i == 1) {
                catID = token;
            } else {
                throw new IOException("Expected dbxref list,"
                        + " instead found " + token + line);
            }
        }

        Synonym synonym = new Synonym();
        synonym.setScope(scope);
        synonym.setCategory(catID);
        synonym.setName(p.str);

        // The dbxref list is parsed so that malformed lists are rejected.
        // TODO: store the parsed xrefs ("xref", "desc", "nv" entries) in the synonym.
        getDbxrefList(value, defIndex + 1, value.length());

        return synonym;
    }

    /** Parses a comma separated dbxref list that is terminated by {@code ]}.
     *
     * @param line the text containing the list
     * @param startoffset index of the first character after the opening {@code [}
     * @param endoffset index at which scanning stops (exclusive)
     * @return one map per xref with the keys {@code "xref"}, {@code "desc"}
     *         (may map to {@code null}) and, if a trailing modifier was
     *         present, {@code "nv"}
     * @throws IOException if the list is not terminated or an entry is malformed
     */
    protected Map<String,Object>[] getDbxrefList(String line, int startoffset, int endoffset) throws IOException {
        List<Map<String,Object>> temp = new ArrayList<Map<String,Object>>();
        boolean stop = false;
        while (!stop) {
            int braceIndex = findUnescaped(line, '{', startoffset, endoffset);
            int endIndex = findUnescaped(line, ',', startoffset, endoffset,
                    true);
            boolean trailing = false;
            if (endIndex == -1) {
                // no further comma: this is the last entry, it must end at ']'
                endIndex = findUnescaped(line, ']', startoffset, endoffset,
                        true);
                if (endIndex == -1) {
                    throw new IOException("Unterminated xref list " + line);
                }
                stop = true;
            }
            if (braceIndex != -1 && braceIndex < endIndex) {
                // the entry carries a trailing modifier; the xref ends at '{'
                endIndex = braceIndex;
                trailing = true;
            }

            Map<String, Object> pair = parseXref(line,
                    startoffset,
                    endIndex);
            if (pair == null) {
                // empty entry: step forward one character and retry
                startoffset++;
                continue;
            }
            if (trailing) {
                NestedValue nv = new NestedValue();
                endIndex = getNestedValue(nv, line, endIndex + 1);
                // NOTE(review): the visible getNestedValue never returns -1,
                // so this check only guards against overriding subclasses.
                if (endIndex == -1) {
                    throw new IOException("Badly formatted "
                            + "trailing properties " + line);
                }
                pair.put("nv", nv);
            }

            temp.add(pair);
            startoffset = endIndex + 1;
        }
        // generic array creation is not possible; only Map<String,Object>
        // instances are stored, so the unchecked conversion is safe
        @SuppressWarnings("unchecked")
        Map<String,Object>[] out = new HashMap[temp.size()];
        return temp.toArray(out);
    }

    /** Parses a single xref entry, optionally followed by a quoted description.
     *
     * @param line the text containing the entry
     * @param startoffset first index of the entry
     * @param endoffset index at which the entry ends (exclusive)
     * @return a map with the keys {@code "xref"} and {@code "desc"}
     *         ({@code "desc"} maps to {@code null} when there is no
     *         description), or {@code null} if the xref text is empty
     * @throws IOException if an escape sequence is invalid or the description is not closed
     */
    protected Map<String,Object> parseXref(String line,
            int startoffset, int endoffset) throws IOException {
        SOPair xref = unescape(line, '"', startoffset, endoffset, false);
        String xrefStr = xref.str.trim();
        if (xrefStr.length() == 0) {
            return null;
        }

        String descStr = null;
        if (xref.index != -1) {
            // an opening quote was found: the description runs to the closing quote
            SOPair desc = unescape(line, '"', xref.index + 1, endoffset, true);
            descStr = desc.str.trim();
        }

        Map<String, Object> m = new HashMap<String, Object>();
        m.put("xref", xrefStr);
        m.put("desc", descStr);
        return m;
    }


    private void triggerNewStanza(String stanza) {
        for (OboFileEventListener li : listeners) {
            li.newStanza(stanza);
        }
    }

    private void triggerNewKey(String key, String value) {
        for (OboFileEventListener li : listeners) {
            li.newKey(key, value);
        }
    }

    private void triggerNewSynonym(Synonym synonym) {
        for (OboFileEventListener li : listeners) {
            li.newSynonym(synonym);
        }
    }

    /** Replaces every character that has an escape code by a backslash
     * followed by that code.
     *
     * @param str the text to encode
     * @param escapespaces if {@code false}, spaces and tabs are copied unchanged
     * @return the encoded text
     */
    public static String escape(String str, boolean escapespaces) {
        StringBuilder out = new StringBuilder();
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            Character code = unescapeChars.get(c);
            boolean keepLiteral = !escapespaces && (c == ' ' || c == '\t');
            if (code == null || keepLiteral) {
                out.append(c);
            } else {
                out.append('\\').append(code.charValue());
            }
        }
        return out.toString();
    }

    /** Decodes all escape sequences in {@code str}; decoding stops early at an
     * unescaped NUL character.
     *
     * @param str the text to decode
     * @return the decoded text
     * @throws IOException if an escape sequence is unknown or incomplete
     */
    public String unescape(String str) throws IOException {
        return unescape(str, '\0', 0, str.length(), false).str;
    }

    /** Same as {@link #unescape(String, char, int, int, boolean)} with
     * {@code endindex = str.length()}.
     */
    public SOPair unescape(String str, char toChar, int startindex,
            boolean mustFindChar) throws IOException {
        return unescape(str, toChar, startindex, str.length(), mustFindChar);
    }

    /** Decodes escape sequences from {@code startindex} up to the first
     * unescaped occurrence of {@code toChar}, or up to {@code endindex}.
     *
     * @param str the text to decode
     * @param toChar the terminating character
     * @param startindex first index to read
     * @param endindex index at which scanning stops (exclusive)
     * @param mustFindChar whether a missing {@code toChar} is an error
     * @return the decoded text and, in {@code index}, the position of
     *         {@code toChar} ({@code -1} if it was not found)
     * @throws IOException if an escape sequence is unknown or incomplete, or
     *         if {@code toChar} is required but missing
     */
    public SOPair unescape(String str, char toChar, int startindex,
            int endindex, boolean mustFindChar) throws IOException {
        StringBuilder out = new StringBuilder();
        int endValue = -1;
        for (int i = startindex; i < endindex; i++) {
            char c = str.charAt(i);
            if (c == '\\') {
                i++;
                if (i >= str.length()) {
                    // lone backslash at the end of the text
                    throw new IOException("Incomplete escape sequence. " + str);
                }
                c = str.charAt(i);
                Character mapchar = escapeChars.get(c);
                if (mapchar == null) {
                    throw new IOException("Unrecognized escape"
                            + " character " + c + " found.");
                }
                out.append(mapchar.charValue());
            } else if (c == toChar) {
                endValue = i;
                break;
            } else {
                out.append(c);
            }
        }
        if (endValue == -1 && mustFindChar) {
            throw new IOException("Expected " + toChar + "." + str);
        }
        return new SOPair(out.toString(), endValue);
    }


    /** Finds the first unescaped {@code toChar} anywhere in {@code str}; quotes are not honoured. */
    public static int findUnescaped(String str, char toChar) {
        return findUnescaped(str, toChar, 0, str.length());
    }

    /** Finds the first unescaped {@code toChar} in the given range; quotes are not honoured. */
    public static int findUnescaped(String str, char toChar, int startIndex,
            int endIndex) {
        return findUnescaped(str, toChar, startIndex, endIndex, false);
    }

    /** Finds the first occurrence of {@code toChar} that is not preceded by a
     * backslash.
     *
     * @param str the text to search
     * @param toChar the character to look for
     * @param startindex first index to inspect
     * @param endindex index at which the search stops (exclusive)
     * @param honorQuotes if {@code true}, text between double quotes is skipped
     * @return the index of the match, or {@code -1} if there is none
     */
    public static int findUnescaped(String str, char toChar, int startindex,
            int endindex, boolean honorQuotes) {
        boolean inQuotes = false;
        char quoteChar = '\0';
        for (int i = startindex; i < endindex; i++) {
            char c = str.charAt(i);
            if (c == '\\') {
                // skip the escaped character, whatever it is
                i++;
            } else if (inQuotes) {
                if (c == quoteChar) {
                    inQuotes = false;
                }
            } else if (c == toChar) {
                return i;
            } else if (honorQuotes && isQuote(c)) {
                inQuotes = true;
                quoteChar = c;
            }
        }
        return -1;
    }

    public static boolean isEscapeStarter(char c) {
        return c == '\\';
    }

    public static boolean isQuote(char c) {
        return c == '"';
    }

    /** Empties and returns the shared scratch buffer. The same instance is
     * returned on every call, so earlier contents are lost.
     */
    protected StringBuffer getTempBuffer() {
        tempBuffer.delete(0, tempBuffer.length());
        return tempBuffer;
    }

    /** Reads a value that is either enclosed in double quotes or, if it does
     * not start with a quote, runs up to {@code terminatingChar}. A backslash
     * makes the following character literal.
     *
     * @param value the text to read from
     * @param startIndex first index to inspect; leading whitespace is skipped
     * @param stopIndex index at which reading stops (exclusive)
     * @param terminatingChar delimiter that ends an unquoted value
     * @param requireQuotes whether an unquoted value is an error
     * @param legalEndOfLine whether an unquoted value may run to {@code stopIndex}
     * @return the value; {@code index} is {@code startIndex} and
     *         {@code endIndex} is the closing quote (quoted), the character
     *         before the delimiter (unquoted) or {@code stopIndex}
     *         (unquoted, no delimiter). Unquoted values are trimmed.
     * @throws IOException if quotes are required but missing, an escape
     *         sequence is incomplete, or the value is not terminated
     */
    protected SOPair readQuotedString(String value, int startIndex,
            int stopIndex, char terminatingChar, boolean requireQuotes,
            boolean legalEndOfLine) throws IOException {

        char quoteChar = '\0';
        StringBuffer out = getTempBuffer();
        int i = startIndex;
        boolean useQuotes = false;

        for (; i < stopIndex; i++) {
            char c = value.charAt(i);
            // burn through any leading whitespace
            if (Character.isWhitespace(c)) {
                continue;
            }
            if (isQuote(c)) {
                useQuotes = true;
                quoteChar = c;
                i++;
            } else if (requireQuotes) {
                // the first non-whitespace character is not a quote
                throw new IOException(
                        "Expected start of quoted string. " +
                        line + " " + value+ " at linenr " + linenum);
            }
            break;
        }

        // look for a closing quote or final delimiter
        for (; i < stopIndex; i++) {
            char c = value.charAt(i);
            if (isEscapeStarter(c)) {
                i++;
                if (i >= value.length()) {
                    throw new IOException("Incomplete escape sequence. " + line);
                }
                out.append(value.charAt(i));
            } else if (useQuotes && c == quoteChar) {
                return new SOPair(out.toString(), startIndex, i);
            } else if (!useQuotes && c == terminatingChar) {
                return new SOPair(out.toString().trim(), startIndex, i - 1);
            } else {
                out.append(c);
            }
        }
        if (!useQuotes && legalEndOfLine) {
            return new SOPair(out.toString().trim(), startIndex, i);
        }
        throw new IOException("Unterminated quoted string. " + line);
    }

    /** Parses comma separated {@code name=value} pairs from {@code str},
     * starting at {@code startIndex} and running to the end of the string,
     * and adds them to {@code nv}.
     *
     * @param nv receives the parsed properties
     * @param str the text containing the pairs
     * @param startIndex index of the first pair
     * @return {@code str.length()}
     * @throws IOException if a pair lacks {@code =}, a value is malformed or
     *         two pairs are not separated by a comma
     */
    protected int getNestedValue(NestedValue nv, String str, int startIndex)
            throws IOException {
        while (startIndex < str.length()) {
            int equalsIndex = findUnescaped(str, '=', startIndex, str.length());
            if (equalsIndex == -1) {
                throw new IOException("Expected = in trailing modifier " + line);
            }
            String name = str.substring(startIndex, equalsIndex).trim();
            SOPair value = readQuotedString(str, equalsIndex + 1, str.length(),
                    ',', false, true);

            Properties pv = new Properties();
            pv.setProperty(unescape(name), value.str);
            nv.addPropertyValue(pv);

            // skip whitespace and the separating comma before the next pair
            startIndex = value.endIndex + 1;
            for (; startIndex < str.length(); startIndex++) {
                char c = str.charAt(startIndex);
                if (Character.isWhitespace(c)) {
                    continue;
                }
                if (c == ',') {
                    startIndex++;
                    break;
                }
                logger.error("found character |{}|", c);
                throw new IOException("Expected comma in trailing modifier. " +
                        line + " linenr: " + linenum);
            }
        }
        return str.length();
    }

}

/**
 * Holds the {@code name=value} properties of a trailing modifier.
 */
class NestedValue implements Cloneable {

    protected Properties propertyValues = new Properties();
    protected String name;
    protected String suggestedComment;

    public NestedValue() {
    }

    @Override
    public String toString() {
        StringBuilder txt = new StringBuilder("NestedValue: ");
        for (Map.Entry<Object, Object> entry : propertyValues.entrySet()) {
            txt.append(" [").append(entry.getKey()).append(":")
                    .append(entry.getValue()).append("]");
        }
        return txt.toString();
    }

    public String getName() {
        return name;
    }

    /** Returns the live property table (not a copy). */
    public Properties getPropertyValues() {
        return propertyValues;
    }

    /** Copies every entry of {@code pv} into this value, replacing entries
     * that have the same key. Keys and values are stored as strings.
     *
     * @param pv the properties to add
     */
    public void addPropertyValue(Properties pv) {
        for (Map.Entry<Object, Object> entry : pv.entrySet()) {
            propertyValues.setProperty(entry.getKey().toString(),
                    entry.getValue().toString());
        }
    }

    /** Returns a copy of this value with its own property table, so that
     * changes to the copy do not affect this instance.
     */
    @Override
    public Object clone() {
        try {
            NestedValue copy = (NestedValue) super.clone();
            copy.propertyValues = new Properties();
            copy.propertyValues.putAll(propertyValues);
            return copy;
        } catch (CloneNotSupportedException ex) {
            // cannot happen: this class implements Cloneable
            throw new AssertionError(ex);
        }
    }

    public String getSuggestedComment() {
        return suggestedComment;
    }

    public void setSuggestedComment(String suggestedComment) {
        this.suggestedComment = suggestedComment;
    }
}