001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on Jan 18, 2008 021 * 022 */ 023 024package org.biojava.nbio.ontology.obo; 025 026import org.biojava.nbio.ontology.Synonym; 027import org.slf4j.Logger; 028import org.slf4j.LoggerFactory; 029 030import java.io.BufferedReader; 031import java.io.IOException; 032import java.text.SimpleDateFormat; 033import java.util.*; 034 035 036/** 037 * A class to parse the content of an OBO file. It delegates handling of the 038 * content to the OBOFileEventListener implementation. 039 * 040 * This file contains parts of the OBO-Edit file OBOParseEngine, (particularly the encoding and decoding part) 041 * 042 * See <a href="http://geneontology.cvs.sourceforge.net/geneontology/go-dev/java/oboedit/sources/org/geneontology/oboedit/dataadapter/OBOParseEngine.java?revision=1.10&view=markup">link</a> 043 * Thanks to the OboEdit developers for giving permission to release this in BioJava. 044 * 045 * 046 * @author Andreas Prlic 047 * @author John Day Richter 048 * @since 1.6 049 */ 050public class OboFileParser { 051 052 private static final Logger logger = LoggerFactory.getLogger(OboFileParser.class); 053 054 List<OboFileEventListener> listeners; 055 056 protected String line; 057 protected int linenum = 0; 058 protected int totalSize = 0; 059 protected int bytesRead = 0; 060 protected StringBuffer tempBuffer = new StringBuffer(); 061 protected SimpleDateFormat dateFormat = new SimpleDateFormat("dd:MM:yyyy HH:mm", Locale.US); 062 063 064 protected static final Map<Character, Character> escapeChars = 065 new HashMap<>(); 066 067 protected static final Map<Character, Character> unescapeChars = 068 new HashMap<>(); 069 070 static { 071 escapeChars.put('n', '\n'); 072 escapeChars.put('W', ' '); 073 escapeChars.put('t', '\t'); 074 escapeChars.put(':', ':'); 075 escapeChars.put(',', ','); 076 escapeChars.put('"', '"'); 077 escapeChars.put('\'', '\''); 078 escapeChars.put('\\', '\\'); 079 escapeChars.put('{', '{'); 080 escapeChars.put('}', '}'); 081 escapeChars.put('(', '('); 082 escapeChars.put(')', ')'); 083 escapeChars.put('[', '['); 084 escapeChars.put(']', ']'); 085 escapeChars.put('!', '!'); 086 Iterator <Character> it = escapeChars.keySet().iterator(); 087 while (it.hasNext()) { 088 Character key = it.next(); 089 Character value = escapeChars.get(key); 090 unescapeChars.put(value, key); 091 } 092 } 093 094 public static class SOPair { 095 public String str = null; 096 097 public int index = -1; 098 099 public int endIndex = -1; 100 101 public SOPair(String str, int index) { 102 this(str, index, -1); 103 } 104 105 public SOPair(String str, int index, int endIndex) { 106 this.str = str; 107 this.index = index; 108 this.endIndex = endIndex; 109 } 110 111 112 } 113 114 115 116 117 public OboFileParser(){ 118 listeners = new ArrayList<>(); 119 } 120 121 122 123 public void addOboFileEventListener(OboFileEventListener listener){ 124 listeners.add(listener); 125 } 126 127 public List<OboFileEventListener> getOboFileEventListener(){ 128 return listeners; 129 } 130 131 /** parse an ontology file 132 * 133 * @param oboFile 134 * @throws IOException 135 * @throws IOException 136 */ 137 public void parseOBO(BufferedReader oboFile) throws IOException{ 138 139 String line; 140 String currentStanza; 141 142 while ((line = oboFile.readLine()) != null) { 143 if (line.length() == 0) 144 continue; 145 146 if ( line.charAt(0) == '[') { 147 if (line.charAt(line.length() - 1) != ']') 148 throw new IOException("Unclosed stanza: \"" + line + "\"" ); 149 String stanzaname = line.substring(1, line.length() - 1); 150 if (stanzaname.length() < 1) 151 throw new IOException("Empty stanza: \"" +line+"\""); 152 currentStanza = stanzaname; 153 154 //logger.info("stanza: {}", currentStanza); 155 triggerNewStanza(currentStanza); 156 157 } else { 158 // a content line 159 SOPair pair; 160 161 pair = unescape(line, ':', 0, true); 162 163 //logger.info(pair); 164 String name = pair.str; 165 int lineEnd = findUnescaped(line, '!', 0, line.length(), true); 166 if (lineEnd == -1) 167 lineEnd = line.length(); 168 169 // find nested values 170 NestedValue nv = null; 171 172 int trailingStartIndex = -1; 173 int trailingEndIndex = -1; 174 for (int i = lineEnd - 1; i >= 0; i--) { 175 if (Character.isWhitespace(line.charAt(i))) { 176 // keep going until we see non-whitespace 177 } else if (line.charAt(i) == '}') { 178 // if the first thing we see is a closing brace, 179 // we have a trailing modifier 180 if (i >= 1 && line.charAt(i - 1) == '\\') 181 continue; 182 trailingEndIndex = i; 183 break; 184 } else 185 break; 186 } 187 188 if (trailingEndIndex != -1) { 189 for (int i = trailingEndIndex - 1; i >= 0; i--) { 190 if (line.charAt(i) == '{') { 191 if (i >= 1 && line.charAt(i - 1) == '\\') 192 continue; 193 trailingStartIndex = i + 1; 194 } 195 } 196 } 197 198 int valueStopIndex; 199 if (trailingStartIndex == -1 && trailingEndIndex != -1) 200 throw new IOException("Unterminated trailing modifier. " + line); 201 else if (trailingStartIndex != -1) { 202 valueStopIndex = trailingStartIndex - 1; 203 String trailing = line.substring(trailingStartIndex, 204 trailingEndIndex).trim(); 205 nv = new NestedValue(); 206 getNestedValue(nv, trailing, 0); 207 } else 208 valueStopIndex = lineEnd; 209 210 String value = line.substring(pair.index + 1, valueStopIndex).trim(); 211 /* 212 * if (nv != null) logger.warn("nv = "+nv+", value = 213 * |"+value+"|"); 214 */ 215 if (value.length() == 0) 216 throw new IOException("Tag found with no value "+ line); 217 218 if ( isSynonym(name)){ 219 Synonym synonym = parseSynonym(name,value); 220 triggerNewSynonym(synonym); 221 } else { 222 //logger.info("new key:" + name + " " + value); 223 triggerNewKey(name,value); 224 } 225 //logger.info("parsed key: " + name +" value: " + value + " nv: " + nv); 226 227 228 229 } 230 } 231 } 232 233 private boolean isSynonym(String key){ 234 if ( key.equals(OboFileHandler.SYNONYM) || key.equals(OboFileHandler.EXACT_SYNONYM)) 235 return true; 236 return false; 237 } 238 239 /** parse the Synonym String from the Term. 240 * value can be: 241 * <pre>"ca_bind" RELATED [uniprot:curation]</pre> 242 * @param value 243 * @return the synonym text 244 */ 245 private Synonym parseSynonym(String key, String value) throws IOException{ 246 247 //logger.info("PARSE SYNONYM " + key + " " + value); 248 int startIndex = findUnescaped(value, '"', 0, value.length()); 249 if (startIndex == -1) 250 throw new IOException("Expected \"" + line + " " + linenum); 251 SOPair p = unescape(value, '"', startIndex + 1, value.length(), 252 true); 253 int defIndex = findUnescaped(value, '[', p.index, value.length()); 254 if (defIndex == -1) { 255 throw new IOException("Badly formatted synonym. " 256 + "No dbxref list found." + line + " " + linenum ); 257 } 258 String leftovers = value.substring(p.index + 1, defIndex).trim(); 259 StringTokenizer tokenizer = new StringTokenizer(leftovers, " \t"); 260 int scope = Synonym.RELATED_SYNONYM; 261 262 if ( key.equals(OboFileHandler.EXACT_SYNONYM)) 263 scope = Synonym.EXACT_SYNONYM; 264 else if ( key.equals(OboFileHandler.BROAD_SYNONYM)) 265 scope = Synonym.BROAD_SYNONYM; 266 else if ( key.equals(OboFileHandler.NARROW_SYNONYM)) 267 scope = Synonym.NARROW_SYNONYM; 268 269 270 String catID = null; 271 for (int i = 0; tokenizer.hasMoreTokens(); i++) { 272 String token = tokenizer.nextToken(); 273 //logger.info("TOKEN:" +token); 274 if (i == 0) { 275 if ("RELATED".equals(token)) 276 scope = Synonym.RELATED_SYNONYM; 277 else if ("UNSPECIFIED".equals(token)) 278 scope = Synonym.RELATED_SYNONYM; 279 else if ("EXACT".equals(token)) 280 scope = Synonym.EXACT_SYNONYM; 281 else if ("BROAD".equals(token)) 282 scope = Synonym.BROAD_SYNONYM; 283 else if ("NARROW".equals(token)) 284 scope = Synonym.NARROW_SYNONYM; 285 else 286 throw new IOException("Found unexpected scope " 287 + "identifier " + token + line); 288 } else if (i == 1) { 289 catID = token; 290 } else 291 throw new IOException("Expected dbxref list," 292 + " instead found " + token + line ); 293 } 294 295 Synonym synonym = new Synonym(); 296 synonym.setScope(scope); 297 synonym.setCategory(catID); 298 synonym.setName(p.str); 299 //logger.info("SYNONYM: " + p.str +" " + synonym.getCategory() + " " + synonym.getScope()); 300 301 Map<String,Object>[] refs = getDbxrefList(value,defIndex + 1, value.length()); 302 303 // set the refs in the synonym 304 for (Map<String, Object> ref : refs){ 305 @SuppressWarnings("unused") 306 String xref = (String) ref.get("xref"); 307 @SuppressWarnings("unused") 308 String desc = (String) ref.get("desc"); 309 //logger.info(xref + " " + desc); 310 @SuppressWarnings("unused") 311 NestedValue nv = (NestedValue) ref.get("nv"); 312 //TODO: add implementation for this... 313 } 314 315 316 return synonym; 317 } 318 319 protected Map<String,Object>[] getDbxrefList(String line, int startoffset, int endoffset) throws IOException { 320 Vector<Map<String,Object>> temp = new Vector<>(); 321 boolean stop = false; 322 while (!stop) { 323 int braceIndex = findUnescaped(line, '{', startoffset, endoffset); 324 int endIndex = findUnescaped(line, ',', startoffset, endoffset, 325 true); 326 boolean trailing = false; 327 if (endIndex == -1) { 328 endIndex = findUnescaped(line, ']', startoffset, endoffset, 329 true); 330 if (endIndex == -1) { 331 throw new IOException("Unterminated xref list " + line); 332 } 333 stop = true; 334 } 335 if (braceIndex != -1 && braceIndex < endIndex) { 336 endIndex = braceIndex; 337 trailing = true; 338 } 339 340 Map<String, Object> pair = parseXref(line, 341 startoffset, 342 endIndex); 343 if (pair == null) { 344 startoffset++; 345 continue; 346 } 347 NestedValue nv = null; 348 if (trailing) { 349 nv = new NestedValue(); 350 endIndex = getNestedValue(nv, line, endIndex + 1); 351 if (endIndex == -1) { 352 throw new IOException("Badly formatted " 353 + "trailing properties " + line); 354 } 355 pair.put("nv",nv); 356 } 357 358 temp.add(pair); 359 startoffset = endIndex + 1; 360 } 361 Map<String,Object>[] out = new HashMap[temp.size()]; 362 for (int i = 0; i < temp.size(); i++) { 363 Map<String, Object> pair = temp.get(i); 364 out[i] = pair; 365 } 366 return out; 367 } 368 369 protected Map<String,Object> parseXref(String line, 370 int startoffset, int endoffset) throws IOException { 371 String xref_str = null; 372 String desc_str = null; 373 374 SOPair xref = unescape(line, '"', startoffset, endoffset, false); 375 xref_str = xref.str.trim(); 376 if (xref_str.length() == 0) 377 return null; 378 379 if (xref.index != -1) { 380 SOPair desc = unescape(line, '"', xref.index + 1, endoffset, true); 381 desc_str = desc.str.trim(); 382 } 383 384 385 Map<String, Object> m = new HashMap<>(); 386 m.put("xref",xref_str); 387 m.put("desc",desc_str); 388 return m; 389 } 390 391 392 393 private void triggerNewStanza(String stanza){ 394 Iterator<OboFileEventListener> iter = listeners.iterator(); 395 while (iter.hasNext()){ 396 OboFileEventListener li = iter.next(); 397 li.newStanza(stanza); 398 } 399 } 400 401 private void triggerNewKey(String key, String value){ 402 Iterator<OboFileEventListener> iter = listeners.iterator(); 403 while (iter.hasNext()){ 404 OboFileEventListener li = iter.next(); 405 li.newKey(key, value); 406 } 407 } 408 409 private void triggerNewSynonym(Synonym synonym){ 410 Iterator<OboFileEventListener> iter = listeners.iterator(); 411 while (iter.hasNext()){ 412 OboFileEventListener li = iter.next(); 413 li.newSynonym(synonym); 414 } 415 } 416 417 public static String escape(String str, boolean escapespaces) { 418 StringBuffer out = new StringBuffer(); 419 for (int i = 0; i < str.length(); i++) { 420 char c = str.charAt(i); 421 Object o = unescapeChars.get(c); 422 if (o == null) 423 out.append(c); 424 else { 425 if (escapespaces || (!escapespaces && c != ' ' && c != '\t')) { 426 out.append("\\").append(o); 427 } else 428 out.append(c); 429 } 430 } 431 return out.toString(); 432 } 433 434 public String unescape(String str) throws IOException { 435 return unescape(str, '\0', 0, str.length(), false).str; 436 } 437 438 public SOPair unescape(String str, char toChar, int startindex, 439 boolean mustFindChar) throws IOException { 440 return unescape(str, toChar, startindex, str.length(), mustFindChar); 441 } 442 443 public SOPair unescape(String str, char toChar, int startindex, 444 int endindex, boolean mustFindChar) throws IOException { 445 StringBuffer out = new StringBuffer(); 446 int endValue = -1; 447 for (int i = startindex; i < endindex; i++) { 448 char c = str.charAt(i); 449 if (c == '\\') { 450 i++; 451 c = str.charAt(i); 452 Character mapchar = escapeChars 453 .get(c); 454 if (mapchar == null) 455 throw new IOException("Unrecognized escape" 456 + " character " + c + " found."); 457 out.append(mapchar); 458 } else if (c == toChar) { 459 endValue = i; 460 break; 461 } else { 462 out.append(c); 463 } 464 } 465 if (endValue == -1 && mustFindChar) { 466 throw new IOException("Expected " + toChar + "." + str); 467 } 468 return new SOPair(out.toString(), endValue); 469 } 470 471 472 public static int findUnescaped(String str, char toChar) { 473 return findUnescaped(str, toChar, 0, str.length()); 474 } 475 476 public static int findUnescaped(String str, char toChar, int startIndex, 477 int endIndex) { 478 return findUnescaped(str, toChar, startIndex, endIndex, false); 479 } 480 481 public static int findUnescaped(String str, char toChar, int startindex, 482 int endindex, boolean honorQuotes) { 483 boolean inQuotes = false; 484 char quoteChar = '\0'; 485 for (int i = startindex; i < endindex; i++) { 486 char c = str.charAt(i); 487 if (c == '\\') { 488 i++; 489 continue; 490 } else if (inQuotes) { 491 if (c == quoteChar) 492 inQuotes = false; 493 continue; 494 495 } else if (c == toChar) { 496 return i; 497 } else if (honorQuotes && isQuote(c)) { 498 inQuotes = true; 499 quoteChar = c; 500 } 501 } 502 return -1; 503 } 504 505 public static boolean isEscapeStarter(char c) { 506 return c == '\\'; 507 } 508 509 public static boolean isQuote(char c) { 510 return c == '"'; 511 } 512 513 protected StringBuffer getTempBuffer() { 514 tempBuffer.delete(0, tempBuffer.length()); 515 return tempBuffer; 516 } 517 518 protected SOPair readQuotedString(String value, int startIndex, 519 int stopIndex, char terminatingChar, boolean requireQuotes, 520 boolean legalEndOfLine) throws IOException { 521 522 char quoteChar = '\0'; 523 StringBuffer out = getTempBuffer(); 524 int i = startIndex; 525 boolean useQuotes = false; 526 527 for (; i < stopIndex; i++) { 528 // burn through any leading whitespace 529 if (Character.isWhitespace(value.charAt(i))) 530 continue; 531 532 // if the first non-whitespace character is not a quote, 533 // proceed in non-quoted mode 534 else if (!isQuote(value.charAt(i))) { 535 if (requireQuotes) 536 throw new IOException( 537 "Expected start of quoted string. " + 538 line + " " + value+ " at linenr " + linenum); 539 useQuotes = false; 540 break; 541 } else { 542 useQuotes = true; 543 quoteChar = value.charAt(i); 544 i++; 545 break; 546 } 547 } 548 549 // look for a closing quote or final delimiter 550 for (; i < stopIndex; i++) { 551 if (isEscapeStarter(value.charAt(i))) { 552 i++; 553 if (i >= value.length()) 554 throw new IOException("Incomplete escape sequence. " + line); 555 out.append(value.charAt(i)); 556 } else if ((useQuotes && value.charAt(i) == quoteChar) 557 || (!useQuotes && value.charAt(i) == terminatingChar)) { 558 if (!useQuotes) 559 return new SOPair(out.toString().trim(), startIndex, i - 1); 560 else 561 return new SOPair(out.toString(), startIndex, i); 562 } else { 563 out.append(value.charAt(i)); 564 } 565 } 566 if (!useQuotes && legalEndOfLine) 567 return new SOPair(out.toString().trim(), startIndex, i); 568 else 569 throw new IOException("Unterminated quoted string. " +line); 570 } 571 572 protected int getNestedValue(NestedValue nv, String str, int startIndex) 573 throws IOException { 574 while (startIndex < str.length()) { 575 int equalsIndex = findUnescaped(str, '=', startIndex, str.length()); 576 if (equalsIndex == -1) 577 throw new IOException("Expected = in trailing modifier " +line); 578 String name = str.substring(startIndex, equalsIndex).trim(); 579 SOPair value = readQuotedString(str, equalsIndex + 1, str.length(), 580 ',', false, true); 581 582 Properties pv = new Properties(); 583 pv.setProperty(unescape(name),value.str); 584 585 586 nv.addPropertyValue(pv); 587 startIndex = value.endIndex + 1; 588 for (; startIndex < str.length(); startIndex++) { 589 if (Character.isWhitespace(str.charAt(startIndex))) 590 continue; 591 else if (str.charAt(startIndex) == ',') { 592 startIndex++; 593 break; 594 } else { 595 logger.error("found character |{}|", str.charAt(startIndex)); 596 throw new IOException("Expected comma in trailing modifier. " + 597 line + " linenr: " + linenum); 598 } 599 } 600 } 601 return str.length(); 602 } 603 604} 605 606class NestedValue { 607 608 protected Properties propertyValues = new Properties(); 609 protected String name; 610 protected String suggestedComment; 611 612 public NestedValue() { 613 } 614 615 @Override 616 public String toString(){ 617 String txt = "NestedValue: " ; 618 Set<Object> keys = propertyValues.keySet(); 619 Iterator<Object> iter = keys.iterator(); 620 while (iter.hasNext()){ 621 String key = iter.next().toString(); 622 String value = propertyValues.get(key).toString(); 623 txt += " [" + key + ":" + value + "]"; 624 } 625 626 627 return txt; 628 } 629 630 public String getName() { 631 return name; 632 } 633 634 public Properties getPropertyValues() { 635 return propertyValues; 636 } 637 638 public void addPropertyValue(Properties pv) { 639 Set<Object> keys = pv.keySet(); 640 Iterator<Object> iter = keys.iterator(); 641 while (iter.hasNext()){ 642 String key = iter.next().toString(); 643 String value = pv.get(key).toString(); 644 propertyValues.setProperty(key, value); 645 } 646 647 } 648 649 @Override 650 public Object clone() { 651 try { 652 return super.clone(); 653 } catch (CloneNotSupportedException ex) { 654 // this will never happen 655 return null; 656 } 657 } 658 659 public String getSuggestedComment() { 660 return suggestedComment; 661 } 662 663 public void setSuggestedComment(String suggestedComment) { 664 this.suggestedComment = suggestedComment; 665 } 666} 667 668