001package org.biojava.ontology.io; 002 003import java.io.BufferedReader; 004import java.io.IOException; 005import java.util.StringTokenizer; 006 007import org.biojava.bio.BioError; 008import org.biojava.ontology.AlreadyExistsException; 009import org.biojava.ontology.Ontology; 010import org.biojava.ontology.OntologyException; 011import org.biojava.ontology.OntologyFactory; 012import org.biojava.ontology.Term; 013import org.biojava.ontology.Triple; 014import org.biojava.utils.ChangeVetoException; 015 016/** 017 * Parse tab-delimited ontology files into Ontology objects. 018 * 019 * <p> 020 * The tab-delimited ontology files have three types of lines. Lines that are 021 * pure white space can be discarded. Comment lines begin with a hash (#) and 022 * can be discarded. The payload lines contain three fields seperated by tabs. 023 * These are <code>subject</code>, <code>predicate</code> and 024 * <code>object</code>. 025 * By convention, the content of each field contains no spaces. 026 * </p> 027 * 028 * <p> 029 * By convention, if there are comment lines beginning with <code>name:</code> 030 * or <code>description:</code> and these appear before any predicate 031 * declarations then they become the name and description of the ontology. 032 * Otherwise, the name and description will be the empty string. 033 * </p> 034 * 035 * <p> 036 * Term names normally will be just a term name like <code>predicate</code> or 037 * <code>person</code>. There are also terms that represent collections of 038 * triples. For example, here is the declaration for the 'triple' type in 039 * the core ontology. 040 * </p> 041 * 042 * <code><pre> 043 * ... 044 * triple is-a any 045 * triple has-a source 046 * triple has-a target 047 * triple has-a predicate 048 * (triple,has-a,any) size 3 049 * ... 050 * </pre></code> 051 * 052 * <p> 053 * The first four lines just associate triple with some type with a predicate 054 * (e.g. is-a or has-a). The fifth line says that something must have a size of 055 * three. The 'something' is <code>(triple,has-a,any) size 3</code> and is 056 * short-hand for a collection of triples that state that the source must be 057 * <code>triple</code>, the target must be <code>any</code> and the predicate 058 * must be <code>has-a</code>. This whole expression states that a triple 059 * has exactly three has-a relationships; that is, exactly three properties. 060 * </p> 061 * 062 * @author Matthew Pocock 063 */ 064public class TabDelimParser { 065 /** 066 * Parse an ontology from a reader. 067 * The reader will be emptied of text. It is the caller's responsibility to 068 * close the reader. 069 * 070 * @param in the BufferedReader to read from 071 * @param of an OntologyFactory used to create the Ontology instance 072 * @return a new Ontology 073 * @throws IOException if there is some problem with the buffered reader 074 * @throws OntologyException if it was not possible to instantiate a new 075 * ontology 076 */ 077 public Ontology parse(BufferedReader in, OntologyFactory of) 078 throws IOException, OntologyException { 079 String name = ""; 080 String description = ""; 081 Ontology onto = null; 082 083 for( 084 String line = in.readLine(); 085 line != null; 086 line = in.readLine() 087 ) { 088 line = line.trim(); 089 if(line.length() > 0) { 090 if(line.startsWith("#")) { 091 // comment line - let's try to pull out name or description 092 093 if(line.startsWith("#name:")) { 094 name = line.substring("#name:".length()).trim(); 095 } else if(line.startsWith("#description:")) { 096 description = line.substring("#description:".length()).trim(); 097 } 098 } else { 099 try { 100 // make sure we have an ontology 101 if(onto == null) { 102 onto = of.createOntology(name, description); 103 } 104 105 // build a tripple 106 107 /* 108 109 int t1 = line.indexOf("\t"); 110 int t2 = line.indexOf("\t", t1 + 1); 111 112 String subject = line.substring(0, t1); 113 String predicate = line.substring(t1 + 1, t2); 114 String object = line.substring(t2 + 1); 115 116 */ 117 118 StringTokenizer toke = new StringTokenizer(line); 119 String subject = toke.nextToken(); 120 String predicate = toke.nextToken(); 121 String object = toke.nextToken(); 122 123 Term subT = resolveTerm(subject, onto); 124 Term objT = resolveTerm(object, onto); 125 Term relT = resolveTerm(predicate, onto); 126 127 Triple trip = resolveTriple(subT, objT, relT, onto); 128 trip = trip==null?null:trip; // prevent unused field error 129 } catch (StringIndexOutOfBoundsException e) { 130 throw new IOException("Could not parse line: " + line); 131 } 132 } 133 } 134 } 135 136 return onto; 137 } 138 139 private Term resolveTerm(String termName, Ontology onto) { 140 boolean isTrippleTerm = termName.startsWith("(") && termName.endsWith(")"); 141 142 if(onto.containsTerm(termName)) { 143 return onto.getTerm(termName); 144 } else { 145 try { 146 if(isTrippleTerm) { 147 int c1 = termName.indexOf(","); 148 int c2 = termName.indexOf(",", c1 + 1); 149 150 String source = termName.substring(1, c1); 151 String target = termName.substring(c2 + 1, termName.length() - 1); 152 String predicate = termName.substring(c1 + 1, c2); 153 154 Term st = resolveTerm(source, onto); 155 Term tt = resolveTerm(target, onto); 156 Term rt = resolveTerm(predicate, onto); 157 158 return onto.createTriple(st, tt, rt, null, null); 159 } else { 160 return onto.createTerm(termName, ""); 161 } 162 } catch (AlreadyExistsException aee) { 163 throw new BioError("Assertion Failure: Could not create term", aee); 164 } catch (ChangeVetoException cve) { 165 throw new BioError("Assertion Failure: Could not create term", cve); 166 } 167 } 168 } 169 170 private Triple resolveTriple(Term sub, Term obj, Term rel, Ontology onto) { 171 if(onto.containsTriple(sub, obj, rel)) { 172 return (Triple) onto.getTriples(sub, obj, rel).iterator().next(); 173 } else { 174 try { 175 return onto.createTriple(sub, obj, rel, null, null); 176 } catch (AlreadyExistsException aee) { 177 throw new BioError("Assertion Failure: Could not create triple",aee); 178 } catch (ChangeVetoException cve) { 179 throw new BioError("Assertion Failure: Could not create triple", cve); 180 } 181 } 182 } 183}