001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.nbio.ontology.io; 023 024import org.biojava.nbio.ontology.*; 025 026import java.io.BufferedReader; 027import java.io.IOException; 028import java.text.ParseException; 029import java.util.ArrayList; 030import java.util.List; 031import java.util.StringTokenizer; 032 033/** 034 * Simple parser for the Gene Ontology (GO) flatfile format. 035 * 036 * @author Thomas Down 037 * @since 1.4 038 */ 039 040public class GOParser { 041 public Ontology parseGO(BufferedReader goFile, 042 String ontoName, 043 String ontoDescription, 044 OntologyFactory factory) 045 throws ParseException, IOException 046 { 047 try { 048 Ontology onto = factory.createOntology(ontoName, ontoDescription); 049 Term isa = onto.importTerm(OntoTools.IS_A, null); 050 Term partof = null; // fixme: onto.importTerm(OntoTools.PART_OF, null); 051 List<Term> termStack = new ArrayList<Term>(); 052 String line; 053 while ((line = goFile.readLine()) != null) { 054 int leadSpaces = 0; 055 while (line.charAt(leadSpaces) == ' ') { 056 ++leadSpaces; 057 } 058 line = line.trim(); 059 if (line.startsWith("!")) { 060 continue; 061 } 062 063 StringTokenizer toke = new StringTokenizer(line, "%<$", true); 064 String parentRel = toke.nextToken(); 065 Term term = parseTerm(onto, toke.nextToken()); 066 if (parentRel.equals("%")) { 067 safeAddTriple(onto, term, termStack.get(leadSpaces - 1), isa); 068 } else if (parentRel.equals("<")) { 069 safeAddTriple(onto, term, termStack.get(leadSpaces - 1), partof); 070 } 071 while (toke.hasMoreTokens()) { 072 String altRel = toke.nextToken(); 073 Term altTerm = parseTerm(onto, toke.nextToken()); 074 if (altRel.equals("%")) { 075 safeAddTriple(onto, term, altTerm, isa); 076 } else if (altRel.equals("<")) { 077 safeAddTriple(onto, term, altTerm, partof); 078 } 079 } 080 081 if (termStack.size() == leadSpaces) { 082 termStack.add(term); 083 } else { 084 termStack.set(leadSpaces, term); 085 } 086 } 087 return onto; 088 } catch (AlreadyExistsException ex) { 089 throw new RuntimeException( "Duplication in ontology"); 090 } catch (OntologyException ex) { 091 throw new RuntimeException(ex); 092 } 093 } 094 095 private void safeAddTriple(Ontology onto, Term s, Term o, Term p) 096 throws AlreadyExistsException 097 { 098 if (!onto.containsTriple(s, o, p)) { 099 onto.createTriple(s, o, p, null, null); 100 } 101 } 102 103 private Term parseTerm(Ontology onto, String s) 104 throws ParseException, AlreadyExistsException 105 { 106 int semi = s.indexOf(';'); 107 int semi2 = s.indexOf(';', semi + 1); 108 if (semi < 0) { 109 throw new RuntimeException("No semicolon in " + s); 110 } 111 String termDesc = s.substring(0, semi).trim(); 112 String termName; 113 if (semi2 < 0) { 114 termName = s.substring(semi + 1).trim(); 115 } else { 116 termName = s.substring(semi + 1, semi2).trim(); 117 } 118 StringTokenizer toke = new StringTokenizer(termName, ", "); 119 termName = toke.nextToken(); 120 if (onto.containsTerm(termName)) { 121 return onto.getTerm(termName); 122 } else { 123 Term t = onto.createTerm(termName, termDesc); 124 if (toke.hasMoreTokens()) { 125 List<String> secondaries = new ArrayList<String>(); 126 while (toke.hasMoreTokens()) { 127 secondaries.add(toke.nextToken()); 128 } 129 t.getAnnotation().setProperty("go.secondary_ids", secondaries); 130 } 131 return t; 132 } 133 } 134} 135 136