001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.nbio.ontology.io;
023
024import org.biojava.nbio.ontology.*;
025
026import java.io.BufferedReader;
027import java.io.IOException;
028import java.text.ParseException;
029import java.util.ArrayList;
030import java.util.List;
031import java.util.StringTokenizer;
032
033/**
034 * Simple parser for the Gene Ontology (GO) flatfile format.
035 *
036 * @author Thomas Down
037 * @since 1.4
038 */
039
040public class GOParser {
041        public Ontology parseGO(BufferedReader goFile,
042                                                        String ontoName,
043                                                        String ontoDescription,
044                                                        OntologyFactory factory)
045                throws ParseException, IOException
046        {
047                try {
048                        Ontology onto = factory.createOntology(ontoName, ontoDescription);
049                        Term isa = onto.importTerm(OntoTools.IS_A, null);
050                        Term partof = null; // fixme: onto.importTerm(OntoTools.PART_OF, null);
051                        List<Term> termStack = new ArrayList<Term>();
052                        String line;
053                        while ((line = goFile.readLine()) != null) {
054                                int leadSpaces = 0;
055                                while (line.charAt(leadSpaces) == ' ') {
056                                        ++leadSpaces;
057                                }
058                                line = line.trim();
059                                if (line.startsWith("!")) {
060                                        continue;
061                                }
062
063                                StringTokenizer toke = new StringTokenizer(line, "%<$", true);
064                                String parentRel = toke.nextToken();
065                                Term term = parseTerm(onto, toke.nextToken());
066                                if (parentRel.equals("%")) {
067                                        safeAddTriple(onto, term, termStack.get(leadSpaces - 1), isa);
068                                } else if (parentRel.equals("<")) {
069                                        safeAddTriple(onto, term, termStack.get(leadSpaces - 1), partof);
070                                }
071                                while (toke.hasMoreTokens()) {
072                                        String altRel = toke.nextToken();
073                                        Term altTerm = parseTerm(onto, toke.nextToken());
074                                        if (altRel.equals("%")) {
075                                                safeAddTriple(onto, term, altTerm, isa);
076                                        } else if (altRel.equals("<")) {
077                                                safeAddTriple(onto, term, altTerm, partof);
078                                        }
079                                }
080
081                                if (termStack.size() == leadSpaces) {
082                                        termStack.add(term);
083                                } else {
084                                        termStack.set(leadSpaces, term);
085                                }
086                        }
087                        return onto;
088                } catch (AlreadyExistsException ex) {
089                        throw new RuntimeException( "Duplication in ontology");
090                } catch (OntologyException ex) {
091                        throw new RuntimeException(ex);
092                }
093        }
094
095        private void safeAddTriple(Ontology onto, Term s, Term o, Term p)
096                throws AlreadyExistsException
097        {
098                if (!onto.containsTriple(s, o, p)) {
099                        onto.createTriple(s, o, p, null, null);
100                }
101        }
102
103        private Term parseTerm(Ontology onto, String s)
104                throws ParseException, AlreadyExistsException
105        {
106                int semi = s.indexOf(';');
107                int semi2 = s.indexOf(';', semi + 1);
108                if (semi < 0) {
109                        throw new RuntimeException("No semicolon in " + s);
110                }
111                String termDesc = s.substring(0, semi).trim();
112                String termName;
113                if (semi2 < 0) {
114                        termName = s.substring(semi + 1).trim();
115                } else {
116                        termName = s.substring(semi + 1, semi2).trim();
117                }
118                StringTokenizer toke = new StringTokenizer(termName, ", ");
119                termName = toke.nextToken();
120                if (onto.containsTerm(termName)) {
121                        return onto.getTerm(termName);
122                } else {
123                        Term t = onto.createTerm(termName, termDesc);
124                        if (toke.hasMoreTokens()) {
125                                List<String> secondaries = new ArrayList<String>();
126                                while (toke.hasMoreTokens()) {
127                                        secondaries.add(toke.nextToken());
128                                }
129                                t.getAnnotation().setProperty("go.secondary_ids", secondaries);
130                        }
131                        return t;
132                }
133        }
134}
135
136