001/*
002 *                  BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 * 
020 * Created on Jan 18, 2008
021 * 
022 */
023
024package org.biojava.ontology.obo;
025
026import java.io.BufferedReader;
027import java.io.IOException;
028import java.text.SimpleDateFormat;
029import java.util.ArrayList;
030import java.util.HashMap;
031import java.util.Iterator;
032import java.util.List;
033import java.util.Locale;
034import java.util.Map;
035import java.util.Properties;
036import java.util.Set;
037import java.util.StringTokenizer;
038import java.util.Vector;
039
040import org.biojava.bio.seq.io.ParseException;
041import org.biojava.ontology.Synonym;
042
043
044/** A class to parse the content of an OBO file. It delegates handling of the 
045 * content to the OBOFileEventListener implementation.
046 * 
047 * This file contains parts of the OBO-Edit file OBOParseEngine, (particularly the encoding and decoding part)
048 * 
049 * http://geneontology.cvs.sourceforge.net/geneontology/go-dev/java/oboedit/sources/org/geneontology/oboedit/dataadapter/OBOParseEngine.java?revision=1.10&view=markup
050 * Thanks to the OboEdit developers for giving permission to release this in BioJava.
051 *  
052 * 
053 * @author Andreas Prlic
054 * @author John Day Richter
055 * @since 1.6
056 */
057public class OboFileParser {
058
059        List<OboFileEventListener> listeners;
060
061        protected String line;
062        protected int linenum = 0;
063        protected int totalSize = 0;
064        protected int bytesRead = 0;
065        protected StringBuffer tempBuffer = new StringBuffer();
066        protected SimpleDateFormat dateFormat = new SimpleDateFormat("dd:MM:yyyy HH:mm", Locale.US);
067
068
069        protected static final Map<Character, Character> escapeChars =
070                new HashMap<Character, Character>();
071
072        protected static final Map<Character, Character> unescapeChars = 
073                new HashMap<Character, Character>();
074
075        static {
076                escapeChars.put(new Character('n'), new Character('\n'));
077                escapeChars.put(new Character('W'), new Character(' '));
078                escapeChars.put(new Character('t'), new Character('\t'));
079                escapeChars.put(new Character(':'), new Character(':'));                        
080                escapeChars.put(new Character(','), new Character(','));
081                escapeChars.put(new Character('"'), new Character('"'));
082                escapeChars.put(new Character('\''), new Character('\''));              
083                escapeChars.put(new Character('\\'), new Character('\\'));
084                escapeChars.put(new Character('{'), new Character('{'));
085                escapeChars.put(new Character('}'), new Character('}'));
086                escapeChars.put(new Character('('), new Character('('));
087                escapeChars.put(new Character(')'), new Character(')'));
088                escapeChars.put(new Character('['), new Character('['));
089                escapeChars.put(new Character(']'), new Character(']'));
090                escapeChars.put(new Character('!'), new Character('!'));
091                Iterator <Character> it = escapeChars.keySet().iterator();
092                while (it.hasNext()) {
093                        Character key = it.next();
094                        Character value = escapeChars.get(key);
095                        unescapeChars.put(value, key);
096                }
097        }
098
099        public static class SOPair {
100                public String str = null;
101
102                public int index = -1;
103
104                public int endIndex = -1;
105
106                public SOPair(String str, int index) {
107                        this(str, index, -1);
108                }
109
110                public SOPair(String str, int index, int endIndex) {
111                        this.str = str;
112                        this.index = index;
113                        this.endIndex = endIndex;
114                }
115                
116                
117        }
118
119
120
121
122        public OboFileParser(){
123                listeners = new ArrayList<OboFileEventListener>();
124        }
125
126
127
128        public void addOboFileEventListener(OboFileEventListener listener){
129                listeners.add(listener);
130        }
131
132        public List<OboFileEventListener> getOboFileEventListener(){
133                return listeners;
134        }
135
136        /** parse an ontology file
137         * 
138         * @param oboFile
139         * @throws IOException
140         * @throws ParseException 
141         */
142        public void parseOBO(BufferedReader oboFile) throws IOException,ParseException{
143
144                String line;
145                String currentStanza;
146
147                while ((line = oboFile.readLine()) != null) {
148                        if (line.length() == 0)
149                                continue;
150
151                        if ( line.charAt(0) == '[') {
152                                if (line.charAt(line.length() - 1) != ']')
153                                        throw new ParseException("Unclosed stanza: \"" + line + "\"" );
154                                String stanzaname = line.substring(1, line.length() - 1);
155                                if (stanzaname.length() < 1)
156                                        throw new ParseException("Empty stanza: \"" +line+"\"");
157                                currentStanza = stanzaname;                             
158
159                                //System.out.println("stanza: " + currentStanza);
160                                triggerNewStanza(currentStanza);
161
162                        } else {
163                                // a content line
164                                SOPair pair;
165
166                                pair = unescape(line, ':', 0, true);
167
168                                //sSystem.out.println(pair);
169                                String name = pair.str;
170                                int lineEnd = findUnescaped(line, '!', 0, line.length(), true);
171                                if (lineEnd == -1)
172                                        lineEnd = line.length();
173
174                                // find nested values
175                                NestedValue nv = null;
176
177                                int trailingStartIndex = -1;
178                                int trailingEndIndex = -1;
179                                for (int i = lineEnd - 1; i >= 0; i--) {
180                                        if (Character.isWhitespace(line.charAt(i))) {
181                                                // keep going until we see non-whitespace
182                                        } else if (line.charAt(i) == '}') {
183                                                // if the first thing we see is a closing brace,
184                                                // we have a trailing modifier
185                                                if (i >= 1 && line.charAt(i - 1) == '\\')
186                                                        continue;
187                                                trailingEndIndex = i;
188                                                break;
189                                        } else
190                                                break;
191                                }
192
193                                if (trailingEndIndex != -1) {
194                                        for (int i = trailingEndIndex - 1; i >= 0; i--) {
195                                                if (line.charAt(i) == '{') {
196                                                        if (i >= 1 && line.charAt(i - 1) == '\\')
197                                                                continue;
198                                                        trailingStartIndex = i + 1;
199                                                }
200                                        }
201                                }
202
203                                int valueStopIndex;
204                                if (trailingStartIndex == -1 && trailingEndIndex != -1)
205                                        throw new ParseException("Unterminated trailing modifier. " + line);
206                                else if (trailingStartIndex != -1) {
207                                        valueStopIndex = trailingStartIndex - 1;
208                                        String trailing = line.substring(trailingStartIndex,
209                                                        trailingEndIndex).trim();
210                                        nv = new NestedValue();
211                                        getNestedValue(nv, trailing, 0);
212                                } else
213                                        valueStopIndex = lineEnd;
214
215                                String value = line.substring(pair.index + 1, valueStopIndex).trim();
216                                /*
217                                 * if (nv != null) System.err.println("nv = "+nv+", value =
218                                 * |"+value+"|");
219                                 */
220                                if (value.length() == 0)
221                                        throw new ParseException("Tag found with no value "+ line);
222
223                                if ( isSynonym(name)){
224                                        Synonym synonym = parseSynonym(name,value);
225                                        triggerNewSynonym(synonym);
226                                } else {
227                                        //System.out.println("new key:" + name + " " + value);
228                                        triggerNewKey(name,value);
229                                }
230                                //System.out.println("parsed key: " + name +" value: " + value + " nv: " + nv);
231
232
233
234                        }
235                }
236        }
237
238        private boolean isSynonym(String key){
239                if ( key.equals(OboFileHandler.SYNONYM) || key.equals(OboFileHandler.EXACT_SYNONYM))
240                        return true;
241                return false;
242        }
243
244        /** parse the Synonym String from the Term.
245         * value can be: 
246         * <pre>"ca_bind" RELATED [uniprot:curation]</pre>
247         * @param value
248         * @return the synonym text
249         */
250        private Synonym parseSynonym(String key, String value) throws ParseException{
251
252                //System.out.println("PARSE SYNONYM " + key +  " " + value);
253                int startIndex = findUnescaped(value, '"', 0, value.length());
254                if (startIndex == -1)
255                        throw new ParseException("Expected \"" +  line + " " + linenum);
256                SOPair p = unescape(value, '"', startIndex + 1, value.length(),
257                                true);
258                int defIndex = findUnescaped(value, '[', p.index, value.length());
259                if (defIndex == -1) {
260                        throw new ParseException("Badly formatted synonym. " 
261                                        + "No dbxref list found." + line + " " + linenum );
262                }
263                String leftovers = value.substring(p.index + 1, defIndex).trim();
264                StringTokenizer tokenizer = new StringTokenizer(leftovers, " \t");
265                int scope = Synonym.RELATED_SYNONYM;
266                
267                if ( key.equals(OboFileHandler.EXACT_SYNONYM))
268                        scope = Synonym.EXACT_SYNONYM;
269                else if ( key.equals(OboFileHandler.BROAD_SYNONYM))
270                        scope = Synonym.BROAD_SYNONYM;
271                else if ( key.equals(OboFileHandler.NARROW_SYNONYM))                    
272                        scope = Synonym.NARROW_SYNONYM;
273                
274                
275                String catID = null;
276                for (int i = 0; tokenizer.hasMoreTokens(); i++) {
277                        String token = tokenizer.nextToken();
278                        //System.out.println("TOKEN:" +token);
279                        if (i == 0) {
280                                if (token.equals("RELATED"))
281                                        scope = Synonym.RELATED_SYNONYM;
282                                else if (token.equals("UNSPECIFIED"))
283                                        scope = Synonym.RELATED_SYNONYM;
284                                else if (token.equals("EXACT"))
285                                        scope = Synonym.EXACT_SYNONYM;
286                                else if (token.equals("BROAD"))
287                                        scope = Synonym.BROAD_SYNONYM;
288                                else if (token.equals("NARROW"))
289                                        scope = Synonym.NARROW_SYNONYM;
290                                else
291                                        throw new ParseException("Found unexpected scope "
292                                                        + "identifier " + token + line);
293                        } else if (i == 1) {
294                                catID = token;
295                        } else
296                                throw new ParseException("Expected dbxref list,"
297                                                + " instead found " + token +   line );
298                }
299
300                Synonym synonym = new Synonym();
301                synonym.setScope(scope);
302                synonym.setCategory(catID);
303                synonym.setName(p.str);
304                //System.out.println("SYNONYM: " + p.str +" " + synonym.getCategory() + " " + synonym.getScope());
305
306                Map<String,Object>[] refs = getDbxrefList(value,defIndex + 1, value.length());
307                
308                // set the refs in the synonym
309                for (Map<String, Object> ref : refs){
310                        String xref = (String) ref.get("xref");
311                        String desc = (String) ref.get("desc");
312                        //System.out.println(xref + " " + desc);
313                        NestedValue nv = (NestedValue) ref.get("nv");
314                        //TODO: add implementation for this...
315                }
316                
317
318                return synonym;
319        }
320
321        protected Map<String,Object>[] getDbxrefList(String line, int startoffset, int endoffset) throws ParseException {
322                Vector<Map<String,Object>> temp = new Vector<Map<String,Object>>();
323                boolean stop = false;
324                while (!stop) {
325                        int braceIndex = findUnescaped(line, '{', startoffset, endoffset);
326                        int endIndex = findUnescaped(line, ',', startoffset, endoffset,
327                                        true);
328                        boolean trailing = false;
329                        if (endIndex == -1) {
330                                endIndex = findUnescaped(line, ']', startoffset, endoffset,
331                                                true);
332                                if (endIndex == -1) {
333                                        throw new ParseException("Unterminated xref list " + line);
334                                }
335                                stop = true;
336                        }
337                        if (braceIndex != -1 && braceIndex < endIndex) {
338                                endIndex = braceIndex;
339                                trailing = true;
340                        }
341
342                        Map<String, Object> pair = parseXref(line, 
343                                        startoffset,
344                                        endIndex);
345                        if (pair == null) {
346                                startoffset++;
347                                continue;
348                        }
349                        NestedValue nv = null;
350                        if (trailing) {
351                                nv = new NestedValue();
352                                endIndex = getNestedValue(nv, line, endIndex + 1);
353                                if (endIndex == -1) {
354                                        throw new ParseException("Badly formatted "
355                                                        + "trailing properties " + line);
356                                }
357                                pair.put("nv",nv);
358                        }
359
360                        temp.add(pair);
361                        startoffset = endIndex + 1;
362                }
363                Map<String,Object>[] out = new HashMap[temp.size()];
364                for (int i = 0; i < temp.size(); i++) {
365                        Map<String, Object> pair =  temp.get(i);
366                        out[i] = pair;
367                }
368                return out;
369        }
370
371        protected Map<String,Object> parseXref(String line, 
372                        int startoffset, int endoffset) throws ParseException {
373                String xref_str = null;
374                String desc_str = null;
375
376                SOPair xref = unescape(line, '"', startoffset, endoffset, false);
377                xref_str = xref.str.trim();
378                if (xref_str.length() == 0)
379                        return null;
380
381                if (xref.index != -1) {
382                        SOPair desc = unescape(line, '"', xref.index + 1, endoffset, true);
383                        desc_str = desc.str.trim();
384                }
385
386
387                Map<String, Object> m = new HashMap<String, Object>();
388                m.put("xref",xref_str);
389                m.put("desc",desc_str);
390                return m;
391        }
392
393
394
395        private void triggerNewStanza(String stanza){
396                Iterator<OboFileEventListener> iter = listeners.iterator();
397                while (iter.hasNext()){
398                        OboFileEventListener li = iter.next();
399                        li.newStanza(stanza);
400                }               
401        }
402
403        private void triggerNewKey(String key, String value){
404                Iterator<OboFileEventListener> iter = listeners.iterator();
405                while (iter.hasNext()){
406                        OboFileEventListener li = iter.next();
407                        li.newKey(key, value);
408                }
409        }
410
411        private void triggerNewSynonym(Synonym synonym){
412                Iterator<OboFileEventListener> iter = listeners.iterator();
413                while (iter.hasNext()){
414                        OboFileEventListener li = iter.next();
415                        li.newSynonym(synonym);
416                }
417        }
418
419        public static String escape(String str, boolean escapespaces) {
420                StringBuffer out = new StringBuffer();
421                for (int i = 0; i < str.length(); i++) {
422                        char c = str.charAt(i);
423                        Object o = unescapeChars.get(new Character(c));
424                        if (o == null)
425                                out.append(c);
426                        else {
427                                if (escapespaces || (!escapespaces && c != ' ' && c != '\t')) {
428                                        out.append("\\" + o);
429                                } else
430                                        out.append(c);
431                        }
432                }
433                return out.toString();
434        }
435
436        public String unescape(String str) throws ParseException {
437                return unescape(str, '\0', 0, str.length(), false).str;
438        }
439
440        public SOPair unescape(String str, char toChar, int startindex,
441                        boolean mustFindChar) throws ParseException {
442                return unescape(str, toChar, startindex, str.length(), mustFindChar);
443        }
444
445        public SOPair unescape(String str, char toChar, int startindex,
446                        int endindex, boolean mustFindChar) throws ParseException {
447                StringBuffer out = new StringBuffer();
448                int endValue = -1;
449                for (int i = startindex; i < endindex; i++) {
450                        char c = str.charAt(i);
451                        if (c == '\\') {
452                                i++;
453                                c = str.charAt(i);
454                                Character mapchar = escapeChars
455                                .get(new Character(c));
456                                if (mapchar == null)
457                                        throw new ParseException("Unrecognized escape"
458                                                        + " character " + c + " found.");
459                                out.append(mapchar);
460                        } else if (c == toChar) {
461                                endValue = i;
462                                break;
463                        } else {
464                                out.append(c);
465                        }
466                }
467                if (endValue == -1 && mustFindChar) {
468                        throw new ParseException("Expected " + toChar + "." + str);
469                }
470                return new SOPair(out.toString(), endValue);
471        }
472
473
474        public static int findUnescaped(String str, char toChar) {
475                return findUnescaped(str, toChar, 0, str.length());
476        }
477
478        public static int findUnescaped(String str, char toChar, int startIndex,
479                        int endIndex) {
480                return findUnescaped(str, toChar, startIndex, endIndex, false);
481        }
482
483        public static int findUnescaped(String str, char toChar, int startindex,
484                        int endindex, boolean honorQuotes) {
485                boolean inQuotes = false;
486                char quoteChar = '\0';
487                for (int i = startindex; i < endindex; i++) {
488                        char c = str.charAt(i);
489                        if (c == '\\') {
490                                i++;
491                                continue;
492                        } else if (inQuotes) {
493                                if (c == quoteChar) 
494                                        inQuotes = false;
495                                continue;
496                                
497                        } else if (c == toChar) {
498                                return i;
499                        } else if (honorQuotes && isQuote(c)) {
500                                inQuotes = true;
501                                quoteChar = c;
502                        }
503                }
504                return -1;
505        }
506
507        public static boolean isEscapeStarter(char c) {
508                return c == '\\';
509        }
510
511        public static boolean isQuote(char c) {
512                return c == '"';
513        }
514
515        protected StringBuffer getTempBuffer() {
516                tempBuffer.delete(0, tempBuffer.length());
517                return tempBuffer;
518        }
519
520        protected SOPair readQuotedString(String value, int startIndex,
521                        int stopIndex, char terminatingChar, boolean requireQuotes,
522                        boolean legalEndOfLine) throws ParseException {
523
524                char quoteChar = '\0';
525                StringBuffer out = getTempBuffer();
526                int i = startIndex;
527                boolean useQuotes = false;
528
529                for (; i < stopIndex; i++) {
530                        // burn through any leading whitespace
531                        if (Character.isWhitespace(value.charAt(i))) 
532                                continue;
533
534                        // if the first non-whitespace character is not a quote,
535                        // proceed in non-quoted mode                           
536                        else if (!isQuote(value.charAt(i))) {
537                                if (requireQuotes)
538                                        throw new ParseException(
539                                                        "Expected start of quoted string. " + 
540                                                        line + " " +  value+ " at linenr " + linenum);
541                                useQuotes = false;
542                                break;
543                        } else {
544                                useQuotes = true;
545                                quoteChar = value.charAt(i);
546                                i++;
547                                break;
548                        }
549                }
550
551                // look for a closing quote or final delimiter
552                for (; i < stopIndex; i++) {
553                        if (isEscapeStarter(value.charAt(i))) {
554                                i++;
555                                if (i >= value.length())
556                                        throw new ParseException("Incomplete escape sequence. " + line);
557                                out.append(value.charAt(i));
558                        } else if ((useQuotes && value.charAt(i) == quoteChar)
559                                        || (!useQuotes && value.charAt(i) == terminatingChar)) {
560                                if (!useQuotes)
561                                        return new SOPair(out.toString().trim(), startIndex, i - 1);
562                                else
563                                        return new SOPair(out.toString(), startIndex, i);
564                        } else {
565                                out.append(value.charAt(i));
566                        }
567                }
568                if (!useQuotes && legalEndOfLine)
569                        return new SOPair(out.toString().trim(), startIndex, i);
570                else
571                        throw new ParseException("Unterminated quoted string. " +line);
572        }
573
574        protected int getNestedValue(NestedValue nv, String str, int startIndex)
575        throws ParseException {
576                while (startIndex < str.length()) {
577                        int equalsIndex = findUnescaped(str, '=', startIndex, str.length());
578                        if (equalsIndex == -1)
579                                throw new ParseException("Expected = in trailing modifier " +line);
580                        String name = str.substring(startIndex, equalsIndex).trim();
581                        SOPair value = readQuotedString(str, equalsIndex + 1, str.length(),
582                                        ',', false, true);
583
584                        Properties pv = new Properties();
585                        pv.setProperty(unescape(name),value.str);
586
587
588                        nv.addPropertyValue(pv);
589                        startIndex = value.endIndex + 1;
590                        for (; startIndex < str.length(); startIndex++) {
591                                if (Character.isWhitespace(str.charAt(startIndex)))
592                                        continue;
593                                else if (str.charAt(startIndex) == ',') {
594                                        startIndex++;
595                                        break;
596                                } else {
597                                        System.err.println("found character |"
598                                                        + str.charAt(startIndex) + "|");
599                                        throw new ParseException("Expected comma in trailing modifier. " + 
600                                                        line + " linenr: " + linenum);
601                                }
602                        }
603                }
604                return str.length();
605        }
606
607}
608
/**
 * A bag of name/value properties parsed from a trailing modifier block
 * (<code>{name=value, ...}</code>), plus an optional name and suggested comment.
 */
class NestedValue implements Cloneable {

	// NOTE(review): this class does not implement Serializable, so the id is
	// currently unused; presumably left over from the code this was ported from.
	private static final long serialVersionUID = -7529450225162773796L;

	/** The collected properties; keys and values are stored as strings. */
	protected Properties propertyValues = new Properties();

	/** Optional name; nothing in this class ever assigns it. */
	protected String name;
	protected String suggestedComment;

	public NestedValue() {
	}

	/**
	 * Lists all properties as <code>[key:value]</code> pairs.
	 *
	 * @return a textual representation for debugging
	 */
	@Override
	public String toString() {
		StringBuilder txt = new StringBuilder("NestedValue: ");
		// iterate over the entries so that no second lookup by key is needed
		for (Map.Entry<Object, Object> entry : propertyValues.entrySet()) {
			txt.append(" [").append(entry.getKey()).append(":")
				.append(entry.getValue()).append("]");
		}
		return txt.toString();
	}

	public String getName() {
		return name;
	}

	/**
	 * Returns the live property table (not a copy).
	 *
	 * @return the properties collected so far
	 */
	public Properties getPropertyValues() {
		return propertyValues;
	}

	/**
	 * Copies all entries of {@code pv} into this value, converting keys and
	 * values to strings. Existing entries with the same key are overwritten.
	 *
	 * @param pv the properties to add
	 */
	public void addPropertyValue(Properties pv) {
		for (Map.Entry<Object, Object> entry : pv.entrySet()) {
			propertyValues.setProperty(entry.getKey().toString(),
					entry.getValue().toString());
		}
	}

	/**
	 * Creates a copy of this value. The property table is copied as well, so
	 * changes to the clone do not affect the original.
	 *
	 * @return the copy
	 */
	@Override
	public Object clone() {
		try {
			NestedValue copy = (NestedValue) super.clone();
			// Object.clone() is shallow; give the copy its own property table
			copy.propertyValues = (Properties) propertyValues.clone();
			return copy;
		} catch (CloneNotSupportedException ex) {
			// cannot happen: this class implements Cloneable
			throw new AssertionError(ex);
		}
	}

	public String getSuggestedComment() {
		return suggestedComment;
	}

	public void setSuggestedComment(String suggestedComment) {
		this.suggestedComment = suggestedComment;
	}
}
673
674