001/*
002 *                  BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on Jan 18, 2008
021 *
022 */
023
024package org.biojava.nbio.ontology.obo;
025
026import org.biojava.nbio.ontology.Synonym;
027import org.slf4j.Logger;
028import org.slf4j.LoggerFactory;
029
030import java.io.BufferedReader;
031import java.io.IOException;
032import java.text.SimpleDateFormat;
033import java.util.*;
034
035
036/** A class to parse the content of an OBO file. It delegates handling of the
037 * content to the OBOFileEventListener implementation.
038 *
039 * This file contains parts of the OBO-Edit file OBOParseEngine, (particularly the encoding and decoding part)
040 *
041 * http://geneontology.cvs.sourceforge.net/geneontology/go-dev/java/oboedit/sources/org/geneontology/oboedit/dataadapter/OBOParseEngine.java?revision=1.10&view=markup
042 * Thanks to the OboEdit developers for giving permission to release this in BioJava.
043 *
044 *
045 * @author Andreas Prlic
046 * @author John Day Richter
047 * @since 1.6
048 */
049public class OboFileParser {
050
051        private static final Logger logger = LoggerFactory.getLogger(OboFileParser.class);
052
053        List<OboFileEventListener> listeners;
054
055        protected String line;
056        protected int linenum = 0;
057        protected int totalSize = 0;
058        protected int bytesRead = 0;
059        protected StringBuffer tempBuffer = new StringBuffer();
060        protected SimpleDateFormat dateFormat = new SimpleDateFormat("dd:MM:yyyy HH:mm", Locale.US);
061
062
063        protected static final Map<Character, Character> escapeChars =
064                new HashMap<Character, Character>();
065
066        protected static final Map<Character, Character> unescapeChars =
067                new HashMap<Character, Character>();
068
069        static {
070                escapeChars.put(new Character('n'), new Character('\n'));
071                escapeChars.put(new Character('W'), new Character(' '));
072                escapeChars.put(new Character('t'), new Character('\t'));
073                escapeChars.put(new Character(':'), new Character(':'));
074                escapeChars.put(new Character(','), new Character(','));
075                escapeChars.put(new Character('"'), new Character('"'));
076                escapeChars.put(new Character('\''), new Character('\''));
077                escapeChars.put(new Character('\\'), new Character('\\'));
078                escapeChars.put(new Character('{'), new Character('{'));
079                escapeChars.put(new Character('}'), new Character('}'));
080                escapeChars.put(new Character('('), new Character('('));
081                escapeChars.put(new Character(')'), new Character(')'));
082                escapeChars.put(new Character('['), new Character('['));
083                escapeChars.put(new Character(']'), new Character(']'));
084                escapeChars.put(new Character('!'), new Character('!'));
085                Iterator <Character> it = escapeChars.keySet().iterator();
086                while (it.hasNext()) {
087                        Character key = it.next();
088                        Character value = escapeChars.get(key);
089                        unescapeChars.put(value, key);
090                }
091        }
092
093        public static class SOPair {
094                public String str = null;
095
096                public int index = -1;
097
098                public int endIndex = -1;
099
100                public SOPair(String str, int index) {
101                        this(str, index, -1);
102                }
103
104                public SOPair(String str, int index, int endIndex) {
105                        this.str = str;
106                        this.index = index;
107                        this.endIndex = endIndex;
108                }
109
110
111        }
112
113
114
115
116        public OboFileParser(){
117                listeners = new ArrayList<OboFileEventListener>();
118        }
119
120
121
122        public void addOboFileEventListener(OboFileEventListener listener){
123                listeners.add(listener);
124        }
125
126        public List<OboFileEventListener> getOboFileEventListener(){
127                return listeners;
128        }
129
130        /** parse an ontology file
131         *
132         * @param oboFile
133         * @throws IOException
134         * @throws IOException
135         */
136        public void parseOBO(BufferedReader oboFile) throws IOException{
137
138                String line;
139                String currentStanza;
140
141                while ((line = oboFile.readLine()) != null) {
142                        if (line.length() == 0)
143                                continue;
144
145                        if ( line.charAt(0) == '[') {
146                                if (line.charAt(line.length() - 1) != ']')
147                                        throw new IOException("Unclosed stanza: \"" + line + "\"" );
148                                String stanzaname = line.substring(1, line.length() - 1);
149                                if (stanzaname.length() < 1)
150                                        throw new IOException("Empty stanza: \"" +line+"\"");
151                                currentStanza = stanzaname;
152
153                                //logger.info("stanza: {}", currentStanza);
154                                triggerNewStanza(currentStanza);
155
156                        } else {
157                                // a content line
158                                SOPair pair;
159
160                                pair = unescape(line, ':', 0, true);
161
162                                //logger.info(pair);
163                                String name = pair.str;
164                                int lineEnd = findUnescaped(line, '!', 0, line.length(), true);
165                                if (lineEnd == -1)
166                                        lineEnd = line.length();
167
168                                // find nested values
169                                NestedValue nv = null;
170
171                                int trailingStartIndex = -1;
172                                int trailingEndIndex = -1;
173                                for (int i = lineEnd - 1; i >= 0; i--) {
174                                        if (Character.isWhitespace(line.charAt(i))) {
175                                                // keep going until we see non-whitespace
176                                        } else if (line.charAt(i) == '}') {
177                                                // if the first thing we see is a closing brace,
178                                                // we have a trailing modifier
179                                                if (i >= 1 && line.charAt(i - 1) == '\\')
180                                                        continue;
181                                                trailingEndIndex = i;
182                                                break;
183                                        } else
184                                                break;
185                                }
186
187                                if (trailingEndIndex != -1) {
188                                        for (int i = trailingEndIndex - 1; i >= 0; i--) {
189                                                if (line.charAt(i) == '{') {
190                                                        if (i >= 1 && line.charAt(i - 1) == '\\')
191                                                                continue;
192                                                        trailingStartIndex = i + 1;
193                                                }
194                                        }
195                                }
196
197                                int valueStopIndex;
198                                if (trailingStartIndex == -1 && trailingEndIndex != -1)
199                                        throw new IOException("Unterminated trailing modifier. " + line);
200                                else if (trailingStartIndex != -1) {
201                                        valueStopIndex = trailingStartIndex - 1;
202                                        String trailing = line.substring(trailingStartIndex,
203                                                        trailingEndIndex).trim();
204                                        nv = new NestedValue();
205                                        getNestedValue(nv, trailing, 0);
206                                } else
207                                        valueStopIndex = lineEnd;
208
209                                String value = line.substring(pair.index + 1, valueStopIndex).trim();
210                                /*
211                                 * if (nv != null) logger.warn("nv = "+nv+", value =
212                                 * |"+value+"|");
213                                 */
214                                if (value.length() == 0)
215                                        throw new IOException("Tag found with no value "+ line);
216
217                                if ( isSynonym(name)){
218                                        Synonym synonym = parseSynonym(name,value);
219                                        triggerNewSynonym(synonym);
220                                } else {
221                                        //logger.info("new key:" + name + " " + value);
222                                        triggerNewKey(name,value);
223                                }
224                                //logger.info("parsed key: " + name +" value: " + value + " nv: " + nv);
225
226
227
228                        }
229                }
230        }
231
232        private boolean isSynonym(String key){
233                if ( key.equals(OboFileHandler.SYNONYM) || key.equals(OboFileHandler.EXACT_SYNONYM))
234                        return true;
235                return false;
236        }
237
238        /** parse the Synonym String from the Term.
239         * value can be:
240         * <pre>"ca_bind" RELATED [uniprot:curation]</pre>
241         * @param value
242         * @return the synonym text
243         */
244        private Synonym parseSynonym(String key, String value) throws IOException{
245
246                //logger.info("PARSE SYNONYM " + key +  " " + value);
247                int startIndex = findUnescaped(value, '"', 0, value.length());
248                if (startIndex == -1)
249                        throw new IOException("Expected \"" +  line + " " + linenum);
250                SOPair p = unescape(value, '"', startIndex + 1, value.length(),
251                                true);
252                int defIndex = findUnescaped(value, '[', p.index, value.length());
253                if (defIndex == -1) {
254                        throw new IOException("Badly formatted synonym. "
255                                        + "No dbxref list found." + line + " " + linenum );
256                }
257                String leftovers = value.substring(p.index + 1, defIndex).trim();
258                StringTokenizer tokenizer = new StringTokenizer(leftovers, " \t");
259                int scope = Synonym.RELATED_SYNONYM;
260
261                if ( key.equals(OboFileHandler.EXACT_SYNONYM))
262                        scope = Synonym.EXACT_SYNONYM;
263                else if ( key.equals(OboFileHandler.BROAD_SYNONYM))
264                        scope = Synonym.BROAD_SYNONYM;
265                else if ( key.equals(OboFileHandler.NARROW_SYNONYM))
266                        scope = Synonym.NARROW_SYNONYM;
267
268
269                String catID = null;
270                for (int i = 0; tokenizer.hasMoreTokens(); i++) {
271                        String token = tokenizer.nextToken();
272                        //logger.info("TOKEN:" +token);
273                        if (i == 0) {
274                                if (token.equals("RELATED"))
275                                        scope = Synonym.RELATED_SYNONYM;
276                                else if (token.equals("UNSPECIFIED"))
277                                        scope = Synonym.RELATED_SYNONYM;
278                                else if (token.equals("EXACT"))
279                                        scope = Synonym.EXACT_SYNONYM;
280                                else if (token.equals("BROAD"))
281                                        scope = Synonym.BROAD_SYNONYM;
282                                else if (token.equals("NARROW"))
283                                        scope = Synonym.NARROW_SYNONYM;
284                                else
285                                        throw new IOException("Found unexpected scope "
286                                                        + "identifier " + token + line);
287                        } else if (i == 1) {
288                                catID = token;
289                        } else
290                                throw new IOException("Expected dbxref list,"
291                                                + " instead found " + token +   line );
292                }
293
294                Synonym synonym = new Synonym();
295                synonym.setScope(scope);
296                synonym.setCategory(catID);
297                synonym.setName(p.str);
298                //logger.info("SYNONYM: " + p.str +" " + synonym.getCategory() + " " + synonym.getScope());
299
300                Map<String,Object>[] refs = getDbxrefList(value,defIndex + 1, value.length());
301
302                // set the refs in the synonym
303                for (Map<String, Object> ref : refs){
304                        @SuppressWarnings("unused")
305                        String xref = (String) ref.get("xref");
306                        @SuppressWarnings("unused")
307                        String desc = (String) ref.get("desc");
308                        //logger.info(xref + " " + desc);
309                        @SuppressWarnings("unused")
310                        NestedValue nv = (NestedValue) ref.get("nv");
311                        //TODO: add implementation for this...
312                }
313
314
315                return synonym;
316        }
317
318        protected Map<String,Object>[] getDbxrefList(String line, int startoffset, int endoffset) throws IOException {
319                Vector<Map<String,Object>> temp = new Vector<Map<String,Object>>();
320                boolean stop = false;
321                while (!stop) {
322                        int braceIndex = findUnescaped(line, '{', startoffset, endoffset);
323                        int endIndex = findUnescaped(line, ',', startoffset, endoffset,
324                                        true);
325                        boolean trailing = false;
326                        if (endIndex == -1) {
327                                endIndex = findUnescaped(line, ']', startoffset, endoffset,
328                                                true);
329                                if (endIndex == -1) {
330                                        throw new IOException("Unterminated xref list " + line);
331                                }
332                                stop = true;
333                        }
334                        if (braceIndex != -1 && braceIndex < endIndex) {
335                                endIndex = braceIndex;
336                                trailing = true;
337                        }
338
339                        Map<String, Object> pair = parseXref(line,
340                                        startoffset,
341                                        endIndex);
342                        if (pair == null) {
343                                startoffset++;
344                                continue;
345                        }
346                        NestedValue nv = null;
347                        if (trailing) {
348                                nv = new NestedValue();
349                                endIndex = getNestedValue(nv, line, endIndex + 1);
350                                if (endIndex == -1) {
351                                        throw new IOException("Badly formatted "
352                                                        + "trailing properties " + line);
353                                }
354                                pair.put("nv",nv);
355                        }
356
357                        temp.add(pair);
358                        startoffset = endIndex + 1;
359                }
360                Map<String,Object>[] out = new HashMap[temp.size()];
361                for (int i = 0; i < temp.size(); i++) {
362                        Map<String, Object> pair =  temp.get(i);
363                        out[i] = pair;
364                }
365                return out;
366        }
367
368        protected Map<String,Object> parseXref(String line,
369                        int startoffset, int endoffset) throws IOException {
370                String xref_str = null;
371                String desc_str = null;
372
373                SOPair xref = unescape(line, '"', startoffset, endoffset, false);
374                xref_str = xref.str.trim();
375                if (xref_str.length() == 0)
376                        return null;
377
378                if (xref.index != -1) {
379                        SOPair desc = unescape(line, '"', xref.index + 1, endoffset, true);
380                        desc_str = desc.str.trim();
381                }
382
383
384                Map<String, Object> m = new HashMap<String, Object>();
385                m.put("xref",xref_str);
386                m.put("desc",desc_str);
387                return m;
388        }
389
390
391
392        private void triggerNewStanza(String stanza){
393                Iterator<OboFileEventListener> iter = listeners.iterator();
394                while (iter.hasNext()){
395                        OboFileEventListener li = iter.next();
396                        li.newStanza(stanza);
397                }
398        }
399
400        private void triggerNewKey(String key, String value){
401                Iterator<OboFileEventListener> iter = listeners.iterator();
402                while (iter.hasNext()){
403                        OboFileEventListener li = iter.next();
404                        li.newKey(key, value);
405                }
406        }
407
408        private void triggerNewSynonym(Synonym synonym){
409                Iterator<OboFileEventListener> iter = listeners.iterator();
410                while (iter.hasNext()){
411                        OboFileEventListener li = iter.next();
412                        li.newSynonym(synonym);
413                }
414        }
415
416        public static String escape(String str, boolean escapespaces) {
417                StringBuffer out = new StringBuffer();
418                for (int i = 0; i < str.length(); i++) {
419                        char c = str.charAt(i);
420                        Object o = unescapeChars.get(new Character(c));
421                        if (o == null)
422                                out.append(c);
423                        else {
424                                if (escapespaces || (!escapespaces && c != ' ' && c != '\t')) {
425                                        out.append("\\").append(o);
426                                } else
427                                        out.append(c);
428                        }
429                }
430                return out.toString();
431        }
432
433        public String unescape(String str) throws IOException {
434                return unescape(str, '\0', 0, str.length(), false).str;
435        }
436
437        public SOPair unescape(String str, char toChar, int startindex,
438                        boolean mustFindChar) throws IOException {
439                return unescape(str, toChar, startindex, str.length(), mustFindChar);
440        }
441
442        public SOPair unescape(String str, char toChar, int startindex,
443                        int endindex, boolean mustFindChar) throws IOException {
444                StringBuffer out = new StringBuffer();
445                int endValue = -1;
446                for (int i = startindex; i < endindex; i++) {
447                        char c = str.charAt(i);
448                        if (c == '\\') {
449                                i++;
450                                c = str.charAt(i);
451                                Character mapchar = escapeChars
452                                .get(new Character(c));
453                                if (mapchar == null)
454                                        throw new IOException("Unrecognized escape"
455                                                        + " character " + c + " found.");
456                                out.append(mapchar);
457                        } else if (c == toChar) {
458                                endValue = i;
459                                break;
460                        } else {
461                                out.append(c);
462                        }
463                }
464                if (endValue == -1 && mustFindChar) {
465                        throw new IOException("Expected " + toChar + "." + str);
466                }
467                return new SOPair(out.toString(), endValue);
468        }
469
470
471        public static int findUnescaped(String str, char toChar) {
472                return findUnescaped(str, toChar, 0, str.length());
473        }
474
475        public static int findUnescaped(String str, char toChar, int startIndex,
476                        int endIndex) {
477                return findUnescaped(str, toChar, startIndex, endIndex, false);
478        }
479
480        public static int findUnescaped(String str, char toChar, int startindex,
481                        int endindex, boolean honorQuotes) {
482                boolean inQuotes = false;
483                char quoteChar = '\0';
484                for (int i = startindex; i < endindex; i++) {
485                        char c = str.charAt(i);
486                        if (c == '\\') {
487                                i++;
488                                continue;
489                        } else if (inQuotes) {
490                                if (c == quoteChar)
491                                        inQuotes = false;
492                                continue;
493
494                        } else if (c == toChar) {
495                                return i;
496                        } else if (honorQuotes && isQuote(c)) {
497                                inQuotes = true;
498                                quoteChar = c;
499                        }
500                }
501                return -1;
502        }
503
504        public static boolean isEscapeStarter(char c) {
505                return c == '\\';
506        }
507
508        public static boolean isQuote(char c) {
509                return c == '"';
510        }
511
512        protected StringBuffer getTempBuffer() {
513                tempBuffer.delete(0, tempBuffer.length());
514                return tempBuffer;
515        }
516
517        protected SOPair readQuotedString(String value, int startIndex,
518                        int stopIndex, char terminatingChar, boolean requireQuotes,
519                        boolean legalEndOfLine) throws IOException {
520
521                char quoteChar = '\0';
522                StringBuffer out = getTempBuffer();
523                int i = startIndex;
524                boolean useQuotes = false;
525
526                for (; i < stopIndex; i++) {
527                        // burn through any leading whitespace
528                        if (Character.isWhitespace(value.charAt(i)))
529                                continue;
530
531                        // if the first non-whitespace character is not a quote,
532                        // proceed in non-quoted mode
533                        else if (!isQuote(value.charAt(i))) {
534                                if (requireQuotes)
535                                        throw new IOException(
536                                                        "Expected start of quoted string. " +
537                                                        line + " " +  value+ " at linenr " + linenum);
538                                useQuotes = false;
539                                break;
540                        } else {
541                                useQuotes = true;
542                                quoteChar = value.charAt(i);
543                                i++;
544                                break;
545                        }
546                }
547
548                // look for a closing quote or final delimiter
549                for (; i < stopIndex; i++) {
550                        if (isEscapeStarter(value.charAt(i))) {
551                                i++;
552                                if (i >= value.length())
553                                        throw new IOException("Incomplete escape sequence. " + line);
554                                out.append(value.charAt(i));
555                        } else if ((useQuotes && value.charAt(i) == quoteChar)
556                                        || (!useQuotes && value.charAt(i) == terminatingChar)) {
557                                if (!useQuotes)
558                                        return new SOPair(out.toString().trim(), startIndex, i - 1);
559                                else
560                                        return new SOPair(out.toString(), startIndex, i);
561                        } else {
562                                out.append(value.charAt(i));
563                        }
564                }
565                if (!useQuotes && legalEndOfLine)
566                        return new SOPair(out.toString().trim(), startIndex, i);
567                else
568                        throw new IOException("Unterminated quoted string. " +line);
569        }
570
571        protected int getNestedValue(NestedValue nv, String str, int startIndex)
572        throws IOException {
573                while (startIndex < str.length()) {
574                        int equalsIndex = findUnescaped(str, '=', startIndex, str.length());
575                        if (equalsIndex == -1)
576                                throw new IOException("Expected = in trailing modifier " +line);
577                        String name = str.substring(startIndex, equalsIndex).trim();
578                        SOPair value = readQuotedString(str, equalsIndex + 1, str.length(),
579                                        ',', false, true);
580
581                        Properties pv = new Properties();
582                        pv.setProperty(unescape(name),value.str);
583
584
585                        nv.addPropertyValue(pv);
586                        startIndex = value.endIndex + 1;
587                        for (; startIndex < str.length(); startIndex++) {
588                                if (Character.isWhitespace(str.charAt(startIndex)))
589                                        continue;
590                                else if (str.charAt(startIndex) == ',') {
591                                        startIndex++;
592                                        break;
593                                } else {
594                                        logger.error("found character |{}|", str.charAt(startIndex));
595                                        throw new IOException("Expected comma in trailing modifier. " +
596                                                        line + " linenr: " + linenum);
597                                }
598                        }
599                }
600                return str.length();
601        }
602
603}
604
/**
 * Holds the name/value properties of a trailing modifier, i.e. the
 * {@code {name=value, ...}} part of an OBO line.
 *
 * The class implements {@link Cloneable} so that {@link #clone()} returns a
 * real copy; without it {@code Object.clone()} always fails.
 */
class NestedValue implements Cloneable {

        protected Properties propertyValues = new Properties();
        /** Never assigned by the code in this class. */
        protected String name;
        protected String suggestedComment;

        public NestedValue() {
        }

        /** Returns "NestedValue: " followed by one {@code  [key:value]} group per property. */
        @Override
        public String toString(){
                StringBuilder txt = new StringBuilder("NestedValue: ");
                for (Map.Entry<Object, Object> entry : propertyValues.entrySet()) {
                        txt.append(" [").append(entry.getKey())
                           .append(":").append(entry.getValue()).append("]");
                }
                return txt.toString();
        }

        public String getName() {
                return name;
        }

        /** Returns the live property table (not a copy).
         *
         * @return the properties collected so far
         */
        public Properties getPropertyValues() {
                return propertyValues;
        }

        /** Copies all entries of the given properties into this nested value.
         * Keys and values are stored by their string representation; an already
         * existing key is overwritten.
         *
         * @param pv the properties to add
         */
        public void addPropertyValue(Properties pv) {
                for (Map.Entry<Object, Object> entry : pv.entrySet()) {
                        propertyValues.setProperty(entry.getKey().toString(),
                                        entry.getValue().toString());
                }
        }

        /** Creates a copy with its own property table, so that changing the
         * properties of the copy does not affect this instance.
         *
         * @return the copy, never {@code null}
         */
        @Override
        public Object clone() {
                try {
                        NestedValue copy = (NestedValue) super.clone();
                        // super.clone() is shallow: give the copy its own table
                        copy.propertyValues = (Properties) propertyValues.clone();
                        return copy;
                } catch (CloneNotSupportedException ex) {
                        // cannot happen, the class implements Cloneable
                        throw new AssertionError(ex);
                }
        }

        public String getSuggestedComment() {
                return suggestedComment;
        }

        public void setSuggestedComment(String suggestedComment) {
                this.suggestedComment = suggestedComment;
        }
}
666
667