001/*
002 *                  BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on Jan 18, 2008
021 *
022 */
023
024package org.biojava.nbio.ontology.obo;
025
026import org.biojava.nbio.ontology.Synonym;
027import org.slf4j.Logger;
028import org.slf4j.LoggerFactory;
029
030import java.io.BufferedReader;
031import java.io.IOException;
032import java.text.SimpleDateFormat;
033import java.util.*;
034
035
036/**
037 * A class to parse the content of an OBO file. It delegates handling of the
038 * content to the OBOFileEventListener implementation.
039 *
040 * This file contains parts of the OBO-Edit file OBOParseEngine, (particularly the encoding and decoding part)
041 *
042 * See <a href="http://geneontology.cvs.sourceforge.net/geneontology/go-dev/java/oboedit/sources/org/geneontology/oboedit/dataadapter/OBOParseEngine.java?revision=1.10&view=markup">link</a>
043 * Thanks to the OboEdit developers for giving permission to release this in BioJava.
044 *
045 *
046 * @author Andreas Prlic
047 * @author John Day Richter
048 * @since 1.6
049 */
050public class OboFileParser {
051
052        private static final Logger logger = LoggerFactory.getLogger(OboFileParser.class);
053
054        List<OboFileEventListener> listeners;
055
056        protected String line;
057        protected int linenum = 0;
058        protected int totalSize = 0;
059        protected int bytesRead = 0;
060        protected StringBuffer tempBuffer = new StringBuffer();
061        protected SimpleDateFormat dateFormat = new SimpleDateFormat("dd:MM:yyyy HH:mm", Locale.US);
062
063
064        protected static final Map<Character, Character> escapeChars =
065                new HashMap<>();
066
067        protected static final Map<Character, Character> unescapeChars =
068                new HashMap<>();
069
070        static {
071                escapeChars.put('n', '\n');
072                escapeChars.put('W', ' ');
073                escapeChars.put('t', '\t');
074                escapeChars.put(':', ':');
075                escapeChars.put(',', ',');
076                escapeChars.put('"', '"');
077                escapeChars.put('\'', '\'');
078                escapeChars.put('\\', '\\');
079                escapeChars.put('{', '{');
080                escapeChars.put('}', '}');
081                escapeChars.put('(', '(');
082                escapeChars.put(')', ')');
083                escapeChars.put('[', '[');
084                escapeChars.put(']', ']');
085                escapeChars.put('!', '!');
086                Iterator <Character> it = escapeChars.keySet().iterator();
087                while (it.hasNext()) {
088                        Character key = it.next();
089                        Character value = escapeChars.get(key);
090                        unescapeChars.put(value, key);
091                }
092        }
093
094        public static class SOPair {
095                public String str = null;
096
097                public int index = -1;
098
099                public int endIndex = -1;
100
101                public SOPair(String str, int index) {
102                        this(str, index, -1);
103                }
104
105                public SOPair(String str, int index, int endIndex) {
106                        this.str = str;
107                        this.index = index;
108                        this.endIndex = endIndex;
109                }
110
111
112        }
113
114
115
116
117        public OboFileParser(){
118                listeners = new ArrayList<>();
119        }
120
121
122
123        public void addOboFileEventListener(OboFileEventListener listener){
124                listeners.add(listener);
125        }
126
127        public List<OboFileEventListener> getOboFileEventListener(){
128                return listeners;
129        }
130
131        /** parse an ontology file
132         *
133         * @param oboFile
134         * @throws IOException
135         * @throws IOException
136         */
137        public void parseOBO(BufferedReader oboFile) throws IOException{
138
139                String line;
140                String currentStanza;
141
142                while ((line = oboFile.readLine()) != null) {
143                        if (line.length() == 0)
144                                continue;
145
146                        if ( line.charAt(0) == '[') {
147                                if (line.charAt(line.length() - 1) != ']')
148                                        throw new IOException("Unclosed stanza: \"" + line + "\"" );
149                                String stanzaname = line.substring(1, line.length() - 1);
150                                if (stanzaname.length() < 1)
151                                        throw new IOException("Empty stanza: \"" +line+"\"");
152                                currentStanza = stanzaname;
153
154                                //logger.info("stanza: {}", currentStanza);
155                                triggerNewStanza(currentStanza);
156
157                        } else {
158                                // a content line
159                                SOPair pair;
160
161                                pair = unescape(line, ':', 0, true);
162
163                                //logger.info(pair);
164                                String name = pair.str;
165                                int lineEnd = findUnescaped(line, '!', 0, line.length(), true);
166                                if (lineEnd == -1)
167                                        lineEnd = line.length();
168
169                                // find nested values
170                                NestedValue nv = null;
171
172                                int trailingStartIndex = -1;
173                                int trailingEndIndex = -1;
174                                for (int i = lineEnd - 1; i >= 0; i--) {
175                                        if (Character.isWhitespace(line.charAt(i))) {
176                                                // keep going until we see non-whitespace
177                                        } else if (line.charAt(i) == '}') {
178                                                // if the first thing we see is a closing brace,
179                                                // we have a trailing modifier
180                                                if (i >= 1 && line.charAt(i - 1) == '\\')
181                                                        continue;
182                                                trailingEndIndex = i;
183                                                break;
184                                        } else
185                                                break;
186                                }
187
188                                if (trailingEndIndex != -1) {
189                                        for (int i = trailingEndIndex - 1; i >= 0; i--) {
190                                                if (line.charAt(i) == '{') {
191                                                        if (i >= 1 && line.charAt(i - 1) == '\\')
192                                                                continue;
193                                                        trailingStartIndex = i + 1;
194                                                }
195                                        }
196                                }
197
198                                int valueStopIndex;
199                                if (trailingStartIndex == -1 && trailingEndIndex != -1)
200                                        throw new IOException("Unterminated trailing modifier. " + line);
201                                else if (trailingStartIndex != -1) {
202                                        valueStopIndex = trailingStartIndex - 1;
203                                        String trailing = line.substring(trailingStartIndex,
204                                                        trailingEndIndex).trim();
205                                        nv = new NestedValue();
206                                        getNestedValue(nv, trailing, 0);
207                                } else
208                                        valueStopIndex = lineEnd;
209
210                                String value = line.substring(pair.index + 1, valueStopIndex).trim();
211                                /*
212                                 * if (nv != null) logger.warn("nv = "+nv+", value =
213                                 * |"+value+"|");
214                                 */
215                                if (value.length() == 0)
216                                        throw new IOException("Tag found with no value "+ line);
217
218                                if ( isSynonym(name)){
219                                        Synonym synonym = parseSynonym(name,value);
220                                        triggerNewSynonym(synonym);
221                                } else {
222                                        //logger.info("new key:" + name + " " + value);
223                                        triggerNewKey(name,value);
224                                }
225                                //logger.info("parsed key: " + name +" value: " + value + " nv: " + nv);
226
227
228
229                        }
230                }
231        }
232
233        private boolean isSynonym(String key){
234                if ( key.equals(OboFileHandler.SYNONYM) || key.equals(OboFileHandler.EXACT_SYNONYM))
235                        return true;
236                return false;
237        }
238
239        /** parse the Synonym String from the Term.
240         * value can be:
241         * <pre>"ca_bind" RELATED [uniprot:curation]</pre>
242         * @param value
243         * @return the synonym text
244         */
245        private Synonym parseSynonym(String key, String value) throws IOException{
246
247                //logger.info("PARSE SYNONYM " + key +  " " + value);
248                int startIndex = findUnescaped(value, '"', 0, value.length());
249                if (startIndex == -1)
250                        throw new IOException("Expected \"" +  line + " " + linenum);
251                SOPair p = unescape(value, '"', startIndex + 1, value.length(),
252                                true);
253                int defIndex = findUnescaped(value, '[', p.index, value.length());
254                if (defIndex == -1) {
255                        throw new IOException("Badly formatted synonym. "
256                                        + "No dbxref list found." + line + " " + linenum );
257                }
258                String leftovers = value.substring(p.index + 1, defIndex).trim();
259                StringTokenizer tokenizer = new StringTokenizer(leftovers, " \t");
260                int scope = Synonym.RELATED_SYNONYM;
261
262                if ( key.equals(OboFileHandler.EXACT_SYNONYM))
263                        scope = Synonym.EXACT_SYNONYM;
264                else if ( key.equals(OboFileHandler.BROAD_SYNONYM))
265                        scope = Synonym.BROAD_SYNONYM;
266                else if ( key.equals(OboFileHandler.NARROW_SYNONYM))
267                        scope = Synonym.NARROW_SYNONYM;
268
269
270                String catID = null;
271                for (int i = 0; tokenizer.hasMoreTokens(); i++) {
272                        String token = tokenizer.nextToken();
273                        //logger.info("TOKEN:" +token);
274                        if (i == 0) {
275                                if ("RELATED".equals(token))
276                                        scope = Synonym.RELATED_SYNONYM;
277                                else if ("UNSPECIFIED".equals(token))
278                                        scope = Synonym.RELATED_SYNONYM;
279                                else if ("EXACT".equals(token))
280                                        scope = Synonym.EXACT_SYNONYM;
281                                else if ("BROAD".equals(token))
282                                        scope = Synonym.BROAD_SYNONYM;
283                                else if ("NARROW".equals(token))
284                                        scope = Synonym.NARROW_SYNONYM;
285                                else
286                                        throw new IOException("Found unexpected scope "
287                                                        + "identifier " + token + line);
288                        } else if (i == 1) {
289                                catID = token;
290                        } else
291                                throw new IOException("Expected dbxref list,"
292                                                + " instead found " + token +   line );
293                }
294
295                Synonym synonym = new Synonym();
296                synonym.setScope(scope);
297                synonym.setCategory(catID);
298                synonym.setName(p.str);
299                //logger.info("SYNONYM: " + p.str +" " + synonym.getCategory() + " " + synonym.getScope());
300
301                Map<String,Object>[] refs = getDbxrefList(value,defIndex + 1, value.length());
302
303                // set the refs in the synonym
304                for (Map<String, Object> ref : refs){
305                        @SuppressWarnings("unused")
306                        String xref = (String) ref.get("xref");
307                        @SuppressWarnings("unused")
308                        String desc = (String) ref.get("desc");
309                        //logger.info(xref + " " + desc);
310                        @SuppressWarnings("unused")
311                        NestedValue nv = (NestedValue) ref.get("nv");
312                        //TODO: add implementation for this...
313                }
314
315
316                return synonym;
317        }
318
319        protected Map<String,Object>[] getDbxrefList(String line, int startoffset, int endoffset) throws IOException {
320                Vector<Map<String,Object>> temp = new Vector<>();
321                boolean stop = false;
322                while (!stop) {
323                        int braceIndex = findUnescaped(line, '{', startoffset, endoffset);
324                        int endIndex = findUnescaped(line, ',', startoffset, endoffset,
325                                        true);
326                        boolean trailing = false;
327                        if (endIndex == -1) {
328                                endIndex = findUnescaped(line, ']', startoffset, endoffset,
329                                                true);
330                                if (endIndex == -1) {
331                                        throw new IOException("Unterminated xref list " + line);
332                                }
333                                stop = true;
334                        }
335                        if (braceIndex != -1 && braceIndex < endIndex) {
336                                endIndex = braceIndex;
337                                trailing = true;
338                        }
339
340                        Map<String, Object> pair = parseXref(line,
341                                        startoffset,
342                                        endIndex);
343                        if (pair == null) {
344                                startoffset++;
345                                continue;
346                        }
347                        NestedValue nv = null;
348                        if (trailing) {
349                                nv = new NestedValue();
350                                endIndex = getNestedValue(nv, line, endIndex + 1);
351                                if (endIndex == -1) {
352                                        throw new IOException("Badly formatted "
353                                                        + "trailing properties " + line);
354                                }
355                                pair.put("nv",nv);
356                        }
357
358                        temp.add(pair);
359                        startoffset = endIndex + 1;
360                }
361                Map<String,Object>[] out = new HashMap[temp.size()];
362                for (int i = 0; i < temp.size(); i++) {
363                        Map<String, Object> pair =  temp.get(i);
364                        out[i] = pair;
365                }
366                return out;
367        }
368
369        protected Map<String,Object> parseXref(String line,
370                        int startoffset, int endoffset) throws IOException {
371                String xref_str = null;
372                String desc_str = null;
373
374                SOPair xref = unescape(line, '"', startoffset, endoffset, false);
375                xref_str = xref.str.trim();
376                if (xref_str.length() == 0)
377                        return null;
378
379                if (xref.index != -1) {
380                        SOPair desc = unescape(line, '"', xref.index + 1, endoffset, true);
381                        desc_str = desc.str.trim();
382                }
383
384
385                Map<String, Object> m = new HashMap<>();
386                m.put("xref",xref_str);
387                m.put("desc",desc_str);
388                return m;
389        }
390
391
392
393        private void triggerNewStanza(String stanza){
394                Iterator<OboFileEventListener> iter = listeners.iterator();
395                while (iter.hasNext()){
396                        OboFileEventListener li = iter.next();
397                        li.newStanza(stanza);
398                }
399        }
400
401        private void triggerNewKey(String key, String value){
402                Iterator<OboFileEventListener> iter = listeners.iterator();
403                while (iter.hasNext()){
404                        OboFileEventListener li = iter.next();
405                        li.newKey(key, value);
406                }
407        }
408
409        private void triggerNewSynonym(Synonym synonym){
410                Iterator<OboFileEventListener> iter = listeners.iterator();
411                while (iter.hasNext()){
412                        OboFileEventListener li = iter.next();
413                        li.newSynonym(synonym);
414                }
415        }
416
417        public static String escape(String str, boolean escapespaces) {
418                StringBuffer out = new StringBuffer();
419                for (int i = 0; i < str.length(); i++) {
420                        char c = str.charAt(i);
421                        Object o = unescapeChars.get(c);
422                        if (o == null)
423                                out.append(c);
424                        else {
425                                if (escapespaces || (!escapespaces && c != ' ' && c != '\t')) {
426                                        out.append("\\").append(o);
427                                } else
428                                        out.append(c);
429                        }
430                }
431                return out.toString();
432        }
433
434        public String unescape(String str) throws IOException {
435                return unescape(str, '\0', 0, str.length(), false).str;
436        }
437
438        public SOPair unescape(String str, char toChar, int startindex,
439                        boolean mustFindChar) throws IOException {
440                return unescape(str, toChar, startindex, str.length(), mustFindChar);
441        }
442
443        public SOPair unescape(String str, char toChar, int startindex,
444                        int endindex, boolean mustFindChar) throws IOException {
445                StringBuffer out = new StringBuffer();
446                int endValue = -1;
447                for (int i = startindex; i < endindex; i++) {
448                        char c = str.charAt(i);
449                        if (c == '\\') {
450                                i++;
451                                c = str.charAt(i);
452                                Character mapchar = escapeChars
453                                .get(c);
454                                if (mapchar == null)
455                                        throw new IOException("Unrecognized escape"
456                                                        + " character " + c + " found.");
457                                out.append(mapchar);
458                        } else if (c == toChar) {
459                                endValue = i;
460                                break;
461                        } else {
462                                out.append(c);
463                        }
464                }
465                if (endValue == -1 && mustFindChar) {
466                        throw new IOException("Expected " + toChar + "." + str);
467                }
468                return new SOPair(out.toString(), endValue);
469        }
470
471
472        public static int findUnescaped(String str, char toChar) {
473                return findUnescaped(str, toChar, 0, str.length());
474        }
475
476        public static int findUnescaped(String str, char toChar, int startIndex,
477                        int endIndex) {
478                return findUnescaped(str, toChar, startIndex, endIndex, false);
479        }
480
481        public static int findUnescaped(String str, char toChar, int startindex,
482                        int endindex, boolean honorQuotes) {
483                boolean inQuotes = false;
484                char quoteChar = '\0';
485                for (int i = startindex; i < endindex; i++) {
486                        char c = str.charAt(i);
487                        if (c == '\\') {
488                                i++;
489                                continue;
490                        } else if (inQuotes) {
491                                if (c == quoteChar)
492                                        inQuotes = false;
493                                continue;
494
495                        } else if (c == toChar) {
496                                return i;
497                        } else if (honorQuotes && isQuote(c)) {
498                                inQuotes = true;
499                                quoteChar = c;
500                        }
501                }
502                return -1;
503        }
504
505        public static boolean isEscapeStarter(char c) {
506                return c == '\\';
507        }
508
509        public static boolean isQuote(char c) {
510                return c == '"';
511        }
512
513        protected StringBuffer getTempBuffer() {
514                tempBuffer.delete(0, tempBuffer.length());
515                return tempBuffer;
516        }
517
518        protected SOPair readQuotedString(String value, int startIndex,
519                        int stopIndex, char terminatingChar, boolean requireQuotes,
520                        boolean legalEndOfLine) throws IOException {
521
522                char quoteChar = '\0';
523                StringBuffer out = getTempBuffer();
524                int i = startIndex;
525                boolean useQuotes = false;
526
527                for (; i < stopIndex; i++) {
528                        // burn through any leading whitespace
529                        if (Character.isWhitespace(value.charAt(i)))
530                                continue;
531
532                        // if the first non-whitespace character is not a quote,
533                        // proceed in non-quoted mode
534                        else if (!isQuote(value.charAt(i))) {
535                                if (requireQuotes)
536                                        throw new IOException(
537                                                        "Expected start of quoted string. " +
538                                                        line + " " +  value+ " at linenr " + linenum);
539                                useQuotes = false;
540                                break;
541                        } else {
542                                useQuotes = true;
543                                quoteChar = value.charAt(i);
544                                i++;
545                                break;
546                        }
547                }
548
549                // look for a closing quote or final delimiter
550                for (; i < stopIndex; i++) {
551                        if (isEscapeStarter(value.charAt(i))) {
552                                i++;
553                                if (i >= value.length())
554                                        throw new IOException("Incomplete escape sequence. " + line);
555                                out.append(value.charAt(i));
556                        } else if ((useQuotes && value.charAt(i) == quoteChar)
557                                        || (!useQuotes && value.charAt(i) == terminatingChar)) {
558                                if (!useQuotes)
559                                        return new SOPair(out.toString().trim(), startIndex, i - 1);
560                                else
561                                        return new SOPair(out.toString(), startIndex, i);
562                        } else {
563                                out.append(value.charAt(i));
564                        }
565                }
566                if (!useQuotes && legalEndOfLine)
567                        return new SOPair(out.toString().trim(), startIndex, i);
568                else
569                        throw new IOException("Unterminated quoted string. " +line);
570        }
571
572        protected int getNestedValue(NestedValue nv, String str, int startIndex)
573        throws IOException {
574                while (startIndex < str.length()) {
575                        int equalsIndex = findUnescaped(str, '=', startIndex, str.length());
576                        if (equalsIndex == -1)
577                                throw new IOException("Expected = in trailing modifier " +line);
578                        String name = str.substring(startIndex, equalsIndex).trim();
579                        SOPair value = readQuotedString(str, equalsIndex + 1, str.length(),
580                                        ',', false, true);
581
582                        Properties pv = new Properties();
583                        pv.setProperty(unescape(name),value.str);
584
585
586                        nv.addPropertyValue(pv);
587                        startIndex = value.endIndex + 1;
588                        for (; startIndex < str.length(); startIndex++) {
589                                if (Character.isWhitespace(str.charAt(startIndex)))
590                                        continue;
591                                else if (str.charAt(startIndex) == ',') {
592                                        startIndex++;
593                                        break;
594                                } else {
595                                        logger.error("found character |{}|", str.charAt(startIndex));
596                                        throw new IOException("Expected comma in trailing modifier. " +
597                                                        line + " linenr: " + linenum);
598                                }
599                        }
600                }
601                return str.length();
602        }
603
604}
605
/**
 * A set of name/value properties, as collected from an OBO trailing modifier.
 * Instances can be copied with {@link #clone()}.
 */
class NestedValue implements Cloneable {

        /** The collected properties; later additions overwrite earlier ones with the same key. */
        protected Properties propertyValues = new Properties();
        protected String name;
        protected String suggestedComment;

        public NestedValue() {
        }

        /** Returns "NestedValue: " followed by one " [key:value]" group per property. */
        @Override
        public String toString(){
                StringBuilder txt = new StringBuilder("NestedValue: ");
                for (Object rawKey : propertyValues.keySet()) {
                        String key = rawKey.toString();
                        String value = propertyValues.get(key).toString();
                        txt.append(" [").append(key).append(":").append(value).append("]");
                }
                return txt.toString();
        }

        public String getName() {
                return name;
        }

        /** Returns the internal (live) property table. */
        public Properties getPropertyValues() {
                return propertyValues;
        }

        /** Copies all entries of the given properties into this object, as strings.
         *
         * @param pv the properties to add
         */
        public void addPropertyValue(Properties pv) {
                for (Object rawKey : pv.keySet()) {
                        String key = rawKey.toString();
                        String value = pv.get(key).toString();
                        propertyValues.setProperty(key, value);
                }
        }

        /** Creates an independent copy of this object.
         * The copy gets its own property table, so changing the properties of the
         * copy does not affect the original and vice versa.
         *
         * @return the copy, never null
         */
        @Override
        public Object clone() {
                try {
                        NestedValue copy = (NestedValue) super.clone();
                        // super.clone() is shallow; give the copy its own property table
                        Properties copiedValues = new Properties();
                        copiedValues.putAll(propertyValues);
                        copy.propertyValues = copiedValues;
                        return copy;
                } catch (CloneNotSupportedException ex) {
                        // cannot happen: this class implements Cloneable
                        throw new AssertionError(ex);
                }
        }

        public String getSuggestedComment() {
                return suggestedComment;
        }

        public void setSuggestedComment(String suggestedComment) {
                this.suggestedComment = suggestedComment;
        }
}
667
668