001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022
023package org.biojava.bio.seq.io;
024
025import java.io.Serializable;
026import java.util.ArrayList;
027import java.util.Iterator;
028import java.util.List;
029
030import org.biojava.bio.Annotation;
031import org.biojava.bio.symbol.Alphabet;
032import org.biojava.bio.symbol.IllegalAlphabetException;
033import org.biojava.bio.symbol.IllegalSymbolException;
034import org.biojava.bio.symbol.Symbol;
035import org.biojava.bio.symbol.SymbolList;
036import org.biojava.utils.Unchangeable;
037
038/**
039 * Base class for tokenizations which accept whitespace-separated
040 * `words'.  Splits at whitespace, except when it is quoted by
041 * either double-quotes ("), brackets (), or square brackets [].
042 *
043 * @author Thomas Down
044 * @author Greg Cox
045 * @author Keith James
046 * @since 1.2
047 */
048
049public abstract class WordTokenization
050  extends
051    Unchangeable
052  implements
053    SymbolTokenization, Serializable
054{
055    private Alphabet alphabet;
056
057    public WordTokenization(Alphabet fab) {
058        this.alphabet = fab;
059    }
060
061    public Alphabet getAlphabet() {
062        return alphabet;
063    }
064
065    public TokenType getTokenType() {
066        return SEPARATED;
067    }
068
069    public Annotation getAnnotation() {
070        return Annotation.EMPTY_ANNOTATION;
071    }
072
073    public String tokenizeSymbolList(SymbolList sl)
074        throws IllegalSymbolException, IllegalAlphabetException
075    {
076        if (sl.getAlphabet() != getAlphabet()) {
077            throw new IllegalAlphabetException("Alphabet " + sl.getAlphabet().getName() + " does not match " + getAlphabet().getName());
078        }
079        StringBuffer sb = new StringBuffer();
080        Iterator i = sl.iterator();
081        while (i.hasNext()) {
082            Symbol sym = (Symbol) i.next();
083            sb.append(tokenizeSymbol(sym));
084            if (i.hasNext()) {
085                sb.append(' ');
086            }
087        }
088        return sb.substring(0);
089    }
090
091    public StreamParser parseStream(SeqIOListener siol) {
092        return new WordStreamParser(siol);
093    }
094
095    protected List splitString(String str)
096        throws IllegalSymbolException
097    {
098        int ptr = 0;
099        List sl = new ArrayList();
100
101        while (ptr < str.length()) {
102            char c = str.charAt(ptr);
103            if (Character.isWhitespace(c)) {
104                ++ptr;
105            } else if (c == '(') {
106                int nextPtr = findMatch(str, ptr, '(', ')');
107                sl.add(str.substring(ptr, nextPtr));
108                ptr = nextPtr;
109            } else if (c == '[') {
110                int nextPtr = findMatch(str, ptr, '[', ']');
111                sl.add(str.substring(ptr, nextPtr));
112                ptr = nextPtr;
113            } else {
114                int nextPtr = ptr;
115                char nc;
116                boolean quoted = false;
117                do {
118                    nextPtr++;
119                    if (nextPtr == str.length()) {
120                        nc = ' ';
121                    } else {
122                        nc = str.charAt(nextPtr);
123                    }
124                    if (nc == '"') {
125                        quoted = !quoted;
126                    }
127                } while (!Character.isWhitespace(nc));
128
129                sl.add(str.substring(ptr, nextPtr));
130                ptr = nextPtr;
131            }
132        }
133
134        return sl;
135    }
136
137    protected Symbol[] parseString(String s)
138        throws IllegalSymbolException
139    {
140        List split = splitString(s);
141        Symbol[] syms = new Symbol[split.size()];
142        for (int i = 0; i < split.size(); ++i) {
143            syms[i] = parseToken((String) split.get(i));
144        }
145        return syms;
146    }
147
148    private class WordStreamParser implements StreamParser {
149        SeqIOListener listener;
150        StringBuffer sb = new StringBuffer();
151
152        WordStreamParser(SeqIOListener l) {
153            listener = l;
154        }
155
156        public void characters(char[] data, int start, int len) {
157            sb.append(data, start, len);
158        }
159
160        public void close()
161            throws IllegalSymbolException
162        {
163            String str = sb.substring(0);
164            Symbol[] syms = parseString(str);
165            try {
166                listener.addSymbols(alphabet, syms, 0, syms.length);
167            } catch (IllegalAlphabetException ex) {
168                throw new IllegalSymbolException("Mismatched alphabets");
169            }
170        }
171    }
172
173    private int findMatch(String str, int ptr, char openChar, char closeChar) {
174        int level = 0;
175        do {
176            char c = str.charAt(ptr++);
177            if (c == openChar) {
178                ++level;
179            } else if (c == closeChar) {
180                --level;
181            }
182        } while (level > 0);
183        return ptr;
184    }
185}