Source code

001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022
023package org.biojava.bio.seq.io;
024
025import java.io.Serializable;
026import java.util.HashMap;
027import java.util.Iterator;
028import java.util.Map;
029
030import org.biojava.bio.Annotation;
031import org.biojava.bio.BioError;
032import org.biojava.bio.symbol.Alphabet;
033import org.biojava.bio.symbol.AlphabetManager;
034import org.biojava.bio.symbol.FiniteAlphabet;
035import org.biojava.bio.symbol.IllegalAlphabetException;
036import org.biojava.bio.symbol.IllegalSymbolException;
037import org.biojava.bio.symbol.Symbol;
038import org.biojava.bio.symbol.SymbolList;
039import org.biojava.utils.Unchangeable;
040
041/**
042 * <p>Implementation of SymbolTokenization which binds symbols to
043 * single unicode characters.</p>
044 *
045 * <p>Many alphabets (and all simple built-in alphabets like DNA, RNA
046 * and Protein) will have an instance of CharacterTokenization
047 * registered under the name 'token', so that you could say
048 * <code>CharacterTokenization ct = (CharacterTokenization)
049 * alpha.getTokenization("token");</code> and expect it to work. When
050 * you construct a new instance of this class for an alphabet, there
051 * will be no initial associations of Symbols with characters. It is
052 * your responsibility to populate the new tokenization appropriately.
053 * </p>
054 *
055 * @author Thomas Down
056 * @author Matthew Pocock
057 * @author Greg Cox
058 * @author Keith James
059 * @since 1.2
060 */
061
062public class CharacterTokenization
063  extends
064    Unchangeable
065  implements
066    SymbolTokenization, Serializable
067{
068    private Alphabet alphabet;
069    private Map symbolsToCharacters = new HashMap();
070    private Map charactersToSymbols = new HashMap();
071    private transient Symbol[] tokenTable;
072    private boolean caseSensitive;
073
074    public CharacterTokenization(Alphabet alpha, boolean caseSensitive) {
075        alphabet = alpha;
076        this.caseSensitive = caseSensitive;
077    }
078
079    public Alphabet getAlphabet() {
080        return alphabet;
081    }
082
083    public TokenType getTokenType() {
084        return CHARACTER;
085    }
086
087    public Annotation getAnnotation() {
088        return Annotation.EMPTY_ANNOTATION;
089    }
090
091    /**
092     * <p>
093     * Bind a Symbol to a character.
094     * </p>
095     *
096     * <p>
097     * This method will ensure that when this char is observed, it resolves to
098     * this symbol. If it was previously associated with another symbol, the old
099     * binding is removed.
100     * If this is the first time the symbol has been bound to any character,
101     * then this character is taken to be the default tokenization of the
102     * Symbol. This means that when converting symbols into characters, this
103     * char will be used. If the symbol has previously been bound to another
104     * character, then this char will not be produced for the symbol when
105     * stringifying the symbol, but this symbol will be produced when tokenizing
106     * this character.
107     * </p>
108     *
109     * @param s  the Symbol to bind
110     * @param c  the char to bind it to
111     */
112    public void bindSymbol(Symbol s, char c) {
113        Character chr = new Character(c);
114
115        if (!symbolsToCharacters.containsKey(s)) {
116            symbolsToCharacters.put(s, chr);
117        }
118        if (!charactersToSymbols.containsKey(chr)) {
119            charactersToSymbols.put(chr, s);
120        }
121        tokenTable = null;
122    }
123
124    public Symbol parseToken(String token)
125        throws IllegalSymbolException
126    {
127        if (token.length() != 1) {
128            throw new IllegalSymbolException("This Tokenization only accepts single-character tokens");
129        }
130        return parseTokenChar(token.charAt(0));
131    }
132
133    protected Symbol[] getTokenTable() {
134        if (tokenTable == null) {
135            int maxChar = 0;
136            for (Iterator i = charactersToSymbols.keySet().iterator(); i.hasNext(); ) {
137                Character c = (Character) i.next();
138                char cv = c.charValue();
139                if (caseSensitive) {
140                    maxChar = Math.max(maxChar, cv);
141                } else {
142                    maxChar = Math.max(maxChar, Character.toUpperCase(cv));
143                    maxChar = Math.max(maxChar, Character.toLowerCase(cv));
144                }
145            }
146
147            tokenTable = new Symbol[maxChar + 1];
148
149            for (Iterator i = charactersToSymbols.entrySet().iterator(); i.hasNext(); ) {
150                Map.Entry me = (Map.Entry) i.next();
151                Symbol sym = (Symbol) me.getValue();
152                Character c = (Character) me.getKey();
153                char cv = c.charValue();
154                if (caseSensitive) {
155                    tokenTable[cv] = sym;
156                } else {
157                    tokenTable[Character.toUpperCase(cv)] = sym;
158                    tokenTable[Character.toLowerCase(cv)] = sym;
159                }
160            }
161        }
162
163        return tokenTable;
164    }
165
166    protected Symbol parseTokenChar(char c)
167        throws IllegalSymbolException
168    {
169        Symbol[] tokenTable = getTokenTable();
170        Symbol sym = null;
171        if (c < tokenTable.length) {
172            sym = tokenTable[c];
173        }
174        if (sym == null) {
175            throw new IllegalSymbolException("This tokenization doesn't contain character: '" + c + "'");
176        }
177
178        return sym;
179    }
180
181    private Character _tokenizeSymbol(Symbol s)
182        throws IllegalSymbolException
183    {
184        Character c = (Character) symbolsToCharacters.get(s);
185        if (c == null) {
186            Alphabet alpha = getAlphabet();
187            alphabet.validate(s);
188            if (alpha instanceof FiniteAlphabet) {
189                c = (Character) symbolsToCharacters.get(AlphabetManager.getAllAmbiguitySymbol((FiniteAlphabet) alpha));
190            }
191            if (c == null) {
192                throw new IllegalSymbolException("No mapping for symbol " + s.getName());
193            }
194        }
195
196        return c;
197    }
198
199    public String tokenizeSymbol(Symbol s) throws IllegalSymbolException {
200        return String.valueOf(_tokenizeSymbol(s).charValue());
201    }
202
203    public String tokenizeSymbolList(SymbolList sl)
204        throws IllegalAlphabetException
205    {
206        if (sl.getAlphabet() != getAlphabet()) {
207            throw new IllegalAlphabetException("Alphabet " + sl.getAlphabet().getName() + " does not match " + getAlphabet().getName());
208        }
209        StringBuffer sb = new StringBuffer();
210        for (Iterator i = sl.iterator(); i.hasNext(); ) {
211            Symbol sym = (Symbol) i.next();
212            try {
213                Character c = _tokenizeSymbol(sym);
214                sb.append(c.charValue());
215            } catch (IllegalSymbolException ex) {
216                throw new IllegalAlphabetException(ex, "Couldn't tokenize");
217            }
218        }
219
220        return sb.substring(0);
221    }
222
223    public StreamParser parseStream(SeqIOListener listener) {
224        return new TPStreamParser(listener);
225    }
226
227    private class TPStreamParser implements StreamParser {
228        private SeqIOListener listener;
229        private Symbol[] buffer;
230
231        {
232            buffer = new Symbol[256];
233        }
234
235        public TPStreamParser(SeqIOListener l) {
236            this.listener = l;
237        }
238
239        public void characters(char[] data, int start, int len)
240            throws IllegalSymbolException
241        {
242            int cnt = 0;
243            while (cnt < len) {
244                int bcnt = 0;
245                while (cnt < len && bcnt < buffer.length) {
246                    buffer[bcnt++] = parseTokenChar(data[start + (cnt++)]);
247                }
248                try {
249                    listener.addSymbols(getAlphabet(),
250                                        buffer,
251                                        0,
252                                        bcnt);
253                } catch (IllegalAlphabetException ex) {
254                    throw new BioError( "Assertion failed: can't add symbols.", ex);
255                }
256            }
257        }
258
259        public void close() {
260        }
261    }
262}