001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022 023package org.biojava.bio.seq.io; 024 025import java.io.Serializable; 026import java.util.HashMap; 027import java.util.Iterator; 028import java.util.Map; 029 030import org.biojava.bio.Annotation; 031import org.biojava.bio.BioError; 032import org.biojava.bio.symbol.Alphabet; 033import org.biojava.bio.symbol.AlphabetManager; 034import org.biojava.bio.symbol.FiniteAlphabet; 035import org.biojava.bio.symbol.IllegalAlphabetException; 036import org.biojava.bio.symbol.IllegalSymbolException; 037import org.biojava.bio.symbol.Symbol; 038import org.biojava.bio.symbol.SymbolList; 039import org.biojava.utils.Unchangeable; 040 041/** 042 * <p>Implementation of SymbolTokenization which binds symbols to 043 * single unicode characters.</p> 044 * 045 * <p>Many alphabets (and all simple built-in alphabets like DNA, RNA 046 * and Protein) will have an instance of CharacterTokenization 047 * registered under the name 'token', so that you could say 048 * <code>CharacterTokenization ct = (CharacterTokenization) 049 * alpha.getTokenization('token');</code> and expect it to work. When 050 * you construct a new instance of this class for an alphabet, there 051 * will be no initial associations of Symbols with characters. It is 052 * your responsibility to populate the new tokenization appropriately. 053 * </p> 054 * 055 * @author Thomas Down 056 * @author Matthew Pocock 057 * @author Greg Cox 058 * @author Keith James 059 * @since 1.2 060 */ 061 062public class CharacterTokenization 063 extends 064 Unchangeable 065 implements 066 SymbolTokenization, Serializable 067{ 068 private Alphabet alphabet; 069 private Map symbolsToCharacters = new HashMap(); 070 private Map charactersToSymbols = new HashMap(); 071 private transient Symbol[] tokenTable; 072 private boolean caseSensitive; 073 074 public CharacterTokenization(Alphabet alpha, boolean caseSensitive) { 075 alphabet = alpha; 076 this.caseSensitive = caseSensitive; 077 } 078 079 public Alphabet getAlphabet() { 080 return alphabet; 081 } 082 083 public TokenType getTokenType() { 084 return CHARACTER; 085 } 086 087 public Annotation getAnnotation() { 088 return Annotation.EMPTY_ANNOTATION; 089 } 090 091 /** 092 * <p> 093 * Bind a Symbol to a character. 094 * </p> 095 * 096 * <p> 097 * This method will ensure that when this char is observed, it resolves to 098 * this symbol. If it was previously associated with another symbol, the old 099 * binding is removed. 100 * If this is the first time the symbol has been bound to any character, 101 * then this character is taken to be the default tokenization of the 102 * Symbol. This means that when converting symbols into characters, this 103 * char will be used. If the symbol has previously been bound to another 104 * character, then this char will not be produced for the symbol when 105 * stringifying the symbol, but this symbol will be produced when tokenizing 106 * this character. 107 * </p> 108 * 109 * @param s the Symbol to bind 110 * @param c the char to bind it to 111 */ 112 public void bindSymbol(Symbol s, char c) { 113 Character chr = new Character(c); 114 115 if (!symbolsToCharacters.containsKey(s)) { 116 symbolsToCharacters.put(s, chr); 117 } 118 if (!charactersToSymbols.containsKey(chr)) { 119 charactersToSymbols.put(chr, s); 120 } 121 tokenTable = null; 122 } 123 124 public Symbol parseToken(String token) 125 throws IllegalSymbolException 126 { 127 if (token.length() != 1) { 128 throw new IllegalSymbolException("This Tokenization only accepts single-character tokens"); 129 } 130 return parseTokenChar(token.charAt(0)); 131 } 132 133 protected Symbol[] getTokenTable() { 134 if (tokenTable == null) { 135 int maxChar = 0; 136 for (Iterator i = charactersToSymbols.keySet().iterator(); i.hasNext(); ) { 137 Character c = (Character) i.next(); 138 char cv = c.charValue(); 139 if (caseSensitive) { 140 maxChar = Math.max(maxChar, cv); 141 } else { 142 maxChar = Math.max(maxChar, Character.toUpperCase(cv)); 143 maxChar = Math.max(maxChar, Character.toLowerCase(cv)); 144 } 145 } 146 147 tokenTable = new Symbol[maxChar + 1]; 148 149 for (Iterator i = charactersToSymbols.entrySet().iterator(); i.hasNext(); ) { 150 Map.Entry me = (Map.Entry) i.next(); 151 Symbol sym = (Symbol) me.getValue(); 152 Character c = (Character) me.getKey(); 153 char cv = c.charValue(); 154 if (caseSensitive) { 155 tokenTable[cv] = sym; 156 } else { 157 tokenTable[Character.toUpperCase(cv)] = sym; 158 tokenTable[Character.toLowerCase(cv)] = sym; 159 } 160 } 161 } 162 163 return tokenTable; 164 } 165 166 protected Symbol parseTokenChar(char c) 167 throws IllegalSymbolException 168 { 169 Symbol[] tokenTable = getTokenTable(); 170 Symbol sym = null; 171 if (c < tokenTable.length) { 172 sym = tokenTable[c]; 173 } 174 if (sym == null) { 175 throw new IllegalSymbolException("This tokenization doesn't contain character: '" + c + "'"); 176 } 177 178 return sym; 179 } 180 181 private Character _tokenizeSymbol(Symbol s) 182 throws IllegalSymbolException 183 { 184 Character c = (Character) symbolsToCharacters.get(s); 185 if (c == null) { 186 Alphabet alpha = getAlphabet(); 187 alphabet.validate(s); 188 if (alpha instanceof FiniteAlphabet) { 189 c = (Character) symbolsToCharacters.get(AlphabetManager.getAllAmbiguitySymbol((FiniteAlphabet) alpha)); 190 } 191 if (c == null) { 192 throw new IllegalSymbolException("No mapping for symbol " + s.getName()); 193 } 194 } 195 196 return c; 197 } 198 199 public String tokenizeSymbol(Symbol s) throws IllegalSymbolException { 200 return String.valueOf(_tokenizeSymbol(s).charValue()); 201 } 202 203 public String tokenizeSymbolList(SymbolList sl) 204 throws IllegalAlphabetException 205 { 206 if (sl.getAlphabet() != getAlphabet()) { 207 throw new IllegalAlphabetException("Alphabet " + sl.getAlphabet().getName() + " does not match " + getAlphabet().getName()); 208 } 209 StringBuffer sb = new StringBuffer(); 210 for (Iterator i = sl.iterator(); i.hasNext(); ) { 211 Symbol sym = (Symbol) i.next(); 212 try { 213 Character c = _tokenizeSymbol(sym); 214 sb.append(c.charValue()); 215 } catch (IllegalSymbolException ex) { 216 throw new IllegalAlphabetException(ex, "Couldn't tokenize"); 217 } 218 } 219 220 return sb.substring(0); 221 } 222 223 public StreamParser parseStream(SeqIOListener listener) { 224 return new TPStreamParser(listener); 225 } 226 227 private class TPStreamParser implements StreamParser { 228 private SeqIOListener listener; 229 private Symbol[] buffer; 230 231 { 232 buffer = new Symbol[256]; 233 } 234 235 public TPStreamParser(SeqIOListener l) { 236 this.listener = l; 237 } 238 239 public void characters(char[] data, int start, int len) 240 throws IllegalSymbolException 241 { 242 int cnt = 0; 243 while (cnt < len) { 244 int bcnt = 0; 245 while (cnt < len && bcnt < buffer.length) { 246 buffer[bcnt++] = parseTokenChar(data[start + (cnt++)]); 247 } 248 try { 249 listener.addSymbols(getAlphabet(), 250 buffer, 251 0, 252 bcnt); 253 } catch (IllegalAlphabetException ex) { 254 throw new BioError( "Assertion failed: can't add symbols.", ex); 255 } 256 } 257 } 258 259 public void close() { 260 } 261 } 262}