/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */

package org.biojava.bio.symbol;

import java.util.Iterator;
import java.util.List;
import java.util.Set;

import org.biojava.bio.Annotation;
import org.biojava.bio.BioError;
import org.biojava.bio.BioException;
import org.biojava.bio.seq.io.SeqIOListener;
import org.biojava.bio.seq.io.StreamParser;
import org.biojava.bio.seq.io.SymbolTokenization;
import org.biojava.bio.symbol.IntegerAlphabet.IntegerSymbol;
import org.biojava.utils.ChangeVetoException;
import org.biojava.utils.ListTools;
import org.biojava.utils.Unchangeable;

/**
 * Soft masking is usually displayed by making the masked regions somehow
 * different from the non masked regions. Typically the masked regions are
 * lower case but other schemes could be invented. For example a softmasked
 * DNA sequence may look like this:<pre>
 *
 * &gt;DNA_sequence
 * ATGGACGCTAGCATggtggtggtggtggtggtggtGCATAGCGAGCAAGTGGAGCGT
 *
 * </pre>
 * Where the lowercase regions are masked by low complexity.
 * <p>
 * <code>SoftMaskedAlphabet</code>s come with <code>SymbolTokenizers</code>
 * that understand how to read and write the softmasking. The interpretation
 * of what constitutes a masked region is governed by an implementation of
 * a <code>MaskingDetector</code>. The <code>DEFAULT</code> field of the
 * <code>MaskingDetector</code> interface defines lower case tokens as masked.
 *
 * <p> Copyright (c) 2004 Novartis Institute for Tropical Diseases</p>
 * @author Mark Schreiber
 * @version 1.0
 */

public final class SoftMaskedAlphabet
    extends Unchangeable implements FiniteAlphabet{

  //used to indicate masking. 0 indicates no mask 1 indicates mask.
  private final IntegerAlphabet.SubIntegerAlphabet binary;
  //the alphabet that is being soft masked
  private final FiniteAlphabet alpha;
  private final String name;
  //cross product of (alpha x binary); holds the actual compound symbols
  private final FiniteAlphabet delegateAlpha;
  private final MaskingDetector maskingDetector;

  /**
   * Builds the wrapper around <code>alpha</code>. The instance is complete
   * (including its <code>MaskingDetector</code>) once the constructor
   * returns, so it can be published to the <code>AlphabetManager</code>
   * without any further initialisation.
   *
   * @param alpha the alphabet to soft mask
   * @param name the name this alphabet is known by
   * @param maskingDetector defines how masked tokens are recognised and
   * rendered
   * @throws IllegalAlphabetException if it cannot be constructed
   */
  private SoftMaskedAlphabet(FiniteAlphabet alpha,
                             String name,
                             MaskingDetector maskingDetector)
      throws IllegalAlphabetException{
    this.alpha = alpha;
    this.binary = IntegerAlphabet.getSubAlphabet(0,1);
    this.name = name;
    this.maskingDetector = maskingDetector;
    this.delegateAlpha = (FiniteAlphabet)AlphabetManager.getCrossProductAlphabet(
        new ListTools.Doublet(alpha, binary));
  }

  /**
   * Generates a soft masked Alphabet where lowercase tokens are assumed to be
   * soft masked.
   * @param alphaToMask for example the DNA alphabet.
   * @throws IllegalAlphabetException if it cannot be constructed
   * @return a reference to a singleton <code>SoftMaskedAlphabet</code>.
   */
  public static SoftMaskedAlphabet getInstance(FiniteAlphabet alphaToMask)
      throws IllegalAlphabetException {
    return getInstance(alphaToMask, MaskingDetector.DEFAULT);
  }

  /**
   * Creates a compound alphabet that is a hybrid of the alphabet that is to
   * be soft masked and a binary alphabet that indicates if any
   * <code>Symbol</code> is soft masked or not.
   * <p>
   * The alphabet is registered with the <code>AlphabetManager</code> under
   * the name <code>"Softmasked {"+alphaToMask.getName()+"}"</code>. If an
   * alphabet is already registered under that name it is returned as is and
   * the <code>maskingDetector</code> argument is ignored.
   *
   * @param alphaToMask for example the DNA alphabet.
   * @param maskingDetector to define masking behaviour
   * @throws IllegalAlphabetException if it cannot be constructed
   * @return a reference to a singleton <code>SoftMaskedAlphabet</code>.
   */
  public static SoftMaskedAlphabet getInstance(FiniteAlphabet alphaToMask,
                                        MaskingDetector maskingDetector)
      throws IllegalAlphabetException{
    String lookup = "Softmasked {"+alphaToMask.getName()+"}";
    if(AlphabetManager.registered(lookup)){
      return (SoftMaskedAlphabet)AlphabetManager.alphabetForName(lookup);
    }

    // The detector is handed to the constructor so that the alphabet is
    // never visible through the AlphabetManager without one.
    SoftMaskedAlphabet sma =
        new SoftMaskedAlphabet(alphaToMask, lookup, maskingDetector);
    AlphabetManager.registerAlphabet(sma.getName(), sma);
    return sma;
  }

  /**
   * Gets the <CODE>Alphabet</CODE> upon which masking is being applied
   * @return A <CODE>FiniteAlphabet</CODE>
   */
  public FiniteAlphabet getMaskedAlphabet(){
    return alpha;
  }

  /**
   * The compound alpha that holds the symbols used by this wrapper
   * @return a <code>FiniteAlphabet</code>
   */
  protected FiniteAlphabet getDelegate(){
    return delegateAlpha;
  }

  /**
   * The SoftMaskedAlphabet has no annotation
   * @return Annotation.EMPTY_ANNOTATION
   */
  public Annotation getAnnotation(){
    return Annotation.EMPTY_ANNOTATION;
  }

  /**
   * The name of the Alphabet
   * @return a <code>String</code> in the form of
   * <code>"Softmasked {"+alphaToMask.getName()+"}"</code>
   */
  public String getName(){
    return name;
  }

  /**
   * Gets the components of the <code>Alphabet</code>.
   * @return a <code>List</code> with two members, the first is the wrapped
   * <code>Alphabet</code> the second is the binary
   * <code>SubIntegerAlphabet</code>.
   */
  public List getAlphabets(){
    return new ListTools.Doublet(alpha, binary);
  }


  /**
   * Gets the compound symbol composed of the <code>Symbols</code> in the List.
   * The <code>Symbols</code> in the <code>List</code> must be from <code>alpha</code>
   * (defined in the constructor) and <code>SUBINTEGER[0..1]</code>
   * @return A <code>Symbol</code> from this alphabet.
   * @throws IllegalSymbolException if <code>l</code> is not as expected (see above)
   * @param l a <code>List</code> of <code>Symbols</code>
   */
  public Symbol getSymbol(List l) throws IllegalSymbolException {
    return delegateAlpha.getSymbol(l);
  }

  /**
   * This is not supported. Ambiguity should be handled at the level of the
   * wrapped Alphabet. Use <code>getSymbol(List l)</code> instead and provide
   * it with an ambiguity and a masking symbol.
   * @param s a <code>Set</code> of <code>Symbols</code>
   * @return never returns normally
   * @see #getSymbol(List l)
   * @throws UnsupportedOperationException always
   */
  public Symbol getAmbiguity(Set s) throws UnsupportedOperationException {
    throw new UnsupportedOperationException(
        "Ambiguity should be handled at the level of the wrapped Alphabet");
  }

  /**
   * Gets the gap symbol for the (wrapped alphabet, binary alphabet) pair.
   * @return the gap <code>Symbol</code> supplied by the
   * <code>AlphabetManager</code>
   */
  public Symbol getGapSymbol(){
    return AlphabetManager.getGapSymbol(new ListTools.Doublet(alpha, binary));
  }

  /**
   * Tests membership by asking the underlying compound alphabet.
   * @param s the <code>Symbol</code> to test
   * @return true if the delegate alphabet contains <code>s</code>
   */
  public boolean contains(Symbol s){
    return delegateAlpha.contains(s);
  }

  /**
   * Checks that <code>s</code> belongs to this alphabet.
   * @param s the <code>Symbol</code> to validate
   * @throws IllegalSymbolException if {@link #contains(Symbol)} is false
   * for <code>s</code>
   */
  public void validate(Symbol s)throws IllegalSymbolException{
    if(contains(s)){
      return;
    }
    throw new IllegalSymbolException(
        s, s.getName()+" is not a valid part of "+getName());
  }

  /**
   * Getter for the <code>MaskingDetector</code>
   * @return the <code>MaskingDetector</code>
   */
  public MaskingDetector getMaskingDetector(){
    return maskingDetector;
  }

  /**
   * Creates a tokenization that wraps the tokenization of the masked
   * alphabet for the same <code>type</code> and applies soft masking to the
   * tokens it reads and writes.
   * @param type the tokenization type, passed on to the wrapped alphabet
   * @return a new <code>CaseSensitiveTokenization</code>
   * @throws BioException if the wrapped alphabet cannot supply a
   * tokenization for <code>type</code>
   */
  public SymbolTokenization getTokenization(String type)
      throws BioException{
    return new CaseSensitiveTokenization(this, type);
  }

  /**
   * The number of symbols in the underlying compound alphabet.
   * @return the size of the delegate alphabet
   */
  public int size(){
    return delegateAlpha.size();
  }

  /**
   * Iterates over the symbols of the underlying compound alphabet.
   * @return the delegate alphabet's <code>Iterator</code>
   */
  public Iterator iterator(){
    return delegateAlpha.iterator();
  }

  /**
   * <code>SoftMaskedAlphabet</code>s cannot add new <code>Symbol</code>s. A
   * <code>ChangeVetoException</code> will be thrown.
   * @param s the <code>Symbol</code> to add.
   * @throws ChangeVetoException when called.
   */
  public void addSymbol(Symbol s) throws ChangeVetoException{
    throw new ChangeVetoException("SoftMaskedAlphabets cannot add new Symbols");
  }

  /**
   * <code>SoftMaskedAlphabet</code>s cannot remove <code>Symbol</code>s. A
   * <code>ChangeVetoException</code> will be thrown.
   * @param s the <code>Symbol</code> to remove.
   * @throws ChangeVetoException when called.
   */
  public void removeSymbol(Symbol s) throws ChangeVetoException{
    throw new ChangeVetoException("SoftMaskedAlphabets cannot remove Symbols");
  }

  /**
   * Determines if a <code>Symbol</code> is masked.
   * @return true if <code>s</code> is masked.
   * @param s the <code>Symbol</code> to test.
   * @throws IllegalSymbolException if <code>s</code> is not from this
   * alphabet
   */
  public boolean isMasked (BasisSymbol s) throws IllegalSymbolException {
    validate(s);

    // Component 0 is the wrapped symbol, component 1 is the mask flag.
    IntegerSymbol maskFlag = (IntegerSymbol)s.getSymbols().get(1);
    return maskFlag.intValue() == 1;
  }

  /**
   * Implementations will define how soft masking looks. The
   * <code>DEFAULT</code> implementation considers softmasking to be represented
   * by lower case characters.
   *
   * <p>Copyright (c) 2004 Novartis Institute for Tropical Diseases</p>
   * @author Mark Schreiber
   * @version 1.0
   */
  public interface MaskingDetector{

    /**
     * Decides whether a token, as it appears in the input, is soft masked.
     * @param token the <code>String</code> to check for masking
     * @return true if the token is considered masked
     */
    public boolean isMasked (String token);

    /**
     * Present the token for a <code>Symbol</code> as it would appear if masked
     * @param token the <code>String</code> to mask.
     * @return the masked token
     */
    public String mask (String token);

    /**
     * Present the token for a <code>Symbol</code> as it would appear if
     * it wasn't softmasked
     * @param token the <code>String</code> to un-mask.
     * @return the un-masked token
     */
    public String unmask (String token);

    /**
     * The default detector: lower case means masked.
     */
    public static MaskingDetector DEFAULT = new DefaultMaskingDetector();

    class DefaultMaskingDetector implements MaskingDetector{

      /**
       * Default behaviour is that if the whole token is lower case it is
       * masked. More precisely, a token is reported as masked when it
       * contains no upper case character, so tokens without any letters
       * (including the empty token) are also reported as masked.
       * @param token the <code>String</code> to check for masking
       * @return false if any character is upper case, otherwise true.
       */
      public boolean isMasked(String token){
        final int length = token.length();
        for (int i = 0; i < length; i++) {
          if(Character.isUpperCase(token.charAt(i))){
            return false;
          }
        }
        return true;
      }

      /**
       * Masks a token by making it lowercase
       * @param token the <code>String</code> to mask
       * @return a lower case <code>String</code>
       */
      public String mask(String token){
        return token.toLowerCase();
      }

      /**
       * Un-masks the token by making it upper case.
       * @param token the <code>String</code> to unmask
       * @return the upper case <code>String</code>
       */
      public String unmask(String token){
        return token.toUpperCase();
      }
    }
  }

  /**
   * This <code>SymbolTokenizer</code> works with a delegate to softmask
   * symbol tokenization as appropriate. It should only be used in combination
   * with a SoftMaskedAlphabet. You will never instantiate one of these
   * yourself.
   *
   * <p> Copyright (c) 2004 Novartis Institute for Tropical Diseases</p>
   * @author Mark Schreiber
   * @version 1.0
   */
  public class CaseSensitiveTokenization
      extends Unchangeable implements SymbolTokenization{

    //tokenization of the wrapped (un-masked) alphabet
    private final SymbolTokenization delegate;
    //the soft masked alphabet this tokenization reads and writes
    private final SoftMaskedAlphabet alpha;

    /**
     * @param alpha the soft masked alphabet to tokenize
     * @param type the tokenization type requested from the wrapped alphabet
     * @throws BioException if the wrapped alphabet cannot supply a
     * tokenization for <code>type</code>
     */
    private CaseSensitiveTokenization(
        SoftMaskedAlphabet alpha, String type)
        throws BioException{

      this.alpha = alpha;
      this.delegate = alpha.getMaskedAlphabet().getTokenization(type);
    }

    /**
     * This tokenization has no annotation.
     * @return Annotation.EMPTY_ANNOTATION
     */
    public Annotation getAnnotation(){
      return Annotation.EMPTY_ANNOTATION;
    }

    /**
     * @return the <code>SoftMaskedAlphabet</code> being tokenized
     */
    public Alphabet getAlphabet(){
      return alpha;
    }

    /**
     * @return the token type of the wrapped alphabet's tokenization
     */
    public SymbolTokenization.TokenType getTokenType(){
      return delegate.getTokenType();
    }

    /**
     * Parses a token into a compound symbol made of the symbol the wrapped
     * tokenization produces for <code>token</code> and a mask flag (1 if the
     * <code>MaskingDetector</code> reports the token as masked, else 0).
     * @param token the token to parse
     * @return the compound <code>Symbol</code> from the soft masked alphabet
     * @throws IllegalSymbolException if the token cannot be parsed
     */
    public Symbol parseToken(String token) throws IllegalSymbolException{
      // Parse first so an unparsable token fails before masking is examined.
      Symbol component = delegate.parseToken(token);

      int maskFlag = alpha.getMaskingDetector().isMasked(token) ? 1 : 0;
      IntegerSymbol bin = alpha.binary.getSymbol(maskFlag);

      return alpha.getSymbol(new ListTools.Doublet(component, bin));
    }

    /**
     * Concatenates the tokens of every symbol in the list.
     * @param sl the <code>SymbolList</code> to tokenize
     * @return the tokens of <code>sl</code>, in order, as one
     * <code>String</code>
     * @throws IllegalSymbolException if a symbol cannot be tokenized
     */
    public String tokenizeSymbolList(SymbolList sl) throws
        IllegalSymbolException {

      StringBuffer sb = new StringBuffer(sl.length());
      // SymbolList positions are 1-based.
      for(int i = 1; i <= sl.length(); i++){
        sb.append(tokenizeSymbol(sl.symbolAt(i)));
      }
      return sb.toString();
    }

    /**
     * The current implementation only supports character parsing. Word or
     * fixed width parsing is not yet supported.
     *
     * @param l the <code>SeqIOListener</code> to callback to.
     * @return a <code>StreamParser</code> that the <code>SeqIOListener</code>
     * talks to.
     */
    public StreamParser parseStream(SeqIOListener l){
      return new CharStreamParser(l);
    }

    /**
     * Produces the token for a compound symbol: the wrapped tokenization's
     * token for the first component, passed through the detector's
     * <code>mask</code> or <code>unmask</code> depending on the mask flag.
     * @param s a <code>Symbol</code> from the soft masked alphabet
     * @return the masked or un-masked token
     * @throws IllegalSymbolException if <code>s</code> is not from the
     * soft masked alphabet
     */
    public String tokenizeSymbol (Symbol s) throws IllegalSymbolException{
      alpha.validate(s);

      BasisSymbol compound = (BasisSymbol) s;
      Symbol wrapped = (Symbol) compound.getSymbols().get(0);
      String token = delegate.tokenizeSymbol(wrapped);

      MaskingDetector md = alpha.getMaskingDetector();
      if(alpha.isMasked(compound)){
        return md.mask(token);
      }
      return md.unmask(token);
    }

    /**
     * Parses a character stream one character per token and forwards the
     * resulting symbols to the listener in chunks of at most 256 symbols.
     */
    private class CharStreamParser implements StreamParser {
      private final SeqIOListener listener;
      private final Symbol[] buffer;

      public CharStreamParser(SeqIOListener l) {
        this.listener = l;
        this.buffer = new Symbol[256];
      }

      /**
       * Parses <code>len</code> characters of <code>data</code> beginning at
       * <code>start</code> and passes the symbols to the listener.
       * @param data the characters to parse
       * @param start index of the first character to parse
       * @param len number of characters to parse
       * @throws IllegalSymbolException if a character cannot be parsed
       */
      public void characters(char[] data, int start, int len)
          throws IllegalSymbolException{
        int cnt = 0;
        while (cnt < len) {
          // Fill the buffer (or as much as remains), then flush it.
          int bcnt = 0;
          while (cnt < len && bcnt < buffer.length) {
            buffer[bcnt++] = parseToken(String.valueOf(data[start + (cnt++)]));
          }
          try {
            listener.addSymbols(getAlphabet(),
                                buffer,
                                0,
                                bcnt);
          } catch (IllegalAlphabetException ex) {
            throw new BioError(
                "Assertion failed: can't add symbols.", ex);
          }
        }
      }

      /**
       * Nothing is held back between calls to <code>characters</code>, so
       * there is nothing to flush.
       */
      public void close() {
      }
    }

  }
}