001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.symbol;
023
024import java.util.Iterator;
025import java.util.List;
026import java.util.Set;
027
028import org.biojava.bio.Annotation;
029import org.biojava.bio.BioError;
030import org.biojava.bio.BioException;
031import org.biojava.bio.seq.io.SeqIOListener;
032import org.biojava.bio.seq.io.StreamParser;
033import org.biojava.bio.seq.io.SymbolTokenization;
034import org.biojava.bio.symbol.IntegerAlphabet.IntegerSymbol;
035import org.biojava.utils.ChangeVetoException;
036import org.biojava.utils.ListTools;
037import org.biojava.utils.Unchangeable;
038
/**
 * An alphabet that pairs every symbol of a wrapped alphabet with a flag
 * saying whether that symbol is soft masked.
 * <p>
 * Soft masking is usually displayed by making the masked regions somehow
 * different from the non masked regions. Typically the masked regions are
 * lower case but other schemes could be invented. For example a soft masked
 * DNA sequence may look like this:<pre>
 *
 * &gt;DNA_sequence
 * ATGGACGCTAGCATggtggtggtggtggtggtggtGCATAGCGAGCAAGTGGAGCGT
 *
 * </pre>
 * Where the lower case regions are masked by low complexity.
 * <p>
 * <code>SoftMaskedAlphabet</code>s come with <code>SymbolTokenization</code>s
 * that understand how to read and write the soft masking. The interpretation
 * of what constitutes a masked region is governed by an implementation of
 * a <code>MaskingDetector</code>. The <code>DEFAULT</code> field of the
 * <code>MaskingDetector</code> interface defines lower case tokens as masked.
 *
 * <p> Copyright (c) 2004 Novartis Institute for Tropical Diseases</p>
 * @author Mark Schreiber
 * @version 1.0
 */
061
062public final class SoftMaskedAlphabet
063    extends Unchangeable implements FiniteAlphabet{
064
065  //used to indicate masking. 0 indicates no mask 1 indicates mask.
066  private IntegerAlphabet.SubIntegerAlphabet binary;
067  private FiniteAlphabet alpha;
068  private String name;
069  private FiniteAlphabet delegateAlpha;
070  private MaskingDetector maskingDetector;
071
072  private SoftMaskedAlphabet(FiniteAlphabet alpha, String name)
073      throws IllegalAlphabetException{
074    this.alpha = alpha;
075    binary = IntegerAlphabet.getSubAlphabet(0,1);
076    this.name = name;
077    delegateAlpha = (FiniteAlphabet)AlphabetManager.getCrossProductAlphabet(
078        new ListTools.Doublet(alpha, binary));
079  }
080
081  /**
082   * Generates a soft masked Alphabet where lowercase tokens are assumed to be
083   * soft masked.
084   * @param alphaToMask for example the DNA alphabet.
085   * @throws IllegalAlphabetException if it cannot be constructed
086   * @return a reference to a singleton <code>SoftMaskedAlphabet</code>.
087   */
088  public static SoftMaskedAlphabet getInstance(FiniteAlphabet alphaToMask)
089      throws IllegalAlphabetException {
090    return getInstance(alphaToMask, MaskingDetector.DEFAULT);
091  }
092
093  /**
094   * Creates a compound alphabet that is a hybrid of the alphabet that is to
095   * be soft masked and a binary alphabet that indicates if any
096   * <code>Symbol</code> is soft masked or not.
097   *
098   * @param alphaToMask for example the DNA alphabet.
099   * @param maskingDetector to define masking behaivour
100   * @throws IllegalAlphabetException if it cannot be constructed
101   * @return a reference to a singleton <code>SoftMaskedAlphabet</code>.
102   */
103  public static SoftMaskedAlphabet getInstance(FiniteAlphabet alphaToMask,
104                                               MaskingDetector maskingDetector)
105      throws IllegalAlphabetException{
106    String lookup = "Softmasked {"+alphaToMask.getName()+"}";
107    if(AlphabetManager.registered(lookup)){
108      return (SoftMaskedAlphabet)AlphabetManager.alphabetForName(lookup);
109    }
110
111    SoftMaskedAlphabet sma = new SoftMaskedAlphabet(alphaToMask, lookup);
112    AlphabetManager.registerAlphabet(sma.getName(), sma);
113
114    sma.maskingDetector = maskingDetector;
115    return sma;
116  }
117
118  /**
119   * Gets the <CODE>Alphabet</CODE> upon which masking is being applied
120   * @return A <CODE>FiniteAlphabet</CODE>
121   */
122  public FiniteAlphabet getMaskedAlphabet(){
123    return alpha;
124  }
125
126  /**
127   * The compound alpha that holds the symbols used by this wrapper
128   * @return a <code>FiniteAlphabet</code>
129   */
130  protected FiniteAlphabet getDelegate(){
131    return delegateAlpha;
132  }
133
134  /**
135   * The SoftMaskedAlphabet has no annotation
136   * @return Annotation.EMPTY_ANNOTATION
137   */
138  public Annotation getAnnotation(){
139    return Annotation.EMPTY_ANNOTATION;
140  }
141
142  /**
143   * The name of the Alphabet
144   * @return a <code>String</code> in the form of
145   * <code>"Softmasked {"+alphaToMask.getName()+"}"</code>
146   */
147  public String getName(){
148    return name;
149  }
150
151  /**
152   * Gets the components of the <code>Alphabet</code>.
153   * @return a <code>List</code> with two members, the first is the wrapped
154   * <code>Alphabet</code> the second is the binary
155   * <code>SubIntegerAlphabet</code>.
156   */
157  public List getAlphabets(){
158    return new ListTools.Doublet(alpha, binary);
159  }
160
161  
162  /**
163   * Gets the compound symbol composed of the <code>Symbols</code> in the List.
164   * The <code>Symbols</code> in the <code>List</code> must be from <code>alpha</code>
165   * (defined in the constructor) and <code>SUBINTEGER[0..1]</code>
166   * @return A <code>Symbol</code> from this alphabet.
167   * @throws IllegalSymbolException if <code>l</code> is not as expected (see above)
168   * @param l a <code>List</code> of <code>Symbols</code>
169   */
170  public Symbol getSymbol(List l) throws IllegalSymbolException {
171    return delegateAlpha.getSymbol(l);
172  }
173  
174  /**
175   * This is not supported. Ambiguity should be handled at the level of the 
176   * wrapped Alphabet. Use <code>getSymbol(List l)</code> instead and provide
177   * it with an ambigutiy and a masking symbol.
178   * @param s a <code>Set</code> of <code>Symbols</code>
179   * @see #getSymbol(List l)
180   * @throws UnsupportedOperationException
181   */
182  public Symbol getAmbiguity(Set s) throws UnsupportedOperationException {
183    throw new UnsupportedOperationException(
184        "Ambiguity should be handled at the level of the wrapped Alphabet");
185  }
186
187  public Symbol getGapSymbol(){
188    return AlphabetManager.getGapSymbol(new ListTools.Doublet(alpha, binary));
189  }
190
191  public boolean contains(Symbol s){
192    return delegateAlpha.contains(s);
193  }
194
195  public void validate(Symbol s)throws IllegalSymbolException{
196    if(! contains(s)){
197      throw new IllegalSymbolException(
198          s, s.getName()+" is not a valid part of "+getName());
199    }
200  }
201
202  /**
203   * Getter for the <code>MaskingDetector<code>
204   * @return the <code>MaskingDetector<code>
205   */
206  public MaskingDetector getMaskingDetector(){
207    return maskingDetector;
208  }
209
210  public SymbolTokenization getTokenization(String type)
211      throws BioException{
212    return new CaseSensitiveTokenization(this, type);
213  }
214
215  public int size(){
216      return delegateAlpha.size();
217  }
218
219  public Iterator iterator(){
220    return delegateAlpha.iterator();
221  }
222
223  /**
224   * <code>SoftMaskedAlphabet</code>s cannot add new <code>Symbol</code>s. A
225   * <code>ChangeVetoException</code> will be thrown.
226   * @param s the <code>Symbol</code> to add.
227   * @throws ChangeVetoException when called.
228   */
229  public void addSymbol(Symbol s) throws ChangeVetoException{
230    throw new ChangeVetoException("SoftMaskedAlphabets cannot add new Symbols");
231  }
232
233  /**
234   * <code>SoftMaskedAlphabet</code>s cannot remove <code>Symbol</code>s. A
235   * <code>ChangeVetoException</code> will be thrown.
236   * @param s the <code>Symbol</code> to remove.
237   * @throws ChangeVetoException when called.
238   */
239  public void removeSymbol(Symbol s) throws ChangeVetoException{
240    throw new ChangeVetoException("SoftMaskedAlphabets cannot remove Symbols");
241  }
242
243  /**
244   * Determines if a <code>Symbol</code> is masked.
245   * @return true if <code>s</code> is masked.
246   * @param s the <code>Symbol</code> to test.
247   */
248  public boolean isMasked (BasisSymbol s) throws IllegalSymbolException {
249    validate(s);
250
251    IntegerSymbol b = (IntegerSymbol)s.getSymbols().get(1);
252    return (b.intValue() == 1);
253  }
254
255  /**
256   * Implementations will define how soft masking looks. The
257   * <code>DEFAULT</code> implementation considers softmasking to be represented
258   * by lower case characters.
259   *
260   * <p>Copyright (c) 2004 Novartis Institute for Tropical Diseases</p>
261   * @author Mark Schreiber
262   * @version 1.0
263   */
264  public interface MaskingDetector{
265    public boolean isMasked (String token);
266
267    /**
268     * Present the token for a <code>Symbol</code> as it would appear if masked
269     * @param token the <code>String</code> to mask.
270     * @return the masked token
271     */
272    public String mask (String token);
273
274    /**
275     * Present the token for a <code>Symbol</code> as it would appear if
276     * it wasn't softmasked
277     * @param token the <code>String</code> to un-mask.
278     * @return the un-masked token
279     */
280    public String unmask (String token);
281    public static MaskingDetector DEFAULT = new DefaultMaskingDetector();
282
283    class DefaultMaskingDetector implements MaskingDetector{
284
285      /**
286       * Default Behaivour is that if the whole token is lower case it is
287       * masked.
288       * @param token the <code>String</code> to check for masking
289       * @return true is it is all lower case, otherwise false.
290       */
291      public boolean isMasked(String token){
292
293        for (int i = 0; i < token.length(); i++) {
294          if(Character.isUpperCase(token.charAt(i))){
295            return false;
296          }
297        }
298
299        return true;
300      }
301
302      /**
303       * Masks a token by making it lowercase
304       * @param token the <code>String</code> to mask
305       * @return a lower case <code>String</code>
306       */
307      public String mask(String token){
308        return token.toLowerCase();
309      }
310
311      /**
312       * Un-masks the token by making it upper case.
313       * @param token the <code>String</code> to unmask
314       * @return the upper case <code>String</code>
315       */
316      public String unmask(String token){
317        return token.toUpperCase();
318      }
319    }
320  }
321
322  /**
323   * This <code>SymbolTokenizer</code> works with a delegate to softmask
324   * symbol tokenization as appropriate. It should only be used in combination
325   * with a SoftMaskedAlphabet.
326   * You will never instantiate one of these yourself.
327   *
328   * <p> Copyright (c) 2004 Novartis Institute for Tropical Diseases</p>
329   * @author Mark Schreiber
330   * @version 1.0
331   */
332  public class CaseSensitiveTokenization
333      extends Unchangeable implements SymbolTokenization{
334
335    private SymbolTokenization delegate;
336    private SoftMaskedAlphabet alpha;
337
338    private CaseSensitiveTokenization(
339        SoftMaskedAlphabet alpha, String type)
340        throws BioException{
341
342      this.alpha = alpha;
343      this.delegate = alpha.getMaskedAlphabet().getTokenization(type);
344    }
345
346    public Annotation getAnnotation(){
347      return Annotation.EMPTY_ANNOTATION;
348    }
349
350    public Alphabet getAlphabet(){
351      return alpha;
352    }
353
354    public SymbolTokenization.TokenType getTokenType(){
355      return delegate.getTokenType();
356    }
357
358    public Symbol parseToken(String token) throws IllegalSymbolException{
359      MaskingDetector md = alpha.getMaskingDetector();
360      IntegerSymbol bin;
361
362      Symbol component = delegate.parseToken(token);
363
364      if(md.isMasked(token)){
365        bin = binary.getSymbol(1);
366      }else{
367        bin = binary.getSymbol(0);
368      }
369
370      return alpha.getSymbol(new ListTools.Doublet(component, bin));
371    }
372
373    public String tokenizeSymbolList(SymbolList sl) throws
374        IllegalSymbolException {
375
376      StringBuffer sb = new StringBuffer(sl.length());
377      for(int i = 1; i <= sl.length(); i++){
378        sb.append(tokenizeSymbol(sl.symbolAt(i)));
379      }
380      return sb.toString();
381    }
382
383    /**
384     * The current implementation only supports character parsing. Word or
385     * fixed width parsing is not yet supported.
386     *
387     * @param l the <code>SeqIOListener</code> to callback to.
388     * @return a <code>StreamParser</code> that the <code>SeqIOListener</code>
389     * talks to.
390     */
391    public StreamParser parseStream(SeqIOListener l){
392      return new CharStreamParser(l);
393    }
394
395    public String tokenizeSymbol (Symbol s) throws IllegalSymbolException{
396      validate(s);
397      Symbol a = (Symbol) ((BasisSymbol)s).getSymbols().get(0);
398      String token = delegate.tokenizeSymbol(a);
399
400      if(alpha.isMasked((BasisSymbol) s)){
401        return maskingDetector.mask(token);
402      }
403
404      return maskingDetector.unmask(token);
405    }
406
407    private class CharStreamParser implements StreamParser {
408        private SeqIOListener listener;
409        private Symbol[] buffer;
410
411        public CharStreamParser(SeqIOListener l) {
412            this.listener = l;
413            buffer = new Symbol[256];
414        }
415
416        public void characters(char[] data, int start, int len)
417            throws IllegalSymbolException{
418            int cnt = 0;
419            while (cnt < len) {
420                int bcnt = 0;
421                while (cnt < len && bcnt < buffer.length) {
422                    buffer[bcnt++] = parseToken(
423                      new String(""+data[start + (cnt++)]));
424                }
425                try {
426                    listener.addSymbols(getAlphabet(),
427                                        buffer,
428                                        0,
429                                        bcnt);
430                } catch (IllegalAlphabetException ex) {
431                    throw new BioError(
432                      "Assertion failed: can't add symbols.", ex);
433                }
434            }
435        }
436
437        public void close() {
438        }
439    }
440
441  }
442}