001
002
003
004
005package org.biojava.utils.regex;
006
007import java.util.Iterator;
008
009import org.biojava.bio.BioException;
010import org.biojava.bio.seq.io.CharacterTokenization;
011import org.biojava.bio.seq.io.SymbolTokenization;
012import org.biojava.bio.symbol.AbstractAlphabet;
013import org.biojava.bio.symbol.Alphabet;
014import org.biojava.bio.symbol.AlphabetManager;
015import org.biojava.bio.symbol.AtomicSymbol;
016import org.biojava.bio.symbol.FiniteAlphabet;
017import org.biojava.bio.symbol.IllegalAlphabetException;
018import org.biojava.bio.symbol.IllegalSymbolException;
019import org.biojava.bio.symbol.Symbol;
020
021/**
022 * A class that creates Patterns for regex matching on 
023 * SymbolLists of a specific Alphabet.
024 * @author David Huen
025 * @since 1.4
026 */
027public class PatternFactory
028{
029    private FiniteAlphabet alfa;
030    private SymbolTokenization toke = null;
031    private PatternChecker checker = null;
032
033    PatternFactory(FiniteAlphabet alfa)
034    {
035        this.alfa = alfa;
036        fetchTokenizer();
037    }
038
039    private void fetchTokenizer()
040    {
041        boolean gotCharTokenizer =false;
042        try {
043            toke = alfa.getTokenization("token");
044            if (toke.getTokenType() == SymbolTokenization.CHARACTER)
045                gotCharTokenizer = true;
046        }
047        catch (BioException be) {
048        }
049
050        if (!gotCharTokenizer) {
051            // make own tokenizer for this turkey
052            CharacterTokenization cToke = new CharacterTokenization(alfa, true);
053
054            // go thru' and associate all atomic symbols with a unicode char
055            char uniChar = '\uE000';
056            for (Iterator symI = alfa.iterator(); symI.hasNext(); ) {
057                AtomicSymbol sym = (AtomicSymbol) symI.next();
058                cToke.bindSymbol(sym, uniChar);
059                uniChar++;
060            }
061
062            // add all ambiguity symbol
063            cToke.bindSymbol(
064                AlphabetManager.getAllAmbiguitySymbol((FiniteAlphabet) alfa),
065                '\uF8FF');
066            // add terminal gap
067            cToke.bindSymbol(Alphabet.EMPTY_ALPHABET.getGapSymbol(), '~');
068            // add interstitial gap
069            cToke.bindSymbol(alfa.getGapSymbol(), '-');
070
071            // bind alphabet to this tokenization
072            ((AbstractAlphabet) alfa).putTokenization("unicode", cToke);
073            toke = cToke;
074        }
075    }
076
077    /**
078     * Returns a Pattern object that applies the specified regex
079     * against SymbolLists in the Alphabet that this PatternFactory 
080     * was defined against.  The String returned by getName() is
081     * set to pattern.
082     */
083    public org.biojava.utils.regex.Pattern compile(String pattern)
084        throws RegexException, IllegalAlphabetException
085    {
086        // validate the pattern is from this alphabet
087        // we only accept RE tokens and characters from 
088        // the alphabet itself.
089        if (checker == null) checker = new PatternChecker(alfa);
090        return new org.biojava.utils.regex.Pattern(pattern, checker, alfa);
091    }
092
093    /**
094     * Returns a Pattern object that applies the specified regex
095     * against SymbolLists in the Alphabet that this PatternFactory
096     * was defined against.
097     *
098     * @param pattern regex pattern expressed as a String.
099     * @param label A String label assigned to the Pattern object.  Can be retrieved later with getName().
100     */
101    public org.biojava.utils.regex.Pattern compile(String pattern, String label)
102        throws RegexException, IllegalAlphabetException
103    {
104        // validate the pattern is from this alphabet
105        // we only accept RE tokens and characters from
106        // the alphabet itself.
107        if (checker == null) checker = new PatternChecker(alfa);
108        return new org.biojava.utils.regex.Pattern(pattern, checker, alfa, label);
109    }
110
111    /**
112     * Returns the character that represents the specified Symbol in
113     * the Alphabet that this PatternFactory was defined for.
114     * <p>
115     * The character will be ASCII in Alphabets that define a Character tokenization.
116     * In Alphabets that don't a Unicode character in the private range is returned
117     * instead and this can be used to assemble the String that is the argument
118     * for the compile method.
119     */
120    public char charValue(Symbol sym)
121        throws IllegalSymbolException
122    {
123        // this class is only used with alphabets that have a character tokenization.
124        return toke.tokenizeSymbol(sym).charAt(0);
125    }
126
127    /**
128     * Returns a factory for Patterns in the specified Alphabet.
129     */
130    public static PatternFactory makeFactory(FiniteAlphabet alfa)
131    {
132        return new PatternFactory(alfa);
133    }
134}
135