001 002 003 004 005package org.biojava.utils.regex; 006 007import java.util.Iterator; 008 009import org.biojava.bio.BioException; 010import org.biojava.bio.seq.io.CharacterTokenization; 011import org.biojava.bio.seq.io.SymbolTokenization; 012import org.biojava.bio.symbol.AbstractAlphabet; 013import org.biojava.bio.symbol.Alphabet; 014import org.biojava.bio.symbol.AlphabetManager; 015import org.biojava.bio.symbol.AtomicSymbol; 016import org.biojava.bio.symbol.FiniteAlphabet; 017import org.biojava.bio.symbol.IllegalAlphabetException; 018import org.biojava.bio.symbol.IllegalSymbolException; 019import org.biojava.bio.symbol.Symbol; 020 021/** 022 * A class that creates Patterns for regex matching on 023 * SymbolLists of a specific Alphabet. 024 * @author David Huen 025 * @since 1.4 026 */ 027public class PatternFactory 028{ 029 private FiniteAlphabet alfa; 030 private SymbolTokenization toke = null; 031 private PatternChecker checker = null; 032 033 PatternFactory(FiniteAlphabet alfa) 034 { 035 this.alfa = alfa; 036 fetchTokenizer(); 037 } 038 039 private void fetchTokenizer() 040 { 041 boolean gotCharTokenizer =false; 042 try { 043 toke = alfa.getTokenization("token"); 044 if (toke.getTokenType() == SymbolTokenization.CHARACTER) 045 gotCharTokenizer = true; 046 } 047 catch (BioException be) { 048 } 049 050 if (!gotCharTokenizer) { 051 // make own tokenizer for this turkey 052 CharacterTokenization cToke = new CharacterTokenization(alfa, true); 053 054 // go thru' and associate all atomic symbols with a unicode char 055 char uniChar = '\uE000'; 056 for (Iterator symI = alfa.iterator(); symI.hasNext(); ) { 057 AtomicSymbol sym = (AtomicSymbol) symI.next(); 058 cToke.bindSymbol(sym, uniChar); 059 uniChar++; 060 } 061 062 // add all ambiguity symbol 063 cToke.bindSymbol( 064 AlphabetManager.getAllAmbiguitySymbol((FiniteAlphabet) alfa), 065 '\uF8FF'); 066 // add terminal gap 067 cToke.bindSymbol(Alphabet.EMPTY_ALPHABET.getGapSymbol(), '~'); 068 // add interstitial gap 069 cToke.bindSymbol(alfa.getGapSymbol(), '-'); 070 071 // bind alphabet to this tokenization 072 ((AbstractAlphabet) alfa).putTokenization("unicode", cToke); 073 toke = cToke; 074 } 075 } 076 077 /** 078 * Returns a Pattern object that applies the specified regex 079 * against SymbolLists in the Alphabet that this PatternFactory 080 * was defined against. The String returned by getName() is 081 * set to pattern. 082 */ 083 public org.biojava.utils.regex.Pattern compile(String pattern) 084 throws RegexException, IllegalAlphabetException 085 { 086 // validate the pattern is from this alphabet 087 // we only accept RE tokens and characters from 088 // the alphabet itself. 089 if (checker == null) checker = new PatternChecker(alfa); 090 return new org.biojava.utils.regex.Pattern(pattern, checker, alfa); 091 } 092 093 /** 094 * Returns a Pattern object that applies the specified regex 095 * against SymbolLists in the Alphabet that this PatternFactory 096 * was defined against. 097 * 098 * @param pattern regex pattern expressed as a String. 099 * @param label A String label assigned to the Pattern object. Can be retrieved later with getName(). 100 */ 101 public org.biojava.utils.regex.Pattern compile(String pattern, String label) 102 throws RegexException, IllegalAlphabetException 103 { 104 // validate the pattern is from this alphabet 105 // we only accept RE tokens and characters from 106 // the alphabet itself. 107 if (checker == null) checker = new PatternChecker(alfa); 108 return new org.biojava.utils.regex.Pattern(pattern, checker, alfa, label); 109 } 110 111 /** 112 * Returns the character that represents the specified Symbol in 113 * the Alphabet that this PatternFactory was defined for. 114 * <p> 115 * The character will be ASCII in Alphabets that define a Character tokenization. 116 * In Alphabets that don't a Unicode character in the private range is returned 117 * instead and this can be used to assemble the String that is the argument 118 * for the compile method. 119 */ 120 public char charValue(Symbol sym) 121 throws IllegalSymbolException 122 { 123 // this class is only used with alphabets that have a character tokenization. 124 return toke.tokenizeSymbol(sym).charAt(0); 125 } 126 127 /** 128 * Returns a factory for Patterns in the specified Alphabet. 129 */ 130 public static PatternFactory makeFactory(FiniteAlphabet alfa) 131 { 132 return new PatternFactory(alfa); 133 } 134} 135