001 002 003package org.biojava.utils.automata; 004 005import java.util.Iterator; 006 007import org.biojava.bio.BioException; 008import org.biojava.bio.seq.io.SymbolTokenization; 009import org.biojava.bio.symbol.AtomicSymbol; 010import org.biojava.bio.symbol.FiniteAlphabet; 011import org.biojava.bio.symbol.IllegalSymbolException; 012import org.biojava.bio.symbol.Symbol; 013 014public class PatternMaker 015{ 016 // the NFA that will be the result of parsing the specified pattern. 017 private Nfa nfa; 018 private Tokenizer toke; 019 private SymbolTokenization symtoke; 020 021 private static class Tokenizer 022 { 023 private String packedTxt; 024 private int ptr = 0; 025 026 final static int EOL = -1; 027 final static int SYMBOL_TOKEN = 0; 028 final static int NUMERIC = 1; 029 final static int LEFT_BRACE = 2; 030 final static int RIGHT_BRACE = 3; 031 final static int COMMA = 4; 032 final static int LEFT_BRACKET = 5; 033 final static int RIGHT_BRACKET = 6; 034 final static int PLUS = 7; 035 final static int ASTERISK = 8; 036 final static int VERT_BAR = 9; 037 final static int UNKNOWN = 999; 038 039 private Tokenizer(String target) 040 { 041 packedTxt = pack(target); 042 } 043 044 private char getToken() 045 throws IndexOutOfBoundsException 046 { 047 if (hasNext()) 048 return packedTxt.charAt(ptr++); 049 else 050 throw new IndexOutOfBoundsException("text length: " + packedTxt.length() + " index: " + ptr); 051 } 052 053 private char peekToken() 054 throws IndexOutOfBoundsException 055 { 056 if (hasNext()) 057 return packedTxt.charAt(ptr); 058 else 059 throw new IndexOutOfBoundsException("text length: " + packedTxt.length() + " index: " + ptr); 060 } 061 062 private int nextTokenType() 063 { 064 if (!hasNext()) return EOL; 065 066 char nextCh = packedTxt.charAt(ptr); 067 068 // symbol tokens assumed to be alphas. 069 if (Character.isLetter(nextCh)) 070 return SYMBOL_TOKEN; 071 if (Character.isDigit(nextCh)) 072 return NUMERIC; 073 // now check for specific chars 074 if (nextCh == '{') 075 return LEFT_BRACE; 076 if (nextCh == '}') 077 return RIGHT_BRACE; 078 if (nextCh == ',') 079 return COMMA; 080 if (nextCh == '(') 081 return LEFT_BRACKET; 082 if (nextCh == ')') 083 return RIGHT_BRACKET; 084 if (nextCh == '+') 085 return PLUS; 086 if (nextCh == '*') 087 return ASTERISK; 088 if (nextCh == '|') 089 return VERT_BAR; 090 return UNKNOWN; 091 } 092 093 private boolean hasNext() 094 { 095 return ptr < packedTxt.length(); 096 } 097 098 /** 099 * produces a version of the String with whitespace removed. 100 */ 101 private String pack(String source) 102 { 103 StringBuffer packedString = new StringBuffer(); 104 105 for (int i=0; i < source.length(); i++) { 106 char currCh; 107 if (!Character.isWhitespace(currCh = source.charAt(i))) { 108 packedString.append(currCh); 109 } 110 } 111 112 return packedString.toString(); 113 } 114 } 115 116 private static class Range 117 { 118 private int min = 1; 119 private int max = 1; 120 private int getMin() { return min; } 121 private int getMax() { return max; } 122 123 private Range(int min, int max) 124 { 125 this.min = min; 126 this.max = max; 127 } 128 129 private boolean once() { return (min == 1) && (max == 1); } 130 } 131 132 PatternMaker(String patternString, FiniteAlphabet alfa) 133 throws BioException 134 { 135 toke = new Tokenizer(patternString); 136 nfa = new Nfa(patternString, alfa); 137 symtoke = alfa.getTokenization("token"); 138 } 139 140 /** 141 * Compiles the regex described by patternString into a DFA. 142 * The DFA can subsequently be converted into a state 143 * state machine for searching. 144 * <b>This is the main work method for this class.</b> 145 */ 146 static FiniteAutomaton compilePattern(String patternString, FiniteAlphabet alfa) 147 throws BioException, AutomatonException 148 { 149 PatternMaker maker = new PatternMaker(patternString, alfa); 150 return maker.parse(); 151 } 152 153 private FiniteAutomaton parse() 154 throws AutomatonException 155 { 156 NfaSubModel result = parse(nfa); 157 158 // complete linking up model to the start/end. 159 nfa.addEpsilonTransition(nfa.getStart(), result.getStart()); 160 nfa.addEpsilonTransition(result.getEnd(), nfa.getEnd()); 161 //System.out.println(nfa.toString()); 162 nfa.doEpsilonClosure(); 163 //System.out.println(nfa.toString()); 164 return new DfaBuilder(nfa).getDFA(); 165 } 166 167 private NfaSubModel parse(NfaBuilder delegate) 168 throws AutomatonException 169 { 170 /** 171 * The model is that the pattern here can have 172 * alternative patterns so a single pattern ends 173 * up being a pattern of branching 1. 174 */ 175 NfaSubModel returnSubModel = new NfaSubModel(delegate); 176 NfaSubModel branchSubModel = new NfaSubModel(returnSubModel); 177 // make it link silently so append works properly. 178 branchSubModel.addEpsilonTransition(branchSubModel.getStart(), branchSubModel.getEnd()); 179 // do I have content in this branch? 180 boolean gotContent =false; 181 182 Range times; 183 while (toke.hasNext()) { 184 int tokenType = toke.nextTokenType(); 185 186 NfaSubModel currSubModel = null; 187 188 switch (tokenType) { 189 case Tokenizer.LEFT_BRACKET: 190 //System.out.println("processing left bracket" + toke.peekToken()); 191 gotContent = true; 192 currSubModel = parseSubPattern(branchSubModel); 193 times = parseIterations(); 194 currSubModel = reiterate(currSubModel, branchSubModel, times); 195 branchSubModel.append(currSubModel); 196 break; 197 case Tokenizer.SYMBOL_TOKEN: 198 //System.out.println("processing symbol " + toke.peekToken()); 199 gotContent = true; 200 currSubModel = parseSymbol(branchSubModel); 201 times = parseIterations(); 202 currSubModel = reiterate(currSubModel, branchSubModel, times); 203// System.out.println("before\n" + branchSubModel); 204 branchSubModel.append(currSubModel); 205// System.out.println("after\n" + branchSubModel); 206 break; 207 case Tokenizer.VERT_BAR: 208 //System.out.println("processing bar" + toke.peekToken()); 209 // link current branch into return value 210 if (!gotContent) throw new AutomatonException("no content in this branch!"); 211 System.out.println(returnSubModel.getStart().getID() + " " + branchSubModel.getStart().getID()); 212 returnSubModel.addEpsilonTransition( 213 returnSubModel.getStart(), 214 branchSubModel.getStart()); 215 returnSubModel.addEpsilonTransition( 216 branchSubModel.getEnd(), 217 returnSubModel.getEnd()); 218 // start new branch 219 branchSubModel = new NfaSubModel(returnSubModel); 220 branchSubModel.addEpsilonTransition(branchSubModel.getStart(), branchSubModel.getEnd()); 221 gotContent = false; 222 toke.getToken(); 223 break; 224 case Tokenizer.RIGHT_BRACKET: 225 //System.out.println("processing right bracket" + toke.peekToken()); 226 // link current branch into return value 227 if (!gotContent) throw new AutomatonException("no content in this branch!"); 228 //System.out.println(returnSubModel.getStart().getID() + " " + branchSubModel.getStart().getID()); 229 returnSubModel.addEpsilonTransition( 230 returnSubModel.getStart(), 231 branchSubModel.getStart()); 232 returnSubModel.addEpsilonTransition( 233 branchSubModel.getEnd(), 234 returnSubModel.getEnd()); 235 return returnSubModel; // note that the right bracket is consumed by caller. 236 default: 237 throw new AutomatonException("Illegal symbol encountered: " + toke.peekToken()); 238 } 239 } 240 241 if (gotContent) { 242 // the latest content would be in branchSubModel 243 // and this needs to be connected back into the 244 // returnSubModel. 245 returnSubModel.addEpsilonTransition( 246 returnSubModel.getStart(), 247 branchSubModel.getStart()); 248 returnSubModel.addEpsilonTransition( 249 branchSubModel.getEnd(), 250 returnSubModel.getEnd()); 251 return returnSubModel; 252 } 253 else throw new AutomatonException("no content in this branch!"); 254 } 255 256 NfaSubModel parseSubPattern(NfaBuilder delegate) 257 throws AutomatonException 258 { 259 NfaSubModel returnSubModel; 260 261 // consume the left bracket 262 toke.getToken(); 263 returnSubModel = parse(delegate); 264 // consume the right bracket 265 if (toke.nextTokenType() != Tokenizer.RIGHT_BRACKET) 266 throw new AutomatonException("Missing right bracket!"); 267 else 268 toke.getToken(); 269 270 return returnSubModel; 271 } 272 273 NfaSubModel parseSymbol(NfaBuilder delegate) 274 throws AutomatonException 275 { 276 NfaSubModel returnSubModel = new NfaSubModel(delegate); 277 278 Symbol sym; 279 try { 280 sym = symtoke.parseToken(Character.toString(toke.getToken())); 281 } 282 catch (IllegalSymbolException ise) { 283 throw new AutomatonException(ise); 284 } 285 286 FiniteAutomaton.Node pre = returnSubModel.addNode(false); 287 FiniteAutomaton.Node post = returnSubModel.addNode(false); 288 if (sym instanceof AtomicSymbol) { 289 returnSubModel.addTransition(pre, post, sym); 290 returnSubModel.addEpsilonTransition(returnSubModel.getStart(), pre); 291 returnSubModel.addEpsilonTransition(post, returnSubModel.getEnd()); 292 } 293 else { 294 for (Iterator symI = ((FiniteAlphabet) sym.getMatches()).iterator(); 295 symI.hasNext(); ) { 296 Symbol atomicSym = (Symbol) symI.next(); 297 298 returnSubModel.addTransition(pre, post, atomicSym); 299 returnSubModel.addEpsilonTransition(returnSubModel.getStart(), pre); 300 returnSubModel.addEpsilonTransition(post, returnSubModel.getEnd()); 301 } 302 } 303 304 return returnSubModel; 305 } 306 307 Range parseIterations() 308 throws AutomatonException 309 { 310 int tokenType = toke.nextTokenType(); 311 312 switch (tokenType) { 313 case Tokenizer.LEFT_BRACE: 314 return getIterations(); 315 case Tokenizer.PLUS: 316 toke.getToken(); 317 return new Range(1, Integer.MAX_VALUE); 318 case Tokenizer.ASTERISK: 319 toke.getToken(); 320 return new Range(0, Integer.MAX_VALUE); 321 default: 322 return new Range(1, 1); 323 } 324 325 } 326 327 private Range getIterations() 328 throws AutomatonException 329 { 330 int min = 0; 331 int max = 0; 332 333 // consume the left brace 334 toke.getToken(); 335 336 // there can either be one or two numbers 337 boolean onSecondArg = false; 338 StringBuffer numString = new StringBuffer(); 339 340 while (toke.hasNext()) { 341 int tokenType = toke.nextTokenType(); 342 343 switch (tokenType) { 344 case Tokenizer.NUMERIC: 345 //System.out.println("adding symbol"); 346 if (numString == null) numString = new StringBuffer(); 347 numString.append(toke.getToken()); 348 break; 349 case Tokenizer.COMMA: 350 toke.getToken(); 351 if (!onSecondArg) { 352 //System.out.println("numString is " + numString); 353 if (numString.length() > 0) 354 min = Integer.parseInt(numString.toString()); 355 else 356 min = 0; 357 numString = null; 358 onSecondArg = true; 359 } 360 else { 361 throw new AutomatonException("only two arguments permitted."); 362 } 363 break; 364 case Tokenizer.RIGHT_BRACE: 365 toke.getToken(); 366 if (onSecondArg) { 367 if (numString != null) 368 max = Integer.parseInt(numString.toString()); 369 else 370 max = Integer.MAX_VALUE; 371 } 372 else { 373 min = max = Integer.parseInt(numString.toString()); 374 } 375 return new Range(min, max); 376 default: 377 throw new AutomatonException(toke.getToken() + " is not valid in an iteration specifier."); 378 } 379 } 380 381 throw new AutomatonException("unexpected error."); 382 } 383 384 385 NfaSubModel reiterate(NfaSubModel base, NfaBuilder delegate, Range count) 386 { 387 // no repeats necessary! 388 if (count.once()) return base; 389 390 NfaSubModel returnSubModel = new NfaSubModel(delegate); 391 FiniteAutomaton.Node lastNode = returnSubModel.getStart(); 392 393 // fulfill min number of reiterations 394 for (int i=0; i < count.getMin(); i++) { 395 NfaSubModel duplSubModel = base.duplicate(); 396 returnSubModel.addEpsilonTransition(lastNode, duplSubModel.getStart()); 397 lastNode = duplSubModel.getEnd(); 398 } 399 400 // is there a finite upper bound? 401 if (count.getMax() != Integer.MAX_VALUE) { 402 for (int i = count.getMin(); i < count.getMax(); i++) { 403 returnSubModel.addLambdaTransition(lastNode, returnSubModel.getEnd()); 404 NfaSubModel duplSubModel = base.duplicate(); 405 returnSubModel.addEpsilonTransition(lastNode, duplSubModel.getStart()); 406 lastNode = duplSubModel.getEnd(); 407 } 408 } 409 else { 410 // infinite upper bound 411 NfaSubModel duplSubModel = base.duplicate(); 412 returnSubModel.addEpsilonTransition(lastNode, duplSubModel.getStart()); 413 returnSubModel.addEpsilonTransition(duplSubModel.getEnd(), lastNode); 414 } 415 416 returnSubModel.addEpsilonTransition(lastNode, returnSubModel.getEnd()); 417 418 return returnSubModel; 419 } 420}