001
002
003package org.biojava.utils.automata;
004
005import java.util.Iterator;
006
007import org.biojava.bio.BioException;
008import org.biojava.bio.seq.io.SymbolTokenization;
009import org.biojava.bio.symbol.AtomicSymbol;
010import org.biojava.bio.symbol.FiniteAlphabet;
011import org.biojava.bio.symbol.IllegalSymbolException;
012import org.biojava.bio.symbol.Symbol;
013
014public class PatternMaker
015{
016    // the NFA that will be the result of parsing the specified pattern.
017    private Nfa nfa;
018    private Tokenizer toke;
019    private SymbolTokenization symtoke;
020
021    private static class Tokenizer
022    {
023        private String packedTxt;
024        private int ptr = 0;
025
026        final static int EOL = -1;
027        final static int SYMBOL_TOKEN = 0;
028        final static int NUMERIC = 1;
029        final static int LEFT_BRACE = 2;
030        final static int RIGHT_BRACE = 3;
031        final static int COMMA = 4;
032        final static int LEFT_BRACKET = 5;
033        final static int RIGHT_BRACKET = 6;
034        final static int PLUS = 7;
035        final static int ASTERISK = 8;
036        final static int VERT_BAR = 9;
037        final static int UNKNOWN = 999;
038
039        private Tokenizer(String target)
040        {
041            packedTxt = pack(target);
042        }
043
044        private char getToken()
045            throws IndexOutOfBoundsException
046        {
047            if (hasNext())
048                 return packedTxt.charAt(ptr++);
049            else
050                throw new IndexOutOfBoundsException("text length: " + packedTxt.length() + " index: " + ptr);
051        }
052
053        private char peekToken()
054            throws IndexOutOfBoundsException
055        {
056            if (hasNext())
057                 return packedTxt.charAt(ptr);
058            else
059                throw new IndexOutOfBoundsException("text length: " + packedTxt.length() + " index: " + ptr);
060        }
061
062        private int nextTokenType()
063        {
064            if (!hasNext()) return EOL;
065
066            char nextCh = packedTxt.charAt(ptr);
067
068            // symbol tokens assumed to be alphas.
069            if (Character.isLetter(nextCh))
070                return SYMBOL_TOKEN;
071            if (Character.isDigit(nextCh))
072                return NUMERIC;
073            // now check for specific chars
074            if (nextCh == '{')
075                return LEFT_BRACE;
076            if (nextCh == '}')
077                return RIGHT_BRACE;
078            if (nextCh == ',')
079                return COMMA;
080            if (nextCh == '(')
081                return LEFT_BRACKET;
082            if (nextCh == ')')
083                return RIGHT_BRACKET;
084            if (nextCh == '+')
085                return PLUS;
086            if (nextCh == '*')
087                return ASTERISK;
088            if (nextCh == '|')
089                return VERT_BAR;
090            return UNKNOWN;
091        }
092
093        private boolean hasNext()
094        {
095            return ptr < packedTxt.length();
096        }
097
098        /**
099         * produces a version of the String with whitespace removed.
100         */
101        private String pack(String source)
102        {
103            StringBuffer packedString = new StringBuffer();
104
105            for (int i=0; i < source.length(); i++) {
106                char currCh;
107                if (!Character.isWhitespace(currCh = source.charAt(i))) {
108                    packedString.append(currCh);
109                }
110            }
111
112            return packedString.toString();
113        }
114    }
115
116    private static class Range
117    {
118        private int min = 1;
119        private int max = 1;
120        private int getMin() { return min; }
121        private int getMax() { return max; }
122
123        private Range(int min, int max)
124        {
125            this.min = min;
126            this.max = max;
127        }
128
129        private boolean once() { return (min == 1) && (max == 1); }
130    }
131
132    PatternMaker(String patternString, FiniteAlphabet alfa)
133        throws BioException
134    {
135        toke = new Tokenizer(patternString);
136        nfa = new Nfa(patternString, alfa);
137        symtoke = alfa.getTokenization("token");
138    }
139
140    /**
141     * Compiles the regex described by patternString into a DFA.
142     * The DFA can subsequently be converted into a state
143     * state machine for searching. 
144     * <b>This is the main work method for this class.</b>
145     */
146    static FiniteAutomaton compilePattern(String patternString, FiniteAlphabet alfa)
147        throws BioException, AutomatonException
148    {
149        PatternMaker maker = new PatternMaker(patternString, alfa);
150        return maker.parse();
151    }
152
153    private FiniteAutomaton parse()
154        throws AutomatonException
155    {
156        NfaSubModel result = parse(nfa);
157
158        // complete linking up model to the start/end.
159        nfa.addEpsilonTransition(nfa.getStart(), result.getStart());
160        nfa.addEpsilonTransition(result.getEnd(), nfa.getEnd());
161        //System.out.println(nfa.toString());
162        nfa.doEpsilonClosure();
163        //System.out.println(nfa.toString());
164        return new DfaBuilder(nfa).getDFA();
165    }    
166
167    private NfaSubModel parse(NfaBuilder delegate)
168        throws AutomatonException
169    {
170        /**
171         * The model is that the pattern here can have
172         * alternative patterns so a single pattern ends
173         * up being a pattern of branching 1.
174         */
175        NfaSubModel returnSubModel = new NfaSubModel(delegate);
176        NfaSubModel branchSubModel = new NfaSubModel(returnSubModel);
177        // make it link silently so append works properly.
178        branchSubModel.addEpsilonTransition(branchSubModel.getStart(), branchSubModel.getEnd());
179        // do I have content in this branch?
180        boolean gotContent =false;
181
182        Range times;
183        while (toke.hasNext()) {
184            int tokenType = toke.nextTokenType();
185
186            NfaSubModel currSubModel = null;
187
188            switch (tokenType) {
189                case Tokenizer.LEFT_BRACKET:
190                    //System.out.println("processing left bracket" + toke.peekToken());
191                    gotContent = true;
192                    currSubModel = parseSubPattern(branchSubModel);
193                    times = parseIterations();
194                    currSubModel = reiterate(currSubModel, branchSubModel,  times);
195                    branchSubModel.append(currSubModel);
196                    break;
197                case Tokenizer.SYMBOL_TOKEN:
198                    //System.out.println("processing symbol " + toke.peekToken());
199                    gotContent = true;
200                    currSubModel = parseSymbol(branchSubModel);
201                    times = parseIterations();
202                    currSubModel = reiterate(currSubModel, branchSubModel, times);
203//                    System.out.println("before\n" + branchSubModel);
204                    branchSubModel.append(currSubModel);
205//                    System.out.println("after\n" + branchSubModel);
206                    break;
207                case Tokenizer.VERT_BAR:
208                    //System.out.println("processing bar" + toke.peekToken());
209                    // link current branch into return value
210                    if (!gotContent) throw new AutomatonException("no content in this branch!");
211                    System.out.println(returnSubModel.getStart().getID() + " " + branchSubModel.getStart().getID());
212                    returnSubModel.addEpsilonTransition(
213                        returnSubModel.getStart(),
214                        branchSubModel.getStart());
215                    returnSubModel.addEpsilonTransition(
216                        branchSubModel.getEnd(),
217                        returnSubModel.getEnd());
218                    // start new branch
219                    branchSubModel = new NfaSubModel(returnSubModel);
220                    branchSubModel.addEpsilonTransition(branchSubModel.getStart(), branchSubModel.getEnd());
221                    gotContent = false;
222                    toke.getToken();
223                    break;
224                case Tokenizer.RIGHT_BRACKET:
225                    //System.out.println("processing right bracket" + toke.peekToken());
226                    // link current branch into return value
227                    if (!gotContent) throw new AutomatonException("no content in this branch!");
228                    //System.out.println(returnSubModel.getStart().getID() + " " + branchSubModel.getStart().getID());
229                    returnSubModel.addEpsilonTransition(
230                        returnSubModel.getStart(),
231                        branchSubModel.getStart());
232                    returnSubModel.addEpsilonTransition(
233                        branchSubModel.getEnd(),
234                        returnSubModel.getEnd());
235                    return returnSubModel; // note that the right bracket is consumed by caller.
236                default:
237                    throw new AutomatonException("Illegal symbol encountered: " + toke.peekToken());
238            }
239        }
240
241        if (gotContent) {
242            // the latest content would be in branchSubModel
243            // and this needs to be connected back into the
244            // returnSubModel.
245            returnSubModel.addEpsilonTransition(
246                returnSubModel.getStart(),
247                branchSubModel.getStart());
248            returnSubModel.addEpsilonTransition(
249                branchSubModel.getEnd(),
250                returnSubModel.getEnd());
251            return returnSubModel;
252        }
253        else throw new AutomatonException("no content in this branch!");
254    }
255
256    NfaSubModel parseSubPattern(NfaBuilder delegate)
257        throws AutomatonException
258    {
259        NfaSubModel returnSubModel;
260
261        // consume the left bracket
262        toke.getToken();
263        returnSubModel = parse(delegate);
264        // consume the right bracket
265        if (toke.nextTokenType() != Tokenizer.RIGHT_BRACKET)
266            throw new AutomatonException("Missing right bracket!");
267        else
268            toke.getToken();
269
270        return returnSubModel;
271    }
272
273    NfaSubModel parseSymbol(NfaBuilder delegate)
274        throws AutomatonException
275    {
276        NfaSubModel returnSubModel = new NfaSubModel(delegate);
277
278        Symbol sym;
279        try {
280            sym = symtoke.parseToken(Character.toString(toke.getToken()));
281        }
282        catch (IllegalSymbolException ise) {
283            throw new AutomatonException(ise);
284        }
285
286        FiniteAutomaton.Node pre = returnSubModel.addNode(false);
287        FiniteAutomaton.Node post = returnSubModel.addNode(false);
288        if (sym instanceof AtomicSymbol) {
289            returnSubModel.addTransition(pre, post, sym);
290            returnSubModel.addEpsilonTransition(returnSubModel.getStart(), pre);
291            returnSubModel.addEpsilonTransition(post, returnSubModel.getEnd());
292        }
293        else {
294            for (Iterator symI = ((FiniteAlphabet) sym.getMatches()).iterator();
295                symI.hasNext(); ) {
296                Symbol atomicSym = (Symbol) symI.next();
297
298                returnSubModel.addTransition(pre, post, atomicSym);
299                returnSubModel.addEpsilonTransition(returnSubModel.getStart(), pre);
300                returnSubModel.addEpsilonTransition(post, returnSubModel.getEnd());
301            }
302        }
303
304        return returnSubModel;
305    }
306
307    Range parseIterations()
308        throws AutomatonException
309    {
310        int tokenType = toke.nextTokenType();
311
312        switch (tokenType) {
313            case Tokenizer.LEFT_BRACE:
314                return getIterations();
315            case Tokenizer.PLUS:
316                toke.getToken();
317                return new Range(1, Integer.MAX_VALUE);
318            case Tokenizer.ASTERISK:
319                toke.getToken();
320                return new Range(0, Integer.MAX_VALUE);
321            default:
322                return new Range(1, 1);
323        }
324
325    }
326
327    private Range getIterations()
328        throws AutomatonException
329    {
330        int min = 0;
331        int max = 0;
332
333        // consume the left brace
334        toke.getToken();
335
336        // there can either be one or two numbers
337        boolean onSecondArg = false;
338        StringBuffer numString = new StringBuffer();
339
340        while (toke.hasNext()) {
341            int tokenType = toke.nextTokenType();
342
343            switch (tokenType) {
344                case Tokenizer.NUMERIC:
345                    //System.out.println("adding symbol");
346                    if (numString == null) numString = new StringBuffer();
347                    numString.append(toke.getToken());
348                    break;
349                case Tokenizer.COMMA:
350                    toke.getToken();
351                    if (!onSecondArg) {
352                        //System.out.println("numString is " + numString);
353                        if (numString.length() > 0)
354                            min = Integer.parseInt(numString.toString());
355                        else
356                            min = 0;
357                        numString = null;
358                        onSecondArg = true;
359                    }
360                    else {
361                        throw new AutomatonException("only two arguments permitted.");
362                    }
363                    break;
364                case Tokenizer.RIGHT_BRACE:
365                    toke.getToken();
366                    if (onSecondArg) {
367                        if (numString != null)
368                            max = Integer.parseInt(numString.toString());
369                        else
370                            max = Integer.MAX_VALUE;
371                    }
372                    else {
373                        min = max = Integer.parseInt(numString.toString());
374                    }
375                    return new Range(min, max);
376                default:
377                    throw new AutomatonException(toke.getToken() + " is not valid in an iteration specifier.");
378            }
379        }
380
381        throw new AutomatonException("unexpected error.");
382    }
383
384
385    NfaSubModel reiterate(NfaSubModel base, NfaBuilder delegate, Range count)
386    {
387        // no repeats necessary!
388        if (count.once()) return base;
389
390        NfaSubModel returnSubModel = new NfaSubModel(delegate);
391        FiniteAutomaton.Node lastNode = returnSubModel.getStart();
392
393        // fulfill min number of reiterations
394        for (int i=0; i < count.getMin(); i++) {
395            NfaSubModel duplSubModel = base.duplicate();
396            returnSubModel.addEpsilonTransition(lastNode, duplSubModel.getStart());
397            lastNode = duplSubModel.getEnd();
398        }
399
400        // is there a finite upper bound?
401        if (count.getMax() != Integer.MAX_VALUE) {
402            for (int i = count.getMin(); i < count.getMax(); i++) {
403                returnSubModel.addLambdaTransition(lastNode, returnSubModel.getEnd());
404                NfaSubModel duplSubModel = base.duplicate();
405                returnSubModel.addEpsilonTransition(lastNode, duplSubModel.getStart());
406                lastNode = duplSubModel.getEnd();
407            }
408        }
409        else {
410            // infinite upper bound
411            NfaSubModel duplSubModel = base.duplicate();
412            returnSubModel.addEpsilonTransition(lastNode, duplSubModel.getStart());
413            returnSubModel.addEpsilonTransition(duplSubModel.getEnd(), lastNode);
414        }
415
416        returnSubModel.addEpsilonTransition(lastNode, returnSubModel.getEnd());
417
418        return returnSubModel;
419    }
420}