001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.utils.automata;
023
024import java.util.Iterator;
025import java.util.Set;
026
027import org.biojava.bio.symbol.AlphabetIndex;
028import org.biojava.bio.symbol.AlphabetManager;
029import org.biojava.bio.symbol.FiniteAlphabet;
030import org.biojava.bio.symbol.IllegalAlphabetException;
031import org.biojava.bio.symbol.IllegalSymbolException;
032
033
034public class ArrayStateMachineToolkit
035    implements StateMachineToolkit
036{
037    /**
038     * A StateMachine implementation in which the
039     * transitions are maintained in an array. While
040     * every ArrayStateMachine is initialised with a
041     * FiniteAlphabet, symbol data is provided as an
042     * AlphabetIndex int value for effiiciency
043     * reasons and callers are responsible for validating
044     * all such data prior to passing it to an instance
045     * of this class.  It is not recommended that this
046     * class is used with Alphabets with a large
047     * number of Symbols as it uses a dense State
048     * matrix implementation.
049     *
050     * @author David Huen
051     * @since 1.4
052     */
053    private static class ArrayStateMachine
054        implements StateMachineFactory
055    {
056        private static byte ERROR_STATE = Byte.MIN_VALUE;
057
058        class Instance
059            implements StateMachineInstance
060        {
061            private int statePointer = 0;
062            private int start;
063            private int end;
064    
065            private Instance(int start, int end, int statePointer)
066            {
067                this.start = start;
068                this.end = end;
069                this.statePointer = statePointer;
070            }
071
072            // this method should only be used by Comparators and
073            // the equals() method.
074            public int getStart() { return start; }
075
076            /**
077             * invoke transition from current state indicated by Symbol represented by symbol index.
078             * @param symIdx alphabet index of the symbol encountered.
079             * @return true if the symbol is valid and this state machine
080             *         should continue to receive input.
081             */
082            public boolean transit(int symIdx)
083            {
084                byte dest = transitions[entryPoints[statePointer] + symIdx];
085                if (dest == ERROR_STATE) return false;
086                end++;
087                statePointer = Math.abs(dest);
088    
089                if (dest <0) {
090                    // notify listener
091                    listener.notifyHit(name, start, end, false);
092    
093                    return false;
094                }
095                else
096                    return true;
097            }
098
099            /**
100             * Two Instances are equal if they are children of the
101             * same ArrayStateMachine instance and have the same
102             * start value.
103             */
104            public boolean equals(Object o)
105            {
106                if (!(o instanceof Instance)) return false;
107
108                StateMachineInstance other = (Instance) o;
109
110                if (other.parent() != ArrayStateMachine.this)
111                    return false;
112
113                return (start == other.getStart());
114            }
115
116            public StateMachineFactory parent() { return  ArrayStateMachine.this; }
117
118        }
119    
120        class GreedyInstance
121            implements StateMachineInstance
122        {
123            private int statePointer;
124            private int start;
125            private int end;
126            boolean gotTerminationState = false;
127    
128            private GreedyInstance(int start, int end, int statePointer)
129            {
130                this.start = start;
131                this.statePointer = statePointer;
132            }
133
134            // this method should only be used by Comparators and
135            // the equals() method.
136            public int getStart() { return start; }
137 
138            /**
139             * invoke transition from current state indicated by Symbol represented by symbol index.
140             * @param symIdx alphabet index of the symbol encountered.
141             * @return true if the symbol is valid and this state machine
142             *         should continue to receive input.
143             */
144            public boolean transit(int symIdx)
145            {
146                byte dest = transitions[entryPoints[statePointer] + symIdx];
147                if (dest == ERROR_STATE) {
148                    if (gotTerminationState) {
149                        listener.notifyHit(name, start, end, true);
150                    }
151    
152                    return false;
153                }
154                else {
155                    end++;
156                    statePointer = Math.abs(dest);
157    
158                    if (dest <0) {
159                        // got a valid termination state, save it
160                        gotTerminationState = true;
161                        return false;
162                    }
163                    else
164                        return true;
165                }
166            }
167
168            /**
169             * Two GreedyInstances are equal if they are children of the
170             * same ArrayStateMachine instance and have the same
171             * start value.
172             */
173            public boolean equals(Object o)
174            {
175                if (!(o instanceof Instance)) return false;
176
177                StateMachineInstance other = (Instance) o;
178
179                if (other.parent() != ArrayStateMachine.this)
180                    return false;
181
182                return (start == other.getStart());
183            }
184
185            public StateMachineFactory parent() { return  ArrayStateMachine.this; }
186
187        }
188    
189        private String name;
190        private FiniteAlphabet alfa;
191        private boolean greedy;
192
193        private PatternListener listener = null;
194    
195        /**
196         * maps a Node ID to the index in the
197         * array where its transition data is stored.
198         */
199        private int [] entryPoints;
200    
201        /**
202         * transition matrix.  Organised as:-
203         * [source nodeID * symbol idx] -&gt; dest nodeID.
204         */
205        private byte [] transitions;
206    
207        /**
208         * Creates an ArrayStateMachine from a
209         * FiniteAutomaton instance. The FiniteAutomaton
210         * must be in its most compact form (node IDs
211         * in contiguous running order, no duplicate/surplus
212         * nodes/transitions).
213         */
214        ArrayStateMachine(String name, FiniteAutomaton fa, boolean greedy)
215        {
216            this.name = name;
217            alfa = fa.getAlphabet();
218            int alfaSize = alfa.size();
219            entryPoints = new int [fa.nodeCount()];
220            transitions = new byte [alfaSize * fa.nodeCount()];
221            this.greedy = greedy;
222
223            convert(fa);
224            fa = null;   // release FA for GC.
225        }
226    
227        public void setListener(PatternListener listener)
228        {
229            this.listener = listener;
230        }
231    
232        private void convert(FiniteAutomaton fa)
233        {
234            try {
235                AlphabetIndex alfaIdx = AlphabetManager.getAlphabetIndex(alfa);
236        
237                int idx = 0;
238                int alfaSize = alfa.size();
239        
240                // initialise pointer array
241                for (int i = 0; i < entryPoints.length; i++) {
242                    entryPoints[i] = idx;
243                    idx += alfaSize;
244                }
245        
246                // initialise transition matrix
247                for (int i = 0; i < transitions.length; i++) {
248                    transitions[i] = ERROR_STATE;
249                }
250        
251                // go thru all transitions, filling the transition matrix.
252                Set transitionSet = fa.getTransitions();
253        
254                for (Iterator transI = transitionSet.iterator(); transI.hasNext(); ) {
255                    FiniteAutomaton.Transition currTransition = (FiniteAutomaton.Transition) transI.next();
256        
257                    int symIdx = alfaIdx.indexForSymbol(currTransition.getSymbol());
258                    //System.out.println(currTransition.getSymbol() + " " + symIdx + " " + currTransition.getSource().getID() + " " + currTransition.getDest().getID());
259                    transitions[entryPoints[Math.abs(currTransition.getSource().getID())] + symIdx]
260                        = (byte) currTransition.getDest().getID();
261                }
262            }
263            catch (IllegalSymbolException ise) {
264                throw new AssertionError(ise);
265            }
266        } 
267    
268        /**
269         * get a StateMachineInstance to parse the sequence with.
270         * @param start current Sequence coordinate.
271         * @param greedy should greedy regex semantics be used?
272         */
273        StateMachineInstance getInstance(int start)
274        {
275            if (greedy)
276                return new GreedyInstance(start, start, 0);
277            else
278                return new Instance(start, start, 0);
279        }
280    
281        /**
282         * Return a StateMachineInstance if the Symbol represented
283         * by the symbol index is valid as the initial symbol of
284         * the pattern.  This method must remain package private
285         * as it does no alphabet checks at all.
286         *
287         * @param symIdx alphabet index value for specified symbol.
288         * @param greedy should greedy regex semantics be used?
289         * @return an instance of StateMachineInstance if symbol
290         * is valid otherwise null.
291         */
292        public StateMachineInstance startInstance(int symIdx, int start)
293        {
294            int nextState = transitions[symIdx];
295            //System.out.println("startInstance called with " + symIdx + " " + start + " " + nextState);
296            if (nextState != ERROR_STATE) {
297                if (greedy) {
298                    return new GreedyInstance(start, start + 1, nextState);
299                }
300                else {
301                    //System.out.println("creating new Instance.");
302                    return new Instance(start, start + 1, nextState);
303                }
304            }
305            else
306                return null;
307        }
308    }
309
310    private FiniteAlphabet alfa;
311    private boolean greedy;
312
313    ArrayStateMachineToolkit(FiniteAlphabet alfa, boolean greedy)
314    {
315        this.alfa = alfa;
316        this.greedy = greedy;
317    }
318
319    public StateMachineFactory getFactory(String factoryName, FiniteAutomaton fa)
320        throws IllegalAlphabetException
321    {
322        if (alfa != fa.getAlphabet())
323            throw new IllegalAlphabetException("The FiniteAutomaton is defined on an Alphabet incompatible with this ArrayStateMachineToolKit.");
324        return new ArrayStateMachine(factoryName, fa, greedy);
325    }
326}
327