001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.utils.automata; 023 024import java.util.Iterator; 025import java.util.Set; 026 027import org.biojava.bio.symbol.AlphabetIndex; 028import org.biojava.bio.symbol.AlphabetManager; 029import org.biojava.bio.symbol.FiniteAlphabet; 030import org.biojava.bio.symbol.IllegalAlphabetException; 031import org.biojava.bio.symbol.IllegalSymbolException; 032 033 034public class ArrayStateMachineToolkit 035 implements StateMachineToolkit 036{ 037 /** 038 * A StateMachine implementation in which the 039 * transitions are maintained in an array. While 040 * every ArrayStateMachine is initialised with a 041 * FiniteAlphabet, symbol data is provided as an 042 * AlphabetIndex int value for effiiciency 043 * reasons and callers are responsible for validating 044 * all such data prior to passing it to an instance 045 * of this class. It is not recommended that this 046 * class is used with Alphabets with a large 047 * number of Symbols as it uses a dense State 048 * matrix implementation. 049 * 050 * @author David Huen 051 * @since 1.4 052 */ 053 private static class ArrayStateMachine 054 implements StateMachineFactory 055 { 056 private static byte ERROR_STATE = Byte.MIN_VALUE; 057 058 class Instance 059 implements StateMachineInstance 060 { 061 private int statePointer = 0; 062 private int start; 063 private int end; 064 065 private Instance(int start, int end, int statePointer) 066 { 067 this.start = start; 068 this.end = end; 069 this.statePointer = statePointer; 070 } 071 072 // this method should only be used by Comparators and 073 // the equals() method. 074 public int getStart() { return start; } 075 076 /** 077 * invoke transition from current state indicated by Symbol represented by symbol index. 078 * @param symIdx alphabet index of the symbol encountered. 079 * @return true if the symbol is valid and this state machine 080 * should continue to receive input. 081 */ 082 public boolean transit(int symIdx) 083 { 084 byte dest = transitions[entryPoints[statePointer] + symIdx]; 085 if (dest == ERROR_STATE) return false; 086 end++; 087 statePointer = Math.abs(dest); 088 089 if (dest <0) { 090 // notify listener 091 listener.notifyHit(name, start, end, false); 092 093 return false; 094 } 095 else 096 return true; 097 } 098 099 /** 100 * Two Instances are equal if they are children of the 101 * same ArrayStateMachine instance and have the same 102 * start value. 103 */ 104 public boolean equals(Object o) 105 { 106 if (!(o instanceof Instance)) return false; 107 108 StateMachineInstance other = (Instance) o; 109 110 if (other.parent() != ArrayStateMachine.this) 111 return false; 112 113 return (start == other.getStart()); 114 } 115 116 public StateMachineFactory parent() { return ArrayStateMachine.this; } 117 118 } 119 120 class GreedyInstance 121 implements StateMachineInstance 122 { 123 private int statePointer; 124 private int start; 125 private int end; 126 boolean gotTerminationState = false; 127 128 private GreedyInstance(int start, int end, int statePointer) 129 { 130 this.start = start; 131 this.statePointer = statePointer; 132 } 133 134 // this method should only be used by Comparators and 135 // the equals() method. 136 public int getStart() { return start; } 137 138 /** 139 * invoke transition from current state indicated by Symbol represented by symbol index. 140 * @param symIdx alphabet index of the symbol encountered. 141 * @return true if the symbol is valid and this state machine 142 * should continue to receive input. 143 */ 144 public boolean transit(int symIdx) 145 { 146 byte dest = transitions[entryPoints[statePointer] + symIdx]; 147 if (dest == ERROR_STATE) { 148 if (gotTerminationState) { 149 listener.notifyHit(name, start, end, true); 150 } 151 152 return false; 153 } 154 else { 155 end++; 156 statePointer = Math.abs(dest); 157 158 if (dest <0) { 159 // got a valid termination state, save it 160 gotTerminationState = true; 161 return false; 162 } 163 else 164 return true; 165 } 166 } 167 168 /** 169 * Two GreedyInstances are equal if they are children of the 170 * same ArrayStateMachine instance and have the same 171 * start value. 172 */ 173 public boolean equals(Object o) 174 { 175 if (!(o instanceof Instance)) return false; 176 177 StateMachineInstance other = (Instance) o; 178 179 if (other.parent() != ArrayStateMachine.this) 180 return false; 181 182 return (start == other.getStart()); 183 } 184 185 public StateMachineFactory parent() { return ArrayStateMachine.this; } 186 187 } 188 189 private String name; 190 private FiniteAlphabet alfa; 191 private boolean greedy; 192 193 private PatternListener listener = null; 194 195 /** 196 * maps a Node ID to the index in the 197 * array where its transition data is stored. 198 */ 199 private int [] entryPoints; 200 201 /** 202 * transition matrix. Organised as:- 203 * [source nodeID * symbol idx] -> dest nodeID. 204 */ 205 private byte [] transitions; 206 207 /** 208 * Creates an ArrayStateMachine from a 209 * FiniteAutomaton instance. The FiniteAutomaton 210 * must be in its most compact form (node IDs 211 * in contiguous running order, no duplicate/surplus 212 * nodes/transitions). 213 */ 214 ArrayStateMachine(String name, FiniteAutomaton fa, boolean greedy) 215 { 216 this.name = name; 217 alfa = fa.getAlphabet(); 218 int alfaSize = alfa.size(); 219 entryPoints = new int [fa.nodeCount()]; 220 transitions = new byte [alfaSize * fa.nodeCount()]; 221 this.greedy = greedy; 222 223 convert(fa); 224 fa = null; // release FA for GC. 225 } 226 227 public void setListener(PatternListener listener) 228 { 229 this.listener = listener; 230 } 231 232 private void convert(FiniteAutomaton fa) 233 { 234 try { 235 AlphabetIndex alfaIdx = AlphabetManager.getAlphabetIndex(alfa); 236 237 int idx = 0; 238 int alfaSize = alfa.size(); 239 240 // initialise pointer array 241 for (int i = 0; i < entryPoints.length; i++) { 242 entryPoints[i] = idx; 243 idx += alfaSize; 244 } 245 246 // initialise transition matrix 247 for (int i = 0; i < transitions.length; i++) { 248 transitions[i] = ERROR_STATE; 249 } 250 251 // go thru all transitions, filling the transition matrix. 252 Set transitionSet = fa.getTransitions(); 253 254 for (Iterator transI = transitionSet.iterator(); transI.hasNext(); ) { 255 FiniteAutomaton.Transition currTransition = (FiniteAutomaton.Transition) transI.next(); 256 257 int symIdx = alfaIdx.indexForSymbol(currTransition.getSymbol()); 258 //System.out.println(currTransition.getSymbol() + " " + symIdx + " " + currTransition.getSource().getID() + " " + currTransition.getDest().getID()); 259 transitions[entryPoints[Math.abs(currTransition.getSource().getID())] + symIdx] 260 = (byte) currTransition.getDest().getID(); 261 } 262 } 263 catch (IllegalSymbolException ise) { 264 throw new AssertionError(ise); 265 } 266 } 267 268 /** 269 * get a StateMachineInstance to parse the sequence with. 270 * @param start current Sequence coordinate. 271 * @param greedy should greedy regex semantics be used? 272 */ 273 StateMachineInstance getInstance(int start) 274 { 275 if (greedy) 276 return new GreedyInstance(start, start, 0); 277 else 278 return new Instance(start, start, 0); 279 } 280 281 /** 282 * Return a StateMachineInstance if the Symbol represented 283 * by the symbol index is valid as the initial symbol of 284 * the pattern. This method must remain package private 285 * as it does no alphabet checks at all. 286 * 287 * @param symIdx alphabet index value for specified symbol. 288 * @param greedy should greedy regex semantics be used? 289 * @return an instance of StateMachineInstance if symbol 290 * is valid otherwise null. 291 */ 292 public StateMachineInstance startInstance(int symIdx, int start) 293 { 294 int nextState = transitions[symIdx]; 295 //System.out.println("startInstance called with " + symIdx + " " + start + " " + nextState); 296 if (nextState != ERROR_STATE) { 297 if (greedy) { 298 return new GreedyInstance(start, start + 1, nextState); 299 } 300 else { 301 //System.out.println("creating new Instance."); 302 return new Instance(start, start + 1, nextState); 303 } 304 } 305 else 306 return null; 307 } 308 } 309 310 private FiniteAlphabet alfa; 311 private boolean greedy; 312 313 ArrayStateMachineToolkit(FiniteAlphabet alfa, boolean greedy) 314 { 315 this.alfa = alfa; 316 this.greedy = greedy; 317 } 318 319 public StateMachineFactory getFactory(String factoryName, FiniteAutomaton fa) 320 throws IllegalAlphabetException 321 { 322 if (alfa != fa.getAlphabet()) 323 throw new IllegalAlphabetException("The FiniteAutomaton is defined on an Alphabet incompatible with this ArrayStateMachineToolKit."); 324 return new ArrayStateMachine(factoryName, fa, greedy); 325 } 326} 327