001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022
023package org.biojava.bio.search;
024
025
026import java.util.ArrayList;
027import java.util.List;
028
029import org.biojava.bio.BioError;
030import org.biojava.bio.seq.CircularView;
031import org.biojava.bio.seq.DNATools;
032import org.biojava.bio.symbol.Alphabet;
033import org.biojava.bio.symbol.SimpleSymbolList;
034import org.biojava.bio.symbol.Symbol;
035import org.biojava.bio.symbol.SymbolList;
036
037/**
038 * An object to find exact subsequences within a sequence.
039 *
040 * <p>
041 * Reference: KNUTH D.E., MORRIS (Jr) J.H., PRATT V.R., 1977,
042 * Fast pattern matching in strings, SIAM Journal on Computing 6(1):323-350.
043 * </P.
044 *
045 * <p><b>USAGE:</b></p>
046 * When the object is constructed the <code>findMatches()</code>
047 * method would be called. This will return an int[] giving the offsets
048 * (ie the location of the first symbol of each match in the text).
049 * The <code>getKMPNextTable()</code> returns the table of border lengths
050 * used in the algorithm. This method is protected as it is unlikely it
051 * will be needed except for debugging.</p>
052 *
053 * <p>
054 * The algorithm finds exact matches therefore ambiguity symbols will match
055 * only themselves. The class cannot perform regular expressions. The class
056 * operates on all alphabets thus if searching for a DNA pattern you should
057 * compile both the pattern and its reverse complement.</p>
058 *
059 * <p><b>WARNING the behaviour of a pattern containing gaps is undefined.
060 *  It's not recommended that you try it.</b></p>
061 *
062 * <p>Copyright:    Copyright (c) 2002</p>
063 * <p>Company:      AgResearch</p>
064 *
065 * @author Mark Schreiber
066 * @version 1.0
067 */
068
069public final class KnuthMorrisPrattSearch {
070  private int[] kmpNext;// the table that defines the border lengths.
071  private SymbolList pattern;
072
073  /**
074   * Constructs a KMP matcher to find exact occurances of
075   * <code>pattern</code> in <code>text</code> using the
076   * Knuth-Morris-Pratt algorithm.<p>
077   *
078   * A new class should be constructed for each occurance of
079   * <code>pattern</code>.<p>
080   * @param pattern the pattern to search for
081   */
082  public KnuthMorrisPrattSearch(SymbolList pattern) {
083    Alphabet alpha = pattern.getAlphabet();
084    ArrayList rList = new ArrayList(pattern.toList());
085
086    /*
087     *need to perform this hack to make the kmpNext capable of dealing with
088     *overlapping patterns, unfortunately it means the behaviour of a pattern
089     *containing a gap is unspecified.
090     */
091    rList.add(alpha.getGapSymbol());
092    try{
093      this.pattern = new SimpleSymbolList(alpha, rList);
094    }catch(Exception e){
095      //really shouldn't happen
096      throw new BioError("Couldn't make KMP pattern", e);
097    }
098
099
100    kmpNext = new int[this.pattern.length()];
101    compilePattern();
102  }
103
104  private void compilePattern(){
105    int k = pattern.length()-1;
106    //System.out.println("k = "+k);
107    int i = 0;
108    int j = kmpNext[0] = -1;
109
110    while(i < k){
111      while (j > -1 && pattern.symbolAt(i+1) != pattern.symbolAt(j+1)){
112        j = kmpNext[j];
113      }
114      i++; j++;
115      if(pattern.symbolAt(i+1) == pattern.symbolAt(j+1)){
116        //System.out.println("i = "+i+" j = "+j);
117        kmpNext[i] = kmpNext[j];
118      }else{
119        //System.out.println("i = "+i+" j = "+j);
120        kmpNext[i] = j;
121      }
122    }
123  }
124
125
126
127  /**
128   * This will return an int[] giving the offsets of the matches in <code>text</code>
129   * (ie the location of the first symbol of each match in the <code>text</code>).
130   *
131   * @param text the text or sequence to search for the pattern
132   *
133   * @return an int[] giving the offsets to the matches or a zero length array if there are
134   * no matches.
135   */
136  public int[] findMatches(SymbolList text){
137    List matches = new ArrayList();
138    int n; // the length of the text
139    int m; //the length of the pattern
140    int i = 0;
141    int j = 0;
142
143    m = this.pattern.length()-1; //-1 to remove the gap at the end hack
144    if(text instanceof CircularView){
145      n = text.length()+pattern.length() -1; //allow wrap around
146    }else{
147      n = text.length();
148    }
149
150    //find the matches
151    while(j < n){
152      Symbol sym = text.symbolAt(j+1);
153      while( i > -1 && pattern.symbolAt(i+1) != sym)
154        i = kmpNext[i];
155      i++;
156      j++;
157      if(i >= m){
158        //match found, add 1 for SymbolList coordinates.
159        matches.add(new Integer(j - i +1));
160        i = kmpNext[i];
161      }
162    }
163
164    //turn matches into an int[]
165    int[] mat = new int[matches.size()];
166    for (int x = 0; x < mat.length; x++) {
167      mat[x] = ((Integer)matches.get(x)).intValue();
168    }
169    return mat;
170  }
171
172  /**
173   * Returns the table of border lengths
174   * @return an int[] of border lenghts
175   */
176  protected int[] getKmpNextTable(){
177    return kmpNext;
178  }
179
180  /**
181   *
182   * @return the pattern being searched for
183   */
184  public SymbolList getPattern() {
185      return pattern;
186  }
187
188  /**
189   * Demo and Test method
190   * @param args no arguments required
191   * @throws Exception if the test fails
192   */
193  public static void main(String[] args) throws Exception{
194    KnuthMorrisPrattSearch kmp1;
195    int[] table;
196    int[] matches;
197
198
199    SymbolList pattern = DNATools.createDNA("gcagagag");
200    SymbolList pattern2 = DNATools.createDNA("agag");
201    SymbolList text = DNATools.createDNA("gcatcgcagagagtatacagtacg");
202
203    //check pattern
204    kmp1 = new KnuthMorrisPrattSearch(pattern);
205
206    table = kmp1.getKmpNextTable();
207    System.out.println(pattern.seqString());
208    for (int i = 0; i < table.length; i++) {
209      System.out.print(table[i] +" ");
210    }
211    //table should be -1 0 0 -1 1 -1 1 -1
212    System.out.println("");
213    matches = kmp1.findMatches(text);
214    System.out.print("Matches at: ");
215    for (int i = 0; i < matches.length; i++) {
216      System.out.print(matches[i]+" ");
217    }
218    //matches should be at 6.
219    System.out.println("\n");
220
221    //check pattern2
222    kmp1 = new KnuthMorrisPrattSearch(pattern2);
223    table = kmp1.getKmpNextTable();
224    System.out.println(pattern2.seqString());
225    for (int i = 0; i < table.length; i++) {
226      System.out.print(table[i] +" ");
227    }
228    System.out.println("");
229    //table should be    -1  0 -1  0  2
230
231    matches = kmp1.findMatches(text);
232    System.out.print("Matches at: ");
233    for (int i = 0; i < matches.length; i++) {
234      System.out.print(matches[i]+" ");
235    }
236    //matches should be at 8 and 10
237    System.out.println("\n");
238  }
239}