001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022 023package org.biojava.bio.search; 024 025 026import java.util.ArrayList; 027import java.util.List; 028 029import org.biojava.bio.BioError; 030import org.biojava.bio.seq.CircularView; 031import org.biojava.bio.seq.DNATools; 032import org.biojava.bio.symbol.Alphabet; 033import org.biojava.bio.symbol.SimpleSymbolList; 034import org.biojava.bio.symbol.Symbol; 035import org.biojava.bio.symbol.SymbolList; 036 037/** 038 * An object to find exact subsequences within a sequence. 039 * 040 * <p> 041 * Reference: KNUTH D.E., MORRIS (Jr) J.H., PRATT V.R., 1977, 042 * Fast pattern matching in strings, SIAM Journal on Computing 6(1):323-350. 043 * </P. 044 * 045 * <p><b>USAGE:</b></p> 046 * When the object is constructed the <code>findMatches()</code> 047 * method would be called. This will return an int[] giving the offsets 048 * (ie the location of the first symbol of each match in the text). 049 * The <code>getKMPNextTable()</code> returns the table of border lengths 050 * used in the algorithm. This method is protected as it is unlikely it 051 * will be needed except for debugging.</p> 052 * 053 * <p> 054 * The algorithm finds exact matches therefore ambiguity symbols will match 055 * only themselves. The class cannot perform regular expressions. The class 056 * operates on all alphabets thus if searching for a DNA pattern you should 057 * compile both the pattern and its reverse complement.</p> 058 * 059 * <p><b>WARNING the behaviour of a pattern containing gaps is undefined. 060 * It's not recommended that you try it.</b></p> 061 * 062 * <p>Copyright: Copyright (c) 2002</p> 063 * <p>Company: AgResearch</p> 064 * 065 * @author Mark Schreiber 066 * @version 1.0 067 */ 068 069public final class KnuthMorrisPrattSearch { 070 private int[] kmpNext;// the table that defines the border lengths. 071 private SymbolList pattern; 072 073 /** 074 * Constructs a KMP matcher to find exact occurances of 075 * <code>pattern</code> in <code>text</code> using the 076 * Knuth-Morris-Pratt algorithm.<p> 077 * 078 * A new class should be constructed for each occurance of 079 * <code>pattern</code>.<p> 080 * @param pattern the pattern to search for 081 */ 082 public KnuthMorrisPrattSearch(SymbolList pattern) { 083 Alphabet alpha = pattern.getAlphabet(); 084 ArrayList rList = new ArrayList(pattern.toList()); 085 086 /* 087 *need to perform this hack to make the kmpNext capable of dealing with 088 *overlapping patterns, unfortunately it means the behaviour of a pattern 089 *containing a gap is unspecified. 090 */ 091 rList.add(alpha.getGapSymbol()); 092 try{ 093 this.pattern = new SimpleSymbolList(alpha, rList); 094 }catch(Exception e){ 095 //really shouldn't happen 096 throw new BioError("Couldn't make KMP pattern", e); 097 } 098 099 100 kmpNext = new int[this.pattern.length()]; 101 compilePattern(); 102 } 103 104 private void compilePattern(){ 105 int k = pattern.length()-1; 106 //System.out.println("k = "+k); 107 int i = 0; 108 int j = kmpNext[0] = -1; 109 110 while(i < k){ 111 while (j > -1 && pattern.symbolAt(i+1) != pattern.symbolAt(j+1)){ 112 j = kmpNext[j]; 113 } 114 i++; j++; 115 if(pattern.symbolAt(i+1) == pattern.symbolAt(j+1)){ 116 //System.out.println("i = "+i+" j = "+j); 117 kmpNext[i] = kmpNext[j]; 118 }else{ 119 //System.out.println("i = "+i+" j = "+j); 120 kmpNext[i] = j; 121 } 122 } 123 } 124 125 126 127 /** 128 * This will return an int[] giving the offsets of the matches in <code>text</code> 129 * (ie the location of the first symbol of each match in the <code>text</code>). 130 * 131 * @param text the text or sequence to search for the pattern 132 * 133 * @return an int[] giving the offsets to the matches or a zero length array if there are 134 * no matches. 135 */ 136 public int[] findMatches(SymbolList text){ 137 List matches = new ArrayList(); 138 int n; // the length of the text 139 int m; //the length of the pattern 140 int i = 0; 141 int j = 0; 142 143 m = this.pattern.length()-1; //-1 to remove the gap at the end hack 144 if(text instanceof CircularView){ 145 n = text.length()+pattern.length() -1; //allow wrap around 146 }else{ 147 n = text.length(); 148 } 149 150 //find the matches 151 while(j < n){ 152 Symbol sym = text.symbolAt(j+1); 153 while( i > -1 && pattern.symbolAt(i+1) != sym) 154 i = kmpNext[i]; 155 i++; 156 j++; 157 if(i >= m){ 158 //match found, add 1 for SymbolList coordinates. 159 matches.add(new Integer(j - i +1)); 160 i = kmpNext[i]; 161 } 162 } 163 164 //turn matches into an int[] 165 int[] mat = new int[matches.size()]; 166 for (int x = 0; x < mat.length; x++) { 167 mat[x] = ((Integer)matches.get(x)).intValue(); 168 } 169 return mat; 170 } 171 172 /** 173 * Returns the table of border lengths 174 * @return an int[] of border lenghts 175 */ 176 protected int[] getKmpNextTable(){ 177 return kmpNext; 178 } 179 180 /** 181 * 182 * @return the pattern being searched for 183 */ 184 public SymbolList getPattern() { 185 return pattern; 186 } 187 188 /** 189 * Demo and Test method 190 * @param args no arguments required 191 * @throws Exception if the test fails 192 */ 193 public static void main(String[] args) throws Exception{ 194 KnuthMorrisPrattSearch kmp1; 195 int[] table; 196 int[] matches; 197 198 199 SymbolList pattern = DNATools.createDNA("gcagagag"); 200 SymbolList pattern2 = DNATools.createDNA("agag"); 201 SymbolList text = DNATools.createDNA("gcatcgcagagagtatacagtacg"); 202 203 //check pattern 204 kmp1 = new KnuthMorrisPrattSearch(pattern); 205 206 table = kmp1.getKmpNextTable(); 207 System.out.println(pattern.seqString()); 208 for (int i = 0; i < table.length; i++) { 209 System.out.print(table[i] +" "); 210 } 211 //table should be -1 0 0 -1 1 -1 1 -1 212 System.out.println(""); 213 matches = kmp1.findMatches(text); 214 System.out.print("Matches at: "); 215 for (int i = 0; i < matches.length; i++) { 216 System.out.print(matches[i]+" "); 217 } 218 //matches should be at 6. 219 System.out.println("\n"); 220 221 //check pattern2 222 kmp1 = new KnuthMorrisPrattSearch(pattern2); 223 table = kmp1.getKmpNextTable(); 224 System.out.println(pattern2.seqString()); 225 for (int i = 0; i < table.length; i++) { 226 System.out.print(table[i] +" "); 227 } 228 System.out.println(""); 229 //table should be -1 0 -1 0 2 230 231 matches = kmp1.findMatches(text); 232 System.out.print("Matches at: "); 233 for (int i = 0; i < matches.length; i++) { 234 System.out.print(matches[i]+" "); 235 } 236 //matches should be at 8 and 10 237 System.out.println("\n"); 238 } 239}