001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.alignment; 023 024import org.biojava.bio.BioException; 025import org.biojava.bio.BioRuntimeException; 026import org.biojava.bio.SimpleAnnotation; 027import org.biojava.bio.seq.Sequence; 028import org.biojava.bio.seq.impl.SimpleGappedSequence; 029import org.biojava.bio.seq.impl.SimpleSequence; 030import org.biojava.bio.seq.io.SymbolTokenization; 031import org.biojava.bio.symbol.SimpleSymbolList; 032import org.biojava.bio.symbol.SymbolList; 033 034/** 035 * Smith and Waterman developed an efficient dynamic programming algorithm to 036 * perform local sequence alignments, which returns the most conserved region of 037 * two sequences (longest common substring with modifications). This algorithm 038 * is performed by the method <code>pairwiseAlignment</code> of this class. It 039 * uses affine gap penalties if and only if the expenses of a delete or insert 040 * operation are unequal to the expenses of gap extension. This uses 041 * significantly more memory (four times as much) and increases the runtime if 042 * swapping is performed. 043 * 044 * @author Andreas Dräger 045 * @author Gero Greiner 046 * @author Mark Schreiber 047 * @since 1.5 048 */ 049public class SmithWaterman extends AlignmentAlgorithm { 050 051 private short match, replace, insert, delete, gapExt; 052 private SubstitutionMatrix subMatrix; 053 054 /** 055 * Default constructor. 056 */ 057 public SmithWaterman() { 058 this.subMatrix = null; 059 } 060 061 /** 062 * Constructs the new SmithWaterman alignment object. Alignments are only 063 * performed, if the alphabet of the given <code>SubstitutionMatrix</code> 064 * equals the alphabet of both the query and the target 065 * <code>Sequence</code>. The alignment parameters here are expenses and not 066 * scores as they are in the <code>NeedlemanWunsch</code> object. scores are 067 * just given by multiplying the expenses with <code>(-1)</code>. For 068 * example you could use parameters like "-2, 5, 3, 3, 0". If the expenses 069 * for gap extension are equal to the cost of starting a gap (delete or 070 * insert), no affine gap penalties are used, which saves memory. 071 * 072 * @param match 073 * expenses for a match 074 * @param replace 075 * expenses for a replace operation 076 * @param insert 077 * expenses for a gap opening in the query sequence 078 * @param delete 079 * expenses for a gap opening in the target sequence 080 * @param gapExtend 081 * expenses for the extension of a gap which was started earlier. 082 * @param matrix 083 * the <code>SubstitutionMatrix</code> object to use. 084 */ 085 public SmithWaterman(short match, short replace, short insert, 086 short delete, short gapExtend, SubstitutionMatrix matrix) { 087 this.match = (short) -match; 088 this.replace = (short) -replace; 089 this.insert = (short) -insert; 090 this.delete = (short) -delete; 091 this.gapExt = (short) -gapExtend; 092 this.subMatrix = matrix; 093 } 094 095 /** 096 * @return the delete 097 */ 098 public short getDelete() { 099 return delete; 100 } 101 102 /** 103 * @return the gapExt 104 */ 105 public short getGapExt() { 106 return gapExt; 107 } 108 109 /** 110 * @return the insert 111 */ 112 public short getInsert() { 113 return insert; 114 } 115 116 /** 117 * @return the match 118 */ 119 public short getMatch() { 120 return match; 121 } 122 123 /** 124 * @return the replace 125 */ 126 public short getReplace() { 127 return replace; 128 } 129 130 /** 131 * @return the subMatrix 132 */ 133 public SubstitutionMatrix getSubMatrix() { 134 return subMatrix; 135 } 136 137 /** 138 * This method computes the scores for the substitution of the i-th symbol 139 * of query by the j-th symbol of subject. 140 * 141 * @param query 142 * The query sequence 143 * @param subject 144 * The target sequence 145 * @param i 146 * The position of the symbol under consideration within the 147 * query sequence (starting from one) 148 * @param j 149 * The position of the symbol under consideration within the 150 * target sequence 151 * @return The score for the given substitution. 152 */ 153 private short matchReplace(Sequence query, Sequence subject, int i, int j) { 154 try { 155 if (subMatrix != null) 156 return subMatrix.getValueAt(query.symbolAt(i), subject 157 .symbolAt(j)); 158 } catch (Exception exc) { 159 } 160 if (query.symbolAt(i).equals(subject.symbolAt(j))) 161 return match; 162 return replace; 163 } 164 165 /** 166 * This just computes the maximum of four integers. 167 * 168 * @param w 169 * @param x 170 * @param y 171 * @param z 172 * @return the maximum of four <code>int</code>s. 173 */ 174 private int max(int w, int x, int y, int z) { 175 if ((w > x) && (w > y) && (w > z)) 176 return w; 177 if ((x > y) && (x > z)) 178 return x; 179 if ((y > z)) 180 return y; 181 return z; 182 } 183 184 /** 185 * Overrides the method inherited from the NeedlemanWunsch and performs only 186 * a local alignment. It finds only the longest common subsequence. This is 187 * good for the beginning, but it might be better to have a system to find 188 * more than only one hit within the score matrix. Therefore, one should 189 * only define the k-th best hit, where k is somehow related to the number 190 * of hits. 191 * 192 * @see AlignmentAlgorithm#pairwiseAlignment(org.biojava.bio.symbol.SymbolList, 193 * org.biojava.bio.symbol.SymbolList) 194 */ 195 public AlignmentPair pairwiseAlignment(SymbolList query, SymbolList subject) 196 throws BioRuntimeException { 197 int[][] scoreMatrix; 198 Sequence squery = null; 199 Sequence ssubject = null; 200 201 if (query instanceof Sequence) { 202 squery = (Sequence) query; 203 } else { 204 // make it a sequence 205 squery = new SimpleSequence(query, "", "query", 206 new SimpleAnnotation()); 207 } 208 if (subject instanceof Sequence) { 209 ssubject = (Sequence) subject; 210 } else { 211 // make it a sequence 212 ssubject = new SimpleSequence(subject, "", "subject", 213 new SimpleAnnotation()); 214 } 215 216 if (squery.getAlphabet().equals(ssubject.getAlphabet()) 217 && (subMatrix == null || squery.getAlphabet().equals( 218 subMatrix.getAlphabet()))) { 219 StringBuffer[] align = { new StringBuffer(), new StringBuffer() }; 220 long time = System.currentTimeMillis(); 221 int i, j, maxI = 0, maxJ = 0, queryStart = 0, subjectStart = 0; 222 scoreMatrix = new int[squery.length() + 1][ssubject.length() + 1]; 223 224 /* 225 * Variables needed for traceback 226 */ 227 228 SymbolTokenization st; 229 try { 230 st = squery.getAlphabet().getTokenization("default"); 231 } catch (BioException exc) { 232 throw new BioRuntimeException(exc); 233 } 234 235 /* 236 * Use affine gap panalties. 237 */ 238 if ((gapExt != delete) || (gapExt != insert)) { 239 240 int[][] E = new int[squery.length() + 1][ssubject.length() + 1]; // Inserts 241 int[][] F = new int[squery.length() + 1][ssubject.length() + 1]; // Deletes 242 243 scoreMatrix[0][0] = 0; 244 E[0][0] = F[0][0] = Integer.MIN_VALUE; // Double.NEGATIVE_INFINITY; 245 for (i = 1; i <= squery.length(); i++) { 246 scoreMatrix[i][0] = F[i][0] = 0; 247 E[i][0] = Integer.MIN_VALUE; // Double.NEGATIVE_INFINITY; 248 } 249 for (j = 1; j <= ssubject.length(); j++) { 250 scoreMatrix[0][j] = E[0][j] = 0; 251 F[0][j] = Integer.MIN_VALUE; // Double.NEGATIVE_INFINITY; 252 } 253 for (i = 1; i <= squery.length(); i++) 254 for (j = 1; j <= ssubject.length(); j++) { 255 E[i][j] = Math.max(E[i][j - 1], scoreMatrix[i][j - 1] 256 + insert) 257 + gapExt; 258 F[i][j] = Math.max(F[i - 1][j], scoreMatrix[i - 1][j] 259 + delete) 260 + gapExt; 261 scoreMatrix[i][j] = max(0, E[i][j], F[i][j], 262 scoreMatrix[i - 1][j - 1] 263 + matchReplace(squery, ssubject, i, j)); 264 265 if (scoreMatrix[i][j] > scoreMatrix[maxI][maxJ]) { 266 maxI = i; 267 maxJ = j; 268 } 269 } 270 // System.out.println(printCostMatrix(G, 271 // query.seqString().toCharArray(), 272 // subject.seqString().toCharArray())); 273 274 /* 275 * Here starts the traceback for affine gap penalities 276 */ 277 try { 278 boolean[] gap_extend = { false, false }; 279 j = maxJ; 280 for (i = maxI; i > 0;) { 281 do { 282 // only Deletes or Inserts or Replaces possible. 283 // That's not what we want to have. 284 if (scoreMatrix[i][j] == 0) { 285 queryStart = i; 286 subjectStart = j; 287 i = j = 0; 288 289 // Match/Replace 290 } else if ((scoreMatrix[i][j] == scoreMatrix[i - 1][j - 1] 291 + matchReplace(squery, ssubject, i, j)) 292 && !(gap_extend[0] || gap_extend[1])) { 293 294 align[0].insert(0, st.tokenizeSymbol(squery 295 .symbolAt(i--))); 296 align[1].insert(0, st.tokenizeSymbol(ssubject 297 .symbolAt(j--))); 298 299 // Insert || finish gap if extended gap is 300 // opened 301 } else if (scoreMatrix[i][j] == E[i][j] 302 || gap_extend[0]) { 303 // check if gap has been extended or freshly 304 // opened 305 gap_extend[0] = (E[i][j] != scoreMatrix[i][j - 1] 306 + insert + gapExt); 307 308 align[0].insert(0, '-'); 309 align[1].insert(0, st.tokenizeSymbol(ssubject 310 .symbolAt(j--))); 311 312 // Delete || finish gap if extended gap is 313 // opened 314 } else { 315 // check if gap has been extended or freshly 316 // opened 317 gap_extend[1] = (F[i][j] != scoreMatrix[i - 1][j] 318 + delete + gapExt); 319 320 align[0].insert(0, st.tokenizeSymbol(squery 321 .symbolAt(i--))); 322 align[1].insert(0, '-'); 323 } 324 } while (j > 0); 325 } 326 } catch (BioException exc) { 327 throw new BioRuntimeException(exc); 328 } 329 330 /* 331 * No affine gap penalties to save memory. 332 */ 333 } else { 334 335 for (i = 0; i <= squery.length(); i++) 336 scoreMatrix[i][0] = 0; 337 for (j = 0; j <= ssubject.length(); j++) 338 scoreMatrix[0][j] = 0; 339 for (i = 1; i <= squery.length(); i++) 340 for (j = 1; j <= ssubject.length(); j++) { 341 342 scoreMatrix[i][j] = max(0, scoreMatrix[i - 1][j] 343 + delete, scoreMatrix[i][j - 1] + insert, 344 scoreMatrix[i - 1][j - 1] 345 + matchReplace(squery, ssubject, i, j)); 346 347 if (scoreMatrix[i][j] > scoreMatrix[maxI][maxJ]) { 348 maxI = i; 349 maxJ = j; 350 } 351 } 352 353 /* 354 * Here starts the traceback for non-affine gap penalities 355 */ 356 try { 357 j = maxJ; 358 for (i = maxI; i > 0;) { 359 do { 360 // only Deletes or Inserts or Replaces possible. 361 // That's not what 362 // we want to have. 363 if (scoreMatrix[i][j] == 0) { 364 queryStart = i; 365 subjectStart = j; 366 i = j = 0; 367 368 // Match/Replace 369 } else if (scoreMatrix[i][j] == scoreMatrix[i - 1][j - 1] 370 + matchReplace(squery, ssubject, i, j)) { 371 372 align[0].insert(0, st.tokenizeSymbol(squery 373 .symbolAt(i--))); 374 align[1].insert(0, st.tokenizeSymbol(ssubject 375 .symbolAt(j--))); 376 377 // Insert 378 } else if (scoreMatrix[i][j] == scoreMatrix[i][j - 1] 379 + insert) { 380 align[0].insert(0, '-'); 381 align[1].insert(0, st.tokenizeSymbol(ssubject 382 .symbolAt(j--))); 383 384 // Delete 385 } else { 386 align[0].insert(0, st.tokenizeSymbol(squery 387 .symbolAt(i--))); 388 align[1].insert(0, '-'); 389 } 390 } while (j > 0); 391 } 392 } catch (BioException exc) { 393 throw new BioRuntimeException(exc); 394 } 395 } 396 397 /* 398 * From here both cases are equal again. 399 */ 400 401 try { 402 403 /* 404 * Make equal length alignments! 405 */ 406 for (i = 0; i < queryStart; i++) 407 align[0].insert(0, '-'); 408 for (i = align[0].length(); i < Math.max(maxI, maxJ); i++) 409 align[0].append('-'); 410 for (i = 0; i < subjectStart; i++) 411 align[1].insert(0, '-'); 412 for (i = align[1].length(); i < Math.max(maxJ, align[0] 413 .length()); i++) 414 align[1].append('-'); 415 for (i = align[0].length(); i < align[1].length(); i++) 416 align[0].append('-'); 417 418 squery = new SimpleGappedSequence( 419 new SimpleSequence(new SimpleSymbolList(squery 420 .getAlphabet().getTokenization("token"), 421 align[0].toString()), squery.getURN(), squery 422 .getName(), squery.getAnnotation())); 423 ssubject = new SimpleGappedSequence( 424 new SimpleSequence(new SimpleSymbolList(ssubject 425 .getAlphabet().getTokenization("token"), 426 align[1].toString()), ssubject.getURN(), 427 ssubject.getName(), ssubject.getAnnotation())); 428 429 AlignmentPair pairalign = new AlignmentPair(squery, ssubject, 430 queryStart + 1, maxI, subjectStart + 1, maxJ, subMatrix); 431 pairalign.setComputationTime(System.currentTimeMillis() - time); 432 pairalign.setScore(scoreMatrix[maxI][maxJ]); 433 // Don't waste any memory. 434 scoreMatrix = null; 435 return pairalign; 436 437 } catch (BioException exc) { 438 throw new BioRuntimeException(exc); 439 } 440 } else 441 throw new BioRuntimeException( 442 "The alphabets of the sequences and the substitution matrix have to be equal."); 443 } 444 445 /** 446 * Overrides the method inherited from the NeedlemanWunsch and sets the 447 * penalty for a delete operation to the specified value. Reason: internally 448 * scores are used instead of penalties so that the value is muliplyed with 449 * -1. 450 * 451 * @param del 452 * costs for a single deletion operation 453 */ 454 public void setDelete(short del) { 455 this.delete = (short) -del; 456 } 457 458 /** 459 * Overrides the method inherited from the NeedlemanWunsch and sets the 460 * penalty for an extension of any gap (insert or delete) to the specified 461 * value. Reason: internally scores are used instead of penalties so that 462 * the value is muliplyed with -1. 463 * 464 * @param ge 465 * costs for any gap extension 466 */ 467 public void setGapExt(short ge) { 468 this.gapExt = (short) -ge; 469 } 470 471 /** 472 * Overrides the method inherited from the NeedlemanWunsch and sets the 473 * penalty for an insert operation to the specified value. Reason: 474 * internally scores are used instead of penalties so that the value is 475 * muliplyed with -1. 476 * 477 * @param ins 478 * costs for a single insert operation 479 */ 480 public void setInsert(short ins) { 481 this.insert = (short) -ins; 482 } 483 484 /** 485 * Overrides the method inherited from the NeedlemanWunsch and sets the 486 * penalty for a match operation to the specified value. Reason: internally 487 * scores are used instead of penalties so that the value is muliplyed with 488 * -1. 489 * 490 * @param ma 491 * costs for a single match operation 492 */ 493 public void setMatch(short ma) { 494 this.match = (short) -ma; 495 } 496 497 /** 498 * Overrides the method inherited from the NeedlemanWunsch and sets the 499 * penalty for a replace operation to the specified value. Reason: 500 * internally scores are used instead of penalties so that the value is 501 * muliplyed with -1. 502 * 503 * @param rep 504 * costs for a single replace operation 505 */ 506 public void setReplace(short rep) { 507 this.replace = (short) -rep; 508 } 509 510 /** 511 * @param subMatrix 512 * the subMatrix to set 513 */ 514 public void setSubMatrix(SubstitutionMatrix subMatrix) { 515 this.subMatrix = subMatrix; 516 } 517 518}