001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.bio.alignment; 022 023import org.biojava.bio.BioException; 024import org.biojava.bio.SimpleAnnotation; 025import org.biojava.bio.seq.Sequence; 026import org.biojava.bio.seq.impl.SimpleGappedSequence; 027import org.biojava.bio.seq.impl.SimpleSequence; 028import org.biojava.bio.seq.io.SymbolTokenization; 029import org.biojava.bio.symbol.SimpleSymbolList; 030import org.biojava.bio.symbol.SymbolList; 031 032/** 033 * Needleman and Wunsch defined the problem of global sequence alignments, from 034 * the first till the last symbol of a sequence. This class is able to perform 035 * such global sequence comparisons efficiently by dynamic programming. If 036 * inserts and deletes are equally expensive and as expensive as the extension 037 * of a gap, the alignment method of this class does not use affine gap 038 * penalties. Otherwise it does. Those costs need four times as much memory, 039 * which has significant effects on the run time, if the computer needs to swap. 040 * 041 * @author Andreas Dräger 042 * @author Gero Greiner 043 * @author Mark Schreiber 044 * @since 1.5 045 */ 046 047public class NeedlemanWunsch extends AlignmentAlgorithm { 048 049 /** 050 * This just computes the minimum of three integer values. 051 * 052 * @param x 053 * @param y 054 * @param z 055 * @return Gives the minimum of three integers 056 */ 057 protected static int min(int x, int y, int z) { 058 if ((x < y) && (x < z)) 059 return x; 060 if (y < z) 061 return y; 062 return z; 063 } 064 065 /** 066 * Prints a String representation of the CostMatrix for the given Alignment 067 * on the screen. This can be used to get a better understanding of the 068 * algorithm. There is no other purpose. This method also works for all 069 * extensions of this class with all kinds of matrices. 070 * 071 * @param CostMatrix 072 * The matrix that contains all expenses for swapping symbols. 073 * @param queryChar 074 * a character representation of the query sequence ( 075 * <code>mySequence.seqString().toCharArray()</code>). 076 * @param targetChar 077 * a character representation of the target sequence. 078 * @return a String representation of the matrix. 079 */ 080 public static String printCostMatrix(int[][] CostMatrix, char[] queryChar, 081 char[] targetChar) { 082 int line, col; 083 StringBuilder output = new StringBuilder(); 084 output.append('\t'); 085 String ls = System.getProperty("line.separator"); 086 087 for (col = 0; col <= targetChar.length; col++) 088 if (col == 0) { 089 output.append('['); 090 output.append(col); 091 output.append("]\t"); 092 } else { 093 output.append('['); 094 output.append(targetChar[col - 1]); 095 output.append("]\t"); 096 } 097 for (line = 0; line <= queryChar.length; line++) { 098 if (line == 0) { 099 output.append(ls); 100 output.append('['); 101 output.append(line); 102 output.append("]\t"); 103 } else { 104 output.append(ls); 105 output.append('['); 106 output.append(queryChar[line - 1]); 107 output.append("]\t"); 108 } 109 for (col = 0; col <= targetChar.length; col++) { 110 output.append(CostMatrix[line][col]); 111 output.append('\t'); 112 } 113 } 114 output.append(ls); 115 output.append("delta[Edit] = "); 116 output.append(CostMatrix[line - 1][col - 1]); 117 output.append(ls); 118 return output.toString(); 119 } 120 121 /** 122 * A matrix with the size length(sequence1) times length(sequence2) 123 */ 124 protected int[][] CostMatrix; 125 126 /** 127 * Expenses for deletes. 128 */ 129 private short delete; 130 131 /** 132 * Expenses for the extension of a gap. 133 */ 134 private short gapExt; 135 136 /** 137 * Expenses for inserts. 138 */ 139 private short insert; 140 141 /** 142 * Expenses for matches. 143 */ 144 private short match; 145 146 /** 147 * Expenses for replaces. 148 */ 149 private short replace; 150 151 /** 152 * A matrix with the size length(alphabet) times length(alphabet) 153 */ 154 protected SubstitutionMatrix subMatrix; 155 156 /** 157 * Constructs a new Object with the given parameters based on the 158 * Needleman-Wunsch algorithm The alphabet of sequences to be aligned will 159 * be taken from the given substitution matrix. 160 * 161 * @param match 162 * This gives the costs for a match operation. It is only used, 163 * if there is no entry for a certain match of two symbols in the 164 * substitution matrix (default value). 165 * @param replace 166 * This is like the match parameter just the default, if there is 167 * no entry in the substitution matrix object. 168 * @param insert 169 * The costs of a single insert operation. 170 * @param delete 171 * The expenses of a single delete operation. 172 * @param gapExtend 173 * The expenses of an extension of a existing gap (that is a 174 * previous insert or delete. If the costs for insert and delete 175 * are equal and also equal to gapExtend, no affine gap penalties 176 * will be used, which saves a significant amount of memory. 177 * @param subMat 178 * The substitution matrix object which gives the costs for 179 * matches and replaces. 180 */ 181 public NeedlemanWunsch(short match, short replace, short insert, 182 short delete, short gapExtend, SubstitutionMatrix subMat) { 183 this.subMatrix = subMat; 184 this.insert = insert; 185 this.delete = delete; 186 this.gapExt = gapExtend; 187 this.match = match; 188 this.replace = replace; 189 } 190 191 /** 192 * Returns the current expenses of a single delete operation. 193 * 194 * @return delete 195 */ 196 public short getDelete() { 197 return delete; 198 } 199 200 /** 201 * This gives the edit distance according to the given parameters of this 202 * certain object. It returns just the last element of the internal cost 203 * matrix (left side down). So if you extend this class, you can just do the 204 * following: 205 * <code>int myDistanceValue = foo; this.CostMatrix = new int[1][1]; this.CostMatrix[0][0] = myDistanceValue;</code> 206 * 207 * @return returns the edit_distance computed with the given parameters. 208 */ 209 public int getEditDistance() { 210 return CostMatrix[CostMatrix.length - 1][CostMatrix[CostMatrix.length - 1].length - 1]; 211 } 212 213 /** 214 * Returns the current expenses of any extension of a gap operation. 215 * 216 * @return gapExt 217 */ 218 public short getGapExt() { 219 return gapExt; 220 } 221 222 /** 223 * Returns the current expenses of a single insert operation. 224 * 225 * @return insert 226 */ 227 public short getInsert() { 228 return insert; 229 } 230 231 /** 232 * Returns the current expenses of a single match operation. 233 * 234 * @return match 235 */ 236 public short getMatch() { 237 return match; 238 } 239 240 /** 241 * Returns the current expenses of a single replace operation. 242 * 243 * @return replace 244 */ 245 public short getReplace() { 246 return replace; 247 } 248 249 /** 250 * This method computes the scores for the substitution of the i-th symbol 251 * of query by the j-th symbol of subject. 252 * 253 * @param query 254 * The query sequence 255 * @param subject 256 * The target sequence 257 * @param i 258 * The position of the symbol under consideration within the 259 * query sequence (starting from one) 260 * @param j 261 * The position of the symbol under consideration within the 262 * target sequence 263 * @return The score for the given substitution. 264 */ 265 private int matchReplace(Sequence query, Sequence subject, int i, int j) { 266 try { 267 return subMatrix.getValueAt(query.symbolAt(i), subject.symbolAt(j)); 268 } catch (Exception exc) { 269 if (query.symbolAt(i).getMatches().contains(subject.symbolAt(j)) 270 || subject.symbolAt(j).getMatches().contains( 271 query.symbolAt(i))) 272 return -match; 273 return -replace; 274 } 275 } 276 277 /** 278 * Global pairwise sequence alignment of two BioJava-Sequence objects 279 * according to the Needleman-Wunsch-algorithm. 280 * 281 * @see org.biojava.bio.alignment.AlignmentAlgorithm#pairwiseAlignment(org.biojava.bio.symbol.SymbolList, 282 * org.biojava.bio.symbol.SymbolList) 283 */ 284 @Override 285 public AlignmentPair pairwiseAlignment(SymbolList query, SymbolList subject) 286 throws BioException { 287 Sequence squery = null; 288 Sequence ssubject = null; 289 290 if (query instanceof Sequence) { 291 squery = (Sequence) query; 292 } else { 293 // make it a sequence 294 squery = new SimpleSequence(query, "", "query", 295 new SimpleAnnotation()); 296 } 297 if (subject instanceof Sequence) { 298 ssubject = (Sequence) subject; 299 } else { 300 // make it a sequence 301 ssubject = new SimpleSequence(subject, "", "subject", 302 new SimpleAnnotation()); 303 } 304 305 SymbolTokenization st = null; 306 st = subMatrix.getAlphabet().getTokenization("default"); 307 308 if (squery.getAlphabet().equals(ssubject.getAlphabet()) 309 && squery.getAlphabet().equals(subMatrix.getAlphabet())) { 310 311 StringBuffer[] align = { new StringBuffer(), new StringBuffer() }; 312 313 long time = System.currentTimeMillis(); 314 int i, j; 315 this.CostMatrix = new int[squery.length() + 1][ssubject.length() + 1]; 316 317 /* 318 * Variables for the traceback 319 */ 320 321 StringBuffer path = new StringBuffer(); 322 323 // construct the matrix: 324 CostMatrix[0][0] = 0; 325 326 /* 327 * If we want to have affine gap penalties, we have to initialise 328 * additional matrices: If this is not necessary, we won't do that 329 * (because it's expensive). 330 */ 331 if ((gapExt != delete) || (gapExt != insert)) { 332 333 int[][] E = new int[squery.length() + 1][ssubject.length() + 1]; // Inserts 334 int[][] F = new int[squery.length() + 1][ssubject.length() + 1]; // Deletes 335 336 E[0][0] = F[0][0] = Integer.MAX_VALUE; // Double.MAX_VALUE; 337 for (i = 1; i <= squery.length(); i++) { 338 // CostMatrix[i][0] = CostMatrix[i-1][0] + delete; 339 E[i][0] = Integer.MAX_VALUE; // Double.POSITIVE_INFINITY; 340 CostMatrix[i][0] = F[i][0] = delete + i * gapExt; 341 } 342 for (j = 1; j <= ssubject.length(); j++) { 343 // CostMatrix[0][j] = CostMatrix[0][j - 1] + insert; 344 F[0][j] = Integer.MAX_VALUE; // Double.POSITIVE_INFINITY; 345 CostMatrix[0][j] = E[0][j] = insert + j * gapExt; 346 } 347 for (i = 1; i <= squery.length(); i++) 348 for (j = 1; j <= ssubject.length(); j++) { 349 E[i][j] = Math.min(E[i][j - 1], CostMatrix[i][j - 1] 350 + insert) 351 + gapExt; 352 F[i][j] = Math.min(F[i - 1][j], CostMatrix[i - 1][j] 353 + delete) 354 + gapExt; 355 CostMatrix[i][j] = min(E[i][j], F[i][j], 356 CostMatrix[i - 1][j - 1] 357 - matchReplace(squery, ssubject, i, j)); 358 } 359 360 /* 361 * Traceback for affine gap penalties. 362 */ 363 364 boolean[] gap_extend = { false, false }; 365 j = this.CostMatrix[CostMatrix.length - 1].length - 1; 366 367 for (i = this.CostMatrix.length - 1; i > 0;) { 368 do { 369 // only Insert. 370 if (i == 0) { 371 align[0].insert(0, '-'); 372 align[1].insert(0, st.tokenizeSymbol(ssubject 373 .symbolAt(j--))); 374 path.insert(0, ' '); 375 376 // only Delete. 377 } else if (j == 0) { 378 align[0].insert(0, st.tokenizeSymbol(squery 379 .symbolAt(i--))); 380 align[1].insert(0, '-'); 381 path.insert(0, ' '); 382 383 // Match/Replace 384 } else if ((CostMatrix[i][j] == CostMatrix[i - 1][j - 1] 385 - matchReplace(squery, ssubject, i, j)) 386 && !(gap_extend[0] || gap_extend[1])) { 387 if (squery.symbolAt(i) == ssubject.symbolAt(j)) 388 path.insert(0, '|'); 389 else 390 path.insert(0, ' '); 391 align[0].insert(0, st.tokenizeSymbol(squery 392 .symbolAt(i--))); 393 align[1].insert(0, st.tokenizeSymbol(ssubject 394 .symbolAt(j--))); 395 396 // Insert || finish gap if extended gap is 397 // opened 398 } else if (CostMatrix[i][j] == E[i][j] || gap_extend[0]) { 399 // check if gap has been extended or freshly 400 // opened 401 gap_extend[0] = (E[i][j] != CostMatrix[i][j - 1] 402 + insert + gapExt); 403 404 align[0].insert(0, '-'); 405 align[1].insert(0, st.tokenizeSymbol(ssubject 406 .symbolAt(j--))); 407 path.insert(0, ' '); 408 409 // Delete || finish gap if extended gap is 410 // opened 411 } else { 412 // check if gap has been extended or freshly 413 // opened 414 gap_extend[1] = (F[i][j] != CostMatrix[i - 1][j] 415 + delete + gapExt); 416 417 align[0].insert(0, st.tokenizeSymbol(squery 418 .symbolAt(i--))); 419 align[1].insert(0, '-'); 420 path.insert(0, ' '); 421 } 422 } while (j > 0); 423 } 424 425 /* 426 * No affine gap penalties, constant gap penalties, which is 427 * much faster and needs less memory. 428 */ 429 430 } else { 431 432 for (i = 1; i <= squery.length(); i++) 433 CostMatrix[i][0] = CostMatrix[i - 1][0] + delete; 434 for (j = 1; j <= ssubject.length(); j++) 435 CostMatrix[0][j] = CostMatrix[0][j - 1] + insert; 436 for (i = 1; i <= squery.length(); i++) 437 for (j = 1; j <= ssubject.length(); j++) { 438 CostMatrix[i][j] = min(CostMatrix[i - 1][j] + delete, 439 CostMatrix[i][j - 1] + insert, 440 CostMatrix[i - 1][j - 1] 441 - matchReplace(squery, ssubject, i, j)); 442 } 443 444 /* 445 * Traceback for constant gap penalties. 446 */ 447 j = this.CostMatrix[CostMatrix.length - 1].length - 1; 448 449 // System.out.println(printCostMatrix(CostMatrix, 450 // query.seqString().toCharArray(), 451 // subject.seqString().toCharArray())); 452 453 for (i = this.CostMatrix.length - 1; i > 0;) { 454 do { 455 // only Insert. 456 if (i == 0) { 457 align[0].insert(0, '-'); 458 align[1].insert(0, st.tokenizeSymbol(ssubject 459 .symbolAt(j--))); 460 path.insert(0, ' '); 461 462 // only Delete. 463 } else if (j == 0) { 464 align[0].insert(0, st.tokenizeSymbol(squery 465 .symbolAt(i--))); 466 align[1].insert(0, '-'); 467 path.insert(0, ' '); 468 469 // Match/Replace 470 } else if (CostMatrix[i][j] == CostMatrix[i - 1][j - 1] 471 - matchReplace(squery, ssubject, i, j)) { 472 473 if (squery.symbolAt(i) == ssubject.symbolAt(j)) 474 path.insert(0, '|'); 475 else 476 path.insert(0, ' '); 477 align[0].insert(0, st.tokenizeSymbol(squery 478 .symbolAt(i--))); 479 align[1].insert(0, st.tokenizeSymbol(ssubject 480 .symbolAt(j--))); 481 482 // Insert 483 } else if (CostMatrix[i][j] == CostMatrix[i][j - 1] 484 + insert) { 485 align[0].insert(0, '-'); 486 align[1].insert(0, st.tokenizeSymbol(ssubject 487 .symbolAt(j--))); 488 path.insert(0, ' '); 489 490 // Delete 491 } else { 492 align[0].insert(0, st.tokenizeSymbol(squery 493 .symbolAt(i--))); 494 align[1].insert(0, '-'); 495 path.insert(0, ' '); 496 } 497 } while (j > 0); 498 } 499 500 } 501 502 /* 503 * From here both cases are equal again. 504 */ 505 squery = new SimpleGappedSequence(new SimpleSequence( 506 new SimpleSymbolList(squery.getAlphabet().getTokenization( 507 "token"), align[0].toString()), squery.getURN(), 508 squery.getName(), squery.getAnnotation())); 509 ssubject = new SimpleGappedSequence(new SimpleSequence( 510 new SimpleSymbolList(ssubject.getAlphabet() 511 .getTokenization("token"), align[1].toString()), 512 ssubject.getURN(), ssubject.getName(), ssubject 513 .getAnnotation())); 514 AlignmentPair pairalign = new AlignmentPair(squery, ssubject, 1, 515 squery.length(), 1, ssubject.length(), subMatrix); 516 pairalign.setComputationTime(System.currentTimeMillis() - time); 517 pairalign.setScore((-1) * getEditDistance()); 518 return pairalign; 519 } else { 520 throw new BioException( 521 "Alphabet missmatch occured: sequences with different alphabet cannot be aligned."); 522 } 523 } 524 525 /** 526 * Sets the penalty for a delete operation to the specified value. 527 * 528 * @param del 529 * costs for a single deletion operation 530 */ 531 public void setDelete(short del) { 532 this.delete = del; 533 } 534 535 /** 536 * Sets the penalty for an extension of any gap (insert or delete) to the 537 * specified value. 538 * 539 * @param ge 540 * costs for any gap extension 541 */ 542 public void setGapExt(short ge) { 543 this.gapExt = ge; 544 } 545 546 /** 547 * Sets the penalty for an insert operation to the specified value. 548 * 549 * @param ins 550 * costs for a single insert operation 551 */ 552 public void setInsert(short ins) { 553 this.insert = ins; 554 } 555 556 /** 557 * Sets the penalty for a match operation to the specified value. 558 * 559 * @param ma 560 * costs for a single match operation 561 */ 562 public void setMatch(short ma) { 563 this.match = ma; 564 } 565 566 /** 567 * Sets the penalty for a replace operation to the specified value. 568 * 569 * @param rep 570 * costs for a single replace operation 571 */ 572 public void setReplace(short rep) { 573 this.replace = rep; 574 } 575 576 /** 577 * Sets the substitution matrix to be used to the specified one. Afterwards 578 * it is only possible to align sequences of the alphabet of this 579 * substitution matrix. 580 * 581 * @param matrix 582 * an instance of a substitution matrix. 583 */ 584 public void setSubstitutionMatrix(SubstitutionMatrix matrix) { 585 this.subMatrix = matrix; 586 } 587 588}