001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.structure.align.multiple.util; 022 023import java.util.ArrayList; 024import java.util.Arrays; 025import java.util.List; 026 027import org.biojava.nbio.structure.Atom; 028import org.biojava.nbio.structure.Calc; 029import org.biojava.nbio.structure.SVDSuperimposer; 030import org.biojava.nbio.structure.StructureException; 031import org.biojava.nbio.structure.align.multiple.MultipleAlignment; 032import org.biojava.nbio.structure.jama.Matrix; 033 034/** 035 * Utility class for calculating common scores of {@link MultipleAlignment}s. 036 * 037 * @author Spencer Bliven 038 * @author Aleix Lafita 039 * @since 4.1.0 040 * 041 */ 042public class MultipleAlignmentScorer { 043 044 // Names for commonly used scores 045 public static final String PROBABILITY = "Probability"; 046 public static final String CE_SCORE = "CE-score"; 047 public static final String RMSD = "RMSD"; 048 public static final String AVGTM_SCORE = "AvgTM-score"; 049 public static final String MC_SCORE = "MC-score"; 050 public static final String REF_RMSD = "Ref-RMSD"; 051 public static final String REFTM_SCORE = "RefTM-score"; 052 053 /** 054 * Calculates and puts the RMSD and the average TM-Score of the 055 * MultipleAlignment. 056 * 057 * @param alignment 058 * @throws StructureException 059 * @see #getAvgTMScore(MultipleAlignment) 060 * @see #getRMSD(MultipleAlignment) 061 */ 062 public static void calculateScores(MultipleAlignment alignment) 063 throws StructureException { 064 065 // Put RMSD 066 List<Atom[]> trans = MultipleAlignmentTools.transformAtoms(alignment); 067 alignment.putScore(RMSD, getRMSD(trans)); 068 069 // Put AvgTM-Score 070 List<Integer> lengths = new ArrayList<Integer>(alignment.size()); 071 for (Atom[] atoms : alignment.getAtomArrays()) { 072 lengths.add(atoms.length); 073 } 074 alignment.putScore(AVGTM_SCORE, getAvgTMScore(trans, lengths)); 075 } 076 077 /** 078 * Calculates the RMSD of all-to-all structure comparisons (distances) of 079 * the given MultipleAlignment. 080 * <p> 081 * Complexity: T(n,l) = O(l*n^2), if n=number of structures and l=alignment 082 * length. 083 * <p> 084 * The formula used is just the sqroot of the average distance of all 085 * possible pairs of atoms. Thus, for R structures aligned over C columns 086 * without gaps, we have 087 * 088 * <pre> 089 * RMSD = \sqrt{1/(C*(R*(R-1)/2)) * \sum_{r1=1}^{R-1} 090 * \sum_{r2=r1+1}^{R-1} \sum_{j=0}^{C-1} (atom[r1][c]-atom[r2][c])^2} 091 * </pre> 092 * 093 * @param alignment 094 * @return double RMSD 095 */ 096 public static double getRMSD(MultipleAlignment alignment) { 097 List<Atom[]> trans = MultipleAlignmentTools.transformAtoms(alignment); 098 return getRMSD(trans); 099 } 100 101 /** 102 * Calculates the RMSD of all-to-all structure comparisons (distances), 103 * given a set of superimposed atoms. 104 * 105 * @param transformed 106 * @return double RMSD 107 * @see #getRMSD(MultipleAlignment) 108 */ 109 public static double getRMSD(List<Atom[]> transformed) { 110 111 double sumSqDist = 0; 112 int comparisons = 0; 113 114 for (int r1 = 0; r1 < transformed.size(); r1++) { 115 for (int c = 0; c < transformed.get(r1).length; c++) { 116 Atom refAtom = transformed.get(r1)[c]; 117 if (refAtom == null) 118 continue; 119 120 double nonNullSqDist = 0; 121 int nonNullLength = 0; 122 for (int r2 = r1 + 1; r2 < transformed.size(); r2++) { 123 Atom atom = transformed.get(r2)[c]; 124 if (atom != null) { 125 nonNullSqDist += Calc.getDistanceFast(refAtom, atom); 126 nonNullLength++; 127 } 128 } 129 130 if (nonNullLength > 0) { 131 comparisons++; 132 sumSqDist += nonNullSqDist / nonNullLength; 133 } 134 } 135 } 136 return Math.sqrt(sumSqDist / comparisons); 137 } 138 139 /** 140 * /** Calculates the average RMSD from all structures to a reference s 141 * tructure, given a set of superimposed atoms. 142 * <p> 143 * Complexity: T(n,l) = O(l*n), if n=number of structures and l=alignment 144 * length. 145 * <p> 146 * For ungapped alignments, this is just the sqroot of the average distance 147 * from an atom to the aligned atom from the reference. Thus, for R 148 * structures aligned over C columns (with structure 0 as the reference), we 149 * have: 150 * 151 * <pre> 152 * RefRMSD = \sqrt{ 1/(C*(R-1)) * \sum_{r=1}^{R-1} \sum_{j=0}^{C-1} 153 * (atom[1][c]-atom[r][c])^2 } 154 * </pre> 155 * <p> 156 * For gapped alignments, null atoms are omitted from consideration, so that 157 * the RMSD is the average over all columns with non-null reference of the 158 * average RMSD within the non-null elements of the column. 159 * 160 * @param alignment 161 * MultipleAlignment 162 * @param ref 163 * reference structure index 164 * @return 165 */ 166 public static double getRefRMSD(MultipleAlignment alignment, int ref) { 167 List<Atom[]> trans = MultipleAlignmentTools.transformAtoms(alignment); 168 return getRefRMSD(trans, ref); 169 } 170 171 /** 172 * Calculates the average RMSD from all structures to a reference s 173 * tructure, given a set of superimposed atoms. 174 * <p> 175 * Complexity: T(n,l) = O(l*n), if n=number of structures and l=alignment 176 * length. 177 * <p> 178 * For ungapped alignments, this is just the sqroot of the average distance 179 * from an atom to the aligned atom from the reference. Thus, for R 180 * structures aligned over C columns (with structure 0 as the reference), we 181 * have: 182 * 183 * <pre> 184 * RefRMSD = \sqrt{ 1/(C*(R-1)) * \sum_{r=1}^{R-1} \sum_{j=0}^{C-1} 185 * (atom[1][c]-atom[r][c])^2 } 186 * </pre> 187 * <p> 188 * For gapped alignments, null atoms are omitted from consideration, so that 189 * the RMSD is the average over all columns with non-null reference of the 190 * average RMSD within the non-null elements of the column. 191 * 192 * @param transformed 193 * @param reference 194 * @return 195 */ 196 public static double getRefRMSD(List<Atom[]> transformed, int reference) { 197 198 double sumSqDist = 0; 199 int totalLength = 0; 200 for (int c = 0; c < transformed.get(reference).length; c++) { 201 Atom refAtom = transformed.get(reference)[c]; 202 if (refAtom == null) 203 continue; 204 205 double nonNullSqDist = 0; 206 int nonNullLength = 0; 207 for (int r = 0; r < transformed.size(); r++) { 208 if (r == reference) 209 continue; 210 Atom atom = transformed.get(r)[c]; 211 if (atom != null) { 212 nonNullSqDist += Calc.getDistanceFast(refAtom, atom); 213 nonNullLength++; 214 } 215 } 216 217 if (nonNullLength > 0) { 218 totalLength++; 219 sumSqDist += nonNullSqDist / nonNullLength; 220 } 221 } 222 return Math.sqrt(sumSqDist / totalLength); 223 } 224 225 /** 226 * Calculates the average TMScore of all the possible pairwise structure 227 * comparisons of the given alignment. 228 * <p> 229 * Complexity: T(n,l) = O(l*n^2), if n=number of structures and l=alignment 230 * length. 231 * 232 * @param alignment 233 * @return double Average TMscore 234 * @throws StructureException 235 */ 236 public static double getAvgTMScore(MultipleAlignment alignment) 237 throws StructureException { 238 239 List<Atom[]> trans = MultipleAlignmentTools.transformAtoms(alignment); 240 241 List<Integer> lengths = new ArrayList<Integer>(alignment.size()); 242 for (Atom[] atoms : alignment.getAtomArrays()) { 243 lengths.add(atoms.length); 244 } 245 return getAvgTMScore(trans, lengths); 246 } 247 248 /** 249 * Calculates the average TMScore all the possible pairwise structure 250 * comparisons of the given a set of superimposed Atoms and the original 251 * structure lengths. 252 * <p> 253 * Complexity: T(n,l) = O(l*n^2), if n=number of structures and l=alignment 254 * length. 255 * 256 * @param transformed 257 * aligned Atoms transformed 258 * @param lengths 259 * lengths of the structures in residue number 260 * @return double Average TMscore 261 * @throws StructureException 262 */ 263 public static double getAvgTMScore(List<Atom[]> transformed, 264 List<Integer> lengths) throws StructureException { 265 266 if (transformed.size() != lengths.size()) 267 throw new IllegalArgumentException("Input sizes differ."); 268 269 double sumTM = 0; 270 int comparisons = 0; 271 272 for (int r1 = 0; r1 < transformed.size(); r1++) { 273 for (int r2 = r1 + 1; r2 < transformed.size(); r2++) { 274 int len = transformed.get(r1).length; 275 // Remove nulls from both arrays 276 Atom[] ref = new Atom[len]; 277 Atom[] aln = new Atom[len]; 278 int nonNullLen = 0; 279 for (int c = 0; c < len; c++) { 280 if (transformed.get(r1)[c] != null 281 && transformed.get(r2)[c] != null) { 282 ref[nonNullLen] = transformed.get(r1)[c]; 283 aln[nonNullLen] = transformed.get(r2)[c]; 284 nonNullLen++; 285 } 286 } 287 // Truncate nulls 288 if (nonNullLen < len) { 289 ref = Arrays.copyOf(ref, nonNullLen); 290 aln = Arrays.copyOf(aln, nonNullLen); 291 } 292 sumTM += SVDSuperimposer.getTMScore(ref, aln, lengths.get(r1), 293 lengths.get(r2)); 294 comparisons++; 295 } 296 } 297 return sumTM / comparisons; 298 } 299 300 /** 301 * Calculates the average TMScore from all structures to a reference 302 * structure, given a set of superimposed atoms. 303 * <p> 304 * Complexity: T(n,l) = O(l*n), if n=number of structures and l=alignment 305 * length. 306 * 307 * @param alignment 308 * @param reference 309 * Index of the reference structure 310 * @return 311 * @throws StructureException 312 */ 313 public static double getRefTMScore(MultipleAlignment alignment, int ref) 314 throws StructureException { 315 316 List<Atom[]> trans = MultipleAlignmentTools.transformAtoms(alignment); 317 318 List<Integer> lengths = new ArrayList<Integer>(alignment.size()); 319 for (Atom[] atoms : alignment.getAtomArrays()) { 320 lengths.add(atoms.length); 321 } 322 return getRefTMScore(trans, lengths, ref); 323 } 324 325 /** 326 * Calculates the average TMScore from all structures to a reference 327 * structure, given a set of superimposed atoms. 328 * <p> 329 * Complexity: T(n,l) = O(l*n^2), if n=number of structures and l=alignment 330 * length. 331 * 332 * @param transformed 333 * Arrays of aligned atoms, after superposition 334 * @param lengths 335 * lengths of the full input structures 336 * @param reference 337 * Index of the reference structure 338 * @return 339 * @throws StructureException 340 */ 341 public static double getRefTMScore(List<Atom[]> transformed, 342 List<Integer> lengths, int reference) throws StructureException { 343 344 if (transformed.size() != lengths.size()) 345 throw new IllegalArgumentException("Input sizes differ"); 346 347 double sumTM = 0; 348 int comparisons = 0; 349 350 int len = transformed.get(reference).length; 351 for (int r = 0; r < transformed.size(); r++) { 352 if (r == reference) 353 continue; 354 // remove nulls from both arrays 355 Atom[] ref = new Atom[len]; 356 Atom[] aln = new Atom[len]; 357 int nonNullLen = 0; 358 for (int c = 0; c < len; c++) { 359 if (transformed.get(reference)[c] != null 360 && transformed.get(r)[c] != null) { 361 ref[nonNullLen] = transformed.get(reference)[c]; 362 aln[nonNullLen] = transformed.get(r)[c]; 363 nonNullLen++; 364 } 365 } 366 // truncate nulls 367 if (nonNullLen < len) { 368 ref = Arrays.copyOf(ref, nonNullLen); 369 aln = Arrays.copyOf(aln, nonNullLen); 370 } 371 sumTM += SVDSuperimposer.getTMScore(ref, aln, 372 lengths.get(reference), lengths.get(r)); 373 comparisons++; 374 } 375 return sumTM / comparisons; 376 } 377 378 /** 379 * Calculates the MC score, specific for the MultipleAlignment algorithm. 380 * The score function is modified from the original CEMC paper, making it 381 * continuous and differentiable. 382 * <p> 383 * The maximum score of a match is 20, and the penalties for gaps are part 384 * of the input. The half-score distance, d0, is chosen as in the TM-score. 385 * <p> 386 * Complexity: T(n,l) = O(l*n^2), if n=number of structures and l=alignment 387 * length. 388 * 389 * @param alignment 390 * @param gapOpen 391 * penalty for gap opening (reasonable values are in the range 392 * (1.0-20.0) 393 * @param gapExtension 394 * penalty for extending a gap (reasonable values are in the 395 * range (0.5-10.0) 396 * @param dCutoff 397 * the distance cutoff 398 * @return the value of the score 399 * @throws StructureException 400 */ 401 public static double getMCScore(MultipleAlignment alignment, 402 double gapOpen, double gapExtension, double dCutoff) 403 throws StructureException { 404 405 List<Atom[]> trans = MultipleAlignmentTools.transformAtoms(alignment); 406 407 // Calculate d0: same as the one in TM score 408 int minLen = Integer.MAX_VALUE; 409 for (Atom[] atoms : alignment.getAtomArrays()) 410 if (atoms.length < minLen) 411 minLen = atoms.length; 412 double d0 = 1.24 * Math.cbrt((minLen) - 15.) - 1.8; 413 414 // Calculate the distance cutoff penalization 415 double A = 20.0 / (1 + (dCutoff * dCutoff) / (d0 * d0)); 416 417 return getMCScore(trans, d0, gapOpen, gapExtension, A); 418 } 419 420 /** 421 * Calculates the MC score, specific for the MultipleAlignment algorithm. 422 * The score function is modified from the original CEMC paper, making it 423 * continuous and differentiable. 424 * <p> 425 * The maximum score of a match is 20, and the penalties for gaps are part 426 * of the input. 427 * <p> 428 * Complexity: T(n,l) = O(l*n^2), if n=number of structures and l=alignment 429 * length. 430 * 431 * @param transformed 432 * List of transformed Atom arrays 433 * @param d0 434 * parameter for the half-score distance 435 * @param gapOpen 436 * penalty for gap opening (reasonable values are in the range 437 * (1.0-20.0) 438 * @param gapExtension 439 * penalty for extending a gap (reasonable values are in the 440 * range (0.5-10.0) 441 * @param A 442 * the distance cutoff penalization 443 * @return the value of the score 444 * @throws StructureException 445 */ 446 private static double getMCScore(List<Atom[]> trans, double d0, 447 double gapOpen, double gapExtension, double A) 448 throws StructureException { 449 450 int size = trans.size(); 451 int length = trans.get(0).length; 452 Matrix residueDistances = new Matrix(size, length, -1); 453 double scoreMC = 0.0; 454 int openGaps = 0; 455 int extensionGaps = 0; 456 457 // Calculate the average residue distances 458 for (int r1 = 0; r1 < size; r1++) { 459 boolean gapped = false; 460 for (int c = 0; c < trans.get(r1).length; c++) { 461 Atom refAtom = trans.get(r1)[c]; 462 // Calculate the gap extension and opening on the fly 463 if (refAtom == null) { 464 if (gapped) 465 extensionGaps++; 466 else { 467 gapped = true; 468 openGaps++; 469 } 470 continue; 471 } else 472 gapped = false; 473 474 for (int r2 = r1 + 1; r2 < size; r2++) { 475 Atom atom = trans.get(r2)[c]; 476 if (atom != null) { 477 double distance = Calc.getDistance(refAtom, atom); 478 if (residueDistances.get(r1, c) == -1) { 479 residueDistances.set(r1, c, 1 + distance); 480 } else { 481 residueDistances.set(r1, c, 482 residueDistances.get(r1, c) + distance); 483 } 484 if (residueDistances.get(r2, c) == -1) { 485 residueDistances.set(r2, c, 1 + distance); 486 } else { 487 residueDistances.set(r2, c, 488 residueDistances.get(r2, c) + distance); 489 } 490 } 491 } 492 } 493 } 494 495 for (int c = 0; c < length; c++) { 496 int nonNullRes = 0; 497 for (int r = 0; r < size; r++) { 498 if (residueDistances.get(r, c) != -1) 499 nonNullRes++; 500 } 501 for (int r = 0; r < size; r++) { 502 if (residueDistances.get(r, c) != -1) { 503 residueDistances.set(r, c, residueDistances.get(r, c) 504 / nonNullRes); 505 } 506 } 507 } 508 509 // Sum all the aligned residue scores 510 for (int row = 0; row < size; row++) { 511 for (int col = 0; col < length; col++) { 512 if (residueDistances.get(row, col) == -1) 513 continue; 514 double d1 = residueDistances.get(row, col); 515 double resScore = 20.0 / (1 + (d1 * d1) / (d0 * d0)); 516 scoreMC += resScore - A; 517 } 518 } 519 520 // Apply the Gap penalty and return 521 return scoreMC - (openGaps * gapOpen + extensionGaps * gapExtension); 522 } 523}