001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.phylo; 022 023import java.io.IOException; 024import java.util.List; 025import org.biojava.nbio.core.alignment.template.SubstitutionMatrix; 026import org.biojava.nbio.core.sequence.MultipleSequenceAlignment; 027import org.biojava.nbio.core.sequence.template.Compound; 028import org.biojava.nbio.core.sequence.template.Sequence; 029import org.forester.evoinference.distance.PairwiseDistanceCalculator; 030import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix; 031import org.forester.evoinference.matrix.distance.DistanceMatrix; 032import org.forester.msa.Msa; 033import org.slf4j.Logger; 034import org.slf4j.LoggerFactory; 035 036/** 037 * The DistanceMatrixCalculator methods generate a {@link DistanceMatrix} from a 038 * {@link MultipleSequenceAlignment} or other indirect distance infomation (RMSD). 039 * 040 * @author Aleix Lafita 041 * @since 4.1.1 042 * 043 */ 044public class DistanceMatrixCalculator { 045 046 private static final Logger logger = LoggerFactory 047 .getLogger(DistanceMatrixCalculator.class); 048 049 /** Prevent instantiation */ 050 private DistanceMatrixCalculator() {} 051 052 /** 053 * The fractional dissimilarity (D) is defined as the percentage of sites 054 * that differ between two aligned sequences. The percentage of identity 055 * (PID) is the fraction of identical sites between two aligned sequences. 056 * 057 * <pre> 058 * D = 1 - PID 059 * </pre> 060 * 061 * The gapped positons in the alignment are ignored in the calculation. This 062 * method is a wrapper to the forester implementation of the calculation: 063 * {@link PairwiseDistanceCalculator#calcFractionalDissimilarities(Msa)} 064 * 065 * @param msa 066 * MultipleSequenceAlignment 067 * @return DistanceMatrix 068 * @throws Exception 069 */ 070 public static <C extends Sequence<D>, D extends Compound> DistanceMatrix fractionalDissimilarity( 071 MultipleSequenceAlignment<C, D> msa) throws IOException { 072 073 Msa fMsa = ForesterWrapper.convert(msa); 074 DistanceMatrix DM = PairwiseDistanceCalculator 075 .calcFractionalDissimilarities(fMsa); 076 077 return DM; 078 } 079 080 /** 081 * The Poisson (correction) evolutionary distance (d) is a function of the 082 * fractional dissimilarity (D), given by: 083 * 084 * <pre> 085 * d = -log(1 - D) 086 * </pre> 087 * 088 * The gapped positons in the alignment are ignored in the calculation. This 089 * method is a wrapper to the forester implementation of the calculation: 090 * {@link PairwiseDistanceCalculator#calcPoissonDistances(Msa)} 091 * 092 * @param msa 093 * MultipleSequenceAlignment 094 * @return DistanceMatrix 095 * @throws IOException 096 */ 097 public static <C extends Sequence<D>, D extends Compound> DistanceMatrix poissonDistance( 098 MultipleSequenceAlignment<C, D> msa) throws IOException { 099 100 Msa fMsa = ForesterWrapper.convert(msa); 101 DistanceMatrix DM = PairwiseDistanceCalculator 102 .calcPoissonDistances(fMsa); 103 104 return DM; 105 } 106 107 /** 108 * The Kimura evolutionary distance (d) is a correction of the fractional 109 * dissimilarity (D) specially needed for large evolutionary distances. It 110 * is given by: 111 * 112 * <pre> 113 * d = -log(1 - D - 0.2 * D<sup>2</sup>) 114 * </pre> 115 * 116 * The equation is derived by fitting the relationship between the 117 * evolutionary distance (d) and the fractional dissimilarity (D) according 118 * to the PAM model of evolution (it is an empirical approximation for the 119 * method {@link #pamDistance(MultipleSequenceAlignment}). The gapped 120 * positons in the alignment are ignored in the calculation. This method is 121 * a wrapper to the forester implementation of the calculation: 122 * {@link PairwiseDistanceCalculator#calcKimuraDistances(Msa)}. 123 * 124 * @param msa 125 * MultipleSequenceAlignment 126 * @return DistanceMatrix 127 * @throws IOException 128 */ 129 public static <C extends Sequence<D>, D extends Compound> DistanceMatrix kimuraDistance( 130 MultipleSequenceAlignment<C, D> msa) throws IOException { 131 132 Msa fMsa = ForesterWrapper.convert(msa); 133 DistanceMatrix DM = PairwiseDistanceCalculator 134 .calcPoissonDistances(fMsa); 135 136 return DM; 137 } 138 139 /** 140 * BioJava implementation for percentage of identity (PID). Although the 141 * name of the method is percentage of identity, the DistanceMatrix contains 142 * the fractional dissimilarity (D), computed as D = 1 - PID. 143 * <p> 144 * It is recommended to use the method 145 * {@link DistanceMatrixCalculator#fractionalDissimilarity(MultipleSequenceAlignment)} 146 * instead of this one. 147 * 148 * @param msa 149 * MultipleSequenceAlignment 150 * @return DistanceMatrix 151 */ 152 public static <C extends Sequence<D>, D extends Compound> DistanceMatrix percentageIdentity( 153 MultipleSequenceAlignment<C, D> msa) { 154 155 logger.info("{}:{}", "Determing Distances", 0); 156 int n = msa.getSize(); 157 String[] sequenceString = new String[n]; 158 for (int i = 0; i < n; i++) { 159 sequenceString[i] = msa.getAlignedSequence(i + 1) 160 .getSequenceAsString(); 161 } 162 163 DistanceMatrix distance = new BasicSymmetricalDistanceMatrix(n); 164 int totalloopcount = (n / 2) * (n + 1); 165 166 int loopcount = 0; 167 for (int i = 0; i < (n - 1); i++) { 168 logger.info("{}:{}", "Determining Distances", (loopcount * 100) 169 / totalloopcount); 170 distance.setIdentifier(i, msa.getAlignedSequence(i + 1) 171 .getAccession().getID()); 172 173 for (int j = i; j < n; j++) { 174 loopcount++; 175 if (j == i) { 176 distance.setValue(i, j, 0); 177 } else { 178 distance.setValue(i, j, 100 - Comparison.PID( 179 sequenceString[i], sequenceString[j])); 180 distance.setValue(j, i, distance.getValue(i, j)); 181 } 182 } 183 } 184 logger.info("{}:{}", "Determining Distances", 100); 185 186 return distance; 187 } 188 189 /** 190 * The fractional dissimilarity score (Ds) is a relative measure of the 191 * dissimilarity between two aligned sequences. It is calculated as: 192 * 193 * <pre> 194 * Ds = sum( max(M) - M<sub>ai,bi</sub> ) / (max(M)-min(M)) ) / L 195 * </pre> 196 * 197 * Where the sum through i runs for all the alignment positions, ai and bi 198 * are the AA at position i in the first and second aligned sequences, 199 * respectively, and L is the total length of the alignment (normalization). 200 * <p> 201 * The fractional dissimilarity score (Ds) is in the interval [0, 1], where 202 * 0 means that the sequences are identical and 1 that the sequences are 203 * completely different. 204 * <p> 205 * Gaps do not have a contribution to the similarity score calculation (gap 206 * penalty = 0) 207 * 208 * @param msa 209 * MultipleSequenceAlignment 210 * @param M 211 * SubstitutionMatrix for similarity scoring 212 * @return DistanceMatrix 213 */ 214 public static <C extends Sequence<D>, D extends Compound> DistanceMatrix fractionalDissimilarityScore( 215 MultipleSequenceAlignment<C, D> msa, SubstitutionMatrix<D> M) { 216 217 // Calculate the similarity scores using the alignment package 218 logger.info("{}:{}", "Determing Distances", 0); 219 220 int n = msa.getSize(); 221 DistanceMatrix DM = new BasicSymmetricalDistanceMatrix(n); 222 int totalloopcount = (n / 2) * (n + 1); 223 int end = msa.getLength(); 224 225 String[] sequenceString = new String[n]; 226 for (int i = 0; i < n; i++) { 227 sequenceString[i] = msa.getAlignedSequence(i + 1) 228 .getSequenceAsString(); 229 } 230 List<C> seqs = msa.getAlignedSequences(); 231 232 int loopcount = 0; 233 for (int i = 0; i < (n - 1); i++) { 234 logger.info("{}:{}", "Determining Distances", (loopcount * 100) 235 / totalloopcount); 236 237 // Obtain the similarity scores 238 for (int j = i; j < n; j++) { 239 240 double score = 0; 241 loopcount++; 242 243 for (int k = 0; k < end; k++) { 244 if (Comparison.isGap(sequenceString[i].charAt(k)) 245 || Comparison.isGap(sequenceString[j].charAt(k))) 246 continue; 247 score += M.getValue(seqs.get(i).getCompoundAt(k + 1), seqs 248 .get(j).getCompoundAt(k + 1)); 249 } 250 251 if (i == j) 252 DM.setValue(i, j, 0.0); 253 else { 254 double dS = (M.getMaxValue() - score / msa.getLength()) 255 / (M.getMaxValue() - M.getMinValue()); 256 257 DM.setValue(i, j, dS); 258 DM.setValue(j, i, dS); 259 } 260 } 261 } 262 263 return DM; 264 } 265 266 /** 267 * The dissimilarity score is the additive inverse of the similarity score 268 * (sum of scores) between two aligned sequences using a substitution model 269 * (Substitution Matrix). The maximum dissimilarity score is taken to be the 270 * maximum similarity score between self-alignments (each sequence against 271 * itself). Calculation of the score is as follows: 272 * 273 * <pre> 274 * Ds = maxScore - sum<sub>i</sub>(M<sub>ai,bi</sub>) 275 * </pre> 276 * 277 * It is recommended to use the method 278 * {@link #fractionalDissimilarityScore(MultipleSequenceAlignment, SubstitutionMatrix)} 279 * , since the maximum similarity score is not relative to the data set, but 280 * relative to the Substitution Matrix, and the score is normalized by the 281 * alignment length (fractional). 282 * <p> 283 * Gaps do not have a contribution to the similarity score calculation (gap 284 * penalty = 0). 285 * 286 * @param msa 287 * MultipleSequenceAlignment 288 * @param M 289 * SubstitutionMatrix for similarity scoring 290 * @return DistanceMatrix 291 */ 292 public static <C extends Sequence<D>, D extends Compound> DistanceMatrix dissimilarityScore( 293 MultipleSequenceAlignment<C, D> msa, SubstitutionMatrix<D> M) { 294 295 logger.info("{}:{}", "Determing Distances", 0); 296 297 int n = msa.getSize(); 298 String[] sequenceString = new String[n]; 299 for (int i = 0; i < n; i++) { 300 sequenceString[i] = msa.getAlignedSequence(i + 1) 301 .getSequenceAsString(); 302 } 303 List<C> seqs = msa.getAlignedSequences(); 304 305 DistanceMatrix DM = new BasicSymmetricalDistanceMatrix(n); 306 int totalloopcount = (n / 2) * (n + 1); 307 308 double maxscore = 0; 309 int end = msa.getLength(); 310 int loopcount = 0; 311 312 for (int i = 0; i < (n - 1); i++) { 313 logger.info("{}:{}", "Determining Distances", (loopcount * 100) 314 / totalloopcount); 315 316 // Obtain the similarity scores 317 for (int j = i; j < n; j++) { 318 319 double score = 0; 320 loopcount++; 321 322 for (int k = 0; k < end; k++) { 323 if (Comparison.isGap(sequenceString[i].charAt(k)) 324 || Comparison.isGap(sequenceString[j].charAt(k))) 325 continue; 326 score += M.getValue(seqs.get(i).getCompoundAt(k + 1), seqs 327 .get(j).getCompoundAt(k + 1)); 328 } 329 330 if (i != j){ 331 score = Math.max(score, 0.0); 332 DM.setValue(i, j, score); 333 } 334 335 if (score > maxscore) 336 maxscore = score; 337 } 338 } 339 340 for (int i = 0; i < n; i++) { 341 DM.setIdentifier(i, msa.getAlignedSequence(i + 1).getAccession() 342 .getID()); 343 344 for (int j = i; j < n; j++) { 345 if (i == j) 346 DM.setValue(i, j, 0.0); 347 else { 348 double dS = Math.max(maxscore - DM.getValue(i, j), 0); 349 DM.setValue(i, j, dS); 350 DM.setValue(j, i, dS); 351 } 352 } 353 } 354 355 logger.info("{}:{}", "Determining Distances", 100); 356 return DM; 357 } 358 359 /** 360 * The PAM (Point Accepted Mutations) distance is a measure of evolutionary 361 * distance in protein sequences. The PAM unit represents an average 362 * substitution rate of 1% per site. The fractional dissimilarity (D) of two 363 * aligned sequences is related with the PAM distance (d) by the equation: 364 * 365 * <pre> 366 * D = sum(fi * (1 - M<sub>ii</sub><sup>d</sup>)) 367 * </pre> 368 * 369 * Where the sum is for all 20 AA, fi denotes the natural fraction of the 370 * given AA and M is the substitution matrix (in this case the PAM1 matrix). 371 * <p> 372 * To calculate the PAM distance between two aligned sequences the maximum 373 * likelihood (ML) approach is used, which consists in finding d that 374 * maximazies the function: 375 * 376 * <pre> 377 * L(d) = product(f<sub>ai</sub> * (1 - M<sub>ai,bi</sub><sup>d</sup>)) 378 * </pre> 379 * 380 * Where the product is for every position i in the alignment, and ai and bi 381 * are the AA at position i in the first and second aligned sequences, 382 * respectively. 383 * 384 * @param msa 385 * MultipleSequenceAlignment 386 * @return 387 */ 388 public static <C extends Sequence<D>, D extends Compound> DistanceMatrix pamMLdistance( 389 MultipleSequenceAlignment<C, D> msa) { 390 391 // Need to import PAM1 matrix to biojava TODO 392 //SubstitutionMatrix<AminoAcidCompound> PAM1 = SubstitutionMatrixHelper.getPAM250(); 393 394 throw new IllegalStateException("PAM ML distance calculation not implemented!"); 395 } 396 397 /** 398 * The structural distance (d<sub>S</sub>) uses the structural similarity 399 * (or dissimilarity) from a the structural alignment of two protein 400 * strutures. It is based on the diffusive model for protein fold evolution 401 * (Grishin 1995). The structural deviations are captured as RMS deviations. 402 * 403 * <pre> 404 * d<sub>Sij</sub> = (rmsd<sub>max</sub><sup>2</sup> / alpha<sup>2</sup>) * 405 * ln( (rmsd<sub>max</sub><sup>2</sup> - rmsd<sub>0</sub><sup>2</sup>) / 406 * (rmsd<sub>max</sub><sup>2</sup> - (rmsd<sub>ij</sub><sup>2</sup>) ) 407 * </pre> 408 * 409 * @param rmsdMat 410 * RMSD matrix for all structure pairs (symmetric matrix) 411 * @param alpha 412 * change in CA positions introduced by a single AA substitution 413 * (Grishin 1995: 1 A) 414 * @param rmsdMax 415 * estimated RMSD between proteins of the same fold when the 416 * percentage of identity is infinitely low (the maximum allowed 417 * RMSD of proteins with the same fold). (Grishin 1995: 5 A) 418 * @param rmsd0 419 * arithmetical mean of squares of the RMSD for identical 420 * proteins (Grishin 1995: 0.4 A) 421 * @return DistanceMatrix 422 */ 423 public static <C extends Sequence<D>, D extends Compound> DistanceMatrix structuralDistance( 424 double[][] rmsdMat, double alpha, double rmsdMax, double rmsd0) { 425 426 int n = rmsdMat.length; 427 DistanceMatrix DM = new BasicSymmetricalDistanceMatrix(n); 428 429 // Transform raw RMSD into distances and create matrix 430 for (int i = 0; i < n; i++) { 431 for (int j = i; j < n; j++) { 432 if (i == j) 433 DM.setValue(i, j, 0.0); 434 else { 435 double d = (rmsdMax * rmsdMax) 436 / (alpha * alpha) 437 * Math.log((rmsdMax * rmsdMax - rmsd0 * rmsd0) 438 / (rmsdMax * rmsdMax - rmsdMat[i][j] 439 * rmsdMat[i][j])); 440 d = Math.max(d, 0.0); 441 DM.setValue(i, j, d); 442 DM.setValue(j, i, d); 443 } 444 } 445 } 446 447 return DM; 448 } 449 450}