Source code

001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.phylo;
022
023import java.io.IOException;
024import java.util.List;
025import org.biojava.nbio.core.alignment.template.SubstitutionMatrix;
026import org.biojava.nbio.core.sequence.MultipleSequenceAlignment;
027import org.biojava.nbio.core.sequence.template.Compound;
028import org.biojava.nbio.core.sequence.template.Sequence;
029import org.forester.evoinference.distance.PairwiseDistanceCalculator;
030import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
031import org.forester.evoinference.matrix.distance.DistanceMatrix;
032import org.forester.msa.Msa;
033import org.slf4j.Logger;
034import org.slf4j.LoggerFactory;
035
036/**
037 * The DistanceMatrixCalculator methods generate a {@link DistanceMatrix} from a
038 * {@link MultipleSequenceAlignment} or other indirect distance infomation (RMSD).
039 *
040 * @author Aleix Lafita
041 * @since 4.1.1
042 *
043 */
044public class DistanceMatrixCalculator {
045
046        private static final Logger logger = LoggerFactory
047                        .getLogger(DistanceMatrixCalculator.class);
048
049        /** Prevent instantiation */
050        private DistanceMatrixCalculator() {}
051
052        /**
053         * The fractional dissimilarity (D) is defined as the percentage of sites
054         * that differ between two aligned sequences. The percentage of identity
055         * (PID) is the fraction of identical sites between two aligned sequences.
056         *
057         * <pre>
058         * D = 1 - PID
059         * </pre>
060         *
061         * The gapped positons in the alignment are ignored in the calculation. This
062         * method is a wrapper to the forester implementation of the calculation:
063         * {@link PairwiseDistanceCalculator#calcFractionalDissimilarities(Msa)}
064         *
065         * @param msa
066         *            MultipleSequenceAlignment
067         * @return DistanceMatrix
068         * @throws Exception
069         */
070        public static <C extends Sequence<D>, D extends Compound> DistanceMatrix fractionalDissimilarity(
071                        MultipleSequenceAlignment<C, D> msa) throws IOException {
072
073                Msa fMsa = ForesterWrapper.convert(msa);
074                DistanceMatrix DM = PairwiseDistanceCalculator
075                                .calcFractionalDissimilarities(fMsa);
076
077                return DM;
078        }
079
080        /**
081         * The Poisson (correction) evolutionary distance (d) is a function of the
082         * fractional dissimilarity (D), given by:
083         *
084         * <pre>
085         * d = -log(1 - D)
086         * </pre>
087         *
088         * The gapped positons in the alignment are ignored in the calculation. This
089         * method is a wrapper to the forester implementation of the calculation:
090         * {@link PairwiseDistanceCalculator#calcPoissonDistances(Msa)}
091         *
092         * @param msa
093         *            MultipleSequenceAlignment
094         * @return DistanceMatrix
095         * @throws IOException
096         */
097        public static <C extends Sequence<D>, D extends Compound> DistanceMatrix poissonDistance(
098                        MultipleSequenceAlignment<C, D> msa) throws IOException {
099
100                Msa fMsa = ForesterWrapper.convert(msa);
101                DistanceMatrix DM = PairwiseDistanceCalculator
102                                .calcPoissonDistances(fMsa);
103
104                return DM;
105        }
106
107        /**
108         * The Kimura evolutionary distance (d) is a correction of the fractional
109         * dissimilarity (D) specially needed for large evolutionary distances. It
110         * is given by:
111         *
112         * <pre>
113         * d = -log(1 - D - 0.2 * D<sup>2</sup>)
114         * </pre>
115         *
116         * The equation is derived by fitting the relationship between the
117         * evolutionary distance (d) and the fractional dissimilarity (D) according
118         * to the PAM model of evolution (it is an empirical approximation for the
119         * method {@link #pamDistance(MultipleSequenceAlignment}). The gapped
120         * positons in the alignment are ignored in the calculation. This method is
121         * a wrapper to the forester implementation of the calculation:
122         * {@link PairwiseDistanceCalculator#calcKimuraDistances(Msa)}.
123         *
124         * @param msa
125         *            MultipleSequenceAlignment
126         * @return DistanceMatrix
127         * @throws IOException
128         */
129        public static <C extends Sequence<D>, D extends Compound> DistanceMatrix kimuraDistance(
130                        MultipleSequenceAlignment<C, D> msa) throws IOException {
131
132                Msa fMsa = ForesterWrapper.convert(msa);
133                DistanceMatrix DM = PairwiseDistanceCalculator
134                                .calcPoissonDistances(fMsa);
135
136                return DM;
137        }
138
139        /**
140         * BioJava implementation for percentage of identity (PID). Although the
141         * name of the method is percentage of identity, the DistanceMatrix contains
142         * the fractional dissimilarity (D), computed as D = 1 - PID.
143         * <p>
144         * It is recommended to use the method
145         * {@link DistanceMatrixCalculator#fractionalDissimilarity(MultipleSequenceAlignment)}
146         * instead of this one.
147         *
148         * @param msa
149         *            MultipleSequenceAlignment
150         * @return DistanceMatrix
151         */
152        public static <C extends Sequence<D>, D extends Compound> DistanceMatrix percentageIdentity(
153                        MultipleSequenceAlignment<C, D> msa) {
154
155                logger.info("{}:{}", "Determing Distances", 0);
156                int n = msa.getSize();
157                String[] sequenceString = new String[n];
158                for (int i = 0; i < n; i++) {
159                        sequenceString[i] = msa.getAlignedSequence(i + 1)
160                                        .getSequenceAsString();
161                }
162
163                DistanceMatrix distance = new BasicSymmetricalDistanceMatrix(n);
164                int totalloopcount = (n / 2) * (n + 1);
165
166                int loopcount = 0;
167                for (int i = 0; i < (n - 1); i++) {
168                        logger.info("{}:{}", "Determining Distances", (loopcount * 100)
169                                        / totalloopcount);
170                        distance.setIdentifier(i, msa.getAlignedSequence(i + 1)
171                                        .getAccession().getID());
172
173                        for (int j = i; j < n; j++) {
174                                loopcount++;
175                                if (j == i) {
176                                        distance.setValue(i, j, 0);
177                                } else {
178                                        distance.setValue(i, j, 100 - Comparison.PID(
179                                                        sequenceString[i], sequenceString[j]));
180                                        distance.setValue(j, i, distance.getValue(i, j));
181                                }
182                        }
183                }
184                logger.info("{}:{}", "Determining Distances", 100);
185
186                return distance;
187        }
188
189        /**
190         * The fractional dissimilarity score (Ds) is a relative measure of the
191         * dissimilarity between two aligned sequences. It is calculated as:
192         *
193         * <pre>
194         * Ds = sum( max(M) - M<sub>ai,bi</sub> ) / (max(M)-min(M)) ) / L
195         * </pre>
196         *
197         * Where the sum through i runs for all the alignment positions, ai and bi
198         * are the AA at position i in the first and second aligned sequences,
199         * respectively, and L is the total length of the alignment (normalization).
200         * <p>
201         * The fractional dissimilarity score (Ds) is in the interval [0, 1], where
202         * 0 means that the sequences are identical and 1 that the sequences are
203         * completely different.
204         * <p>
205         * Gaps do not have a contribution to the similarity score calculation (gap
206         * penalty = 0)
207         *
208         * @param msa
209         *            MultipleSequenceAlignment
210         * @param M
211         *            SubstitutionMatrix for similarity scoring
212         * @return DistanceMatrix
213         */
214        public static <C extends Sequence<D>, D extends Compound> DistanceMatrix fractionalDissimilarityScore(
215                        MultipleSequenceAlignment<C, D> msa, SubstitutionMatrix<D> M) {
216
217                // Calculate the similarity scores using the alignment package
218                logger.info("{}:{}", "Determing Distances", 0);
219
220                int n = msa.getSize();
221                DistanceMatrix DM = new BasicSymmetricalDistanceMatrix(n);
222                int totalloopcount = (n / 2) * (n + 1);
223                int end = msa.getLength();
224
225                String[] sequenceString = new String[n];
226                for (int i = 0; i < n; i++) {
227                        sequenceString[i] = msa.getAlignedSequence(i + 1)
228                                        .getSequenceAsString();
229                }
230                List<C> seqs = msa.getAlignedSequences();
231
232                int loopcount = 0;
233                for (int i = 0; i < (n - 1); i++) {
234                        logger.info("{}:{}", "Determining Distances", (loopcount * 100)
235                                        / totalloopcount);
236
237                        // Obtain the similarity scores
238                        for (int j = i; j < n; j++) {
239
240                                double score = 0;
241                                loopcount++;
242
243                                for (int k = 0; k < end; k++) {
244                                        if (Comparison.isGap(sequenceString[i].charAt(k))
245                                                        || Comparison.isGap(sequenceString[j].charAt(k)))
246                                                continue;
247                                        score += M.getValue(seqs.get(i).getCompoundAt(k + 1), seqs
248                                                        .get(j).getCompoundAt(k + 1));
249                                }
250
251                                if (i == j)
252                                        DM.setValue(i, j, 0.0);
253                                else {
254                                        double dS = (M.getMaxValue() - score / msa.getLength())
255                                                        / (M.getMaxValue() - M.getMinValue());
256
257                                        DM.setValue(i, j, dS);
258                                        DM.setValue(j, i, dS);
259                                }
260                        }
261                }
262
263                return DM;
264        }
265
266        /**
267         * The dissimilarity score is the additive inverse of the similarity score
268         * (sum of scores) between two aligned sequences using a substitution model
269         * (Substitution Matrix). The maximum dissimilarity score is taken to be the
270         * maximum similarity score between self-alignments (each sequence against
271         * itself). Calculation of the score is as follows:
272         *
273         * <pre>
274         * Ds = maxScore - sum<sub>i</sub>(M<sub>ai,bi</sub>)
275         * </pre>
276         *
277         * It is recommended to use the method
278         * {@link #fractionalDissimilarityScore(MultipleSequenceAlignment, SubstitutionMatrix)}
279         * , since the maximum similarity score is not relative to the data set, but
280         * relative to the Substitution Matrix, and the score is normalized by the
281         * alignment length (fractional).
282         * <p>
283         * Gaps do not have a contribution to the similarity score calculation (gap
284         * penalty = 0).
285         *
286         * @param msa
287         *            MultipleSequenceAlignment
288         * @param M
289         *            SubstitutionMatrix for similarity scoring
290         * @return DistanceMatrix
291         */
292        public static <C extends Sequence<D>, D extends Compound> DistanceMatrix dissimilarityScore(
293                        MultipleSequenceAlignment<C, D> msa, SubstitutionMatrix<D> M) {
294
295                logger.info("{}:{}", "Determing Distances", 0);
296
297                int n = msa.getSize();
298                String[] sequenceString = new String[n];
299                for (int i = 0; i < n; i++) {
300                        sequenceString[i] = msa.getAlignedSequence(i + 1)
301                                        .getSequenceAsString();
302                }
303                List<C> seqs = msa.getAlignedSequences();
304
305                DistanceMatrix DM = new BasicSymmetricalDistanceMatrix(n);
306                int totalloopcount = (n / 2) * (n + 1);
307
308                double maxscore = 0;
309                int end = msa.getLength();
310                int loopcount = 0;
311
312                for (int i = 0; i < (n - 1); i++) {
313                        logger.info("{}:{}", "Determining Distances", (loopcount * 100)
314                                        / totalloopcount);
315
316                        // Obtain the similarity scores
317                        for (int j = i; j < n; j++) {
318
319                                double score = 0;
320                                loopcount++;
321
322                                for (int k = 0; k < end; k++) {
323                                        if (Comparison.isGap(sequenceString[i].charAt(k))
324                                                        || Comparison.isGap(sequenceString[j].charAt(k)))
325                                                continue;
326                                        score += M.getValue(seqs.get(i).getCompoundAt(k + 1), seqs
327                                                        .get(j).getCompoundAt(k + 1));
328                                }
329
330                                if (i != j){
331                                        score = Math.max(score, 0.0);
332                                        DM.setValue(i, j, score);
333                                }
334
335                                if (score > maxscore)
336                                        maxscore = score;
337                        }
338                }
339
340                for (int i = 0; i < n; i++) {
341                        DM.setIdentifier(i, msa.getAlignedSequence(i + 1).getAccession()
342                                        .getID());
343
344                        for (int j = i; j < n; j++) {
345                                if (i == j)
346                                        DM.setValue(i, j, 0.0);
347                                else {
348                                        double dS = Math.max(maxscore - DM.getValue(i, j), 0);
349                                        DM.setValue(i, j, dS);
350                                        DM.setValue(j, i, dS);
351                                }
352                        }
353                }
354
355                logger.info("{}:{}", "Determining Distances", 100);
356                return DM;
357        }
358
359        /**
360         * The PAM (Point Accepted Mutations) distance is a measure of evolutionary
361         * distance in protein sequences. The PAM unit represents an average
362         * substitution rate of 1% per site. The fractional dissimilarity (D) of two
363         * aligned sequences is related with the PAM distance (d) by the equation:
364         *
365         * <pre>
366         * D = sum(fi * (1 - M<sub>ii</sub><sup>d</sup>))
367         * </pre>
368         *
369         * Where the sum is for all 20 AA, fi denotes the natural fraction of the
370         * given AA and M is the substitution matrix (in this case the PAM1 matrix).
371         * <p>
372         * To calculate the PAM distance between two aligned sequences the maximum
373         * likelihood (ML) approach is used, which consists in finding d that
374         * maximazies the function:
375         *
376         * <pre>
377         * L(d) = product(f<sub>ai</sub> * (1 - M<sub>ai,bi</sub><sup>d</sup>))
378         * </pre>
379         *
380         * Where the product is for every position i in the alignment, and ai and bi
381         * are the AA at position i in the first and second aligned sequences,
382         * respectively.
383         *
384         * @param msa
385         *            MultipleSequenceAlignment
386         * @return
387         */
388        public static <C extends Sequence<D>, D extends Compound> DistanceMatrix pamMLdistance(
389                        MultipleSequenceAlignment<C, D> msa) {
390
391                // Need to import PAM1 matrix to biojava TODO
392                //SubstitutionMatrix<AminoAcidCompound> PAM1 = SubstitutionMatrixHelper.getPAM250();
393
394                throw new IllegalStateException("PAM ML distance calculation not implemented!");
395        }
396
397        /**
398         * The structural distance (d<sub>S</sub>) uses the structural similarity
399         * (or dissimilarity) from a the structural alignment of two protein
400         * strutures. It is based on the diffusive model for protein fold evolution
401         * (Grishin 1995). The structural deviations are captured as RMS deviations.
402         *
403         * <pre>
404         * d<sub>Sij</sub> = (rmsd<sub>max</sub><sup>2</sup> / alpha<sup>2</sup>) *
405         *        ln( (rmsd<sub>max</sub><sup>2</sup> - rmsd<sub>0</sub><sup>2</sup>) /
406         *        (rmsd<sub>max</sub><sup>2</sup> - (rmsd<sub>ij</sub><sup>2</sup>) )
407         * </pre>
408         *
409         * @param rmsdMat
410         *            RMSD matrix for all structure pairs (symmetric matrix)
411         * @param alpha
412         *            change in CA positions introduced by a single AA substitution
413         *            (Grishin 1995: 1 A)
414         * @param rmsdMax
415         *            estimated RMSD between proteins of the same fold when the
416         *            percentage of identity is infinitely low (the maximum allowed
417         *            RMSD of proteins with the same fold). (Grishin 1995: 5 A)
418         * @param rmsd0
419         *            arithmetical mean of squares of the RMSD for identical
420         *            proteins (Grishin 1995: 0.4 A)
421         * @return DistanceMatrix
422         */
423        public static <C extends Sequence<D>, D extends Compound> DistanceMatrix structuralDistance(
424                        double[][] rmsdMat, double alpha, double rmsdMax, double rmsd0) {
425
426                int n = rmsdMat.length;
427                DistanceMatrix DM = new BasicSymmetricalDistanceMatrix(n);
428
429                // Transform raw RMSD into distances and create matrix
430                for (int i = 0; i < n; i++) {
431                        for (int j = i; j < n; j++) {
432                                if (i == j)
433                                        DM.setValue(i, j, 0.0);
434                                else {
435                                        double d = (rmsdMax * rmsdMax)
436                                                        / (alpha * alpha)
437                                                        * Math.log((rmsdMax * rmsdMax - rmsd0 * rmsd0)
438                                                                        / (rmsdMax * rmsdMax - rmsdMat[i][j]
439                                                                                        * rmsdMat[i][j]));
440                                        d = Math.max(d, 0.0);
441                                        DM.setValue(i, j, d);
442                                        DM.setValue(j, i, d);
443                                }
444                        }
445                }
446
447                return DM;
448        }
449
450}