001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.structure.align.multiple.util;
022
023import java.util.ArrayList;
024import java.util.Arrays;
025import java.util.List;
026
027import org.biojava.nbio.structure.Atom;
028import org.biojava.nbio.structure.Calc;
029import org.biojava.nbio.structure.StructureException;
030import org.biojava.nbio.structure.align.multiple.MultipleAlignment;
031import org.biojava.nbio.structure.jama.Matrix;
032
033/**
034 * Utility class for calculating common scores of {@link MultipleAlignment}s.
035 *
036 * @author Spencer Bliven
037 * @author Aleix Lafita
038 * @since 4.1.0
039 *
040 */
041public class MultipleAlignmentScorer {
042
043        // Names for commonly used scores
044        public static final String PROBABILITY = "Probability";
045        public static final String CE_SCORE = "CE-score";
046        public static final String RMSD = "RMSD";
047        public static final String AVGTM_SCORE = "AvgTM-score";
048        public static final String MC_SCORE = "MC-score";
049        public static final String REF_RMSD = "Ref-RMSD";
050        public static final String REFTM_SCORE = "RefTM-score";
051
052        /**
053         * Calculates and puts the RMSD and the average TM-Score of the
054         * MultipleAlignment.
055         *
056         * @param alignment
057         * @throws StructureException
058         * @see #getAvgTMScore(MultipleAlignment)
059         * @see #getRMSD(MultipleAlignment)
060         */
061        public static void calculateScores(MultipleAlignment alignment)
062                        throws StructureException {
063
064                // Put RMSD
065                List<Atom[]> trans = MultipleAlignmentTools.transformAtoms(alignment);
066                alignment.putScore(RMSD, getRMSD(trans));
067
068                // Put AvgTM-Score
069                List<Integer> lengths = new ArrayList<Integer>(alignment.size());
070                for (Atom[] atoms : alignment.getAtomArrays()) {
071                        lengths.add(atoms.length);
072                }
073                alignment.putScore(AVGTM_SCORE, getAvgTMScore(trans, lengths));
074        }
075
076        /**
077         * Calculates the RMSD of all-to-all structure comparisons (distances) of
078         * the given MultipleAlignment.
079         * <p>
080         * Complexity: T(n,l) = O(l*n^2), if n=number of structures and l=alignment
081         * length.
082         * <p>
083         * The formula used is just the sqroot of the average distance of all
084         * possible pairs of atoms. Thus, for R structures aligned over C columns
085         * without gaps, we have
086         *
087         * <pre>
088         * RMSD = \sqrt{1/(C*(R*(R-1)/2)) * \sum_{r1=1}^{R-1}
089         * \sum_{r2=r1+1}^{R-1} \sum_{j=0}^{C-1} (atom[r1][c]-atom[r2][c])^2}
090         * </pre>
091         *
092         * @param alignment
093         * @return double RMSD
094         */
095        public static double getRMSD(MultipleAlignment alignment) {
096                List<Atom[]> trans = MultipleAlignmentTools.transformAtoms(alignment);
097                return getRMSD(trans);
098        }
099
100        /**
101         * Calculates the RMSD of all-to-all structure comparisons (distances),
102         * given a set of superimposed atoms.
103         *
104         * @param transformed
105         * @return double RMSD
106         * @see #getRMSD(MultipleAlignment)
107         */
108        public static double getRMSD(List<Atom[]> transformed) {
109
110                double sumSqDist = 0;
111                int comparisons = 0;
112
113                for (int r1 = 0; r1 < transformed.size(); r1++) {
114                        for (int c = 0; c < transformed.get(r1).length; c++) {
115                                Atom refAtom = transformed.get(r1)[c];
116                                if (refAtom == null)
117                                        continue;
118
119                                double nonNullSqDist = 0;
120                                int nonNullLength = 0;
121                                for (int r2 = r1 + 1; r2 < transformed.size(); r2++) {
122                                        Atom atom = transformed.get(r2)[c];
123                                        if (atom != null) {
124                                                nonNullSqDist += Calc.getDistanceFast(refAtom, atom);
125                                                nonNullLength++;
126                                        }
127                                }
128
129                                if (nonNullLength > 0) {
130                                        comparisons++;
131                                        sumSqDist += nonNullSqDist / nonNullLength;
132                                }
133                        }
134                }
135                return Math.sqrt(sumSqDist / comparisons);
136        }
137
138        /**
139         * /** Calculates the average RMSD from all structures to a reference s
140         * tructure, given a set of superimposed atoms.
141         * <p>
142         * Complexity: T(n,l) = O(l*n), if n=number of structures and l=alignment
143         * length.
144         * <p>
145         * For ungapped alignments, this is just the sqroot of the average distance
146         * from an atom to the aligned atom from the reference. Thus, for R
147         * structures aligned over C columns (with structure 0 as the reference), we
148         * have:
149         *
150         * <pre>
151         * RefRMSD = \sqrt{ 1/(C*(R-1)) * \sum_{r=1}^{R-1} \sum_{j=0}^{C-1}
152         * (atom[1][c]-atom[r][c])^2 }
153         * </pre>
154         * <p>
155         * For gapped alignments, null atoms are omitted from consideration, so that
156         * the RMSD is the average over all columns with non-null reference of the
157         * average RMSD within the non-null elements of the column.
158         *
159         * @param alignment
160         *            MultipleAlignment
161         * @param ref
162         *            reference structure index
163         * @return
164         */
165        public static double getRefRMSD(MultipleAlignment alignment, int ref) {
166                List<Atom[]> trans = MultipleAlignmentTools.transformAtoms(alignment);
167                return getRefRMSD(trans, ref);
168        }
169
170        /**
171         * Calculates the average RMSD from all structures to a reference s
172         * tructure, given a set of superimposed atoms.
173         * <p>
174         * Complexity: T(n,l) = O(l*n), if n=number of structures and l=alignment
175         * length.
176         * <p>
177         * For ungapped alignments, this is just the sqroot of the average distance
178         * from an atom to the aligned atom from the reference. Thus, for R
179         * structures aligned over C columns (with structure 0 as the reference), we
180         * have:
181         *
182         * <pre>
183         * RefRMSD = \sqrt{ 1/(C*(R-1)) * \sum_{r=1}^{R-1} \sum_{j=0}^{C-1}
184         * (atom[1][c]-atom[r][c])^2 }
185         * </pre>
186         * <p>
187         * For gapped alignments, null atoms are omitted from consideration, so that
188         * the RMSD is the average over all columns with non-null reference of the
189         * average RMSD within the non-null elements of the column.
190         *
191         * @param transformed
192         * @param reference
193         * @return
194         */
195        public static double getRefRMSD(List<Atom[]> transformed, int reference) {
196
197                double sumSqDist = 0;
198                int totalLength = 0;
199                for (int c = 0; c < transformed.get(reference).length; c++) {
200                        Atom refAtom = transformed.get(reference)[c];
201                        if (refAtom == null)
202                                continue;
203
204                        double nonNullSqDist = 0;
205                        int nonNullLength = 0;
206                        for (int r = 0; r < transformed.size(); r++) {
207                                if (r == reference)
208                                        continue;
209                                Atom atom = transformed.get(r)[c];
210                                if (atom != null) {
211                                        nonNullSqDist += Calc.getDistanceFast(refAtom, atom);
212                                        nonNullLength++;
213                                }
214                        }
215
216                        if (nonNullLength > 0) {
217                                totalLength++;
218                                sumSqDist += nonNullSqDist / nonNullLength;
219                        }
220                }
221                return Math.sqrt(sumSqDist / totalLength);
222        }
223
224        /**
225         * Calculates the average TMScore of all the possible pairwise structure
226         * comparisons of the given alignment.
227         * <p>
228         * Complexity: T(n,l) = O(l*n^2), if n=number of structures and l=alignment
229         * length.
230         *
231         * @param alignment
232         * @return double Average TMscore
233         * @throws StructureException
234         */
235        public static double getAvgTMScore(MultipleAlignment alignment)
236                        throws StructureException {
237
238                List<Atom[]> trans = MultipleAlignmentTools.transformAtoms(alignment);
239
240                List<Integer> lengths = new ArrayList<Integer>(alignment.size());
241                for (Atom[] atoms : alignment.getAtomArrays()) {
242                        lengths.add(atoms.length);
243                }
244                return getAvgTMScore(trans, lengths);
245        }
246
247        /**
248         * Calculates the average TMScore all the possible pairwise structure
249         * comparisons of the given a set of superimposed Atoms and the original
250         * structure lengths.
251         * <p>
252         * Complexity: T(n,l) = O(l*n^2), if n=number of structures and l=alignment
253         * length.
254         *
255         * @param transformed
256         *            aligned Atoms transformed
257         * @param lengths
258         *            lengths of the structures in residue number
259         * @return double Average TMscore
260         * @throws StructureException
261         */
262        public static double getAvgTMScore(List<Atom[]> transformed,
263                        List<Integer> lengths) throws StructureException {
264
265                if (transformed.size() != lengths.size())
266                        throw new IllegalArgumentException("Input sizes differ.");
267
268                double sumTM = 0;
269                int comparisons = 0;
270
271                for (int r1 = 0; r1 < transformed.size(); r1++) {
272                        for (int r2 = r1 + 1; r2 < transformed.size(); r2++) {
273                                int len = transformed.get(r1).length;
274                                // Remove nulls from both arrays
275                                Atom[] ref = new Atom[len];
276                                Atom[] aln = new Atom[len];
277                                int nonNullLen = 0;
278                                for (int c = 0; c < len; c++) {
279                                        if (transformed.get(r1)[c] != null
280                                                        && transformed.get(r2)[c] != null) {
281                                                ref[nonNullLen] = transformed.get(r1)[c];
282                                                aln[nonNullLen] = transformed.get(r2)[c];
283                                                nonNullLen++;
284                                        }
285                                }
286                                // Truncate nulls
287                                if (nonNullLen < len) {
288                                        ref = Arrays.copyOf(ref, nonNullLen);
289                                        aln = Arrays.copyOf(aln, nonNullLen);
290                                }
291                                sumTM += Calc.getTMScore(ref, aln, lengths.get(r1),
292                                                lengths.get(r2));
293                                comparisons++;
294                        }
295                }
296                return sumTM / comparisons;
297        }
298
299        /**
300         * Calculates the average TMScore from all structures to a reference
301         * structure, given a set of superimposed atoms.
302         * <p>
303         * Complexity: T(n,l) = O(l*n), if n=number of structures and l=alignment
304         * length.
305         *
306         * @param alignment
307         * @param reference
308         *            Index of the reference structure
309         * @return
310         * @throws StructureException
311         */
312        public static double getRefTMScore(MultipleAlignment alignment, int ref)
313                        throws StructureException {
314
315                List<Atom[]> trans = MultipleAlignmentTools.transformAtoms(alignment);
316
317                List<Integer> lengths = new ArrayList<Integer>(alignment.size());
318                for (Atom[] atoms : alignment.getAtomArrays()) {
319                        lengths.add(atoms.length);
320                }
321                return getRefTMScore(trans, lengths, ref);
322        }
323
324        /**
325         * Calculates the average TMScore from all structures to a reference
326         * structure, given a set of superimposed atoms.
327         * <p>
328         * Complexity: T(n,l) = O(l*n^2), if n=number of structures and l=alignment
329         * length.
330         *
331         * @param transformed
332         *            Arrays of aligned atoms, after superposition
333         * @param lengths
334         *            lengths of the full input structures
335         * @param reference
336         *            Index of the reference structure
337         * @return
338         * @throws StructureException
339         */
340        public static double getRefTMScore(List<Atom[]> transformed,
341                        List<Integer> lengths, int reference) throws StructureException {
342
343                if (transformed.size() != lengths.size())
344                        throw new IllegalArgumentException("Input sizes differ");
345
346                double sumTM = 0;
347                int comparisons = 0;
348
349                int len = transformed.get(reference).length;
350                for (int r = 0; r < transformed.size(); r++) {
351                        if (r == reference)
352                                continue;
353                        // remove nulls from both arrays
354                        Atom[] ref = new Atom[len];
355                        Atom[] aln = new Atom[len];
356                        int nonNullLen = 0;
357                        for (int c = 0; c < len; c++) {
358                                if (transformed.get(reference)[c] != null
359                                                && transformed.get(r)[c] != null) {
360                                        ref[nonNullLen] = transformed.get(reference)[c];
361                                        aln[nonNullLen] = transformed.get(r)[c];
362                                        nonNullLen++;
363                                }
364                        }
365                        // truncate nulls
366                        if (nonNullLen < len) {
367                                ref = Arrays.copyOf(ref, nonNullLen);
368                                aln = Arrays.copyOf(aln, nonNullLen);
369                        }
370                        sumTM += Calc.getTMScore(ref, aln,
371                                        lengths.get(reference), lengths.get(r));
372                        comparisons++;
373                }
374                return sumTM / comparisons;
375        }
376
377        /**
378         * Calculates the MC score, specific for the MultipleAlignment algorithm.
379         * The score function is modified from the original CEMC paper, making it
380         * continuous and differentiable.
381         * <p>
382         * The maximum score of a match is 20, and the penalties for gaps are part
383         * of the input. The half-score distance, d0, is chosen as in the TM-score.
384         * <p>
385         * Complexity: T(n,l) = O(l*n^2), if n=number of structures and l=alignment
386         * length.
387         *
388         * @param alignment
389         * @param gapOpen
390         *            penalty for gap opening (reasonable values are in the range
391         *            (1.0-20.0)
392         * @param gapExtension
393         *            penalty for extending a gap (reasonable values are in the
394         *            range (0.5-10.0)
395         * @param dCutoff
396         *            the distance cutoff
397         * @return the value of the score
398         * @throws StructureException
399         */
400        public static double getMCScore(MultipleAlignment alignment,
401                        double gapOpen, double gapExtension, double dCutoff)
402                        throws StructureException {
403
404                List<Atom[]> trans = MultipleAlignmentTools.transformAtoms(alignment);
405
406                // Calculate d0: same as the one in TM score
407                int minLen = Integer.MAX_VALUE;
408                for (Atom[] atoms : alignment.getAtomArrays())
409                        if (atoms.length < minLen)
410                                minLen = atoms.length;
411                double d0 = 1.24 * Math.cbrt((minLen) - 15.) - 1.8;
412
413                // Calculate the distance cutoff penalization
414                double A = 20.0 / (1 + (dCutoff * dCutoff) / (d0 * d0));
415
416                return getMCScore(trans, d0, gapOpen, gapExtension, A);
417        }
418
419        /**
420         * Calculates the MC score, specific for the MultipleAlignment algorithm.
421         * The score function is modified from the original CEMC paper, making it
422         * continuous and differentiable.
423         * <p>
424         * The maximum score of a match is 20, and the penalties for gaps are part
425         * of the input.
426         * <p>
427         * Complexity: T(n,l) = O(l*n^2), if n=number of structures and l=alignment
428         * length.
429         *
430         * @param transformed
431         *            List of transformed Atom arrays
432         * @param d0
433         *            parameter for the half-score distance
434         * @param gapOpen
435         *            penalty for gap opening (reasonable values are in the range
436         *            (1.0-20.0)
437         * @param gapExtension
438         *            penalty for extending a gap (reasonable values are in the
439         *            range (0.5-10.0)
440         * @param A
441         *            the distance cutoff penalization
442         * @return the value of the score
443         * @throws StructureException
444         */
445        private static double getMCScore(List<Atom[]> trans, double d0,
446                        double gapOpen, double gapExtension, double A)
447                        throws StructureException {
448
449                int size = trans.size();
450                int length = trans.get(0).length;
451                Matrix residueDistances = new Matrix(size, length, -1);
452                double scoreMC = 0.0;
453                int openGaps = 0;
454                int extensionGaps = 0;
455
456                // Calculate the average residue distances
457                for (int r1 = 0; r1 < size; r1++) {
458                        boolean gapped = false;
459                        for (int c = 0; c < trans.get(r1).length; c++) {
460                                Atom refAtom = trans.get(r1)[c];
461                                // Calculate the gap extension and opening on the fly
462                                if (refAtom == null) {
463                                        if (gapped)
464                                                extensionGaps++;
465                                        else {
466                                                gapped = true;
467                                                openGaps++;
468                                        }
469                                        continue;
470                                } else
471                                        gapped = false;
472
473                                for (int r2 = r1 + 1; r2 < size; r2++) {
474                                        Atom atom = trans.get(r2)[c];
475                                        if (atom != null) {
476                                                double distance = Calc.getDistance(refAtom, atom);
477                                                if (residueDistances.get(r1, c) == -1) {
478                                                        residueDistances.set(r1, c, 1 + distance);
479                                                } else {
480                                                        residueDistances.set(r1, c,
481                                                                        residueDistances.get(r1, c) + distance);
482                                                }
483                                                if (residueDistances.get(r2, c) == -1) {
484                                                        residueDistances.set(r2, c, 1 + distance);
485                                                } else {
486                                                        residueDistances.set(r2, c,
487                                                                        residueDistances.get(r2, c) + distance);
488                                                }
489                                        }
490                                }
491                        }
492                }
493
494                for (int c = 0; c < length; c++) {
495                        int nonNullRes = 0;
496                        for (int r = 0; r < size; r++) {
497                                if (residueDistances.get(r, c) != -1)
498                                        nonNullRes++;
499                        }
500                        for (int r = 0; r < size; r++) {
501                                if (residueDistances.get(r, c) != -1) {
502                                        residueDistances.set(r, c, residueDistances.get(r, c)
503                                                        / nonNullRes);
504                                }
505                        }
506                }
507
508                // Sum all the aligned residue scores
509                for (int row = 0; row < size; row++) {
510                        for (int col = 0; col < length; col++) {
511                                if (residueDistances.get(row, col) == -1)
512                                        continue;
513                                double d1 = residueDistances.get(row, col);
514                                double resScore = 20.0 / (1 + (d1 * d1) / (d0 * d0));
515                                scoreMC += resScore - A;
516                        }
517                }
518
519                // Apply the Gap penalty and return
520                return scoreMC - (openGaps * gapOpen + extensionGaps * gapExtension);
521        }
522}