001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.structure.align.multiple.util;
022
023import java.util.ArrayList;
024import java.util.Arrays;
025import java.util.List;
026
027import org.biojava.nbio.structure.Atom;
028import org.biojava.nbio.structure.Calc;
029import org.biojava.nbio.structure.SVDSuperimposer;
030import org.biojava.nbio.structure.StructureException;
031import org.biojava.nbio.structure.align.multiple.MultipleAlignment;
032import org.biojava.nbio.structure.jama.Matrix;
033
034/**
035 * Utility class for calculating common scores of {@link MultipleAlignment}s.
036 *
037 * @author Spencer Bliven
038 * @author Aleix Lafita
039 * @since 4.1.0
040 *
041 */
042public class MultipleAlignmentScorer {
043
044        // Names for commonly used scores
045        public static final String PROBABILITY = "Probability";
046        public static final String CE_SCORE = "CE-score";
047        public static final String RMSD = "RMSD";
048        public static final String AVGTM_SCORE = "AvgTM-score";
049        public static final String MC_SCORE = "MC-score";
050        public static final String REF_RMSD = "Ref-RMSD";
051        public static final String REFTM_SCORE = "RefTM-score";
052
053        /**
054         * Calculates and puts the RMSD and the average TM-Score of the
055         * MultipleAlignment.
056         *
057         * @param alignment
058         * @throws StructureException
059         * @see #getAvgTMScore(MultipleAlignment)
060         * @see #getRMSD(MultipleAlignment)
061         */
062        public static void calculateScores(MultipleAlignment alignment)
063                        throws StructureException {
064
065                // Put RMSD
066                List<Atom[]> trans = MultipleAlignmentTools.transformAtoms(alignment);
067                alignment.putScore(RMSD, getRMSD(trans));
068
069                // Put AvgTM-Score
070                List<Integer> lengths = new ArrayList<Integer>(alignment.size());
071                for (Atom[] atoms : alignment.getAtomArrays()) {
072                        lengths.add(atoms.length);
073                }
074                alignment.putScore(AVGTM_SCORE, getAvgTMScore(trans, lengths));
075        }
076
077        /**
078         * Calculates the RMSD of all-to-all structure comparisons (distances) of
079         * the given MultipleAlignment.
080         * <p>
081         * Complexity: T(n,l) = O(l*n^2), if n=number of structures and l=alignment
082         * length.
083         * <p>
084         * The formula used is just the sqroot of the average distance of all
085         * possible pairs of atoms. Thus, for R structures aligned over C columns
086         * without gaps, we have
087         *
088         * <pre>
089         * RMSD = \sqrt{1/(C*(R*(R-1)/2)) * \sum_{r1=1}^{R-1}
090         * \sum_{r2=r1+1}^{R-1} \sum_{j=0}^{C-1} (atom[r1][c]-atom[r2][c])^2}
091         * </pre>
092         *
093         * @param alignment
094         * @return double RMSD
095         */
096        public static double getRMSD(MultipleAlignment alignment) {
097                List<Atom[]> trans = MultipleAlignmentTools.transformAtoms(alignment);
098                return getRMSD(trans);
099        }
100
101        /**
102         * Calculates the RMSD of all-to-all structure comparisons (distances),
103         * given a set of superimposed atoms.
104         *
105         * @param transformed
106         * @return double RMSD
107         * @see #getRMSD(MultipleAlignment)
108         */
109        public static double getRMSD(List<Atom[]> transformed) {
110
111                double sumSqDist = 0;
112                int comparisons = 0;
113
114                for (int r1 = 0; r1 < transformed.size(); r1++) {
115                        for (int c = 0; c < transformed.get(r1).length; c++) {
116                                Atom refAtom = transformed.get(r1)[c];
117                                if (refAtom == null)
118                                        continue;
119
120                                double nonNullSqDist = 0;
121                                int nonNullLength = 0;
122                                for (int r2 = r1 + 1; r2 < transformed.size(); r2++) {
123                                        Atom atom = transformed.get(r2)[c];
124                                        if (atom != null) {
125                                                nonNullSqDist += Calc.getDistanceFast(refAtom, atom);
126                                                nonNullLength++;
127                                        }
128                                }
129
130                                if (nonNullLength > 0) {
131                                        comparisons++;
132                                        sumSqDist += nonNullSqDist / nonNullLength;
133                                }
134                        }
135                }
136                return Math.sqrt(sumSqDist / comparisons);
137        }
138
139        /**
140         * /** Calculates the average RMSD from all structures to a reference s
141         * tructure, given a set of superimposed atoms.
142         * <p>
143         * Complexity: T(n,l) = O(l*n), if n=number of structures and l=alignment
144         * length.
145         * <p>
146         * For ungapped alignments, this is just the sqroot of the average distance
147         * from an atom to the aligned atom from the reference. Thus, for R
148         * structures aligned over C columns (with structure 0 as the reference), we
149         * have:
150         *
151         * <pre>
152         * RefRMSD = \sqrt{ 1/(C*(R-1)) * \sum_{r=1}^{R-1} \sum_{j=0}^{C-1}
153         * (atom[1][c]-atom[r][c])^2 }
154         * </pre>
155         * <p>
156         * For gapped alignments, null atoms are omitted from consideration, so that
157         * the RMSD is the average over all columns with non-null reference of the
158         * average RMSD within the non-null elements of the column.
159         *
160         * @param alignment
161         *            MultipleAlignment
162         * @param ref
163         *            reference structure index
164         * @return
165         */
166        public static double getRefRMSD(MultipleAlignment alignment, int ref) {
167                List<Atom[]> trans = MultipleAlignmentTools.transformAtoms(alignment);
168                return getRefRMSD(trans, ref);
169        }
170
171        /**
172         * Calculates the average RMSD from all structures to a reference s
173         * tructure, given a set of superimposed atoms.
174         * <p>
175         * Complexity: T(n,l) = O(l*n), if n=number of structures and l=alignment
176         * length.
177         * <p>
178         * For ungapped alignments, this is just the sqroot of the average distance
179         * from an atom to the aligned atom from the reference. Thus, for R
180         * structures aligned over C columns (with structure 0 as the reference), we
181         * have:
182         *
183         * <pre>
184         * RefRMSD = \sqrt{ 1/(C*(R-1)) * \sum_{r=1}^{R-1} \sum_{j=0}^{C-1}
185         * (atom[1][c]-atom[r][c])^2 }
186         * </pre>
187         * <p>
188         * For gapped alignments, null atoms are omitted from consideration, so that
189         * the RMSD is the average over all columns with non-null reference of the
190         * average RMSD within the non-null elements of the column.
191         *
192         * @param transformed
193         * @param reference
194         * @return
195         */
196        public static double getRefRMSD(List<Atom[]> transformed, int reference) {
197
198                double sumSqDist = 0;
199                int totalLength = 0;
200                for (int c = 0; c < transformed.get(reference).length; c++) {
201                        Atom refAtom = transformed.get(reference)[c];
202                        if (refAtom == null)
203                                continue;
204
205                        double nonNullSqDist = 0;
206                        int nonNullLength = 0;
207                        for (int r = 0; r < transformed.size(); r++) {
208                                if (r == reference)
209                                        continue;
210                                Atom atom = transformed.get(r)[c];
211                                if (atom != null) {
212                                        nonNullSqDist += Calc.getDistanceFast(refAtom, atom);
213                                        nonNullLength++;
214                                }
215                        }
216
217                        if (nonNullLength > 0) {
218                                totalLength++;
219                                sumSqDist += nonNullSqDist / nonNullLength;
220                        }
221                }
222                return Math.sqrt(sumSqDist / totalLength);
223        }
224
225        /**
226         * Calculates the average TMScore of all the possible pairwise structure
227         * comparisons of the given alignment.
228         * <p>
229         * Complexity: T(n,l) = O(l*n^2), if n=number of structures and l=alignment
230         * length.
231         *
232         * @param alignment
233         * @return double Average TMscore
234         * @throws StructureException
235         */
236        public static double getAvgTMScore(MultipleAlignment alignment)
237                        throws StructureException {
238
239                List<Atom[]> trans = MultipleAlignmentTools.transformAtoms(alignment);
240
241                List<Integer> lengths = new ArrayList<Integer>(alignment.size());
242                for (Atom[] atoms : alignment.getAtomArrays()) {
243                        lengths.add(atoms.length);
244                }
245                return getAvgTMScore(trans, lengths);
246        }
247
248        /**
249         * Calculates the average TMScore all the possible pairwise structure
250         * comparisons of the given a set of superimposed Atoms and the original
251         * structure lengths.
252         * <p>
253         * Complexity: T(n,l) = O(l*n^2), if n=number of structures and l=alignment
254         * length.
255         *
256         * @param transformed
257         *            aligned Atoms transformed
258         * @param lengths
259         *            lengths of the structures in residue number
260         * @return double Average TMscore
261         * @throws StructureException
262         */
263        public static double getAvgTMScore(List<Atom[]> transformed,
264                        List<Integer> lengths) throws StructureException {
265
266                if (transformed.size() != lengths.size())
267                        throw new IllegalArgumentException("Input sizes differ.");
268
269                double sumTM = 0;
270                int comparisons = 0;
271
272                for (int r1 = 0; r1 < transformed.size(); r1++) {
273                        for (int r2 = r1 + 1; r2 < transformed.size(); r2++) {
274                                int len = transformed.get(r1).length;
275                                // Remove nulls from both arrays
276                                Atom[] ref = new Atom[len];
277                                Atom[] aln = new Atom[len];
278                                int nonNullLen = 0;
279                                for (int c = 0; c < len; c++) {
280                                        if (transformed.get(r1)[c] != null
281                                                        && transformed.get(r2)[c] != null) {
282                                                ref[nonNullLen] = transformed.get(r1)[c];
283                                                aln[nonNullLen] = transformed.get(r2)[c];
284                                                nonNullLen++;
285                                        }
286                                }
287                                // Truncate nulls
288                                if (nonNullLen < len) {
289                                        ref = Arrays.copyOf(ref, nonNullLen);
290                                        aln = Arrays.copyOf(aln, nonNullLen);
291                                }
292                                sumTM += SVDSuperimposer.getTMScore(ref, aln, lengths.get(r1),
293                                                lengths.get(r2));
294                                comparisons++;
295                        }
296                }
297                return sumTM / comparisons;
298        }
299
300        /**
301         * Calculates the average TMScore from all structures to a reference
302         * structure, given a set of superimposed atoms.
303         * <p>
304         * Complexity: T(n,l) = O(l*n), if n=number of structures and l=alignment
305         * length.
306         *
307         * @param alignment
308         * @param reference
309         *            Index of the reference structure
310         * @return
311         * @throws StructureException
312         */
313        public static double getRefTMScore(MultipleAlignment alignment, int ref)
314                        throws StructureException {
315
316                List<Atom[]> trans = MultipleAlignmentTools.transformAtoms(alignment);
317
318                List<Integer> lengths = new ArrayList<Integer>(alignment.size());
319                for (Atom[] atoms : alignment.getAtomArrays()) {
320                        lengths.add(atoms.length);
321                }
322                return getRefTMScore(trans, lengths, ref);
323        }
324
325        /**
326         * Calculates the average TMScore from all structures to a reference
327         * structure, given a set of superimposed atoms.
328         * <p>
329         * Complexity: T(n,l) = O(l*n^2), if n=number of structures and l=alignment
330         * length.
331         *
332         * @param transformed
333         *            Arrays of aligned atoms, after superposition
334         * @param lengths
335         *            lengths of the full input structures
336         * @param reference
337         *            Index of the reference structure
338         * @return
339         * @throws StructureException
340         */
341        public static double getRefTMScore(List<Atom[]> transformed,
342                        List<Integer> lengths, int reference) throws StructureException {
343
344                if (transformed.size() != lengths.size())
345                        throw new IllegalArgumentException("Input sizes differ");
346
347                double sumTM = 0;
348                int comparisons = 0;
349
350                int len = transformed.get(reference).length;
351                for (int r = 0; r < transformed.size(); r++) {
352                        if (r == reference)
353                                continue;
354                        // remove nulls from both arrays
355                        Atom[] ref = new Atom[len];
356                        Atom[] aln = new Atom[len];
357                        int nonNullLen = 0;
358                        for (int c = 0; c < len; c++) {
359                                if (transformed.get(reference)[c] != null
360                                                && transformed.get(r)[c] != null) {
361                                        ref[nonNullLen] = transformed.get(reference)[c];
362                                        aln[nonNullLen] = transformed.get(r)[c];
363                                        nonNullLen++;
364                                }
365                        }
366                        // truncate nulls
367                        if (nonNullLen < len) {
368                                ref = Arrays.copyOf(ref, nonNullLen);
369                                aln = Arrays.copyOf(aln, nonNullLen);
370                        }
371                        sumTM += SVDSuperimposer.getTMScore(ref, aln,
372                                        lengths.get(reference), lengths.get(r));
373                        comparisons++;
374                }
375                return sumTM / comparisons;
376        }
377
378        /**
379         * Calculates the MC score, specific for the MultipleAlignment algorithm.
380         * The score function is modified from the original CEMC paper, making it
381         * continuous and differentiable.
382         * <p>
383         * The maximum score of a match is 20, and the penalties for gaps are part
384         * of the input. The half-score distance, d0, is chosen as in the TM-score.
385         * <p>
386         * Complexity: T(n,l) = O(l*n^2), if n=number of structures and l=alignment
387         * length.
388         *
389         * @param alignment
390         * @param gapOpen
391         *            penalty for gap opening (reasonable values are in the range
392         *            (1.0-20.0)
393         * @param gapExtension
394         *            penalty for extending a gap (reasonable values are in the
395         *            range (0.5-10.0)
396         * @param dCutoff
397         *            the distance cutoff
398         * @return the value of the score
399         * @throws StructureException
400         */
401        public static double getMCScore(MultipleAlignment alignment,
402                        double gapOpen, double gapExtension, double dCutoff)
403                        throws StructureException {
404
405                List<Atom[]> trans = MultipleAlignmentTools.transformAtoms(alignment);
406
407                // Calculate d0: same as the one in TM score
408                int minLen = Integer.MAX_VALUE;
409                for (Atom[] atoms : alignment.getAtomArrays())
410                        if (atoms.length < minLen)
411                                minLen = atoms.length;
412                double d0 = 1.24 * Math.cbrt((minLen) - 15.) - 1.8;
413
414                // Calculate the distance cutoff penalization
415                double A = 20.0 / (1 + (dCutoff * dCutoff) / (d0 * d0));
416
417                return getMCScore(trans, d0, gapOpen, gapExtension, A);
418        }
419
420        /**
421         * Calculates the MC score, specific for the MultipleAlignment algorithm.
422         * The score function is modified from the original CEMC paper, making it
423         * continuous and differentiable.
424         * <p>
425         * The maximum score of a match is 20, and the penalties for gaps are part
426         * of the input.
427         * <p>
428         * Complexity: T(n,l) = O(l*n^2), if n=number of structures and l=alignment
429         * length.
430         *
431         * @param transformed
432         *            List of transformed Atom arrays
433         * @param d0
434         *            parameter for the half-score distance
435         * @param gapOpen
436         *            penalty for gap opening (reasonable values are in the range
437         *            (1.0-20.0)
438         * @param gapExtension
439         *            penalty for extending a gap (reasonable values are in the
440         *            range (0.5-10.0)
441         * @param A
442         *            the distance cutoff penalization
443         * @return the value of the score
444         * @throws StructureException
445         */
446        private static double getMCScore(List<Atom[]> trans, double d0,
447                        double gapOpen, double gapExtension, double A)
448                        throws StructureException {
449
450                int size = trans.size();
451                int length = trans.get(0).length;
452                Matrix residueDistances = new Matrix(size, length, -1);
453                double scoreMC = 0.0;
454                int openGaps = 0;
455                int extensionGaps = 0;
456
457                // Calculate the average residue distances
458                for (int r1 = 0; r1 < size; r1++) {
459                        boolean gapped = false;
460                        for (int c = 0; c < trans.get(r1).length; c++) {
461                                Atom refAtom = trans.get(r1)[c];
462                                // Calculate the gap extension and opening on the fly
463                                if (refAtom == null) {
464                                        if (gapped)
465                                                extensionGaps++;
466                                        else {
467                                                gapped = true;
468                                                openGaps++;
469                                        }
470                                        continue;
471                                } else
472                                        gapped = false;
473
474                                for (int r2 = r1 + 1; r2 < size; r2++) {
475                                        Atom atom = trans.get(r2)[c];
476                                        if (atom != null) {
477                                                double distance = Calc.getDistance(refAtom, atom);
478                                                if (residueDistances.get(r1, c) == -1) {
479                                                        residueDistances.set(r1, c, 1 + distance);
480                                                } else {
481                                                        residueDistances.set(r1, c,
482                                                                        residueDistances.get(r1, c) + distance);
483                                                }
484                                                if (residueDistances.get(r2, c) == -1) {
485                                                        residueDistances.set(r2, c, 1 + distance);
486                                                } else {
487                                                        residueDistances.set(r2, c,
488                                                                        residueDistances.get(r2, c) + distance);
489                                                }
490                                        }
491                                }
492                        }
493                }
494
495                for (int c = 0; c < length; c++) {
496                        int nonNullRes = 0;
497                        for (int r = 0; r < size; r++) {
498                                if (residueDistances.get(r, c) != -1)
499                                        nonNullRes++;
500                        }
501                        for (int r = 0; r < size; r++) {
502                                if (residueDistances.get(r, c) != -1) {
503                                        residueDistances.set(r, c, residueDistances.get(r, c)
504                                                        / nonNullRes);
505                                }
506                        }
507                }
508
509                // Sum all the aligned residue scores
510                for (int row = 0; row < size; row++) {
511                        for (int col = 0; col < length; col++) {
512                                if (residueDistances.get(row, col) == -1)
513                                        continue;
514                                double d1 = residueDistances.get(row, col);
515                                double resScore = 20.0 / (1 + (d1 * d1) / (d0 * d0));
516                                scoreMC += resScore - A;
517                        }
518                }
519
520                // Apply the Gap penalty and return
521                return scoreMC - (openGaps * gapOpen + extensionGaps * gapExtension);
522        }
523}