001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.structure.align.multiple.util;
022
023import java.io.IOException;
024import java.util.ArrayList;
025import java.util.Arrays;
026import java.util.Collections;
027import java.util.Comparator;
028import java.util.HashMap;
029import java.util.List;
030import java.util.Map;
031import java.util.SortedSet;
032import java.util.TreeSet;
033
034import javax.vecmath.Matrix4d;
035
036import org.biojava.nbio.core.alignment.matrices.SubstitutionMatrixHelper;
037import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
038import org.biojava.nbio.core.sequence.AccessionID;
039import org.biojava.nbio.core.sequence.MultipleSequenceAlignment;
040import org.biojava.nbio.core.sequence.ProteinSequence;
041import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
042import org.biojava.nbio.phylo.DistanceMatrixCalculator;
043import org.biojava.nbio.phylo.TreeConstructor;
044import org.biojava.nbio.phylo.TreeConstructorType;
045import org.biojava.nbio.structure.AminoAcid;
046import org.biojava.nbio.structure.Atom;
047import org.biojava.nbio.structure.Calc;
048import org.biojava.nbio.structure.Group;
049import org.biojava.nbio.structure.StructureTools;
050import org.biojava.nbio.structure.align.multiple.Block;
051import org.biojava.nbio.structure.align.multiple.BlockSet;
052import org.biojava.nbio.structure.align.multiple.MultipleAlignment;
053import org.biojava.nbio.structure.jama.Matrix;
054import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
055import org.forester.phylogeny.Phylogeny;
056
057/**
058 * Utility functions for working with {@link MultipleAlignment}.
059 * <p>
060 * Supported functions:
061 * <ul>
062 * <li>Obtain the alignment as sequence strings
063 * <li>Map from sequence alignment position to structure Atom
064 * <li>Map from sequence alignment position to Block number
065 * <li>Transform the aligned Atoms of a MultipleAlignment
066 * <li>Get all the core alignment positions of the alignment
067 * <li>Calculate the average residue distance of all aligned positions
068 * <li>Sort Blocks in a MultipleAlignment by a specified row
069 * <li>Convert a MultipleAlignment to a MultipleSequenceAlignment
070 * </ul>
071 *
072 * @author Spencer Bliven
073 * @author Aleix Lafita
074 * @since 4.1.0
075 *
076 */
077public class MultipleAlignmentTools {
078
079        /**
080         * Calculate the sequence alignment Strings for the whole alignment. This
081         * method creates a sequence alignment where aligned residues are in
082         * uppercase and unaligned residues are in lowercase, thus providing a more
083         * compact way to represent the alignment.
084         * <p>
085         * Blocks are concatenated in the order returned by
086         * {@link MultipleAlignment#getBlocks()}, so sequences may not be
087         * sequential. Gaps are represented by '-'. Separation between different
088         * Blocks is indicated by a gap in all positions, meaning that there is a
089         * possible discontinuity.
090         *
091         * @param alignment
092         *            input MultipleAlignment
093         * @param mapSeqToStruct
094         *            provides a link from the sequence alignment position to the
095         *            structure alignment position. Specially designed for the GUI.
096         *            Has to be initialized previously and will be overwritten.
097         * @return a string for each row in the alignment, giving the 1-letter code
098         *         for each aligned residue.
099         */
100        public static List<String> getSequenceAlignment(
101                        MultipleAlignment alignment, final List<Integer> mapSeqToStruct) {
102
103                // Initialize sequence variables
104                List<String> alnSequences = new ArrayList<String>();
105                for (int str = 0; str < alignment.size(); str++)
106                        alnSequences.add("");
107                mapSeqToStruct.clear();
108                List<Atom[]> atoms = alignment.getAtomArrays();
109                int globalPos = -1;
110
111                // Initialize helper variables in constucting the sequence alignment
112                List<SortedSet<Integer>> freePool = new ArrayList<SortedSet<Integer>>();
113                List<SortedSet<Integer>> blockStarts = new ArrayList<SortedSet<Integer>>();
114                List<List<Integer>> aligned = new ArrayList<List<Integer>>();
115
116                // Generate freePool residues from the ones not aligned
117                for (int i = 0; i < alignment.size(); i++) {
118                        List<Integer> residues = new ArrayList<Integer>();
119                        freePool.add(new TreeSet<Integer>());
120                        blockStarts.add(new TreeSet<Integer>());
121                        for (BlockSet bs : alignment.getBlockSets()) {
122                                for (Block b : bs.getBlocks()) {
123                                        boolean first = true;
124                                        for (int l = 0; l < b.length(); l++) {
125                                                Integer residue = b.getAlignRes().get(i).get(l);
126                                                if (residue != null) {
127                                                        if (first)
128                                                                blockStarts.get(i).add(residue);
129                                                        residues.add(residue);
130                                                        first = false;
131                                                }
132                                        }
133                                }
134                        }
135                        aligned.add(residues);
136                }
137                // Add any residue not aligned to the free pool for every structure
138                for (int i = 0; i < alignment.size(); i++) {
139                        for (int k = 0; k < atoms.get(i).length; k++) {
140                                if (!aligned.get(i).contains(k))
141                                        freePool.get(i).add(k);
142                        }
143                }
144
145                for (int b = 0; b < alignment.getBlocks().size(); b++) {
146                        if (b != 0) {
147                                // Add a gap to all structures to separate visually the Blocks
148                                for (int str = 0; str < alignment.size(); str++)
149                                        alnSequences.set(str, alnSequences.get(str).concat("-"));
150                                mapSeqToStruct.add(-1); // means unaligned position
151                        }
152                        // Store the previous position added to the sequence alignment
153                        int[] previousPos = new int[alignment.size()];
154                        Arrays.fill(previousPos, -1);
155                        // Store provisional characters
156                        char[] provisionalChar = new char[alignment.size()];
157                        Arrays.fill(provisionalChar, '-');
158
159                        for (int pos = 0; pos < alignment.getBlocks().get(b).length(); pos++) {
160                                globalPos++;
161                                boolean gaps = true; // true if consecutive with the previous
162                                while (gaps) {
163                                        gaps = false;
164                                        // Loop through all the structures
165                                        for (int str = 0; str < alignment.size(); str++) {
166                                                // If it is the first position or before it was null
167                                                if (previousPos[str] == -1) {
168                                                        Integer residue = alignment.getBlocks().get(b)
169                                                                        .getAlignRes().get(str).get(pos);
170                                                        if (residue == null)
171                                                                provisionalChar[str] = '-';
172                                                        else {
173                                                                Atom a = atoms.get(str)[residue];
174                                                                String group = a.getGroup().getPDBName();
175                                                                provisionalChar[str] = StructureTools
176                                                                                .get1LetterCode(group);
177                                                        }
178                                                } else {
179                                                        Integer residue = alignment.getBlocks().get(b)
180                                                                        .getAlignRes().get(str).get(pos);
181                                                        int nextPos = previousPos[str] + 1;
182                                                        if (residue == null) {
183                                                                if (freePool.get(str).contains(nextPos)) {
184                                                                        Atom a = atoms.get(str)[nextPos];
185                                                                        String g = a.getGroup().getPDBName();
186                                                                        char aa = StructureTools.get1LetterCode(g);
187                                                                        provisionalChar[str] = Character
188                                                                                        .toLowerCase(aa);
189                                                                } else
190                                                                        provisionalChar[str] = '-';
191                                                        } else if (nextPos == residue) {
192                                                                Atom a = atoms.get(str)[nextPos];
193                                                                String group = a.getGroup().getPDBName();
194                                                                provisionalChar[str] = StructureTools
195                                                                                .get1LetterCode(group);
196                                                        } else {
197                                                                // This means non-consecutive
198                                                                provisionalChar[str] = ' ';
199                                                                gaps = true;
200                                                        }
201                                                }
202                                        }// End all structure analysis
203
204                                        if (gaps) {
205                                                for (int str = 0; str < alignment.size(); str++) {
206                                                        if (provisionalChar[str] == ' ') {
207                                                                // It means this residue was non-consecutive
208                                                                Atom a = atoms.get(str)[previousPos[str] + 1];
209                                                                String group = a.getGroup().getPDBName();
210                                                                char aa = StructureTools.get1LetterCode(group);
211                                                                alnSequences
212                                                                                .set(str,
213                                                                                                alnSequences
214                                                                                                                .get(str)
215                                                                                                                .concat(""
216                                                                                                                                + Character
217                                                                                                                                                .toLowerCase(aa)));
218                                                                previousPos[str]++;
219                                                        } else {
220                                                                // Insert a gap otherwise
221                                                                alnSequences.set(str, alnSequences.get(str)
222                                                                                .concat("-"));
223                                                        }
224                                                }
225                                                mapSeqToStruct.add(-1); // unaligned position
226                                        } else {
227                                                // Add provisional and update the indices
228                                                for (int str = 0; str < alignment.size(); str++) {
229                                                        alnSequences.set(
230                                                                        str,
231                                                                        alnSequences.get(str).concat(
232                                                                                        "" + provisionalChar[str]));
233
234                                                        if (provisionalChar[str] != '-') {
235                                                                if (alignment.getBlocks().get(b).getAlignRes()
236                                                                                .get(str).get(pos) == null) {
237                                                                        previousPos[str]++;
238                                                                } else {
239                                                                        previousPos[str] = alignment.getBlocks()
240                                                                                        .get(b).getAlignRes().get(str)
241                                                                                        .get(pos);
242                                                                }
243                                                        }
244                                                }
245                                                mapSeqToStruct.add(globalPos); // alignment index
246                                        }
247                                }
248                        } // All positions in the Block considered so far
249
250                        // Calculate the index of the next Block for every structure
251                        int[] blockEnds = new int[alignment.size()];
252                        for (int str = 0; str < alignment.size(); str++) {
253                                for (int res : blockStarts.get(str)) {
254                                        if (previousPos[str] > res)
255                                                blockEnds[str] = res;
256                                        else {
257                                                blockEnds[str] = res;
258                                                break;
259                                        }
260                                }
261                        }
262
263                        // Add the unaligned residues in between Blocks (lowercase)
264                        boolean allGaps = false; // true means no more residues to add
265                        while (!allGaps) {
266                                allGaps = true;
267                                for (int str = 0; str < alignment.size(); str++) {
268                                        if (previousPos[str] + 1 < blockEnds[str]) {
269                                                Atom a = atoms.get(str)[previousPos[str] + 1];
270                                                String group = a.getGroup().getPDBName();
271                                                char letter = StructureTools.get1LetterCode(group);
272
273                                                provisionalChar[str] = Character.toLowerCase(letter);
274                                                previousPos[str]++;
275                                                allGaps = false;
276                                        } else
277                                                provisionalChar[str] = '-';
278                                }
279                                if (!allGaps) {
280                                        for (int str = 0; str < alignment.size(); str++) {
281                                                alnSequences.set(
282                                                                str,
283                                                                alnSequences.get(str).concat(
284                                                                                "" + provisionalChar[str]));
285                                        }
286                                        mapSeqToStruct.add(-1); // unaligned position
287                                }
288                        }
289                }
290                return alnSequences;
291        }
292
293        /**
294         * Calculate the sequence alignment Strings for the whole alignment. This
295         * method creates a sequence alignment where aligned residues are in
296         * uppercase and unaligned residues are in lowercase, thus providing a more
297         * compact way to represent the alignment.
298         * <p>
299         * Blocks are concatenated in the order returned by
300         * {@link MultipleAlignment#getBlocks()}, so sequences may not be
301         * sequential. Gaps are represented by '-'. Separation between different
302         * Blocks is indicated by a gap in all positions, meaning that there is a
303         * possible discontinuity.
304         *
305         * @param alignment
306         *            input MultipleAlignment
307         * @return String for each row in the alignment, giving the 1-letter code
308         *         for each aligned residue.
309         */
310        public static List<String> getSequenceAlignment(MultipleAlignment msa) {
311                return getSequenceAlignment(msa, new ArrayList<Integer>());
312        }
313
314        /**
315         * Calculate the sequence alignment Strings for the alignment Blocks in an
316         * alignment. This method creates a sequence alignment where all residues
317         * are in uppercase and a residue alone with gaps in all the other
318         * structures represents unaligned residues. Because of this latter
319         * constraint only the residues within the Blocks are represented, for a
320         * more compact alignment. For a sequence alignment of the full protein use
321         * {@link #getSequenceAlignment(MultipleAlignment)}.
322         * <p>
323         * Blocks are concatenated in the order returned by
324         * {@link MultipleAlignment#getBlocks()}, so sequences may not be
325         * sequential. Gaps between blocks are omitted, while gaps within blocks are
326         * represented by '-'. Separation between different Blocks is indicated by a
327         * gap in all positions, meaning that there is something unaligned
328         * inbetween.
329         *
330         * @param alignment
331         *            input MultipleAlignment
332         * @param mapSeqToStruct
333         *            provides a link from the sequence alignment position to the
334         *            structure alignment position. Specially designed for the GUI.
335         *            Has to be initialized previously and will be overwritten.
336         * @return a string for each row in the alignment, giving the 1-letter code
337         *         for each aligned residue.
338         */
339        public static List<String> getBlockSequenceAlignment(
340                        MultipleAlignment alignment, List<Integer> mapSeqToStruct) {
341
342                // Initialize sequence variables
343                List<String> alnSequences = new ArrayList<String>();
344                for (int str = 0; str < alignment.size(); str++)
345                        alnSequences.add("");
346                mapSeqToStruct.clear();
347                List<Atom[]> atoms = alignment.getAtomArrays();
348                int globalPos = -1;
349
350                // Loop through all the alignment Blocks in the order given
351                for (int b = 0; b < alignment.getBlocks().size(); b++) {
352                        if (b != 0) {
353                                // Add a gap to all structures to separate Blocks
354                                for (int str = 0; str < alignment.size(); str++)
355                                        alnSequences.set(str, alnSequences.get(str).concat("-"));
356                                mapSeqToStruct.add(-1); // means unaligned position
357                        }
358
359                        // Store the previous position added to the sequence alignment
360                        int[] previousPos = new int[alignment.size()];
361                        Arrays.fill(previousPos, -1);
362                        // Store provisional characters
363                        char[] provisionalChar = new char[alignment.size()];
364                        Arrays.fill(provisionalChar, '-');
365
366                        for (int pos = 0; pos < alignment.getBlocks().get(b).length(); pos++) {
367                                globalPos++;
368                                boolean gaps = true;
369                                while (gaps) {
370                                        gaps = false;
371                                        // Loop through all the structures
372                                        for (int str = 0; str < alignment.size(); str++) {
373                                                // If it is the first position or before it was null
374                                                if (previousPos[str] == -1) {
375                                                        Integer residue = alignment.getBlocks().get(b)
376                                                                        .getAlignRes().get(str).get(pos);
377                                                        if (residue == null)
378                                                                provisionalChar[str] = '-';
379                                                        else {
380                                                                Atom a = atoms.get(str)[residue];
381                                                                String g = a.getGroup().getPDBName();
382                                                                char aa = StructureTools.get1LetterCode(g);
383                                                                provisionalChar[str] = aa;
384                                                        }
385                                                } else {
386                                                        Integer residue = alignment.getBlocks().get(b)
387                                                                        .getAlignRes().get(str).get(pos);
388                                                        if (residue == null)
389                                                                provisionalChar[str] = '-';
390                                                        else if (previousPos[str] + 1 == residue) {
391                                                                Atom a = atoms.get(str)[residue];
392                                                                String g = a.getGroup().getPDBName();
393                                                                char aa = StructureTools.get1LetterCode(g);
394                                                                provisionalChar[str] = aa;
395                                                        } else {
396                                                                provisionalChar[str] = ' ';
397                                                                gaps = true;
398                                                        }
399                                                }
400                                        }// End all structures analysis
401
402                                        if (gaps) {
403                                                for (int str = 0; str < alignment.size(); str++) {
404                                                        if (provisionalChar[str] == ' ') {
405                                                                // It means this residue was non-consecutive
406                                                                for (int s2 = 0; s2 < alignment.size(); s2++) {
407                                                                        if (str == s2) {
408                                                                                int next = previousPos[str] + 1;
409                                                                                Atom a = atoms.get(s2)[next];
410                                                                                String g = a.getGroup().getPDBName();
411                                                                                char aa = StructureTools
412                                                                                                .get1LetterCode(g);
413                                                                                alnSequences.set(
414                                                                                                s2,
415                                                                                                alnSequences.get(s2).concat(
416                                                                                                                "" + aa));
417                                                                        } else {
418                                                                                alnSequences.set(s2,
419                                                                                                alnSequences.get(s2)
420                                                                                                                .concat("-"));
421                                                                        }
422                                                                }
423                                                                mapSeqToStruct.add(-1); // unaligned
424                                                                previousPos[str] += 1;
425                                                        }
426                                                }
427                                        } else { // Append the provisional and update the indices
428                                                for (int str = 0; str < alignment.size(); str++) {
429                                                        alnSequences.set(
430                                                                        str,
431                                                                        alnSequences.get(str).concat(
432                                                                                        "" + provisionalChar[str]));
433                                                        if (provisionalChar[str] != '-') {
434                                                                previousPos[str] = alignment.getBlocks().get(b)
435                                                                                .getAlignRes().get(str).get(pos);
436                                                        }
437                                                }
438                                                mapSeqToStruct.add(globalPos);
439                                        }
440                                }
441                        }
442                }
443                return alnSequences;
444        }
445
446        /**
447         * Calculate the sequence alignment Strings for the alignment Blocks in an
448         * alignment. This method creates a sequence alignment where all residues
449         * are in uppercase and a residue alone with gaps in all the other
450         * structures represents unaligned residues. Because of this latter
451         * constraint only the residues within the Blocks are represented, for a
452         * more compact alignment. For a sequence alignment of the full protein use
453         * {@link #getSequenceAlignment(MultipleAlignment)}.
454         * <p>
455         * Blocks are concatenated in the order returned by
456         * {@link MultipleAlignment#getBlocks()}, so sequences may not be
457         * sequential. Gaps between blocks are omitted, while gaps within blocks are
458         * represented by '-'. Separation between different Blocks is indicated by a
459         * gap in all positions, meaning that there is something unaligned
460         * inbetween.
461         *
462         * @param alignment
463         *            input MultipleAlignment
464         * @return String for each row in the alignment, giving the 1-letter code
465         *         for each aligned residue.
466         */
467        public static List<String> getBlockSequenceAlignment(MultipleAlignment ma) {
468                return getBlockSequenceAlignment(ma, new ArrayList<Integer>());
469        }
470
471        /**
472         * Returns the Atom of the specified structure that is aligned in the
473         * sequence alignment position specified.
474         *
475         * @param multAln
476         *            the MultipleAlignment object from where the sequence alignment
477         *            has been generated
478         * @param mapSeqToStruct
479         *            the mapping between sequence and structure generated with the
480         *            sequence alignment
481         * @param str
482         *            the structure index of the alignment (row)
483         * @param sequencePos
484         *            the sequence alignment position (column)
485         * @return Atom the atom in that position or null if there is a gap
486         */
487        public static Atom getAtomForSequencePosition(MultipleAlignment msa,
488                        List<Integer> mapSeqToStruct, int str, int sequencePos) {
489
490                int seqPos = mapSeqToStruct.get(sequencePos);
491
492                // Check if the position selected is an aligned position
493                if (seqPos == -1)
494                        return null;
495                else {
496                        Atom a = null;
497                        // Calculate the corresponding structure position
498                        int sum = 0;
499                        for (Block b : msa.getBlocks()) {
500                                if (sum + b.length() <= seqPos) {
501                                        sum += b.length();
502                                        continue;
503                                } else {
504                                        for (Integer p : b.getAlignRes().get(str)) {
505                                                if (sum == seqPos) {
506                                                        if (p != null) {
507                                                                a = msa.getAtomArrays().get(str)[p];
508                                                        }
509                                                        break;
510                                                }
511                                                sum++;
512                                        }
513                                        break;
514                                }
515                        }
516                        return a;
517                }
518        }
519
520        /**
521         * Returns the block number of a specified position in the sequence
522         * alignment, given the mapping from structure to function.
523         *
524         * @param multAln
525         *            the MultipleAlignment object from where the sequence alignment
526         *            has been generated.
527         * @param mapSeqToStruct
528         *            the mapping between sequence and structure generated with the
529         *            sequence alignment
530         * @param sequencePos
531         *            the position in the sequence alignment (column)
532         * @return int the block index, or -1 if the position is not aligned
533         */
534        public static int getBlockForSequencePosition(MultipleAlignment multAln,
535                        List<Integer> mapSeqToStruct, int sequencePos) {
536
537                int seqPos = mapSeqToStruct.get(sequencePos);
538                // Check if the position selected is an aligned position
539                if (seqPos == -1)
540                        return -1;
541                else {
542                        // Calculate the corresponding block (by iterating all Blocks)
543                        int sum = 0;
544                        int block = 0;
545                        for (Block b : multAln.getBlocks()) {
546                                if (sum + b.length() <= seqPos) {
547                                        sum += b.length();
548                                        block++;
549                                        continue;
550                                } else
551                                        break;
552                        }
553                        return block;
554                }
555        }
556
557        /**
558         * The average residue distance Matrix contains the average distance from
559         * each residue to all other residues aligned with it.
560         * <p>
561         * Complexity: T(n,l) = O(l*n^2), if n=number of structures and l=alignment
562         * length.
563         *
564         * @param alignment
565         *            MultipleAlignment
566         * @return Matrix containing all average residue distances
567         */
568        public static Matrix getAverageResidueDistances(MultipleAlignment msa) {
569                List<Atom[]> transformed = transformAtoms(msa);
570                return getAverageResidueDistances(transformed);
571        }
572
573        /**
574         * The average residue distance Matrix contains the average distance from
575         * each residue to all other residues aligned with it.
576         * <p>
577         * Complexity: T(n,l) = O(l*n^2), if n=number of structures and l=alignment
578         * length.
579         *
580         * @param transformed
581         *            List of Atom arrays containing only the aligned atoms of each
582         *            structure, or null if there is a gap.
583         * @return Matrix containing all average residue distances. Entry -1 means
584         *         there is a gap in the position.
585         */
586        public static Matrix getAverageResidueDistances(List<Atom[]> transformed) {
587
588                int size = transformed.size();
589                int length = transformed.get(0).length;
590                Matrix resDist = new Matrix(size, length, -1);
591
592                // Calculate the average residue distances
593                for (int r1 = 0; r1 < size; r1++) {
594                        for (int c = 0; c < transformed.get(r1).length; c++) {
595                                Atom refAtom = transformed.get(r1)[c];
596                                if (refAtom == null)
597                                        continue;
598
599                                for (int r2 = r1 + 1; r2 < size; r2++) {
600                                        Atom atom = transformed.get(r2)[c];
601                                        if (atom != null) {
602                                                double distance = Calc.getDistance(refAtom, atom);
603
604                                                if (resDist.get(r1, c) == -1) {
605                                                        resDist.set(r1, c, 1 + distance);
606                                                } else {
607                                                        resDist.set(r1, c, resDist.get(r1, c) + distance);
608                                                }
609
610                                                if (resDist.get(r2, c) == -1) {
611                                                        resDist.set(r2, c, 1 + distance);
612                                                } else {
613                                                        resDist.set(r2, c, resDist.get(r2, c) + distance);
614                                                }
615                                        }
616                                }
617                        }
618                }
619
620                for (int c = 0; c < length; c++) {
621                        int nonNullRes = 0;
622                        for (int r = 0; r < size; r++) {
623                                if (resDist.get(r, c) != -1)
624                                        nonNullRes++;
625                        }
626                        for (int r = 0; r < size; r++) {
627                                if (resDist.get(r, c) != -1) {
628                                        resDist.set(r, c, resDist.get(r, c) / nonNullRes);
629                                }
630                        }
631                }
632                return resDist;
633        }
634
635        /**
636         * Transforms atoms according to the superposition stored in the alignment.
637         * <p>
638         * For each structure in the alignment, returns an atom for each
639         * representative atom in the aligned columns, omitting unaligned residues
640         * (i.e. an array of length <tt>alignment.length()</tt> ).
641         * <p>
642         * All blocks are concatenated together, so Atoms may not appear in the same
643         * order as in their parent structure. If the alignment blocks contain null
644         * residues (gaps), then the returned array will also contain null Atoms in
645         * the same positions.
646         *
647         * @param alignment
648         *            MultipleAlignment
649         * @return List of Atom arrays of only the aligned atoms of every structure
650         *         (null Atom if a gap position)
651         */
652        public static List<Atom[]> transformAtoms(MultipleAlignment alignment) {
653                if (alignment.getEnsemble() == null) {
654                        throw new NullPointerException("No ensemble set for this alignment");
655                }
656
657                List<Atom[]> atomArrays = alignment.getAtomArrays();
658                List<Atom[]> transformed = new ArrayList<Atom[]>(atomArrays.size());
659
660                // Loop through structures
661                for (int i = 0; i < atomArrays.size(); i++) {
662
663                        Matrix4d transform = null;
664                        Atom[] curr = atomArrays.get(i); // all CA atoms from structure
665
666                        // Concatenated list of all blocks for this structure
667                        Atom[] transformedAtoms = new Atom[alignment.length()];
668                        int transformedAtomsLength = 0;
669
670                        // Each blockset gets transformed independently
671                        for (BlockSet bs : alignment.getBlockSets()) {
672
673                                Atom[] blocksetAtoms = new Atom[bs.length()];
674                                int blockPos = 0;
675
676                                for (Block blk : bs.getBlocks()) {
677                                        if (blk.size() != atomArrays.size()) {
678                                                throw new IllegalStateException(String.format(
679                                                                "Mismatched block size. Expected %d "
680                                                                                + "structures, found %d.",
681                                                                atomArrays.size(), blk.size()));
682                                        }
683                                        // Extract aligned atoms
684                                        for (int j = 0; j < blk.length(); j++) {
685                                                Integer alignedPos = blk.getAlignRes().get(i).get(j);
686                                                if (alignedPos != null) {
687                                                        blocksetAtoms[blockPos] = (Atom) curr[alignedPos]
688                                                                        .clone();
689                                                }
690                                                blockPos++;
691                                        }
692                                }
693
694                                // Transform according to the blockset or alignment matrix
695                                Matrix4d blockTrans = null;
696                                if (bs.getTransformations() != null)
697                                        blockTrans = bs.getTransformations().get(i);
698                                if (blockTrans == null) {
699                                        blockTrans = transform;
700                                }
701
702                                for (Atom a : blocksetAtoms) {
703                                        if (a != null)
704                                                Calc.transform(a, blockTrans);
705                                        transformedAtoms[transformedAtomsLength] = a;
706                                        transformedAtomsLength++;
707                                }
708                        }
709                        assert (transformedAtomsLength == alignment.length());
710
711                        transformed.add(transformedAtoms);
712                }
713                return transformed;
714        }
715
716        /**
717         * Calculate a List of alignment indicies that correspond to the core of a
718         * Block, which means that all structures have a residue in that positon.
719         *
720         * @param block
721         *            alignment Block
722         * @return List of positions in the core of the alignment
723         */
724        public static List<Integer> getCorePositions(Block block) {
725
726                List<Integer> corePositions = new ArrayList<Integer>();
727
728                for (int col = 0; col < block.length(); col++) {
729                        boolean core = true;
730                        for (int str = 0; str < block.size(); str++) {
731                                if (block.getAlignRes().get(str).get(col) == null) {
732                                        core = false;
733                                        break;
734                                }
735                        }
736                        if (core)
737                                corePositions.add(col);
738                }
739                return corePositions;
740        }
741
742        /**
743         * Sort blocks so that the specified row is in sequential order. The sort
744         * happens in place.
745         *
746         * @param blocks
747         *            List of blocks
748         * @param sortedIndex
749         *            Index of the row to be sorted
750         */
751        public static void sortBlocks(List<Block> blocks, final int sortedIndex) {
752                Collections.sort(blocks, new Comparator<Block>() {
753                        @Override
754                        public int compare(Block o1, Block o2) {
755                                // Compare the first non-null residue of each block
756                                List<Integer> alignres1 = o1.getAlignRes().get(sortedIndex);
757                                List<Integer> alignres2 = o2.getAlignRes().get(sortedIndex);
758                                Integer res1 = null;
759                                Integer res2 = null;
760                                for (Integer r : alignres1) {
761                                        if (r != null) {
762                                                res1 = r;
763                                                break;
764                                        }
765                                }
766                                for (Integer r : alignres2) {
767                                        if (r != null) {
768                                                res2 = r;
769                                                break;
770                                        }
771                                }
772                                return res1.compareTo(res2);
773                        }
774                });
775        }
776
777        /**
778         * Convert a MultipleAlignment into a MultipleSequenceAlignment of AminoAcid
779         * residues. This method is only valid for protein structure alignments.
780         *
781         * @param msta
782         *            Multiple Structure Alignment
783         * @return MultipleSequenceAlignment of protein sequences
784         * @throws CompoundNotFoundException
785         */
786        public static MultipleSequenceAlignment<ProteinSequence, AminoAcidCompound> toProteinMSA(
787                        MultipleAlignment msta) throws CompoundNotFoundException {
788
789                // Check that the alignment is of protein structures
790                Group g = msta.getAtomArrays().get(0)[0].getGroup();
791                if (!(g instanceof AminoAcid)) {
792                        throw new IllegalArgumentException(
793                                        "Cannot convert to multiple sequence alignment: "
794                                                        + "the structures aligned are not proteins");
795                }
796
797                MultipleSequenceAlignment<ProteinSequence, AminoAcidCompound> msa = new MultipleSequenceAlignment<ProteinSequence, AminoAcidCompound>();
798
799                Map<String, Integer> uniqueID = new HashMap<String, Integer>();
800                List<String> seqs = getSequenceAlignment(msta);
801                for (int i = 0; i < msta.size(); i++) {
802                        // Make sure the identifiers are unique (required by AccessionID)
803                        String id = msta.getStructureIdentifier(i).toString();
804                        if (uniqueID.containsKey(id)) {
805                                uniqueID.put(id, uniqueID.get(id) + 1);
806                                id += "_" + uniqueID.get(id);
807                        } else
808                                uniqueID.put(id, 1);
809
810                        AccessionID acc = new AccessionID(id);
811                        ProteinSequence pseq = new ProteinSequence(seqs.get(i));
812                        pseq.setAccession(acc);
813                        msa.addAlignedSequence(pseq);
814                }
815                return msa;
816        }
817
818        /**
819         * Calculate the RMSD matrix of a MultipleAlignment, that is, entry (i,j) of
820         * the matrix contains the RMSD between structures i and j.
821         *
822         * @param msa
823         *            Multiple Structure Alignment
824         * @return Matrix of RMSD with size the number of structures squared
825         */
826        public static Matrix getRMSDMatrix(MultipleAlignment msa) {
827
828                Matrix rmsdMat = new Matrix(msa.size(), msa.size());
829                List<Atom[]> superposed = transformAtoms(msa);
830
831                for (int i = 0; i < msa.size(); i++) {
832                        for (int j = i; j < msa.size(); j++) {
833                                if (i == j)
834                                        rmsdMat.set(i, j, 0.0);
835                                List<Atom[]> compared = new ArrayList<Atom[]>();
836                                compared.add(superposed.get(i));
837                                compared.add(superposed.get(j));
838                                double rmsd = MultipleAlignmentScorer.getRMSD(compared);
839                                rmsdMat.set(i, j, rmsd);
840                                rmsdMat.set(j, i, rmsd);
841                        }
842                }
843                return rmsdMat;
844        }
845
846        /**
847         * Calculate a phylogenetic tree of the MultipleAlignment using Kimura
848         * distances and the Neighbor Joining algorithm from forester.
849         *
850         * @param msta
851         *            MultipleAlignment of protein structures
852         * @return Phylogeny phylogenetic tree
853         * @throws CompoundNotFoundException
854         * @throws IOException
855         */
856        public static Phylogeny getKimuraTree(MultipleAlignment msta)
857                        throws CompoundNotFoundException, IOException {
858                MultipleSequenceAlignment<ProteinSequence, AminoAcidCompound> msa = MultipleAlignmentTools
859                                .toProteinMSA(msta);
860                BasicSymmetricalDistanceMatrix distmat = (BasicSymmetricalDistanceMatrix) DistanceMatrixCalculator
861                                .kimuraDistance(msa);
862                Phylogeny tree = TreeConstructor.distanceTree(distmat,
863                                TreeConstructorType.NJ);
864                tree.setName("Kimura Tree");
865                return tree;
866        }
867
868        /**
869         * Calculate a phylogenetic tree of the MultipleAlignment using
870         * dissimilarity scores (DS), based in SDM Substitution Matrix (ideal for
871         * distantly related proteins, structure-derived) and the Neighbor Joining
872         * algorithm from forester.
873         *
874         * @param msta
875         *            MultipleAlignment of protein structures
876         * @return Phylogeny phylogenetic tree
877         * @throws CompoundNotFoundException
878         * @throws IOException
879         */
880        public static Phylogeny getHSDMTree(MultipleAlignment msta)
881                        throws CompoundNotFoundException, IOException {
882                MultipleSequenceAlignment<ProteinSequence, AminoAcidCompound> msa = MultipleAlignmentTools
883                                .toProteinMSA(msta);
884                BasicSymmetricalDistanceMatrix distmat = (BasicSymmetricalDistanceMatrix) DistanceMatrixCalculator
885                                .dissimilarityScore(msa, SubstitutionMatrixHelper.getAminoAcidSubstitutionMatrix("PRLA000102"));
886                Phylogeny tree = TreeConstructor.distanceTree(distmat,
887                                TreeConstructorType.NJ);
888                tree.setName("HSDM Tree");
889                return tree;
890        }
891
892        /**
893         * Calculate a phylogenetic tree of the MultipleAlignment using RMSD
894         * distances and the Neighbor Joining algorithm from forester.
895         *
896         * @param msta
897         *            MultipleAlignment of protein structures
898         * @return Phylogeny phylogenetic tree
899         * @throws CompoundNotFoundException
900         */
901        public static Phylogeny getStructuralTree(MultipleAlignment msta) {
902                double[][] rmsdMat = MultipleAlignmentTools.getRMSDMatrix(msta)
903                                .getArray();
904                BasicSymmetricalDistanceMatrix rmsdDist = (BasicSymmetricalDistanceMatrix) DistanceMatrixCalculator
905                                .structuralDistance(rmsdMat, 1, 5, 0.4);
906                // Set the identifiers of the matrix
907                Map<String, Integer> alreadySeen = new HashMap<String, Integer>();
908                for (int i = 0; i < msta.size(); i++) {
909                        // Make sure the identifiers are unique
910                        String id = msta.getStructureIdentifier(i).toString();
911                        if (alreadySeen.containsKey(id)) {
912                                alreadySeen.put(id, alreadySeen.get(id) + 1);
913                                id += "_" + alreadySeen.get(id);
914                        } else
915                                alreadySeen.put(id, 1);
916                        rmsdDist.setIdentifier(i, id);
917                }
918                Phylogeny tree = TreeConstructor.distanceTree(rmsdDist,
919                                TreeConstructorType.NJ);
920                tree.setName("Structural Tree");
921                return tree;
922        }
923
924}