Source code

001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.structure.align.multiple.util;
022
023import java.io.IOException;
024import java.util.ArrayList;
025import java.util.Arrays;
026import java.util.Collections;
027import java.util.Comparator;
028import java.util.HashMap;
029import java.util.List;
030import java.util.Map;
031import java.util.SortedSet;
032import java.util.TreeSet;
033
034import javax.vecmath.Matrix4d;
035
036import org.biojava.nbio.core.alignment.matrices.SubstitutionMatrixHelper;
037import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
038import org.biojava.nbio.core.sequence.AccessionID;
039import org.biojava.nbio.core.sequence.MultipleSequenceAlignment;
040import org.biojava.nbio.core.sequence.ProteinSequence;
041import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
042import org.biojava.nbio.phylo.DistanceMatrixCalculator;
043import org.biojava.nbio.phylo.TreeConstructor;
044import org.biojava.nbio.phylo.TreeConstructorType;
045import org.biojava.nbio.structure.AminoAcid;
046import org.biojava.nbio.structure.Atom;
047import org.biojava.nbio.structure.Calc;
048import org.biojava.nbio.structure.Chain;
049import org.biojava.nbio.structure.Group;
050import org.biojava.nbio.structure.PDBHeader;
051import org.biojava.nbio.structure.Structure;
052import org.biojava.nbio.structure.StructureException;
053import org.biojava.nbio.structure.StructureIdentifier;
054import org.biojava.nbio.structure.StructureImpl;
055import org.biojava.nbio.structure.StructureTools;
056import org.biojava.nbio.structure.align.multiple.Block;
057import org.biojava.nbio.structure.align.multiple.BlockSet;
058import org.biojava.nbio.structure.align.multiple.MultipleAlignment;
059import org.biojava.nbio.structure.align.util.AlignmentTools;
060import org.biojava.nbio.structure.cluster.SubunitCluster;
061import org.biojava.nbio.structure.jama.Matrix;
062import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
063import org.forester.phylogeny.Phylogeny;
064
065/**
066 * Utility functions for working with {@link MultipleAlignment}.
067 * <p>
068 * Supported functions:
069 * <ul>
070 * <li>Obtain the alignment as sequence strings
071 * <li>Map from sequence alignment position to structure Atom
072 * <li>Map from sequence alignment position to Block number
073 * <li>Transform the aligned Atoms of a MultipleAlignment
074 * <li>Get all the core alignment positions of the alignment
075 * <li>Calculate the average residue distance of all aligned positions
076 * <li>Sort Blocks in a MultipleAlignment by a specified row
077 * <li>Convert a MultipleAlignment to a MultipleSequenceAlignment
078 * </ul>
079 *
080 * @author Spencer Bliven
081 * @author Aleix Lafita
082 * @since 4.1.0
083 *
084 */
085public class MultipleAlignmentTools {
086
087        /**
088         * Calculate the sequence alignment Strings for the whole alignment. This
089         * method creates a sequence alignment where aligned residues are in
090         * uppercase and unaligned residues are in lowercase, thus providing a more
091         * compact way to represent the alignment.
092         * <p>
093         * Blocks are concatenated in the order returned by
094         * {@link MultipleAlignment#getBlocks()}, so sequences may not be
095         * sequential. Gaps are represented by '-'. Separation between different
096         * Blocks is indicated by a gap in all positions, meaning that there is a
097         * possible discontinuity.
098         *
099         * @param alignment
100         *            input MultipleAlignment
101         * @param mapSeqToStruct
102         *            provides a link from the sequence alignment position to the
103         *            structure alignment position. Specially designed for the GUI.
104         *            Has to be initialized previously and will be overwritten.
105         * @return a string for each row in the alignment, giving the 1-letter code
106         *         for each aligned residue.
107         */
108        public static List<String> getSequenceAlignment(
109                        MultipleAlignment alignment, final List<Integer> mapSeqToStruct) {
110
111                // Initialize sequence variables
112                List<String> alnSequences = new ArrayList<>();
113                for (int str = 0; str < alignment.size(); str++)
114                        alnSequences.add("");
115                mapSeqToStruct.clear();
116                List<Atom[]> atoms = alignment.getAtomArrays();
117                int globalPos = -1;
118
119                // Initialize helper variables in constucting the sequence alignment
120                List<SortedSet<Integer>> freePool = new ArrayList<>();
121                List<SortedSet<Integer>> blockStarts = new ArrayList<>();
122                List<List<Integer>> aligned = new ArrayList<>();
123
124                // Generate freePool residues from the ones not aligned
125                for (int i = 0; i < alignment.size(); i++) {
126                        List<Integer> residues = new ArrayList<>();
127                        freePool.add(new TreeSet<Integer>());
128                        blockStarts.add(new TreeSet<Integer>());
129                        for (BlockSet bs : alignment.getBlockSets()) {
130                                for (Block b : bs.getBlocks()) {
131                                        boolean first = true;
132                                        for (int l = 0; l < b.length(); l++) {
133                                                Integer residue = b.getAlignRes().get(i).get(l);
134                                                if (residue != null) {
135                                                        if (first)
136                                                                blockStarts.get(i).add(residue);
137                                                        residues.add(residue);
138                                                        first = false;
139                                                }
140                                        }
141                                }
142                        }
143                        aligned.add(residues);
144                }
145                // Add any residue not aligned to the free pool for every structure
146                for (int i = 0; i < alignment.size(); i++) {
147                        for (int k = 0; k < atoms.get(i).length; k++) {
148                                if (!aligned.get(i).contains(k))
149                                        freePool.get(i).add(k);
150                        }
151                }
152
153                for (int b = 0; b < alignment.getBlocks().size(); b++) {
154                        if (b != 0) {
155                                // Add a gap to all structures to separate visually the Blocks
156                                for (int str = 0; str < alignment.size(); str++)
157                                        alnSequences.set(str, alnSequences.get(str).concat("-"));
158                                mapSeqToStruct.add(-1); // means unaligned position
159                        }
160                        // Store the previous position added to the sequence alignment
161                        int[] previousPos = new int[alignment.size()];
162                        Arrays.fill(previousPos, -1);
163                        // Store provisional characters
164                        char[] provisionalChar = new char[alignment.size()];
165                        Arrays.fill(provisionalChar, '-');
166
167                        for (int pos = 0; pos < alignment.getBlocks().get(b).length(); pos++) {
168                                globalPos++;
169                                boolean gaps = true; // true if consecutive with the previous
170                                while (gaps) {
171                                        gaps = false;
172                                        // Loop through all the structures
173                                        for (int str = 0; str < alignment.size(); str++) {
174                                                // If it is the first position or before it was null
175                                                if (previousPos[str] == -1) {
176                                                        Integer residue = alignment.getBlocks().get(b)
177                                                                        .getAlignRes().get(str).get(pos);
178                                                        if (residue == null)
179                                                                provisionalChar[str] = '-';
180                                                        else {
181                                                                Atom a = atoms.get(str)[residue];
182                                                                String group = a.getGroup().getPDBName();
183                                                                provisionalChar[str] = StructureTools
184                                                                                .get1LetterCode(group);
185                                                        }
186                                                } else {
187                                                        Integer residue = alignment.getBlocks().get(b)
188                                                                        .getAlignRes().get(str).get(pos);
189                                                        int nextPos = previousPos[str] + 1;
190                                                        if (residue == null) {
191                                                                if (freePool.get(str).contains(nextPos)) {
192                                                                        Atom a = atoms.get(str)[nextPos];
193                                                                        String g = a.getGroup().getPDBName();
194                                                                        char aa = StructureTools.get1LetterCode(g);
195                                                                        provisionalChar[str] = Character
196                                                                                        .toLowerCase(aa);
197                                                                } else
198                                                                        provisionalChar[str] = '-';
199                                                        } else if (nextPos == residue) {
200                                                                Atom a = atoms.get(str)[nextPos];
201                                                                String group = a.getGroup().getPDBName();
202                                                                provisionalChar[str] = StructureTools
203                                                                                .get1LetterCode(group);
204                                                        } else {
205                                                                // This means non-consecutive
206                                                                provisionalChar[str] = ' ';
207                                                                gaps = true;
208                                                        }
209                                                }
210                                        }// End all structure analysis
211
212                                        if (gaps) {
213                                                for (int str = 0; str < alignment.size(); str++) {
214                                                        if (provisionalChar[str] == ' ') {
215                                                                // It means this residue was non-consecutive
216                                                                Atom a = atoms.get(str)[previousPos[str] + 1];
217                                                                String group = a.getGroup().getPDBName();
218                                                                char aa = StructureTools.get1LetterCode(group);
219                                                                alnSequences
220                                                                                .set(str,
221                                                                                                alnSequences
222                                                                                                                .get(str)
223                                                                                                                .concat(String.valueOf(Character
224                                                                                                                                                .toLowerCase(aa))) );
225                                                                previousPos[str]++;
226                                                        } else {
227                                                                // Insert a gap otherwise
228                                                                alnSequences.set(str, alnSequences.get(str)
229                                                                                .concat("-"));
230                                                        }
231                                                }
232                                                mapSeqToStruct.add(-1); // unaligned position
233                                        } else {
234                                                // Add provisional and update the indices
235                                                for (int str = 0; str < alignment.size(); str++) {
236                                                        alnSequences.set(
237                                                                        str,
238                                                                        alnSequences.get(str).concat(
239                                                                                        String.valueOf(provisionalChar[str])));
240
241                                                        if (provisionalChar[str] != '-') {
242                                                                if (alignment.getBlocks().get(b).getAlignRes()
243                                                                                .get(str).get(pos) == null) {
244                                                                        previousPos[str]++;
245                                                                } else {
246                                                                        previousPos[str] = alignment.getBlocks()
247                                                                                        .get(b).getAlignRes().get(str)
248                                                                                        .get(pos);
249                                                                }
250                                                        }
251                                                }
252                                                mapSeqToStruct.add(globalPos); // alignment index
253                                        }
254                                }
255                        } // All positions in the Block considered so far
256
257                        // Calculate the index of the next Block for every structure
258                        int[] blockEnds = new int[alignment.size()];
259                        for (int str = 0; str < alignment.size(); str++) {
260                                for (int res : blockStarts.get(str)) {
261                                        if (previousPos[str] > res)
262                                                blockEnds[str] = res;
263                                        else {
264                                                blockEnds[str] = res;
265                                                break;
266                                        }
267                                }
268                        }
269
270                        // Add the unaligned residues in between Blocks (lowercase)
271                        boolean allGaps = false; // true means no more residues to add
272                        while (!allGaps) {
273                                allGaps = true;
274                                for (int str = 0; str < alignment.size(); str++) {
275                                        if (previousPos[str] + 1 < blockEnds[str]) {
276                                                Atom a = atoms.get(str)[previousPos[str] + 1];
277                                                String group = a.getGroup().getPDBName();
278                                                char letter = StructureTools.get1LetterCode(group);
279
280                                                provisionalChar[str] = Character.toLowerCase(letter);
281                                                previousPos[str]++;
282                                                allGaps = false;
283                                        } else
284                                                provisionalChar[str] = '-';
285                                }
286                                if (!allGaps) {
287                                        for (int str = 0; str < alignment.size(); str++) {
288                                                alnSequences.set(
289                                                                str,
290                                                                alnSequences.get(str).concat(
291                                                                                String.valueOf(provisionalChar[str])) );
292                                        }
293                                        mapSeqToStruct.add(-1); // unaligned position
294                                }
295                        }
296                }
297                return alnSequences;
298        }
299
300        /**
301         * Calculate the sequence alignment Strings for the whole alignment. This
302         * method creates a sequence alignment where aligned residues are in
303         * uppercase and unaligned residues are in lowercase, thus providing a more
304         * compact way to represent the alignment.
305         * <p>
306         * Blocks are concatenated in the order returned by
307         * {@link MultipleAlignment#getBlocks()}, so sequences may not be
308         * sequential. Gaps are represented by '-'. Separation between different
309         * Blocks is indicated by a gap in all positions, meaning that there is a
310         * possible discontinuity.
311         *
312         * @param msa
313         *            input MultipleAlignment
314         * @return String for each row in the alignment, giving the 1-letter code
315         *         for each aligned residue.
316         */
317        public static List<String> getSequenceAlignment(MultipleAlignment msa) {
318                return getSequenceAlignment(msa, new ArrayList<Integer>());
319        }
320
321        /**
322         * Calculate the sequence alignment Strings for the alignment Blocks in an
323         * alignment. This method creates a sequence alignment where all residues
324         * are in uppercase and a residue alone with gaps in all the other
325         * structures represents unaligned residues. Because of this latter
326         * constraint only the residues within the Blocks are represented, for a
327         * more compact alignment. For a sequence alignment of the full protein use
328         * {@link #getSequenceAlignment(MultipleAlignment)}.
329         * <p>
330         * Blocks are concatenated in the order returned by
331         * {@link MultipleAlignment#getBlocks()}, so sequences may not be
332         * sequential. Gaps between blocks are omitted, while gaps within blocks are
333         * represented by '-'. Separation between different Blocks is indicated by a
334         * gap in all positions, meaning that there is something unaligned
335         * inbetween.
336         *
337         * @param alignment
338         *            input MultipleAlignment
339         * @param mapSeqToStruct
340         *            provides a link from the sequence alignment position to the
341         *            structure alignment position. Specially designed for the GUI.
342         *            Has to be initialized previously and will be overwritten.
343         * @return a string for each row in the alignment, giving the 1-letter code
344         *         for each aligned residue.
345         */
346        public static List<String> getBlockSequenceAlignment(
347                        MultipleAlignment alignment, List<Integer> mapSeqToStruct) {
348
349                // Initialize sequence variables
350                List<String> alnSequences = new ArrayList<>();
351                for (int str = 0; str < alignment.size(); str++)
352                        alnSequences.add("");
353                mapSeqToStruct.clear();
354                List<Atom[]> atoms = alignment.getAtomArrays();
355                int globalPos = -1;
356
357                // Loop through all the alignment Blocks in the order given
358                for (int b = 0; b < alignment.getBlocks().size(); b++) {
359                        if (b != 0) {
360                                // Add a gap to all structures to separate Blocks
361                                for (int str = 0; str < alignment.size(); str++)
362                                        alnSequences.set(str, alnSequences.get(str).concat("-"));
363                                mapSeqToStruct.add(-1); // means unaligned position
364                        }
365
366                        // Store the previous position added to the sequence alignment
367                        int[] previousPos = new int[alignment.size()];
368                        Arrays.fill(previousPos, -1);
369                        // Store provisional characters
370                        char[] provisionalChar = new char[alignment.size()];
371                        Arrays.fill(provisionalChar, '-');
372
373                        for (int pos = 0; pos < alignment.getBlocks().get(b).length(); pos++) {
374                                globalPos++;
375                                boolean gaps = true;
376                                while (gaps) {
377                                        gaps = false;
378                                        // Loop through all the structures
379                                        for (int str = 0; str < alignment.size(); str++) {
380                                                // If it is the first position or before it was null
381                                                if (previousPos[str] == -1) {
382                                                        Integer residue = alignment.getBlocks().get(b)
383                                                                        .getAlignRes().get(str).get(pos);
384                                                        if (residue == null)
385                                                                provisionalChar[str] = '-';
386                                                        else {
387                                                                Atom a = atoms.get(str)[residue];
388                                                                String g = a.getGroup().getPDBName();
389                                                                char aa = StructureTools.get1LetterCode(g);
390                                                                provisionalChar[str] = aa;
391                                                        }
392                                                } else {
393                                                        Integer residue = alignment.getBlocks().get(b)
394                                                                        .getAlignRes().get(str).get(pos);
395                                                        if (residue == null)
396                                                                provisionalChar[str] = '-';
397                                                        else if (previousPos[str] + 1 == residue) {
398                                                                Atom a = atoms.get(str)[residue];
399                                                                String g = a.getGroup().getPDBName();
400                                                                char aa = StructureTools.get1LetterCode(g);
401                                                                provisionalChar[str] = aa;
402                                                        } else {
403                                                                provisionalChar[str] = ' ';
404                                                                gaps = true;
405                                                        }
406                                                }
407                                        }// End all structures analysis
408
409                                        if (gaps) {
410                                                for (int str = 0; str < alignment.size(); str++) {
411                                                        if (provisionalChar[str] == ' ') {
412                                                                // It means this residue was non-consecutive
413                                                                for (int s2 = 0; s2 < alignment.size(); s2++) {
414                                                                        if (str == s2) {
415                                                                                int next = previousPos[str] + 1;
416                                                                                Atom a = atoms.get(s2)[next];
417                                                                                String g = a.getGroup().getPDBName();
418                                                                                char aa = StructureTools
419                                                                                                .get1LetterCode(g);
420                                                                                alnSequences.set(
421                                                                                                s2,
422                                                                                                alnSequences.get(s2).concat(
423                                                                                                                String.valueOf(aa)) );
424                                                                        } else {
425                                                                                alnSequences.set(s2,
426                                                                                                alnSequences.get(s2)
427                                                                                                                .concat("-"));
428                                                                        }
429                                                                }
430                                                                mapSeqToStruct.add(-1); // unaligned
431                                                                previousPos[str] += 1;
432                                                        }
433                                                }
434                                        } else { // Append the provisional and update the indices
435                                                for (int str = 0; str < alignment.size(); str++) {
436                                                        alnSequences.set(
437                                                                        str,
438                                                                        alnSequences.get(str).concat(
439                                                                                        String.valueOf(provisionalChar[str])) );
440                                                        if (provisionalChar[str] != '-') {
441                                                                previousPos[str] = alignment.getBlocks().get(b)
442                                                                                .getAlignRes().get(str).get(pos);
443                                                        }
444                                                }
445                                                mapSeqToStruct.add(globalPos);
446                                        }
447                                }
448                        }
449                }
450                return alnSequences;
451        }
452
453        /**
454         * Calculate the sequence alignment Strings for the alignment Blocks in an
455         * alignment. This method creates a sequence alignment where all residues
456         * are in uppercase and a residue alone with gaps in all the other
457         * structures represents unaligned residues. Because of this latter
458         * constraint only the residues within the Blocks are represented, for a
459         * more compact alignment. For a sequence alignment of the full protein use
460         * {@link #getSequenceAlignment(MultipleAlignment)}.
461         * <p>
462         * Blocks are concatenated in the order returned by
463         * {@link MultipleAlignment#getBlocks()}, so sequences may not be
464         * sequential. Gaps between blocks are omitted, while gaps within blocks are
465         * represented by '-'. Separation between different Blocks is indicated by a
466         * gap in all positions, meaning that there is something unaligned
467         * inbetween.
468         *
469         * @param ma
470         *            input MultipleAlignment
471         * @return String for each row in the alignment, giving the 1-letter code
472         *         for each aligned residue.
473         */
474        public static List<String> getBlockSequenceAlignment(MultipleAlignment ma) {
475                return getBlockSequenceAlignment(ma, new ArrayList<Integer>());
476        }
477
478        /**
479         * Returns the Atom of the specified structure that is aligned in the
480         * sequence alignment position specified.
481         *
482         * @param msa
483         *            the MultipleAlignment object from where the sequence alignment
484         *            has been generated
485         * @param mapSeqToStruct
486         *            the mapping between sequence and structure generated with the
487         *            sequence alignment
488         * @param str
489         *            the structure index of the alignment (row)
490         * @param sequencePos
491         *            the sequence alignment position (column)
492         * @return Atom the atom in that position or null if there is a gap
493         */
494        public static Atom getAtomForSequencePosition(MultipleAlignment msa,
495                        List<Integer> mapSeqToStruct, int str, int sequencePos) {
496
497                int seqPos = mapSeqToStruct.get(sequencePos);
498
499                // Check if the position selected is an aligned position
500                if (seqPos == -1)
501                        return null;
502                else {
503                        Atom a = null;
504                        // Calculate the corresponding structure position
505                        int sum = 0;
506                        for (Block b : msa.getBlocks()) {
507                                if (sum + b.length() <= seqPos) {
508                                        sum += b.length();
509                                        continue;
510                                } else {
511                                        for (Integer p : b.getAlignRes().get(str)) {
512                                                if (sum == seqPos) {
513                                                        if (p != null) {
514                                                                a = msa.getAtomArrays().get(str)[p];
515                                                        }
516                                                        break;
517                                                }
518                                                sum++;
519                                        }
520                                        break;
521                                }
522                        }
523                        return a;
524                }
525        }
526
527        /**
528         * Returns the block number of a specified position in the sequence
529         * alignment, given the mapping from structure to function.
530         *
531         * @param multAln
532         *            the MultipleAlignment object from where the sequence alignment
533         *            has been generated.
534         * @param mapSeqToStruct
535         *            the mapping between sequence and structure generated with the
536         *            sequence alignment
537         * @param sequencePos
538         *            the position in the sequence alignment (column)
539         * @return int the block index, or -1 if the position is not aligned
540         */
541        public static int getBlockForSequencePosition(MultipleAlignment multAln,
542                        List<Integer> mapSeqToStruct, int sequencePos) {
543
544                int seqPos = mapSeqToStruct.get(sequencePos);
545                // Check if the position selected is an aligned position
546                if (seqPos == -1)
547                        return -1;
548                else {
549                        // Calculate the corresponding block (by iterating all Blocks)
550                        int sum = 0;
551                        int block = 0;
552                        for (Block b : multAln.getBlocks()) {
553                                if (sum + b.length() <= seqPos) {
554                                        sum += b.length();
555                                        block++;
556                                        continue;
557                                } else
558                                        break;
559                        }
560                        return block;
561                }
562        }
563
564        /**
565         * The average residue distance Matrix contains the average distance from
566         * each residue to all other residues aligned with it.
567         * <p>
568         * Complexity: T(n,l) = O(l*n^2), if n=number of structures and l=alignment
569         * length.
570         *
571         * @param msa
572         *            MultipleAlignment
573         * @return Matrix containing all average residue distances
574         */
575        public static Matrix getAverageResidueDistances(MultipleAlignment msa) {
576                List<Atom[]> transformed = transformAtoms(msa);
577                return getAverageResidueDistances(transformed);
578        }
579
580        /**
581         * The average residue distance Matrix contains the average distance from
582         * each residue to all other residues aligned with it.
583         * <p>
584         * Complexity: T(n,l) = O(l*n^2), if n=number of structures and l=alignment
585         * length.
586         *
587         * @param transformed
588         *            List of Atom arrays containing only the aligned atoms of each
589         *            structure, or null if there is a gap.
590         * @return Matrix containing all average residue distances. Entry -1 means
591         *         there is a gap in the position.
592         */
593        public static Matrix getAverageResidueDistances(List<Atom[]> transformed) {
594
595                int size = transformed.size();
596                int length = transformed.get(0).length;
597                Matrix resDist = new Matrix(size, length, -1);
598
599                // Calculate the average residue distances
600                for (int r1 = 0; r1 < size; r1++) {
601                        for (int c = 0; c < transformed.get(r1).length; c++) {
602                                Atom refAtom = transformed.get(r1)[c];
603                                if (refAtom == null)
604                                        continue;
605
606                                for (int r2 = r1 + 1; r2 < size; r2++) {
607                                        Atom atom = transformed.get(r2)[c];
608                                        if (atom != null) {
609                                                double distance = Calc.getDistance(refAtom, atom);
610
611                                                if (resDist.get(r1, c) == -1) {
612                                                        resDist.set(r1, c, 1 + distance);
613                                                } else {
614                                                        resDist.set(r1, c, resDist.get(r1, c) + distance);
615                                                }
616
617                                                if (resDist.get(r2, c) == -1) {
618                                                        resDist.set(r2, c, 1 + distance);
619                                                } else {
620                                                        resDist.set(r2, c, resDist.get(r2, c) + distance);
621                                                }
622                                        }
623                                }
624                        }
625                }
626
627                for (int c = 0; c < length; c++) {
628                        int nonNullRes = 0;
629                        for (int r = 0; r < size; r++) {
630                                if (resDist.get(r, c) != -1)
631                                        nonNullRes++;
632                        }
633                        for (int r = 0; r < size; r++) {
634                                if (resDist.get(r, c) != -1) {
635                                        resDist.set(r, c, resDist.get(r, c) / nonNullRes);
636                                }
637                        }
638                }
639                return resDist;
640        }
641
642        /**
643         * Transforms atoms according to the superposition stored in the alignment.
644         * <p>
645         * For each structure in the alignment, returns an atom for each
646         * representative atom in the aligned columns, omitting unaligned residues
647         * (i.e. an array of length <code>alignment.length()</code> ).
648         * <p>
649         * All blocks are concatenated together, so Atoms may not appear in the same
650         * order as in their parent structure. If the alignment blocks contain null
651         * residues (gaps), then the returned array will also contain null Atoms in
652         * the same positions.
653         *
654         * @param alignment
655         *            MultipleAlignment
656         * @return List of Atom arrays of only the aligned atoms of every structure
657         *         (null Atom if a gap position)
658         */
659        public static List<Atom[]> transformAtoms(MultipleAlignment alignment) {
660                if (alignment.getEnsemble() == null) {
661                        throw new NullPointerException("No ensemble set for this alignment");
662                }
663
664                List<Atom[]> atomArrays = alignment.getAtomArrays();
665                List<Atom[]> transformed = new ArrayList<>(atomArrays.size());
666
667                // Loop through structures
668                for (int i = 0; i < atomArrays.size(); i++) {
669
670                        Matrix4d transform = null;
671                        Atom[] curr = atomArrays.get(i); // all CA atoms from structure
672
673                        // Concatenated list of all blocks for this structure
674                        Atom[] transformedAtoms = new Atom[alignment.length()];
675                        int transformedAtomsLength = 0;
676
677                        // Each blockset gets transformed independently
678                        for (BlockSet bs : alignment.getBlockSets()) {
679
680                                Atom[] blocksetAtoms = new Atom[bs.length()];
681                                int blockPos = 0;
682
683                                for (Block blk : bs.getBlocks()) {
684                                        if (blk.size() != atomArrays.size()) {
685                                                throw new IllegalStateException(String.format(
686                                                                "Mismatched block size. Expected %d "
687                                                                                + "structures, found %d.",
688                                                                atomArrays.size(), blk.size()));
689                                        }
690                                        // Extract aligned atoms
691                                        for (int j = 0; j < blk.length(); j++) {
692                                                Integer alignedPos = blk.getAlignRes().get(i).get(j);
693                                                if (alignedPos != null) {
694                                                        blocksetAtoms[blockPos] = (Atom) curr[alignedPos]
695                                                                        .clone();
696                                                }
697                                                blockPos++;
698                                        }
699                                }
700
701                                // Transform according to the blockset or alignment matrix
702                                Matrix4d blockTrans = null;
703                                if (bs.getTransformations() != null)
704                                        blockTrans = bs.getTransformations().get(i);
705                                if (blockTrans == null) {
706                                        blockTrans = transform;
707                                }
708
709                                for (Atom a : blocksetAtoms) {
710                                        if (a != null)
711                                                Calc.transform(a, blockTrans);
712                                        transformedAtoms[transformedAtomsLength] = a;
713                                        transformedAtomsLength++;
714                                }
715                        }
716                        assert (transformedAtomsLength == alignment.length());
717
718                        transformed.add(transformedAtoms);
719                }
720                return transformed;
721        }
722
723        /**
724         * Calculate a List of alignment indicies that correspond to the core of a
725         * Block, which means that all structures have a residue in that position.
726         *
727         * @param block
728         *            alignment Block
729         * @return List of positions in the core of the alignment
730         */
731        public static List<Integer> getCorePositions(Block block) {
732
733                List<Integer> corePositions = new ArrayList<>();
734
735                for (int col = 0; col < block.length(); col++) {
736                        boolean core = true;
737                        for (int str = 0; str < block.size(); str++) {
738                                if (block.getAlignRes().get(str).get(col) == null) {
739                                        core = false;
740                                        break;
741                                }
742                        }
743                        if (core)
744                                corePositions.add(col);
745                }
746                return corePositions;
747        }
748
749        /**
750         * Sort blocks so that the specified row is in sequential order. The sort
751         * happens in place.
752         *
753         * @param blocks
754         *            List of blocks
755         * @param sortedIndex
756         *            Index of the row to be sorted
757         */
758        public static void sortBlocks(List<Block> blocks, final int sortedIndex) {
759                Collections.sort(blocks, new Comparator<Block>() {
760                        @Override
761                        public int compare(Block o1, Block o2) {
762                                // Compare the first non-null residue of each block
763                                List<Integer> alignres1 = o1.getAlignRes().get(sortedIndex);
764                                List<Integer> alignres2 = o2.getAlignRes().get(sortedIndex);
765                                Integer res1 = null;
766                                Integer res2 = null;
767                                for (Integer r : alignres1) {
768                                        if (r != null) {
769                                                res1 = r;
770                                                break;
771                                        }
772                                }
773                                for (Integer r : alignres2) {
774                                        if (r != null) {
775                                                res2 = r;
776                                                break;
777                                        }
778                                }
779                                return res1.compareTo(res2);
780                        }
781                });
782        }
783
784        /**
785         * Convert a MultipleAlignment into a MultipleSequenceAlignment of AminoAcid
786         * residues. This method is only valid for protein structure alignments.
787         *
788         * @param msta
789         *            Multiple Structure Alignment
790         * @return MultipleSequenceAlignment of protein sequences
791         * @throws CompoundNotFoundException
792         */
793        public static MultipleSequenceAlignment<ProteinSequence, AminoAcidCompound> toProteinMSA(
794                        MultipleAlignment msta) throws CompoundNotFoundException {
795
796                // Check that the alignment is of protein structures
797                Group g = msta.getAtomArrays().get(0)[0].getGroup();
798                if (!(g instanceof AminoAcid)) {
799                        throw new IllegalArgumentException(
800                                        "Cannot convert to multiple sequence alignment: "
801                                                        + "the structures aligned are not proteins");
802                }
803
804                MultipleSequenceAlignment<ProteinSequence, AminoAcidCompound> msa = new MultipleSequenceAlignment<>();
805
806                Map<String, Integer> uniqueID = new HashMap<>();
807                List<String> seqs = getSequenceAlignment(msta);
808                for (int i = 0; i < msta.size(); i++) {
809                        // Make sure the identifiers are unique (required by AccessionID)
810                        String id = msta.getStructureIdentifier(i).toString();
811                        if (uniqueID.containsKey(id)) {
812                                uniqueID.put(id, uniqueID.get(id) + 1);
813                                id += "_" + uniqueID.get(id);
814                        } else
815                                uniqueID.put(id, 1);
816
817                        AccessionID acc = new AccessionID(id);
818                        ProteinSequence pseq = new ProteinSequence(seqs.get(i));
819                        pseq.setAccession(acc);
820                        msa.addAlignedSequence(pseq);
821                }
822                return msa;
823        }
824
825        public static Structure toMultimodelStructure(MultipleAlignment multAln, List<Atom[]> transformedAtoms) throws StructureException {
826                PDBHeader header = new PDBHeader();
827                String title = multAln.getEnsemble().getAlgorithmName() + " V."
828                                + multAln.getEnsemble().getVersion() + " : ";
829
830                for (StructureIdentifier name : multAln.getEnsemble()
831                                .getStructureIdentifiers()) {
832                        title += name.getIdentifier() + " ";
833                }
834                Structure artificial = getAlignedStructure(transformedAtoms);
835
836                artificial.setPDBHeader(header);
837                header.setTitle(title);
838                return artificial;
839        }
840
841        /**
842         * Get an artificial Structure containing a different model for every
843         * input structure, so that the alignment result can be viewed in Jmol.
844         * The Atoms have to be rotated beforehand.
845         *
846         * @param atomArrays an array of Atoms for every aligned structure
847         * @return a structure object containing a set of models,
848         *                      one for each input array of Atoms.
849         * @throws StructureException
850         */
851        public static final Structure getAlignedStructure(List<Atom[]> atomArrays)
852                        throws StructureException {
853
854                Structure s = new StructureImpl();
855                for (int i=0; i<atomArrays.size(); i++){
856                        List<Chain> model = AlignmentTools.getAlignedModel(atomArrays.get(i));
857                        s.addModel(model);
858                }
859                return s;
860        }
861
862        /**
863         * Calculate the RMSD matrix of a MultipleAlignment, that is, entry (i,j) of
864         * the matrix contains the RMSD between structures i and j.
865         *
866         * @param msa
867         *            Multiple Structure Alignment
868         * @return Matrix of RMSD with size the number of structures squared
869         */
870        public static Matrix getRMSDMatrix(MultipleAlignment msa) {
871
872                Matrix rmsdMat = new Matrix(msa.size(), msa.size());
873                List<Atom[]> superposed = transformAtoms(msa);
874
875                for (int i = 0; i < msa.size(); i++) {
876                        for (int j = i; j < msa.size(); j++) {
877                                if (i == j)
878                                        rmsdMat.set(i, j, 0.0);
879                                List<Atom[]> compared = new ArrayList<>();
880                                compared.add(superposed.get(i));
881                                compared.add(superposed.get(j));
882                                double rmsd = MultipleAlignmentScorer.getRMSD(compared);
883                                rmsdMat.set(i, j, rmsd);
884                                rmsdMat.set(j, i, rmsd);
885                        }
886                }
887                return rmsdMat;
888        }
889
890        /**
891         * Calculate a phylogenetic tree of the MultipleAlignment using Kimura
892         * distances and the Neighbor Joining algorithm from forester.
893         *
894         * @param msta
895         *            MultipleAlignment of protein structures
896         * @return Phylogeny phylogenetic tree
897         * @throws CompoundNotFoundException
898         * @throws IOException
899         */
900        public static Phylogeny getKimuraTree(MultipleAlignment msta)
901                        throws CompoundNotFoundException, IOException {
902                MultipleSequenceAlignment<ProteinSequence, AminoAcidCompound> msa = MultipleAlignmentTools
903                                .toProteinMSA(msta);
904                BasicSymmetricalDistanceMatrix distmat = (BasicSymmetricalDistanceMatrix) DistanceMatrixCalculator
905                                .kimuraDistance(msa);
906                Phylogeny tree = TreeConstructor.distanceTree(distmat,
907                                TreeConstructorType.NJ);
908                tree.setName("Kimura Tree");
909                return tree;
910        }
911
912        /**
913         * Calculate a phylogenetic tree of the MultipleAlignment using
914         * dissimilarity scores (DS), based in SDM Substitution Matrix (ideal for
915         * distantly related proteins, structure-derived) and the Neighbor Joining
916         * algorithm from forester.
917         *
918         * @param msta
919         *            MultipleAlignment of protein structures
920         * @return Phylogeny phylogenetic tree
921         * @throws CompoundNotFoundException
922         * @throws IOException
923         */
924        public static Phylogeny getHSDMTree(MultipleAlignment msta)
925                        throws CompoundNotFoundException, IOException {
926                MultipleSequenceAlignment<ProteinSequence, AminoAcidCompound> msa = MultipleAlignmentTools
927                                .toProteinMSA(msta);
928                BasicSymmetricalDistanceMatrix distmat = (BasicSymmetricalDistanceMatrix) DistanceMatrixCalculator
929                                .dissimilarityScore(msa, SubstitutionMatrixHelper.getAminoAcidSubstitutionMatrix("PRLA000102"));
930                Phylogeny tree = TreeConstructor.distanceTree(distmat,
931                                TreeConstructorType.NJ);
932                tree.setName("HSDM Tree");
933                return tree;
934        }
935
936        /**
937         * Calculate a phylogenetic tree of the MultipleAlignment using RMSD
938         * distances and the Neighbor Joining algorithm from forester.
939         *
940         * @param msta
941         *            MultipleAlignment of protein structures
942         * @return Phylogeny phylogenetic tree
943         */
944        public static Phylogeny getStructuralTree(MultipleAlignment msta) {
945                double[][] rmsdMat = MultipleAlignmentTools.getRMSDMatrix(msta)
946                                .getArray();
947                BasicSymmetricalDistanceMatrix rmsdDist = (BasicSymmetricalDistanceMatrix) DistanceMatrixCalculator
948                                .structuralDistance(rmsdMat, 1, 5, 0.4);
949                // Set the identifiers of the matrix
950                Map<String, Integer> alreadySeen = new HashMap<>();
951                for (int i = 0; i < msta.size(); i++) {
952                        // Make sure the identifiers are unique
953                        String id = msta.getStructureIdentifier(i).toString();
954                        if (alreadySeen.containsKey(id)) {
955                                alreadySeen.put(id, alreadySeen.get(id) + 1);
956                                id += "_" + alreadySeen.get(id);
957                        } else
958                                alreadySeen.put(id, 1);
959                        rmsdDist.setIdentifier(i, id);
960                }
961                Phylogeny tree = TreeConstructor.distanceTree(rmsdDist,
962                                TreeConstructorType.NJ);
963                tree.setName("Structural Tree");
964                return tree;
965        }
966
967        /**
968         * Convert an MSA into a matrix of equivalent residues.
969         *
970         * This concatenates all blocks, meaning that the indices might not be
971         * sequential.
972         *
973         * Indices should be consistent with `msa.getAtomArrays()`.
974         * @param msa Multiple alignment
975         * @param coreOnly Include only core (ungapped) columns. Otherwise gaps are
976         *        represented with null.
977         * @return
978         */
979        public static List<List<Integer>> getEquivalentResidues(MultipleAlignment msa, boolean coreOnly) {
980                List<List<Integer>> eqr = new ArrayList<>();
981                for (int str = 0; str < msa.size(); str++) {
982                        eqr.add(new ArrayList<>());
983                }
984
985                for(Block block: msa.getBlocks()) {
986                        List<List<Integer>> aln = block.getAlignRes();
987                        for (int col = 0; col < block.length(); col++) {
988                                // skip non-core columns
989                                if(coreOnly) {
990                                        boolean core = true;
991                                        for (int str = 0; str < block.size(); str++) {
992                                                if (aln.get(str).get(col) == null) {
993                                                        core = false;
994                                                        break;
995                                                }
996                                        }
997                                        if(!core) {
998                                                continue;
999                                        }
1000                                }
1001                                // add column to eqr
1002                                for (int str = 0; str < block.size(); str++) {
1003                                        eqr.get(str).add(aln.get(str).get(col));
1004                                }
1005                        }
1006                }
1007                return eqr;
1008        }
1009}