001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.structure.align.multiple.util;
022
023import java.io.IOException;
024import java.util.ArrayList;
025import java.util.Arrays;
026import java.util.Collections;
027import java.util.Comparator;
028import java.util.HashMap;
029import java.util.List;
030import java.util.Map;
031import java.util.SortedSet;
032import java.util.TreeSet;
033
034import javax.vecmath.Matrix4d;
035
036import org.biojava.nbio.core.alignment.matrices.SubstitutionMatrixHelper;
037import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
038import org.biojava.nbio.core.sequence.AccessionID;
039import org.biojava.nbio.core.sequence.MultipleSequenceAlignment;
040import org.biojava.nbio.core.sequence.ProteinSequence;
041import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
042import org.biojava.nbio.phylo.DistanceMatrixCalculator;
043import org.biojava.nbio.phylo.TreeConstructor;
044import org.biojava.nbio.phylo.TreeConstructorType;
045import org.biojava.nbio.structure.AminoAcid;
046import org.biojava.nbio.structure.Atom;
047import org.biojava.nbio.structure.Calc;
048import org.biojava.nbio.structure.Chain;
049import org.biojava.nbio.structure.Group;
050import org.biojava.nbio.structure.PDBHeader;
051import org.biojava.nbio.structure.Structure;
052import org.biojava.nbio.structure.StructureException;
053import org.biojava.nbio.structure.StructureIdentifier;
054import org.biojava.nbio.structure.StructureImpl;
055import org.biojava.nbio.structure.StructureTools;
056import org.biojava.nbio.structure.align.multiple.Block;
057import org.biojava.nbio.structure.align.multiple.BlockSet;
058import org.biojava.nbio.structure.align.multiple.MultipleAlignment;
059import org.biojava.nbio.structure.align.util.AlignmentTools;
060import org.biojava.nbio.structure.jama.Matrix;
061import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
062import org.forester.phylogeny.Phylogeny;
063
064/**
065 * Utility functions for working with {@link MultipleAlignment}.
066 * <p>
067 * Supported functions:
068 * <ul>
069 * <li>Obtain the alignment as sequence strings
070 * <li>Map from sequence alignment position to structure Atom
071 * <li>Map from sequence alignment position to Block number
072 * <li>Transform the aligned Atoms of a MultipleAlignment
073 * <li>Get all the core alignment positions of the alignment
074 * <li>Calculate the average residue distance of all aligned positions
075 * <li>Sort Blocks in a MultipleAlignment by a specified row
076 * <li>Convert a MultipleAlignment to a MultipleSequenceAlignment
077 * </ul>
078 *
079 * @author Spencer Bliven
080 * @author Aleix Lafita
081 * @since 4.1.0
082 *
083 */
084public class MultipleAlignmentTools {
085
086        /**
087         * Calculate the sequence alignment Strings for the whole alignment. This
088         * method creates a sequence alignment where aligned residues are in
089         * uppercase and unaligned residues are in lowercase, thus providing a more
090         * compact way to represent the alignment.
091         * <p>
092         * Blocks are concatenated in the order returned by
093         * {@link MultipleAlignment#getBlocks()}, so sequences may not be
094         * sequential. Gaps are represented by '-'. Separation between different
095         * Blocks is indicated by a gap in all positions, meaning that there is a
096         * possible discontinuity.
097         *
098         * @param alignment
099         *            input MultipleAlignment
100         * @param mapSeqToStruct
101         *            provides a link from the sequence alignment position to the
102         *            structure alignment position. Specially designed for the GUI.
103         *            Has to be initialized previously and will be overwritten.
104         * @return a string for each row in the alignment, giving the 1-letter code
105         *         for each aligned residue.
106         */
107        public static List<String> getSequenceAlignment(
108                        MultipleAlignment alignment, final List<Integer> mapSeqToStruct) {
109
110                // Initialize sequence variables
111                List<String> alnSequences = new ArrayList<String>();
112                for (int str = 0; str < alignment.size(); str++)
113                        alnSequences.add("");
114                mapSeqToStruct.clear();
115                List<Atom[]> atoms = alignment.getAtomArrays();
116                int globalPos = -1;
117
118                // Initialize helper variables in constucting the sequence alignment
119                List<SortedSet<Integer>> freePool = new ArrayList<SortedSet<Integer>>();
120                List<SortedSet<Integer>> blockStarts = new ArrayList<SortedSet<Integer>>();
121                List<List<Integer>> aligned = new ArrayList<List<Integer>>();
122
123                // Generate freePool residues from the ones not aligned
124                for (int i = 0; i < alignment.size(); i++) {
125                        List<Integer> residues = new ArrayList<Integer>();
126                        freePool.add(new TreeSet<Integer>());
127                        blockStarts.add(new TreeSet<Integer>());
128                        for (BlockSet bs : alignment.getBlockSets()) {
129                                for (Block b : bs.getBlocks()) {
130                                        boolean first = true;
131                                        for (int l = 0; l < b.length(); l++) {
132                                                Integer residue = b.getAlignRes().get(i).get(l);
133                                                if (residue != null) {
134                                                        if (first)
135                                                                blockStarts.get(i).add(residue);
136                                                        residues.add(residue);
137                                                        first = false;
138                                                }
139                                        }
140                                }
141                        }
142                        aligned.add(residues);
143                }
144                // Add any residue not aligned to the free pool for every structure
145                for (int i = 0; i < alignment.size(); i++) {
146                        for (int k = 0; k < atoms.get(i).length; k++) {
147                                if (!aligned.get(i).contains(k))
148                                        freePool.get(i).add(k);
149                        }
150                }
151
152                for (int b = 0; b < alignment.getBlocks().size(); b++) {
153                        if (b != 0) {
154                                // Add a gap to all structures to separate visually the Blocks
155                                for (int str = 0; str < alignment.size(); str++)
156                                        alnSequences.set(str, alnSequences.get(str).concat("-"));
157                                mapSeqToStruct.add(-1); // means unaligned position
158                        }
159                        // Store the previous position added to the sequence alignment
160                        int[] previousPos = new int[alignment.size()];
161                        Arrays.fill(previousPos, -1);
162                        // Store provisional characters
163                        char[] provisionalChar = new char[alignment.size()];
164                        Arrays.fill(provisionalChar, '-');
165
166                        for (int pos = 0; pos < alignment.getBlocks().get(b).length(); pos++) {
167                                globalPos++;
168                                boolean gaps = true; // true if consecutive with the previous
169                                while (gaps) {
170                                        gaps = false;
171                                        // Loop through all the structures
172                                        for (int str = 0; str < alignment.size(); str++) {
173                                                // If it is the first position or before it was null
174                                                if (previousPos[str] == -1) {
175                                                        Integer residue = alignment.getBlocks().get(b)
176                                                                        .getAlignRes().get(str).get(pos);
177                                                        if (residue == null)
178                                                                provisionalChar[str] = '-';
179                                                        else {
180                                                                Atom a = atoms.get(str)[residue];
181                                                                String group = a.getGroup().getPDBName();
182                                                                provisionalChar[str] = StructureTools
183                                                                                .get1LetterCode(group);
184                                                        }
185                                                } else {
186                                                        Integer residue = alignment.getBlocks().get(b)
187                                                                        .getAlignRes().get(str).get(pos);
188                                                        int nextPos = previousPos[str] + 1;
189                                                        if (residue == null) {
190                                                                if (freePool.get(str).contains(nextPos)) {
191                                                                        Atom a = atoms.get(str)[nextPos];
192                                                                        String g = a.getGroup().getPDBName();
193                                                                        char aa = StructureTools.get1LetterCode(g);
194                                                                        provisionalChar[str] = Character
195                                                                                        .toLowerCase(aa);
196                                                                } else
197                                                                        provisionalChar[str] = '-';
198                                                        } else if (nextPos == residue) {
199                                                                Atom a = atoms.get(str)[nextPos];
200                                                                String group = a.getGroup().getPDBName();
201                                                                provisionalChar[str] = StructureTools
202                                                                                .get1LetterCode(group);
203                                                        } else {
204                                                                // This means non-consecutive
205                                                                provisionalChar[str] = ' ';
206                                                                gaps = true;
207                                                        }
208                                                }
209                                        }// End all structure analysis
210
211                                        if (gaps) {
212                                                for (int str = 0; str < alignment.size(); str++) {
213                                                        if (provisionalChar[str] == ' ') {
214                                                                // It means this residue was non-consecutive
215                                                                Atom a = atoms.get(str)[previousPos[str] + 1];
216                                                                String group = a.getGroup().getPDBName();
217                                                                char aa = StructureTools.get1LetterCode(group);
218                                                                alnSequences
219                                                                                .set(str,
220                                                                                                alnSequences
221                                                                                                                .get(str)
222                                                                                                                .concat(String.valueOf(Character
223                                                                                                                                                .toLowerCase(aa))) );
224                                                                previousPos[str]++;
225                                                        } else {
226                                                                // Insert a gap otherwise
227                                                                alnSequences.set(str, alnSequences.get(str)
228                                                                                .concat("-"));
229                                                        }
230                                                }
231                                                mapSeqToStruct.add(-1); // unaligned position
232                                        } else {
233                                                // Add provisional and update the indices
234                                                for (int str = 0; str < alignment.size(); str++) {
235                                                        alnSequences.set(
236                                                                        str,
237                                                                        alnSequences.get(str).concat(
238                                                                                        String.valueOf(provisionalChar[str])));
239
240                                                        if (provisionalChar[str] != '-') {
241                                                                if (alignment.getBlocks().get(b).getAlignRes()
242                                                                                .get(str).get(pos) == null) {
243                                                                        previousPos[str]++;
244                                                                } else {
245                                                                        previousPos[str] = alignment.getBlocks()
246                                                                                        .get(b).getAlignRes().get(str)
247                                                                                        .get(pos);
248                                                                }
249                                                        }
250                                                }
251                                                mapSeqToStruct.add(globalPos); // alignment index
252                                        }
253                                }
254                        } // All positions in the Block considered so far
255
256                        // Calculate the index of the next Block for every structure
257                        int[] blockEnds = new int[alignment.size()];
258                        for (int str = 0; str < alignment.size(); str++) {
259                                for (int res : blockStarts.get(str)) {
260                                        if (previousPos[str] > res)
261                                                blockEnds[str] = res;
262                                        else {
263                                                blockEnds[str] = res;
264                                                break;
265                                        }
266                                }
267                        }
268
269                        // Add the unaligned residues in between Blocks (lowercase)
270                        boolean allGaps = false; // true means no more residues to add
271                        while (!allGaps) {
272                                allGaps = true;
273                                for (int str = 0; str < alignment.size(); str++) {
274                                        if (previousPos[str] + 1 < blockEnds[str]) {
275                                                Atom a = atoms.get(str)[previousPos[str] + 1];
276                                                String group = a.getGroup().getPDBName();
277                                                char letter = StructureTools.get1LetterCode(group);
278
279                                                provisionalChar[str] = Character.toLowerCase(letter);
280                                                previousPos[str]++;
281                                                allGaps = false;
282                                        } else
283                                                provisionalChar[str] = '-';
284                                }
285                                if (!allGaps) {
286                                        for (int str = 0; str < alignment.size(); str++) {
287                                                alnSequences.set(
288                                                                str,
289                                                                alnSequences.get(str).concat(
290                                                                                String.valueOf(provisionalChar[str])) );
291                                        }
292                                        mapSeqToStruct.add(-1); // unaligned position
293                                }
294                        }
295                }
296                return alnSequences;
297        }
298
299        /**
300         * Calculate the sequence alignment Strings for the whole alignment. This
301         * method creates a sequence alignment where aligned residues are in
302         * uppercase and unaligned residues are in lowercase, thus providing a more
303         * compact way to represent the alignment.
304         * <p>
305         * Blocks are concatenated in the order returned by
306         * {@link MultipleAlignment#getBlocks()}, so sequences may not be
307         * sequential. Gaps are represented by '-'. Separation between different
308         * Blocks is indicated by a gap in all positions, meaning that there is a
309         * possible discontinuity.
310         *
311         * @param alignment
312         *            input MultipleAlignment
313         * @return String for each row in the alignment, giving the 1-letter code
314         *         for each aligned residue.
315         */
316        public static List<String> getSequenceAlignment(MultipleAlignment msa) {
317                return getSequenceAlignment(msa, new ArrayList<Integer>());
318        }
319
320        /**
321         * Calculate the sequence alignment Strings for the alignment Blocks in an
322         * alignment. This method creates a sequence alignment where all residues
323         * are in uppercase and a residue alone with gaps in all the other
324         * structures represents unaligned residues. Because of this latter
325         * constraint only the residues within the Blocks are represented, for a
326         * more compact alignment. For a sequence alignment of the full protein use
327         * {@link #getSequenceAlignment(MultipleAlignment)}.
328         * <p>
329         * Blocks are concatenated in the order returned by
330         * {@link MultipleAlignment#getBlocks()}, so sequences may not be
331         * sequential. Gaps between blocks are omitted, while gaps within blocks are
332         * represented by '-'. Separation between different Blocks is indicated by a
333         * gap in all positions, meaning that there is something unaligned
334         * inbetween.
335         *
336         * @param alignment
337         *            input MultipleAlignment
338         * @param mapSeqToStruct
339         *            provides a link from the sequence alignment position to the
340         *            structure alignment position. Specially designed for the GUI.
341         *            Has to be initialized previously and will be overwritten.
342         * @return a string for each row in the alignment, giving the 1-letter code
343         *         for each aligned residue.
344         */
345        public static List<String> getBlockSequenceAlignment(
346                        MultipleAlignment alignment, List<Integer> mapSeqToStruct) {
347
348                // Initialize sequence variables
349                List<String> alnSequences = new ArrayList<String>();
350                for (int str = 0; str < alignment.size(); str++)
351                        alnSequences.add("");
352                mapSeqToStruct.clear();
353                List<Atom[]> atoms = alignment.getAtomArrays();
354                int globalPos = -1;
355
356                // Loop through all the alignment Blocks in the order given
357                for (int b = 0; b < alignment.getBlocks().size(); b++) {
358                        if (b != 0) {
359                                // Add a gap to all structures to separate Blocks
360                                for (int str = 0; str < alignment.size(); str++)
361                                        alnSequences.set(str, alnSequences.get(str).concat("-"));
362                                mapSeqToStruct.add(-1); // means unaligned position
363                        }
364
365                        // Store the previous position added to the sequence alignment
366                        int[] previousPos = new int[alignment.size()];
367                        Arrays.fill(previousPos, -1);
368                        // Store provisional characters
369                        char[] provisionalChar = new char[alignment.size()];
370                        Arrays.fill(provisionalChar, '-');
371
372                        for (int pos = 0; pos < alignment.getBlocks().get(b).length(); pos++) {
373                                globalPos++;
374                                boolean gaps = true;
375                                while (gaps) {
376                                        gaps = false;
377                                        // Loop through all the structures
378                                        for (int str = 0; str < alignment.size(); str++) {
379                                                // If it is the first position or before it was null
380                                                if (previousPos[str] == -1) {
381                                                        Integer residue = alignment.getBlocks().get(b)
382                                                                        .getAlignRes().get(str).get(pos);
383                                                        if (residue == null)
384                                                                provisionalChar[str] = '-';
385                                                        else {
386                                                                Atom a = atoms.get(str)[residue];
387                                                                String g = a.getGroup().getPDBName();
388                                                                char aa = StructureTools.get1LetterCode(g);
389                                                                provisionalChar[str] = aa;
390                                                        }
391                                                } else {
392                                                        Integer residue = alignment.getBlocks().get(b)
393                                                                        .getAlignRes().get(str).get(pos);
394                                                        if (residue == null)
395                                                                provisionalChar[str] = '-';
396                                                        else if (previousPos[str] + 1 == residue) {
397                                                                Atom a = atoms.get(str)[residue];
398                                                                String g = a.getGroup().getPDBName();
399                                                                char aa = StructureTools.get1LetterCode(g);
400                                                                provisionalChar[str] = aa;
401                                                        } else {
402                                                                provisionalChar[str] = ' ';
403                                                                gaps = true;
404                                                        }
405                                                }
406                                        }// End all structures analysis
407
408                                        if (gaps) {
409                                                for (int str = 0; str < alignment.size(); str++) {
410                                                        if (provisionalChar[str] == ' ') {
411                                                                // It means this residue was non-consecutive
412                                                                for (int s2 = 0; s2 < alignment.size(); s2++) {
413                                                                        if (str == s2) {
414                                                                                int next = previousPos[str] + 1;
415                                                                                Atom a = atoms.get(s2)[next];
416                                                                                String g = a.getGroup().getPDBName();
417                                                                                char aa = StructureTools
418                                                                                                .get1LetterCode(g);
419                                                                                alnSequences.set(
420                                                                                                s2,
421                                                                                                alnSequences.get(s2).concat(
422                                                                                                                String.valueOf(aa)) );
423                                                                        } else {
424                                                                                alnSequences.set(s2,
425                                                                                                alnSequences.get(s2)
426                                                                                                                .concat("-"));
427                                                                        }
428                                                                }
429                                                                mapSeqToStruct.add(-1); // unaligned
430                                                                previousPos[str] += 1;
431                                                        }
432                                                }
433                                        } else { // Append the provisional and update the indices
434                                                for (int str = 0; str < alignment.size(); str++) {
435                                                        alnSequences.set(
436                                                                        str,
437                                                                        alnSequences.get(str).concat(
438                                                                                        String.valueOf(provisionalChar[str])) );
439                                                        if (provisionalChar[str] != '-') {
440                                                                previousPos[str] = alignment.getBlocks().get(b)
441                                                                                .getAlignRes().get(str).get(pos);
442                                                        }
443                                                }
444                                                mapSeqToStruct.add(globalPos);
445                                        }
446                                }
447                        }
448                }
449                return alnSequences;
450        }
451
452        /**
453         * Calculate the sequence alignment Strings for the alignment Blocks in an
454         * alignment. This method creates a sequence alignment where all residues
455         * are in uppercase and a residue alone with gaps in all the other
456         * structures represents unaligned residues. Because of this latter
457         * constraint only the residues within the Blocks are represented, for a
458         * more compact alignment. For a sequence alignment of the full protein use
459         * {@link #getSequenceAlignment(MultipleAlignment)}.
460         * <p>
461         * Blocks are concatenated in the order returned by
462         * {@link MultipleAlignment#getBlocks()}, so sequences may not be
463         * sequential. Gaps between blocks are omitted, while gaps within blocks are
464         * represented by '-'. Separation between different Blocks is indicated by a
465         * gap in all positions, meaning that there is something unaligned
466         * inbetween.
467         *
468         * @param alignment
469         *            input MultipleAlignment
470         * @return String for each row in the alignment, giving the 1-letter code
471         *         for each aligned residue.
472         */
473        public static List<String> getBlockSequenceAlignment(MultipleAlignment ma) {
474                return getBlockSequenceAlignment(ma, new ArrayList<Integer>());
475        }
476
477        /**
478         * Returns the Atom of the specified structure that is aligned in the
479         * sequence alignment position specified.
480         *
481         * @param multAln
482         *            the MultipleAlignment object from where the sequence alignment
483         *            has been generated
484         * @param mapSeqToStruct
485         *            the mapping between sequence and structure generated with the
486         *            sequence alignment
487         * @param str
488         *            the structure index of the alignment (row)
489         * @param sequencePos
490         *            the sequence alignment position (column)
491         * @return Atom the atom in that position or null if there is a gap
492         */
493        public static Atom getAtomForSequencePosition(MultipleAlignment msa,
494                        List<Integer> mapSeqToStruct, int str, int sequencePos) {
495
496                int seqPos = mapSeqToStruct.get(sequencePos);
497
498                // Check if the position selected is an aligned position
499                if (seqPos == -1)
500                        return null;
501                else {
502                        Atom a = null;
503                        // Calculate the corresponding structure position
504                        int sum = 0;
505                        for (Block b : msa.getBlocks()) {
506                                if (sum + b.length() <= seqPos) {
507                                        sum += b.length();
508                                        continue;
509                                } else {
510                                        for (Integer p : b.getAlignRes().get(str)) {
511                                                if (sum == seqPos) {
512                                                        if (p != null) {
513                                                                a = msa.getAtomArrays().get(str)[p];
514                                                        }
515                                                        break;
516                                                }
517                                                sum++;
518                                        }
519                                        break;
520                                }
521                        }
522                        return a;
523                }
524        }
525
526        /**
527         * Returns the block number of a specified position in the sequence
528         * alignment, given the mapping from structure to function.
529         *
530         * @param multAln
531         *            the MultipleAlignment object from where the sequence alignment
532         *            has been generated.
533         * @param mapSeqToStruct
534         *            the mapping between sequence and structure generated with the
535         *            sequence alignment
536         * @param sequencePos
537         *            the position in the sequence alignment (column)
538         * @return int the block index, or -1 if the position is not aligned
539         */
540        public static int getBlockForSequencePosition(MultipleAlignment multAln,
541                        List<Integer> mapSeqToStruct, int sequencePos) {
542
543                int seqPos = mapSeqToStruct.get(sequencePos);
544                // Check if the position selected is an aligned position
545                if (seqPos == -1)
546                        return -1;
547                else {
548                        // Calculate the corresponding block (by iterating all Blocks)
549                        int sum = 0;
550                        int block = 0;
551                        for (Block b : multAln.getBlocks()) {
552                                if (sum + b.length() <= seqPos) {
553                                        sum += b.length();
554                                        block++;
555                                        continue;
556                                } else
557                                        break;
558                        }
559                        return block;
560                }
561        }
562
563        /**
564         * The average residue distance Matrix contains the average distance from
565         * each residue to all other residues aligned with it.
566         * <p>
567         * Complexity: T(n,l) = O(l*n^2), if n=number of structures and l=alignment
568         * length.
569         *
570         * @param alignment
571         *            MultipleAlignment
572         * @return Matrix containing all average residue distances
573         */
574        public static Matrix getAverageResidueDistances(MultipleAlignment msa) {
575                List<Atom[]> transformed = transformAtoms(msa);
576                return getAverageResidueDistances(transformed);
577        }
578
579        /**
580         * The average residue distance Matrix contains the average distance from
581         * each residue to all other residues aligned with it.
582         * <p>
583         * Complexity: T(n,l) = O(l*n^2), if n=number of structures and l=alignment
584         * length.
585         *
586         * @param transformed
587         *            List of Atom arrays containing only the aligned atoms of each
588         *            structure, or null if there is a gap.
589         * @return Matrix containing all average residue distances. Entry -1 means
590         *         there is a gap in the position.
591         */
592        public static Matrix getAverageResidueDistances(List<Atom[]> transformed) {
593
594                int size = transformed.size();
595                int length = transformed.get(0).length;
596                Matrix resDist = new Matrix(size, length, -1);
597
598                // Calculate the average residue distances
599                for (int r1 = 0; r1 < size; r1++) {
600                        for (int c = 0; c < transformed.get(r1).length; c++) {
601                                Atom refAtom = transformed.get(r1)[c];
602                                if (refAtom == null)
603                                        continue;
604
605                                for (int r2 = r1 + 1; r2 < size; r2++) {
606                                        Atom atom = transformed.get(r2)[c];
607                                        if (atom != null) {
608                                                double distance = Calc.getDistance(refAtom, atom);
609
610                                                if (resDist.get(r1, c) == -1) {
611                                                        resDist.set(r1, c, 1 + distance);
612                                                } else {
613                                                        resDist.set(r1, c, resDist.get(r1, c) + distance);
614                                                }
615
616                                                if (resDist.get(r2, c) == -1) {
617                                                        resDist.set(r2, c, 1 + distance);
618                                                } else {
619                                                        resDist.set(r2, c, resDist.get(r2, c) + distance);
620                                                }
621                                        }
622                                }
623                        }
624                }
625
626                for (int c = 0; c < length; c++) {
627                        int nonNullRes = 0;
628                        for (int r = 0; r < size; r++) {
629                                if (resDist.get(r, c) != -1)
630                                        nonNullRes++;
631                        }
632                        for (int r = 0; r < size; r++) {
633                                if (resDist.get(r, c) != -1) {
634                                        resDist.set(r, c, resDist.get(r, c) / nonNullRes);
635                                }
636                        }
637                }
638                return resDist;
639        }
640
641        /**
642         * Transforms atoms according to the superposition stored in the alignment.
643         * <p>
644         * For each structure in the alignment, returns an atom for each
645         * representative atom in the aligned columns, omitting unaligned residues
646         * (i.e. an array of length <tt>alignment.length()</tt> ).
647         * <p>
648         * All blocks are concatenated together, so Atoms may not appear in the same
649         * order as in their parent structure. If the alignment blocks contain null
650         * residues (gaps), then the returned array will also contain null Atoms in
651         * the same positions.
652         *
653         * @param alignment
654         *            MultipleAlignment
655         * @return List of Atom arrays of only the aligned atoms of every structure
656         *         (null Atom if a gap position)
657         */
658        public static List<Atom[]> transformAtoms(MultipleAlignment alignment) {
659                if (alignment.getEnsemble() == null) {
660                        throw new NullPointerException("No ensemble set for this alignment");
661                }
662
663                List<Atom[]> atomArrays = alignment.getAtomArrays();
664                List<Atom[]> transformed = new ArrayList<Atom[]>(atomArrays.size());
665
666                // Loop through structures
667                for (int i = 0; i < atomArrays.size(); i++) {
668
669                        Matrix4d transform = null;
670                        Atom[] curr = atomArrays.get(i); // all CA atoms from structure
671
672                        // Concatenated list of all blocks for this structure
673                        Atom[] transformedAtoms = new Atom[alignment.length()];
674                        int transformedAtomsLength = 0;
675
676                        // Each blockset gets transformed independently
677                        for (BlockSet bs : alignment.getBlockSets()) {
678
679                                Atom[] blocksetAtoms = new Atom[bs.length()];
680                                int blockPos = 0;
681
682                                for (Block blk : bs.getBlocks()) {
683                                        if (blk.size() != atomArrays.size()) {
684                                                throw new IllegalStateException(String.format(
685                                                                "Mismatched block size. Expected %d "
686                                                                                + "structures, found %d.",
687                                                                atomArrays.size(), blk.size()));
688                                        }
689                                        // Extract aligned atoms
690                                        for (int j = 0; j < blk.length(); j++) {
691                                                Integer alignedPos = blk.getAlignRes().get(i).get(j);
692                                                if (alignedPos != null) {
693                                                        blocksetAtoms[blockPos] = (Atom) curr[alignedPos]
694                                                                        .clone();
695                                                }
696                                                blockPos++;
697                                        }
698                                }
699
700                                // Transform according to the blockset or alignment matrix
701                                Matrix4d blockTrans = null;
702                                if (bs.getTransformations() != null)
703                                        blockTrans = bs.getTransformations().get(i);
704                                if (blockTrans == null) {
705                                        blockTrans = transform;
706                                }
707
708                                for (Atom a : blocksetAtoms) {
709                                        if (a != null)
710                                                Calc.transform(a, blockTrans);
711                                        transformedAtoms[transformedAtomsLength] = a;
712                                        transformedAtomsLength++;
713                                }
714                        }
715                        assert (transformedAtomsLength == alignment.length());
716
717                        transformed.add(transformedAtoms);
718                }
719                return transformed;
720        }
721
722        /**
723         * Calculate a List of alignment indicies that correspond to the core of a
724         * Block, which means that all structures have a residue in that positon.
725         *
726         * @param block
727         *            alignment Block
728         * @return List of positions in the core of the alignment
729         */
730        public static List<Integer> getCorePositions(Block block) {
731
732                List<Integer> corePositions = new ArrayList<Integer>();
733
734                for (int col = 0; col < block.length(); col++) {
735                        boolean core = true;
736                        for (int str = 0; str < block.size(); str++) {
737                                if (block.getAlignRes().get(str).get(col) == null) {
738                                        core = false;
739                                        break;
740                                }
741                        }
742                        if (core)
743                                corePositions.add(col);
744                }
745                return corePositions;
746        }
747
748        /**
749         * Sort blocks so that the specified row is in sequential order. The sort
750         * happens in place.
751         *
752         * @param blocks
753         *            List of blocks
754         * @param sortedIndex
755         *            Index of the row to be sorted
756         */
757        public static void sortBlocks(List<Block> blocks, final int sortedIndex) {
758                Collections.sort(blocks, new Comparator<Block>() {
759                        @Override
760                        public int compare(Block o1, Block o2) {
761                                // Compare the first non-null residue of each block
762                                List<Integer> alignres1 = o1.getAlignRes().get(sortedIndex);
763                                List<Integer> alignres2 = o2.getAlignRes().get(sortedIndex);
764                                Integer res1 = null;
765                                Integer res2 = null;
766                                for (Integer r : alignres1) {
767                                        if (r != null) {
768                                                res1 = r;
769                                                break;
770                                        }
771                                }
772                                for (Integer r : alignres2) {
773                                        if (r != null) {
774                                                res2 = r;
775                                                break;
776                                        }
777                                }
778                                return res1.compareTo(res2);
779                        }
780                });
781        }
782
783        /**
784         * Convert a MultipleAlignment into a MultipleSequenceAlignment of AminoAcid
785         * residues. This method is only valid for protein structure alignments.
786         *
787         * @param msta
788         *            Multiple Structure Alignment
789         * @return MultipleSequenceAlignment of protein sequences
790         * @throws CompoundNotFoundException
791         */
792        public static MultipleSequenceAlignment<ProteinSequence, AminoAcidCompound> toProteinMSA(
793                        MultipleAlignment msta) throws CompoundNotFoundException {
794
795                // Check that the alignment is of protein structures
796                Group g = msta.getAtomArrays().get(0)[0].getGroup();
797                if (!(g instanceof AminoAcid)) {
798                        throw new IllegalArgumentException(
799                                        "Cannot convert to multiple sequence alignment: "
800                                                        + "the structures aligned are not proteins");
801                }
802
803                MultipleSequenceAlignment<ProteinSequence, AminoAcidCompound> msa = new MultipleSequenceAlignment<ProteinSequence, AminoAcidCompound>();
804
805                Map<String, Integer> uniqueID = new HashMap<String, Integer>();
806                List<String> seqs = getSequenceAlignment(msta);
807                for (int i = 0; i < msta.size(); i++) {
808                        // Make sure the identifiers are unique (required by AccessionID)
809                        String id = msta.getStructureIdentifier(i).toString();
810                        if (uniqueID.containsKey(id)) {
811                                uniqueID.put(id, uniqueID.get(id) + 1);
812                                id += "_" + uniqueID.get(id);
813                        } else
814                                uniqueID.put(id, 1);
815
816                        AccessionID acc = new AccessionID(id);
817                        ProteinSequence pseq = new ProteinSequence(seqs.get(i));
818                        pseq.setAccession(acc);
819                        msa.addAlignedSequence(pseq);
820                }
821                return msa;
822        }
823
824        public static Structure toMultimodelStructure(MultipleAlignment multAln, List<Atom[]> transformedAtoms) throws StructureException {
825                PDBHeader header = new PDBHeader();
826                String title = multAln.getEnsemble().getAlgorithmName() + " V."
827                                + multAln.getEnsemble().getVersion() + " : ";
828
829                for (StructureIdentifier name : multAln.getEnsemble()
830                                .getStructureIdentifiers()) {
831                        title += name.getIdentifier() + " ";
832                }
833                Structure artificial = getAlignedStructure(transformedAtoms);
834
835                artificial.setPDBHeader(header);
836                header.setTitle(title);
837                return artificial;
838        }
839
840        /**
841         * Get an artificial Structure containing a different model for every
842         * input structure, so that the alignment result can be viewed in Jmol.
843         * The Atoms have to be rotated beforehand.
844         *
845         * @param atomArrays an array of Atoms for every aligned structure
846         * @return a structure object containing a set of models,
847         *                      one for each input array of Atoms.
848         * @throws StructureException
849         */
850        public static final Structure getAlignedStructure(List<Atom[]> atomArrays)
851                        throws StructureException {
852
853                Structure s = new StructureImpl();
854                for (int i=0; i<atomArrays.size(); i++){
855                        List<Chain> model = AlignmentTools.getAlignedModel(atomArrays.get(i));
856                        s.addModel(model);
857                }
858                return s;
859        }
860
861        /**
862         * Calculate the RMSD matrix of a MultipleAlignment, that is, entry (i,j) of
863         * the matrix contains the RMSD between structures i and j.
864         *
865         * @param msa
866         *            Multiple Structure Alignment
867         * @return Matrix of RMSD with size the number of structures squared
868         */
869        public static Matrix getRMSDMatrix(MultipleAlignment msa) {
870
871                Matrix rmsdMat = new Matrix(msa.size(), msa.size());
872                List<Atom[]> superposed = transformAtoms(msa);
873
874                for (int i = 0; i < msa.size(); i++) {
875                        for (int j = i; j < msa.size(); j++) {
876                                if (i == j)
877                                        rmsdMat.set(i, j, 0.0);
878                                List<Atom[]> compared = new ArrayList<Atom[]>();
879                                compared.add(superposed.get(i));
880                                compared.add(superposed.get(j));
881                                double rmsd = MultipleAlignmentScorer.getRMSD(compared);
882                                rmsdMat.set(i, j, rmsd);
883                                rmsdMat.set(j, i, rmsd);
884                        }
885                }
886                return rmsdMat;
887        }
888
889        /**
890         * Calculate a phylogenetic tree of the MultipleAlignment using Kimura
891         * distances and the Neighbor Joining algorithm from forester.
892         *
893         * @param msta
894         *            MultipleAlignment of protein structures
895         * @return Phylogeny phylogenetic tree
896         * @throws CompoundNotFoundException
897         * @throws IOException
898         */
899        public static Phylogeny getKimuraTree(MultipleAlignment msta)
900                        throws CompoundNotFoundException, IOException {
901                MultipleSequenceAlignment<ProteinSequence, AminoAcidCompound> msa = MultipleAlignmentTools
902                                .toProteinMSA(msta);
903                BasicSymmetricalDistanceMatrix distmat = (BasicSymmetricalDistanceMatrix) DistanceMatrixCalculator
904                                .kimuraDistance(msa);
905                Phylogeny tree = TreeConstructor.distanceTree(distmat,
906                                TreeConstructorType.NJ);
907                tree.setName("Kimura Tree");
908                return tree;
909        }
910
911        /**
912         * Calculate a phylogenetic tree of the MultipleAlignment using
913         * dissimilarity scores (DS), based in SDM Substitution Matrix (ideal for
914         * distantly related proteins, structure-derived) and the Neighbor Joining
915         * algorithm from forester.
916         *
917         * @param msta
918         *            MultipleAlignment of protein structures
919         * @return Phylogeny phylogenetic tree
920         * @throws CompoundNotFoundException
921         * @throws IOException
922         */
923        public static Phylogeny getHSDMTree(MultipleAlignment msta)
924                        throws CompoundNotFoundException, IOException {
925                MultipleSequenceAlignment<ProteinSequence, AminoAcidCompound> msa = MultipleAlignmentTools
926                                .toProteinMSA(msta);
927                BasicSymmetricalDistanceMatrix distmat = (BasicSymmetricalDistanceMatrix) DistanceMatrixCalculator
928                                .dissimilarityScore(msa, SubstitutionMatrixHelper.getAminoAcidSubstitutionMatrix("PRLA000102"));
929                Phylogeny tree = TreeConstructor.distanceTree(distmat,
930                                TreeConstructorType.NJ);
931                tree.setName("HSDM Tree");
932                return tree;
933        }
934
935        /**
936         * Calculate a phylogenetic tree of the MultipleAlignment using RMSD
937         * distances and the Neighbor Joining algorithm from forester.
938         *
939         * @param msta
940         *            MultipleAlignment of protein structures
941         * @return Phylogeny phylogenetic tree
942         * @throws CompoundNotFoundException
943         */
944        public static Phylogeny getStructuralTree(MultipleAlignment msta) {
945                double[][] rmsdMat = MultipleAlignmentTools.getRMSDMatrix(msta)
946                                .getArray();
947                BasicSymmetricalDistanceMatrix rmsdDist = (BasicSymmetricalDistanceMatrix) DistanceMatrixCalculator
948                                .structuralDistance(rmsdMat, 1, 5, 0.4);
949                // Set the identifiers of the matrix
950                Map<String, Integer> alreadySeen = new HashMap<String, Integer>();
951                for (int i = 0; i < msta.size(); i++) {
952                        // Make sure the identifiers are unique
953                        String id = msta.getStructureIdentifier(i).toString();
954                        if (alreadySeen.containsKey(id)) {
955                                alreadySeen.put(id, alreadySeen.get(id) + 1);
956                                id += "_" + alreadySeen.get(id);
957                        } else
958                                alreadySeen.put(id, 1);
959                        rmsdDist.setIdentifier(i, id);
960                }
961                Phylogeny tree = TreeConstructor.distanceTree(rmsdDist,
962                                TreeConstructorType.NJ);
963                tree.setName("Structural Tree");
964                return tree;
965        }
966
967}