001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.structure.align.multiple.util; 022 023import java.io.IOException; 024import java.util.ArrayList; 025import java.util.Arrays; 026import java.util.Collections; 027import java.util.Comparator; 028import java.util.HashMap; 029import java.util.List; 030import java.util.Map; 031import java.util.SortedSet; 032import java.util.TreeSet; 033 034import javax.vecmath.Matrix4d; 035 036import org.biojava.nbio.core.alignment.matrices.SubstitutionMatrixHelper; 037import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 038import org.biojava.nbio.core.sequence.AccessionID; 039import org.biojava.nbio.core.sequence.MultipleSequenceAlignment; 040import org.biojava.nbio.core.sequence.ProteinSequence; 041import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 042import org.biojava.nbio.phylo.DistanceMatrixCalculator; 043import org.biojava.nbio.phylo.TreeConstructor; 044import org.biojava.nbio.phylo.TreeConstructorType; 045import org.biojava.nbio.structure.AminoAcid; 046import org.biojava.nbio.structure.Atom; 047import org.biojava.nbio.structure.Calc; 048import org.biojava.nbio.structure.Group; 049import org.biojava.nbio.structure.StructureTools; 050import org.biojava.nbio.structure.align.multiple.Block; 051import org.biojava.nbio.structure.align.multiple.BlockSet; 052import org.biojava.nbio.structure.align.multiple.MultipleAlignment; 053import org.biojava.nbio.structure.jama.Matrix; 054import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix; 055import org.forester.phylogeny.Phylogeny; 056 057/** 058 * Utility functions for working with {@link MultipleAlignment}. 059 * <p> 060 * Supported functions: 061 * <ul> 062 * <li>Obtain the alignment as sequence strings 063 * <li>Map from sequence alignment position to structure Atom 064 * <li>Map from sequence alignment position to Block number 065 * <li>Transform the aligned Atoms of a MultipleAlignment 066 * <li>Get all the core alignment positions of the alignment 067 * <li>Calculate the average residue distance of all aligned positions 068 * <li>Sort Blocks in a MultipleAlignment by a specified row 069 * <li>Convert a MultipleAlignment to a MultipleSequenceAlignment 070 * </ul> 071 * 072 * @author Spencer Bliven 073 * @author Aleix Lafita 074 * @since 4.1.0 075 * 076 */ 077public class MultipleAlignmentTools { 078 079 /** 080 * Calculate the sequence alignment Strings for the whole alignment. This 081 * method creates a sequence alignment where aligned residues are in 082 * uppercase and unaligned residues are in lowercase, thus providing a more 083 * compact way to represent the alignment. 084 * <p> 085 * Blocks are concatenated in the order returned by 086 * {@link MultipleAlignment#getBlocks()}, so sequences may not be 087 * sequential. Gaps are represented by '-'. Separation between different 088 * Blocks is indicated by a gap in all positions, meaning that there is a 089 * possible discontinuity. 090 * 091 * @param alignment 092 * input MultipleAlignment 093 * @param mapSeqToStruct 094 * provides a link from the sequence alignment position to the 095 * structure alignment position. Specially designed for the GUI. 096 * Has to be initialized previously and will be overwritten. 097 * @return a string for each row in the alignment, giving the 1-letter code 098 * for each aligned residue. 099 */ 100 public static List<String> getSequenceAlignment( 101 MultipleAlignment alignment, final List<Integer> mapSeqToStruct) { 102 103 // Initialize sequence variables 104 List<String> alnSequences = new ArrayList<String>(); 105 for (int str = 0; str < alignment.size(); str++) 106 alnSequences.add(""); 107 mapSeqToStruct.clear(); 108 List<Atom[]> atoms = alignment.getAtomArrays(); 109 int globalPos = -1; 110 111 // Initialize helper variables in constucting the sequence alignment 112 List<SortedSet<Integer>> freePool = new ArrayList<SortedSet<Integer>>(); 113 List<SortedSet<Integer>> blockStarts = new ArrayList<SortedSet<Integer>>(); 114 List<List<Integer>> aligned = new ArrayList<List<Integer>>(); 115 116 // Generate freePool residues from the ones not aligned 117 for (int i = 0; i < alignment.size(); i++) { 118 List<Integer> residues = new ArrayList<Integer>(); 119 freePool.add(new TreeSet<Integer>()); 120 blockStarts.add(new TreeSet<Integer>()); 121 for (BlockSet bs : alignment.getBlockSets()) { 122 for (Block b : bs.getBlocks()) { 123 boolean first = true; 124 for (int l = 0; l < b.length(); l++) { 125 Integer residue = b.getAlignRes().get(i).get(l); 126 if (residue != null) { 127 if (first) 128 blockStarts.get(i).add(residue); 129 residues.add(residue); 130 first = false; 131 } 132 } 133 } 134 } 135 aligned.add(residues); 136 } 137 // Add any residue not aligned to the free pool for every structure 138 for (int i = 0; i < alignment.size(); i++) { 139 for (int k = 0; k < atoms.get(i).length; k++) { 140 if (!aligned.get(i).contains(k)) 141 freePool.get(i).add(k); 142 } 143 } 144 145 for (int b = 0; b < alignment.getBlocks().size(); b++) { 146 if (b != 0) { 147 // Add a gap to all structures to separate visually the Blocks 148 for (int str = 0; str < alignment.size(); str++) 149 alnSequences.set(str, alnSequences.get(str).concat("-")); 150 mapSeqToStruct.add(-1); // means unaligned position 151 } 152 // Store the previous position added to the sequence alignment 153 int[] previousPos = new int[alignment.size()]; 154 Arrays.fill(previousPos, -1); 155 // Store provisional characters 156 char[] provisionalChar = new char[alignment.size()]; 157 Arrays.fill(provisionalChar, '-'); 158 159 for (int pos = 0; pos < alignment.getBlocks().get(b).length(); pos++) { 160 globalPos++; 161 boolean gaps = true; // true if consecutive with the previous 162 while (gaps) { 163 gaps = false; 164 // Loop through all the structures 165 for (int str = 0; str < alignment.size(); str++) { 166 // If it is the first position or before it was null 167 if (previousPos[str] == -1) { 168 Integer residue = alignment.getBlocks().get(b) 169 .getAlignRes().get(str).get(pos); 170 if (residue == null) 171 provisionalChar[str] = '-'; 172 else { 173 Atom a = atoms.get(str)[residue]; 174 String group = a.getGroup().getPDBName(); 175 provisionalChar[str] = StructureTools 176 .get1LetterCode(group); 177 } 178 } else { 179 Integer residue = alignment.getBlocks().get(b) 180 .getAlignRes().get(str).get(pos); 181 int nextPos = previousPos[str] + 1; 182 if (residue == null) { 183 if (freePool.get(str).contains(nextPos)) { 184 Atom a = atoms.get(str)[nextPos]; 185 String g = a.getGroup().getPDBName(); 186 char aa = StructureTools.get1LetterCode(g); 187 provisionalChar[str] = Character 188 .toLowerCase(aa); 189 } else 190 provisionalChar[str] = '-'; 191 } else if (nextPos == residue) { 192 Atom a = atoms.get(str)[nextPos]; 193 String group = a.getGroup().getPDBName(); 194 provisionalChar[str] = StructureTools 195 .get1LetterCode(group); 196 } else { 197 // This means non-consecutive 198 provisionalChar[str] = ' '; 199 gaps = true; 200 } 201 } 202 }// End all structure analysis 203 204 if (gaps) { 205 for (int str = 0; str < alignment.size(); str++) { 206 if (provisionalChar[str] == ' ') { 207 // It means this residue was non-consecutive 208 Atom a = atoms.get(str)[previousPos[str] + 1]; 209 String group = a.getGroup().getPDBName(); 210 char aa = StructureTools.get1LetterCode(group); 211 alnSequences 212 .set(str, 213 alnSequences 214 .get(str) 215 .concat("" 216 + Character 217 .toLowerCase(aa))); 218 previousPos[str]++; 219 } else { 220 // Insert a gap otherwise 221 alnSequences.set(str, alnSequences.get(str) 222 .concat("-")); 223 } 224 } 225 mapSeqToStruct.add(-1); // unaligned position 226 } else { 227 // Add provisional and update the indices 228 for (int str = 0; str < alignment.size(); str++) { 229 alnSequences.set( 230 str, 231 alnSequences.get(str).concat( 232 "" + provisionalChar[str])); 233 234 if (provisionalChar[str] != '-') { 235 if (alignment.getBlocks().get(b).getAlignRes() 236 .get(str).get(pos) == null) { 237 previousPos[str]++; 238 } else { 239 previousPos[str] = alignment.getBlocks() 240 .get(b).getAlignRes().get(str) 241 .get(pos); 242 } 243 } 244 } 245 mapSeqToStruct.add(globalPos); // alignment index 246 } 247 } 248 } // All positions in the Block considered so far 249 250 // Calculate the index of the next Block for every structure 251 int[] blockEnds = new int[alignment.size()]; 252 for (int str = 0; str < alignment.size(); str++) { 253 for (int res : blockStarts.get(str)) { 254 if (previousPos[str] > res) 255 blockEnds[str] = res; 256 else { 257 blockEnds[str] = res; 258 break; 259 } 260 } 261 } 262 263 // Add the unaligned residues in between Blocks (lowercase) 264 boolean allGaps = false; // true means no more residues to add 265 while (!allGaps) { 266 allGaps = true; 267 for (int str = 0; str < alignment.size(); str++) { 268 if (previousPos[str] + 1 < blockEnds[str]) { 269 Atom a = atoms.get(str)[previousPos[str] + 1]; 270 String group = a.getGroup().getPDBName(); 271 char letter = StructureTools.get1LetterCode(group); 272 273 provisionalChar[str] = Character.toLowerCase(letter); 274 previousPos[str]++; 275 allGaps = false; 276 } else 277 provisionalChar[str] = '-'; 278 } 279 if (!allGaps) { 280 for (int str = 0; str < alignment.size(); str++) { 281 alnSequences.set( 282 str, 283 alnSequences.get(str).concat( 284 "" + provisionalChar[str])); 285 } 286 mapSeqToStruct.add(-1); // unaligned position 287 } 288 } 289 } 290 return alnSequences; 291 } 292 293 /** 294 * Calculate the sequence alignment Strings for the whole alignment. This 295 * method creates a sequence alignment where aligned residues are in 296 * uppercase and unaligned residues are in lowercase, thus providing a more 297 * compact way to represent the alignment. 298 * <p> 299 * Blocks are concatenated in the order returned by 300 * {@link MultipleAlignment#getBlocks()}, so sequences may not be 301 * sequential. Gaps are represented by '-'. Separation between different 302 * Blocks is indicated by a gap in all positions, meaning that there is a 303 * possible discontinuity. 304 * 305 * @param alignment 306 * input MultipleAlignment 307 * @return String for each row in the alignment, giving the 1-letter code 308 * for each aligned residue. 309 */ 310 public static List<String> getSequenceAlignment(MultipleAlignment msa) { 311 return getSequenceAlignment(msa, new ArrayList<Integer>()); 312 } 313 314 /** 315 * Calculate the sequence alignment Strings for the alignment Blocks in an 316 * alignment. This method creates a sequence alignment where all residues 317 * are in uppercase and a residue alone with gaps in all the other 318 * structures represents unaligned residues. Because of this latter 319 * constraint only the residues within the Blocks are represented, for a 320 * more compact alignment. For a sequence alignment of the full protein use 321 * {@link #getSequenceAlignment(MultipleAlignment)}. 322 * <p> 323 * Blocks are concatenated in the order returned by 324 * {@link MultipleAlignment#getBlocks()}, so sequences may not be 325 * sequential. Gaps between blocks are omitted, while gaps within blocks are 326 * represented by '-'. Separation between different Blocks is indicated by a 327 * gap in all positions, meaning that there is something unaligned 328 * inbetween. 329 * 330 * @param alignment 331 * input MultipleAlignment 332 * @param mapSeqToStruct 333 * provides a link from the sequence alignment position to the 334 * structure alignment position. Specially designed for the GUI. 335 * Has to be initialized previously and will be overwritten. 336 * @return a string for each row in the alignment, giving the 1-letter code 337 * for each aligned residue. 338 */ 339 public static List<String> getBlockSequenceAlignment( 340 MultipleAlignment alignment, List<Integer> mapSeqToStruct) { 341 342 // Initialize sequence variables 343 List<String> alnSequences = new ArrayList<String>(); 344 for (int str = 0; str < alignment.size(); str++) 345 alnSequences.add(""); 346 mapSeqToStruct.clear(); 347 List<Atom[]> atoms = alignment.getAtomArrays(); 348 int globalPos = -1; 349 350 // Loop through all the alignment Blocks in the order given 351 for (int b = 0; b < alignment.getBlocks().size(); b++) { 352 if (b != 0) { 353 // Add a gap to all structures to separate Blocks 354 for (int str = 0; str < alignment.size(); str++) 355 alnSequences.set(str, alnSequences.get(str).concat("-")); 356 mapSeqToStruct.add(-1); // means unaligned position 357 } 358 359 // Store the previous position added to the sequence alignment 360 int[] previousPos = new int[alignment.size()]; 361 Arrays.fill(previousPos, -1); 362 // Store provisional characters 363 char[] provisionalChar = new char[alignment.size()]; 364 Arrays.fill(provisionalChar, '-'); 365 366 for (int pos = 0; pos < alignment.getBlocks().get(b).length(); pos++) { 367 globalPos++; 368 boolean gaps = true; 369 while (gaps) { 370 gaps = false; 371 // Loop through all the structures 372 for (int str = 0; str < alignment.size(); str++) { 373 // If it is the first position or before it was null 374 if (previousPos[str] == -1) { 375 Integer residue = alignment.getBlocks().get(b) 376 .getAlignRes().get(str).get(pos); 377 if (residue == null) 378 provisionalChar[str] = '-'; 379 else { 380 Atom a = atoms.get(str)[residue]; 381 String g = a.getGroup().getPDBName(); 382 char aa = StructureTools.get1LetterCode(g); 383 provisionalChar[str] = aa; 384 } 385 } else { 386 Integer residue = alignment.getBlocks().get(b) 387 .getAlignRes().get(str).get(pos); 388 if (residue == null) 389 provisionalChar[str] = '-'; 390 else if (previousPos[str] + 1 == residue) { 391 Atom a = atoms.get(str)[residue]; 392 String g = a.getGroup().getPDBName(); 393 char aa = StructureTools.get1LetterCode(g); 394 provisionalChar[str] = aa; 395 } else { 396 provisionalChar[str] = ' '; 397 gaps = true; 398 } 399 } 400 }// End all structures analysis 401 402 if (gaps) { 403 for (int str = 0; str < alignment.size(); str++) { 404 if (provisionalChar[str] == ' ') { 405 // It means this residue was non-consecutive 406 for (int s2 = 0; s2 < alignment.size(); s2++) { 407 if (str == s2) { 408 int next = previousPos[str] + 1; 409 Atom a = atoms.get(s2)[next]; 410 String g = a.getGroup().getPDBName(); 411 char aa = StructureTools 412 .get1LetterCode(g); 413 alnSequences.set( 414 s2, 415 alnSequences.get(s2).concat( 416 "" + aa)); 417 } else { 418 alnSequences.set(s2, 419 alnSequences.get(s2) 420 .concat("-")); 421 } 422 } 423 mapSeqToStruct.add(-1); // unaligned 424 previousPos[str] += 1; 425 } 426 } 427 } else { // Append the provisional and update the indices 428 for (int str = 0; str < alignment.size(); str++) { 429 alnSequences.set( 430 str, 431 alnSequences.get(str).concat( 432 "" + provisionalChar[str])); 433 if (provisionalChar[str] != '-') { 434 previousPos[str] = alignment.getBlocks().get(b) 435 .getAlignRes().get(str).get(pos); 436 } 437 } 438 mapSeqToStruct.add(globalPos); 439 } 440 } 441 } 442 } 443 return alnSequences; 444 } 445 446 /** 447 * Calculate the sequence alignment Strings for the alignment Blocks in an 448 * alignment. This method creates a sequence alignment where all residues 449 * are in uppercase and a residue alone with gaps in all the other 450 * structures represents unaligned residues. Because of this latter 451 * constraint only the residues within the Blocks are represented, for a 452 * more compact alignment. For a sequence alignment of the full protein use 453 * {@link #getSequenceAlignment(MultipleAlignment)}. 454 * <p> 455 * Blocks are concatenated in the order returned by 456 * {@link MultipleAlignment#getBlocks()}, so sequences may not be 457 * sequential. Gaps between blocks are omitted, while gaps within blocks are 458 * represented by '-'. Separation between different Blocks is indicated by a 459 * gap in all positions, meaning that there is something unaligned 460 * inbetween. 461 * 462 * @param alignment 463 * input MultipleAlignment 464 * @return String for each row in the alignment, giving the 1-letter code 465 * for each aligned residue. 466 */ 467 public static List<String> getBlockSequenceAlignment(MultipleAlignment ma) { 468 return getBlockSequenceAlignment(ma, new ArrayList<Integer>()); 469 } 470 471 /** 472 * Returns the Atom of the specified structure that is aligned in the 473 * sequence alignment position specified. 474 * 475 * @param multAln 476 * the MultipleAlignment object from where the sequence alignment 477 * has been generated 478 * @param mapSeqToStruct 479 * the mapping between sequence and structure generated with the 480 * sequence alignment 481 * @param str 482 * the structure index of the alignment (row) 483 * @param sequencePos 484 * the sequence alignment position (column) 485 * @return Atom the atom in that position or null if there is a gap 486 */ 487 public static Atom getAtomForSequencePosition(MultipleAlignment msa, 488 List<Integer> mapSeqToStruct, int str, int sequencePos) { 489 490 int seqPos = mapSeqToStruct.get(sequencePos); 491 492 // Check if the position selected is an aligned position 493 if (seqPos == -1) 494 return null; 495 else { 496 Atom a = null; 497 // Calculate the corresponding structure position 498 int sum = 0; 499 for (Block b : msa.getBlocks()) { 500 if (sum + b.length() <= seqPos) { 501 sum += b.length(); 502 continue; 503 } else { 504 for (Integer p : b.getAlignRes().get(str)) { 505 if (sum == seqPos) { 506 if (p != null) { 507 a = msa.getAtomArrays().get(str)[p]; 508 } 509 break; 510 } 511 sum++; 512 } 513 break; 514 } 515 } 516 return a; 517 } 518 } 519 520 /** 521 * Returns the block number of a specified position in the sequence 522 * alignment, given the mapping from structure to function. 523 * 524 * @param multAln 525 * the MultipleAlignment object from where the sequence alignment 526 * has been generated. 527 * @param mapSeqToStruct 528 * the mapping between sequence and structure generated with the 529 * sequence alignment 530 * @param sequencePos 531 * the position in the sequence alignment (column) 532 * @return int the block index, or -1 if the position is not aligned 533 */ 534 public static int getBlockForSequencePosition(MultipleAlignment multAln, 535 List<Integer> mapSeqToStruct, int sequencePos) { 536 537 int seqPos = mapSeqToStruct.get(sequencePos); 538 // Check if the position selected is an aligned position 539 if (seqPos == -1) 540 return -1; 541 else { 542 // Calculate the corresponding block (by iterating all Blocks) 543 int sum = 0; 544 int block = 0; 545 for (Block b : multAln.getBlocks()) { 546 if (sum + b.length() <= seqPos) { 547 sum += b.length(); 548 block++; 549 continue; 550 } else 551 break; 552 } 553 return block; 554 } 555 } 556 557 /** 558 * The average residue distance Matrix contains the average distance from 559 * each residue to all other residues aligned with it. 560 * <p> 561 * Complexity: T(n,l) = O(l*n^2), if n=number of structures and l=alignment 562 * length. 563 * 564 * @param alignment 565 * MultipleAlignment 566 * @return Matrix containing all average residue distances 567 */ 568 public static Matrix getAverageResidueDistances(MultipleAlignment msa) { 569 List<Atom[]> transformed = transformAtoms(msa); 570 return getAverageResidueDistances(transformed); 571 } 572 573 /** 574 * The average residue distance Matrix contains the average distance from 575 * each residue to all other residues aligned with it. 576 * <p> 577 * Complexity: T(n,l) = O(l*n^2), if n=number of structures and l=alignment 578 * length. 579 * 580 * @param transformed 581 * List of Atom arrays containing only the aligned atoms of each 582 * structure, or null if there is a gap. 583 * @return Matrix containing all average residue distances. Entry -1 means 584 * there is a gap in the position. 585 */ 586 public static Matrix getAverageResidueDistances(List<Atom[]> transformed) { 587 588 int size = transformed.size(); 589 int length = transformed.get(0).length; 590 Matrix resDist = new Matrix(size, length, -1); 591 592 // Calculate the average residue distances 593 for (int r1 = 0; r1 < size; r1++) { 594 for (int c = 0; c < transformed.get(r1).length; c++) { 595 Atom refAtom = transformed.get(r1)[c]; 596 if (refAtom == null) 597 continue; 598 599 for (int r2 = r1 + 1; r2 < size; r2++) { 600 Atom atom = transformed.get(r2)[c]; 601 if (atom != null) { 602 double distance = Calc.getDistance(refAtom, atom); 603 604 if (resDist.get(r1, c) == -1) { 605 resDist.set(r1, c, 1 + distance); 606 } else { 607 resDist.set(r1, c, resDist.get(r1, c) + distance); 608 } 609 610 if (resDist.get(r2, c) == -1) { 611 resDist.set(r2, c, 1 + distance); 612 } else { 613 resDist.set(r2, c, resDist.get(r2, c) + distance); 614 } 615 } 616 } 617 } 618 } 619 620 for (int c = 0; c < length; c++) { 621 int nonNullRes = 0; 622 for (int r = 0; r < size; r++) { 623 if (resDist.get(r, c) != -1) 624 nonNullRes++; 625 } 626 for (int r = 0; r < size; r++) { 627 if (resDist.get(r, c) != -1) { 628 resDist.set(r, c, resDist.get(r, c) / nonNullRes); 629 } 630 } 631 } 632 return resDist; 633 } 634 635 /** 636 * Transforms atoms according to the superposition stored in the alignment. 637 * <p> 638 * For each structure in the alignment, returns an atom for each 639 * representative atom in the aligned columns, omitting unaligned residues 640 * (i.e. an array of length <tt>alignment.length()</tt> ). 641 * <p> 642 * All blocks are concatenated together, so Atoms may not appear in the same 643 * order as in their parent structure. If the alignment blocks contain null 644 * residues (gaps), then the returned array will also contain null Atoms in 645 * the same positions. 646 * 647 * @param alignment 648 * MultipleAlignment 649 * @return List of Atom arrays of only the aligned atoms of every structure 650 * (null Atom if a gap position) 651 */ 652 public static List<Atom[]> transformAtoms(MultipleAlignment alignment) { 653 if (alignment.getEnsemble() == null) { 654 throw new NullPointerException("No ensemble set for this alignment"); 655 } 656 657 List<Atom[]> atomArrays = alignment.getAtomArrays(); 658 List<Atom[]> transformed = new ArrayList<Atom[]>(atomArrays.size()); 659 660 // Loop through structures 661 for (int i = 0; i < atomArrays.size(); i++) { 662 663 Matrix4d transform = null; 664 Atom[] curr = atomArrays.get(i); // all CA atoms from structure 665 666 // Concatenated list of all blocks for this structure 667 Atom[] transformedAtoms = new Atom[alignment.length()]; 668 int transformedAtomsLength = 0; 669 670 // Each blockset gets transformed independently 671 for (BlockSet bs : alignment.getBlockSets()) { 672 673 Atom[] blocksetAtoms = new Atom[bs.length()]; 674 int blockPos = 0; 675 676 for (Block blk : bs.getBlocks()) { 677 if (blk.size() != atomArrays.size()) { 678 throw new IllegalStateException(String.format( 679 "Mismatched block size. Expected %d " 680 + "structures, found %d.", 681 atomArrays.size(), blk.size())); 682 } 683 // Extract aligned atoms 684 for (int j = 0; j < blk.length(); j++) { 685 Integer alignedPos = blk.getAlignRes().get(i).get(j); 686 if (alignedPos != null) { 687 blocksetAtoms[blockPos] = (Atom) curr[alignedPos] 688 .clone(); 689 } 690 blockPos++; 691 } 692 } 693 694 // Transform according to the blockset or alignment matrix 695 Matrix4d blockTrans = null; 696 if (bs.getTransformations() != null) 697 blockTrans = bs.getTransformations().get(i); 698 if (blockTrans == null) { 699 blockTrans = transform; 700 } 701 702 for (Atom a : blocksetAtoms) { 703 if (a != null) 704 Calc.transform(a, blockTrans); 705 transformedAtoms[transformedAtomsLength] = a; 706 transformedAtomsLength++; 707 } 708 } 709 assert (transformedAtomsLength == alignment.length()); 710 711 transformed.add(transformedAtoms); 712 } 713 return transformed; 714 } 715 716 /** 717 * Calculate a List of alignment indicies that correspond to the core of a 718 * Block, which means that all structures have a residue in that positon. 719 * 720 * @param block 721 * alignment Block 722 * @return List of positions in the core of the alignment 723 */ 724 public static List<Integer> getCorePositions(Block block) { 725 726 List<Integer> corePositions = new ArrayList<Integer>(); 727 728 for (int col = 0; col < block.length(); col++) { 729 boolean core = true; 730 for (int str = 0; str < block.size(); str++) { 731 if (block.getAlignRes().get(str).get(col) == null) { 732 core = false; 733 break; 734 } 735 } 736 if (core) 737 corePositions.add(col); 738 } 739 return corePositions; 740 } 741 742 /** 743 * Sort blocks so that the specified row is in sequential order. The sort 744 * happens in place. 745 * 746 * @param blocks 747 * List of blocks 748 * @param sortedIndex 749 * Index of the row to be sorted 750 */ 751 public static void sortBlocks(List<Block> blocks, final int sortedIndex) { 752 Collections.sort(blocks, new Comparator<Block>() { 753 @Override 754 public int compare(Block o1, Block o2) { 755 // Compare the first non-null residue of each block 756 List<Integer> alignres1 = o1.getAlignRes().get(sortedIndex); 757 List<Integer> alignres2 = o2.getAlignRes().get(sortedIndex); 758 Integer res1 = null; 759 Integer res2 = null; 760 for (Integer r : alignres1) { 761 if (r != null) { 762 res1 = r; 763 break; 764 } 765 } 766 for (Integer r : alignres2) { 767 if (r != null) { 768 res2 = r; 769 break; 770 } 771 } 772 return res1.compareTo(res2); 773 } 774 }); 775 } 776 777 /** 778 * Convert a MultipleAlignment into a MultipleSequenceAlignment of AminoAcid 779 * residues. This method is only valid for protein structure alignments. 780 * 781 * @param msta 782 * Multiple Structure Alignment 783 * @return MultipleSequenceAlignment of protein sequences 784 * @throws CompoundNotFoundException 785 */ 786 public static MultipleSequenceAlignment<ProteinSequence, AminoAcidCompound> toProteinMSA( 787 MultipleAlignment msta) throws CompoundNotFoundException { 788 789 // Check that the alignment is of protein structures 790 Group g = msta.getAtomArrays().get(0)[0].getGroup(); 791 if (!(g instanceof AminoAcid)) { 792 throw new IllegalArgumentException( 793 "Cannot convert to multiple sequence alignment: " 794 + "the structures aligned are not proteins"); 795 } 796 797 MultipleSequenceAlignment<ProteinSequence, AminoAcidCompound> msa = new MultipleSequenceAlignment<ProteinSequence, AminoAcidCompound>(); 798 799 Map<String, Integer> uniqueID = new HashMap<String, Integer>(); 800 List<String> seqs = getSequenceAlignment(msta); 801 for (int i = 0; i < msta.size(); i++) { 802 // Make sure the identifiers are unique (required by AccessionID) 803 String id = msta.getStructureIdentifier(i).toString(); 804 if (uniqueID.containsKey(id)) { 805 uniqueID.put(id, uniqueID.get(id) + 1); 806 id += "_" + uniqueID.get(id); 807 } else 808 uniqueID.put(id, 1); 809 810 AccessionID acc = new AccessionID(id); 811 ProteinSequence pseq = new ProteinSequence(seqs.get(i)); 812 pseq.setAccession(acc); 813 msa.addAlignedSequence(pseq); 814 } 815 return msa; 816 } 817 818 /** 819 * Calculate the RMSD matrix of a MultipleAlignment, that is, entry (i,j) of 820 * the matrix contains the RMSD between structures i and j. 821 * 822 * @param msa 823 * Multiple Structure Alignment 824 * @return Matrix of RMSD with size the number of structures squared 825 */ 826 public static Matrix getRMSDMatrix(MultipleAlignment msa) { 827 828 Matrix rmsdMat = new Matrix(msa.size(), msa.size()); 829 List<Atom[]> superposed = transformAtoms(msa); 830 831 for (int i = 0; i < msa.size(); i++) { 832 for (int j = i; j < msa.size(); j++) { 833 if (i == j) 834 rmsdMat.set(i, j, 0.0); 835 List<Atom[]> compared = new ArrayList<Atom[]>(); 836 compared.add(superposed.get(i)); 837 compared.add(superposed.get(j)); 838 double rmsd = MultipleAlignmentScorer.getRMSD(compared); 839 rmsdMat.set(i, j, rmsd); 840 rmsdMat.set(j, i, rmsd); 841 } 842 } 843 return rmsdMat; 844 } 845 846 /** 847 * Calculate a phylogenetic tree of the MultipleAlignment using Kimura 848 * distances and the Neighbor Joining algorithm from forester. 849 * 850 * @param msta 851 * MultipleAlignment of protein structures 852 * @return Phylogeny phylogenetic tree 853 * @throws CompoundNotFoundException 854 * @throws IOException 855 */ 856 public static Phylogeny getKimuraTree(MultipleAlignment msta) 857 throws CompoundNotFoundException, IOException { 858 MultipleSequenceAlignment<ProteinSequence, AminoAcidCompound> msa = MultipleAlignmentTools 859 .toProteinMSA(msta); 860 BasicSymmetricalDistanceMatrix distmat = (BasicSymmetricalDistanceMatrix) DistanceMatrixCalculator 861 .kimuraDistance(msa); 862 Phylogeny tree = TreeConstructor.distanceTree(distmat, 863 TreeConstructorType.NJ); 864 tree.setName("Kimura Tree"); 865 return tree; 866 } 867 868 /** 869 * Calculate a phylogenetic tree of the MultipleAlignment using 870 * dissimilarity scores (DS), based in SDM Substitution Matrix (ideal for 871 * distantly related proteins, structure-derived) and the Neighbor Joining 872 * algorithm from forester. 873 * 874 * @param msta 875 * MultipleAlignment of protein structures 876 * @return Phylogeny phylogenetic tree 877 * @throws CompoundNotFoundException 878 * @throws IOException 879 */ 880 public static Phylogeny getHSDMTree(MultipleAlignment msta) 881 throws CompoundNotFoundException, IOException { 882 MultipleSequenceAlignment<ProteinSequence, AminoAcidCompound> msa = MultipleAlignmentTools 883 .toProteinMSA(msta); 884 BasicSymmetricalDistanceMatrix distmat = (BasicSymmetricalDistanceMatrix) DistanceMatrixCalculator 885 .dissimilarityScore(msa, SubstitutionMatrixHelper.getAminoAcidSubstitutionMatrix("PRLA000102")); 886 Phylogeny tree = TreeConstructor.distanceTree(distmat, 887 TreeConstructorType.NJ); 888 tree.setName("HSDM Tree"); 889 return tree; 890 } 891 892 /** 893 * Calculate a phylogenetic tree of the MultipleAlignment using RMSD 894 * distances and the Neighbor Joining algorithm from forester. 895 * 896 * @param msta 897 * MultipleAlignment of protein structures 898 * @return Phylogeny phylogenetic tree 899 * @throws CompoundNotFoundException 900 */ 901 public static Phylogeny getStructuralTree(MultipleAlignment msta) { 902 double[][] rmsdMat = MultipleAlignmentTools.getRMSDMatrix(msta) 903 .getArray(); 904 BasicSymmetricalDistanceMatrix rmsdDist = (BasicSymmetricalDistanceMatrix) DistanceMatrixCalculator 905 .structuralDistance(rmsdMat, 1, 5, 0.4); 906 // Set the identifiers of the matrix 907 Map<String, Integer> alreadySeen = new HashMap<String, Integer>(); 908 for (int i = 0; i < msta.size(); i++) { 909 // Make sure the identifiers are unique 910 String id = msta.getStructureIdentifier(i).toString(); 911 if (alreadySeen.containsKey(id)) { 912 alreadySeen.put(id, alreadySeen.get(id) + 1); 913 id += "_" + alreadySeen.get(id); 914 } else 915 alreadySeen.put(id, 1); 916 rmsdDist.setIdentifier(i, id); 917 } 918 Phylogeny tree = TreeConstructor.distanceTree(rmsdDist, 919 TreeConstructorType.NJ); 920 tree.setName("Structural Tree"); 921 return tree; 922 } 923 924}