001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.structure.align.multiple.util; 022 023import java.io.IOException; 024import java.util.ArrayList; 025import java.util.Arrays; 026import java.util.Collections; 027import java.util.Comparator; 028import java.util.HashMap; 029import java.util.List; 030import java.util.Map; 031import java.util.SortedSet; 032import java.util.TreeSet; 033 034import javax.vecmath.Matrix4d; 035 036import org.biojava.nbio.core.alignment.matrices.SubstitutionMatrixHelper; 037import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 038import org.biojava.nbio.core.sequence.AccessionID; 039import org.biojava.nbio.core.sequence.MultipleSequenceAlignment; 040import org.biojava.nbio.core.sequence.ProteinSequence; 041import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 042import org.biojava.nbio.phylo.DistanceMatrixCalculator; 043import org.biojava.nbio.phylo.TreeConstructor; 044import org.biojava.nbio.phylo.TreeConstructorType; 045import org.biojava.nbio.structure.AminoAcid; 046import org.biojava.nbio.structure.Atom; 047import org.biojava.nbio.structure.Calc; 048import org.biojava.nbio.structure.Chain; 049import org.biojava.nbio.structure.Group; 050import org.biojava.nbio.structure.PDBHeader; 051import org.biojava.nbio.structure.Structure; 052import org.biojava.nbio.structure.StructureException; 053import org.biojava.nbio.structure.StructureIdentifier; 054import org.biojava.nbio.structure.StructureImpl; 055import org.biojava.nbio.structure.StructureTools; 056import org.biojava.nbio.structure.align.multiple.Block; 057import org.biojava.nbio.structure.align.multiple.BlockSet; 058import org.biojava.nbio.structure.align.multiple.MultipleAlignment; 059import org.biojava.nbio.structure.align.util.AlignmentTools; 060import org.biojava.nbio.structure.cluster.SubunitCluster; 061import org.biojava.nbio.structure.jama.Matrix; 062import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix; 063import org.forester.phylogeny.Phylogeny; 064 065/** 066 * Utility functions for working with {@link MultipleAlignment}. 067 * <p> 068 * Supported functions: 069 * <ul> 070 * <li>Obtain the alignment as sequence strings 071 * <li>Map from sequence alignment position to structure Atom 072 * <li>Map from sequence alignment position to Block number 073 * <li>Transform the aligned Atoms of a MultipleAlignment 074 * <li>Get all the core alignment positions of the alignment 075 * <li>Calculate the average residue distance of all aligned positions 076 * <li>Sort Blocks in a MultipleAlignment by a specified row 077 * <li>Convert a MultipleAlignment to a MultipleSequenceAlignment 078 * </ul> 079 * 080 * @author Spencer Bliven 081 * @author Aleix Lafita 082 * @since 4.1.0 083 * 084 */ 085public class MultipleAlignmentTools { 086 087 /** 088 * Calculate the sequence alignment Strings for the whole alignment. This 089 * method creates a sequence alignment where aligned residues are in 090 * uppercase and unaligned residues are in lowercase, thus providing a more 091 * compact way to represent the alignment. 092 * <p> 093 * Blocks are concatenated in the order returned by 094 * {@link MultipleAlignment#getBlocks()}, so sequences may not be 095 * sequential. Gaps are represented by '-'. Separation between different 096 * Blocks is indicated by a gap in all positions, meaning that there is a 097 * possible discontinuity. 098 * 099 * @param alignment 100 * input MultipleAlignment 101 * @param mapSeqToStruct 102 * provides a link from the sequence alignment position to the 103 * structure alignment position. Specially designed for the GUI. 104 * Has to be initialized previously and will be overwritten. 105 * @return a string for each row in the alignment, giving the 1-letter code 106 * for each aligned residue. 107 */ 108 public static List<String> getSequenceAlignment( 109 MultipleAlignment alignment, final List<Integer> mapSeqToStruct) { 110 111 // Initialize sequence variables 112 List<String> alnSequences = new ArrayList<>(); 113 for (int str = 0; str < alignment.size(); str++) 114 alnSequences.add(""); 115 mapSeqToStruct.clear(); 116 List<Atom[]> atoms = alignment.getAtomArrays(); 117 int globalPos = -1; 118 119 // Initialize helper variables in constucting the sequence alignment 120 List<SortedSet<Integer>> freePool = new ArrayList<>(); 121 List<SortedSet<Integer>> blockStarts = new ArrayList<>(); 122 List<List<Integer>> aligned = new ArrayList<>(); 123 124 // Generate freePool residues from the ones not aligned 125 for (int i = 0; i < alignment.size(); i++) { 126 List<Integer> residues = new ArrayList<>(); 127 freePool.add(new TreeSet<Integer>()); 128 blockStarts.add(new TreeSet<Integer>()); 129 for (BlockSet bs : alignment.getBlockSets()) { 130 for (Block b : bs.getBlocks()) { 131 boolean first = true; 132 for (int l = 0; l < b.length(); l++) { 133 Integer residue = b.getAlignRes().get(i).get(l); 134 if (residue != null) { 135 if (first) 136 blockStarts.get(i).add(residue); 137 residues.add(residue); 138 first = false; 139 } 140 } 141 } 142 } 143 aligned.add(residues); 144 } 145 // Add any residue not aligned to the free pool for every structure 146 for (int i = 0; i < alignment.size(); i++) { 147 for (int k = 0; k < atoms.get(i).length; k++) { 148 if (!aligned.get(i).contains(k)) 149 freePool.get(i).add(k); 150 } 151 } 152 153 for (int b = 0; b < alignment.getBlocks().size(); b++) { 154 if (b != 0) { 155 // Add a gap to all structures to separate visually the Blocks 156 for (int str = 0; str < alignment.size(); str++) 157 alnSequences.set(str, alnSequences.get(str).concat("-")); 158 mapSeqToStruct.add(-1); // means unaligned position 159 } 160 // Store the previous position added to the sequence alignment 161 int[] previousPos = new int[alignment.size()]; 162 Arrays.fill(previousPos, -1); 163 // Store provisional characters 164 char[] provisionalChar = new char[alignment.size()]; 165 Arrays.fill(provisionalChar, '-'); 166 167 for (int pos = 0; pos < alignment.getBlocks().get(b).length(); pos++) { 168 globalPos++; 169 boolean gaps = true; // true if consecutive with the previous 170 while (gaps) { 171 gaps = false; 172 // Loop through all the structures 173 for (int str = 0; str < alignment.size(); str++) { 174 // If it is the first position or before it was null 175 if (previousPos[str] == -1) { 176 Integer residue = alignment.getBlocks().get(b) 177 .getAlignRes().get(str).get(pos); 178 if (residue == null) 179 provisionalChar[str] = '-'; 180 else { 181 Atom a = atoms.get(str)[residue]; 182 String group = a.getGroup().getPDBName(); 183 provisionalChar[str] = StructureTools 184 .get1LetterCode(group); 185 } 186 } else { 187 Integer residue = alignment.getBlocks().get(b) 188 .getAlignRes().get(str).get(pos); 189 int nextPos = previousPos[str] + 1; 190 if (residue == null) { 191 if (freePool.get(str).contains(nextPos)) { 192 Atom a = atoms.get(str)[nextPos]; 193 String g = a.getGroup().getPDBName(); 194 char aa = StructureTools.get1LetterCode(g); 195 provisionalChar[str] = Character 196 .toLowerCase(aa); 197 } else 198 provisionalChar[str] = '-'; 199 } else if (nextPos == residue) { 200 Atom a = atoms.get(str)[nextPos]; 201 String group = a.getGroup().getPDBName(); 202 provisionalChar[str] = StructureTools 203 .get1LetterCode(group); 204 } else { 205 // This means non-consecutive 206 provisionalChar[str] = ' '; 207 gaps = true; 208 } 209 } 210 }// End all structure analysis 211 212 if (gaps) { 213 for (int str = 0; str < alignment.size(); str++) { 214 if (provisionalChar[str] == ' ') { 215 // It means this residue was non-consecutive 216 Atom a = atoms.get(str)[previousPos[str] + 1]; 217 String group = a.getGroup().getPDBName(); 218 char aa = StructureTools.get1LetterCode(group); 219 alnSequences 220 .set(str, 221 alnSequences 222 .get(str) 223 .concat(String.valueOf(Character 224 .toLowerCase(aa))) ); 225 previousPos[str]++; 226 } else { 227 // Insert a gap otherwise 228 alnSequences.set(str, alnSequences.get(str) 229 .concat("-")); 230 } 231 } 232 mapSeqToStruct.add(-1); // unaligned position 233 } else { 234 // Add provisional and update the indices 235 for (int str = 0; str < alignment.size(); str++) { 236 alnSequences.set( 237 str, 238 alnSequences.get(str).concat( 239 String.valueOf(provisionalChar[str]))); 240 241 if (provisionalChar[str] != '-') { 242 if (alignment.getBlocks().get(b).getAlignRes() 243 .get(str).get(pos) == null) { 244 previousPos[str]++; 245 } else { 246 previousPos[str] = alignment.getBlocks() 247 .get(b).getAlignRes().get(str) 248 .get(pos); 249 } 250 } 251 } 252 mapSeqToStruct.add(globalPos); // alignment index 253 } 254 } 255 } // All positions in the Block considered so far 256 257 // Calculate the index of the next Block for every structure 258 int[] blockEnds = new int[alignment.size()]; 259 for (int str = 0; str < alignment.size(); str++) { 260 for (int res : blockStarts.get(str)) { 261 if (previousPos[str] > res) 262 blockEnds[str] = res; 263 else { 264 blockEnds[str] = res; 265 break; 266 } 267 } 268 } 269 270 // Add the unaligned residues in between Blocks (lowercase) 271 boolean allGaps = false; // true means no more residues to add 272 while (!allGaps) { 273 allGaps = true; 274 for (int str = 0; str < alignment.size(); str++) { 275 if (previousPos[str] + 1 < blockEnds[str]) { 276 Atom a = atoms.get(str)[previousPos[str] + 1]; 277 String group = a.getGroup().getPDBName(); 278 char letter = StructureTools.get1LetterCode(group); 279 280 provisionalChar[str] = Character.toLowerCase(letter); 281 previousPos[str]++; 282 allGaps = false; 283 } else 284 provisionalChar[str] = '-'; 285 } 286 if (!allGaps) { 287 for (int str = 0; str < alignment.size(); str++) { 288 alnSequences.set( 289 str, 290 alnSequences.get(str).concat( 291 String.valueOf(provisionalChar[str])) ); 292 } 293 mapSeqToStruct.add(-1); // unaligned position 294 } 295 } 296 } 297 return alnSequences; 298 } 299 300 /** 301 * Calculate the sequence alignment Strings for the whole alignment. This 302 * method creates a sequence alignment where aligned residues are in 303 * uppercase and unaligned residues are in lowercase, thus providing a more 304 * compact way to represent the alignment. 305 * <p> 306 * Blocks are concatenated in the order returned by 307 * {@link MultipleAlignment#getBlocks()}, so sequences may not be 308 * sequential. Gaps are represented by '-'. Separation between different 309 * Blocks is indicated by a gap in all positions, meaning that there is a 310 * possible discontinuity. 311 * 312 * @param msa 313 * input MultipleAlignment 314 * @return String for each row in the alignment, giving the 1-letter code 315 * for each aligned residue. 316 */ 317 public static List<String> getSequenceAlignment(MultipleAlignment msa) { 318 return getSequenceAlignment(msa, new ArrayList<Integer>()); 319 } 320 321 /** 322 * Calculate the sequence alignment Strings for the alignment Blocks in an 323 * alignment. This method creates a sequence alignment where all residues 324 * are in uppercase and a residue alone with gaps in all the other 325 * structures represents unaligned residues. Because of this latter 326 * constraint only the residues within the Blocks are represented, for a 327 * more compact alignment. For a sequence alignment of the full protein use 328 * {@link #getSequenceAlignment(MultipleAlignment)}. 329 * <p> 330 * Blocks are concatenated in the order returned by 331 * {@link MultipleAlignment#getBlocks()}, so sequences may not be 332 * sequential. Gaps between blocks are omitted, while gaps within blocks are 333 * represented by '-'. Separation between different Blocks is indicated by a 334 * gap in all positions, meaning that there is something unaligned 335 * inbetween. 336 * 337 * @param alignment 338 * input MultipleAlignment 339 * @param mapSeqToStruct 340 * provides a link from the sequence alignment position to the 341 * structure alignment position. Specially designed for the GUI. 342 * Has to be initialized previously and will be overwritten. 343 * @return a string for each row in the alignment, giving the 1-letter code 344 * for each aligned residue. 345 */ 346 public static List<String> getBlockSequenceAlignment( 347 MultipleAlignment alignment, List<Integer> mapSeqToStruct) { 348 349 // Initialize sequence variables 350 List<String> alnSequences = new ArrayList<>(); 351 for (int str = 0; str < alignment.size(); str++) 352 alnSequences.add(""); 353 mapSeqToStruct.clear(); 354 List<Atom[]> atoms = alignment.getAtomArrays(); 355 int globalPos = -1; 356 357 // Loop through all the alignment Blocks in the order given 358 for (int b = 0; b < alignment.getBlocks().size(); b++) { 359 if (b != 0) { 360 // Add a gap to all structures to separate Blocks 361 for (int str = 0; str < alignment.size(); str++) 362 alnSequences.set(str, alnSequences.get(str).concat("-")); 363 mapSeqToStruct.add(-1); // means unaligned position 364 } 365 366 // Store the previous position added to the sequence alignment 367 int[] previousPos = new int[alignment.size()]; 368 Arrays.fill(previousPos, -1); 369 // Store provisional characters 370 char[] provisionalChar = new char[alignment.size()]; 371 Arrays.fill(provisionalChar, '-'); 372 373 for (int pos = 0; pos < alignment.getBlocks().get(b).length(); pos++) { 374 globalPos++; 375 boolean gaps = true; 376 while (gaps) { 377 gaps = false; 378 // Loop through all the structures 379 for (int str = 0; str < alignment.size(); str++) { 380 // If it is the first position or before it was null 381 if (previousPos[str] == -1) { 382 Integer residue = alignment.getBlocks().get(b) 383 .getAlignRes().get(str).get(pos); 384 if (residue == null) 385 provisionalChar[str] = '-'; 386 else { 387 Atom a = atoms.get(str)[residue]; 388 String g = a.getGroup().getPDBName(); 389 char aa = StructureTools.get1LetterCode(g); 390 provisionalChar[str] = aa; 391 } 392 } else { 393 Integer residue = alignment.getBlocks().get(b) 394 .getAlignRes().get(str).get(pos); 395 if (residue == null) 396 provisionalChar[str] = '-'; 397 else if (previousPos[str] + 1 == residue) { 398 Atom a = atoms.get(str)[residue]; 399 String g = a.getGroup().getPDBName(); 400 char aa = StructureTools.get1LetterCode(g); 401 provisionalChar[str] = aa; 402 } else { 403 provisionalChar[str] = ' '; 404 gaps = true; 405 } 406 } 407 }// End all structures analysis 408 409 if (gaps) { 410 for (int str = 0; str < alignment.size(); str++) { 411 if (provisionalChar[str] == ' ') { 412 // It means this residue was non-consecutive 413 for (int s2 = 0; s2 < alignment.size(); s2++) { 414 if (str == s2) { 415 int next = previousPos[str] + 1; 416 Atom a = atoms.get(s2)[next]; 417 String g = a.getGroup().getPDBName(); 418 char aa = StructureTools 419 .get1LetterCode(g); 420 alnSequences.set( 421 s2, 422 alnSequences.get(s2).concat( 423 String.valueOf(aa)) ); 424 } else { 425 alnSequences.set(s2, 426 alnSequences.get(s2) 427 .concat("-")); 428 } 429 } 430 mapSeqToStruct.add(-1); // unaligned 431 previousPos[str] += 1; 432 } 433 } 434 } else { // Append the provisional and update the indices 435 for (int str = 0; str < alignment.size(); str++) { 436 alnSequences.set( 437 str, 438 alnSequences.get(str).concat( 439 String.valueOf(provisionalChar[str])) ); 440 if (provisionalChar[str] != '-') { 441 previousPos[str] = alignment.getBlocks().get(b) 442 .getAlignRes().get(str).get(pos); 443 } 444 } 445 mapSeqToStruct.add(globalPos); 446 } 447 } 448 } 449 } 450 return alnSequences; 451 } 452 453 /** 454 * Calculate the sequence alignment Strings for the alignment Blocks in an 455 * alignment. This method creates a sequence alignment where all residues 456 * are in uppercase and a residue alone with gaps in all the other 457 * structures represents unaligned residues. Because of this latter 458 * constraint only the residues within the Blocks are represented, for a 459 * more compact alignment. For a sequence alignment of the full protein use 460 * {@link #getSequenceAlignment(MultipleAlignment)}. 461 * <p> 462 * Blocks are concatenated in the order returned by 463 * {@link MultipleAlignment#getBlocks()}, so sequences may not be 464 * sequential. Gaps between blocks are omitted, while gaps within blocks are 465 * represented by '-'. Separation between different Blocks is indicated by a 466 * gap in all positions, meaning that there is something unaligned 467 * inbetween. 468 * 469 * @param ma 470 * input MultipleAlignment 471 * @return String for each row in the alignment, giving the 1-letter code 472 * for each aligned residue. 473 */ 474 public static List<String> getBlockSequenceAlignment(MultipleAlignment ma) { 475 return getBlockSequenceAlignment(ma, new ArrayList<Integer>()); 476 } 477 478 /** 479 * Returns the Atom of the specified structure that is aligned in the 480 * sequence alignment position specified. 481 * 482 * @param msa 483 * the MultipleAlignment object from where the sequence alignment 484 * has been generated 485 * @param mapSeqToStruct 486 * the mapping between sequence and structure generated with the 487 * sequence alignment 488 * @param str 489 * the structure index of the alignment (row) 490 * @param sequencePos 491 * the sequence alignment position (column) 492 * @return Atom the atom in that position or null if there is a gap 493 */ 494 public static Atom getAtomForSequencePosition(MultipleAlignment msa, 495 List<Integer> mapSeqToStruct, int str, int sequencePos) { 496 497 int seqPos = mapSeqToStruct.get(sequencePos); 498 499 // Check if the position selected is an aligned position 500 if (seqPos == -1) 501 return null; 502 else { 503 Atom a = null; 504 // Calculate the corresponding structure position 505 int sum = 0; 506 for (Block b : msa.getBlocks()) { 507 if (sum + b.length() <= seqPos) { 508 sum += b.length(); 509 continue; 510 } else { 511 for (Integer p : b.getAlignRes().get(str)) { 512 if (sum == seqPos) { 513 if (p != null) { 514 a = msa.getAtomArrays().get(str)[p]; 515 } 516 break; 517 } 518 sum++; 519 } 520 break; 521 } 522 } 523 return a; 524 } 525 } 526 527 /** 528 * Returns the block number of a specified position in the sequence 529 * alignment, given the mapping from structure to function. 530 * 531 * @param multAln 532 * the MultipleAlignment object from where the sequence alignment 533 * has been generated. 534 * @param mapSeqToStruct 535 * the mapping between sequence and structure generated with the 536 * sequence alignment 537 * @param sequencePos 538 * the position in the sequence alignment (column) 539 * @return int the block index, or -1 if the position is not aligned 540 */ 541 public static int getBlockForSequencePosition(MultipleAlignment multAln, 542 List<Integer> mapSeqToStruct, int sequencePos) { 543 544 int seqPos = mapSeqToStruct.get(sequencePos); 545 // Check if the position selected is an aligned position 546 if (seqPos == -1) 547 return -1; 548 else { 549 // Calculate the corresponding block (by iterating all Blocks) 550 int sum = 0; 551 int block = 0; 552 for (Block b : multAln.getBlocks()) { 553 if (sum + b.length() <= seqPos) { 554 sum += b.length(); 555 block++; 556 continue; 557 } else 558 break; 559 } 560 return block; 561 } 562 } 563 564 /** 565 * The average residue distance Matrix contains the average distance from 566 * each residue to all other residues aligned with it. 567 * <p> 568 * Complexity: T(n,l) = O(l*n^2), if n=number of structures and l=alignment 569 * length. 570 * 571 * @param msa 572 * MultipleAlignment 573 * @return Matrix containing all average residue distances 574 */ 575 public static Matrix getAverageResidueDistances(MultipleAlignment msa) { 576 List<Atom[]> transformed = transformAtoms(msa); 577 return getAverageResidueDistances(transformed); 578 } 579 580 /** 581 * The average residue distance Matrix contains the average distance from 582 * each residue to all other residues aligned with it. 583 * <p> 584 * Complexity: T(n,l) = O(l*n^2), if n=number of structures and l=alignment 585 * length. 586 * 587 * @param transformed 588 * List of Atom arrays containing only the aligned atoms of each 589 * structure, or null if there is a gap. 590 * @return Matrix containing all average residue distances. Entry -1 means 591 * there is a gap in the position. 592 */ 593 public static Matrix getAverageResidueDistances(List<Atom[]> transformed) { 594 595 int size = transformed.size(); 596 int length = transformed.get(0).length; 597 Matrix resDist = new Matrix(size, length, -1); 598 599 // Calculate the average residue distances 600 for (int r1 = 0; r1 < size; r1++) { 601 for (int c = 0; c < transformed.get(r1).length; c++) { 602 Atom refAtom = transformed.get(r1)[c]; 603 if (refAtom == null) 604 continue; 605 606 for (int r2 = r1 + 1; r2 < size; r2++) { 607 Atom atom = transformed.get(r2)[c]; 608 if (atom != null) { 609 double distance = Calc.getDistance(refAtom, atom); 610 611 if (resDist.get(r1, c) == -1) { 612 resDist.set(r1, c, 1 + distance); 613 } else { 614 resDist.set(r1, c, resDist.get(r1, c) + distance); 615 } 616 617 if (resDist.get(r2, c) == -1) { 618 resDist.set(r2, c, 1 + distance); 619 } else { 620 resDist.set(r2, c, resDist.get(r2, c) + distance); 621 } 622 } 623 } 624 } 625 } 626 627 for (int c = 0; c < length; c++) { 628 int nonNullRes = 0; 629 for (int r = 0; r < size; r++) { 630 if (resDist.get(r, c) != -1) 631 nonNullRes++; 632 } 633 for (int r = 0; r < size; r++) { 634 if (resDist.get(r, c) != -1) { 635 resDist.set(r, c, resDist.get(r, c) / nonNullRes); 636 } 637 } 638 } 639 return resDist; 640 } 641 642 /** 643 * Transforms atoms according to the superposition stored in the alignment. 644 * <p> 645 * For each structure in the alignment, returns an atom for each 646 * representative atom in the aligned columns, omitting unaligned residues 647 * (i.e. an array of length <code>alignment.length()</code> ). 648 * <p> 649 * All blocks are concatenated together, so Atoms may not appear in the same 650 * order as in their parent structure. If the alignment blocks contain null 651 * residues (gaps), then the returned array will also contain null Atoms in 652 * the same positions. 653 * 654 * @param alignment 655 * MultipleAlignment 656 * @return List of Atom arrays of only the aligned atoms of every structure 657 * (null Atom if a gap position) 658 */ 659 public static List<Atom[]> transformAtoms(MultipleAlignment alignment) { 660 if (alignment.getEnsemble() == null) { 661 throw new NullPointerException("No ensemble set for this alignment"); 662 } 663 664 List<Atom[]> atomArrays = alignment.getAtomArrays(); 665 List<Atom[]> transformed = new ArrayList<>(atomArrays.size()); 666 667 // Loop through structures 668 for (int i = 0; i < atomArrays.size(); i++) { 669 670 Matrix4d transform = null; 671 Atom[] curr = atomArrays.get(i); // all CA atoms from structure 672 673 // Concatenated list of all blocks for this structure 674 Atom[] transformedAtoms = new Atom[alignment.length()]; 675 int transformedAtomsLength = 0; 676 677 // Each blockset gets transformed independently 678 for (BlockSet bs : alignment.getBlockSets()) { 679 680 Atom[] blocksetAtoms = new Atom[bs.length()]; 681 int blockPos = 0; 682 683 for (Block blk : bs.getBlocks()) { 684 if (blk.size() != atomArrays.size()) { 685 throw new IllegalStateException(String.format( 686 "Mismatched block size. Expected %d " 687 + "structures, found %d.", 688 atomArrays.size(), blk.size())); 689 } 690 // Extract aligned atoms 691 for (int j = 0; j < blk.length(); j++) { 692 Integer alignedPos = blk.getAlignRes().get(i).get(j); 693 if (alignedPos != null) { 694 blocksetAtoms[blockPos] = (Atom) curr[alignedPos] 695 .clone(); 696 } 697 blockPos++; 698 } 699 } 700 701 // Transform according to the blockset or alignment matrix 702 Matrix4d blockTrans = null; 703 if (bs.getTransformations() != null) 704 blockTrans = bs.getTransformations().get(i); 705 if (blockTrans == null) { 706 blockTrans = transform; 707 } 708 709 for (Atom a : blocksetAtoms) { 710 if (a != null) 711 Calc.transform(a, blockTrans); 712 transformedAtoms[transformedAtomsLength] = a; 713 transformedAtomsLength++; 714 } 715 } 716 assert (transformedAtomsLength == alignment.length()); 717 718 transformed.add(transformedAtoms); 719 } 720 return transformed; 721 } 722 723 /** 724 * Calculate a List of alignment indicies that correspond to the core of a 725 * Block, which means that all structures have a residue in that position. 726 * 727 * @param block 728 * alignment Block 729 * @return List of positions in the core of the alignment 730 */ 731 public static List<Integer> getCorePositions(Block block) { 732 733 List<Integer> corePositions = new ArrayList<>(); 734 735 for (int col = 0; col < block.length(); col++) { 736 boolean core = true; 737 for (int str = 0; str < block.size(); str++) { 738 if (block.getAlignRes().get(str).get(col) == null) { 739 core = false; 740 break; 741 } 742 } 743 if (core) 744 corePositions.add(col); 745 } 746 return corePositions; 747 } 748 749 /** 750 * Sort blocks so that the specified row is in sequential order. The sort 751 * happens in place. 752 * 753 * @param blocks 754 * List of blocks 755 * @param sortedIndex 756 * Index of the row to be sorted 757 */ 758 public static void sortBlocks(List<Block> blocks, final int sortedIndex) { 759 Collections.sort(blocks, new Comparator<Block>() { 760 @Override 761 public int compare(Block o1, Block o2) { 762 // Compare the first non-null residue of each block 763 List<Integer> alignres1 = o1.getAlignRes().get(sortedIndex); 764 List<Integer> alignres2 = o2.getAlignRes().get(sortedIndex); 765 Integer res1 = null; 766 Integer res2 = null; 767 for (Integer r : alignres1) { 768 if (r != null) { 769 res1 = r; 770 break; 771 } 772 } 773 for (Integer r : alignres2) { 774 if (r != null) { 775 res2 = r; 776 break; 777 } 778 } 779 return res1.compareTo(res2); 780 } 781 }); 782 } 783 784 /** 785 * Convert a MultipleAlignment into a MultipleSequenceAlignment of AminoAcid 786 * residues. This method is only valid for protein structure alignments. 787 * 788 * @param msta 789 * Multiple Structure Alignment 790 * @return MultipleSequenceAlignment of protein sequences 791 * @throws CompoundNotFoundException 792 */ 793 public static MultipleSequenceAlignment<ProteinSequence, AminoAcidCompound> toProteinMSA( 794 MultipleAlignment msta) throws CompoundNotFoundException { 795 796 // Check that the alignment is of protein structures 797 Group g = msta.getAtomArrays().get(0)[0].getGroup(); 798 if (!(g instanceof AminoAcid)) { 799 throw new IllegalArgumentException( 800 "Cannot convert to multiple sequence alignment: " 801 + "the structures aligned are not proteins"); 802 } 803 804 MultipleSequenceAlignment<ProteinSequence, AminoAcidCompound> msa = new MultipleSequenceAlignment<>(); 805 806 Map<String, Integer> uniqueID = new HashMap<>(); 807 List<String> seqs = getSequenceAlignment(msta); 808 for (int i = 0; i < msta.size(); i++) { 809 // Make sure the identifiers are unique (required by AccessionID) 810 String id = msta.getStructureIdentifier(i).toString(); 811 if (uniqueID.containsKey(id)) { 812 uniqueID.put(id, uniqueID.get(id) + 1); 813 id += "_" + uniqueID.get(id); 814 } else 815 uniqueID.put(id, 1); 816 817 AccessionID acc = new AccessionID(id); 818 ProteinSequence pseq = new ProteinSequence(seqs.get(i)); 819 pseq.setAccession(acc); 820 msa.addAlignedSequence(pseq); 821 } 822 return msa; 823 } 824 825 public static Structure toMultimodelStructure(MultipleAlignment multAln, List<Atom[]> transformedAtoms) throws StructureException { 826 PDBHeader header = new PDBHeader(); 827 String title = multAln.getEnsemble().getAlgorithmName() + " V." 828 + multAln.getEnsemble().getVersion() + " : "; 829 830 for (StructureIdentifier name : multAln.getEnsemble() 831 .getStructureIdentifiers()) { 832 title += name.getIdentifier() + " "; 833 } 834 Structure artificial = getAlignedStructure(transformedAtoms); 835 836 artificial.setPDBHeader(header); 837 header.setTitle(title); 838 return artificial; 839 } 840 841 /** 842 * Get an artificial Structure containing a different model for every 843 * input structure, so that the alignment result can be viewed in Jmol. 844 * The Atoms have to be rotated beforehand. 845 * 846 * @param atomArrays an array of Atoms for every aligned structure 847 * @return a structure object containing a set of models, 848 * one for each input array of Atoms. 849 * @throws StructureException 850 */ 851 public static final Structure getAlignedStructure(List<Atom[]> atomArrays) 852 throws StructureException { 853 854 Structure s = new StructureImpl(); 855 for (int i=0; i<atomArrays.size(); i++){ 856 List<Chain> model = AlignmentTools.getAlignedModel(atomArrays.get(i)); 857 s.addModel(model); 858 } 859 return s; 860 } 861 862 /** 863 * Calculate the RMSD matrix of a MultipleAlignment, that is, entry (i,j) of 864 * the matrix contains the RMSD between structures i and j. 865 * 866 * @param msa 867 * Multiple Structure Alignment 868 * @return Matrix of RMSD with size the number of structures squared 869 */ 870 public static Matrix getRMSDMatrix(MultipleAlignment msa) { 871 872 Matrix rmsdMat = new Matrix(msa.size(), msa.size()); 873 List<Atom[]> superposed = transformAtoms(msa); 874 875 for (int i = 0; i < msa.size(); i++) { 876 for (int j = i; j < msa.size(); j++) { 877 if (i == j) 878 rmsdMat.set(i, j, 0.0); 879 List<Atom[]> compared = new ArrayList<>(); 880 compared.add(superposed.get(i)); 881 compared.add(superposed.get(j)); 882 double rmsd = MultipleAlignmentScorer.getRMSD(compared); 883 rmsdMat.set(i, j, rmsd); 884 rmsdMat.set(j, i, rmsd); 885 } 886 } 887 return rmsdMat; 888 } 889 890 /** 891 * Calculate a phylogenetic tree of the MultipleAlignment using Kimura 892 * distances and the Neighbor Joining algorithm from forester. 893 * 894 * @param msta 895 * MultipleAlignment of protein structures 896 * @return Phylogeny phylogenetic tree 897 * @throws CompoundNotFoundException 898 * @throws IOException 899 */ 900 public static Phylogeny getKimuraTree(MultipleAlignment msta) 901 throws CompoundNotFoundException, IOException { 902 MultipleSequenceAlignment<ProteinSequence, AminoAcidCompound> msa = MultipleAlignmentTools 903 .toProteinMSA(msta); 904 BasicSymmetricalDistanceMatrix distmat = (BasicSymmetricalDistanceMatrix) DistanceMatrixCalculator 905 .kimuraDistance(msa); 906 Phylogeny tree = TreeConstructor.distanceTree(distmat, 907 TreeConstructorType.NJ); 908 tree.setName("Kimura Tree"); 909 return tree; 910 } 911 912 /** 913 * Calculate a phylogenetic tree of the MultipleAlignment using 914 * dissimilarity scores (DS), based in SDM Substitution Matrix (ideal for 915 * distantly related proteins, structure-derived) and the Neighbor Joining 916 * algorithm from forester. 917 * 918 * @param msta 919 * MultipleAlignment of protein structures 920 * @return Phylogeny phylogenetic tree 921 * @throws CompoundNotFoundException 922 * @throws IOException 923 */ 924 public static Phylogeny getHSDMTree(MultipleAlignment msta) 925 throws CompoundNotFoundException, IOException { 926 MultipleSequenceAlignment<ProteinSequence, AminoAcidCompound> msa = MultipleAlignmentTools 927 .toProteinMSA(msta); 928 BasicSymmetricalDistanceMatrix distmat = (BasicSymmetricalDistanceMatrix) DistanceMatrixCalculator 929 .dissimilarityScore(msa, SubstitutionMatrixHelper.getAminoAcidSubstitutionMatrix("PRLA000102")); 930 Phylogeny tree = TreeConstructor.distanceTree(distmat, 931 TreeConstructorType.NJ); 932 tree.setName("HSDM Tree"); 933 return tree; 934 } 935 936 /** 937 * Calculate a phylogenetic tree of the MultipleAlignment using RMSD 938 * distances and the Neighbor Joining algorithm from forester. 939 * 940 * @param msta 941 * MultipleAlignment of protein structures 942 * @return Phylogeny phylogenetic tree 943 */ 944 public static Phylogeny getStructuralTree(MultipleAlignment msta) { 945 double[][] rmsdMat = MultipleAlignmentTools.getRMSDMatrix(msta) 946 .getArray(); 947 BasicSymmetricalDistanceMatrix rmsdDist = (BasicSymmetricalDistanceMatrix) DistanceMatrixCalculator 948 .structuralDistance(rmsdMat, 1, 5, 0.4); 949 // Set the identifiers of the matrix 950 Map<String, Integer> alreadySeen = new HashMap<>(); 951 for (int i = 0; i < msta.size(); i++) { 952 // Make sure the identifiers are unique 953 String id = msta.getStructureIdentifier(i).toString(); 954 if (alreadySeen.containsKey(id)) { 955 alreadySeen.put(id, alreadySeen.get(id) + 1); 956 id += "_" + alreadySeen.get(id); 957 } else 958 alreadySeen.put(id, 1); 959 rmsdDist.setIdentifier(i, id); 960 } 961 Phylogeny tree = TreeConstructor.distanceTree(rmsdDist, 962 TreeConstructorType.NJ); 963 tree.setName("Structural Tree"); 964 return tree; 965 } 966 967 /** 968 * Convert an MSA into a matrix of equivalent residues. 969 * 970 * This concatenates all blocks, meaning that the indices might not be 971 * sequential. 972 * 973 * Indices should be consistent with `msa.getAtomArrays()`. 974 * @param msa Multiple alignment 975 * @param coreOnly Include only core (ungapped) columns. Otherwise gaps are 976 * represented with null. 977 * @return 978 */ 979 public static List<List<Integer>> getEquivalentResidues(MultipleAlignment msa, boolean coreOnly) { 980 List<List<Integer>> eqr = new ArrayList<>(); 981 for (int str = 0; str < msa.size(); str++) { 982 eqr.add(new ArrayList<>()); 983 } 984 985 for(Block block: msa.getBlocks()) { 986 List<List<Integer>> aln = block.getAlignRes(); 987 for (int col = 0; col < block.length(); col++) { 988 // skip non-core columns 989 if(coreOnly) { 990 boolean core = true; 991 for (int str = 0; str < block.size(); str++) { 992 if (aln.get(str).get(col) == null) { 993 core = false; 994 break; 995 } 996 } 997 if(!core) { 998 continue; 999 } 1000 } 1001 // add column to eqr 1002 for (int str = 0; str < block.size(); str++) { 1003 eqr.get(str).add(aln.get(str).get(col)); 1004 } 1005 } 1006 } 1007 return eqr; 1008 } 1009}