001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.structure.align.multiple.util; 022 023import java.io.IOException; 024import java.util.ArrayList; 025import java.util.Arrays; 026import java.util.Collections; 027import java.util.Comparator; 028import java.util.HashMap; 029import java.util.List; 030import java.util.Map; 031import java.util.SortedSet; 032import java.util.TreeSet; 033 034import javax.vecmath.Matrix4d; 035 036import org.biojava.nbio.core.alignment.matrices.SubstitutionMatrixHelper; 037import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 038import org.biojava.nbio.core.sequence.AccessionID; 039import org.biojava.nbio.core.sequence.MultipleSequenceAlignment; 040import org.biojava.nbio.core.sequence.ProteinSequence; 041import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 042import org.biojava.nbio.phylo.DistanceMatrixCalculator; 043import org.biojava.nbio.phylo.TreeConstructor; 044import org.biojava.nbio.phylo.TreeConstructorType; 045import org.biojava.nbio.structure.AminoAcid; 046import org.biojava.nbio.structure.Atom; 047import org.biojava.nbio.structure.Calc; 048import org.biojava.nbio.structure.Chain; 049import org.biojava.nbio.structure.Group; 050import org.biojava.nbio.structure.PDBHeader; 051import org.biojava.nbio.structure.Structure; 052import org.biojava.nbio.structure.StructureException; 053import org.biojava.nbio.structure.StructureIdentifier; 054import org.biojava.nbio.structure.StructureImpl; 055import org.biojava.nbio.structure.StructureTools; 056import org.biojava.nbio.structure.align.multiple.Block; 057import org.biojava.nbio.structure.align.multiple.BlockSet; 058import org.biojava.nbio.structure.align.multiple.MultipleAlignment; 059import org.biojava.nbio.structure.align.util.AlignmentTools; 060import org.biojava.nbio.structure.jama.Matrix; 061import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix; 062import org.forester.phylogeny.Phylogeny; 063 064/** 065 * Utility functions for working with {@link MultipleAlignment}. 066 * <p> 067 * Supported functions: 068 * <ul> 069 * <li>Obtain the alignment as sequence strings 070 * <li>Map from sequence alignment position to structure Atom 071 * <li>Map from sequence alignment position to Block number 072 * <li>Transform the aligned Atoms of a MultipleAlignment 073 * <li>Get all the core alignment positions of the alignment 074 * <li>Calculate the average residue distance of all aligned positions 075 * <li>Sort Blocks in a MultipleAlignment by a specified row 076 * <li>Convert a MultipleAlignment to a MultipleSequenceAlignment 077 * </ul> 078 * 079 * @author Spencer Bliven 080 * @author Aleix Lafita 081 * @since 4.1.0 082 * 083 */ 084public class MultipleAlignmentTools { 085 086 /** 087 * Calculate the sequence alignment Strings for the whole alignment. This 088 * method creates a sequence alignment where aligned residues are in 089 * uppercase and unaligned residues are in lowercase, thus providing a more 090 * compact way to represent the alignment. 091 * <p> 092 * Blocks are concatenated in the order returned by 093 * {@link MultipleAlignment#getBlocks()}, so sequences may not be 094 * sequential. Gaps are represented by '-'. Separation between different 095 * Blocks is indicated by a gap in all positions, meaning that there is a 096 * possible discontinuity. 097 * 098 * @param alignment 099 * input MultipleAlignment 100 * @param mapSeqToStruct 101 * provides a link from the sequence alignment position to the 102 * structure alignment position. Specially designed for the GUI. 103 * Has to be initialized previously and will be overwritten. 104 * @return a string for each row in the alignment, giving the 1-letter code 105 * for each aligned residue. 106 */ 107 public static List<String> getSequenceAlignment( 108 MultipleAlignment alignment, final List<Integer> mapSeqToStruct) { 109 110 // Initialize sequence variables 111 List<String> alnSequences = new ArrayList<String>(); 112 for (int str = 0; str < alignment.size(); str++) 113 alnSequences.add(""); 114 mapSeqToStruct.clear(); 115 List<Atom[]> atoms = alignment.getAtomArrays(); 116 int globalPos = -1; 117 118 // Initialize helper variables in constucting the sequence alignment 119 List<SortedSet<Integer>> freePool = new ArrayList<SortedSet<Integer>>(); 120 List<SortedSet<Integer>> blockStarts = new ArrayList<SortedSet<Integer>>(); 121 List<List<Integer>> aligned = new ArrayList<List<Integer>>(); 122 123 // Generate freePool residues from the ones not aligned 124 for (int i = 0; i < alignment.size(); i++) { 125 List<Integer> residues = new ArrayList<Integer>(); 126 freePool.add(new TreeSet<Integer>()); 127 blockStarts.add(new TreeSet<Integer>()); 128 for (BlockSet bs : alignment.getBlockSets()) { 129 for (Block b : bs.getBlocks()) { 130 boolean first = true; 131 for (int l = 0; l < b.length(); l++) { 132 Integer residue = b.getAlignRes().get(i).get(l); 133 if (residue != null) { 134 if (first) 135 blockStarts.get(i).add(residue); 136 residues.add(residue); 137 first = false; 138 } 139 } 140 } 141 } 142 aligned.add(residues); 143 } 144 // Add any residue not aligned to the free pool for every structure 145 for (int i = 0; i < alignment.size(); i++) { 146 for (int k = 0; k < atoms.get(i).length; k++) { 147 if (!aligned.get(i).contains(k)) 148 freePool.get(i).add(k); 149 } 150 } 151 152 for (int b = 0; b < alignment.getBlocks().size(); b++) { 153 if (b != 0) { 154 // Add a gap to all structures to separate visually the Blocks 155 for (int str = 0; str < alignment.size(); str++) 156 alnSequences.set(str, alnSequences.get(str).concat("-")); 157 mapSeqToStruct.add(-1); // means unaligned position 158 } 159 // Store the previous position added to the sequence alignment 160 int[] previousPos = new int[alignment.size()]; 161 Arrays.fill(previousPos, -1); 162 // Store provisional characters 163 char[] provisionalChar = new char[alignment.size()]; 164 Arrays.fill(provisionalChar, '-'); 165 166 for (int pos = 0; pos < alignment.getBlocks().get(b).length(); pos++) { 167 globalPos++; 168 boolean gaps = true; // true if consecutive with the previous 169 while (gaps) { 170 gaps = false; 171 // Loop through all the structures 172 for (int str = 0; str < alignment.size(); str++) { 173 // If it is the first position or before it was null 174 if (previousPos[str] == -1) { 175 Integer residue = alignment.getBlocks().get(b) 176 .getAlignRes().get(str).get(pos); 177 if (residue == null) 178 provisionalChar[str] = '-'; 179 else { 180 Atom a = atoms.get(str)[residue]; 181 String group = a.getGroup().getPDBName(); 182 provisionalChar[str] = StructureTools 183 .get1LetterCode(group); 184 } 185 } else { 186 Integer residue = alignment.getBlocks().get(b) 187 .getAlignRes().get(str).get(pos); 188 int nextPos = previousPos[str] + 1; 189 if (residue == null) { 190 if (freePool.get(str).contains(nextPos)) { 191 Atom a = atoms.get(str)[nextPos]; 192 String g = a.getGroup().getPDBName(); 193 char aa = StructureTools.get1LetterCode(g); 194 provisionalChar[str] = Character 195 .toLowerCase(aa); 196 } else 197 provisionalChar[str] = '-'; 198 } else if (nextPos == residue) { 199 Atom a = atoms.get(str)[nextPos]; 200 String group = a.getGroup().getPDBName(); 201 provisionalChar[str] = StructureTools 202 .get1LetterCode(group); 203 } else { 204 // This means non-consecutive 205 provisionalChar[str] = ' '; 206 gaps = true; 207 } 208 } 209 }// End all structure analysis 210 211 if (gaps) { 212 for (int str = 0; str < alignment.size(); str++) { 213 if (provisionalChar[str] == ' ') { 214 // It means this residue was non-consecutive 215 Atom a = atoms.get(str)[previousPos[str] + 1]; 216 String group = a.getGroup().getPDBName(); 217 char aa = StructureTools.get1LetterCode(group); 218 alnSequences 219 .set(str, 220 alnSequences 221 .get(str) 222 .concat(String.valueOf(Character 223 .toLowerCase(aa))) ); 224 previousPos[str]++; 225 } else { 226 // Insert a gap otherwise 227 alnSequences.set(str, alnSequences.get(str) 228 .concat("-")); 229 } 230 } 231 mapSeqToStruct.add(-1); // unaligned position 232 } else { 233 // Add provisional and update the indices 234 for (int str = 0; str < alignment.size(); str++) { 235 alnSequences.set( 236 str, 237 alnSequences.get(str).concat( 238 String.valueOf(provisionalChar[str]))); 239 240 if (provisionalChar[str] != '-') { 241 if (alignment.getBlocks().get(b).getAlignRes() 242 .get(str).get(pos) == null) { 243 previousPos[str]++; 244 } else { 245 previousPos[str] = alignment.getBlocks() 246 .get(b).getAlignRes().get(str) 247 .get(pos); 248 } 249 } 250 } 251 mapSeqToStruct.add(globalPos); // alignment index 252 } 253 } 254 } // All positions in the Block considered so far 255 256 // Calculate the index of the next Block for every structure 257 int[] blockEnds = new int[alignment.size()]; 258 for (int str = 0; str < alignment.size(); str++) { 259 for (int res : blockStarts.get(str)) { 260 if (previousPos[str] > res) 261 blockEnds[str] = res; 262 else { 263 blockEnds[str] = res; 264 break; 265 } 266 } 267 } 268 269 // Add the unaligned residues in between Blocks (lowercase) 270 boolean allGaps = false; // true means no more residues to add 271 while (!allGaps) { 272 allGaps = true; 273 for (int str = 0; str < alignment.size(); str++) { 274 if (previousPos[str] + 1 < blockEnds[str]) { 275 Atom a = atoms.get(str)[previousPos[str] + 1]; 276 String group = a.getGroup().getPDBName(); 277 char letter = StructureTools.get1LetterCode(group); 278 279 provisionalChar[str] = Character.toLowerCase(letter); 280 previousPos[str]++; 281 allGaps = false; 282 } else 283 provisionalChar[str] = '-'; 284 } 285 if (!allGaps) { 286 for (int str = 0; str < alignment.size(); str++) { 287 alnSequences.set( 288 str, 289 alnSequences.get(str).concat( 290 String.valueOf(provisionalChar[str])) ); 291 } 292 mapSeqToStruct.add(-1); // unaligned position 293 } 294 } 295 } 296 return alnSequences; 297 } 298 299 /** 300 * Calculate the sequence alignment Strings for the whole alignment. This 301 * method creates a sequence alignment where aligned residues are in 302 * uppercase and unaligned residues are in lowercase, thus providing a more 303 * compact way to represent the alignment. 304 * <p> 305 * Blocks are concatenated in the order returned by 306 * {@link MultipleAlignment#getBlocks()}, so sequences may not be 307 * sequential. Gaps are represented by '-'. Separation between different 308 * Blocks is indicated by a gap in all positions, meaning that there is a 309 * possible discontinuity. 310 * 311 * @param alignment 312 * input MultipleAlignment 313 * @return String for each row in the alignment, giving the 1-letter code 314 * for each aligned residue. 315 */ 316 public static List<String> getSequenceAlignment(MultipleAlignment msa) { 317 return getSequenceAlignment(msa, new ArrayList<Integer>()); 318 } 319 320 /** 321 * Calculate the sequence alignment Strings for the alignment Blocks in an 322 * alignment. This method creates a sequence alignment where all residues 323 * are in uppercase and a residue alone with gaps in all the other 324 * structures represents unaligned residues. Because of this latter 325 * constraint only the residues within the Blocks are represented, for a 326 * more compact alignment. For a sequence alignment of the full protein use 327 * {@link #getSequenceAlignment(MultipleAlignment)}. 328 * <p> 329 * Blocks are concatenated in the order returned by 330 * {@link MultipleAlignment#getBlocks()}, so sequences may not be 331 * sequential. Gaps between blocks are omitted, while gaps within blocks are 332 * represented by '-'. Separation between different Blocks is indicated by a 333 * gap in all positions, meaning that there is something unaligned 334 * inbetween. 335 * 336 * @param alignment 337 * input MultipleAlignment 338 * @param mapSeqToStruct 339 * provides a link from the sequence alignment position to the 340 * structure alignment position. Specially designed for the GUI. 341 * Has to be initialized previously and will be overwritten. 342 * @return a string for each row in the alignment, giving the 1-letter code 343 * for each aligned residue. 344 */ 345 public static List<String> getBlockSequenceAlignment( 346 MultipleAlignment alignment, List<Integer> mapSeqToStruct) { 347 348 // Initialize sequence variables 349 List<String> alnSequences = new ArrayList<String>(); 350 for (int str = 0; str < alignment.size(); str++) 351 alnSequences.add(""); 352 mapSeqToStruct.clear(); 353 List<Atom[]> atoms = alignment.getAtomArrays(); 354 int globalPos = -1; 355 356 // Loop through all the alignment Blocks in the order given 357 for (int b = 0; b < alignment.getBlocks().size(); b++) { 358 if (b != 0) { 359 // Add a gap to all structures to separate Blocks 360 for (int str = 0; str < alignment.size(); str++) 361 alnSequences.set(str, alnSequences.get(str).concat("-")); 362 mapSeqToStruct.add(-1); // means unaligned position 363 } 364 365 // Store the previous position added to the sequence alignment 366 int[] previousPos = new int[alignment.size()]; 367 Arrays.fill(previousPos, -1); 368 // Store provisional characters 369 char[] provisionalChar = new char[alignment.size()]; 370 Arrays.fill(provisionalChar, '-'); 371 372 for (int pos = 0; pos < alignment.getBlocks().get(b).length(); pos++) { 373 globalPos++; 374 boolean gaps = true; 375 while (gaps) { 376 gaps = false; 377 // Loop through all the structures 378 for (int str = 0; str < alignment.size(); str++) { 379 // If it is the first position or before it was null 380 if (previousPos[str] == -1) { 381 Integer residue = alignment.getBlocks().get(b) 382 .getAlignRes().get(str).get(pos); 383 if (residue == null) 384 provisionalChar[str] = '-'; 385 else { 386 Atom a = atoms.get(str)[residue]; 387 String g = a.getGroup().getPDBName(); 388 char aa = StructureTools.get1LetterCode(g); 389 provisionalChar[str] = aa; 390 } 391 } else { 392 Integer residue = alignment.getBlocks().get(b) 393 .getAlignRes().get(str).get(pos); 394 if (residue == null) 395 provisionalChar[str] = '-'; 396 else if (previousPos[str] + 1 == residue) { 397 Atom a = atoms.get(str)[residue]; 398 String g = a.getGroup().getPDBName(); 399 char aa = StructureTools.get1LetterCode(g); 400 provisionalChar[str] = aa; 401 } else { 402 provisionalChar[str] = ' '; 403 gaps = true; 404 } 405 } 406 }// End all structures analysis 407 408 if (gaps) { 409 for (int str = 0; str < alignment.size(); str++) { 410 if (provisionalChar[str] == ' ') { 411 // It means this residue was non-consecutive 412 for (int s2 = 0; s2 < alignment.size(); s2++) { 413 if (str == s2) { 414 int next = previousPos[str] + 1; 415 Atom a = atoms.get(s2)[next]; 416 String g = a.getGroup().getPDBName(); 417 char aa = StructureTools 418 .get1LetterCode(g); 419 alnSequences.set( 420 s2, 421 alnSequences.get(s2).concat( 422 String.valueOf(aa)) ); 423 } else { 424 alnSequences.set(s2, 425 alnSequences.get(s2) 426 .concat("-")); 427 } 428 } 429 mapSeqToStruct.add(-1); // unaligned 430 previousPos[str] += 1; 431 } 432 } 433 } else { // Append the provisional and update the indices 434 for (int str = 0; str < alignment.size(); str++) { 435 alnSequences.set( 436 str, 437 alnSequences.get(str).concat( 438 String.valueOf(provisionalChar[str])) ); 439 if (provisionalChar[str] != '-') { 440 previousPos[str] = alignment.getBlocks().get(b) 441 .getAlignRes().get(str).get(pos); 442 } 443 } 444 mapSeqToStruct.add(globalPos); 445 } 446 } 447 } 448 } 449 return alnSequences; 450 } 451 452 /** 453 * Calculate the sequence alignment Strings for the alignment Blocks in an 454 * alignment. This method creates a sequence alignment where all residues 455 * are in uppercase and a residue alone with gaps in all the other 456 * structures represents unaligned residues. Because of this latter 457 * constraint only the residues within the Blocks are represented, for a 458 * more compact alignment. For a sequence alignment of the full protein use 459 * {@link #getSequenceAlignment(MultipleAlignment)}. 460 * <p> 461 * Blocks are concatenated in the order returned by 462 * {@link MultipleAlignment#getBlocks()}, so sequences may not be 463 * sequential. Gaps between blocks are omitted, while gaps within blocks are 464 * represented by '-'. Separation between different Blocks is indicated by a 465 * gap in all positions, meaning that there is something unaligned 466 * inbetween. 467 * 468 * @param alignment 469 * input MultipleAlignment 470 * @return String for each row in the alignment, giving the 1-letter code 471 * for each aligned residue. 472 */ 473 public static List<String> getBlockSequenceAlignment(MultipleAlignment ma) { 474 return getBlockSequenceAlignment(ma, new ArrayList<Integer>()); 475 } 476 477 /** 478 * Returns the Atom of the specified structure that is aligned in the 479 * sequence alignment position specified. 480 * 481 * @param multAln 482 * the MultipleAlignment object from where the sequence alignment 483 * has been generated 484 * @param mapSeqToStruct 485 * the mapping between sequence and structure generated with the 486 * sequence alignment 487 * @param str 488 * the structure index of the alignment (row) 489 * @param sequencePos 490 * the sequence alignment position (column) 491 * @return Atom the atom in that position or null if there is a gap 492 */ 493 public static Atom getAtomForSequencePosition(MultipleAlignment msa, 494 List<Integer> mapSeqToStruct, int str, int sequencePos) { 495 496 int seqPos = mapSeqToStruct.get(sequencePos); 497 498 // Check if the position selected is an aligned position 499 if (seqPos == -1) 500 return null; 501 else { 502 Atom a = null; 503 // Calculate the corresponding structure position 504 int sum = 0; 505 for (Block b : msa.getBlocks()) { 506 if (sum + b.length() <= seqPos) { 507 sum += b.length(); 508 continue; 509 } else { 510 for (Integer p : b.getAlignRes().get(str)) { 511 if (sum == seqPos) { 512 if (p != null) { 513 a = msa.getAtomArrays().get(str)[p]; 514 } 515 break; 516 } 517 sum++; 518 } 519 break; 520 } 521 } 522 return a; 523 } 524 } 525 526 /** 527 * Returns the block number of a specified position in the sequence 528 * alignment, given the mapping from structure to function. 529 * 530 * @param multAln 531 * the MultipleAlignment object from where the sequence alignment 532 * has been generated. 533 * @param mapSeqToStruct 534 * the mapping between sequence and structure generated with the 535 * sequence alignment 536 * @param sequencePos 537 * the position in the sequence alignment (column) 538 * @return int the block index, or -1 if the position is not aligned 539 */ 540 public static int getBlockForSequencePosition(MultipleAlignment multAln, 541 List<Integer> mapSeqToStruct, int sequencePos) { 542 543 int seqPos = mapSeqToStruct.get(sequencePos); 544 // Check if the position selected is an aligned position 545 if (seqPos == -1) 546 return -1; 547 else { 548 // Calculate the corresponding block (by iterating all Blocks) 549 int sum = 0; 550 int block = 0; 551 for (Block b : multAln.getBlocks()) { 552 if (sum + b.length() <= seqPos) { 553 sum += b.length(); 554 block++; 555 continue; 556 } else 557 break; 558 } 559 return block; 560 } 561 } 562 563 /** 564 * The average residue distance Matrix contains the average distance from 565 * each residue to all other residues aligned with it. 566 * <p> 567 * Complexity: T(n,l) = O(l*n^2), if n=number of structures and l=alignment 568 * length. 569 * 570 * @param alignment 571 * MultipleAlignment 572 * @return Matrix containing all average residue distances 573 */ 574 public static Matrix getAverageResidueDistances(MultipleAlignment msa) { 575 List<Atom[]> transformed = transformAtoms(msa); 576 return getAverageResidueDistances(transformed); 577 } 578 579 /** 580 * The average residue distance Matrix contains the average distance from 581 * each residue to all other residues aligned with it. 582 * <p> 583 * Complexity: T(n,l) = O(l*n^2), if n=number of structures and l=alignment 584 * length. 585 * 586 * @param transformed 587 * List of Atom arrays containing only the aligned atoms of each 588 * structure, or null if there is a gap. 589 * @return Matrix containing all average residue distances. Entry -1 means 590 * there is a gap in the position. 591 */ 592 public static Matrix getAverageResidueDistances(List<Atom[]> transformed) { 593 594 int size = transformed.size(); 595 int length = transformed.get(0).length; 596 Matrix resDist = new Matrix(size, length, -1); 597 598 // Calculate the average residue distances 599 for (int r1 = 0; r1 < size; r1++) { 600 for (int c = 0; c < transformed.get(r1).length; c++) { 601 Atom refAtom = transformed.get(r1)[c]; 602 if (refAtom == null) 603 continue; 604 605 for (int r2 = r1 + 1; r2 < size; r2++) { 606 Atom atom = transformed.get(r2)[c]; 607 if (atom != null) { 608 double distance = Calc.getDistance(refAtom, atom); 609 610 if (resDist.get(r1, c) == -1) { 611 resDist.set(r1, c, 1 + distance); 612 } else { 613 resDist.set(r1, c, resDist.get(r1, c) + distance); 614 } 615 616 if (resDist.get(r2, c) == -1) { 617 resDist.set(r2, c, 1 + distance); 618 } else { 619 resDist.set(r2, c, resDist.get(r2, c) + distance); 620 } 621 } 622 } 623 } 624 } 625 626 for (int c = 0; c < length; c++) { 627 int nonNullRes = 0; 628 for (int r = 0; r < size; r++) { 629 if (resDist.get(r, c) != -1) 630 nonNullRes++; 631 } 632 for (int r = 0; r < size; r++) { 633 if (resDist.get(r, c) != -1) { 634 resDist.set(r, c, resDist.get(r, c) / nonNullRes); 635 } 636 } 637 } 638 return resDist; 639 } 640 641 /** 642 * Transforms atoms according to the superposition stored in the alignment. 643 * <p> 644 * For each structure in the alignment, returns an atom for each 645 * representative atom in the aligned columns, omitting unaligned residues 646 * (i.e. an array of length <tt>alignment.length()</tt> ). 647 * <p> 648 * All blocks are concatenated together, so Atoms may not appear in the same 649 * order as in their parent structure. If the alignment blocks contain null 650 * residues (gaps), then the returned array will also contain null Atoms in 651 * the same positions. 652 * 653 * @param alignment 654 * MultipleAlignment 655 * @return List of Atom arrays of only the aligned atoms of every structure 656 * (null Atom if a gap position) 657 */ 658 public static List<Atom[]> transformAtoms(MultipleAlignment alignment) { 659 if (alignment.getEnsemble() == null) { 660 throw new NullPointerException("No ensemble set for this alignment"); 661 } 662 663 List<Atom[]> atomArrays = alignment.getAtomArrays(); 664 List<Atom[]> transformed = new ArrayList<Atom[]>(atomArrays.size()); 665 666 // Loop through structures 667 for (int i = 0; i < atomArrays.size(); i++) { 668 669 Matrix4d transform = null; 670 Atom[] curr = atomArrays.get(i); // all CA atoms from structure 671 672 // Concatenated list of all blocks for this structure 673 Atom[] transformedAtoms = new Atom[alignment.length()]; 674 int transformedAtomsLength = 0; 675 676 // Each blockset gets transformed independently 677 for (BlockSet bs : alignment.getBlockSets()) { 678 679 Atom[] blocksetAtoms = new Atom[bs.length()]; 680 int blockPos = 0; 681 682 for (Block blk : bs.getBlocks()) { 683 if (blk.size() != atomArrays.size()) { 684 throw new IllegalStateException(String.format( 685 "Mismatched block size. Expected %d " 686 + "structures, found %d.", 687 atomArrays.size(), blk.size())); 688 } 689 // Extract aligned atoms 690 for (int j = 0; j < blk.length(); j++) { 691 Integer alignedPos = blk.getAlignRes().get(i).get(j); 692 if (alignedPos != null) { 693 blocksetAtoms[blockPos] = (Atom) curr[alignedPos] 694 .clone(); 695 } 696 blockPos++; 697 } 698 } 699 700 // Transform according to the blockset or alignment matrix 701 Matrix4d blockTrans = null; 702 if (bs.getTransformations() != null) 703 blockTrans = bs.getTransformations().get(i); 704 if (blockTrans == null) { 705 blockTrans = transform; 706 } 707 708 for (Atom a : blocksetAtoms) { 709 if (a != null) 710 Calc.transform(a, blockTrans); 711 transformedAtoms[transformedAtomsLength] = a; 712 transformedAtomsLength++; 713 } 714 } 715 assert (transformedAtomsLength == alignment.length()); 716 717 transformed.add(transformedAtoms); 718 } 719 return transformed; 720 } 721 722 /** 723 * Calculate a List of alignment indicies that correspond to the core of a 724 * Block, which means that all structures have a residue in that positon. 725 * 726 * @param block 727 * alignment Block 728 * @return List of positions in the core of the alignment 729 */ 730 public static List<Integer> getCorePositions(Block block) { 731 732 List<Integer> corePositions = new ArrayList<Integer>(); 733 734 for (int col = 0; col < block.length(); col++) { 735 boolean core = true; 736 for (int str = 0; str < block.size(); str++) { 737 if (block.getAlignRes().get(str).get(col) == null) { 738 core = false; 739 break; 740 } 741 } 742 if (core) 743 corePositions.add(col); 744 } 745 return corePositions; 746 } 747 748 /** 749 * Sort blocks so that the specified row is in sequential order. The sort 750 * happens in place. 751 * 752 * @param blocks 753 * List of blocks 754 * @param sortedIndex 755 * Index of the row to be sorted 756 */ 757 public static void sortBlocks(List<Block> blocks, final int sortedIndex) { 758 Collections.sort(blocks, new Comparator<Block>() { 759 @Override 760 public int compare(Block o1, Block o2) { 761 // Compare the first non-null residue of each block 762 List<Integer> alignres1 = o1.getAlignRes().get(sortedIndex); 763 List<Integer> alignres2 = o2.getAlignRes().get(sortedIndex); 764 Integer res1 = null; 765 Integer res2 = null; 766 for (Integer r : alignres1) { 767 if (r != null) { 768 res1 = r; 769 break; 770 } 771 } 772 for (Integer r : alignres2) { 773 if (r != null) { 774 res2 = r; 775 break; 776 } 777 } 778 return res1.compareTo(res2); 779 } 780 }); 781 } 782 783 /** 784 * Convert a MultipleAlignment into a MultipleSequenceAlignment of AminoAcid 785 * residues. This method is only valid for protein structure alignments. 786 * 787 * @param msta 788 * Multiple Structure Alignment 789 * @return MultipleSequenceAlignment of protein sequences 790 * @throws CompoundNotFoundException 791 */ 792 public static MultipleSequenceAlignment<ProteinSequence, AminoAcidCompound> toProteinMSA( 793 MultipleAlignment msta) throws CompoundNotFoundException { 794 795 // Check that the alignment is of protein structures 796 Group g = msta.getAtomArrays().get(0)[0].getGroup(); 797 if (!(g instanceof AminoAcid)) { 798 throw new IllegalArgumentException( 799 "Cannot convert to multiple sequence alignment: " 800 + "the structures aligned are not proteins"); 801 } 802 803 MultipleSequenceAlignment<ProteinSequence, AminoAcidCompound> msa = new MultipleSequenceAlignment<ProteinSequence, AminoAcidCompound>(); 804 805 Map<String, Integer> uniqueID = new HashMap<String, Integer>(); 806 List<String> seqs = getSequenceAlignment(msta); 807 for (int i = 0; i < msta.size(); i++) { 808 // Make sure the identifiers are unique (required by AccessionID) 809 String id = msta.getStructureIdentifier(i).toString(); 810 if (uniqueID.containsKey(id)) { 811 uniqueID.put(id, uniqueID.get(id) + 1); 812 id += "_" + uniqueID.get(id); 813 } else 814 uniqueID.put(id, 1); 815 816 AccessionID acc = new AccessionID(id); 817 ProteinSequence pseq = new ProteinSequence(seqs.get(i)); 818 pseq.setAccession(acc); 819 msa.addAlignedSequence(pseq); 820 } 821 return msa; 822 } 823 824 public static Structure toMultimodelStructure(MultipleAlignment multAln, List<Atom[]> transformedAtoms) throws StructureException { 825 PDBHeader header = new PDBHeader(); 826 String title = multAln.getEnsemble().getAlgorithmName() + " V." 827 + multAln.getEnsemble().getVersion() + " : "; 828 829 for (StructureIdentifier name : multAln.getEnsemble() 830 .getStructureIdentifiers()) { 831 title += name.getIdentifier() + " "; 832 } 833 Structure artificial = getAlignedStructure(transformedAtoms); 834 835 artificial.setPDBHeader(header); 836 header.setTitle(title); 837 return artificial; 838 } 839 840 /** 841 * Get an artificial Structure containing a different model for every 842 * input structure, so that the alignment result can be viewed in Jmol. 843 * The Atoms have to be rotated beforehand. 844 * 845 * @param atomArrays an array of Atoms for every aligned structure 846 * @return a structure object containing a set of models, 847 * one for each input array of Atoms. 848 * @throws StructureException 849 */ 850 public static final Structure getAlignedStructure(List<Atom[]> atomArrays) 851 throws StructureException { 852 853 Structure s = new StructureImpl(); 854 for (int i=0; i<atomArrays.size(); i++){ 855 List<Chain> model = AlignmentTools.getAlignedModel(atomArrays.get(i)); 856 s.addModel(model); 857 } 858 return s; 859 } 860 861 /** 862 * Calculate the RMSD matrix of a MultipleAlignment, that is, entry (i,j) of 863 * the matrix contains the RMSD between structures i and j. 864 * 865 * @param msa 866 * Multiple Structure Alignment 867 * @return Matrix of RMSD with size the number of structures squared 868 */ 869 public static Matrix getRMSDMatrix(MultipleAlignment msa) { 870 871 Matrix rmsdMat = new Matrix(msa.size(), msa.size()); 872 List<Atom[]> superposed = transformAtoms(msa); 873 874 for (int i = 0; i < msa.size(); i++) { 875 for (int j = i; j < msa.size(); j++) { 876 if (i == j) 877 rmsdMat.set(i, j, 0.0); 878 List<Atom[]> compared = new ArrayList<Atom[]>(); 879 compared.add(superposed.get(i)); 880 compared.add(superposed.get(j)); 881 double rmsd = MultipleAlignmentScorer.getRMSD(compared); 882 rmsdMat.set(i, j, rmsd); 883 rmsdMat.set(j, i, rmsd); 884 } 885 } 886 return rmsdMat; 887 } 888 889 /** 890 * Calculate a phylogenetic tree of the MultipleAlignment using Kimura 891 * distances and the Neighbor Joining algorithm from forester. 892 * 893 * @param msta 894 * MultipleAlignment of protein structures 895 * @return Phylogeny phylogenetic tree 896 * @throws CompoundNotFoundException 897 * @throws IOException 898 */ 899 public static Phylogeny getKimuraTree(MultipleAlignment msta) 900 throws CompoundNotFoundException, IOException { 901 MultipleSequenceAlignment<ProteinSequence, AminoAcidCompound> msa = MultipleAlignmentTools 902 .toProteinMSA(msta); 903 BasicSymmetricalDistanceMatrix distmat = (BasicSymmetricalDistanceMatrix) DistanceMatrixCalculator 904 .kimuraDistance(msa); 905 Phylogeny tree = TreeConstructor.distanceTree(distmat, 906 TreeConstructorType.NJ); 907 tree.setName("Kimura Tree"); 908 return tree; 909 } 910 911 /** 912 * Calculate a phylogenetic tree of the MultipleAlignment using 913 * dissimilarity scores (DS), based in SDM Substitution Matrix (ideal for 914 * distantly related proteins, structure-derived) and the Neighbor Joining 915 * algorithm from forester. 916 * 917 * @param msta 918 * MultipleAlignment of protein structures 919 * @return Phylogeny phylogenetic tree 920 * @throws CompoundNotFoundException 921 * @throws IOException 922 */ 923 public static Phylogeny getHSDMTree(MultipleAlignment msta) 924 throws CompoundNotFoundException, IOException { 925 MultipleSequenceAlignment<ProteinSequence, AminoAcidCompound> msa = MultipleAlignmentTools 926 .toProteinMSA(msta); 927 BasicSymmetricalDistanceMatrix distmat = (BasicSymmetricalDistanceMatrix) DistanceMatrixCalculator 928 .dissimilarityScore(msa, SubstitutionMatrixHelper.getAminoAcidSubstitutionMatrix("PRLA000102")); 929 Phylogeny tree = TreeConstructor.distanceTree(distmat, 930 TreeConstructorType.NJ); 931 tree.setName("HSDM Tree"); 932 return tree; 933 } 934 935 /** 936 * Calculate a phylogenetic tree of the MultipleAlignment using RMSD 937 * distances and the Neighbor Joining algorithm from forester. 938 * 939 * @param msta 940 * MultipleAlignment of protein structures 941 * @return Phylogeny phylogenetic tree 942 * @throws CompoundNotFoundException 943 */ 944 public static Phylogeny getStructuralTree(MultipleAlignment msta) { 945 double[][] rmsdMat = MultipleAlignmentTools.getRMSDMatrix(msta) 946 .getArray(); 947 BasicSymmetricalDistanceMatrix rmsdDist = (BasicSymmetricalDistanceMatrix) DistanceMatrixCalculator 948 .structuralDistance(rmsdMat, 1, 5, 0.4); 949 // Set the identifiers of the matrix 950 Map<String, Integer> alreadySeen = new HashMap<String, Integer>(); 951 for (int i = 0; i < msta.size(); i++) { 952 // Make sure the identifiers are unique 953 String id = msta.getStructureIdentifier(i).toString(); 954 if (alreadySeen.containsKey(id)) { 955 alreadySeen.put(id, alreadySeen.get(id) + 1); 956 id += "_" + alreadySeen.get(id); 957 } else 958 alreadySeen.put(id, 1); 959 rmsdDist.setIdentifier(i, id); 960 } 961 Phylogeny tree = TreeConstructor.distanceTree(rmsdDist, 962 TreeConstructorType.NJ); 963 tree.setName("Structural Tree"); 964 return tree; 965 } 966 967}