001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.structure.align.util; 022 023import org.biojava.nbio.structure.*; 024import org.biojava.nbio.structure.align.ce.CECalculator; 025import org.biojava.nbio.structure.align.model.AFPChain; 026import org.biojava.nbio.structure.align.xml.AFPChainXMLParser; 027import org.biojava.nbio.structure.jama.Matrix; 028import org.slf4j.Logger; 029import org.slf4j.LoggerFactory; 030 031import java.io.IOException; 032import java.io.Writer; 033import java.util.*; 034import java.util.Map.Entry; 035import java.util.regex.Matcher; 036import java.util.regex.Pattern; 037 038/** 039 * Methods for analyzing and manipulating AFPChains and for 040 * other pairwise alignment utilities. <p> 041 * Current methods: replace optimal alignment, create new AFPChain, 042 * format conversion, update superposition, etc. 043 * 044 * @author Spencer Bliven 045 * @author Aleix Lafita 046 * 047 */ 048public class AlignmentTools { 049 050 private static final Logger logger = LoggerFactory.getLogger(AlignmentTools.class); 051 052 053 public static boolean debug = false; 054 055 /** 056 * Checks that the alignment given by afpChain is sequential. This means 057 * that the residue indices of both proteins increase monotonically as 058 * a function of the alignment position (ie both proteins are sorted). 059 * 060 * This will return false for circularly permuted alignments or other 061 * non-topological alignments. It will also return false for cases where 062 * the alignment itself is sequential but it is not stored in the afpChain 063 * in a sorted manner. 064 * 065 * Since algorithms which create non-sequential alignments split the 066 * alignment into multiple blocks, some computational time can be saved 067 * by only checking block boundaries for sequentiality. Setting 068 * <tt>checkWithinBlocks</tt> to <tt>true</tt> makes this function slower, 069 * but detects AFPChains with non-sequential blocks. 070 * 071 * Note that this method should give the same results as 072 * {@link AFPChain#isSequentialAlignment()}. However, the AFPChain version 073 * relies on the StructureAlignment algorithm correctly setting this 074 * parameter, which is sadly not always the case. 075 * 076 * @param afpChain An alignment 077 * @param checkWithinBlocks Indicates whether individual blocks should be 078 * checked for sequentiality 079 * @return True if the alignment is sequential. 080 */ 081 public static boolean isSequentialAlignment(AFPChain afpChain, boolean checkWithinBlocks) { 082 int[][][] optAln = afpChain.getOptAln(); 083 int[] alnLen = afpChain.getOptLen(); 084 int blocks = afpChain.getBlockNum(); 085 086 if(blocks < 1) return true; //trivial case 087 if ( alnLen[0] < 1) return true; 088 089 // Check that blocks are sequential 090 if(checkWithinBlocks) { 091 for(int block = 0; block<blocks; block++) { 092 if(alnLen[block] < 1 ) continue; //skip empty blocks 093 094 int prevRes1 = optAln[block][0][0]; 095 int prevRes2 = optAln[block][1][0]; 096 097 for(int pos = 1; pos<alnLen[block]; pos++) { 098 int currRes1 = optAln[block][0][pos]; 099 int currRes2 = optAln[block][1][pos]; 100 101 if(currRes1 < prevRes1) { 102 return false; 103 } 104 if(currRes2 < prevRes2) { 105 return false; 106 } 107 108 prevRes1 = currRes1; 109 prevRes2 = currRes2; 110 } 111 } 112 } 113 114 // Check that blocks are sequential 115 int prevRes1 = optAln[0][0][alnLen[0]-1]; 116 int prevRes2 = optAln[0][1][alnLen[0]-1]; 117 118 for(int block = 1; block<blocks;block++) { 119 if(alnLen[block] < 1 ) continue; //skip empty blocks 120 121 if(optAln[block][0][0]<prevRes1) { 122 return false; 123 } 124 if(optAln[block][1][0]<prevRes2) { 125 return false; 126 } 127 128 prevRes1 = optAln[block][0][alnLen[block]-1]; 129 prevRes2 = optAln[block][1][alnLen[block]-1]; 130 } 131 132 return true; 133 } 134 135 /** 136 * Creates a Map specifying the alignment as a mapping between residue indices 137 * of protein 1 and residue indices of protein 2. 138 * 139 * <p>For example,<pre> 140 * 1234 141 * 5678</pre> 142 * becomes<pre> 143 * 1->5 144 * 2->6 145 * 3->7 146 * 4->8</pre> 147 * 148 * @param afpChain An alignment 149 * @return A mapping from aligned residues of protein 1 to their partners in protein 2. 150 * @throws StructureException If afpChain is not one-to-one 151 */ 152 public static Map<Integer, Integer> alignmentAsMap(AFPChain afpChain) throws StructureException { 153 Map<Integer,Integer> map = new HashMap<Integer,Integer>(); 154 155 if( afpChain.getAlnLength() < 1 ) { 156 return map; 157 } 158 int[][][] optAln = afpChain.getOptAln(); 159 int[] optLen = afpChain.getOptLen(); 160 for(int block = 0; block < afpChain.getBlockNum(); block++) { 161 for(int pos = 0; pos < optLen[block]; pos++) { 162 int res1 = optAln[block][0][pos]; 163 int res2 = optAln[block][1][pos]; 164 if(map.containsKey(res1)) { 165 throw new StructureException(String.format("Residue %d aligned to both %d and %d.", res1,map.get(res1),res2)); 166 } 167 map.put(res1,res2); 168 } 169 } 170 return map; 171 } 172 173 /** 174 * Applies an alignment k times. Eg if alignmentMap defines function f(x), 175 * this returns a function f^k(x)=f(f(...f(x)...)). 176 * 177 * @param <T> 178 * @param alignmentMap The input function, as a map (see {@link AlignmentTools#alignmentAsMap(AFPChain)}) 179 * @param k The number of times to apply the alignment 180 * @return A new alignment. If the input function is not automorphic 181 * (one-to-one), then some inputs may map to null, indicating that the 182 * function is undefined for that input. 183 */ 184 public static <T> Map<T,T> applyAlignment(Map<T, T> alignmentMap, int k) { 185 return applyAlignment(alignmentMap, new IdentityMap<T>(), k); 186 } 187 188 /** 189 * Applies an alignment k times. Eg if alignmentMap defines function f(x), 190 * this returns a function f^k(x)=f(f(...f(x)...)). 191 * 192 * To allow for functions with different domains and codomains, the identity 193 * function allows converting back in a reasonable way. For instance, if 194 * alignmentMap represented an alignment between two proteins with different 195 * numbering schemes, the identity function could calculate the offset 196 * between residue numbers, eg I(x) = x-offset. 197 * 198 * When an identity function is provided, the returned function calculates 199 * f^k(x) = f(I( f(I( ... f(x) ... )) )). 200 * 201 * @param <S> 202 * @param <T> 203 * @param alignmentMap The input function, as a map (see {@link AlignmentTools#alignmentAsMap(AFPChain)}) 204 * @param identity An identity-like function providing the isomorphism between 205 * the codomain of alignmentMap (of type <T>) and the domain (type <S>). 206 * @param k The number of times to apply the alignment 207 * @return A new alignment. If the input function is not automorphic 208 * (one-to-one), then some inputs may map to null, indicating that the 209 * function is undefined for that input. 210 */ 211 public static <S,T> Map<S,T> applyAlignment(Map<S, T> alignmentMap, Map<T,S> identity, int k) { 212 213 // This implementation simply applies the map k times. 214 // If k were large, it would be more efficient to do this recursively, 215 // (eg f^4 = (f^2)^2) but k will usually be small. 216 217 if(k<0) throw new IllegalArgumentException("k must be positive"); 218 if(k==1) { 219 return new HashMap<S,T>(alignmentMap); 220 } 221 // Convert to lists to establish a fixed order 222 List<S> preimage = new ArrayList<S>(alignmentMap.keySet()); // currently unmodified 223 List<S> image = new ArrayList<S>(preimage); 224 225 for(int n=1;n<k;n++) { 226 // apply alignment 227 for(int i=0;i<image.size();i++) { 228 S pre = image.get(i); 229 T intermediate = (pre==null?null: alignmentMap.get(pre)); 230 S post = (intermediate==null?null: identity.get(intermediate)); 231 image.set(i, post); 232 } 233 } 234 235 236 237 Map<S, T> imageMap = new HashMap<S,T>(alignmentMap.size()); 238 239 //TODO handle nulls consistently. 240 // assure that all the residues in the domain are valid keys 241 /* 242 for(int i=0;i<preimage.size();i++) { 243 S pre = preimage.get(i); 244 T intermediate = (pre==null?null: alignmentMap.get(pre)); 245 S post = (intermediate==null?null: identity.get(intermediate)); 246 imageMap.put(post, null); 247 } 248 */ 249 // now populate with actual values 250 for(int i=0;i<preimage.size();i++) { 251 S pre = preimage.get(i); 252 253 // image is currently f^k-1(x), so take the final step 254 S preK1 = image.get(i); 255 T postK = (preK1==null?null: alignmentMap.get(preK1)); 256 imageMap.put(pre,postK); 257 258 } 259 return imageMap; 260 } 261 262 /** 263 * Helper for {@link #getSymmetryOrder(Map, Map, int, float)} with a true 264 * identity function (X->X). 265 * 266 * <p>This method should only be used in cases where the two proteins 267 * aligned have identical numbering, as for self-alignments. See 268 * {@link #getSymmetryOrder(AFPChain, int, float)} for a way to guess 269 * the sequential correspondence between two proteins. 270 * 271 * @param alignment 272 * @param maxSymmetry 273 * @param minimumMetricChange 274 * @return 275 */ 276 public static int getSymmetryOrder(Map<Integer, Integer> alignment, 277 final int maxSymmetry, final float minimumMetricChange) { 278 return getSymmetryOrder(alignment, new IdentityMap<Integer>(), maxSymmetry, minimumMetricChange); 279 } 280 /** 281 * Tries to detect symmetry in an alignment. 282 * 283 * <p>Conceptually, an alignment is a function f:A->B between two sets of 284 * integers. The function may have simple topology (meaning that if two 285 * elements of A are close, then their images in B will also be close), or 286 * may have more complex topology (such as a circular permutation). This 287 * function checks <i>alignment</i> against a reference function 288 * <i>identity</i>, which should have simple topology. It then tries to 289 * determine the symmetry order of <i>alignment</i> relative to 290 * <i>identity</i>, up to a maximum order of <i>maxSymmetry</i>. 291 * 292 * 293 * <p><strong>Details</strong><br/> 294 * Considers the offset (in number of residues) which a residue moves 295 * after undergoing <i>n</i> alternating transforms by alignment and 296 * identity. If <i>n</i> corresponds to the intrinsic order of the alignment, 297 * this will be small. This algorithm tries increasing values of <i>n</i> 298 * and looks for abrupt decreases in the root mean squared offset. 299 * If none are found at <i>n</i><=maxSymmetry, the alignment is reported as 300 * non-symmetric. 301 * 302 * @param alignment The alignment to test for symmetry 303 * @param identity An alignment with simple topology which approximates 304 * the sequential relationship between the two proteins. Should map in the 305 * reverse direction from alignment. 306 * @param maxSymmetry Maximum symmetry to consider. High values increase 307 * the calculation time and can lead to overfitting. 308 * @param minimumMetricChange Percent decrease in root mean squared offsets 309 * in order to declare symmetry. 0.4f seems to work well for CeSymm. 310 * @return The order of symmetry of alignment, or 1 if no order <= 311 * maxSymmetry is found. 312 * 313 * @see IdentityMap For a simple identity function 314 */ 315 public static int getSymmetryOrder(Map<Integer, Integer> alignment, Map<Integer,Integer> identity, 316 final int maxSymmetry, final float minimumMetricChange) { 317 List<Integer> preimage = new ArrayList<Integer>(alignment.keySet()); // currently unmodified 318 List<Integer> image = new ArrayList<Integer>(preimage); 319 320 int bestSymmetry = 1; 321 double bestMetric = Double.POSITIVE_INFINITY; //lower is better 322 boolean foundSymmetry = false; 323 324 if(debug) { 325 logger.trace("Symm\tPos\tDelta"); 326 } 327 328 for(int n=1;n<=maxSymmetry;n++) { 329 int deltasSq = 0; 330 int numDeltas = 0; 331 // apply alignment 332 for(int i=0;i<image.size();i++) { 333 Integer pre = image.get(i); 334 Integer intermediate = (pre==null?null: alignment.get(pre)); 335 Integer post = (intermediate==null?null: identity.get(intermediate)); 336 image.set(i, post); 337 338 if(post != null) { 339 int delta = post-preimage.get(i); 340 341 deltasSq += delta*delta; 342 numDeltas++; 343 344 if(debug) { 345 logger.debug("%d\t%d\t%d\n",n,preimage.get(i),delta); 346 } 347 } 348 349 } 350 351 // Metrics: RMS compensates for the trend of smaller numDeltas with higher order 352 // Not normalizing by numDeltas favors smaller orders 353 354 double metric = Math.sqrt((double)deltasSq/numDeltas); // root mean squared distance 355 356 if(!foundSymmetry && metric < bestMetric * minimumMetricChange) { 357 // n = 1 is never the best symmetry 358 if(bestMetric < Double.POSITIVE_INFINITY) { 359 foundSymmetry = true; 360 } 361 bestSymmetry = n; 362 bestMetric = metric; 363 } 364 365 // When debugging need to loop over everything. Unneeded in production 366 if(!debug && foundSymmetry) { 367 break; 368 } 369 370 } 371 if(foundSymmetry) { 372 return bestSymmetry; 373 } else { 374 return 1; 375 } 376 } 377 378 379 /** 380 * Guesses the order of symmetry in an alignment 381 * 382 * <p>Uses {@link #getSymmetryOrder(Map alignment, Map identity, int, float)} 383 * to determine the the symmetry order. For the identity alignment, sorts 384 * the aligned residues of each protein sequentially, then defines the ith 385 * residues of each protein to be equivalent. 386 * 387 * <p>Note that the selection of the identity alignment here is <i>very</i> 388 * naive, and only works for proteins with very good coverage. Wherever 389 * possible, it is better to construct an identity function explicitly 390 * from a sequence alignment (or use an {@link IdentityMap} for internally 391 * symmetric proteins) and use {@link #getSymmetryOrder(Map, Map, int, float)}. 392 */ 393 public static int getSymmetryOrder(AFPChain afpChain, int maxSymmetry, float minimumMetricChange) throws StructureException { 394 // alignment comes from the afpChain alignment 395 Map<Integer,Integer> alignment = AlignmentTools.alignmentAsMap(afpChain); 396 397 // Now construct identity to map aligned residues in sequential order 398 Map<Integer, Integer> identity = guessSequentialAlignment(alignment, true); 399 400 401 return AlignmentTools.getSymmetryOrder(alignment, 402 identity, 403 maxSymmetry, minimumMetricChange); 404 } 405 406 /** 407 * Takes a potentially non-sequential alignment and guesses a sequential 408 * version of it. Residues from each structure are sorted sequentially and 409 * then compared directly. 410 * 411 * <p>The results of this method are consistent with what one might expect 412 * from an identity function, and are therefore useful with 413 * {@link #getSymmetryOrder(Map, Map identity, int, float)}. 414 * <ul> 415 * <li>Perfect self-alignments will have the same pre-image and image, 416 * so will map X->X</li> 417 * <li>Gaps and alignment errors will cause errors in the resulting map, 418 * but only locally. Errors do not propagate through the whole 419 * alignment.</li> 420 * </ul> 421 * 422 * <h4>Example:</h4> 423 * A non sequential alignment, represented schematically as 424 * <pre> 425 * 12456789 426 * 78912345</pre> 427 * would result in a map 428 * <pre> 429 * 12456789 430 * 12345789</pre> 431 * @param alignment The non-sequential input alignment 432 * @param inverseAlignment If false, map from structure1 to structure2. If 433 * true, generate the inverse of that map. 434 * @return A mapping from sequential residues of one protein to those of the other 435 * @throws IllegalArgumentException if the input alignment is not one-to-one. 436 */ 437 public static Map<Integer, Integer> guessSequentialAlignment( 438 Map<Integer,Integer> alignment, boolean inverseAlignment) { 439 Map<Integer,Integer> identity = new HashMap<Integer,Integer>(); 440 441 SortedSet<Integer> aligned1 = new TreeSet<Integer>(); 442 SortedSet<Integer> aligned2 = new TreeSet<Integer>(); 443 444 for(Entry<Integer,Integer> pair : alignment.entrySet()) { 445 aligned1.add(pair.getKey()); 446 if( !aligned2.add(pair.getValue()) ) 447 throw new IllegalArgumentException("Alignment is not one-to-one for residue "+pair.getValue()+" of the second structure."); 448 } 449 450 Iterator<Integer> it1 = aligned1.iterator(); 451 Iterator<Integer> it2 = aligned2.iterator(); 452 while(it1.hasNext()) { 453 if(inverseAlignment) { // 2->1 454 identity.put(it2.next(),it1.next()); 455 } else { // 1->2 456 identity.put(it1.next(),it2.next()); 457 } 458 } 459 return identity; 460 } 461 462 /** 463 * Retrieves the optimum alignment from an AFPChain and returns it as a 464 * java collection. The result is indexed in the same way as 465 * {@link AFPChain#getOptAln()}, but has the correct size(). 466 * <pre> 467 * List<List<List<Integer>>> aln = getOptAlnAsList(AFPChain afpChain); 468 * aln.get(blockNum).get(structureNum={0,1}).get(pos)</pre> 469 * 470 * @param afpChain 471 * @return 472 */ 473 public static List<List<List<Integer>>> getOptAlnAsList(AFPChain afpChain) { 474 int[][][] optAln = afpChain.getOptAln(); 475 int[] optLen = afpChain.getOptLen(); 476 List<List<List<Integer>>> blocks = new ArrayList<List<List<Integer>>>(afpChain.getBlockNum()); 477 for(int blockNum=0;blockNum<afpChain.getBlockNum();blockNum++) { 478 //TODO could improve speed an memory by wrapping the arrays with 479 // an unmodifiable list, similar to Arrays.asList(...) but with the 480 // correct size parameter. 481 List<Integer> align1 = new ArrayList<Integer>(optLen[blockNum]); 482 List<Integer> align2 = new ArrayList<Integer>(optLen[blockNum]); 483 for(int pos=0;pos<optLen[blockNum];pos++) { 484 align1.add(optAln[blockNum][0][pos]); 485 align2.add(optAln[blockNum][1][pos]); 486 } 487 List<List<Integer>> block = new ArrayList<List<Integer>>(2); 488 block.add(align1); 489 block.add(align2); 490 blocks.add(block); 491 } 492 493 return blocks; 494 } 495 496 497 498 /** 499 * A Map<K,V> can be viewed as a function from K to V. This class represents 500 * the identity function. Getting a value results in the value itself. 501 * 502 * <p>The class is a bit inconsistent when representing its contents. On 503 * the one hand, containsKey(key) is true for all objects. However, 504 * attempting to iterate through the values returns an empty set. 505 * 506 * @author Spencer Bliven 507 * 508 * @param <K> 509 */ 510 public static class IdentityMap<K> extends AbstractMap<K,K> { 511 public IdentityMap() {} 512 513 /** 514 * @param key 515 * @return the key 516 * @throws ClassCastException if key is not of type K 517 */ 518 @SuppressWarnings("unchecked") 519 @Override 520 public K get(Object key) { 521 return (K)key; 522 } 523 524 /** 525 * Always returns the empty set 526 */ 527 @Override 528 public Set<java.util.Map.Entry<K, K>> entrySet() { 529 return Collections.emptySet(); 530 } 531 532 @Override 533 public boolean containsKey(Object key) { 534 return true; 535 } 536 } 537 538 /** 539 * Fundamentally, an alignment is just a list of aligned residues in each 540 * protein. This method converts two lists of ResidueNumbers into an 541 * AFPChain. 542 * 543 * <p>Parameters are filled with defaults (often null) or sometimes 544 * calculated. 545 * 546 * <p>For a way to modify the alignment of an existing AFPChain, see 547 * {@link AlignmentTools#replaceOptAln(AFPChain, Atom[], Atom[], Map)} 548 * @param ca1 CA atoms of the first protein 549 * @param ca2 CA atoms of the second protein 550 * @param aligned1 A list of aligned residues from the first protein 551 * @param aligned2 A list of aligned residues from the second protein. 552 * Must be the same length as aligned1. 553 * @return An AFPChain representing the alignment. Many properties may be 554 * null or another default. 555 * @throws StructureException if an error occured during superposition 556 * @throws IllegalArgumentException if aligned1 and aligned2 have different 557 * lengths 558 * @see AlignmentTools#replaceOptAln(AFPChain, Atom[], Atom[], Map) 559 */ 560 public static AFPChain createAFPChain(Atom[] ca1, Atom[] ca2, 561 ResidueNumber[] aligned1, ResidueNumber[] aligned2 ) throws StructureException { 562 //input validation 563 int alnLen = aligned1.length; 564 if(alnLen != aligned2.length) { 565 throw new IllegalArgumentException("Alignment lengths are not equal"); 566 } 567 568 AFPChain a = new AFPChain(AFPChain.UNKNOWN_ALGORITHM); 569 try { 570 a.setName1(ca1[0].getGroup().getChain().getStructure().getName()); 571 if(ca2[0].getGroup().getChain().getStructure() != null) { 572 // common case for cloned ca2 573 a.setName2(ca2[0].getGroup().getChain().getStructure().getName()); 574 } 575 } catch(Exception e) { 576 // One of the structures wasn't fully created. Ignore 577 } 578 a.setBlockNum(1); 579 a.setCa1Length(ca1.length); 580 a.setCa2Length(ca2.length); 581 582 a.setOptLength(alnLen); 583 a.setOptLen(new int[] {alnLen}); 584 585 586 Matrix[] ms = new Matrix[a.getBlockNum()]; 587 a.setBlockRotationMatrix(ms); 588 Atom[] blockShiftVector = new Atom[a.getBlockNum()]; 589 a.setBlockShiftVector(blockShiftVector); 590 591 String[][][] pdbAln = new String[1][2][alnLen]; 592 for(int i=0;i<alnLen;i++) { 593 pdbAln[0][0][i] = aligned1[i].getChainId()+":"+aligned1[i]; 594 pdbAln[0][1][i] = aligned2[i].getChainId()+":"+aligned2[i]; 595 } 596 597 a.setPdbAln(pdbAln); 598 599 // convert pdbAln to optAln, and fill in some other basic parameters 600 AFPChainXMLParser.rebuildAFPChain(a, ca1, ca2); 601 602 return a; 603 604 // Currently a single block. Split into several blocks by sequence if needed 605 // return AlignmentTools.splitBlocksByTopology(a,ca1,ca2); 606 } 607 608 /** 609 * 610 * @param a 611 * @param ca1 612 * @param ca2 613 * @return 614 * @throws StructureException if an error occurred during superposition 615 */ 616 public static AFPChain splitBlocksByTopology(AFPChain a, Atom[] ca1, Atom[] ca2) throws StructureException { 617 int[][][] optAln = a.getOptAln(); 618 int blockNum = a.getBlockNum(); 619 int[] optLen = a.getOptLen(); 620 621 // Determine block lengths 622 // Split blocks if residue indices don't increase monotonically 623 List<Integer> newBlkLen = new ArrayList<Integer>(); 624 boolean blockChanged = false; 625 for(int blk=0;blk<blockNum;blk++) { 626 int currLen=1; 627 for(int pos=1;pos<optLen[blk];pos++) { 628 if( optAln[blk][0][pos] <= optAln[blk][0][pos-1] 629 || optAln[blk][1][pos] <= optAln[blk][1][pos-1] ) 630 { 631 //start a new block 632 newBlkLen.add(currLen); 633 currLen = 0; 634 blockChanged = true; 635 } 636 currLen++; 637 } 638 if(optLen[blk] < 2 ) { 639 newBlkLen.add(optLen[blk]); 640 } else { 641 newBlkLen.add(currLen); 642 } 643 } 644 645 // Check if anything needs to be split 646 if( !blockChanged ) { 647 return a; 648 } 649 650 // Split blocks 651 List<int[][]> blocks = new ArrayList<int[][]>( newBlkLen.size() ); 652 653 int oldBlk = 0; 654 int pos = 0; 655 for(int blkLen : newBlkLen) { 656 if( blkLen == optLen[oldBlk] ) { 657 assert(pos == 0); //should be the whole block 658 // Use the old block 659 blocks.add(optAln[oldBlk]); 660 } else { 661 int[][] newBlock = new int[2][blkLen]; 662 assert( pos+blkLen <= optLen[oldBlk] ); // don't overrun block 663 for(int i=0; i<blkLen;i++) { 664 newBlock[0][i] = optAln[oldBlk][0][pos + i]; 665 newBlock[1][i] = optAln[oldBlk][1][pos + i]; 666 } 667 pos += blkLen; 668 blocks.add(newBlock); 669 670 if( pos == optLen[oldBlk] ) { 671 // Finished this oldBlk, start the next 672 oldBlk++; 673 pos = 0; 674 } 675 } 676 } 677 678 // Store new blocks 679 int[][][] newOptAln = blocks.toArray(new int[0][][]); 680 int[] newBlockLens = new int[newBlkLen.size()]; 681 for(int i=0;i<newBlkLen.size();i++) { 682 newBlockLens[i] = newBlkLen.get(i); 683 } 684 685 return replaceOptAln(a, ca1, ca2, blocks.size(), newBlockLens, newOptAln); 686 } 687 688 /** 689 * It replaces an optimal alignment of an AFPChain and calculates all the new alignment scores and variables. 690 */ 691 public static AFPChain replaceOptAln(int[][][] newAlgn, AFPChain afpChain, Atom[] ca1, Atom[] ca2) throws StructureException { 692 693 //The order is the number of groups in the newAlgn 694 int order = newAlgn.length; 695 696 //Calculate the alignment length from all the subunits lengths 697 int[] optLens = new int[order]; 698 for(int s=0;s<order;s++) { 699 optLens[s] = newAlgn[s][0].length; 700 } 701 int optLength = 0; 702 for(int s=0;s<order;s++) { 703 optLength += optLens[s]; 704 } 705 706 //Create a copy of the original AFPChain and set everything needed for the structure update 707 AFPChain copyAFP = (AFPChain) afpChain.clone(); 708 709 //Set the new parameters of the optimal alignment 710 copyAFP.setOptLength(optLength); 711 copyAFP.setOptLen(optLens); 712 copyAFP.setOptAln(newAlgn); 713 714 //Set the block information of the new alignment 715 copyAFP.setBlockNum(order); 716 copyAFP.setBlockSize(optLens); 717 copyAFP.setBlockResList(newAlgn); 718 copyAFP.setBlockResSize(optLens); 719 copyAFP.setBlockGap(calculateBlockGap(newAlgn)); 720 721 //Recalculate properties: superposition, tm-score, etc 722 Atom[] ca2clone = StructureTools.cloneAtomArray(ca2); // don't modify ca2 positions 723 AlignmentTools.updateSuperposition(copyAFP, ca1, ca2clone); 724 725 //It re-does the sequence alignment strings from the OptAlgn information only 726 copyAFP.setAlnsymb(null); 727 AFPAlignmentDisplay.getAlign(copyAFP, ca1, ca2clone); 728 729 return copyAFP; 730 } 731 732 /** 733 * Takes an AFPChain and replaces the optimal alignment based on an alignment map 734 * 735 * <p>Parameters are filled with defaults (often null) or sometimes 736 * calculated. 737 * 738 * <p>For a way to create a new AFPChain, see 739 * {@link AlignmentTools#createAFPChain(Atom[], Atom[], ResidueNumber[], ResidueNumber[])} 740 * 741 * @param afpChain The alignment to be modified 742 * @param alignment The new alignment, as a Map 743 * @throws StructureException if an error occurred during superposition 744 * @see AlignmentTools#createAFPChain(Atom[], Atom[], ResidueNumber[], ResidueNumber[]) 745 */ 746 public static AFPChain replaceOptAln(AFPChain afpChain, Atom[] ca1, Atom[] ca2, 747 Map<Integer, Integer> alignment) throws StructureException { 748 749 // Determine block lengths 750 // Sort ca1 indices, then start a new block whenever ca2 indices aren't 751 // increasing monotonically. 752 Integer[] res1 = alignment.keySet().toArray(new Integer[0]); 753 Arrays.sort(res1); 754 List<Integer> blockLens = new ArrayList<Integer>(2); 755 int optLength = 0; 756 Integer lastRes = alignment.get(res1[0]); 757 int blkLen = lastRes==null?0:1; 758 for(int i=1;i<res1.length;i++) { 759 Integer currRes = alignment.get(res1[i]); //res2 index 760 assert(currRes != null);// could be converted to if statement if assertion doesn't hold; just modify below as well. 761 if(lastRes<currRes) { 762 blkLen++; 763 } else { 764 // CP! 765 blockLens.add(blkLen); 766 optLength+=blkLen; 767 blkLen = 1; 768 } 769 lastRes = currRes; 770 } 771 blockLens.add(blkLen); 772 optLength+=blkLen; 773 774 // Create array structure for alignment 775 int[][][] optAln = new int[blockLens.size()][][]; 776 int pos1 = 0; //index into res1 777 for(int blk=0;blk<blockLens.size();blk++) { 778 optAln[blk] = new int[2][]; 779 blkLen = blockLens.get(blk); 780 optAln[blk][0] = new int[blkLen]; 781 optAln[blk][1] = new int[blkLen]; 782 int pos = 0; //index into optAln 783 while(pos<blkLen) { 784 optAln[blk][0][pos]=res1[pos1]; 785 Integer currRes = alignment.get(res1[pos1]); 786 optAln[blk][1][pos]=currRes; 787 pos++; 788 pos1++; 789 } 790 } 791 assert(pos1 == optLength); 792 793 // Create length array 794 int[] optLens = new int[blockLens.size()]; 795 for(int i=0;i<blockLens.size();i++) { 796 optLens[i] = blockLens.get(i); 797 } 798 799 return replaceOptAln(afpChain, ca1, ca2, blockLens.size(), optLens, optAln); 800 } 801 802 /** 803 * @param afpChain Input afpchain. UNMODIFIED 804 * @param ca1 805 * @param ca2 806 * @param optLens 807 * @param optAln 808 * @return A NEW AfpChain based off the input but with the optAln modified 809 * @throws StructureException if an error occured during superposition 810 */ 811 public static AFPChain replaceOptAln(AFPChain afpChain, Atom[] ca1, Atom[] ca2, 812 int blockNum, int[] optLens, int[][][] optAln) throws StructureException { 813 int optLength = 0; 814 for( int blk=0;blk<blockNum;blk++) { 815 optLength += optLens[blk]; 816 } 817 818 //set everything 819 AFPChain refinedAFP = (AFPChain) afpChain.clone(); 820 refinedAFP.setOptLength(optLength); 821 refinedAFP.setBlockSize(optLens); 822 refinedAFP.setOptLen(optLens); 823 refinedAFP.setOptAln(optAln); 824 refinedAFP.setBlockNum(blockNum); 825 826 //TODO recalculate properties: superposition, tm-score, etc 827 Atom[] ca2clone = StructureTools.cloneAtomArray(ca2); // don't modify ca2 positions 828 AlignmentTools.updateSuperposition(refinedAFP, ca1, ca2clone); 829 830 AFPAlignmentDisplay.getAlign(refinedAFP, ca1, ca2clone); 831 return refinedAFP; 832 } 833 834 835 /** 836 * After the alignment changes (optAln, optLen, blockNum, at a minimum), 837 * many other properties which depend on the superposition will be invalid. 838 * 839 * This method re-runs a rigid superposition over the whole alignment 840 * and repopulates the required properties, including RMSD (TotalRMSD) and 841 * TM-Score. 842 * @param afpChain 843 * @param ca1 844 * @param ca2 Second set of ca atoms. Will be modified based on the superposition 845 * @throws StructureException 846 * @see {@link CECalculator#calc_rmsd(Atom[], Atom[], int, boolean)} 847 * contains much of the same code, but stores results in a CECalculator 848 * instance rather than an AFPChain 849 */ 850 public static void updateSuperposition(AFPChain afpChain, Atom[] ca1, Atom[] ca2) throws StructureException { 851 852 //Update ca information, because the atom array might also be changed 853 afpChain.setCa1Length(ca1.length); 854 afpChain.setCa2Length(ca2.length); 855 856 //We need this to get the correct superposition 857 int[] focusRes1 = afpChain.getFocusRes1(); 858 int[] focusRes2 = afpChain.getFocusRes2(); 859 if (focusRes1 == null) { 860 focusRes1 = new int[afpChain.getCa1Length()]; 861 afpChain.setFocusRes1(focusRes1); 862 } 863 if (focusRes2 == null) { 864 focusRes2 = new int[afpChain.getCa2Length()]; 865 afpChain.setFocusRes2(focusRes2); 866 } 867 868 if (afpChain.getNrEQR() == 0) return; 869 870 // create new arrays for the subset of atoms in the alignment. 871 Atom[] ca1aligned = new Atom[afpChain.getOptLength()]; 872 Atom[] ca2aligned = new Atom[afpChain.getOptLength()]; 873 int pos=0; 874 int[] blockLens = afpChain.getOptLen(); 875 int[][][] optAln = afpChain.getOptAln(); 876 assert(afpChain.getBlockNum() <= optAln.length); 877 878 for (int block=0; block < afpChain.getBlockNum(); block++) { 879 for(int i=0;i<blockLens[block];i++) { 880 int pos1 = optAln[block][0][i]; 881 int pos2 = optAln[block][1][i]; 882 Atom a1 = ca1[pos1]; 883 Atom a2 = (Atom) ca2[pos2].clone(); 884 ca1aligned[pos] = a1; 885 ca2aligned[pos] = a2; 886 pos++; 887 } 888 } 889 890 // this can happen when we load an old XML serialization which did not support modern ChemComp representation of modified residues. 891 if (pos != afpChain.getOptLength()){ 892 logger.warn("AFPChainScorer getTMScore: Problems reconstructing alignment! nr of loaded atoms is " + pos + " but should be " + afpChain.getOptLength()); 893 // we need to resize the array, because we allocated too many atoms earlier on. 894 ca1aligned = (Atom[]) resizeArray(ca1aligned, pos); 895 ca2aligned = (Atom[]) resizeArray(ca2aligned, pos); 896 } 897 898 //Superimpose the two structures in correspondance to the new alignment 899 SVDSuperimposer svd = new SVDSuperimposer(ca1aligned, ca2aligned); 900 Matrix matrix = svd.getRotation(); 901 Atom shift = svd.getTranslation(); 902 Matrix[] blockMxs = new Matrix[afpChain.getBlockNum()]; 903 Arrays.fill(blockMxs, matrix); 904 afpChain.setBlockRotationMatrix(blockMxs); 905 Atom[] blockShifts = new Atom[afpChain.getBlockNum()]; 906 Arrays.fill(blockShifts, shift); 907 afpChain.setBlockShiftVector(blockShifts); 908 909 for (Atom a : ca2aligned) { 910 Calc.rotate(a, matrix); 911 Calc.shift(a, shift); 912 } 913 914 //Calculate the RMSD and TM score for the new alignment 915 double rmsd = SVDSuperimposer.getRMS(ca1aligned, ca2aligned); 916 double tmScore = SVDSuperimposer.getTMScore(ca1aligned, ca2aligned, ca1.length, ca2.length); 917 afpChain.setTotalRmsdOpt(rmsd); 918 afpChain.setTMScore(tmScore); 919 920 //Calculate the RMSD and TM score for every block of the new alignment 921 double[] blockRMSD = new double[afpChain.getBlockNum()]; 922 double[] blockScore = new double[afpChain.getBlockNum()]; 923 for (int k=0; k<afpChain.getBlockNum(); k++){ 924 //Create the atom arrays corresponding to the aligned residues in the block 925 Atom[] ca1block = new Atom[afpChain.getOptLen()[k]]; 926 Atom[] ca2block = new Atom[afpChain.getOptLen()[k]]; 927 int position=0; 928 for(int i=0;i<blockLens[k];i++) { 929 int pos1 = optAln[k][0][i]; 930 int pos2 = optAln[k][1][i]; 931 Atom a1 = ca1[pos1]; 932 Atom a2 = (Atom) ca2[pos2].clone(); 933 ca1block[position] = a1; 934 ca2block[position] = a2; 935 position++; 936 } 937 if (position != afpChain.getOptLen()[k]){ 938 logger.warn("AFPChainScorer getTMScore: Problems reconstructing block alignment! nr of loaded atoms is " + pos + " but should be " + afpChain.getOptLen()[k]); 939 // we need to resize the array, because we allocated too many atoms earlier on. 940 ca1block = (Atom[]) resizeArray(ca1block, position); 941 ca2block = (Atom[]) resizeArray(ca2block, position); 942 } 943 //Superimpose the two block structures 944 SVDSuperimposer svdb = new SVDSuperimposer(ca1block, ca2block); 945 Matrix matrixb = svdb.getRotation(); 946 Atom shiftb = svdb.getTranslation(); 947 for (Atom a : ca2block) { 948 Calc.rotate(a, matrixb); 949 Calc.shift(a, shiftb); 950 } 951 //Calculate the RMSD and TM score for the block 952 double rmsdb = SVDSuperimposer.getRMS(ca1block, ca2block); 953 double tmScoreb = SVDSuperimposer.getTMScore(ca1block, ca2block, ca1.length, ca2.length); 954 blockRMSD[k] = rmsdb; 955 blockScore[k] = tmScoreb; 956 } 957 afpChain.setOptRmsd(blockRMSD); 958 afpChain.setBlockRmsd(blockRMSD); 959 afpChain.setBlockScore(blockScore); 960 } 961 962 /** 963 * Reallocates an array with a new size, and copies the contents 964 * of the old array to the new array. 965 * @param oldArray the old array, to be reallocated. 966 * @param newSize the new array size. 967 * @return A new array with the same contents. 968 */ 969 public static Object resizeArray (Object oldArray, int newSize) { 970 int oldSize = java.lang.reflect.Array.getLength(oldArray); 971 @SuppressWarnings("rawtypes") 972 Class elementType = oldArray.getClass().getComponentType(); 973 Object newArray = java.lang.reflect.Array.newInstance( 974 elementType,newSize); 975 int preserveLength = Math.min(oldSize,newSize); 976 if (preserveLength > 0) 977 System.arraycopy (oldArray,0,newArray,0,preserveLength); 978 return newArray; 979 } 980 981 /** 982 * Print an alignment map in a concise representation. Edges are given 983 * as two numbers separated by '>'. They are chained together where possible, 984 * or separated by spaces where disjoint or branched. 985 * 986 * <p>Note that more concise representations may be possible.</p> 987 * 988 * Examples: 989 * <li>1>2>3>1</li> 990 * <li>1>2>3>2 4>3</li> 991 * 992 * @param alignment The input function, as a map (see {@link AlignmentTools#alignmentAsMap(AFPChain)}) 993 * @param identity An identity-like function providing the isomorphism between 994 * the codomain of alignment (of type <T>) and the domain (type <S>). 995 * @return 996 */ 997 public static <S,T> String toConciseAlignmentString(Map<S,T> alignment, Map<T,S> identity) { 998 // Clone input to prevent changes 999 Map<S,T> alig = new HashMap<S,T>(alignment); 1000 1001 // Generate inverse alignment 1002 Map<S,List<S>> inverse = new HashMap<S,List<S>>(); 1003 for(Entry<S,T> e: alig.entrySet()) { 1004 S val = identity.get(e.getValue()); 1005 if( inverse.containsKey(val) ) { 1006 List<S> l = inverse.get(val); 1007 l.add(e.getKey()); 1008 } else { 1009 List<S> l = new ArrayList<S>(); 1010 l.add(e.getKey()); 1011 inverse.put(val,l); 1012 } 1013 } 1014 1015 StringBuilder str = new StringBuilder(); 1016 1017 while(!alig.isEmpty()){ 1018 // Pick an edge and work upstream to a root or cycle 1019 S seedNode = alig.keySet().iterator().next(); 1020 S node = seedNode; 1021 if( inverse.containsKey(seedNode)) { 1022 node = inverse.get(seedNode).iterator().next(); 1023 while( node != seedNode && inverse.containsKey(node)) { 1024 node = inverse.get(node).iterator().next(); 1025 } 1026 } 1027 1028 // Now work downstream, deleting edges as we go 1029 seedNode = node; 1030 str.append(node); 1031 1032 while(alig.containsKey(node)) { 1033 S lastNode = node; 1034 node = identity.get( alig.get(lastNode) ); 1035 1036 // Output 1037 str.append('>'); 1038 str.append(node); 1039 1040 // Remove edge 1041 alig.remove(lastNode); 1042 List<S> inv = inverse.get(node); 1043 if(inv.size() > 1) { 1044 inv.remove(node); 1045 } else { 1046 inverse.remove(node); 1047 } 1048 } 1049 if(!alig.isEmpty()) { 1050 str.append(' '); 1051 } 1052 } 1053 1054 return str.toString(); 1055 } 1056 1057 /** 1058 * @see #toConciseAlignmentString(Map, Map) 1059 */ 1060 public static <T> String toConciseAlignmentString(Map<T, T> alignment) { 1061 return toConciseAlignmentString(alignment, new IdentityMap<T>()); 1062 } 1063 1064 /** 1065 * @see #toConciseAlignmentString(Map, Map) 1066 */ 1067 public static Map<Integer, Integer> fromConciseAlignmentString(String string) { 1068 Map<Integer, Integer> map = new HashMap<Integer, Integer>(); 1069 boolean matches = true; 1070 while (matches) { 1071 Pattern pattern = Pattern.compile("(\\d+)>(\\d+)"); 1072 Matcher matcher = pattern.matcher(string); 1073 matches = matcher.find(); 1074 if (matches) { 1075 Integer from = Integer.parseInt(matcher.group(1)); 1076 Integer to = Integer.parseInt(matcher.group(2)); 1077 map.put(from, to); 1078 string = string.substring(matcher.end(1) + 1); 1079 } 1080 } 1081 return map; 1082 } 1083 1084 /** 1085 * Method that calculates the number of gaps in each subunit block of an optimal AFP alignment. 1086 * 1087 * INPUT: an optimal alignment in the format int[][][]. 1088 * OUTPUT: an int[] array of <order> length containing the gaps in each block as int[block]. 1089 */ 1090 public static int[] calculateBlockGap(int[][][] optAln){ 1091 1092 //Initialize the array to be returned 1093 int [] blockGap = new int[optAln.length]; 1094 1095 //Loop for every block and look in both chains for non-contiguous residues. 1096 for (int i=0; i<optAln.length; i++){ 1097 int gaps = 0; //the number of gaps in that block 1098 int last1 = 0; //the last residue position in chain 1 1099 int last2 = 0; //the last residue position in chain 2 1100 //Loop for every position in the block 1101 for (int j=0; j<optAln[i][0].length; j++){ 1102 //If the first position is evaluated initialize the last positions 1103 if (j==0){ 1104 last1 = optAln[i][0][j]; 1105 last2 = optAln[i][1][j]; 1106 } 1107 else{ 1108 //If one of the positions or both are not contiguous increment the number of gaps 1109 if (optAln[i][0][j] > last1+1 || optAln[i][1][j] > last2+1){ 1110 gaps++; 1111 last1 = optAln[i][0][j]; 1112 last2 = optAln[i][1][j]; 1113 } 1114 //Otherwise just set the last position to the current one 1115 else{ 1116 last1 = optAln[i][0][j]; 1117 last2 = optAln[i][1][j]; 1118 } 1119 } 1120 } 1121 blockGap[i] = gaps; 1122 } 1123 return blockGap; 1124 } 1125 1126 /** 1127 * Creates a simple interaction format (SIF) file for an alignment. 1128 * 1129 * The SIF file can be read by network software (eg Cytoscape) to analyze 1130 * alignments as graphs. 1131 * 1132 * This function creates a graph with residues as nodes and two types of edges: 1133 * 1. backbone edges, which connect adjacent residues in the aligned protein 1134 * 2. alignment edges, which connect aligned residues 1135 * 1136 * @param out Stream to write to 1137 * @param afpChain alignment to write 1138 * @param ca1 First protein, used to generate node names 1139 * @param ca2 Second protein, used to generate node names 1140 * @param backboneInteraction Two-letter string used to identify backbone edges 1141 * @param alignmentInteraction Two-letter string used to identify alignment edges 1142 * @throws IOException 1143 */ 1144 public static void alignmentToSIF(Writer out,AFPChain afpChain, 1145 Atom[] ca1,Atom[] ca2, String backboneInteraction, 1146 String alignmentInteraction) throws IOException { 1147 1148 //out.write("Res1\tInteraction\tRes2\n"); 1149 String name1 = afpChain.getName1(); 1150 String name2 = afpChain.getName2(); 1151 if(name1==null) name1=""; else name1+=":"; 1152 if(name2==null) name2=""; else name2+=":"; 1153 1154 // Print alignment edges 1155 int nblocks = afpChain.getBlockNum(); 1156 int[] blockLen = afpChain.getOptLen(); 1157 int[][][] optAlign = afpChain.getOptAln(); 1158 for(int b=0;b<nblocks;b++) { 1159 for(int r=0;r<blockLen[b];r++) { 1160 int res1 = optAlign[b][0][r]; 1161 int res2 = optAlign[b][1][r]; 1162 1163 ResidueNumber rn1 = ca1[res1].getGroup().getResidueNumber(); 1164 ResidueNumber rn2 = ca2[res2].getGroup().getResidueNumber(); 1165 1166 String node1 = name1+rn1.getChainId()+rn1.toString(); 1167 String node2 = name2+rn2.getChainId()+rn2.toString(); 1168 1169 out.write(String.format("%s\t%s\t%s\n",node1, alignmentInteraction, node2)); 1170 } 1171 } 1172 1173 // Print first backbone edges 1174 ResidueNumber rn = ca1[0].getGroup().getResidueNumber(); 1175 String last = name1+rn.getChainId()+rn.toString(); 1176 for(int i=1;i<ca1.length;i++) { 1177 rn = ca1[i].getGroup().getResidueNumber(); 1178 String curr = name1+rn.getChainId()+rn.toString(); 1179 out.write(String.format("%s\t%s\t%s\n",last, backboneInteraction, curr)); 1180 last = curr; 1181 } 1182 1183 // Print second backbone edges, if the proteins differ 1184 // Do some quick checks for whether the proteins differ 1185 // (Not perfect, but should detect major differences and CPs.) 1186 if(!name1.equals(name2) || 1187 ca1.length!=ca2.length || 1188 (ca1.length>0 && ca1[0].getGroup()!=null && ca2[0].getGroup()!=null && 1189 !ca1[0].getGroup().getResidueNumber().equals(ca2[0].getGroup().getResidueNumber()) ) ) { 1190 rn = ca2[0].getGroup().getResidueNumber(); 1191 last = name2+rn.getChainId()+rn.toString(); 1192 for(int i=1;i<ca2.length;i++) { 1193 rn = ca2[i].getGroup().getResidueNumber(); 1194 String curr = name2+rn.getChainId()+rn.toString(); 1195 out.write(String.format("%s\t%s\t%s\n",last, backboneInteraction, curr)); 1196 last = curr; 1197 } 1198 } 1199 } 1200 1201}