001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.structure.align.util;
022
023import org.biojava.nbio.structure.*;
024import org.biojava.nbio.structure.align.ce.CECalculator;
025import org.biojava.nbio.structure.align.model.AFPChain;
026import org.biojava.nbio.structure.align.xml.AFPChainXMLParser;
027import org.biojava.nbio.structure.jama.Matrix;
028import org.slf4j.Logger;
029import org.slf4j.LoggerFactory;
030
031import java.io.IOException;
032import java.io.Writer;
033import java.util.*;
034import java.util.Map.Entry;
035import java.util.regex.Matcher;
036import java.util.regex.Pattern;
037
038/**
039 * Methods for analyzing and manipulating AFPChains and for
040 * other pairwise alignment utilities. <p>
041 * Current methods: replace optimal alignment, create new AFPChain,
042 * format conversion, update superposition, etc.
043 *
044 * @author Spencer Bliven
045 * @author Aleix Lafita
046 *
047 */
048public class AlignmentTools {
049
050        private static final Logger logger = LoggerFactory.getLogger(AlignmentTools.class);
051
052
053        public static boolean debug = false;
054
055        /**
056         * Checks that the alignment given by afpChain is sequential. This means
057         * that the residue indices of both proteins increase monotonically as
058         * a function of the alignment position (ie both proteins are sorted).
059         *
060         * This will return false for circularly permuted alignments or other
061         * non-topological alignments. It will also return false for cases where
062         * the alignment itself is sequential but it is not stored in the afpChain
063         * in a sorted manner.
064         *
065         * Since algorithms which create non-sequential alignments split the
066         * alignment into multiple blocks, some computational time can be saved
067         * by only checking block boundaries for sequentiality. Setting
068         * <tt>checkWithinBlocks</tt> to <tt>true</tt> makes this function slower,
069         * but detects AFPChains with non-sequential blocks.
070         *
071         * Note that this method should give the same results as
072         * {@link AFPChain#isSequentialAlignment()}. However, the AFPChain version
073         * relies on the StructureAlignment algorithm correctly setting this
074         * parameter, which is sadly not always the case.
075         *
076         * @param afpChain An alignment
077         * @param checkWithinBlocks Indicates whether individual blocks should be
078         *      checked for sequentiality
079         * @return True if the alignment is sequential.
080         */
081        public static boolean isSequentialAlignment(AFPChain afpChain, boolean checkWithinBlocks) {
082                int[][][] optAln = afpChain.getOptAln();
083                int[] alnLen = afpChain.getOptLen();
084                int blocks = afpChain.getBlockNum();
085
086                if(blocks < 1) return true; //trivial case
087                if ( alnLen[0] < 1) return true;
088
089                // Check that blocks are sequential
090                if(checkWithinBlocks) {
091                        for(int block = 0; block<blocks; block++) {
092                                if(alnLen[block] < 1 ) continue; //skip empty blocks
093
094                                int prevRes1 = optAln[block][0][0];
095                                int prevRes2 = optAln[block][1][0];
096
097                                for(int pos = 1; pos<alnLen[block]; pos++) {
098                                        int currRes1 = optAln[block][0][pos];
099                                        int currRes2 = optAln[block][1][pos];
100
101                                        if(currRes1 < prevRes1) {
102                                                return false;
103                                        }
104                                        if(currRes2 < prevRes2) {
105                                                return false;
106                                        }
107
108                                        prevRes1 = currRes1;
109                                        prevRes2 = currRes2;
110                                }
111                        }
112                }
113
114                // Check that blocks are sequential
115                int prevRes1 = optAln[0][0][alnLen[0]-1];
116                int prevRes2 = optAln[0][1][alnLen[0]-1];
117
118                for(int block = 1; block<blocks;block++) {
119                        if(alnLen[block] < 1 ) continue; //skip empty blocks
120
121                        if(optAln[block][0][0]<prevRes1) {
122                                return false;
123                        }
124                        if(optAln[block][1][0]<prevRes2) {
125                                return false;
126                        }
127
128                        prevRes1 = optAln[block][0][alnLen[block]-1];
129                        prevRes2 = optAln[block][1][alnLen[block]-1];
130                }
131
132                return true;
133        }
134
135        /**
136         * Creates a Map specifying the alignment as a mapping between residue indices
137         * of protein 1 and residue indices of protein 2.
138         *
139         * <p>For example,<pre>
140         * 1234
141         * 5678</pre>
142         * becomes<pre>
143         * 1->5
144         * 2->6
145         * 3->7
146         * 4->8</pre>
147         *
148         * @param afpChain An alignment
149         * @return A mapping from aligned residues of protein 1 to their partners in protein 2.
150         * @throws StructureException If afpChain is not one-to-one
151         */
152        public static Map<Integer, Integer> alignmentAsMap(AFPChain afpChain) throws StructureException {
153                Map<Integer,Integer> map = new HashMap<Integer,Integer>();
154
155                if( afpChain.getAlnLength() < 1 ) {
156                        return map;
157                }
158                int[][][] optAln = afpChain.getOptAln();
159                int[] optLen = afpChain.getOptLen();
160                for(int block = 0; block < afpChain.getBlockNum(); block++) {
161                        for(int pos = 0; pos < optLen[block]; pos++) {
162                                int res1 = optAln[block][0][pos];
163                                int res2 = optAln[block][1][pos];
164                                if(map.containsKey(res1)) {
165                                        throw new StructureException(String.format("Residue %d aligned to both %d and %d.", res1,map.get(res1),res2));
166                                }
167                                map.put(res1,res2);
168                        }
169                }
170                return map;
171        }
172
173        /**
174         * Applies an alignment k times. Eg if alignmentMap defines function f(x),
175         * this returns a function f^k(x)=f(f(...f(x)...)).
176         *
177         * @param <T>
178         * @param alignmentMap The input function, as a map (see {@link AlignmentTools#alignmentAsMap(AFPChain)})
179         * @param k The number of times to apply the alignment
180         * @return A new alignment. If the input function is not automorphic
181         *  (one-to-one), then some inputs may map to null, indicating that the
182         *  function is undefined for that input.
183         */
184        public static <T> Map<T,T> applyAlignment(Map<T, T> alignmentMap, int k) {
185                return applyAlignment(alignmentMap, new IdentityMap<T>(), k);
186        }
187
188        /**
189         * Applies an alignment k times. Eg if alignmentMap defines function f(x),
190         * this returns a function f^k(x)=f(f(...f(x)...)).
191         *
192         * To allow for functions with different domains and codomains, the identity
193         * function allows converting back in a reasonable way. For instance, if
194         * alignmentMap represented an alignment between two proteins with different
195         * numbering schemes, the identity function could calculate the offset
196         * between residue numbers, eg I(x) = x-offset.
197         *
198         * When an identity function is provided, the returned function calculates
199         * f^k(x) = f(I( f(I( ... f(x) ... )) )).
200         *
201         * @param <S>
202         * @param <T>
203         * @param alignmentMap The input function, as a map (see {@link AlignmentTools#alignmentAsMap(AFPChain)})
204         * @param identity An identity-like function providing the isomorphism between
205         *  the codomain of alignmentMap (of type <T>) and the domain (type <S>).
206         * @param k The number of times to apply the alignment
207         * @return A new alignment. If the input function is not automorphic
208         *  (one-to-one), then some inputs may map to null, indicating that the
209         *  function is undefined for that input.
210         */
211        public static <S,T> Map<S,T> applyAlignment(Map<S, T> alignmentMap, Map<T,S> identity, int k) {
212
213                // This implementation simply applies the map k times.
214                // If k were large, it would be more efficient to do this recursively,
215                // (eg f^4 = (f^2)^2) but k will usually be small.
216
217                if(k<0) throw new IllegalArgumentException("k must be positive");
218                if(k==1) {
219                        return new HashMap<S,T>(alignmentMap);
220                }
221                // Convert to lists to establish a fixed order
222                List<S> preimage = new ArrayList<S>(alignmentMap.keySet()); // currently unmodified
223                List<S> image = new ArrayList<S>(preimage);
224
225                for(int n=1;n<k;n++) {
226                        // apply alignment
227                        for(int i=0;i<image.size();i++) {
228                                S pre = image.get(i);
229                                T intermediate = (pre==null?null: alignmentMap.get(pre));
230                                S post = (intermediate==null?null: identity.get(intermediate));
231                                image.set(i, post);
232                        }
233                }
234
235
236
237                Map<S, T> imageMap = new HashMap<S,T>(alignmentMap.size());
238
239                //TODO handle nulls consistently.
240                // assure that all the residues in the domain are valid keys
241                /*
242                for(int i=0;i<preimage.size();i++) {
243                        S pre = preimage.get(i);
244                        T intermediate = (pre==null?null: alignmentMap.get(pre));
245                        S post = (intermediate==null?null: identity.get(intermediate));
246                        imageMap.put(post, null);
247                }
248                 */
249                // now populate with actual values
250                for(int i=0;i<preimage.size();i++) {
251                        S pre = preimage.get(i);
252
253                        // image is currently f^k-1(x), so take the final step
254                        S preK1 = image.get(i);
255                        T postK = (preK1==null?null: alignmentMap.get(preK1));
256                        imageMap.put(pre,postK);
257
258                }
259                return imageMap;
260        }
261
262        /**
263         * Helper for {@link #getSymmetryOrder(Map, Map, int, float)} with a true
264         * identity function (X->X).
265         *
266         * <p>This method should only be used in cases where the two proteins
267         * aligned have identical numbering, as for self-alignments. See
268         * {@link #getSymmetryOrder(AFPChain, int, float)} for a way to guess
269         * the sequential correspondence between two proteins.
270         *
271         * @param alignment
272         * @param maxSymmetry
273         * @param minimumMetricChange
274         * @return
275         */
276        public static int getSymmetryOrder(Map<Integer, Integer> alignment,
277                        final int maxSymmetry, final float minimumMetricChange) {
278                return getSymmetryOrder(alignment, new IdentityMap<Integer>(), maxSymmetry, minimumMetricChange);
279        }
280        /**
281         * Tries to detect symmetry in an alignment.
282         *
283         * <p>Conceptually, an alignment is a function f:A->B between two sets of
284         * integers. The function may have simple topology (meaning that if two
285         * elements of A are close, then their images in B will also be close), or
286         * may have more complex topology (such as a circular permutation). This
287         * function checks <i>alignment</i> against a reference function
288         * <i>identity</i>, which should have simple topology. It then tries to
289         * determine the symmetry order of <i>alignment</i> relative to
290         * <i>identity</i>, up to a maximum order of <i>maxSymmetry</i>.
291         *
292         *
293         * <p><strong>Details</strong><br/>
294         * Considers the offset (in number of residues) which a residue moves
295         * after undergoing <i>n</i> alternating transforms by alignment and
296         * identity. If <i>n</i> corresponds to the intrinsic order of the alignment,
297         * this will be small. This algorithm tries increasing values of <i>n</i>
298         * and looks for abrupt decreases in the root mean squared offset.
299         * If none are found at <i>n</i><=maxSymmetry, the alignment is reported as
300         * non-symmetric.
301         *
302         * @param alignment The alignment to test for symmetry
303         * @param identity An alignment with simple topology which approximates
304         *  the sequential relationship between the two proteins. Should map in the
305         *  reverse direction from alignment.
306         * @param maxSymmetry Maximum symmetry to consider. High values increase
307         *  the calculation time and can lead to overfitting.
308         * @param minimumMetricChange Percent decrease in root mean squared offsets
309         *  in order to declare symmetry. 0.4f seems to work well for CeSymm.
310         * @return The order of symmetry of alignment, or 1 if no order <=
311         *  maxSymmetry is found.
312         *
313         * @see IdentityMap For a simple identity function
314         */
315        public static int getSymmetryOrder(Map<Integer, Integer> alignment, Map<Integer,Integer> identity,
316                        final int maxSymmetry, final float minimumMetricChange) {
317                List<Integer> preimage = new ArrayList<Integer>(alignment.keySet()); // currently unmodified
318                List<Integer> image = new ArrayList<Integer>(preimage);
319
320                int bestSymmetry = 1;
321                double bestMetric = Double.POSITIVE_INFINITY; //lower is better
322                boolean foundSymmetry = false;
323
324                if(debug) {
325                        logger.trace("Symm\tPos\tDelta");
326                }
327
328                for(int n=1;n<=maxSymmetry;n++) {
329                        int deltasSq = 0;
330                        int numDeltas = 0;
331                        // apply alignment
332                        for(int i=0;i<image.size();i++) {
333                                Integer pre = image.get(i);
334                                Integer intermediate = (pre==null?null: alignment.get(pre));
335                                Integer post = (intermediate==null?null: identity.get(intermediate));
336                                image.set(i, post);
337
338                                if(post != null) {
339                                        int delta = post-preimage.get(i);
340
341                                        deltasSq += delta*delta;
342                                        numDeltas++;
343
344                                        if(debug) {
345                                                logger.debug("%d\t%d\t%d\n",n,preimage.get(i),delta);
346                                        }
347                                }
348
349                        }
350
351                        // Metrics: RMS compensates for the trend of smaller numDeltas with higher order
352                        // Not normalizing by numDeltas favors smaller orders
353
354                        double metric = Math.sqrt((double)deltasSq/numDeltas); // root mean squared distance
355
356                        if(!foundSymmetry && metric < bestMetric * minimumMetricChange) {
357                                // n = 1 is never the best symmetry
358                                if(bestMetric < Double.POSITIVE_INFINITY) {
359                                        foundSymmetry = true;
360                                }
361                                bestSymmetry = n;
362                                bestMetric = metric;
363                        }
364
365                        // When debugging need to loop over everything. Unneeded in production
366                        if(!debug && foundSymmetry) {
367                                break;
368                        }
369
370                }
371                if(foundSymmetry) {
372                        return bestSymmetry;
373                } else {
374                        return 1;
375                }
376        }
377
378
379        /**
380         * Guesses the order of symmetry in an alignment
381         *
382         * <p>Uses {@link #getSymmetryOrder(Map alignment, Map identity, int, float)}
383         * to determine the the symmetry order. For the identity alignment, sorts
384         * the aligned residues of each protein sequentially, then defines the ith
385         * residues of each protein to be equivalent.
386         *
387         * <p>Note that the selection of the identity alignment here is <i>very</i>
388         * naive, and only works for proteins with very good coverage. Wherever
389         * possible, it is better to construct an identity function explicitly
390         * from a sequence alignment (or use an {@link IdentityMap} for internally
391         * symmetric proteins) and use {@link #getSymmetryOrder(Map, Map, int, float)}.
392         */
393        public static int getSymmetryOrder(AFPChain afpChain, int maxSymmetry, float minimumMetricChange) throws StructureException {
394                // alignment comes from the afpChain alignment
395                Map<Integer,Integer> alignment = AlignmentTools.alignmentAsMap(afpChain);
396
397                // Now construct identity to map aligned residues in sequential order
398                Map<Integer, Integer> identity = guessSequentialAlignment(alignment, true);
399
400
401                return AlignmentTools.getSymmetryOrder(alignment,
402                                identity,
403                                maxSymmetry, minimumMetricChange);
404        }
405
406        /**
407         * Takes a potentially non-sequential alignment and guesses a sequential
408         * version of it. Residues from each structure are sorted sequentially and
409         * then compared directly.
410         *
411         * <p>The results of this method are consistent with what one might expect
412         * from an identity function, and are therefore useful with
413         * {@link #getSymmetryOrder(Map, Map identity, int, float)}.
414         * <ul>
415         *  <li>Perfect self-alignments will have the same pre-image and image,
416         *      so will map X->X</li>
417         *  <li>Gaps and alignment errors will cause errors in the resulting map,
418         *      but only locally. Errors do not propagate through the whole
419         *      alignment.</li>
420         * </ul>
421         *
422         * <h4>Example:</h4>
423         * A non sequential alignment, represented schematically as
424         * <pre>
425         * 12456789
426         * 78912345</pre>
427         * would result in a map
428         * <pre>
429         * 12456789
430         * 12345789</pre>
431         * @param alignment The non-sequential input alignment
432         * @param inverseAlignment If false, map from structure1 to structure2. If
433         *  true, generate the inverse of that map.
434         * @return A mapping from sequential residues of one protein to those of the other
435         * @throws IllegalArgumentException if the input alignment is not one-to-one.
436         */
437        public static Map<Integer, Integer> guessSequentialAlignment(
438                        Map<Integer,Integer> alignment, boolean inverseAlignment) {
439                Map<Integer,Integer> identity = new HashMap<Integer,Integer>();
440
441                SortedSet<Integer> aligned1 = new TreeSet<Integer>();
442                SortedSet<Integer> aligned2 = new TreeSet<Integer>();
443
444                for(Entry<Integer,Integer> pair : alignment.entrySet()) {
445                        aligned1.add(pair.getKey());
446                        if( !aligned2.add(pair.getValue()) )
447                                throw new IllegalArgumentException("Alignment is not one-to-one for residue "+pair.getValue()+" of the second structure.");
448                }
449
450                Iterator<Integer> it1 = aligned1.iterator();
451                Iterator<Integer> it2 = aligned2.iterator();
452                while(it1.hasNext()) {
453                        if(inverseAlignment) { // 2->1
454                                identity.put(it2.next(),it1.next());
455                        } else { // 1->2
456                                identity.put(it1.next(),it2.next());
457                        }
458                }
459                return identity;
460        }
461
462        /**
463         * Retrieves the optimum alignment from an AFPChain and returns it as a
464         * java collection. The result is indexed in the same way as
465         * {@link AFPChain#getOptAln()}, but has the correct size().
466         * <pre>
467         * List<List<List<Integer>>> aln = getOptAlnAsList(AFPChain afpChain);
468         * aln.get(blockNum).get(structureNum={0,1}).get(pos)</pre>
469         *
470         * @param afpChain
471         * @return
472         */
473        public static List<List<List<Integer>>> getOptAlnAsList(AFPChain afpChain) {
474                int[][][] optAln = afpChain.getOptAln();
475                int[] optLen = afpChain.getOptLen();
476                List<List<List<Integer>>> blocks = new ArrayList<List<List<Integer>>>(afpChain.getBlockNum());
477                for(int blockNum=0;blockNum<afpChain.getBlockNum();blockNum++) {
478                        //TODO could improve speed an memory by wrapping the arrays with
479                        // an unmodifiable list, similar to Arrays.asList(...) but with the
480                        // correct size parameter.
481                        List<Integer> align1 = new ArrayList<Integer>(optLen[blockNum]);
482                        List<Integer> align2 = new ArrayList<Integer>(optLen[blockNum]);
483                        for(int pos=0;pos<optLen[blockNum];pos++) {
484                                align1.add(optAln[blockNum][0][pos]);
485                                align2.add(optAln[blockNum][1][pos]);
486                        }
487                        List<List<Integer>> block = new ArrayList<List<Integer>>(2);
488                        block.add(align1);
489                        block.add(align2);
490                        blocks.add(block);
491                }
492
493                return blocks;
494        }
495
496
497
498        /**
499         * A Map<K,V> can be viewed as a function from K to V. This class represents
500         * the identity function. Getting a value results in the value itself.
501         *
502         * <p>The class is a bit inconsistent when representing its contents. On
503         * the one hand, containsKey(key) is true for all objects. However,
504         * attempting to iterate through the values returns an empty set.
505         *
506         * @author Spencer Bliven
507         *
508         * @param <K>
509         */
510        public static class IdentityMap<K> extends AbstractMap<K,K> {
511                public IdentityMap() {}
512
513                /**
514                 * @param key
515                 * @return the key
516                 * @throws ClassCastException if key is not of type K
517                 */
518                @SuppressWarnings("unchecked")
519                @Override
520                public K get(Object key) {
521                        return (K)key;
522                }
523
524                /**
525                 * Always returns the empty set
526                 */
527                @Override
528                public Set<java.util.Map.Entry<K, K>> entrySet() {
529                        return Collections.emptySet();
530                }
531
532                @Override
533                public boolean containsKey(Object key) {
534                        return true;
535                }
536        }
537
538        /**
539         * Fundamentally, an alignment is just a list of aligned residues in each
540         * protein. This method converts two lists of ResidueNumbers into an
541         * AFPChain.
542         *
543         * <p>Parameters are filled with defaults (often null) or sometimes
544         * calculated.
545         *
546         * <p>For a way to modify the alignment of an existing AFPChain, see
547         * {@link AlignmentTools#replaceOptAln(AFPChain, Atom[], Atom[], Map)}
548         * @param ca1 CA atoms of the first protein
549         * @param ca2 CA atoms of the second protein
550         * @param aligned1 A list of aligned residues from the first protein
551         * @param aligned2 A list of aligned residues from the second protein.
552         *  Must be the same length as aligned1.
553         * @return An AFPChain representing the alignment. Many properties may be
554         *  null or another default.
555         * @throws StructureException if an error occured during superposition
556         * @throws IllegalArgumentException if aligned1 and aligned2 have different
557         *  lengths
558         * @see AlignmentTools#replaceOptAln(AFPChain, Atom[], Atom[], Map)
559         */
560        public static AFPChain createAFPChain(Atom[] ca1, Atom[] ca2,
561                        ResidueNumber[] aligned1, ResidueNumber[] aligned2 ) throws StructureException {
562                //input validation
563                int alnLen = aligned1.length;
564                if(alnLen != aligned2.length) {
565                        throw new IllegalArgumentException("Alignment lengths are not equal");
566                }
567
568                AFPChain a = new AFPChain(AFPChain.UNKNOWN_ALGORITHM);
569                try {
570                        a.setName1(ca1[0].getGroup().getChain().getStructure().getName());
571                        if(ca2[0].getGroup().getChain().getStructure() != null) {
572                                // common case for cloned ca2
573                                a.setName2(ca2[0].getGroup().getChain().getStructure().getName());
574                        }
575                } catch(Exception e) {
576                        // One of the structures wasn't fully created. Ignore
577                }
578                a.setBlockNum(1);
579                a.setCa1Length(ca1.length);
580                a.setCa2Length(ca2.length);
581
582                a.setOptLength(alnLen);
583                a.setOptLen(new int[] {alnLen});
584
585
586                Matrix[] ms = new Matrix[a.getBlockNum()];
587                a.setBlockRotationMatrix(ms);
588                Atom[] blockShiftVector = new Atom[a.getBlockNum()];
589                a.setBlockShiftVector(blockShiftVector);
590
591                String[][][] pdbAln = new String[1][2][alnLen];
592                for(int i=0;i<alnLen;i++) {
593                        pdbAln[0][0][i] = aligned1[i].getChainId()+":"+aligned1[i];
594                        pdbAln[0][1][i] = aligned2[i].getChainId()+":"+aligned2[i];
595                }
596
597                a.setPdbAln(pdbAln);
598
599                // convert pdbAln to optAln, and fill in some other basic parameters
600                AFPChainXMLParser.rebuildAFPChain(a, ca1, ca2);
601
602                return a;
603
604                // Currently a single block. Split into several blocks by sequence if needed
605                //              return AlignmentTools.splitBlocksByTopology(a,ca1,ca2);
606        }
607
608        /**
609         *
610         * @param a
611         * @param ca1
612         * @param ca2
613         * @return
614         * @throws StructureException if an error occurred during superposition
615         */
616        public static AFPChain splitBlocksByTopology(AFPChain a, Atom[] ca1, Atom[] ca2) throws StructureException {
617                int[][][] optAln = a.getOptAln();
618                int blockNum = a.getBlockNum();
619                int[] optLen = a.getOptLen();
620
621                // Determine block lengths
622                // Split blocks if residue indices don't increase monotonically
623                List<Integer> newBlkLen = new ArrayList<Integer>();
624                boolean blockChanged = false;
625                for(int blk=0;blk<blockNum;blk++) {
626                        int currLen=1;
627                        for(int pos=1;pos<optLen[blk];pos++) {
628                                if( optAln[blk][0][pos] <= optAln[blk][0][pos-1]
629                                                || optAln[blk][1][pos] <= optAln[blk][1][pos-1] )
630                                {
631                                        //start a new block
632                                        newBlkLen.add(currLen);
633                                        currLen = 0;
634                                        blockChanged = true;
635                                }
636                                currLen++;
637                        }
638                        if(optLen[blk] < 2 ) {
639                                newBlkLen.add(optLen[blk]);
640                        } else {
641                                newBlkLen.add(currLen);
642                        }
643                }
644
645                // Check if anything needs to be split
646                if( !blockChanged ) {
647                        return a;
648                }
649
650                // Split blocks
651                List<int[][]> blocks = new ArrayList<int[][]>( newBlkLen.size() );
652
653                int oldBlk = 0;
654                int pos = 0;
655                for(int blkLen : newBlkLen) {
656                        if( blkLen == optLen[oldBlk] ) {
657                                assert(pos == 0); //should be the whole block
658                                // Use the old block
659                                blocks.add(optAln[oldBlk]);
660                        } else {
661                                int[][] newBlock = new int[2][blkLen];
662                                assert( pos+blkLen <= optLen[oldBlk] ); // don't overrun block
663                                for(int i=0; i<blkLen;i++) {
664                                        newBlock[0][i] = optAln[oldBlk][0][pos + i];
665                                        newBlock[1][i] = optAln[oldBlk][1][pos + i];
666                                }
667                                pos += blkLen;
668                                blocks.add(newBlock);
669
670                                if( pos == optLen[oldBlk] ) {
671                                        // Finished this oldBlk, start the next
672                                        oldBlk++;
673                                        pos = 0;
674                                }
675                        }
676                }
677
678                // Store new blocks
679                int[][][] newOptAln = blocks.toArray(new int[0][][]);
680                int[] newBlockLens = new int[newBlkLen.size()];
681                for(int i=0;i<newBlkLen.size();i++) {
682                        newBlockLens[i] = newBlkLen.get(i);
683                }
684
685                return replaceOptAln(a, ca1, ca2, blocks.size(), newBlockLens, newOptAln);
686        }
687
688        /**
689         * It replaces an optimal alignment of an AFPChain and calculates all the new alignment scores and variables.
690         */
691        public static AFPChain replaceOptAln(int[][][] newAlgn, AFPChain afpChain, Atom[] ca1, Atom[] ca2) throws StructureException {
692
693                //The order is the number of groups in the newAlgn
694                int order = newAlgn.length;
695
696                //Calculate the alignment length from all the subunits lengths
697                int[] optLens = new int[order];
698                for(int s=0;s<order;s++) {
699                        optLens[s] = newAlgn[s][0].length;
700                }
701                int optLength = 0;
702                for(int s=0;s<order;s++) {
703                        optLength += optLens[s];
704                }
705
706                //Create a copy of the original AFPChain and set everything needed for the structure update
707                AFPChain copyAFP = (AFPChain) afpChain.clone();
708
709                //Set the new parameters of the optimal alignment
710                copyAFP.setOptLength(optLength);
711                copyAFP.setOptLen(optLens);
712                copyAFP.setOptAln(newAlgn);
713
714                //Set the block information of the new alignment
715                copyAFP.setBlockNum(order);
716                copyAFP.setBlockSize(optLens);
717                copyAFP.setBlockResList(newAlgn);
718                copyAFP.setBlockResSize(optLens);
719                copyAFP.setBlockGap(calculateBlockGap(newAlgn));
720
721                //Recalculate properties: superposition, tm-score, etc
722                Atom[] ca2clone = StructureTools.cloneAtomArray(ca2); // don't modify ca2 positions
723                AlignmentTools.updateSuperposition(copyAFP, ca1, ca2clone);
724
725                //It re-does the sequence alignment strings from the OptAlgn information only
726                copyAFP.setAlnsymb(null);
727                AFPAlignmentDisplay.getAlign(copyAFP, ca1, ca2clone);
728
729                return copyAFP;
730        }
731
732        /**
733         * Takes an AFPChain and replaces the optimal alignment based on an alignment map
734         *
735         * <p>Parameters are filled with defaults (often null) or sometimes
736         * calculated.
737         *
738         * <p>For a way to create a new AFPChain, see
739         * {@link AlignmentTools#createAFPChain(Atom[], Atom[], ResidueNumber[], ResidueNumber[])}
740         *
741         * @param afpChain The alignment to be modified
742         * @param alignment The new alignment, as a Map
743         * @throws StructureException if an error occurred during superposition
744         * @see AlignmentTools#createAFPChain(Atom[], Atom[], ResidueNumber[], ResidueNumber[])
745         */
746        public static AFPChain replaceOptAln(AFPChain afpChain, Atom[] ca1, Atom[] ca2,
747                        Map<Integer, Integer> alignment) throws StructureException {
748
749                // Determine block lengths
750                // Sort ca1 indices, then start a new block whenever ca2 indices aren't
751                // increasing monotonically.
752                Integer[] res1 = alignment.keySet().toArray(new Integer[0]);
753                Arrays.sort(res1);
754                List<Integer> blockLens = new ArrayList<Integer>(2);
755                int optLength = 0;
756                Integer lastRes = alignment.get(res1[0]);
757                int blkLen = lastRes==null?0:1;
758                for(int i=1;i<res1.length;i++) {
759                        Integer currRes = alignment.get(res1[i]); //res2 index
760                        assert(currRes != null);// could be converted to if statement if assertion doesn't hold; just modify below as well.
761                        if(lastRes<currRes) {
762                                blkLen++;
763                        } else {
764                                // CP!
765                                blockLens.add(blkLen);
766                                optLength+=blkLen;
767                                blkLen = 1;
768                        }
769                        lastRes = currRes;
770                }
771                blockLens.add(blkLen);
772                optLength+=blkLen;
773
774                // Create array structure for alignment
775                int[][][] optAln = new int[blockLens.size()][][];
776                int pos1 = 0; //index into res1
777                for(int blk=0;blk<blockLens.size();blk++) {
778                        optAln[blk] = new int[2][];
779                        blkLen = blockLens.get(blk);
780                        optAln[blk][0] = new int[blkLen];
781                        optAln[blk][1] = new int[blkLen];
782                        int pos = 0; //index into optAln
783                        while(pos<blkLen) {
784                                optAln[blk][0][pos]=res1[pos1];
785                                Integer currRes = alignment.get(res1[pos1]);
786                                optAln[blk][1][pos]=currRes;
787                                pos++;
788                                pos1++;
789                        }
790                }
791                assert(pos1 == optLength);
792
793                // Create length array
794                int[] optLens = new int[blockLens.size()];
795                for(int i=0;i<blockLens.size();i++) {
796                        optLens[i] = blockLens.get(i);
797                }
798
799                return replaceOptAln(afpChain, ca1, ca2, blockLens.size(), optLens, optAln);
800        }
801
802        /**
803         * @param afpChain Input afpchain. UNMODIFIED
804         * @param ca1
805         * @param ca2
806         * @param optLens
807         * @param optAln
808         * @return A NEW AfpChain based off the input but with the optAln modified
809         * @throws StructureException if an error occured during superposition
810         */
811        public static AFPChain replaceOptAln(AFPChain afpChain, Atom[] ca1, Atom[] ca2,
812                        int blockNum, int[] optLens, int[][][] optAln) throws StructureException {
813                int optLength = 0;
814                for( int blk=0;blk<blockNum;blk++) {
815                        optLength += optLens[blk];
816                }
817
818                //set everything
819                AFPChain refinedAFP = (AFPChain) afpChain.clone();
820                refinedAFP.setOptLength(optLength);
821                refinedAFP.setBlockSize(optLens);
822                refinedAFP.setOptLen(optLens);
823                refinedAFP.setOptAln(optAln);
824                refinedAFP.setBlockNum(blockNum);
825
826                //TODO recalculate properties: superposition, tm-score, etc
827                Atom[] ca2clone = StructureTools.cloneAtomArray(ca2); // don't modify ca2 positions
828                AlignmentTools.updateSuperposition(refinedAFP, ca1, ca2clone);
829
830                AFPAlignmentDisplay.getAlign(refinedAFP, ca1, ca2clone);
831                return refinedAFP;
832        }
833
834
835        /**
836         * After the alignment changes (optAln, optLen, blockNum, at a minimum),
837         * many other properties which depend on the superposition will be invalid.
838         *
839         * This method re-runs a rigid superposition over the whole alignment
840         * and repopulates the required properties, including RMSD (TotalRMSD) and
841         * TM-Score.
842         * @param afpChain
843         * @param ca1
844         * @param ca2 Second set of ca atoms. Will be modified based on the superposition
845         * @throws StructureException
846         * @see {@link CECalculator#calc_rmsd(Atom[], Atom[], int, boolean)}
847         *  contains much of the same code, but stores results in a CECalculator
848         *  instance rather than an AFPChain
849         */
850        public static void updateSuperposition(AFPChain afpChain, Atom[] ca1, Atom[] ca2) throws StructureException {
851
852                //Update ca information, because the atom array might also be changed
853                afpChain.setCa1Length(ca1.length);
854                afpChain.setCa2Length(ca2.length);
855
856                //We need this to get the correct superposition
857                int[] focusRes1 = afpChain.getFocusRes1();
858                int[] focusRes2 = afpChain.getFocusRes2();
859                if (focusRes1 == null) {
860                        focusRes1 = new int[afpChain.getCa1Length()];
861                        afpChain.setFocusRes1(focusRes1);
862                }
863                if (focusRes2 == null) {
864                        focusRes2 = new int[afpChain.getCa2Length()];
865                        afpChain.setFocusRes2(focusRes2);
866                }
867
868                if (afpChain.getNrEQR() == 0) return;
869
870                // create new arrays for the subset of atoms in the alignment.
871                Atom[] ca1aligned = new Atom[afpChain.getOptLength()];
872                Atom[] ca2aligned = new Atom[afpChain.getOptLength()];
873                int pos=0;
874                int[] blockLens = afpChain.getOptLen();
875                int[][][] optAln = afpChain.getOptAln();
876                assert(afpChain.getBlockNum() <= optAln.length);
877
878                for (int block=0; block < afpChain.getBlockNum(); block++) {
879                        for(int i=0;i<blockLens[block];i++) {
880                                int pos1 = optAln[block][0][i];
881                                int pos2 = optAln[block][1][i];
882                                Atom a1 = ca1[pos1];
883                                Atom a2 = (Atom) ca2[pos2].clone();
884                                ca1aligned[pos] = a1;
885                                ca2aligned[pos] = a2;
886                                pos++;
887                        }
888                }
889
890                // this can happen when we load an old XML serialization which did not support modern ChemComp representation of modified residues.
891                if (pos != afpChain.getOptLength()){
892                        logger.warn("AFPChainScorer getTMScore: Problems reconstructing alignment! nr of loaded atoms is " + pos + " but should be " + afpChain.getOptLength());
893                        // we need to resize the array, because we allocated too many atoms earlier on.
894                        ca1aligned = (Atom[]) resizeArray(ca1aligned, pos);
895                        ca2aligned = (Atom[]) resizeArray(ca2aligned, pos);
896                }
897
898                //Superimpose the two structures in correspondance to the new alignment
899                SVDSuperimposer svd = new SVDSuperimposer(ca1aligned, ca2aligned);
900                Matrix matrix = svd.getRotation();
901                Atom shift = svd.getTranslation();
902                Matrix[] blockMxs = new Matrix[afpChain.getBlockNum()];
903                Arrays.fill(blockMxs, matrix);
904                afpChain.setBlockRotationMatrix(blockMxs);
905                Atom[] blockShifts = new Atom[afpChain.getBlockNum()];
906                Arrays.fill(blockShifts, shift);
907                afpChain.setBlockShiftVector(blockShifts);
908
909                for (Atom a : ca2aligned) {
910                        Calc.rotate(a, matrix);
911                        Calc.shift(a, shift);
912                }
913
914                //Calculate the RMSD and TM score for the new alignment
915                double rmsd = SVDSuperimposer.getRMS(ca1aligned, ca2aligned);
916                double tmScore = SVDSuperimposer.getTMScore(ca1aligned, ca2aligned, ca1.length, ca2.length);
917                afpChain.setTotalRmsdOpt(rmsd);
918                afpChain.setTMScore(tmScore);
919
920                //Calculate the RMSD and TM score for every block of the new alignment
921                double[] blockRMSD = new double[afpChain.getBlockNum()];
922                double[] blockScore = new double[afpChain.getBlockNum()];
923                for (int k=0; k<afpChain.getBlockNum(); k++){
924                        //Create the atom arrays corresponding to the aligned residues in the block
925                        Atom[] ca1block = new Atom[afpChain.getOptLen()[k]];
926                        Atom[] ca2block = new Atom[afpChain.getOptLen()[k]];
927                        int position=0;
928                        for(int i=0;i<blockLens[k];i++) {
929                                int pos1 = optAln[k][0][i];
930                                int pos2 = optAln[k][1][i];
931                                Atom a1 = ca1[pos1];
932                                Atom a2 = (Atom) ca2[pos2].clone();
933                                ca1block[position] = a1;
934                                ca2block[position] = a2;
935                                position++;
936                        }
937                        if (position != afpChain.getOptLen()[k]){
938                                logger.warn("AFPChainScorer getTMScore: Problems reconstructing block alignment! nr of loaded atoms is " + pos + " but should be " + afpChain.getOptLen()[k]);
939                                // we need to resize the array, because we allocated too many atoms earlier on.
940                                ca1block = (Atom[]) resizeArray(ca1block, position);
941                                ca2block = (Atom[]) resizeArray(ca2block, position);
942                        }
943                        //Superimpose the two block structures
944                        SVDSuperimposer svdb = new SVDSuperimposer(ca1block, ca2block);
945                        Matrix matrixb = svdb.getRotation();
946                        Atom shiftb = svdb.getTranslation();
947                        for (Atom a : ca2block) {
948                                Calc.rotate(a, matrixb);
949                                Calc.shift(a, shiftb);
950                        }
951                        //Calculate the RMSD and TM score for the block
952                        double rmsdb = SVDSuperimposer.getRMS(ca1block, ca2block);
953                        double tmScoreb = SVDSuperimposer.getTMScore(ca1block, ca2block, ca1.length, ca2.length);
954                        blockRMSD[k] = rmsdb;
955                        blockScore[k] = tmScoreb;
956                }
957                afpChain.setOptRmsd(blockRMSD);
958                afpChain.setBlockRmsd(blockRMSD);
959                afpChain.setBlockScore(blockScore);
960        }
961
962        /**
963         * Reallocates an array with a new size, and copies the contents
964         * of the old array to the new array.
965         * @param oldArray  the old array, to be reallocated.
966         * @param newSize   the new array size.
967         * @return          A new array with the same contents.
968         */
969        public static Object resizeArray (Object oldArray, int newSize) {
970                int oldSize = java.lang.reflect.Array.getLength(oldArray);
971                @SuppressWarnings("rawtypes")
972                Class elementType = oldArray.getClass().getComponentType();
973                Object newArray = java.lang.reflect.Array.newInstance(
974                                elementType,newSize);
975                int preserveLength = Math.min(oldSize,newSize);
976                if (preserveLength > 0)
977                        System.arraycopy (oldArray,0,newArray,0,preserveLength);
978                return newArray;
979        }
980
981        /**
982         * Print an alignment map in a concise representation. Edges are given
983         * as two numbers separated by '>'. They are chained together where possible,
984         * or separated by spaces where disjoint or branched.
985         *
986         * <p>Note that more concise representations may be possible.</p>
987         *
988         * Examples:
989         * <li>1>2>3>1</li>
990         * <li>1>2>3>2 4>3</li>
991         *
992         * @param alignment The input function, as a map (see {@link AlignmentTools#alignmentAsMap(AFPChain)})
993         * @param identity An identity-like function providing the isomorphism between
994         *  the codomain of alignment (of type <T>) and the domain (type <S>).
995         * @return
996         */
997        public static <S,T> String toConciseAlignmentString(Map<S,T> alignment, Map<T,S> identity) {
998                // Clone input to prevent changes
999                Map<S,T> alig = new HashMap<S,T>(alignment);
1000
1001                // Generate inverse alignment
1002                Map<S,List<S>> inverse = new HashMap<S,List<S>>();
1003                for(Entry<S,T> e: alig.entrySet()) {
1004                        S val = identity.get(e.getValue());
1005                        if( inverse.containsKey(val) ) {
1006                                List<S> l = inverse.get(val);
1007                                l.add(e.getKey());
1008                        } else {
1009                                List<S> l = new ArrayList<S>();
1010                                l.add(e.getKey());
1011                                inverse.put(val,l);
1012                        }
1013                }
1014
1015                StringBuilder str = new StringBuilder();
1016
1017                while(!alig.isEmpty()){
1018                        // Pick an edge and work upstream to a root or cycle
1019                        S seedNode = alig.keySet().iterator().next();
1020                        S node = seedNode;
1021                        if( inverse.containsKey(seedNode)) {
1022                                node = inverse.get(seedNode).iterator().next();
1023                                while( node != seedNode && inverse.containsKey(node)) {
1024                                        node = inverse.get(node).iterator().next();
1025                                }
1026                        }
1027
1028                        // Now work downstream, deleting edges as we go
1029                        seedNode = node;
1030                        str.append(node);
1031
1032                        while(alig.containsKey(node)) {
1033                                S lastNode = node;
1034                                node = identity.get( alig.get(lastNode) );
1035
1036                                // Output
1037                                str.append('>');
1038                                str.append(node);
1039
1040                                // Remove edge
1041                                alig.remove(lastNode);
1042                                List<S> inv = inverse.get(node);
1043                                if(inv.size() > 1) {
1044                                        inv.remove(node);
1045                                } else {
1046                                        inverse.remove(node);
1047                                }
1048                        }
1049                        if(!alig.isEmpty()) {
1050                                str.append(' ');
1051                        }
1052                }
1053
1054                return str.toString();
1055        }
1056
1057        /**
1058         * @see #toConciseAlignmentString(Map, Map)
1059         */
1060        public static <T> String toConciseAlignmentString(Map<T, T> alignment) {
1061                return toConciseAlignmentString(alignment, new IdentityMap<T>());
1062        }
1063
1064        /**
1065         * @see #toConciseAlignmentString(Map, Map)
1066         */
1067        public static Map<Integer, Integer> fromConciseAlignmentString(String string) {
1068                Map<Integer, Integer> map = new HashMap<Integer, Integer>();
1069                boolean matches = true;
1070                while (matches) {
1071                        Pattern pattern = Pattern.compile("(\\d+)>(\\d+)");
1072                        Matcher matcher = pattern.matcher(string);
1073                        matches = matcher.find();
1074                        if (matches) {
1075                                Integer from = Integer.parseInt(matcher.group(1));
1076                                Integer to = Integer.parseInt(matcher.group(2));
1077                                map.put(from, to);
1078                                string = string.substring(matcher.end(1) + 1);
1079                        }
1080                }
1081                return map;
1082        }
1083
1084        /**
1085         * Method that calculates the number of gaps in each subunit block of an optimal AFP alignment.
1086         *
1087         * INPUT: an optimal alignment in the format int[][][].
1088         * OUTPUT: an int[] array of <order> length containing the gaps in each block as int[block].
1089         */
1090        public static int[] calculateBlockGap(int[][][] optAln){
1091
1092                //Initialize the array to be returned
1093                int [] blockGap = new int[optAln.length];
1094
1095                //Loop for every block and look in both chains for non-contiguous residues.
1096                for (int i=0; i<optAln.length; i++){
1097                        int gaps = 0; //the number of gaps in that block
1098                        int last1 = 0; //the last residue position in chain 1
1099                        int last2 = 0; //the last residue position in chain 2
1100                        //Loop for every position in the block
1101                        for (int j=0; j<optAln[i][0].length; j++){
1102                                //If the first position is evaluated initialize the last positions
1103                                if (j==0){
1104                                        last1 = optAln[i][0][j];
1105                                        last2 = optAln[i][1][j];
1106                                }
1107                                else{
1108                                        //If one of the positions or both are not contiguous increment the number of gaps
1109                                        if (optAln[i][0][j] > last1+1 || optAln[i][1][j] > last2+1){
1110                                                gaps++;
1111                                                last1 = optAln[i][0][j];
1112                                                last2 = optAln[i][1][j];
1113                                        }
1114                                        //Otherwise just set the last position to the current one
1115                                        else{
1116                                                last1 = optAln[i][0][j];
1117                                                last2 = optAln[i][1][j];
1118                                        }
1119                                }
1120                        }
1121                        blockGap[i] = gaps;
1122                }
1123                return blockGap;
1124        }
1125
1126        /**
1127         * Creates a simple interaction format (SIF) file for an alignment.
1128         *
1129         * The SIF file can be read by network software (eg Cytoscape) to analyze
1130         * alignments as graphs.
1131         *
1132         * This function creates a graph with residues as nodes and two types of edges:
1133         *   1. backbone edges, which connect adjacent residues in the aligned protein
1134         *   2. alignment edges, which connect aligned residues
1135         *
1136         * @param out Stream to write to
1137         * @param afpChain alignment to write
1138         * @param ca1 First protein, used to generate node names
1139         * @param ca2 Second protein, used to generate node names
1140         * @param backboneInteraction Two-letter string used to identify backbone edges
1141         * @param alignmentInteraction Two-letter string used to identify alignment edges
1142         * @throws IOException
1143         */
1144        public static void alignmentToSIF(Writer out,AFPChain afpChain,
1145                        Atom[] ca1,Atom[] ca2, String backboneInteraction,
1146                        String alignmentInteraction) throws IOException {
1147
1148                //out.write("Res1\tInteraction\tRes2\n");
1149                String name1 = afpChain.getName1();
1150                String name2 = afpChain.getName2();
1151                if(name1==null) name1=""; else name1+=":";
1152                if(name2==null) name2=""; else name2+=":";
1153
1154                // Print alignment edges
1155                int nblocks = afpChain.getBlockNum();
1156                int[] blockLen = afpChain.getOptLen();
1157                int[][][] optAlign = afpChain.getOptAln();
1158                for(int b=0;b<nblocks;b++) {
1159                        for(int r=0;r<blockLen[b];r++) {
1160                                int res1 = optAlign[b][0][r];
1161                                int res2 = optAlign[b][1][r];
1162
1163                                ResidueNumber rn1 = ca1[res1].getGroup().getResidueNumber();
1164                                ResidueNumber rn2 = ca2[res2].getGroup().getResidueNumber();
1165
1166                                String node1 = name1+rn1.getChainId()+rn1.toString();
1167                                String node2 = name2+rn2.getChainId()+rn2.toString();
1168
1169                                out.write(String.format("%s\t%s\t%s\n",node1, alignmentInteraction, node2));
1170                        }
1171                }
1172
1173                // Print first backbone edges
1174                ResidueNumber rn = ca1[0].getGroup().getResidueNumber();
1175                String last = name1+rn.getChainId()+rn.toString();
1176                for(int i=1;i<ca1.length;i++) {
1177                        rn = ca1[i].getGroup().getResidueNumber();
1178                        String curr = name1+rn.getChainId()+rn.toString();
1179                        out.write(String.format("%s\t%s\t%s\n",last, backboneInteraction, curr));
1180                        last = curr;
1181                }
1182
1183                // Print second backbone edges, if the proteins differ
1184                // Do some quick checks for whether the proteins differ
1185                // (Not perfect, but should detect major differences and CPs.)
1186                if(!name1.equals(name2) ||
1187                                ca1.length!=ca2.length ||
1188                                (ca1.length>0 && ca1[0].getGroup()!=null && ca2[0].getGroup()!=null &&
1189                                                !ca1[0].getGroup().getResidueNumber().equals(ca2[0].getGroup().getResidueNumber()) ) ) {
1190                        rn = ca2[0].getGroup().getResidueNumber();
1191                        last = name2+rn.getChainId()+rn.toString();
1192                        for(int i=1;i<ca2.length;i++) {
1193                                rn = ca2[i].getGroup().getResidueNumber();
1194                                String curr = name2+rn.getChainId()+rn.toString();
1195                                out.write(String.format("%s\t%s\t%s\n",last, backboneInteraction, curr));
1196                                last = curr;
1197                        }
1198                }
1199        }
1200
1201}