001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.structure.symmetry.core;
022
023import org.biojava.nbio.structure.Atom;
024import org.biojava.nbio.structure.Structure;
025
026import java.util.*;
027
028/**
029 * Represents a set of non-identical protein sequences.
030 */
031public class ProteinSequenceClusterer {
032        private Structure structure = null;
033        private Structure structure2 = null;
034        private QuatSymmetryParameters parameters = null;
035
036        private List<Atom[]> caUnaligned = new ArrayList<Atom[]>();
037        private List<String> chainIds = new ArrayList<String>();
038        private List<Integer> modelNumbers = new ArrayList<Integer>();
039        private List<String> sequences = new ArrayList<String>();
040        private List<SequenceAlignmentCluster> seqClusters = new ArrayList<SequenceAlignmentCluster>();
041        private int nucleicAcidChainCount = 0;
042        private boolean modified = true;
043
044        public ProteinSequenceClusterer(Structure structure, QuatSymmetryParameters parameters) {
045                this.structure = structure;
046                this.parameters = parameters;
047        }
048
049        public ProteinSequenceClusterer(Structure structure1, Structure structure2,  QuatSymmetryParameters parameters) {
050                this.structure = structure1;
051                this.structure2 = structure2;
052                this.parameters = parameters;
053        }
054
055        public List<SequenceAlignmentCluster> getSequenceAlignmentClusters() {
056                run();
057                return seqClusters;
058        }
059
060        public int getProteinChainCount() {
061                run();
062                return sequences.size();
063        }
064
065        /**
066         * @return the nucleicAcidChainCount
067         */
068        public int getNucleicAcidChainCount() {
069                run();
070                return nucleicAcidChainCount;
071        }
072
073        public static void sortSequenceClustersBySize(List<SequenceAlignmentCluster> clusters) {
074                Collections.sort(clusters, new Comparator<SequenceAlignmentCluster>() {
075                        @Override
076                        public int compare(SequenceAlignmentCluster c1, SequenceAlignmentCluster c2) {
077                                int sign = Math.round(Math.signum(c2.getSequenceCount() - c1.getSequenceCount()));
078                                if (sign != 0) {
079                                        return sign;
080                                }
081                                return Math.round(Math.signum(c2.getSequenceAlignmentLength() - c1.getSequenceAlignmentLength()));
082                        }
083                });
084        }
085
086        private void run() {
087                if (modified) {
088                        extractProteinChains();
089                        clusterChains();
090                        modified = false;
091                }
092        }
093        /**
094         * Populate all fields. If two structres are give, concatenate their chains.
095         */
096        private void extractProteinChains() {
097                ProteinChainExtractor extractor = new ProteinChainExtractor(structure,  parameters);
098                caUnaligned = extractor.getCalphaTraces();
099                chainIds  = extractor.getChainIds();
100                sequences = extractor.getSequences();
101                modelNumbers = extractor.getModelNumbers();
102                nucleicAcidChainCount = extractor.getNucleicAcidChainCount();
103
104                if (structure2 != null) {
105                        extractor = new ProteinChainExtractor(structure2,  parameters);
106                        caUnaligned.addAll(extractor.getCalphaTraces());
107                        chainIds.addAll(extractor.getChainIds());
108                        sequences.addAll(extractor.getSequences());
109                        modelNumbers.addAll(extractor.getModelNumbers());
110                }
111        }
112
113        /**
114         * Cluster chains based on their sequence. Initializes seqClusters to the set
115         * of non-identical sequences.
116         */
117        private void clusterChains() {
118                boolean[] processed = new boolean[caUnaligned.size()];
119                Arrays.fill(processed, false);
120
121                for (int i = 0; i < caUnaligned.size(); i++) {
122                        if (processed[i]) {
123                                continue;
124                        }
125                        processed[i] = true;
126                        // create new sequence cluster
127                        UniqueSequenceList seqList = new UniqueSequenceList(caUnaligned.get(i), chainIds.get(i), modelNumbers.get(i), 0, sequences.get(i));
128                        SequenceAlignmentCluster seqCluster = new SequenceAlignmentCluster(parameters);
129                        seqCluster.addUniqueSequenceList(seqList);
130                        seqClusters.add(seqCluster);
131
132                        for (int j = i + 1; j < caUnaligned.size(); j++) {
133                                if (processed[j]) {
134                                        continue;
135                                }
136                                // Mark any future identical sequences as processed
137                                for (SequenceAlignmentCluster c: seqClusters) {
138                                                if (c.identityMatch(caUnaligned.get(j), chainIds.get(j), modelNumbers.get(j), 0, sequences.get(j))) {
139                                                        processed[j] = true;
140                                                        //System.out.println("found identity match: " + i + " - " + j);
141                                                        break;
142                                                }
143                                }
144                        }
145                }
146                sortSequenceClustersBySize(seqClusters);
147        }
148}