/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.structure.symmetry.core;

import org.biojava.nbio.structure.Atom;
import org.biojava.nbio.structure.Structure;

import java.util.*;

/**
 * Groups the protein chains of one or two structures into
 * {@link SequenceAlignmentCluster}s.
 * <p>
 * Chains are obtained through {@link ProteinChainExtractor}. A new cluster is
 * started for every chain that no existing cluster accepts via
 * {@code identityMatch}. All work is done lazily on the first call to one of
 * the getters and is performed at most once per instance.
 */
public class ProteinSequenceClusterer {
    private final Structure structure;
    // null when the single-structure constructor was used
    private final Structure structure2;
    private final QuatSymmetryParameters parameters;

    private List<Atom[]> caUnaligned = new ArrayList<Atom[]>();
    private List<String> chainIds = new ArrayList<String>();
    private List<Integer> modelNumbers = new ArrayList<Integer>();
    private List<String> sequences = new ArrayList<String>();
    private List<SequenceAlignmentCluster> seqClusters = new ArrayList<SequenceAlignmentCluster>();
    private int nucleicAcidChainCount = 0;
    // true until the chains have been extracted and clustered successfully
    private boolean modified = true;

    /**
     * Creates a clusterer for the chains of a single structure.
     *
     * @param structure  the structure whose chains are clustered
     * @param parameters parameters handed to the chain extractor and to each cluster
     */
    public ProteinSequenceClusterer(Structure structure, QuatSymmetryParameters parameters) {
        this(structure, null, parameters);
    }

    /**
     * Creates a clusterer for the concatenated chains of two structures.
     *
     * @param structure1 the first structure
     * @param structure2 the second structure; when {@code null} only
     *                   {@code structure1} is used
     * @param parameters parameters handed to the chain extractor and to each cluster
     */
    public ProteinSequenceClusterer(Structure structure1, Structure structure2, QuatSymmetryParameters parameters) {
        this.structure = structure1;
        this.structure2 = structure2;
        this.parameters = parameters;
    }

    /**
     * @return the sequence clusters, ordered as by
     *         {@link #sortSequenceClustersBySize(List)}
     */
    public List<SequenceAlignmentCluster> getSequenceAlignmentClusters() {
        run();
        return seqClusters;
    }

    /**
     * @return the number of extracted sequences (summed over both structures
     *         when two were supplied)
     */
    public int getProteinChainCount() {
        run();
        return sequences.size();
    }

    /**
     * @return the nucleicAcidChainCount as reported for the first structure
     */
    public int getNucleicAcidChainCount() {
        run();
        return nucleicAcidChainCount;
    }

    /**
     * Sorts the given list in place: clusters with more sequences come first,
     * and clusters with the same sequence count are ordered by decreasing
     * sequence alignment length.
     *
     * @param clusters the list to sort; it is modified
     */
    public static void sortSequenceClustersBySize(List<SequenceAlignmentCluster> clusters) {
        Collections.sort(clusters, new Comparator<SequenceAlignmentCluster>() {
            @Override
            public int compare(SequenceAlignmentCluster c1, SequenceAlignmentCluster c2) {
                // c2 - c1 yields descending order
                int sign = Math.round(Math.signum(c2.getSequenceCount() - c1.getSequenceCount()));
                if (sign != 0) {
                    return sign;
                }
                return Math.round(Math.signum(c2.getSequenceAlignmentLength() - c1.getSequenceAlignmentLength()));
            }
        });
    }

    /**
     * Extracts and clusters the chains on first use; later calls do nothing.
     */
    private void run() {
        if (modified) {
            extractProteinChains();
            clusterChains();
            modified = false;
        }
    }

    /**
     * Populates the per-chain lists. If two structures are given, their chains
     * are concatenated, first structure first.
     */
    private void extractProteinChains() {
        ProteinChainExtractor extractor = new ProteinChainExtractor(structure, parameters);
        // Copy the returned lists: they are appended to below and must not
        // alias (or depend on the mutability of) the extractor's own lists.
        caUnaligned = new ArrayList<Atom[]>(extractor.getCalphaTraces());
        chainIds = new ArrayList<String>(extractor.getChainIds());
        sequences = new ArrayList<String>(extractor.getSequences());
        modelNumbers = new ArrayList<Integer>(extractor.getModelNumbers());
        nucleicAcidChainCount = extractor.getNucleicAcidChainCount();

        if (structure2 != null) {
            extractor = new ProteinChainExtractor(structure2, parameters);
            caUnaligned.addAll(extractor.getCalphaTraces());
            chainIds.addAll(extractor.getChainIds());
            sequences.addAll(extractor.getSequences());
            modelNumbers.addAll(extractor.getModelNumbers());
            // NOTE(review): the nucleic acid chain count of structure2 is not
            // added to nucleicAcidChainCount - confirm this is intended.
        }
    }

    /**
     * Clusters the extracted chains and stores the sorted result in
     * {@code seqClusters}.
     * <p>
     * Each chain not yet claimed starts a new cluster; every later unclaimed
     * chain is then offered to the clusters built so far and is marked as
     * claimed by the first one whose {@code identityMatch} returns true.
     */
    private void clusterChains() {
        // Build into a local list and publish it only once complete, so that a
        // failure part-way through cannot leave partial clusters behind that a
        // subsequent run() would append to.
        List<SequenceAlignmentCluster> clusters = new ArrayList<SequenceAlignmentCluster>();
        // a freshly allocated boolean[] is already all false
        boolean[] processed = new boolean[caUnaligned.size()];

        for (int i = 0; i < caUnaligned.size(); i++) {
            if (processed[i]) {
                continue;
            }
            processed[i] = true;
            // create new sequence cluster seeded with chain i
            UniqueSequenceList seqList = new UniqueSequenceList(caUnaligned.get(i), chainIds.get(i), modelNumbers.get(i), 0, sequences.get(i));
            SequenceAlignmentCluster seqCluster = new SequenceAlignmentCluster(parameters);
            seqCluster.addUniqueSequenceList(seqList);
            clusters.add(seqCluster);

            for (int j = i + 1; j < caUnaligned.size(); j++) {
                if (processed[j]) {
                    continue;
                }
                // Mark any future identical sequences as processed
                for (SequenceAlignmentCluster c : clusters) {
                    if (c.identityMatch(caUnaligned.get(j), chainIds.get(j), modelNumbers.get(j), 0, sequences.get(j))) {
                        processed[j] = true;
                        break;
                    }
                }
            }
        }
        sortSequenceClustersBySize(clusters);
        seqClusters = clusters;
    }
}