001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.structure.contact; 022 023import java.io.Serializable; 024import java.util.ArrayList; 025import java.util.Collections; 026import java.util.Iterator; 027import java.util.List; 028import java.util.Map; 029import java.util.Optional; 030import java.util.Set; 031import java.util.TreeMap; 032 033import org.biojava.nbio.core.util.SingleLinkageClusterer; 034import org.biojava.nbio.structure.Atom; 035import org.biojava.nbio.structure.Structure; 036import org.biojava.nbio.structure.asa.AsaCalculator; 037import org.biojava.nbio.structure.xtal.CrystalBuilder; 038import org.slf4j.Logger; 039import org.slf4j.LoggerFactory; 040 041 042/** 043 * A list of interfaces between 2 molecules (2 sets of atoms) 044 * 045 * @author Jose Duarte 046 * 047 */ 048public class StructureInterfaceList implements Serializable, Iterable<StructureInterface> { 049 050 private static final Logger logger = LoggerFactory.getLogger(StructureInterfaceList.class); 051 052 /** 053 * Default minimum area for a contact between two chains to be considered a 054 * valid interface. 055 * @see #removeInterfacesBelowArea(double); 056 */ 057 public static final double DEFAULT_MINIMUM_INTERFACE_AREA = 35.0; 058 /** 059 * Default number of points to use when calculating ASAs 060 * @see #calcAsas(int, int, int) 061 */ 062 public static final int DEFAULT_ASA_SPHERE_POINTS = 3000; 063 /** 064 * Default minimum size of cofactor molecule (non-chain HET atoms) that will be used 065 * @see #calcAsas(int, int, int) 066 */ 067 public static final int DEFAULT_MIN_COFACTOR_SIZE = 40; 068 069 /** 070 * Any 2 interfaces with contact overlap score larger than this value 071 * will be considered to be clustered 072 */ 073 public static final double DEFAULT_CONTACT_OVERLAP_SCORE_CLUSTER_CUTOFF = 0.2; 074 075 private static final long serialVersionUID = 1L; 076 077 private final List<StructureInterface> list; 078 079 private List<StructureInterfaceCluster> clusters = null; 080 private List<StructureInterfaceCluster> clustersNcs = null; 081 082 private Map<String, String> chainOrigNamesMap; 083 084 public StructureInterfaceList() { 085 this.list = new ArrayList<>(); 086 } 087 088 public void add(StructureInterface interf) { 089 this.list.add(interf); 090 } 091 092 public int size() { 093 return this.list.size(); 094 } 095 096 public List<StructureInterface> getList() { 097 return list; 098 } 099 100 /** 101 * Gets the interface corresponding to given id. 102 * The ids go from 1 to n 103 * If {@link #sort()} was called then the order is descendent by area. 104 * @param id 105 * @return 106 */ 107 public StructureInterface get(int id) { 108 return list.get(id-1); 109 } 110 111 /** 112 * Calculates ASAs for all interfaces in list, both for the unbound 113 * chains and for the complex of the two chains together. 114 * Also sorts the interfaces based on calculated BSA areas (descending). 115 * 116 * <p>Uses default parameters 117 */ 118 public void calcAsas() { 119 calcAsas( DEFAULT_ASA_SPHERE_POINTS, 120 Runtime.getRuntime().availableProcessors(), 121 DEFAULT_MIN_COFACTOR_SIZE ); 122 } 123 /** 124 * Calculates ASAs for all interfaces in list, both for the unbound 125 * chains and for the complex of the two chains together. 126 * Also sorts the interfaces based on calculated BSA areas (descending) 127 * @param nSpherePoints 128 * @param nThreads 129 * @param cofactorSizeToUse the minimum size of cofactor molecule (non-chain HET atoms) that will be used 130 */ 131 public void calcAsas(int nSpherePoints, int nThreads, int cofactorSizeToUse) { 132 133 // asa/bsa calculation 134 // NOTE in principle it is more efficient to calculate asas only once per unique chain 135 // BUT! the rolling ball algorithm gives slightly different values for same molecule in different 136 // rotations (due to sampling depending on orientation of axes grid). 137 // Both NACCESS and our own implementation behave like that. 138 // That's why we calculate ASAs for each rotation-unique molecule, otherwise 139 // we get discrepancies (not very big but annoying) which lead to things like negative (small) bsa values 140 141 142 Map<String, Atom[]> uniqAsaChains = new TreeMap<>(); 143 Map<String, double[]> chainAsas = new TreeMap<>(); 144 145 List<StructureInterface> redundancyReducedList; 146 if (clustersNcs != null) { 147 redundancyReducedList = new ArrayList<>(); 148 for (StructureInterfaceCluster ncsCluster : clustersNcs) { 149 // we use the first one in list as the only one for which we calculate ASAs 150 redundancyReducedList.add(ncsCluster.getMembers().get(0)); 151 } 152 153 } else { 154 redundancyReducedList = list; 155 } 156 157 // first we gather rotation-unique chains (in terms of AU id and transform id) 158 for (StructureInterface interf:redundancyReducedList) { 159 String molecId1 = interf.getMoleculeIds().getFirst()+interf.getTransforms().getFirst().getTransformId(); 160 String molecId2 = interf.getMoleculeIds().getSecond()+interf.getTransforms().getSecond().getTransformId(); 161 162 uniqAsaChains.put(molecId1, interf.getFirstAtomsForAsa(cofactorSizeToUse)); 163 uniqAsaChains.put(molecId2, interf.getSecondAtomsForAsa(cofactorSizeToUse)); 164 } 165 166 logger.debug("Will calculate uncomplexed ASA for {} orientation-unique chains.", uniqAsaChains.size()); 167 168 long start = System.currentTimeMillis(); 169 170 // we only need to calculate ASA for that subset (any translation of those will have same values) 171 for (String molecId:uniqAsaChains.keySet()) { 172 173 logger.debug("Calculating uncomplexed ASA for molecId {}, with {} atoms", molecId, uniqAsaChains.get(molecId).length); 174 175 AsaCalculator asaCalc = new AsaCalculator(uniqAsaChains.get(molecId), 176 AsaCalculator.DEFAULT_PROBE_SIZE, nSpherePoints, nThreads); 177 178 double[] atomAsas = asaCalc.calculateAsas(); 179 180 chainAsas.put(molecId, atomAsas); 181 182 } 183 long end = System.currentTimeMillis(); 184 185 logger.debug("Calculated uncomplexed ASA for {} orientation-unique chains. Time: {} s", uniqAsaChains.size(), ((end-start)/1000.0)); 186 187 logger.debug ("Will calculate complexed ASA for {} pairwise complexes.", redundancyReducedList.size()); 188 189 start = System.currentTimeMillis(); 190 191 // now we calculate the ASAs for each of the complexes 192 for (StructureInterface interf:redundancyReducedList) { 193 194 String molecId1 = interf.getMoleculeIds().getFirst()+interf.getTransforms().getFirst().getTransformId(); 195 String molecId2 = interf.getMoleculeIds().getSecond()+interf.getTransforms().getSecond().getTransformId(); 196 197 logger.debug("Calculating complexed ASAs for interface {} between molecules {} and {}", interf.getId(), molecId1, molecId2); 198 199 interf.setAsas(chainAsas.get(molecId1), chainAsas.get(molecId2), nSpherePoints, nThreads, cofactorSizeToUse); 200 201 } 202 end = System.currentTimeMillis(); 203 204 logger.debug("Calculated complexes ASA for {} pairwise complexes. Time: {} s", redundancyReducedList.size(), ((end-start)/1000.0)); 205 206 // now let's populate the interface area value for the NCS-redundant ones from the reference interface (first one in list) 207 if (clustersNcs!=null) { 208 if (chainOrigNamesMap==null) { 209 logger.warn("No chainOrigNamesMap is set. Considering NCS interfaces in same order as reference. This is likely a bug."); 210 } 211 for (StructureInterfaceCluster ncsCluster : clustersNcs) { 212 StructureInterface refInterf = ncsCluster.getMembers().get(0); 213 String refMolecId1 = refInterf.getMoleculeIds().getFirst(); 214 for (int i=1;i<ncsCluster.getMembers().size();i++) { 215 StructureInterface member = ncsCluster.getMembers().get(i); 216 member.setTotalArea(refInterf.getTotalArea()); 217 String molecId1 = member.getMoleculeIds().getFirst(); 218 if (areMolecIdsSameOrder(refMolecId1, molecId1)) { 219 // we add the reference interface GroupAsas as the GroupAsas for all other members, like that 220 // ResidueNumbers won't match in their chain ids, but otherwise all info is there without using a lot of memory 221 member.setFirstGroupAsas(refInterf.getFirstGroupAsas()); 222 member.setSecondGroupAsas(refInterf.getSecondGroupAsas()); 223 } else { 224 member.setFirstGroupAsas(refInterf.getSecondGroupAsas()); 225 member.setSecondGroupAsas(refInterf.getFirstGroupAsas()); 226 } 227 } 228 } 229 } 230 231 // finally we sort based on the ChainInterface.comparable() (based in interfaceArea) 232 sort(); 233 } 234 235 private boolean areMolecIdsSameOrder(String refMolecId, String molecId) { 236 237 if (chainOrigNamesMap==null) { 238 // we've already warned above 239 return true; 240 } 241 242 String refMolecIdOrig = chainOrigNamesMap.get(refMolecId); 243 String molecIdOrig = chainOrigNamesMap.get(molecId); 244 245 return (refMolecIdOrig.equals(molecIdOrig)); 246 } 247 248 /** 249 * Sorts the interface list and reassigns ids based on new sorting 250 * 251 */ 252 public void sort() { 253 Collections.sort(list); 254 int i=1; 255 for (StructureInterface interf:list) { 256 interf.setId(i); 257 i++; 258 } 259 } 260 261 /** 262 * Get the interface clusters for this StructureInterfaceList grouped by NCS-equivalence. 263 * This means that for any two interfaces in the same cluster: 264 * 1. The chains forming the first interface are NCS-copies of the chains forming the second interface, in any order. 265 * 2. Relative orientation of the chains is preserved, i.e. the contacts are identical. 266 * @return list of {@link StructureInterfaceCluster} objects. 267 * @since 5.0.0 268 */ 269 public List<StructureInterfaceCluster> getClustersNcs() { 270 return clustersNcs; 271 } 272 273 /** 274 * Add an interface to the list, possibly defining it as NCS-equivalent to an interface already in the list. 275 * Used to build up the NCS clustering. 276 * @param interfaceNew 277 * an interface to be added to the list. 278 * @param interfaceRef 279 * interfaceNew will be added to the cluster which contains interfaceRef. 280 * If interfaceRef is null, new cluster will be created for interfaceNew. 281 * @since 5.0.0 282 */ 283 public void addNcsEquivalent(StructureInterface interfaceNew, StructureInterface interfaceRef) { 284 285 this.add(interfaceNew); 286 if (clustersNcs == null) { 287 clustersNcs = new ArrayList<>(); 288 } 289 290 if (interfaceRef == null) { 291 StructureInterfaceCluster newCluster = new StructureInterfaceCluster(); 292 newCluster.addMember(interfaceNew); 293 clustersNcs.add(newCluster); 294 return; 295 } 296 297 Optional<StructureInterfaceCluster> clusterRef = 298 clustersNcs.stream(). 299 filter(r->r.getMembers().stream(). 300 anyMatch(c -> c.equals(interfaceRef))). 301 findFirst(); 302 303 if (clusterRef.isPresent()) { 304 clusterRef.get().addMember(interfaceNew); 305 return; 306 } 307 308 logger.warn("The specified reference interface, if not null, should have been added to this set previously. " + 309 "Creating new cluster and adding both interfaces. This is likely a bug."); 310 this.add(interfaceRef); 311 StructureInterfaceCluster newCluster = new StructureInterfaceCluster(); 312 newCluster.addMember(interfaceRef); 313 newCluster.addMember(interfaceNew); 314 clustersNcs.add(newCluster); 315 } 316 317 /** 318 * Sets a map with mapping from NCS chain names to original chain names. 319 * Necessary when {@link #addNcsEquivalent(StructureInterface, StructureInterface)} is used and NCS equivalent 320 * interfaces exist in this list and their names need mapping when setting ASAs. 321 * @param chainOrigNamesMap a map of NCS chain name to original chain name 322 */ 323 public void setChainOrigNamesMap(Map<String, String> chainOrigNamesMap) { 324 this.chainOrigNamesMap = chainOrigNamesMap; 325 } 326 327 /** 328 * Removes from this interface list all interfaces with areas 329 * below the default cutoff area. 330 * Note that this must be called after {@link #calcAsas(int, int, int)}, otherwise all areas would 331 * be 0 and thus all removed. 332 * @see #DEFAULT_MINIMUM_INTERFACE_AREA 333 */ 334 public void removeInterfacesBelowArea() { 335 removeInterfacesBelowArea(DEFAULT_MINIMUM_INTERFACE_AREA); 336 } 337 338 /** 339 * Removes from this interface list all interfaces with areas 340 * below the given cutoff area. 341 * Note that this must be called after {@link #calcAsas(int, int, int)}, otherwise all areas would 342 * be 0 and thus all removed. 343 * @param area the minimum interface buried surface area to keep. Interfaces below this value will be removed. 344 */ 345 public void removeInterfacesBelowArea(double area) { 346 347 list.removeIf(interf -> interf.getTotalArea() < area); 348 349 if (clustersNcs != null) { 350 clustersNcs.removeIf(ncsCluster -> ncsCluster.getMembers().get(0).getTotalArea() < area); 351 } 352 } 353 354 /** 355 * Calculate the interface clusters for this StructureInterfaceList 356 * using a contact overlap score to measure the similarity of interfaces. 357 * Subsequent calls will use the cached value without recomputing the clusters. 358 * The contact overlap score cutoff to consider a pair in the same cluster is 359 * the value {@link #DEFAULT_CONTACT_OVERLAP_SCORE_CLUSTER_CUTOFF} 360 * @return 361 */ 362 public List<StructureInterfaceCluster> getClusters() { 363 return getClusters(DEFAULT_CONTACT_OVERLAP_SCORE_CLUSTER_CUTOFF); 364 } 365 366 /** 367 * Calculate the interface clusters for this StructureInterfaceList 368 * using Jaccard contact set scores to measure the similarity of interfaces. 369 * Subsequent calls will use the cached value without recomputing the clusters. 370 * The clusters will be assigned ids by sorting descending by {@link StructureInterfaceCluster#getTotalArea()} 371 * @param contactOverlapScoreClusterCutoff the contact overlap score above which a pair will be 372 * clustered 373 * @return 374 */ 375 public List<StructureInterfaceCluster> getClusters(double contactOverlapScoreClusterCutoff) { 376 if (clusters!=null) { 377 return clusters; 378 } 379 380 clusters = new ArrayList<>(); 381 382 // nothing to do if we have no interfaces 383 if (list.size()==0) return clusters; 384 385 logger.debug("Calculating all-vs-all Jaccard scores for {} interfaces", list.size()); 386 double[][] matrix = new double[list.size()][list.size()]; 387 388 for (int i=0;i<list.size();i++) { 389 for (int j=i+1;j<list.size();j++) { 390 StructureInterface iInterf = list.get(i); 391 StructureInterface jInterf = list.get(j); 392 393 double scoreDirect = iInterf.getContactOverlapScore(jInterf, false); 394 double scoreInvert = iInterf.getContactOverlapScore(jInterf, true); 395 396 double maxScore = Math.max(scoreDirect, scoreInvert); 397 398 matrix[i][j] = maxScore; 399 } 400 401 } 402 403 logger.debug("Will now cluster {} interfaces based on full all-vs-all Jaccard scores matrix", list.size()); 404 SingleLinkageClusterer slc = new SingleLinkageClusterer(matrix, true); 405 406 Map<Integer, Set<Integer>> clusteredIndices = slc.getClusters(contactOverlapScoreClusterCutoff); 407 for (int clusterIdx:clusteredIndices.keySet()) { 408 List<StructureInterface> members = new ArrayList<>(); 409 for (int idx:clusteredIndices.get(clusterIdx)) { 410 members.add(list.get(idx)); 411 } 412 StructureInterfaceCluster cluster = new StructureInterfaceCluster(); 413 cluster.setMembers(members); 414 double averageScore = 0.0; 415 int countPairs = 0; 416 for (int i=0;i<members.size();i++) { 417 int iIdx = list.indexOf(members.get(i)); 418 for (int j=i+1;j<members.size();j++) { 419 averageScore += matrix[iIdx][list.indexOf(members.get(j))]; 420 countPairs++; 421 } 422 } 423 if (countPairs>0) { 424 averageScore = averageScore/countPairs; 425 } else { 426 // if only one interface in cluster we set the score to the maximum 427 averageScore = 1.0; 428 } 429 cluster.setAverageScore(averageScore); 430 clusters.add(cluster); 431 } 432 433 // finally we have to set the back-references in each StructureInterface 434 for (StructureInterfaceCluster cluster:clusters) { 435 for (StructureInterface interf:cluster.getMembers()) { 436 interf.setCluster(cluster); 437 } 438 } 439 logger.debug("Done clustering {} interfaces based on full all-vs-all Jaccard scores matrix. Found a total of {} clusters", list.size(), clusters.size()); 440 441 // now we sort by areas (descending) and assign ids based on that sorting 442 clusters.sort((o1, o2) -> Double.compare(o2.getTotalArea(), o1.getTotalArea())); //note we invert so that sorting is descending 443 444 int id = 1; 445 for (StructureInterfaceCluster cluster:clusters) { 446 cluster.setId(id); 447 id++; 448 } 449 450 return clusters; 451 } 452 453 @Override 454 public Iterator<StructureInterface> iterator() { 455 return list.iterator(); 456 } 457 458 @Override 459 public String toString() { 460 return list.toString(); 461 } 462 463 /** 464 * Calculates the interfaces for a structure using default parameters 465 * @param struc 466 * @return 467 */ 468 public static StructureInterfaceList calculateInterfaces(Structure struc) { 469 CrystalBuilder builder = new CrystalBuilder(struc); 470 StructureInterfaceList interfaces = builder.getUniqueInterfaces(); 471 logger.debug("Calculating ASA for "+interfaces.size()+" potential interfaces"); 472 interfaces.calcAsas(StructureInterfaceList.DEFAULT_ASA_SPHERE_POINTS, //fewer for performance 473 Runtime.getRuntime().availableProcessors(), 474 StructureInterfaceList.DEFAULT_MIN_COFACTOR_SIZE); 475 interfaces.removeInterfacesBelowArea(); 476 interfaces.getClusters(); 477 logger.debug("Found "+interfaces.size()+" interfaces"); 478 return interfaces; 479 } 480 481}