001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.core.util; 022 023import org.slf4j.Logger; 024import org.slf4j.LoggerFactory; 025 026import java.util.*; 027 028 029 030/** 031 * An implementation of a single linkage clusterer 032 * 033 * See http://en.wikipedia.org/wiki/Single-linkage_clustering 034 * 035 * @author Jose Duarte 036 */ 037public class SingleLinkageClusterer { 038 039 private static final Logger logger = LoggerFactory.getLogger(SingleLinkageClusterer.class); 040 041 private class LinkedPair { 042 043 private int first; 044 private int second; 045 private double closestDistance; 046 047 public LinkedPair(int first, int second, double minDistance) { 048 this.first = first; 049 this.second = second; 050 this.closestDistance = minDistance; 051 } 052 053 public int getFirst() { 054 return first; 055 } 056 057 public int getSecond() { 058 return second; 059 } 060 061 public double getClosestDistance() { 062 return closestDistance; 063 } 064 065 @Override 066 public String toString() { 067 068 String closestDistStr = null; 069 if (closestDistance==Double.MAX_VALUE) { 070 closestDistStr = String.format("%6s", "inf"); 071 } else { 072 closestDistStr = String.format(Locale.US, "%6.2f",closestDistance); 073 } 074 075 return "["+first+","+second+"-"+closestDistStr+"]"; 076 } 077 078 } 079 080 private double[][] matrix; 081 082 private boolean isScoreMatrix; 083 084 private int numItems; 085 086 private LinkedPair[] dendrogram; 087 088 //private Set<Integer> toSkip; 089 090 private ArrayList<Integer> indicesToCheck; 091 092 093 /** 094 * Constructs a new SingleLinkageClusterer 095 * Subsequently use {@link #getDendrogram()} to get the full tree 096 * or {@link #getClusters(double)} to get the clusters at a certain cutoff in the tree 097 * Please note that the matrix will be altered during the clustering procedure. A copy must be 098 * made before by the user if needing to use the original matrix further. 099 * @param matrix the distance matrix with distance values in j>i half, all other values will be ignored 100 * @param isScoreMatrix if false the matrix will be considered a distance matrix: lower values (distances) mean closer objects, 101 * if true the matrix will be considered a score matrix: larger values (scores) mean closer objects 102 * @throws IllegalArgumentException if matrix not square 103 */ 104 public SingleLinkageClusterer(double[][] matrix, boolean isScoreMatrix) { 105 this.matrix = matrix; 106 this.isScoreMatrix = isScoreMatrix; 107 108 if (matrix.length!=matrix[0].length) { 109 throw new IllegalArgumentException("Distance matrix for clustering must be a square matrix"); 110 } 111 112 this.numItems = matrix.length; 113 114 } 115 116 /** 117 * Get the full dendrogram (size n-1) result of the hierarchical clustering 118 * @return 119 */ 120 public LinkedPair[] getDendrogram() { 121 if (dendrogram==null) { 122 clusterIt(); 123 } 124 125 return dendrogram; 126 } 127 128 /** 129 * Calculate the hierarchical clustering and store it in dendrogram array 130 * This is the naive implementation (o(n3)) of single linkage clustering as outlined in wikipedia: 131 * http://en.wikipedia.org/wiki/Single-linkage_clustering 132 */ 133 private void clusterIt() { 134 135 dendrogram = new LinkedPair[numItems-1]; 136 137 138 logger.debug("Initial matrix: \n"+matrixToString()); 139 140 141 for (int m=0;m<numItems-1;m++) { 142 143 updateIndicesToCheck(m); 144 LinkedPair pair = getClosestPair(); 145 merge(pair); 146 dendrogram[m] = pair; 147 148 //if (debug) { 149 // System.out.println("Matrix after iteration "+m+" (merged "+pair.getFirst()+","+pair.getSecond()+")"); 150 // printMatrix(); 151 //} 152 } 153 154 } 155 156 /** 157 * Merge 2 rows/columns of the matrix by the linkage function (see {@link #link(double, double)} 158 * @param closestPair 159 */ 160 private void merge(LinkedPair closestPair) { 161 162 163 int first = closestPair.getFirst(); 164 int second = closestPair.getSecond(); 165 166 for (int other=0;other<numItems;other++) { 167 matrix[Math.min(first,other)][Math.max(first, other)] = link(getDistance(first, other), getDistance(second, other)); 168 } 169 170 } 171 172 /** 173 * The linkage function: minimum of the 2 distances (i.e. single linkage clustering) 174 * @param d1 175 * @param d2 176 * @return 177 */ 178 private double link(double d1, double d2) { 179 if (isScoreMatrix) { 180 return Math.max(d1,d2); 181 } else { 182 return Math.min(d1,d2); 183 } 184 } 185 186 private double getDistance(int first, int second) { 187 return matrix[Math.min(first, second)][Math.max(first, second)]; 188 } 189 190 private void updateIndicesToCheck(int m) { 191 192 if (indicesToCheck==null) { 193 indicesToCheck = new ArrayList<Integer>(numItems); 194 195 for (int i=0;i<numItems;i++) { 196 indicesToCheck.add(i); 197 } 198 } 199 200 if (m==0) return; 201 202 indicesToCheck.remove(new Integer(dendrogram[m-1].getFirst())); 203 } 204 205 private LinkedPair getClosestPair() { 206 207 LinkedPair closestPair = null; 208 209 if (isScoreMatrix) { 210 double max = 0.0; 211 for (int i:indicesToCheck) { 212 213 for (int j:indicesToCheck) { 214 if (j<=i) continue; 215 216 if (matrix[i][j]>=max) { 217 max = matrix[i][j]; 218 closestPair = new LinkedPair(i,j,max); 219 } 220 221 } 222 } 223 } else { 224 double min = Double.MAX_VALUE; 225 for (int i:indicesToCheck) { 226 227 for (int j:indicesToCheck) { 228 if (j<=i) continue; 229 230 if (matrix[i][j]<=min) { 231 min = matrix[i][j]; 232 closestPair = new LinkedPair(i,j,min); 233 } 234 235 } 236 } 237 } 238 239 return closestPair; 240 } 241 242 /** 243 * Get the clusters by cutting the dendrogram at given cutoff 244 * @param cutoff 245 * @return Map from cluster numbers to indices of the cluster members 246 */ 247 public Map<Integer, Set<Integer>> getClusters(double cutoff) { 248 249 if (dendrogram==null) { 250 clusterIt(); 251 } 252 253 Map<Integer, Set<Integer>> clusters = new TreeMap<Integer, Set<Integer>>(); 254 255 int clusterId = 1; 256 257 for (int i=0;i<numItems-1;i++) { 258 259 if (isWithinCutoff(i, cutoff)) { 260 261 //int containingClusterId = getContainingCluster(clusters, dendrogram[i]); 262 263 int firstClusterId = -1; 264 int secondClusterId = -1; 265 for (int cId:clusters.keySet()) { 266 Set<Integer> members = clusters.get(cId); 267 268 if (members.contains(dendrogram[i].getFirst())) { 269 firstClusterId = cId; 270 } 271 if (members.contains(dendrogram[i].getSecond())) { 272 secondClusterId = cId; 273 } 274 } 275 276 277 if (firstClusterId==-1 && secondClusterId==-1) { 278 // neither member is in a cluster yet, let's assign a new cluster and put them both in 279 Set<Integer> members = new TreeSet<Integer>(); 280 members.add(dendrogram[i].getFirst()); 281 members.add(dendrogram[i].getSecond()); 282 clusters.put(clusterId, members); 283 clusterId++; 284 } else if (firstClusterId!=-1 && secondClusterId==-1) { 285 // first member was in firstClusterId already, we add second 286 clusters.get(firstClusterId).add(dendrogram[i].getSecond()); 287 } else if (secondClusterId!=-1 && firstClusterId==-1) { 288 // second member was in secondClusterId already, we add first 289 clusters.get(secondClusterId).add(dendrogram[i].getFirst()); 290 } else { 291 // both were in different clusters already 292 // we need to join them: necessarily one must be of size 1 and the other of size>=1 293 Set<Integer> firstCluster = clusters.get(firstClusterId); 294 Set<Integer> secondCluster = clusters.get(secondClusterId); 295 if (firstCluster.size()<secondCluster.size()) { 296 logger.debug("Joining cluster "+firstClusterId+" to cluster "+secondClusterId); 297 // we join first onto second 298 for (int member : firstCluster) { 299 secondCluster.add(member); 300 } 301 clusters.remove(firstClusterId); 302 } else { 303 logger.debug("Joining cluster "+secondClusterId+" to cluster "+firstClusterId); 304 // we join second onto first 305 for (int member : secondCluster) { 306 firstCluster.add(member); 307 } 308 clusters.remove(secondClusterId); 309 } 310 } 311 312 logger.debug("Within cutoff: "+dendrogram[i]); 313 314 } else { 315 316 logger.debug("Not within cutoff: "+dendrogram[i]); 317 318 } 319 } 320 321 // reassigning cluster numbers by creating a new map (there can be gaps in the numbering if cluster-joining happened) 322 Map<Integer,Set<Integer>> finalClusters = new TreeMap<Integer, Set<Integer>>(); 323 int newClusterId = 1; 324 for (int oldClusterId:clusters.keySet()) { 325 finalClusters.put(newClusterId, clusters.get(oldClusterId)); 326 newClusterId++; 327 } 328 329 // anything not clustered is assigned to a singleton cluster (cluster with one member) 330 for (int i=0;i<numItems;i++) { 331 boolean isAlreadyClustered = false; 332 for (Set<Integer> cluster:finalClusters.values()) { 333 if (cluster.contains(i)) { 334 isAlreadyClustered = true; 335 break; 336 } 337 } 338 if (!isAlreadyClustered) { 339 Set<Integer> members = new TreeSet<Integer>(); 340 members.add(i); 341 finalClusters.put(newClusterId, members); 342 newClusterId++; 343 } 344 345 } 346 347 logger.debug("Clusters: \n"+clustersToString(finalClusters)); 348 349 return finalClusters; 350 } 351 352 private boolean isWithinCutoff(int i, double cutoff) { 353 if (isScoreMatrix) { 354 if (dendrogram[i].getClosestDistance()>cutoff) { 355 return true; 356 } else { 357 return false; 358 } 359 } else { 360 if (dendrogram[i].getClosestDistance()<cutoff) { 361 return true; 362 } else { 363 return false; 364 } 365 } 366 } 367 368 private String clustersToString(Map<Integer,Set<Integer>> finalClusters) { 369 StringBuilder sb = new StringBuilder(); 370 for (int cId:finalClusters.keySet()) { 371 sb.append(cId).append(": "); 372 for (int member:finalClusters.get(cId)) { 373 sb.append(member).append(" "); 374 } 375 sb.append("\n"); 376 } 377 return sb.toString(); 378 } 379 380 private String matrixToString() { 381 StringBuilder sb = new StringBuilder(); 382 for (int i=0;i<numItems;i++) { 383 for (int j=0;j<numItems;j++) { 384 if (i==j) { 385 sb.append(String.format("%6s ","x")); 386 } 387 else if (i<j) { 388 if (matrix[i][j]==Double.MAX_VALUE) sb.append(String.format("%6s ","inf")); 389 else sb.append(String.format(Locale.US, "%6.2f ",matrix[i][j])); 390 } 391 else { 392 if (matrix[j][i]==Double.MAX_VALUE) sb.append(String.format("%6s ","inf")); 393 else sb.append(String.format(Locale.US, "%6.2f ",matrix[j][i])); 394 } 395 } 396 sb.append("\n"); 397 } 398 sb.append("\n"); 399 return sb.toString(); 400 } 401 402}