001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.core.util;
022
023import org.slf4j.Logger;
024import org.slf4j.LoggerFactory;
025
026import java.util.*;
027
028
029
030/**
031 * An implementation of a single linkage clusterer
032 *
033 * See http://en.wikipedia.org/wiki/Single-linkage_clustering
034 *
035 * @author Jose Duarte
036 */
037public class SingleLinkageClusterer {
038
039        private static final Logger logger = LoggerFactory.getLogger(SingleLinkageClusterer.class);
040
041        private class LinkedPair {
042
043                private int first;
044                private int second;
045                private double closestDistance;
046
047                public LinkedPair(int first, int second, double minDistance) {
048                        this.first = first;
049                        this.second = second;
050                        this.closestDistance = minDistance;
051                }
052
053                public int getFirst() {
054                        return first;
055                }
056
057                public int getSecond() {
058                        return second;
059                }
060
061                public double getClosestDistance() {
062                        return closestDistance;
063                }
064
065                @Override
066                public String toString() {
067
068                        String closestDistStr = null;
069                        if (closestDistance==Double.MAX_VALUE) {
070                                closestDistStr = String.format("%6s", "inf");
071                        } else {
072                                closestDistStr = String.format(Locale.US, "%6.2f",closestDistance);
073                        }
074
075                        return "["+first+","+second+"-"+closestDistStr+"]";
076                }
077
078        }
079
080        private double[][] matrix;
081
082        private boolean isScoreMatrix;
083
084        private int numItems;
085
086        private LinkedPair[] dendrogram;
087
088        //private Set<Integer> toSkip;
089
090        private List<Integer> indicesToCheck;
091
092
093        /**
094         * Constructs a new SingleLinkageClusterer
095         * Subsequently use {@link #getDendrogram()} to get the full tree
096         * or {@link #getClusters(double)} to get the clusters at a certain cutoff in the tree
097         * Please note that the matrix will be altered during the clustering procedure. A copy must be
098         * made before by the user if needing to use the original matrix further.
099         * @param matrix the distance matrix with distance values in j>i half, all other values will be ignored
100         * @param isScoreMatrix if false the matrix will be considered a distance matrix: lower values (distances) mean closer objects,
101         * if true the matrix will be considered a score matrix: larger values (scores) mean closer objects
102         * @throws IllegalArgumentException if matrix not square
103         */
104        public SingleLinkageClusterer(double[][] matrix, boolean isScoreMatrix) {
105                this.matrix = matrix;
106                this.isScoreMatrix = isScoreMatrix;
107
108                if (matrix.length!=matrix[0].length) {
109                        throw new IllegalArgumentException("Distance matrix for clustering must be a square matrix");
110                }
111
112                this.numItems = matrix.length;
113
114        }
115
116        /**
117         * Get the full dendrogram (size n-1) result of the hierarchical clustering
118         * @return
119         */
120        public LinkedPair[] getDendrogram() {
121                if (dendrogram==null) {
122                        clusterIt();
123                }
124
125                return dendrogram;
126        }
127
128        /**
129         * Calculate the hierarchical clustering and store it in dendrogram array
130         * This is the naive implementation (o(n3)) of single linkage clustering as outlined in wikipedia:
131         * http://en.wikipedia.org/wiki/Single-linkage_clustering
132         */
133        private void clusterIt() {
134
135                dendrogram = new LinkedPair[numItems-1];
136
137
138                logger.debug("Initial matrix: \n{}", matrixToString());
139
140
141                for (int m=0;m<numItems-1;m++) {
142
143                        updateIndicesToCheck(m);
144                        LinkedPair pair = getClosestPair();
145                        merge(pair);
146                        dendrogram[m] = pair;
147
148                        //if (debug) {
149                        //      System.out.println("Matrix after iteration "+m+" (merged "+pair.getFirst()+","+pair.getSecond()+")");
150                        //      printMatrix();
151                        //}
152                }
153
154        }
155
156        /**
157         * Merge 2 rows/columns of the matrix by the linkage function (see {@link #link(double, double)}
158         * @param closestPair
159         */
160        private void merge(LinkedPair closestPair) {
161
162
163                int first = closestPair.getFirst();
164                int second = closestPair.getSecond();
165
166                for (int other=0;other<numItems;other++) {
167                        matrix[Math.min(first,other)][Math.max(first, other)] = link(getDistance(first, other), getDistance(second, other));
168                }
169
170        }
171
172        /**
173         * The linkage function: minimum of the 2 distances (i.e. single linkage clustering)
174         * @param d1
175         * @param d2
176         * @return
177         */
178        private double link(double d1, double d2) {
179                if (isScoreMatrix) {
180                        return Math.max(d1,d2);
181                } else {
182                        return Math.min(d1,d2);
183                }
184        }
185
186        private double getDistance(int first, int second) {
187                return matrix[Math.min(first, second)][Math.max(first, second)];
188        }
189
190        private void updateIndicesToCheck(int m) {
191
192                if (indicesToCheck==null) {
193                        indicesToCheck = new ArrayList<>(numItems);
194
195                        for (int i=0;i<numItems;i++) {
196                                indicesToCheck.add(i);
197                        }
198                }
199
200                if (m==0) return;
201
202                indicesToCheck.remove(new Integer(dendrogram[m-1].getFirst()));
203        }
204
205        private LinkedPair getClosestPair() {
206
207                LinkedPair closestPair = null;
208
209                if (isScoreMatrix) {
210                        double max = 0.0;
211                        for (int i:indicesToCheck) {
212
213                                for (int j:indicesToCheck) {
214                                        if (j<=i) continue;
215
216                                        if (matrix[i][j]>=max) {
217                                                max = matrix[i][j];
218                                                closestPair = new LinkedPair(i,j,max);
219                                        }
220
221                                }
222                        }
223                } else {
224                        double min = Double.MAX_VALUE;
225                        for (int i:indicesToCheck) {
226
227                                for (int j:indicesToCheck) {
228                                        if (j<=i) continue;
229
230                                        if (matrix[i][j]<=min) {
231                                                min = matrix[i][j];
232                                                closestPair = new LinkedPair(i,j,min);
233                                        }
234
235                                }
236                        }
237                }
238
239                return closestPair;
240        }
241
242        /**
243         * Get the clusters by cutting the dendrogram at given cutoff
244         * @param cutoff
245         * @return Map from cluster numbers to indices of the cluster members
246         */
247        public Map<Integer, Set<Integer>> getClusters(double cutoff) {
248
249                if (dendrogram==null) {
250                        clusterIt();
251                }
252
253                Map<Integer, Set<Integer>> clusters = new TreeMap<>();
254
255                int clusterId = 1;
256
257                for (int i=0;i<numItems-1;i++) {
258
259                        if (isWithinCutoff(i, cutoff)) {
260
261                                //int containingClusterId = getContainingCluster(clusters, dendrogram[i]);
262
263                                int firstClusterId = -1;
264                                int secondClusterId = -1;
265                                for (int cId:clusters.keySet()) {
266                                        Set<Integer> members = clusters.get(cId);
267
268                                        if (members.contains(dendrogram[i].getFirst())) {
269                                                firstClusterId = cId;
270                                        }
271                                        if (members.contains(dendrogram[i].getSecond())) {
272                                                secondClusterId = cId;
273                                        }
274                                }
275
276
277                                if (firstClusterId==-1 && secondClusterId==-1) {
278                                        // neither member is in a cluster yet, let's assign a new cluster and put them both in
279                                        Set<Integer> members = new TreeSet<>();
280                                        members.add(dendrogram[i].getFirst());
281                                        members.add(dendrogram[i].getSecond());
282                                        clusters.put(clusterId, members);
283                                        clusterId++;
284                                } else if (firstClusterId!=-1 && secondClusterId==-1) {
285                                        // first member was in firstClusterId already, we add second
286                                        clusters.get(firstClusterId).add(dendrogram[i].getSecond());
287                                } else if (secondClusterId!=-1 && firstClusterId==-1) {
288                                        // second member was in secondClusterId already, we add first
289                                        clusters.get(secondClusterId).add(dendrogram[i].getFirst());
290                                } else {
291                                        // both were in different clusters already
292                                        // we need to join them: necessarily one must be of size 1 and the other of size>=1
293                                        Set<Integer> firstCluster = clusters.get(firstClusterId);
294                                        Set<Integer> secondCluster = clusters.get(secondClusterId);
295                                        if (firstCluster.size()<secondCluster.size()) {
296                                                logger.debug("Joining cluster "+firstClusterId+" to cluster "+secondClusterId);
297                                                // we join first onto second
298                                                for (int member : firstCluster) {
299                                                        secondCluster.add(member);
300                                                }
301                                                clusters.remove(firstClusterId);
302                                        } else {
303                                                logger.debug("Joining cluster "+secondClusterId+" to cluster "+firstClusterId);
304                                                // we join second onto first
305                                                for (int member : secondCluster) {
306                                                        firstCluster.add(member);
307                                                }
308                                                clusters.remove(secondClusterId);
309                                        }
310                                }
311
312                                logger.debug("Within cutoff:     {}", dendrogram[i]);
313
314                        } else {
315
316                                logger.debug("Not within cutoff: {}", dendrogram[i]);
317
318                        }
319                }
320
321                // reassigning cluster numbers by creating a new map (there can be gaps in the numbering if cluster-joining happened)
322                Map<Integer,Set<Integer>> finalClusters = new TreeMap<>();
323                int newClusterId = 1;
324                for (int oldClusterId:clusters.keySet()) {
325                        finalClusters.put(newClusterId, clusters.get(oldClusterId));
326                        newClusterId++;
327                }
328
329                // anything not clustered is assigned to a singleton cluster (cluster with one member)
330                for (int i=0;i<numItems;i++) {
331                        boolean isAlreadyClustered = false;
332                        for (Set<Integer> cluster:finalClusters.values()) {
333                                if (cluster.contains(i)) {
334                                        isAlreadyClustered = true;
335                                        break;
336                                }
337                        }
338                        if (!isAlreadyClustered) {
339                                Set<Integer> members = new TreeSet<>();
340                                members.add(i);
341                                finalClusters.put(newClusterId, members);
342                                newClusterId++;
343                        }
344
345                }
346
347                logger.debug("Clusters: \n{}", clustersToString(finalClusters));
348
349                return finalClusters;
350        }
351
352        private boolean isWithinCutoff(int i, double cutoff) {
353                if (isScoreMatrix) {
354                        if (dendrogram[i].getClosestDistance()>cutoff) {
355                                return true;
356                        } else {
357                                return false;
358                        }
359                } else {
360                        if (dendrogram[i].getClosestDistance()<cutoff) {
361                                return true;
362                        } else {
363                                return false;
364                        }
365                }
366        }
367
368        private String clustersToString(Map<Integer,Set<Integer>> finalClusters) {
369                StringBuilder sb = new StringBuilder();
370                for (int cId:finalClusters.keySet()) {
371                        sb.append(cId).append(": ");
372                        for (int member:finalClusters.get(cId)) {
373                                sb.append(member).append(" ");
374                        }
375                        sb.append("\n");
376                }
377                return sb.toString();
378        }
379
380        private String matrixToString() {
381                StringBuilder sb = new StringBuilder();
382                for (int i=0;i<numItems;i++) {
383                        for (int j=0;j<numItems;j++) {
384                                if (i==j) {
385                                        sb.append(String.format("%6s ","x"));
386                                }
387                                else if (i<j) {
388                                        if (matrix[i][j]==Double.MAX_VALUE) sb.append(String.format("%6s ","inf"));
389                                        else sb.append(String.format(Locale.US, "%6.2f ",matrix[i][j]));
390                                }
391                                else {
392                                        if (matrix[j][i]==Double.MAX_VALUE) sb.append(String.format("%6s ","inf"));
393                                        else sb.append(String.format(Locale.US, "%6.2f ",matrix[j][i]));
394                                }
395                        }
396                        sb.append("\n");
397                }
398                sb.append("\n");
399                return sb.toString();
400        }
401
402}