Skip to content

Commit e42bdb5

Browse files
committed
extract stability clusters
1 parent 82ed4c0 commit e42bdb5

File tree

2 files changed

+296
-5
lines changed

2 files changed

+296
-5
lines changed

scripts/builtin/hdbscan.dml

Lines changed: 160 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ m_hdbscan = function(Matrix[Double] X, Integer minPts = 5, Integer minClSize = -
5858

5959
coreDistances = matrix(0, rows=n, cols=1)
6060
for(i in 1:n) {
61-
kthDist = computeKthSmallest(distances[i,], minPts)
61+
kthDist = computeKthSmallest(t(distances[i,]), minPts) # row slice is 1 x n; transpose to the column vector computeKthSmallest expects
6262
coreDistances[i] = kthDist
6363
}
6464

@@ -68,7 +68,8 @@ m_hdbscan = function(Matrix[Double] X, Integer minPts = 5, Integer minClSize = -
6868

6969
[hierarchy, clusterSizes] = buildHierarchy(mstEdges, mstWeights, n)
7070

71-
# TODO: get stable cluster with stability score
71+
[clusterMems, stabilities] = extractStableClusters(hierarchy, mstWeights, n, minClSize)
72+
7273
# TODO: build cluster model
7374

7475
# temp dummy values
@@ -184,9 +185,9 @@ union = function(Matrix[Double] parent, Matrix[Double] rank,
184185
buildHierarchy = function(Matrix[Double] edges, Matrix[Double] weights, Integer n)
185186
return (Matrix[Double] hierarchy, Matrix[Double] sizes)
186187
{
187-
# sort edges by weight in ascending order
188-
# to build the hierarchy from dense cores outward
189-
sorted = order(target=weights, by=1, decreasing=FALSE)
188+
# create indexed weights to preserve original positions after sorting
189+
indexedWeights = cbind(seq(1, nrow(weights)), weights)
190+
sorted = order(target=indexedWeights, by=2, decreasing=FALSE)
190191

191192
# parent[i] = i, meaning each point is its own parent in the beginning
192193
parent = seq(1, n)
@@ -241,3 +242,157 @@ buildHierarchy = function(Matrix[Double] edges, Matrix[Double] weights, Integer
241242
}
242243
}
243244
}
245+
246+
# Collect all original data points (leaf ids 1..n) contained in the
# subtree rooted at nodeId of the merge hierarchy.
#
# Node ids: 1..n are the original points (leaves); merge i creates the
# internal node with id n + i.
#
# INPUT:
#   hierarchy  (n-1) x 3 matrix; row i = [leftChild, rightChild, distance]
#   n          number of original data points
#   nodeId     dendrogram node whose leaf descendants are requested
# OUTPUT:
#   leaves     column vector of the leaf ids under nodeId
getLeafDescendants = function(Matrix[Double] hierarchy, Integer n, Integer nodeId)
  return (Matrix[Double] leaves)
{
  if(nodeId <= n) {
    # base case: nodeId is itself a data point
    leaves = matrix(nodeId, rows=1, cols=1)
  }
  else {
    # internal node: recurse into both children of merge (nodeId - n)
    row = nodeId - n
    childL = as.integer(as.scalar(hierarchy[row, 1]))
    childR = as.integer(as.scalar(hierarchy[row, 2]))
    leaves = rbind(getLeafDescendants(hierarchy, n, childL),
                   getLeafDescendants(hierarchy, n, childR))
  }
}
262+
263+
# Extract flat clusters from the single-linkage hierarchy by maximizing
# cluster stability (simplified HDBSCAN excess-of-mass style selection).
#
# Node ids: 1..n are the original points (leaves); merge i (1..n-1)
# creates internal node id n+i; the root is id 2n-1.
#
# INPUT:
#   hierarchy  (n-1) x 3 matrix; row i = [leftChild, rightChild, mergeDistance]
#   weights    MST edge weights
#              NOTE(review): currently unused -- distances are read from
#              hierarchy[,3] instead; confirm whether this parameter is needed
#   n          number of original data points
#   minClSize  minimum number of points for a node to qualify as a cluster
# OUTPUT:
#   labels       n x 1 vector; cluster id (1-based) per point, -1 = noise
#   stabilities  (2n-1) x 1 vector of per-node stability scores
extractStableClusters = function(Matrix[Double] hierarchy, Matrix[Double] weights,
    Integer n, Integer minClSize)
  return (Matrix[Double] labels, Matrix[Double] stabilities)
{
  numMerges = n - 1   # hierarchical tree over n points has exactly n-1 merge events
  numNodes = 2*n - 1  # total nodes in the dendrogram

  # convert merge distances to lambda (density) values: lambda = 1 / distance
  lambda = matrix(0, rows=numMerges, cols=1)
  for(i in 1:numMerges) {
    dist = as.scalar(hierarchy[i,3])
    if(dist > 0) {
      lambda[i,1] = 1.0 / dist
    } else {
      # zero distance means infinite density; use a large finite stand-in
      lambda[i,1] = 1e15
    }
  }

  lambda_birth = matrix(1e15, rows=numNodes, cols=1)
  lambda_death = matrix(0, rows=numNodes, cols=1)
  cluster_size = matrix(0, rows=numNodes, cols=1)

  # initialize the leaf nodes to have cluster size 1
  for(i in 1:n) {
    cluster_size[i,1] = 1
  }

  # walk the merges to record, for every node, the density at which it
  # appears (birth) and disappears into its parent (death), plus its size
  for(i in 1:numMerges) {
    left = as.integer(as.scalar(hierarchy[i,1]))
    right = as.integer(as.scalar(hierarchy[i,2]))
    newId = n + i
    merge_lambda = as.scalar(lambda[i,1])

    # cluster newId starts existing as its own cluster at this density level
    # and that's why the children get their death set at the same density
    lambda_birth[newId,1] = merge_lambda
    lambda_death[left,1] = merge_lambda
    lambda_death[right,1] = merge_lambda
    cluster_size[newId,1] = as.scalar(cluster_size[left,1]) + as.scalar(cluster_size[right,1])
  }

  # the root cluster exists all the way down to density 0
  rootId = 2*n - 1
  lambda_death[rootId,1] = 0

  # compute own stability for each internal node:
  # stability = size * (lambda_birth - lambda_death).
  # Only clusters of at least minClSize get a score; the longer-lived
  # (birth - death) and the larger the cluster, the more stable it is.
  stability = matrix(0, rows=numNodes, cols=1)
  for(nodeId in (n+1):numNodes) {
    size = as.scalar(cluster_size[nodeId,1])
    birth = as.scalar(lambda_birth[nodeId,1])
    death = as.scalar(lambda_death[nodeId,1])
    if(size >= minClSize) {
      stability[nodeId,1] = size * (birth - death)
    }
  }

  # compute subtree stability (best achievable from each subtree)
  subtree_stability = matrix(0, rows=numNodes, cols=1)

  # leaf nodes have 0 subtree stability
  for(i in 1:n) {
    subtree_stability[i,1] = 0
  }

  # process merges in order (bottom-up): children are always created
  # before their parent, so their subtree stability is already final
  for(i in 1:numMerges) {
    nodeId = n + i
    left = as.integer(as.scalar(hierarchy[i,1]))
    right = as.integer(as.scalar(hierarchy[i,2]))

    children_subtree = as.scalar(subtree_stability[left,1]) + as.scalar(subtree_stability[right,1])
    own_stab = as.scalar(stability[nodeId,1])

    # subtree stability is the best we can achieve from this subtree:
    # either keep this node as one cluster, or split into its children
    if(children_subtree > own_stab) {
      subtree_stability[nodeId,1] = children_subtree
    } else {
      subtree_stability[nodeId,1] = own_stab
    }
  }

  # select clusters top-down, starting from the root
  selected = matrix(0, rows=numNodes, cols=1)
  selected[rootId,1] = 1

  i = numMerges
  while(i >= 1) {
    nodeId = n + i

    if(as.scalar(selected[nodeId,1]) == 1) {
      left = as.integer(as.scalar(hierarchy[i,1]))
      right = as.integer(as.scalar(hierarchy[i,2]))

      children_subtree = as.scalar(subtree_stability[left,1]) + as.scalar(subtree_stability[right,1])
      own_stab = as.scalar(stability[nodeId,1])
      parent_size = as.scalar(cluster_size[nodeId,1])

      # replace this node by its children if they have higher combined
      # subtree stability, or if this node is too small to be a cluster
      if(parent_size < minClSize | children_subtree > own_stab) {
        selected[nodeId,1] = 0
        selected[left,1] = 1
        selected[right,1] = 1
      }
    }

    i = i - 1
  }

  # assign labels: every leaf under a selected, large-enough node gets
  # that cluster's id; all remaining points stay at -1 (noise)
  labels = matrix(-1, rows=n, cols=1)
  cluster_id = 1

  for(nodeId in 1:numNodes) {
    if(as.scalar(selected[nodeId,1]) == 1) {
      size = as.scalar(cluster_size[nodeId,1])

      if(size >= minClSize) {
        leaves = getLeafDescendants(hierarchy, n, nodeId)

        for(j in 1:nrow(leaves)) {
          leafId = as.integer(as.scalar(leaves[j,1]))
          if(leafId >= 1 & leafId <= n) {
            labels[leafId,1] = cluster_id
          }
        }

        cluster_id = cluster_id + 1
      }
    }
  }

  stabilities = stability
}

test_extract_stabe_clusters.dml

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
# Unit test for hdbscan.dml: verifies MST construction, hierarchy building,
# and stable-cluster extraction on a small hand-checked example.
source("scripts/builtin/hdbscan.dml") as hdb

# 6 point example with clear cluster structure:
# points 1,2,3 form tight cluster A, points 4,5,6 form tight cluster B,
# and A and B are far apart (distance 10)
n = 6
distances = matrix(10, rows=n, cols=n)

# points 1,2,3 (cluster A)
distances[1,2] = 1
distances[2,1] = 1

distances[1,3] = 2
distances[3,1] = 2

distances[2,3] = 1
distances[3,2] = 1

# points 4,5,6 (cluster B)
distances[4,5] = 1
distances[5,4] = 1

distances[4,6] = 2
distances[6,4] = 2

distances[5,6] = 1
distances[6,5] = 1

# zero diagonal (distance of each point to itself)
for(i in 1:n) {
  distances[i,i] = 0
}


print("\nBuilding MST")
expected_edges = matrix("2 1 3 2 6 3 5 6 4 5", rows=5, cols=2)
expected_weights = matrix("1 1 10 1 1", rows=5, cols=1)
[edges, weights] = hdb::buildMST(distances, n)
edges_match = (min(edges == expected_edges) == 1)
weights_match = (min(weights == expected_weights) == 1)
if (edges_match) {
  print("Pass: edges match.")
} else {
  print("Fail: edges don't match.")
}
if (weights_match) {
  print("Pass: weights match.")
} else {
  print("Fail: weights don't match.")
}
print("MST edges:\n" + toString(edges))
print("MST weights:\n" + toString(weights))


print("\nBuilding hierarchy")
[hierarchy, sizes] = hdb::buildHierarchy(edges, weights, n)
expected_hierarchy = matrix("2 1 1 3 7 1 5 6 1 4 9 1 10 8 10", rows=5, cols=3)
expected_sizes = matrix("2 3 2 3 6", rows=5, cols=1)
hierarchy_match = (min(hierarchy == expected_hierarchy) == 1)
sizes_match = (min(sizes == expected_sizes) == 1)
if (hierarchy_match) {
  print("Pass: hierarchy matches.")
} else {
  print("Fail: hierarchy doesn't match.")
}
if (sizes_match) {
  print("Pass: sizes match.")
} else {
  print("Fail: sizes don't match.")
}
print("Hierarchy:\n" + toString(hierarchy))
print("Sizes:\n" + toString(sizes))


print("\nExtracting stable clusters with minClSize=2")
[labels, stabilities] = hdb::extractStableClusters(hierarchy, weights, n, 2)
expected_labels = matrix("1 1 1 2 2 2", rows=6, cols=1)
expected_stabilities = matrix("0 0 0 0 0 0 0 2.7 0 2.7 0.6", rows=n*2-1, cols=1)
labels_match = (min(labels == expected_labels) == 1)
# stabilities are derived from 1/distance, so compare with a tolerance
tolerance = 1e-10
stabilities_match = max(abs(stabilities - expected_stabilities)) < tolerance
if (labels_match) {
  print("Pass: labels match.")
} else {
  print("Fail: labels don't match.")
}
if (stabilities_match) {
  print("Pass: stabilities match.")
} else {
  print("Fail: stabilities don't match.")
}
print("Cluster labels:\n" + toString(labels))
print("Top stabilities:\n" + toString(stabilities))



# check results (partially redundant with the exact-match checks above,
# but states the expected clustering properties explicitly)
num_clusters = max(labels)
num_noise = sum(labels == -1)

print("\nNumber of clusters found: " + num_clusters)
print("Number of noise points: " + num_noise)

# should find 2 clusters
test1 = (num_clusters == 2)
print("Found 2 clusters: " + test1)

# no points should be noise
test2 = (num_noise == 0)
print("No noise points: " + test2)

# points 1,2,3 should be in the same cluster
label1 = as.scalar(labels[1])
label2 = as.scalar(labels[2])
label3 = as.scalar(labels[3])
test3 = (label1 == label2) & (label2 == label3) & (label1 > 0)
print("points 1,2,3 in same cluster: " + test3)

# points 4,5,6 should be in the same cluster
label4 = as.scalar(labels[4])
label5 = as.scalar(labels[5])
label6 = as.scalar(labels[6])
test4 = (label4 == label5) & (label5 == label6) & (label4 > 0)
print("Points 4,5,6 in same cluster: " + test4)

# clusters A and B should be different
test5 = (label1 != label4)
print("Two clusters are different: " + test5)

test_pass = edges_match & weights_match & hierarchy_match & sizes_match & labels_match & stabilities_match & test1 & test2 & test3 & test4 & test5

if(test_pass) {
  print("\nAll tests passed\n")
} else {
  print("\nTests failed\n")
}

0 commit comments

Comments
 (0)