@@ -58,7 +58,7 @@ m_hdbscan = function(Matrix[Double] X, Integer minPts = 5, Integer minClSize = -
5858
5959 coreDistances = matrix(0, rows=n, cols=1)
6060 for(i in 1:n) {
61- kthDist = computeKthSmallest(distances[i,], minPts)
61+ kthDist = computeKthSmallest(t( distances[i,]) , minPts) # Add t() here!
6262 coreDistances[i] = kthDist
6363 }
6464
@@ -68,7 +68,8 @@ m_hdbscan = function(Matrix[Double] X, Integer minPts = 5, Integer minClSize = -
6868
6969 [hierarchy, clusterSizes] = buildHierarchy(mstEdges, mstWeights, n)
7070
71- # TODO: get stable cluster with stability score
71+ [clusterMems, stabilities] = extractStableClusters(hierarchy, mstWeights, n, minClSize)
72+
7273 # TODO: build cluster model
7374
7475 # temp dummy values
@@ -184,9 +185,9 @@ union = function(Matrix[Double] parent, Matrix[Double] rank,
184185buildHierarchy = function(Matrix[Double] edges, Matrix[Double] weights, Integer n)
185186 return (Matrix[Double] hierarchy, Matrix[Double] sizes)
186187{
187- # sort edges by weight in ascending order
188- # to build the hierarchy from dense cores outward
189- sorted = order(target=weights , by=1 , decreasing=FALSE)
188+ # create indexed weights to preserve original positions after sorting
189+ indexedWeights = cbind(seq(1, nrow(weights)), weights)
190+ sorted = order(target=indexedWeights , by=2 , decreasing=FALSE)
190191
191192 # parent[i] = i, meaning each point is its own parent in the beginning
192193 parent = seq(1, n)
@@ -241,3 +242,157 @@ buildHierarchy = function(Matrix[Double] edges, Matrix[Double] weights, Integer
241242 }
242243 }
243244}
245+
# Collects the ids of all leaf nodes (original points) below a dendrogram node.
#
# hierarchy: merge table; row k holds the two child ids of internal node n+k
#            in columns 1 and 2
# n:         number of leaf nodes; ids <= n are treated as leaves
# nodeId:    id of the node whose leaves are requested
#
# Returns a column vector of leaf ids in left-to-right (depth-first) order.
# A leaf id is returned as a 1x1 matrix containing itself.
#
# The traversal uses an explicit stack instead of recursion, so chain-shaped
# trees do not cause deep call nesting or repeated rbind copies.
# NOTE(review): assumes the rows describe a proper binary tree over n leaves,
# so that at most n nodes are pending and at most n leaves are found.
getLeafDescendants = function(Matrix[Double] hierarchy, Integer n, Integer nodeId)
  return (Matrix[Double] leaves)
{
  capacity = max(n, 1)

  # nodes still to be expanded (LIFO); every expanded internal node grows it by one
  pending = matrix(0, rows=capacity, cols=1)
  pending[1,1] = nodeId
  top = 1

  found = matrix(0, rows=capacity, cols=1)
  numFound = 0

  while(top > 0) {
    current = as.integer(as.scalar(pending[top,1]))
    top = top - 1

    if(current <= n) {
      numFound = numFound + 1
      found[numFound,1] = current
    } else {
      mergeIdx = current - n
      # push the right child first so the left subtree is expanded first,
      # which keeps the left-before-right ordering of the leaves
      pending[top+1,1] = as.scalar(hierarchy[mergeIdx,2])
      pending[top+2,1] = as.scalar(hierarchy[mergeIdx,1])
      top = top + 2
    }
  }

  leaves = found[1:numFound,]
}
262+
# Selects flat clusters from the merge hierarchy and labels the points.
#
# hierarchy: merge table with one row per merge; columns 1 and 2 hold the ids
#            of the merged children, column 3 the merge distance. Row i
#            defines internal node n+i.
# weights:   not used by this function; kept for interface compatibility
# n:         number of points (leaf nodes 1..n)
# minClSize: minimum number of points a node needs to count as a cluster
#
# Returns:
#   labels:      n x 1 matrix, cluster id (1, 2, ...) per point, -1 for noise
#   stabilities: (2n-1) x 1 matrix with the own stability of every node
#                (0 for leaves and for nodes smaller than minClSize)
#
# NOTE(review): requires n >= 2 (at least one merge row is read) and assumes
# the children of row i always have ids smaller than n+i, which both the
# bottom-up and the top-down pass below rely on.
extractStableClusters = function(Matrix[Double] hierarchy, Matrix[Double] weights,
    Integer n, Integer minClSize)
  return (Matrix[Double] labels, Matrix[Double] stabilities)
{
  numMerges = n - 1     # hierarchical tree over n points has exactly n-1 merge events
  numNodes = 2*n - 1    # total nodes in the dendrogram
  rootId = numNodes

  # convert merge distances to lambda (density); zero distances map to a
  # large finite value instead of infinity
  dist = hierarchy[1:numMerges, 3]
  lambda = ifelse(dist > 0, 1.0 / dist, 1e15)

  # internal node n+i starts existing at lambda[i]; a node stops existing at
  # the lambda of the merge that consumes it as a child
  lambda_death = matrix(0, rows=numNodes, cols=1)

  # leaves have size 1, internal sizes are accumulated in merge order
  cluster_size = rbind(matrix(1, rows=n, cols=1), matrix(0, rows=numMerges, cols=1))

  for(i in 1:numMerges) {
    left = as.integer(as.scalar(hierarchy[i,1]))
    right = as.integer(as.scalar(hierarchy[i,2]))
    merge_lambda = as.scalar(lambda[i,1])

    lambda_death[left,1] = merge_lambda
    lambda_death[right,1] = merge_lambda
    cluster_size[n+i,1] = as.scalar(cluster_size[left,1]) + as.scalar(cluster_size[right,1])
  }

  # root cluster exists all the way
  lambda_death[rootId,1] = 0

  # own stability of each internal node: size * (birth - death) if the node
  # is big enough, otherwise 0. The longer-lived and the larger a cluster is,
  # the more stable it is. Leaves always get 0.
  innerSize = cluster_size[(n+1):numNodes,]
  innerLife = lambda - lambda_death[(n+1):numNodes,]
  innerStab = ifelse(innerSize >= minClSize, innerSize * innerLife, 0)
  stability = rbind(matrix(0, rows=n, cols=1), innerStab)

  # subtree stability: best total achievable inside each subtree,
  # computed bottom-up in merge order (leaves stay at 0)
  subtree_stability = matrix(0, rows=numNodes, cols=1)
  for(i in 1:numMerges) {
    left = as.integer(as.scalar(hierarchy[i,1]))
    right = as.integer(as.scalar(hierarchy[i,2]))

    children_subtree = as.scalar(subtree_stability[left,1]) + as.scalar(subtree_stability[right,1])
    own_stab = as.scalar(stability[n+i,1])

    subtree_stability[n+i,1] = max(children_subtree, own_stab)
  }

  # select clusters top-down, starting from the root
  selected = matrix(0, rows=numNodes, cols=1)
  selected[rootId,1] = 1

  i = numMerges
  while(i >= 1) {
    nodeId = n + i

    if(as.scalar(selected[nodeId,1]) == 1) {
      left = as.integer(as.scalar(hierarchy[i,1]))
      right = as.integer(as.scalar(hierarchy[i,2]))

      children_subtree = as.scalar(subtree_stability[left,1]) + as.scalar(subtree_stability[right,1])
      own_stab = as.scalar(stability[nodeId,1])
      parent_size = as.scalar(cluster_size[nodeId,1])

      # hand the selection to the children if the node is too small or the
      # children together achieve a higher stability
      if(parent_size < minClSize | children_subtree > own_stab) {
        selected[nodeId,1] = 0
        selected[left,1] = 1
        selected[right,1] = 1
      }
    }

    i = i - 1
  }

  # number the selected, sufficiently large nodes in ascending node id order
  node_label = matrix(-1, rows=numNodes, cols=1)
  cluster_id = 1
  for(nodeId in 1:numNodes) {
    if(as.scalar(selected[nodeId,1]) == 1 & as.scalar(cluster_size[nodeId,1]) >= minClSize) {
      node_label[nodeId,1] = cluster_id
      cluster_id = cluster_id + 1
    }
  }

  # push each cluster id down to its descendants; a child is only selected
  # when its parent is deselected, so labelled subtrees never overlap
  i = numMerges
  while(i >= 1) {
    lbl = as.scalar(node_label[n+i,1])
    if(lbl > 0) {
      left = as.integer(as.scalar(hierarchy[i,1]))
      right = as.integer(as.scalar(hierarchy[i,2]))
      node_label[left,1] = lbl
      node_label[right,1] = lbl
    }
    i = i - 1
  }

  # the first n nodes are the points; anything still at -1 is noise
  labels = node_label[1:n,]

  stabilities = stability
}
0 commit comments