PyDataBlog · 70Gage70 · Jun 29, 2024 · Jun 29, 2024
diff --git a/src/coreset.jl b/src/coreset.jl
@@ -70,7 +70,9 @@ function kmeans!(alg::Coreset, containers, X, k, weights, metric::Euclidean = Eu
 
     totalcost = sum(containers.totalcost)
 
-    return KmeansResult(res.centers, containers.labels, T[], Int[], T[], totalcost, res.iterations, res.converged)
+    counts = collect(values(sort(countmap(containers.labels))))
+
+    return KmeansResult(res.centers, containers.labels, T[], counts, T[], totalcost, res.iterations, res.converged)
 end
 
 

diff --git a/src/elkan.jl b/src/elkan.jl
@@ -81,10 +81,12 @@ function kmeans!(alg::Elkan, containers, X, k, weights=nothing, metric=Euclidean
         println("Successfully terminated with convergence.")
     end
 
+    counts = collect(values(sort(countmap(containers.labels))))
+
     # TODO empty placeholder vectors should be calculated
     # TODO Float64 type definitions is too restrictive, should be relaxed
     # especially during GPU related development
-    return KmeansResult(centroids, containers.labels, T[], Int[], T[], totalcost, niters, converged)
+    return KmeansResult(centroids, containers.labels, T[], counts, T[], totalcost, niters, converged)
 end
 
 

diff --git a/src/hamerly.jl b/src/hamerly.jl
@@ -70,10 +70,12 @@ function kmeans!(alg::Hamerly, containers, X, k, weights=nothing, metric=Euclide
         println("Successfully terminated with convergence.")
     end
 
+    counts = collect(values(sort(countmap(containers.labels))))
+
     # TODO empty placeholder vectors should be calculated
     # TODO Float64 type definitions is too restrictive, should be relaxed
     # especially during GPU related development
-    return KmeansResult(centroids, containers.labels, T[], Int[], T[], totalcost, niters, converged)
+    return KmeansResult(centroids, containers.labels, T[], counts, T[], totalcost, niters, converged)
 end
 
 

diff --git a/src/kmeans.jl b/src/kmeans.jl
@@ -146,6 +146,8 @@ end
     kmeans([alg::AbstractKMeansAlg,] design_matrix, k; n_threads = nthreads(),
     k_init="k-means++", max_iters=300, tol=1e-6, verbose=true, rng = Random.GLOBAL_RNG)
 
+### IMPLEMENTATION NOTES
+
 This main function employs the K-means algorithm to cluster all examples
 in the training data (design_matrix) into k groups using either the
 `k-means++` or random initialisation technique for selecting the initial
@@ -155,7 +157,8 @@ At the end of the number of iterations specified (max_iters), convergence is
 achieved if difference between the current and last cost objective is
 less than the tolerance level (tol). An error is thrown if convergence fails.
 
-Arguments:
+### ARGUMENTS
+
 - `alg` defines one of the algorithms used to calculate `k-means`. This
 argument can be omitted, by default Lloyd algorithm is used.
 - `n_threads` defines number of threads used for calculations, by default it is equal
@@ -169,6 +172,18 @@ alternatively one can use `rand` to choose random points for init.
 - `verbose` is verbosity level. Details of operations can be either printed or not by setting verbose accordingly.
 
 A `KmeansResult` structure representing labels, centroids, and sum_squares is returned.
+
+### EXAMPLE
+
+```julia
+X = rand(2, 100)                # 100 points in 2d
+km = kmeans(X, 5)               # 5 clusters with the default (Lloyd) algo
+km_yy = kmeans(Yinyang(), X, 5) # 5 clusters with the Yinyang algo
+
+kma = km.assignments            # X[:,i] is a member of cluster kma[i]
+kmc = km.centers                # cluster i has center kmc[:,i]
+kmn = km.counts                 # cluster i has kmn[i] points (if no cluster is empty)
+```
 """
 function kmeans(alg::AbstractKMeansAlg, design_matrix, k;
                 weights = nothing,

diff --git a/src/lloyd.jl b/src/lloyd.jl
@@ -58,10 +58,12 @@ function kmeans!(alg::Lloyd, containers, X, k, weights=nothing, metric=Euclidean
         println("Successfully terminated with convergence.")
     end
 
+    counts = collect(values(sort(countmap(containers.labels))))
+
     # TODO empty placeholder vectors should be calculated
     # TODO Float64 type definitions is too restrictive, should be relaxed
     # especially during GPU related development
-    return KmeansResult(centroids, containers.labels, T[], Int[], T[], totalcost, niters, converged)
+    return KmeansResult(centroids, containers.labels, T[], counts, T[], totalcost, niters, converged)
 end
 
 kmeans(design_matrix, k;

diff --git a/src/yinyang.jl b/src/yinyang.jl
@@ -106,10 +106,12 @@ function kmeans!(alg::Yinyang, containers, X, k, weights, metric::Euclidean = Eu
         println("Successfully terminated with convergence.")
     end
 
+    counts = collect(values(sort(countmap(containers.labels))))
+
     # TODO empty placeholder vectors should be calculated
     # TODO Float64 type definitions is too restrictive, should be relaxed
     # especially during GPU related development
-    return KmeansResult(centroids, containers.labels, T[], Int[], T[], totalcost, niters, converged)
+    return KmeansResult(centroids, containers.labels, T[], counts, T[], totalcost, niters, converged)
 end