diff --git a/src/coreset.jl b/src/coreset.jl index 2ca2a51..b56bde0 100644 --- a/src/coreset.jl +++ b/src/coreset.jl @@ -70,7 +70,9 @@ function kmeans!(alg::Coreset, containers, X, k, weights, metric::Euclidean = Eu totalcost = sum(containers.totalcost) - return KmeansResult(res.centers, containers.labels, T[], Int[], T[], totalcost, res.iterations, res.converged) + counts = collect(values(sort(countmap(containers.labels)))) + + return KmeansResult(res.centers, containers.labels, T[], counts, T[], totalcost, res.iterations, res.converged) end diff --git a/src/elkan.jl b/src/elkan.jl index db9e898..6cd2920 100644 --- a/src/elkan.jl +++ b/src/elkan.jl @@ -81,10 +81,12 @@ function kmeans!(alg::Elkan, containers, X, k, weights=nothing, metric=Euclidean println("Successfully terminated with convergence.") end + counts = collect(values(sort(countmap(containers.labels)))) + # TODO empty placeholder vectors should be calculated # TODO Float64 type definitions is too restrictive, should be relaxed # especially during GPU related development - return KmeansResult(centroids, containers.labels, T[], Int[], T[], totalcost, niters, converged) + return KmeansResult(centroids, containers.labels, T[], counts, T[], totalcost, niters, converged) end diff --git a/src/hamerly.jl b/src/hamerly.jl index 8b51738..63135d3 100644 --- a/src/hamerly.jl +++ b/src/hamerly.jl @@ -70,10 +70,12 @@ function kmeans!(alg::Hamerly, containers, X, k, weights=nothing, metric=Euclide println("Successfully terminated with convergence.") end + counts = collect(values(sort(countmap(containers.labels)))) + # TODO empty placeholder vectors should be calculated # TODO Float64 type definitions is too restrictive, should be relaxed # especially during GPU related development - return KmeansResult(centroids, containers.labels, T[], Int[], T[], totalcost, niters, converged) + return KmeansResult(centroids, containers.labels, T[], counts, T[], totalcost, niters, converged) end diff --git a/src/kmeans.jl 
b/src/kmeans.jl index c96d0c6..fa5d393 100644 --- a/src/kmeans.jl +++ b/src/kmeans.jl @@ -146,6 +146,8 @@ end kmeans([alg::AbstractKMeansAlg,] design_matrix, k; n_threads = nthreads(), k_init="k-means++", max_iters=300, tol=1e-6, verbose=true, rng = Random.GLOBAL_RNG) +### IMPLEMENTATION NOTES + This main function employs the K-means algorithm to cluster all examples in the training data (design_matrix) into k groups using either the `k-means++` or random initialisation technique for selecting the initial @@ -155,7 +157,8 @@ At the end of the number of iterations specified (max_iters), convergence is achieved if difference between the current and last cost objective is less than the tolerance level (tol). An error is thrown if convergence fails. -Arguments: +### ARGUMENTS + - `alg` defines one of the algorithms used to calculate `k-means`. This argument can be omitted, by default Lloyd algorithm is used. - `n_threads` defines number of threads used for calculations, by default it is equal @@ -169,6 +172,18 @@ alternatively one can use `rand` to choose random points for init. - `verbose` is verbosity level. Details of operations can be either printed or not by setting verbose accordingly. A `KmeansResult` structure representing labels, centroids, and sum_squares is returned. 
+ +### EXAMPLE + +```julia +X = rand(2, 100) # 100 points in 2d +km = kmeans(X, 5) # 5 clusters with the default (Lloyd) algo +km_yy = kmeans(Yinyang(), X, 5) # 5 clusters with the Yinyang algo + +kma = km.assignments # X[:,i] is a member of cluster kma[i] +kmc = km.centers # cluster i has center kmc[:,i] +kmn = km.counts # cluster i has kmn[i] points +``` """ function kmeans(alg::AbstractKMeansAlg, design_matrix, k; weights = nothing, diff --git a/src/lloyd.jl b/src/lloyd.jl index 47b624f..09a1167 100644 --- a/src/lloyd.jl +++ b/src/lloyd.jl @@ -58,10 +58,12 @@ function kmeans!(alg::Lloyd, containers, X, k, weights=nothing, metric=Euclidean println("Successfully terminated with convergence.") end + counts = collect(values(sort(countmap(containers.labels)))) + # TODO empty placeholder vectors should be calculated # TODO Float64 type definitions is too restrictive, should be relaxed # especially during GPU related development - return KmeansResult(centroids, containers.labels, T[], Int[], T[], totalcost, niters, converged) + return KmeansResult(centroids, containers.labels, T[], counts, T[], totalcost, niters, converged) end kmeans(design_matrix, k; diff --git a/src/yinyang.jl b/src/yinyang.jl index 7a856ca..7ca9e9f 100644 --- a/src/yinyang.jl +++ b/src/yinyang.jl @@ -106,10 +106,12 @@ function kmeans!(alg::Yinyang, containers, X, k, weights, metric::Euclidean = Eu println("Successfully terminated with convergence.") end + counts = collect(values(sort(countmap(containers.labels)))) + # TODO empty placeholder vectors should be calculated # TODO Float64 type definitions is too restrictive, should be relaxed # especially during GPU related development - return KmeansResult(centroids, containers.labels, T[], Int[], T[], totalcost, niters, converged) + return KmeansResult(centroids, containers.labels, T[], counts, T[], totalcost, niters, converged) end