-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathmlcc.bic.R
executable file
·142 lines (140 loc) · 6.11 KB
/
mlcc.bic.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#' Multiple Latent Components Clustering - Subspace clustering with automatic
#' estimation of number of clusters and their dimension
#'
#' This function is an implementation of Multiple Latent Components Clustering
#' (MLCC) algorithm which clusteres quantitative variables into a number, chosen
#' using mBIC, of groups. For each considered number of clusters in
#' \emph{numb.clusters} \code{\link{mlcc.reps}} function is called. It invokes
#' K-means based algorithm (\code{\link{mlcc.kmeans}}) finding local minimum of
#' mBIC, which is run a given number of times (\emph{numb.runs}) with different
#' initializations. The best partition is choosen with mBIC (see
#' \code{\link{mlcc.reps}} function).
#'
#' @param X A data frame or a matrix with only continuous variables.
#' @param numb.clusters A vector, numbers of clusters to be checked.
#' @param numb.runs An integer, number of runs (initializations) of
#' \code{\link{mlcc.kmeans}}.
#' @param stop.criterion An integer, if an iteration of
#' \code{\link{mlcc.kmeans}} algorithm makes less changes in partitions than
#' \code{stop.criterion}, \code{\link{mlcc.kmeans}} stops.
#' @param max.iter An integer, maximum number of iterations of the loop in
#' \code{\link{mlcc.kmeans}} algorithm.
#' @param max.dim An integer, if estimate.dimensions is FALSE then max.dim is
#' dimension of each subspace. If estimate.dimensions is TRUE then subspaces
#' dimensions are estimated from the range [1, max.dim].
#' @param scale A boolean, if TRUE (value set by default) then variables in
#' dataset are scaled to zero mean and unit variance.
#' @param numb.cores An integer, number of cores to be used, by default all
#' cores are used.
#' @param greedy A boolean, if TRUE (value set by default) the clusters are
#' estimated in a greedy way - first local minimum of mBIC is chosen.
#' @param estimate.dimensions A boolean, if TRUE (value set by default)
#' subspaces dimensions are estimated.
#' @param verbose A boolean, if TRUE plot with mBIC values for different numbers
#' of clusters is produced and values of mBIC, computed for every number of
#' clusters and subspaces dimensions, are printed (value set by default is
#' FALSE).
#' @param flat.prior A boolean, if TRUE then, instead of an informative prior
#' that takes into account number of models for a given number of clusters,
#' flat prior is used.
#' @param show.warnings A boolean, if set to TRUE all warnings are displayed,
#' default value is FALSE.
#' @export
#' @return An object of class mlcc.fit consisting of \item{segmentation}{a
#' vector containing the partition of the variables} \item{BIC}{numeric, value
#' of mBIC} \item{subspacesDimensions}{a list containing dimensions of the
#' subspaces} \item{nClusters}{an integer, estimated number of clusters}
#' \item{factors}{a list of matrices, basis for each subspace}
#' \item{all.fit}{a list of segmentation, mBIC, subspaces dimension for all
#' numbers of clusters considered for an estimated subspace dimensions}
#' \item{all.fit.dims}{a list of lists of segmentation, mBIC, subspaces
#' dimension for all numbers of clusters and subspaces dimensions considered}
#' @examples
#' \donttest{
#' sim.data <- data.simulation(n = 50, SNR = 1, K = 3, numb.vars = 50, max.dim = 3)
#' mlcc.res <- mlcc.bic(sim.data$X, numb.clusters = 1:5, numb.runs = 20,
#' numb.cores = 1, verbose = TRUE)
#' show.clusters(sim.data$X, mlcc.res$segmentation)
#' }
mlcc.bic <- function(X, numb.clusters = 1:10, numb.runs = 30, stop.criterion = 1,
max.iter = 30, max.dim = 4, scale = TRUE, numb.cores = NULL,
greedy = TRUE, estimate.dimensions = TRUE,
verbose = FALSE, flat.prior = FALSE, show.warnings = FALSE) {
if (is.data.frame(X)) {
warning("X is not a matrix. Casting to matrix.")
X <- as.matrix(X)
}
if (any(is.na(X))) {
warning("Missing values are imputed by the mean of the variable")
X[is.na(X)] <- matrix(apply(X, 2, mean, na.rm = TRUE),
ncol = ncol(X), nrow = nrow(X),
byrow = TRUE
)[is.na(X)]
}
if (any(!sapply(X, is.numeric))) {
auxi <- NULL
for (j in 1:ncol(X)) {
if (!is.numeric(X[, j])) {
auxi <- c(auxi, j)
}
}
stop(paste("\nThe following variables are not quantitative: ", auxi))
}
if (scale) {
X <- scale(X)
}
n <- nrow(X)
p <- ncol(X)
greedy.stop <- max(numb.clusters)
results <- list()
if (verbose) {
cat("Number of clusters \t BIC \n")
}
for (i in 1:length(numb.clusters)) {
number.clusters <- numb.clusters[i]
MLCC.fit <- mlcc.reps(
X = X, numb.clusters = number.clusters, numb.runs = numb.runs,
max.dim = max.dim, scale = FALSE, numb.cores = numb.cores, estimate.dimensions = estimate.dimensions,
flat.prior = flat.prior, show.warnings = show.warnings
)
results[[i]] <- list(
segmentation = MLCC.fit$segmentation, BIC = MLCC.fit$BIC,
subspacesDimensions = lapply(MLCC.fit$basis, ncol), nClusters = number.clusters,
factors = MLCC.fit$basis
)
if (greedy & (i > 2)) {
if ((results[[i]]$BIC < results[[i - 1]]$BIC) & (results[[i - 2]]$BIC <
results[[i - 1]]$BIC)) {
greedy.stop <- i
if (verbose) {
cat(paste(" ", number.clusters, " ", formatC(results[[i]]$BIC,
digits = ceiling(log(abs(results[[i]]$BIC), 10))
), "\n"))
}
break
}
}
if (verbose) {
cat(paste(" ", number.clusters, " ", formatC(results[[i]]$BIC,
digits = ceiling(log(abs(results[[i]]$BIC), 10))
), "\n"))
}
}
BICs <- lapply(results, function(res) res$BIC)
if (verbose) {
plot(numb.clusters[1:greedy.stop], BICs,
type = "b", xaxt = "n", ylab = "BIC",
xlab = "Number of clusters"
)
axis(side = 1, labels = numb.clusters[1:greedy.stop], at = numb.clusters[1:greedy.stop])
}
result <- results[[which.max(BICs)]]
result$factors <- lapply(1:result$nClusters, function(i) {
d <- ncol(result$factors[[i]])
colnames(result$factors[[i]]) <- paste(i, 1:d)
result$factors[[i]]
})
result$all.fit <- results
class(result) <- "mlcc.fit"
return(result)
}