-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathauxiliary.functions.R
executable file
·200 lines (193 loc) · 7.73 KB
/
auxiliary.functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
#' mBIC for subspace clustering
#'
#' Computes the value of modified Bayesian Information Criterion (mBIC) for
#' given data set partition and clusters' dimensionalities. In each cluster we
#' assume that variables are spanned by few factors. Considering maximum
#' likelihood we get that those factors are in fact principal components.
#' Additionally, it uses by default an informative prior distribution on models.
#'
#'
#' @param X A matrix with only quantitative variables.
#' @param segmentation A vector, segmentation for which likelihood is computed.
#' Clusters numbers should be from range [1, numb.clusters].
#' @param dims A vector of integers, dimensions of subspaces. Number of
#' principal components (fixed or chosen by PESEL criterion) that span each
#' subspace.
#' @param numb.clusters An integer, number of clusters.
#' @param max.dim An integer, upper bound for allowed dimension of a subspace.
#' @param flat.prior A boolean, if TRUE (default is FALSE) then flat prior on
#' models is used.
#' @keywords internal
#' @return Value of mBIC
cluster.pca.BIC <- function(X, segmentation, dims, numb.clusters, max.dim, flat.prior = FALSE) {
if (!is.matrix(X)) {
# if X is one variable it is stored as vector
X <- matrix(X, ncol = 1)
}
D <- dim(X)[1]
p <- dim(X)[2]
formula <- rep(0, numb.clusters)
for (k in 1:numb.clusters) {
# one cluster
dimk <- dims[k]
Xk <- X[, segmentation == k, drop = F]
if (dim(Xk)[2] > dimk && dim(Xk)[1] > dimk) {
formula[k] <- pesel(
X = Xk, npc.min = dimk, npc.max = dimk, scale = FALSE,
method = "heterogenous"
)$vals[1]
} else {
warning("The dimensionality of the cluster was greater or equal than
max(number of observation, number of variables) in the cluster.
Ignoring the cluster during mBIC calculation")
formula[k] <- 0
}
}
# apriori
apriori.segmentations <- -p * log(numb.clusters)
apriori.dimensions <- -log(max.dim) * numb.clusters
BIC <- sum(formula)
if (!flat.prior) {
BIC <- BIC + apriori.segmentations + apriori.dimensions
}
return(BIC)
}
#' Chooses a subspace for a variable
#'
#' Selects a subspace closest to a given variable. To select the subspace, the method
#' considers (for every subspace) a subset of its principal components and tries
#' to fit a linear model with the variable as the response. Then the method chooses
#' the subspace for which the value of BIC was the highest.
#'
#' @param variable A variable to be assigned.
#' @param pcas Orthogonal basis for each of the subspaces.
#' @param number.clusters Number of subspaces (clusters).
#' @param show.warnings A boolean - if set to TRUE all warnings are displayed, default value is FALSE.
#' @param common_sigma A boolean - if set to FALSE, seperate sigma is estimated for each cluster,
#' default value is TRUE
#' @keywords internal
#' @return index Number of most similar subspace to variable.
choose.cluster.BIC <- function(variable, pcas, number.clusters, show.warnings = FALSE, common_sigma = TRUE) {
BICs <- NULL
if (common_sigma) {
res <- fastLmPure(cbind(1, as.matrix(Matrix::bdiag(pcas))), rep(variable, number.clusters), method = 0L)$residuals
n <- length(variable)
sigma.hat <- sqrt(sum(res^2) / (n * number.clusters))
if (sigma.hat < 1e-15 && show.warnings) {
warning("In function choose.cluster.BIC: estimated value of noise in cluster is <1e-15. It might corrupt the result.")
}
for (i in 1:number.clusters) {
nparams <- ncol(pcas[[i]])
res.part <- res[((i - 1) * n + 1):(i * n)]
loglik <- sum(dnorm(res.part, 0, sigma.hat, log = T))
BICs[i] <- loglik - nparams * log(n) / 2
}
} else {
for (i in 1:number.clusters) {
nparams <- ncol(pcas[[i]])
n <- length(variable)
res <- fastLmPure(pcas[[i]], variable, method = 0L)$residuals
sigma.hat <- sqrt(sum(res^2) / n)
if (sigma.hat < 1e-15 && show.warnings) {
warning("In function choose.cluster.BIC: estimated value of noise in cluster is <1e-15. It might corrupt the result.")
}
loglik <- sum(dnorm(res, 0, sigma.hat, log = T))
BICs[i] <- loglik - nparams * log(n) / 2
}
}
return(which.max(BICs))
}
#' Calculates principal components for every cluster
#'
#' For given segmentation this function estimates dimensionality of each cluster (or chooses fixed dimensionality)
#' and for each cluster calculates the number of principal components equal to the this dimensionality
#'
#' @param X A data matrix.
#' @param segmentation A vector, segmentation of variables into clusters.
#' @param number.clusters An integer, number of subspaces (clusters).
#' @param max.subspace.dim An integer, upper bound for allowed dimension of subspace.
#' @param estimate.dimensions A boolean, if TRUE subspaces dimensions are estimated using PESEL.
#' @keywords internal
#' @return A subset of principal components for every cluster.
calculate.pcas <- function(X, segmentation, number.clusters, max.subspace.dim, estimate.dimensions) {
rowNumb <- dim(X)[1]
pcas <- lapply(1:number.clusters, function(k) {
Xk <- X[, segmentation == k, drop = F]
sub.dim <- dim(Xk)
if (sub.dim[2] > 0) {
a <- summary(prcomp(x = Xk))
if (estimate.dimensions) {
max.dim <- min(max.subspace.dim, floor(sqrt(sub.dim[2])), sub.dim[1])
cut <- max(0, pesel(
X = Xk, npc.min = 0, npc.max = max.dim, scale = FALSE,
method = "heterogenous"
)$nPCs)
} else {
cut <- min(max.subspace.dim, floor(sqrt(sub.dim[2])), sub.dim[1])
}
return(matrix(a$x[, seq_len(cut)], nrow = rowNumb))
} else { #if there are no variables initiate a cluster at random
return(matrix(rnorm(rowNumb), nrow = rowNumb, ncol = 1))
}
})
return(pcas)
}
#' Plot mlcc.fit class object
#'
#' @param x mlcc.fit class object
#' @param ... Further arguments to be passed to or from other methods. They are ignored in this function.
#' @export
#' @keywords internal
plot.mlcc.fit <- function(x, ...) {
clusterNumbs <- lapply(x$all.fit, function(y) y$nClusters)
BICVals <- lapply(x$all.fit, function(y) y$BIC)
plot.default(clusterNumbs, BICVals, type = "b", xaxt = "n", ylab = "BIC", xlab = "Number of clusters")
axis(side = 1, labels = clusterNumbs, at = clusterNumbs)
}
#' Print mlcc.fit class object
#'
#' @param x mlcc.fit class object
#' @param ... Further arguments to be passed to or from other methods. They are ignored in this function.
#' @export
#' @keywords internal
print.mlcc.fit <- function(x, ...) {
cat("$nClusters: ", x$nClusters, "\n")
cat("$segmentation:\n")
print(x$segmentation)
cat("$BIC: ", x$BIC, "\n")
cat("$subspacesDimensions:\n", unlist(x$subspacesDimensions), "\n")
}
#' Print mlcc.reps.fit class object
#'
#' @param x mlcc.reps.fit class object
#' @param ... Further arguments to be passed to or from other methods. They are
#' ignored in this function.
#' @export
#' @keywords internal
print.mlcc.reps.fit <- function(x, ...) {
cat("$segmentation:\n")
print(x$segmentation)
cat("$BIC: ", x$BIC, "\n")
cat("$basis:\n")
cat(str(x$basis))
}
#' Print clusters obtained from MLCC
#'
#' @param data The original data set.
#' @param segmentation A vector, segmentation of variables into clusters.
#' @export
show.clusters <- function(data, segmentation) {
data <- as.data.frame(data)
max_cluster_size <- max(as.data.frame(table(segmentation))$Freq)
clusters <- lapply(1:max(segmentation), function(i) {
colnames_in_cluster <- colnames(data)[segmentation == i]
current_cluster_size <- length(colnames_in_cluster)
c(
colnames_in_cluster,
rep("-", times = max_cluster_size - current_cluster_size)
)
})
clusters <- as.data.frame(clusters)
colnames(clusters) <- paste("cluster", 1:max(segmentation), sep = "_")
print(clusters)
}