apply code style
psobczyk committed Mar 8, 2020
1 parent 1a56409 commit ca8e0c7
Showing 22 changed files with 504 additions and 435 deletions.
70 changes: 38 additions & 32 deletions R/auxiliary.functions.R
@@ -1,21 +1,21 @@
#' mBIC for subspace clustering
#'
#' Computes the value of the modified Bayesian Information Criterion (mBIC) for
#' a given data set partition and clusters' dimensionalities. In each cluster we
#' assume that variables are spanned by a few factors. Under maximum
#' likelihood, those factors turn out to be principal components.
#' Additionally, by default it uses an informative prior distribution on models.
#'
#' @param X A matrix with only quantitative variables.
#' @param segmentation A vector, segmentation for which likelihood is computed.
#' Clusters numbers should be from range [1, numb.clusters].
#' @param dims A vector of integers, dimensions of subspaces. Number of
#' principal components (fixed or chosen by PESEL criterion) that span each
#' subspace.
#' @param numb.clusters An integer, number of clusters.
#' @param max.dim An integer, upper bound for allowed dimension of a subspace.
#' @param flat.prior A boolean, if TRUE (default is FALSE) then flat prior on
#' models is used.
#' @keywords internal
#' @return Value of mBIC
@@ -26,15 +26,17 @@ cluster.pca.BIC <- function(X, segmentation, dims, numb.clusters, max.dim, flat.
}
D <- dim(X)[1]
p <- dim(X)[2]

formula <- rep(0, numb.clusters)
for (k in 1:numb.clusters) {
# one cluster
dimk <- dims[k]
Xk <- X[, segmentation == k, drop = F]
if (dim(Xk)[2] > dimk && dim(Xk)[1] > dimk) {
formula[k] <- pesel(
X = Xk, npc.min = dimk, npc.max = dimk, scale = FALSE,
method = "heterogenous"
)$vals[1]
} else {
warning("The dimensionality of the cluster was greater or equal than max(number of observation, number of variables) in the cluster.
Ignoring the cluster during mBIC calculation")
@@ -52,23 +54,23 @@ cluster.pca.BIC <- function(X, segmentation, dims, numb.clusters, max.dim, flat.
}
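
# Illustrative sketch (not from the original commit): a hypothetical call to the
# internal criterion documented above. Assumes the package is loaded or sourced
# so that cluster.pca.BIC() and data.simulation() are available and the pesel
# package is installed.
sim <- data.simulation(n = 50, SNR = 1, K = 2, numb.vars = 10, max.dim = 2)
mbic <- cluster.pca.BIC(
  X = sim$X, segmentation = sim$s, dims = sim$dims,
  numb.clusters = 2, max.dim = 2, flat.prior = FALSE
)
mbic # a single numeric value of the criterion for this partition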

#' Chooses a subspace for a variable
#'
#' Selects a subspace closest to a given variable. To select the subspace, the method
#' considers (for every subspace) a subset of its principal components and tries
#' to fit a linear model with the variable as the response. Then the method chooses
#' the subspace for which the value of BIC was the highest.
#'
#' @param variable A variable to be assigned.
#' @param pcas Orthogonal basis for each of the subspaces.
#' @param number.clusters Number of subspaces (clusters).
#' @param show.warnings A boolean - if set to TRUE all warnings are displayed, default value is FALSE.
#' @param common_sigma A boolean - if set to FALSE, a separate sigma is estimated for each cluster,
#' default value is TRUE.
#' @keywords internal
#' @return index Number of most similar subspace to variable.
choose.cluster.BIC <- function(variable, pcas, number.clusters, show.warnings = FALSE, common_sigma = TRUE) {
BICs <- NULL
if (common_sigma) {
res <- fastLmPure(as.matrix(Matrix::bdiag(pcas)), rep(variable, number.clusters), method = 0L)$residuals
n <- length(variable)
sigma.hat <- sqrt(sum(res^2) / (n * number.clusters))
@@ -79,19 +81,19 @@ choose.cluster.BIC <- function(variable, pcas, number.clusters, show.warnings =
nparams <- ncol(pcas[[i]])
res.part <- res[((i - 1) * n + 1):(i * n)]
loglik <- sum(dnorm(res.part, 0, sigma.hat, log = T))
BICs[i] <- loglik - nparams * log(n) / 2
}
} else {
for (i in 1:number.clusters) {
nparams <- ncol(pcas[[i]])
n <- length(variable)
res <- fastLmPure(pcas[[i]], variable, method = 0L)$residuals
sigma.hat <- sqrt(sum(res^2) / n)
if (sigma.hat < 1e-15 && show.warnings) {
warning("In function choose.cluster.BIC: estimated value of noise in cluster is <1e-15. It might corrupt the result.")
}
loglik <- sum(dnorm(res, 0, sigma.hat, log = T))
BICs[i] <- loglik - nparams * log(n) / 2
}
}
which.max(BICs)
@@ -100,7 +102,7 @@ choose.cluster.BIC <- function(variable, pcas, number.clusters, show.warnings =
#' Calculates principal components for every cluster
#'
#' For given segmentation this function estimates dimensionality of each cluster (or chooses fixed dimensionality)
#' and for each cluster calculates a number of principal components equal to this dimensionality
#'
#' @param X A data matrix.
#' @param segmentation A vector, segmentation of variables into clusters.
@@ -118,8 +120,10 @@ calculate.pcas <- function(X, segmentation, number.clusters, max.subspace.dim, e
a <- summary(prcomp(x = Xk))
if (estimate.dimensions) {
max.dim <- min(max.subspace.dim, floor(sqrt(sub.dim[2])), sub.dim[1])
cut <- max(1, pesel(
X = Xk, npc.min = 1, npc.max = max.dim, scale = FALSE,
method = "heterogenous"
)$nPCs)
} else {
cut <- min(max.subspace.dim, floor(sqrt(sub.dim[2])), sub.dim[1])
}
@@ -132,7 +136,7 @@ calculate.pcas <- function(X, segmentation, number.clusters, max.subspace.dim, e
}
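
# Illustrative sketch (not from the original commit): the bases produced by
# calculate.pcas() are what choose.cluster.BIC() scores a single variable
# against. Assumes the package is loaded/sourced and that calculate.pcas()
# returns a list with one matrix of principal components per cluster, as its
# use elsewhere in this file suggests.
sim <- data.simulation(n = 50, SNR = 1, K = 3, numb.vars = 15, max.dim = 2)
bases <- calculate.pcas(
  X = sim$X, segmentation = sim$s, number.clusters = 3,
  max.subspace.dim = 4, estimate.dimensions = TRUE
)
# assign the first variable to the closest subspace by BIC
choose.cluster.BIC(sim$X[, 1], bases, number.clusters = 3)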

#' Plot mlcc.fit class object
#'
#' @param x mlcc.fit class object
#' @param ... Further arguments to be passed to or from other methods. They are ignored in this function.
#' @export
@@ -145,7 +149,7 @@ plot.mlcc.fit <- function(x, ...) {
}

#' Print mlcc.fit class object
#'
#' @param x mlcc.fit class object
#' @param ... Further arguments to be passed to or from other methods. They are ignored in this function.
#' @export
@@ -159,7 +163,7 @@ print.mlcc.fit <- function(x, ...) {
}

#' Print mlcc.reps.fit class object
#'
#' @param x mlcc.reps.fit class object
#' @param ... Further arguments to be passed to or from other methods. They are
#' ignored in this function.
@@ -174,7 +178,7 @@ print.mlcc.reps.fit <- function(x, ...) {
}

#' Print clusters obtained from MLCC
#'
#' @param data The original data set.
#' @param segmentation A vector, segmentation of variables into clusters.
#' @export
@@ -184,8 +188,10 @@ show.clusters <- function(data, segmentation) {
clusters <- lapply(1:max(segmentation), function(i) {
colnames_in_cluster <- colnames(data)[segmentation == i]
current_cluster_size <- length(colnames_in_cluster)
c(
colnames_in_cluster,
rep("-", times = max_cluster_size - current_cluster_size)
)
})
clusters <- as.data.frame(clusters)
colnames(clusters) <- paste("cluster", 1:max(segmentation), sep = "_")
68 changes: 37 additions & 31 deletions R/data.simulation.R
@@ -1,46 +1,48 @@
#' Simulates subspace clustering data
#'
#' Generates data for simulation with a low-rank subspace structure: variables
#' are clustered and each cluster has a low-rank representation. Factors that
#' span subspaces are not shared between clusters.
#'
#' @param n An integer, number of individuals.
#' @param SNR A numeric, signal to noise ratio measured as variance of the
#' variable, element of a subspace, to the variance of noise.
#' @param K An integer, number of subspaces.
#' @param numb.vars An integer, number of variables in each subspace.
#' @param max.dim An integer, if equal.dims is TRUE then max.dim is dimension of
#' each subspace. If equal.dims is FALSE then subspace dimensions are drawn
#' from uniform distribution on [min.dim,max.dim].
#' @param min.dim An integer, minimal dimension of a subspace.
#' @param equal.dims A boolean, if TRUE (value set by default) all clusters are
#' of the same dimension.
#' @export
#' @return A list consisting of: \item{X}{matrix, generated data}
#' \item{signals}{matrix, data without noise} \item{dims}{vector, dimensions
#' of subspaces} \item{factors}{matrix, columns of which span subspaces}
#' \item{s}{vector, true partition of variables}
#' @examples
#' sim.data <- data.simulation()
#' sim.data2 <- data.simulation(
#' n = 30, SNR = 2, K = 5, numb.vars = 20,
#' max.dim = 3, equal.dims = FALSE
#' )
data.simulation <- function(n = 100, SNR = 1, K = 10, numb.vars = 30, max.dim = 2,
min.dim = 1, equal.dims = TRUE) {
sigma <- 1 / SNR
# subspaces dimensions depend on equal.dims value
if (equal.dims) {
dims <- rep(max.dim, K)
} else {
dims <- sample(1:max.dim, K, replace = T)
}

X <- NULL
Y <- NULL
s <- NULL
factors <- NULL
for (j in 1:K) {
Z <- qr.Q(qr(replicate(dims[j], rnorm(n, 0, 1))))
coeff <- matrix(runif(dims[j] * numb.vars, 0.1, 1) * sign(runif(dims[j] *
numb.vars, -1, 1)), nrow = dims[j])
SIGNAL <- Z %*% coeff
SIGNAL <- scale(SIGNAL)
@@ -54,36 +56,38 @@ data.simulation <- function(n = 100, SNR = 1, K = 10, numb.vars = 30, max.dim =


#' Simulates subspace clustering data with shared factors
#'
#' Generates data for simulation with a low-rank subspace structure: variables
#' are clustered and each cluster has a low-rank representation. Factors that
#' span subspaces are shared between clusters.
#'
#' @inheritParams data.simulation
#' @param numb.factors An integer, number of factors from which subspaces basis
#' will be drawn.
#' @param separation.parameter A numeric, coefficients of variables in each
#' subspace basis are drawn from range [separation.parameter,1]
#' @export
#' @return A list consisting of: \item{X}{matrix, generated data}
#' \item{signals}{matrix, data without noise} \item{factors}{matrix, columns
#' of which span subspaces} \item{indices}{list of vectors, indices of factors
#' that span subspaces} \item{dims}{vector, dimensions of subspaces}
#' \item{s}{vector, true partition of variables}
#' @examples
#' sim.data <- data.simulation.factors()
#' sim.data2 <- data.simulation.factors(
#' n = 30, SNR = 2, K = 5, numb.vars = 20,
#' numb.factors = 10, max.dim = 3, equal.dims = FALSE, separation.parameter = 0.2
#' )
data.simulation.factors <- function(n = 100, SNR = 1, K = 10, numb.vars = 30, numb.factors = 10,
min.dim = 1, max.dim = 2, equal.dims = TRUE, separation.parameter = 0.1) {
sigma <- 1 / SNR
# subspaces dimensions depend on equal.dims value
if (equal.dims) {
dims <- rep(max.dim, K)
} else {
dims <- sample(min.dim:max.dim, K, replace = T)
}

factors <- scale(replicate(numb.factors, rnorm(n, 0, 1)))
X <- NULL
Y <- NULL
@@ -92,14 +96,16 @@ data.simulation.factors <- function(n = 100, SNR = 1, K = 10, numb.vars = 30, nu
for (j in 1:K) {
factors.indices[[j]] <- sample(numb.factors, dims[j], replace = FALSE)
Z <- factors[, factors.indices[[j]], drop = FALSE]
coeff <- matrix(runif(dims[j] * numb.vars, separation.parameter, 1) * sign(runif(dims[j] *
numb.vars, -1, 1)), nrow = dims[j])
SIGNAL <- Z %*% coeff
SIGNAL <- scale(SIGNAL)
Y <- cbind(Y, SIGNAL)
X <- cbind(X, SIGNAL + replicate(numb.vars, rnorm(n, 0, sigma)))
s <- c(s, rep(j, numb.vars))
}
return(list(
X = X, signals = Y, factors = factors, indices = factors.indices,
dims = dims, s = s
))
}
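
# Illustrative sketch (not from the original commit): unlike data.simulation(),
# the factors here are drawn from a common pool, so subspaces may share them.
# Field names follow the @return documentation above.
sim <- data.simulation.factors(n = 40, K = 3, numb.vars = 10, numb.factors = 5)
sim$indices # list of factor indices spanning each of the 3 subspaces
dim(sim$X)  # 40 rows (n) by 30 columns (K * numb.vars)
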
49 changes: 26 additions & 23 deletions R/integration.R
@@ -1,33 +1,34 @@
#' Computes integration and acontamination of the clustering
#'
#' Integration and acontamination are measures of the quality of a clustering
#' with a reference to a true partition. Let \eqn{X = (x_1, \ldots x_p)} be the
#' data set, \eqn{A} be a partition into clusters \eqn{A_1, \ldots A_n} (true
#' partition) and \eqn{B} be a partition into clusters \eqn{B_1, \ldots, B_m}.
#' Then for cluster \eqn{A_j} integration is equal to: \deqn{Int(A_j) =
#' \frac{max_{k = 1, \ldots, m} \# \{ i \in \{ 1, \ldots p \}: x_i \in A_j
#' \wedge x_i \in B_k \} }{\# A_j}} The \eqn{B_k} for which the value is
#' maximized is called the integrating cluster of \eqn{A_j}. Then the
#' integration for the whole clustering equals \eqn{Int(A,B) = \frac{1}{n}
#' \sum_{j=1}^n Int(A_j)}. The acontamination is defined by: \deqn{Acont(A_j) =
#' \frac{ \# \{ i \in \{ 1, \ldots p \}: x_i \in A_j \wedge x_i \in B_k \} }{\#
#' B_k}} where \eqn{B_k} is the integrating cluster for \eqn{A_j}. Then the
#' acontamination for the whole dataset is \eqn{Acont(A,B) = \frac{1}{n}
#' \sum_{j=1}^n Acont(A_j)}
#'
#' @param group A vector, first partition.
#' @param true_group A vector, second (reference) partition.
#' @references {M. Sołtys. Metody analizy skupień. Master’s thesis, Wrocław
#' University of Technology, 2010}
#' @export
#' @return An array containing values of integration and acontamination.
#' @examples
#' \donttest{
#' sim.data <- data.simulation(n = 20, SNR = 1, K = 2, numb.vars = 50, max.dim = 2)
#' true_segmentation <- rep(1:2, each = 50)
#' mlcc.fit <- mlcc.reps(sim.data$X, numb.clusters = 2, max.dim = 2, numb.cores = 1)
#' integration(mlcc.fit$segmentation, true_segmentation)
#' }
#'
integration <- function(group, true_group) {
n <- length(group)
K1 <- max(unique(group))
@@ -37,14 +38,16 @@ integration <- function(group, true_group) {
}
integrationMatrix <- matrix(0, nrow = K1, ncol = K2)
for (i in 1:n) {
integrationMatrix[group[i], true_group[i]] <- integrationMatrix[
group[i],
true_group[i]
] + 1
}
clusters <- apply(integrationMatrix, 2, max)
cluster_indices <- apply(integrationMatrix, 2, which.max)
sizes_true <- apply(integrationMatrix, 2, sum)
sizes_group <- apply(integrationMatrix, 1, sum)
int <- clusters / sizes_true
acont <- clusters / sizes_group[cluster_indices]
return(c(mean(int), mean(acont)))
}
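
# Illustrative sketch (not from the original commit): a small worked example of
# the two measures. True partition: {x1, x2, x3} and {x4}; found partition:
# {x1, x2} and {x3, x4}. Integration is 2/3 for the first true cluster and 1
# for the second; acontamination is 1 and 1/2, so the means are roughly 0.83
# and 0.75.
found <- c(1, 1, 2, 2)
truth <- c(1, 1, 1, 2)
integration(group = found, true_group = truth)
# expected: approximately 0.8333333 0.7500000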