apply code style
psobczyk committed Mar 8, 2020
1 parent 1a56409 commit ca8e0c7
Showing 22 changed files with 504 additions and 435 deletions.
70 changes: 38 additions & 32 deletions R/auxiliary.functions.R
@@ -1,21 +1,21 @@
#' mBIC for subspace clustering
#'
#' Computes the value of the modified Bayesian Information Criterion (mBIC) for
#' a given data set partition and clusters' dimensionalities. In each cluster we
#' assume that variables are spanned by a few factors. Under maximum
#' likelihood, those factors turn out to be principal components.
#' Additionally, by default it uses an informative prior distribution on models.
#'
#' @param X A matrix with only quantitative variables.
#' @param segmentation A vector, segmentation for which likelihood is computed.
#' Clusters numbers should be from range [1, numb.clusters].
#' @param dims A vector of integers, dimensions of subspaces. Number of
#' principal components (fixed or chosen by PESEL criterion) that span each
#' subspace.
#' @param numb.clusters An integer, number of clusters.
#' @param max.dim An integer, upper bound for allowed dimension of a subspace.
#' @param flat.prior A boolean, if TRUE (default is FALSE) then flat prior on
#' models is used.
#' @keywords internal
#' @return Value of mBIC
@@ -26,15 +26,17 @@ cluster.pca.BIC <- function(X, segmentation, dims, numb.clusters, max.dim, flat.
}
D <- dim(X)[1]
p <- dim(X)[2]

formula <- rep(0, numb.clusters)
for (k in 1:numb.clusters) {
# one cluster
dimk <- dims[k]
Xk <- X[, segmentation == k, drop = F]
if (dim(Xk)[2] > dimk && dim(Xk)[1] > dimk) {
formula[k] <- pesel(
X = Xk, npc.min = dimk, npc.max = dimk, scale = FALSE,
method = "heterogenous"
)$vals[1]
} else {
warning("The dimensionality of the cluster was greater or equal than max(number of observation, number of variables) in the cluster.
Ignoring the cluster during mBIC calculation")
@@ -52,23 +54,23 @@ cluster.pca.BIC <- function(X, segmentation, dims, numb.clusters, max.dim, flat.
}
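
# Illustrative sketch (not from the original commit): a hypothetical call to the
# internal criterion documented above. Assumes the package is loaded or sourced
# so that cluster.pca.BIC() and data.simulation() are available and the pesel
# package is installed.
sim <- data.simulation(n = 50, SNR = 1, K = 2, numb.vars = 10, max.dim = 2)
mbic <- cluster.pca.BIC(
  X = sim$X, segmentation = sim$s, dims = sim$dims,
  numb.clusters = 2, max.dim = 2, flat.prior = FALSE
)
mbic # a single numeric value of the criterion for this partition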

#' Chooses a subspace for a variable
#'
#' Selects a subspace closest to a given variable. To select the subspace, the method
#' considers (for every subspace) a subset of its principal components and tries
#' to fit a linear model with the variable as the response. Then the method chooses
#' the subspace for which the value of BIC was the highest.
#'
#' @param variable A variable to be assigned.
#' @param pcas Orthogonal basis for each of the subspaces.
#' @param number.clusters Number of subspaces (clusters).
#' @param show.warnings A boolean - if set to TRUE all warnings are displayed, default value is FALSE.
#' @param common_sigma A boolean - if set to FALSE, a separate sigma is estimated for each cluster,
#' default value is TRUE.
#' @keywords internal
#' @return index Number of most similar subspace to variable.
choose.cluster.BIC <- function(variable, pcas, number.clusters, show.warnings = FALSE, common_sigma = TRUE) {
BICs <- NULL
if (common_sigma) {
res <- fastLmPure(as.matrix(Matrix::bdiag(pcas)), rep(variable, number.clusters), method = 0L)$residuals
n <- length(variable)
sigma.hat <- sqrt(sum(res^2) / (n * number.clusters))
@@ -79,19 +81,19 @@ choose.cluster.BIC <- function(variable, pcas, number.clusters, show.warnings =
nparams <- ncol(pcas[[i]])
res.part <- res[((i - 1) * n + 1):(i * n)]
loglik <- sum(dnorm(res.part, 0, sigma.hat, log = T))
BICs[i] <- loglik - nparams * log(n) / 2
}
} else {
for (i in 1:number.clusters) {
nparams <- ncol(pcas[[i]])
n <- length(variable)
res <- fastLmPure(pcas[[i]], variable, method = 0L)$residuals
sigma.hat <- sqrt(sum(res^2) / n)
if (sigma.hat < 1e-15 && show.warnings) {
warning("In function choose.cluster.BIC: estimated value of noise in cluster is <1e-15. It might corrupt the result.")
}
loglik <- sum(dnorm(res, 0, sigma.hat, log = T))
BICs[i] <- loglik - nparams * log(n) / 2
}
}
which.max(BICs)
@@ -100,7 +102,7 @@ choose.cluster.BIC <- function(variable, pcas, number.clusters, show.warnings =
#' Calculates principal components for every cluster
#'
#' For given segmentation this function estimates dimensionality of each cluster (or chooses fixed dimensionality)
#' and for each cluster calculates a number of principal components equal to this dimensionality
#'
#' @param X A data matrix.
#' @param segmentation A vector, segmentation of variables into clusters.
@@ -118,8 +120,10 @@ calculate.pcas <- function(X, segmentation, number.clusters, max.subspace.dim, e
a <- summary(prcomp(x = Xk))
if (estimate.dimensions) {
max.dim <- min(max.subspace.dim, floor(sqrt(sub.dim[2])), sub.dim[1])
cut <- max(1, pesel(
X = Xk, npc.min = 1, npc.max = max.dim, scale = FALSE,
method = "heterogenous"
)$nPCs)
} else {
cut <- min(max.subspace.dim, floor(sqrt(sub.dim[2])), sub.dim[1])
}
@@ -132,7 +136,7 @@ calculate.pcas <- function(X, segmentation, number.clusters, max.subspace.dim, e
}
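
# Illustrative sketch (not from the original commit): the bases produced by
# calculate.pcas() are what choose.cluster.BIC() scores a single variable
# against. Assumes the package is loaded/sourced and that calculate.pcas()
# returns a list with one matrix of principal components per cluster, as its
# use elsewhere in this file suggests.
sim <- data.simulation(n = 50, SNR = 1, K = 3, numb.vars = 15, max.dim = 2)
bases <- calculate.pcas(
  X = sim$X, segmentation = sim$s, number.clusters = 3,
  max.subspace.dim = 4, estimate.dimensions = TRUE
)
# assign the first variable to the closest subspace by BIC
choose.cluster.BIC(sim$X[, 1], bases, number.clusters = 3)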

#' Plot mlcc.fit class object
#'
#' @param x mlcc.fit class object
#' @param ... Further arguments to be passed to or from other methods. They are ignored in this function.
#' @export
@@ -145,7 +149,7 @@ plot.mlcc.fit <- function(x, ...) {
}

#' Print mlcc.fit class object
#'
#' @param x mlcc.fit class object
#' @param ... Further arguments to be passed to or from other methods. They are ignored in this function.
#' @export
@@ -159,7 +163,7 @@ print.mlcc.fit <- function(x, ...) {
}

#' Print mlcc.reps.fit class object
#'
#' @param x mlcc.reps.fit class object
#' @param ... Further arguments to be passed to or from other methods. They are
#' ignored in this function.
@@ -174,7 +178,7 @@ print.mlcc.reps.fit <- function(x, ...) {
}

#' Print clusters obtained from MLCC
#'
#' @param data The original data set.
#' @param segmentation A vector, segmentation of variables into clusters.
#' @export
@@ -184,8 +188,10 @@ show.clusters <- function(data, segmentation) {
clusters <- lapply(1:max(segmentation), function(i) {
colnames_in_cluster <- colnames(data)[segmentation == i]
current_cluster_size <- length(colnames_in_cluster)
c(
colnames_in_cluster,
rep("-", times = max_cluster_size - current_cluster_size)
)
})
clusters <- as.data.frame(clusters)
colnames(clusters) <- paste("cluster", 1:max(segmentation), sep = "_")
68 changes: 37 additions & 31 deletions R/data.simulation.R
@@ -1,46 +1,48 @@
#' Simulates subspace clustering data
#'
#' Generates data for simulation with a low-rank subspace structure: variables
#' are clustered and each cluster has a low-rank representation. Factors that
#' span subspaces are not shared between clusters.
#'
#' @param n An integer, number of individuals.
#' @param SNR A numeric, signal to noise ratio measured as variance of the
#' variable, element of a subspace, to the variance of noise.
#' @param K An integer, number of subspaces.
#' @param numb.vars An integer, number of variables in each subspace.
#' @param max.dim An integer, if equal.dims is TRUE then max.dim is dimension of
#' each subspace. If equal.dims is FALSE then subspace dimensions are drawn
#' from uniform distribution on [min.dim,max.dim].
#' @param min.dim An integer, minimal dimension of a subspace.
#' @param equal.dims A boolean, if TRUE (value set by default) all clusters are
#' of the same dimension.
#' @export
#' @return A list consisting of: \item{X}{matrix, generated data}
#' \item{signals}{matrix, data without noise} \item{dims}{vector, dimensions
#' of subspaces} \item{factors}{matrix, columns of which span subspaces}
#' \item{s}{vector, true partition of variables}
#' @examples
#' sim.data <- data.simulation()
#' sim.data2 <- data.simulation(
#' n = 30, SNR = 2, K = 5, numb.vars = 20,
#' max.dim = 3, equal.dims = FALSE
#' )
data.simulation <- function(n = 100, SNR = 1, K = 10, numb.vars = 30, max.dim = 2,
min.dim = 1, equal.dims = TRUE) {
sigma <- 1 / SNR
# subspaces dimensions depend on equal.dims value
if (equal.dims) {
dims <- rep(max.dim, K)
} else {
dims <- sample(1:max.dim, K, replace = T)
}

X <- NULL
Y <- NULL
s <- NULL
factors <- NULL
for (j in 1:K) {
Z <- qr.Q(qr(replicate(dims[j], rnorm(n, 0, 1))))
coeff <- matrix(runif(dims[j] * numb.vars, 0.1, 1) * sign(runif(dims[j] *
numb.vars, -1, 1)), nrow = dims[j])
SIGNAL <- Z %*% coeff
SIGNAL <- scale(SIGNAL)
@@ -54,36 +56,38 @@ data.simulation <- function(n = 100, SNR = 1, K = 10, numb.vars = 30, max.dim =


#' Simulates subspace clustering data with shared factors
#'
#' Generates data for simulation with a low-rank subspace structure: variables
#' are clustered and each cluster has a low-rank representation. Factors that
#' span subspaces are shared between clusters.
#'
#' @inheritParams data.simulation
#' @param numb.factors An integer, number of factors from which subspaces basis
#' will be drawn.
#' @param separation.parameter A numeric, coefficients of variables in each
#' subspace basis are drawn from range [separation.parameter,1]
#' @export
#' @return A list consisting of: \item{X}{matrix, generated data}
#' \item{signals}{matrix, data without noise} \item{factors}{matrix, columns
#' of which span subspaces} \item{indices}{list of vectors, indices of factors
#' that span subspaces} \item{dims}{vector, dimensions of subspaces}
#' \item{s}{vector, true partition of variables}
#' @examples
#' sim.data <- data.simulation.factors()
#' sim.data2 <- data.simulation.factors(
#' n = 30, SNR = 2, K = 5, numb.vars = 20,
#' numb.factors = 10, max.dim = 3, equal.dims = FALSE, separation.parameter = 0.2
#' )
data.simulation.factors <- function(n = 100, SNR = 1, K = 10, numb.vars = 30, numb.factors = 10,
min.dim = 1, max.dim = 2, equal.dims = TRUE, separation.parameter = 0.1) {
sigma <- 1 / SNR
# subspaces dimensions depend on equal.dims value
if (equal.dims) {
dims <- rep(max.dim, K)
} else {
dims <- sample(min.dim:max.dim, K, replace = T)
}

factors <- scale(replicate(numb.factors, rnorm(n, 0, 1)))
X <- NULL
Y <- NULL
@@ -92,14 +96,16 @@ data.simulation.factors <- function(n = 100, SNR = 1, K = 10, numb.vars = 30, nu
for (j in 1:K) {
factors.indices[[j]] <- sample(numb.factors, dims[j], replace = FALSE)
Z <- factors[, factors.indices[[j]], drop = FALSE]
coeff <- matrix(runif(dims[j] * numb.vars, separation.parameter, 1) * sign(runif(dims[j] *
numb.vars, -1, 1)), nrow = dims[j])
SIGNAL <- Z %*% coeff
SIGNAL <- scale(SIGNAL)
Y <- cbind(Y, SIGNAL)
X <- cbind(X, SIGNAL + replicate(numb.vars, rnorm(n, 0, sigma)))
s <- c(s, rep(j, numb.vars))
}
return(list(
X = X, signals = Y, factors = factors, indices = factors.indices,
dims = dims, s = s
))
}
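
# Illustrative sketch (not from the original commit): unlike data.simulation(),
# the factors here are drawn from a common pool, so subspaces may share them.
# Field names follow the @return documentation above.
sim <- data.simulation.factors(n = 40, K = 3, numb.vars = 10, numb.factors = 5)
sim$indices # list of factor indices spanning each of the 3 subspaces
dim(sim$X)  # 40 rows (n) by 30 columns (K * numb.vars)
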
49 changes: 26 additions & 23 deletions R/integration.R
@@ -1,33 +1,34 @@
#' Computes integration and acontamination of the clustering
#'
#' Integration and acontamination are measures of the quality of a clustering
#' with a reference to a true partition. Let \eqn{X = (x_1, \ldots x_p)} be the
#' data set, \eqn{A} be a partition into clusters \eqn{A_1, \ldots A_n} (true
#' partition) and \eqn{B} be a partition into clusters \eqn{B_1, \ldots, B_m}.
#' Then for cluster \eqn{A_j} integration is equal to: \deqn{Int(A_j) =
#' \frac{max_{k = 1, \ldots, m} \# \{ i \in \{ 1, \ldots p \}: x_i \in A_j
#' \wedge x_i \in B_k \} }{\# A_j}} The \eqn{B_k} for which the value is
#' maximized is called the integrating cluster of \eqn{A_j}. Then the
#' integration for the whole clustering equals \eqn{Int(A,B) = \frac{1}{n}
#' \sum_{j=1}^n Int(A_j)}. The acontamination is defined by: \deqn{Acont(A_j) =
#' \frac{ \# \{ i \in \{ 1, \ldots p \}: x_i \in A_j \wedge x_i \in B_k \} }{\#
#' B_k}} where \eqn{B_k} is the integrating cluster for \eqn{A_j}. Then the
#' acontamination for the whole dataset is \eqn{Acont(A,B) = \frac{1}{n}
#' \sum_{j=1}^n Acont(A_j)}
#'
#' @param group A vector, first partition.
#' @param true_group A vector, second (reference) partition.
#' @references {M. Sołtys. Metody analizy skupień. Master’s thesis, Wrocław
#' University of Technology, 2010}
#' @export
#' @return An array containing values of integration and acontamination.
#' @examples
#' \donttest{
#' sim.data <- data.simulation(n = 20, SNR = 1, K = 2, numb.vars = 50, max.dim = 2)
#' true_segmentation <- rep(1:2, each = 50)
#' mlcc.fit <- mlcc.reps(sim.data$X, numb.clusters = 2, max.dim = 2, numb.cores = 1)
#' integration(mlcc.fit$segmentation, true_segmentation)
#' }
#'
integration <- function(group, true_group) {
n <- length(group)
K1 <- max(unique(group))
@@ -37,14 +38,16 @@ integration <- function(group, true_group) {
}
integrationMatrix <- matrix(0, nrow = K1, ncol = K2)
for (i in 1:n) {
integrationMatrix[group[i], true_group[i]] <- integrationMatrix[
group[i],
true_group[i]
] + 1
}
clusters <- apply(integrationMatrix, 2, max)
cluster_indices <- apply(integrationMatrix, 2, which.max)
sizes_true <- apply(integrationMatrix, 2, sum)
sizes_group <- apply(integrationMatrix, 1, sum)
int <- clusters / sizes_true
acont <- clusters / sizes_group[cluster_indices]
return(c(mean(int), mean(acont)))
}
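
# Illustrative sketch (not from the original commit): a small worked example of
# the two measures. True partition: {x1, x2, x3} and {x4}; found partition:
# {x1, x2} and {x3, x4}. Integration is 2/3 for the first true cluster and 1
# for the second; acontamination is 1 and 1/2, so the means are roughly 0.83
# and 0.75.
found <- c(1, 1, 2, 2)
truth <- c(1, 1, 1, 2)
integration(group = found, true_group = truth)
# expected: approximately 0.8333333 0.7500000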