-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathintegration.R
executable file
·53 lines (53 loc) · 2.37 KB
/
integration.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#' Computes integration and acontamination of the clustering
#'
#' Integration and acontamination are measures of the quality of a clustering
#' with a reference to a true partition. Let \eqn{X = (x_1, \ldots, x_p)} be
#' the data set, \eqn{A} be a partition into clusters \eqn{A_1, \ldots, A_n}
#' (true partition) and \eqn{B} be a partition into clusters
#' \eqn{B_1, \ldots, B_m}. Then for cluster \eqn{A_j} integration is equal to:
#' \deqn{Int(A_j) =
#' \frac{\max_{k = 1, \ldots, m} \# \{ i \in \{ 1, \ldots, p \}: x_i \in A_j
#' \wedge x_i \in B_k \} }{\# A_j}} The \eqn{B_k} for which the value is
#' maximized is called the integrating cluster of \eqn{A_j}. Then the
#' integration for the whole clustering is \eqn{Int(A,B) = \frac{1}{n}
#' \sum_{j=1}^n Int(A_j)}. The acontamination is defined by: \deqn{Acont(A_j) =
#' \frac{ \# \{ i \in \{ 1, \ldots, p \}: x_i \in A_j \wedge x_i \in B_k \} }{\#
#' B_k}} where \eqn{B_k} is the integrating cluster for \eqn{A_j}. Then the
#' acontamination for the whole dataset is \eqn{Acont(A,B) = \frac{1}{n}
#' \sum_{j=1}^n Acont(A_j)}.
#'
#' Cluster labels are only used to tell clusters apart, so they may be any
#' values accepted by \code{\link[base]{table}} (integers, characters,
#' factors) and do not have to be consecutive. Clusters that contain no
#' observations (e.g. unused factor levels) are ignored. If several clusters
#' \eqn{B_k} attain the maximum for some \eqn{A_j}, the first one in sorted
#' label order is taken as the integrating cluster.
#'
#' @param group A vector of cluster labels, the assessed partition (\eqn{B}).
#' @param true_group A vector of cluster labels of the same length as
#'   \code{group}, the reference (true) partition (\eqn{A}).
#' @references {M. Sołtys. Metody analizy skupień. Master’s thesis, Wrocław
#' University of Technology, 2010}
#' @export
#' @return A numeric vector of length 2 containing the values of integration
#'   and acontamination (in that order). An error is raised if the partitions
#'   have different lengths, are empty or contain missing values.
#' @examples
#' \donttest{
#' sim.data <- data.simulation(n = 20, SNR = 1, K = 2, numb.vars = 50, max.dim = 2)
#' true_segmentation <- rep(1:2, each = 50)
#' mlcc.fit <- mlcc.reps(sim.data$X, numb.clusters = 2, max.dim = 2, numb.cores = 1)
#' integration(mlcc.fit$segmentation, true_segmentation)
#' }
#'
integration <- function(group, true_group) {
  # Validate before doing any work so that bad input fails with a clear message.
  if (length(group) != length(true_group)) {
    stop("Partitions are of different lengths")
  }
  if (length(group) == 0L) {
    stop("Partitions must not be empty")
  }
  if (anyNA(group) || anyNA(true_group)) {
    stop("Partitions must not contain missing values")
  }

  # Contingency matrix: rows are clusters of `group`, columns are clusters of
  # `true_group`. table() keys on the labels that actually occur, so labels
  # need not be the consecutive integers 1..K.
  counts <- unclass(table(group, true_group))
  # Factor inputs may carry unused levels; drop the resulting empty clusters,
  # which would otherwise produce 0 / 0 below.
  counts <- counts[rowSums(counts) > 0, colSums(counts) > 0, drop = FALSE]

  sizes_true <- colSums(counts)
  sizes_group <- rowSums(counts)

  # For every true cluster: row index of its integrating cluster (first
  # maximum on ties) and the number of observations the two share.
  integrating <- unname(apply(counts, 2, which.max))
  overlap <- counts[cbind(integrating, seq_len(ncol(counts)))]

  int <- overlap / sizes_true
  acont <- overlap / sizes_group[integrating]
  c(mean(int), mean(acont))
}