# test1.R
library(mvnfast)   # loaded but not used below
library(ggplot2)
library(plotly)    # only needed for the commented-out 3-D plot below
library(tidyverse) # pipes, gather(), group_by(), summarise()
library(clues)     # adjustedRand()
library(psych)     # fisherz()
set.seed(2018)
# Covariate matrix is 1000 x 50
x <- matrix(rnorm(50 * 1000), nrow = 1000, ncol = 50)
# Shifts in the first 10 covariates, crossed with independent shifts in the
# next 20, give 4 clusters in X
x[1:500, 1:10] <- x[1:500, 1:10] + 1
x[501:1000, 1:10] <- x[501:1000, 1:10] - 1
cur.ndx <- sample(1:1000)
x[cur.ndx[1:500], 11:30] <- x[cur.ndx[1:500], 11:30] + 2
x[cur.ndx[501:1000], 11:30] <- x[cur.ndx[501:1000], 11:30] - 2
# Ys are clustered into 2 groups, independent of X
# (Why aren't they a function of X? A sketch of one alternative follows the
# commented-out plot below.)
y <- data.frame(y1 = c(rnorm(500, 1), rnorm(500)),
                y2 = c(rnorm(500, 1), rnorm(500)),
                y3 = c(rnorm(500, 1), rnorm(500)))
true.clus <- rep(c(1, 2), each = 500)
# cbind(y, true.clus) %>%
#   data.frame() %>%
#   plot_ly(x = ~y1, y = ~y2, z = ~y3,
#           color = ~factor(true.clus),
#           colors = c('#BF382A', '#0C4B8E')) %>%
#   add_markers()
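# A hedged sketch (not run) answering the question above: to make Y a
# function of X, draw each y around a mean driven by the first block of
# covariates. The 0.5 coefficient and use of columns 1:10 are arbitrary
# choices for illustration, not part of the original design.
# mu <- 0.5 * rowMeans(x[, 1:10])
# y <- data.frame(y1 = rnorm(1000, mu),
#                 y2 = rnorm(1000, mu),
#                 y3 = rnorm(1000, mu))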
# Get predicted Ys: fitted values from regressing each y on all 50 covariates
yhat1 <- fitted(lm(y$y1 ~ x))
yhat2 <- fitted(lm(y$y2 ~ x))
yhat3 <- fitted(lm(y$y3 ~ x))
yhat <- cbind(yhat1, yhat2, yhat3)
# Get cluster assignments: Euclidean distance on scaled columns,
# complete-linkage hclust (the defaults), cut at k = 2
fitted.clus <- apply(yhat, 2, scale) %>%
  dist() %>%
  hclust() %>%
  cutree(k = 2)
x.clus <- apply(x, 2, scale) %>%
  dist() %>%
  hclust() %>%
  cutree(k = 2)
y.clus <- apply(y, 2, scale) %>%
  dist() %>%
  hclust() %>%
  cutree(k = 2)
# Now for the crazy stuff.
# First let's try clustering on a bunch of distribution estimates.
# We already have E[Y]; estimate the conditional variances by regressing
# the squared residuals on x
var.y1 <- fitted(lm((y$y1 - yhat1)^2 ~ x))
var.y2 <- fitted(lm((y$y2 - yhat2)^2 ~ x))
var.y3 <- fitted(lm((y$y3 - yhat3)^2 ~ x))
# Set a lower bound so we don't get negative or tiny variance estimates
var <- data.frame(var.y1, var.y2, var.y3) %>%
  apply(c(1, 2), function(v) max(v, 0.01)) %>%
  as.data.frame()
# Estimate the conditional covariances the same way, from residual
# cross-products
cov.y1.y2 <- fitted(lm((y$y1 - yhat1) * (y$y2 - yhat2) ~ x))
cov.y1.y3 <- fitted(lm((y$y1 - yhat1) * (y$y3 - yhat3) ~ x))
cov.y2.y3 <- fitted(lm((y$y2 - yhat2) * (y$y3 - yhat3) ~ x))
# Convert covariance into correlation
corr.y1.y2 <- cov.y1.y2 / sqrt(var$var.y1 * var$var.y2)
corr.y1.y3 <- cov.y1.y3 / sqrt(var$var.y1 * var$var.y3)
corr.y2.y3 <- cov.y2.y3 / sqrt(var$var.y2 * var$var.y3)
# 13 of the estimated correlations fall outside [-0.9999, 0.9999]
# (fisherz below blows up at +/-1); just force them back into the interval
corr <- data.frame(corr.y1.y2, corr.y1.y3, corr.y2.y3) %>%
  apply(c(1, 2), function(r) pmin(pmax(r, -0.9999), 0.9999)) %>%
  as.data.frame()
# For E[Y], we made it "unitless" by dividing by its sd;
# we similarly want to make the variance and correlation unit-free.
# Put all our estimated variables together
U <- data.frame(E.Y1 = yhat1 %>% scale(),
                E.Y2 = yhat2 %>% scale(),
                E.Y3 = yhat3 %>% scale(),
                Var.Y1 = sqrt(var$var.y1) / sd(yhat1),
                Var.Y2 = sqrt(var$var.y2) / sd(yhat2),
                Var.Y3 = sqrt(var$var.y3) / sd(yhat3),
                Corr.Y1.Y2 = fisherz(corr$corr.y1.y2),
                Corr.Y1.Y3 = fisherz(corr$corr.y1.y3),
                Corr.Y2.Y3 = fisherz(corr$corr.y2.y3))
U.clus <- cutree(hclust(dist(U)), k = 2)
# Within each recovered cluster, the y means should be near 1 and 0
# (the true cluster means) if a method recovered the truth
data.frame(fitted.clus, x.clus, y.clus, U.clus, y) %>%
  gather(Method, Cluster.Assignment, -y1, -y2, -y3) %>%
  group_by(Method, Cluster.Assignment) %>%
  summarise(m.y1 = mean(y1),
            m.y2 = mean(y2),
            m.y3 = mean(y3))
# Misclassification rates for the y-hat, x, y, and U clusterings
c(sum(fitted.clus != true.clus),
  sum(x.clus != true.clus),
  sum(y.clus != true.clus),
  sum(U.clus != true.clus)) / nrow(y)
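# Caveat: cutree's 1/2 labels are arbitrary, so a raw mismatch rate near 1
# just means the labels came out flipped. A hedged, label-invariant check,
# assuming adjustedRand() from the clues package loaded above behaves as
# documented (1 = perfect agreement, ~0 = chance):
sapply(list(fitted = fitted.clus, x = x.clus, y = y.clus, U = U.clus),
       function(cl) adjustedRand(cl, true.clus, randMethod = "HA"))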
# The U clustering was what we had planned initially.
# Above is version 1 of the kooky idea.
# For this simulated dataset, it clearly does not perform that well.
# Kosorok thought we had lost information in the modelling,
# so version 2 of the kooky idea was born:
# find some way to incorporate both Y and U, with some kind of weight.
# I have not tried this particular way of choosing w before
# w: ratio of cophenetic correlations, i.e. how faithfully a hierarchical
# clustering of U vs. of y preserves the original pairwise distances
w <- cor(dist(U), cophenetic(hclust(dist(U)))) /
  cor(dist(y), cophenetic(hclust(dist(y))))
D <- dist(y) + dist(U)^w
UY.clus <- cutree(hclust(D), k = 2)
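# A hedged alternative (not run): a convex combination instead of an
# exponent, with each distance rescaled so neither dominates; alpha is a
# made-up tuning knob here, not part of the original plan.
# alpha <- 0.5
# D.alt <- (1 - alpha) * dist(y) / mean(dist(y)) +
#   alpha * dist(U) / mean(dist(U))
# UY.clus.alt <- cutree(hclust(D.alt), k = 2)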
# Again, recovered clusters should have y means near 1 and 0
data.frame(fitted.clus, x.clus, y.clus, U.clus, UY.clus, y) %>%
  gather(Method, Cluster.Assignment, -y1, -y2, -y3) %>%
  group_by(Method, Cluster.Assignment) %>%
  summarise(m.y1 = mean(y1),
            m.y2 = mean(y2),
            m.y3 = mean(y3))
# Misclassification rates for the y-hat, x, y, U, and weighted U+y clusterings
c(sum(fitted.clus != true.clus),
  sum(x.clus != true.clus),
  sum(y.clus != true.clus),
  sum(U.clus != true.clus),
  sum(UY.clus != true.clus)) / nrow(y)
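# Same hedged, label-invariant check for the weighted version (see above):
adjustedRand(UY.clus, true.clus, randMethod = "HA")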
# It doesn't show any improvement over clustering on U alone in this
# simulation study either