-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmy-pca.R
238 lines (192 loc) · 7.44 KB
/
my-pca.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
# Michelle Li
# pca
# last updated: July 20, 2018
# tried 2 PCA packages, working with Nestorowa and Paul data
# added handwritten version
library(rgl)
#### PREPARING DATA ----------------------------------------------
## NOTES ON OUR DATA:
## Nestorowa data: 1645 cells x 4290 genes
## Paul data: 8716 genes x 2730 cells (need transpose for PCA)
## Grover data: 46175 genes x 135 cells (need transpose for PCA)
## Rockne data: 12674 genes x 101 cells (need transpose for PCA)
## PCA DIMENSIONS:
## cells = rows, genes = columns
## read in data and convert to matrix (Nestorowa, Grover, non-.RData file)
rawdata <- read.delim("~/GitHub/cell-diff-reu-2018/data/coordinates_gene_counts_flow_cytometry.txt", row.names=1)
# rawdata <- read.csv("~/GitHub/cell-diff-reu-2018/data/grover_expression.txt", row.names=1, sep="")
cleandata <- na.omit(rawdata) # remove rows with NA
data <- cleandata[,14:ncol(cleandata)] # choose relevant gene exp data
data <- cleandata
data <- as.matrix(data)
## load data (Paul, .RData file)
load("~/GitHub/cell-diff-reu-2018/data/Paul_Cell_MARSseq_GSE72857.RData")
dataname <- "Paul Data" # enter title of data
## create factor for grouping
# rownames <- row.names(data)
# groups <- as.character(rownames)
# groups[grepl("HSPC",groups)] <- "HSPC" # replace all labels with hspc
# groups[grepl("LT",groups)] <- "LT.HSC" # replace with lt.hsc
# groups[grepl("Prog",groups)] <- "PROG" # same for prog
# groupsf <- factor(groups)
groupsf <- factor(cluster.id) # paul data
nclusters <- nlevels(groupsf) # number of groups to cluster
colors <- rainbow(nclusters) # choose colors
################### Handwritten PCA ##########################
## refer to above to choose whether you need to transpose your data or not
# data <- t(data) # may need to take transpose
## center data
M1centered <- scale(data, center=TRUE, scale=FALSE) # subtracts column means from each column
#### EIG/SVD ------------------------------------------------
## NOTE: Eig has not been looked at in a long time
## testing covariance
# C <- cov(M1centered) # returns ncol x ncol
# eigC <- eigen(C)
# eigvectors <- eigC$vectors
## eig
# C <- cov(M1centered) # will return a cov matrix, ncol x ncol
# eig <- eigen(C)
# eigvalues <- eig$values[1:3]
# V <- eig$vectors[,1:3] # V=columns are the eig vectors of Gram matrix
# V <- -V # by default, eig vectors in R point in the negative direction. Multiply by -1 to use the positive-pointing vector
# U1eig <- M1centered %*% V # U: columns are the eig vectors of original covariance matrix
# Ueig <- scale(U1eig, center=FALSE, scale=TRUE) # normalize U
# eig: U (eig vectors of original covariance matrix) = M1centered * V (eig vectors of fake covariance matrix (Gram Matrix))
# U from eig is a scale of the U from svd
# unclear on the exact relationship between U from eig and U from svd
## svd
svd <- svd(M1centered, nu=3, nv=3) # only first 3 left/right singular vectors for efficiency
U <- svd$u # dim: nrowx3
V <- svd$v # dim: ncolx3
## Note: U = data %*% V %*% S^-1
## Calculate PC scores
# weights <- t(M1centered) %*% U # dim: ncolx3
scores <- M1centered %*% V # dim: nrowx3
#### PLOTTING -------------------------------------------------
# par(mar = c(5.1,4.1,4.1,8.1), xpd = "TRUE") # use if you need extra space in margins for legend
## plot scores 2d
plot(x=scores[,1], y=scores[,2],
xlab="PC1 scores", ylab="PC2 scores",
col = colors[groupsf],
pch=20,
main = paste("PCA Scores (Handwritten),", dataname))
legend("bottomright", inset = c(0,0),
legend = levels(groupsf),
pch = 20,
col = colors,
ncol=3, cex=0.4, pt.cex=1)
## plot scores 3d
plot3d(x=scores[,1], y=scores[,2], z=scores[,3],
xlab="PC1 scores", ylab="PC2 scores", zlab="PC3 scores",
col = colors[groupsf],
pch=20,
main = paste("PCA Scores (Handwritten),", dataname))
legend3d("bottomright", inset=c(0.2,0.2),
legend = levels(groupsf),
pch = 20,
col = colors,
ncol=3, cex=1)
## plot PCs 2d (eigenvectors of covariance matrix)
plot(x=V[,1], y=V[,2],
xlab="PC1", ylab="PC2",
col = colors[groupsf],
pch=20,
main = paste("Principal Components (Handwritten PCA),", dataname))
legend("bottomleft", inset = c(0,0),
legend = levels(groupsf),
pch = 20,
col = colors,
ncol=3, cex=0.4, pt.cex=1)
## plot PCs 3d
plot3d(x=V[,1], y=V[,2], z=V[,3],
xlab="PC1", ylab="PC2", zlab="PC3",
col = colors[groupsf],
pch=20,
main = paste("Principal Components (Handwritten PCA),", dataname))
legend3d("bottomright", inset=c(0.2,0.2),
legend = levels(groupsf),
pch = 20,
col = colors,
ncol=3, cex=1)
################### PCA Packages #######################
#### PCA with function prcomp, uses svd -----------------------
## run pca using prcomp
pca1 <- prcomp(data, center=TRUE, scale = TRUE)
rotation1 <- pca1$rotation # columns are the eigenvectors ncol x ncol (# cells)
x1 <- pca1$x # x = rotated data (centered and scaled if requested) %*% rotation matrix
## rotation1 is equivalent to V, x1 is equivalent to scores
## plot
par(mar = c(5,5,5,5), xpd = "TRUE") # add extra space to margins for legend
## plot PCs (eigenvectors)
## 2d
plot(x=rotation1[,1], y=rotation1[,2],
xlab="PC1", ylab="PC2",
col = colors[groupsf],
pch=20,
main = paste("Principal Components (prcomp), ", dataname))
legend("bottomright", inset = c(0,0),
legend = levels(groupsf),
pch = 20,
col = colors,
ncol=1, cex=0.4, pt.cex=1)
## 3d
plot3d(x=rotation1[,1], y=rotation1[,2], z=rotation1[,3],
xlab="PC1", ylab="PC2", zlab="PC3",
col = colors[groupsf],
pch=20,
main = paste("Principal Components (prcomp), ", dataname))
legend3d("bottomright", inset = c(0,0),
legend = levels(groupfs),
pch = 20,
col = colors,
ncol=1, cex=1)
## plot scores
## 2d
plot(x=x1[,1], y=x1[,2],
xlab="PC1 scores", ylab="PC2 scores",
col = colors[groupsf],
pch=20,
main = paste("PCA (prcomp),", dataname))
legend("bottomright", inset=c(0.2,0.2),
legend = levels(groupsf),
pch = 20,
col = colors,
ncol=3, cex=0.4, pt.cex=1)
## 3d
plot3d(x=x1[,1], y=x1[,2], z=x1[,3],
xlab="PC1 scores", ylab="PC2 scores", zlab="PC3 scores",
col = colors[groupsf],
pch=20,
main = paste("PCA (prcomp),", dataname))
legend3d("bottomright", inset=c(0.2,0.2),
legend = levels(groupsf),
pch = 20,
col = colors,
ncol=3, cex=1)
#### PCA with function PCA() from package "FactoMineR", uses eig -----------------------
library(FactoMineR)
# takes in a dataframe X with n rows (individuals) and p columns (numeric variables)
data <- as.data.frame(data)
pca2 <- PCA(data) # n rows (individuals), p columns (numeric variables)
coord <- pca2$ind$coord
## plot 2d
plot(x=coord[,1], y=coord[,2],
xlab = "PC1", ylab = "PC2",
col=colors[groupsf],
pch = 20,
main=paste("2D PCA (FactoMineR),", dataname))
legend("topright", inset = c(0,0),
legend = levels(groupsf),
col = colors,
pch = 20,
ncol=3, cex=0.6) # add legend
## plot 3d
plot3d(x=coord[,1], y=coord[,2], z=coord[,3],
xlab="PC1", ylab="PC2", zlab="PC3",
col=colors[groupsf],
main = paste("3D PCA (FactoMineR), ", dataname))
legend3d("topright", inset=c(0.1,0.1),
legend = levels(groupsf),
col = colors,
pch = 20,
ncol=1, cex=1)