# File: experiment.R
# Author: Johan de Joode
# Journal: Journal for North-West Semitic Languages
# POS = Part of Speech
# p.pos = Proportions of Parts of Speech
library(devtools)
# Install the plotting library if you do not already have it
if (!requireNamespace("ggbiplot", quietly = TRUE)) install_github("vqv/ggbiplot")
library(ggbiplot)
library(rgl)
library(car)
# Set the working directory to the location of this file.
# This is specifically put in place for working with RStudio;
# without RStudio, set the working directory manually with setwd()
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
# Read the ETCBC dataset
data <- readRDS(file = './data/bhsa2017_simple.rds')
# inspect the head of the table
head(data)
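# Quick structural overview; the columns 'book', 'pdp' and 'g_cons_utf8' are
# the ones used below
str(data)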
# The ETCBC data also tags non-consonantal articles
View(table(data[data$pdp == 'art','g_cons_utf8']))
#! Uncomment the following line to remove non-consonantal articles from the data
# data <- data[!((data$pdp == 'art') & (data$g_cons_utf8 != 'ה')),]
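# For reference, count how many article tokens the line above would drop
sum(data$pdp == 'art' & data$g_cons_utf8 != 'ה')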
# POS per book
# I am using the phrase-dependent (context-aware) POS rather than the simple
# dictionary form: participles that really function as nouns should then be
# counted as nouns, not as verbs
pos <- table(data$book, data$pdp)
addmargins(pos, 2)
# Change the names
# The order matters here
POS <- c(adjv = "Adj.",
         advb = "Adv.",
         art = "Art.",
         conj = "Conj.",
         inrg = "Interrog. Part.",
         intj = "Interj.",
         nega = "Neg. Part.",
         nmpr = "Proper Noun",
         prde = "Demon. Pronoun",
         prep = "Prep.",
         prin = "Interrog. Pronoun",
         prps = "Pers. Pronoun",
         subs = "Noun",
         verb = "Verb")
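# Optional safety check: the relabelling below assigns by position, so the tag
# codes above must match the column order of the table exactly
stopifnot(identical(names(POS), colnames(pos)))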
colnames(pos) <- POS
BOOKS <- c("Amos" = "Amos",
           "Canticum" = "Song",
           "Chronica_I" = "1 Chr",
           "Chronica_II" = "2 Chr",
           "Daniel" = "Dan",
           "Deuteronomium" = "Deut",
           "Ecclesiastes" = "Eccl",
           "Esra" = "Ezra",
           "Esther" = "Esth",
           "Exodus" = "Exod",
           "Ezechiel" = "Ezek",
           "Genesis" = "Gen",
           "Habakuk" = "Hab",
           "Haggai" = "Hag",
           "Hosea" = "Hos",
           "Iob" = "Job",
           "Jeremia" = "Jer",
           "Jesaia" = "Isa",
           "Joel" = "Joel",
           "Jona" = "Jonah",
           "Josua" = "Josh",
           "Judices" = "Judg",
           "Leviticus" = "Lev",
           "Maleachi" = "Mal",
           "Micha" = "Mic",
           "Nahum" = "Nah",
           "Nehemia" = "Neh",
           "Numeri" = "Num",
           "Obadia" = "Obad",
           "Proverbia" = "Prov",
           "Psalmi" = "Ps",
           "Reges_I" = "1 Kgs",
           "Reges_II" = "2 Kgs",
           "Ruth" = "Ruth",
           "Sacharia" = "Zech",
           "Samuel_I" = "1 Sam",
           "Samuel_II" = "2 Sam",
           "Threni" = "Lam",
           "Zephania" = "Zeph")
rownames(pos) <- BOOKS
rownames(pos)
# work with proportions
p.pos <- prop.table(pos, 1)
# inspect the result
round(addmargins(p.pos, 2), 4)
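# Verify that the proportions of each book sum to 1
stopifnot(all(abs(rowSums(p.pos) - 1) < 1e-9))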
# save the data for inspection
write.csv(round(p.pos * 10000), './results/p_pos_per_ten_thousand.csv')
write.csv(pos, './results/pos.csv')
# Visually inspect whether the distributions are normal
par(mfrow=c(4,4)) # Set the number of graphs
for (feature in colnames(p.pos)) {
  hist(p.pos[, feature], main = NULL, ylab = NULL, xlab = NULL)
  title(feature)
}
dev.copy2pdf(file = "./results/distributions_original.pdf")
par(mfrow=c(1,1)) # Reset the number of graphs
# Transform the data into the form that the PCA requires
transform2 <- function(x) {
  # --------
  # Function by Dirk Speelman
  # Box-Cox power transformation that makes x as normal as possible
  #
  # remark:
  # we initially wanted to work with
  # transform <- function(x, pow) { return(abs(x)^pow * sign(x)) }
  # with pow equal to 0.5 for a.o. LLR measures,
  # but then decided to systematically use Box-Cox transformations
  # --------
  # -- make values strictly positive (needed for powerTransform(x)) --
  if (min(x) <= 0) {
    x <- x + 0.0000001 - min(x)
  }
  # -- Box-Cox transformation --
  lambda <- powerTransform(x)$lambda
  return(bcPower(x, lambda))
}
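# For reference: bcPower() computes (x^lambda - 1) / lambda, or log(x) when
# lambda equals 0. A quick illustration on a single column (the column name
# "Verb" comes from the relabelling above):
head(transform2(p.pos[, "Verb"]))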
for (f in colnames(p.pos)) {
  p.pos[, f] <- transform2(p.pos[, f])
}
results <- numeric(length = ncol(p.pos))
for (idx in 1:ncol(p.pos)) {
  # Test each transformed POS for normality
  results[idx] <- shapiro.test(p.pos[, idx])$p.value
}
results
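# Label the p-values by POS and flag features that still deviate from
# normality (alpha = 0.05 is a conventional threshold, not one from the paper)
names(results) <- colnames(p.pos)
results[results < 0.05]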
# Reinspect the distributions visually
par(mfrow=c(4,4))
for (feature in colnames(p.pos)) {
  hist(p.pos[, feature], main = NULL, ylab = NULL, xlab = NULL)
  title(feature)
}
dev.copy2pdf(file = "./results/distributions_after_boxcox.pdf")
par(mfrow=c(1,1))
# Compute PCA
pca <- prcomp(p.pos, center = TRUE, scale. = TRUE)
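# Inspect the proportion of variance explained by each component
summary(pca)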
# Create a biplot
par(mfrow=c(1,1))
ggbiplot(pca, labels = row.names(p.pos))
ggsave(file = "./results/PCA_POS.pdf")
# Create a 3D visualisation; in some cases this needs to be screen-captured
# https://cran.r-project.org/web/packages/pca3d/pca3d.pdf
library(pca3d)
pca3d(pca, show.labels = TRUE)
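# If interactive capture fails, the current rgl view can also be saved
# directly (a fallback suggestion; requires an open rgl device):
# rgl.snapshot("./results/PCA_POS_3D.png")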
makeMoviePCA(dir="./results", clean=TRUE, type = "gif", movie = "POS", convert=TRUE)
# Plot the contributions of each POS to the first three dimensions
# This needs to be saved manually, because the plotting window must be resized first
round(pca$rotation[,1:3], 3)
par(mfrow=c(1,3), mar = c(10,4,4,2) + 0.1)
for (dimen in 1:3) {
  barplot(pca$rotation[, dimen], las = 2, horiz = FALSE)
  title(paste0('PC', dimen))
}
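# Alternatively, write straight to a wider PDF instead of resizing the window
# (the dimensions in inches are only a suggestion):
# dev.copy2pdf(file = "./results/rotation_PC1-3.pdf", width = 12, height = 6)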
# Plot the correlations between POS to confirm our findings
library(corrplot)
par(mfrow=c(1,1))
corrplot(cor(p.pos), order = "hclust", cl.pos = "b", col = c("white", "black"), bg = "gold2")
dev.copy2pdf(file = "./results/corrplot.pdf")