-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvisualizing_scripts.R
82 lines (76 loc) · 3.14 KB
/
visualizing_scripts.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
library(tidyverse)
library(lubridate)
library(tidytext)
library(ggwordcloud)
df <- read_csv("data.csv")
Sys.setlocale("LC_TIME", "en_US.UTF-8")
df |>
mutate(date = lubridate::make_date(year = 2020, month = month, day = day)) |>
ggplot(aes(date))+
geom_dotplot(dotsize = 0.3)+
facet_wrap(~year, ncol = 1)+
labs(x = "", y = "")+
theme_minimal() +
theme(axis.text.y = element_blank(),
text = element_text(size = 18))+
scale_x_date(date_labels = "%B")
Sys.setlocale("LC_TIME", "ru_RU.UTF-8")
df |>
mutate(author = str_remove_all(author, "\\(.*?\\)"),
author = str_split(author, ", ")) |>
unnest_longer(author) |>
mutate(author = str_squish(author)) |>
count(author, sort = TRUE) |>
filter(n > 1) |>
mutate(author = fct_reorder(author, n)) |>
ggplot(aes(n, author))+
geom_col()+
labs(x = "", y = "")+
theme_minimal()
map(stopwords::stopwords_getsources()[-c(3, 6, 8)], function(i){
print(i)
stopwords::stopwords(language = "en", source = i)}) |>
unlist() |>
unique() |>
sort() ->
stopwords_en
df |>
mutate(abstract = str_replace_all(abstract, "Nakh-Daghestanian", "Nakh_Daghestanian"),
abstract = str_replace_all(abstract, "cross-linguistic", "cross_linguistic"),
abstract = str_replace_all(abstract, "East Caucasian", "East_Caucasian"),
abstract = str_replace_all(abstract, "West Caucasian", "West_Caucasian"),
abstract = str_replace_all(abstract, "East-Caucasian", "East_Caucasian"),
abstract = str_replace_all(abstract, "West-Caucasian", "West_Caucasian"),
abstract = str_replace_all(abstract, "Dagestan", "Daghestan"),
abstract = str_replace_all(abstract, "language", "languages"),
abstract = str_replace_all(abstract, "languagess", "languages")) |>
unnest_tokens(input = "abstract", output = "word") |>
count(word, sort = TRUE) |>
anti_join(tibble(word = stopwords_en)) |>
filter(str_detect(word, "[A-z]"),
!(word %in% c("e.g", "eds", "https", "university", "i.e")),
n > 15) |>
mutate(word = case_when(word == "east_caucasian" ~ "East Caucasian",
word == "caucasus" ~ "Caucasus",
word == "daghestan" ~ "Daghestan",
word == "daghestanian" ~ "Daghestanian",
word == "nakh_daghestanian" ~ "Nakh-Daghestanian",
word == "cross_linguistic" ~ "cross-linguistic",
word == "russian" ~ "Russian",
word == "rutul" ~ "Rutul",
word == "andic" ~ "Andic",
word == "avar" ~ "Avar",
word == "lezgic" ~ "Lezgic",
word == "andi" ~ "Andi",
TRUE ~ word),
n = log(n)) |>
ggplot(aes(label = word, size = n))+
geom_text_wordcloud(rm_outside = TRUE, grid_margin = 2, seed = 42,
shape = "square",
max_grid_size = 138) +
theme_minimal()
# number of seminars
df |>
mutate(date = lubridate::make_date(year = year, month = month, day = day)) |>
distinct(date) |>
nrow()