forked from jennybc/gapminder
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path06_smell-test-gap-merged.R
88 lines (74 loc) · 2.54 KB
/
06_smell-test-gap-merged.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#' ---
#' date: "`r format(Sys.Date())`"
#' output:
#' html_document:
#' keep_md: TRUE
#' ---
library(plyr)
suppressPackageStartupMessages(library(dplyr))
library(ggplot2)
library(readr)
gap_dat <- read_tsv("05_gap-merged-with-china-1952.tsv") %>%
mutate(country = factor(country),
continent = factor(continent))
gap_dat %>% str()
#' Do we have NAs?
gap_dat %>%
sapply(function(x) x %>% is.na() %>% sum())
#' year
gap_dat$year %>% summary()
#' Confirming we have 1950, 1951, ..., 2007
all.equal(gap_dat$year %>% unique() %>% sort(), 1950:2007)
#' How much data do we have for each year?
ggplot(gap_dat, aes(x = year)) + geom_histogram(binwidth = 1)
#' Most countries have data every five years, e.g. 1952, 1957, 1962, and so on.
#' country
gap_dat$country %>% str()
country_freq <- gap_dat %>%
count(country)
ggplot(country_freq, aes(x = country, y = n)) +
geom_bar(stat = "identity") # ugly but worth seeing
(p <- ggplot(country_freq, aes(x = n)) + geom_histogram(binwidth = 1))
p + xlim(c(1, 16))
country_freq$n %>% table()
#' Most countries have data for 12 years, i.e. the years highlighted above. Some
#' have data for 58 years, which I assume is the maximum. Otherwise, there's a
#' little bit of everything between 1 and 58.
#' continent
gap_dat$continent %>% levels()
gap_dat$continent %>% summary()
#' 301 rows have no continent data :( but we already knew that.
#' Is continent data uniform for all rows pertaining to one country?
tmp <- gap_dat %>%
group_by(country) %>%
summarize(n_continent = n_distinct(continent))
tmp$n_continent %>% table()
#' Yes, a minor miracle. All 187 countries have exactly 1 associated value of
#' continent.
#'
#' Fixing the continent data is a separate task to be tackled in the next
#' script.
#' population
gap_dat$pop %>% summary(digits = 10)
#' We have little countries
gap_dat[which.min(gap_dat$pop),]
#' like Aruba w/ 60K people
#'
#' and big countries
gap_dat[which.max(gap_dat$pop),]
#' like China w/ 1.3B people
ggplot(gap_dat,aes(x = pop)) + geom_density() + scale_x_log10()
#' life expectancy
gap_dat$lifeExp %>% summary()
#' This is comptible with normal human life span. Yay.
ggplot(gap_dat,aes(x = lifeExp)) + geom_density()
#' Note bimodality. Modes ~ 42 and 72.
gap_dat$gdpPercap %>% summary()
#' $113K???? really?
gap_dat[which.max(gap_dat$gdpPercap),]
#' OIL! Kuwait, 1957. Or maybe a data quality problem.
#' <https://github.com/jennybc/gapminder/issues/9>
ggplot(gap_dat,aes(x = gdpPercap)) + geom_density()
#' Looks plausible. Loooong right tail.
#'
#' Nothing looks completely insane. Done.