-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_processing.R
59 lines (52 loc) · 2.56 KB
/
data_processing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
library(tidyverse)
library(fuzzyjoin)
# taken from https://www.waterboards.ca.gov/drinking_water/certlic/drinkingwater/documents/leadsamplinginschools/monthly_posting_final.xlsx
test_results_df <- read_csv("data/raw/monthly_posting_final.csv") %>%
filter(
SchoolType == "public"
) %>%
select(
PwsID, DISTRICT, SchoolName, SchoolAddress, SchoolCounty, `Action_Level_Exceedance?`
) %>%
group_by(
PwsID, DISTRICT, SchoolName, SchoolAddress, SchoolCounty
) %>%
summarize(
exceeded_action_level = "Yes" %in% `Action_Level_Exceedance?`
)
# taken from https://www.cde.ca.gov/ds/ad/documents/frpm1718.xlsx
free_reduced_lunch_df <- read_csv("data/raw/free_reduced_lunch.csv") %>%
select(
`District Name`, `School Name`, `School Type`, `Enrollment \n(K-12)`, `Free Meal \nCount \n(K-12)`, `Percent (%) \nEligible Free \n(K-12)`, `FRPM Count \n(K-12)`, `Percent (%) \nEligible FRPM \n(K-12)`
) %>%
mutate(
fuzzyjoin_school = paste(`District Name`, `School Name`, sep = " ")
)
# taken from https://www.cde.ca.gov/SchoolDirectory/report?rid=dl1&tp=xlsx&ict=Y
public_schools_directory_df <- read_csv("data/raw/public_schools_directory.csv") %>%
select(District, School, Street, City, Zip) %>%
filter(School != "No Data", Street != "No Data") %>%
mutate(
fuzzyjoin_school = paste(District, School, sep = " ")
)
# taken from https://dq.cde.ca.gov/dataquest/dlfile/dlfile.aspx?cLevel=School&cYear=2017-18&cCat=Enrollment&cPage=filesenr.asp
public_schools_enrollment_df <- read_csv("data/raw/school_enrollment.csv") %>%
select(DISTRICT, SCHOOL, ETHNIC, GENDER, ENR_TOTAL) %>%
mutate(
fuzzyjoin_school = paste(DISTRICT, SCHOOL, sep = " ")
)
# taken from https://www.cde.ca.gov/SchoolDirectory/report?rid=dl1&tp=xlsx&ict=Y
public_schools_directory_df <- read_csv("data/raw/public_schools_directory.csv") %>%
select(District, School, Street, City, Zip) %>%
filter(School != "No Data", Street != "No Data") %>%
mutate(
fuzzyjoin_school = paste(District, School, sep = " ")
)
public_schools_directory_df %>%
inner_join(free_reduced_lunch_df, by = "fuzzyjoin_school") %>%
mutate(fuzzyjoin_test_results = paste(Street, City, "CA", Zip, sep = " ")) %>%
inner_join(test_results_df, by = c("fuzzyjoin_test_results" = "SchoolAddress"))
all_school_data_df <- public_schools_directory_df %>%
stringdist_inner_join(free_reduced_lunch_df, by = "fuzzyjoin_school") %>%
mutate(fuzzyjoin_test_results = paste(Street, City, "CA", Zip, sep = " ")) %>%
stringdist_inner_join(test_results_df, by = c("fuzzyjoin_test_results", "SchoolAddress", max_dist = 3))