-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcleanPrecipData.R
87 lines (73 loc) · 3.32 KB
/
cleanPrecipData.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
library(tidyverse)
# 1. Load the inventory file as raw text, one vector element per line.
data <- read_lines("ghcn-m_v4_prcp_inventory.txt")

# 2. Slice every line into named columns by character position, then collapse
#    leading / trailing / repeated whitespace in the coordinate, elevation,
#    WMO id and year columns with str_squish(). All columns stay character.
data_tibble <- tibble(
  GHCN_ID = str_sub(data, start = 1, end = 11),
  Latitude = str_sub(data, start = 13, end = 20),
  Longitude = str_sub(data, start = 22, end = 30),
  Elevation = str_sub(data, start = 32, end = 37),
  State = str_sub(data, start = 39, end = 40),
  Station = str_sub(data, start = 42, end = 79),
  WMO_ID = str_sub(data, start = 81, end = 85),
  First_Year = str_sub(data, start = 87, end = 90),
  Last_Year = str_sub(data, start = 91, end = 95)
) %>%
  mutate(
    across(
      c(Latitude, Longitude, Elevation, WMO_ID, First_Year, Last_Year),
      str_squish
    )
  )
# 3. From the inventory, keep only the stations whose last year of data is
#    2023. The cleaned inventory lives in `data_tibble` (there is no
#    `data_tibble_cleaned` object). Last_Year is a character column, so it is
#    compared against the string "2023" instead of relying on implicit
#    number-to-string coercion.
data_tibble_selected <- data_tibble %>%
  filter(Last_Year == "2023")

# 4. Store the station ids of those rows as a character vector; they double as
#    the file stems of the per-station csv files.
ids_array <- data_tibble_selected %>%
  pull(GHCN_ID)
# 5. Copy every station csv whose file stem is one of the selected ids into
#    the "2023" sub-folder.
stations_dir <- "./stations_data"
stations_dir_2023 <- file.path(stations_dir, "2023")

# file.copy() does not create missing directories, so make sure the target
# folder exists before copying (no-op when it is already there).
dir.create(stations_dir_2023, showWarnings = FALSE, recursive = TRUE)

# `pattern` is a regular expression: escape the dot and anchor the end so that
# only names really ending in ".csv" are listed.
csv_files <- list.files(path = stations_dir, pattern = "\\.csv$")

for (file_name in csv_files) {
  # Strip only the trailing extension to get the station id.
  file_stem <- sub("\\.csv$", "", file_name)
  if (file_stem %in% ids_array) {
    cat("Matched file:", file_name, "\n")
    file_path <- file.path(stations_dir, file_name)
    new_file_path <- file.path(stations_dir_2023, file_name)
    # The copies are edited in place by the next step (a header line is
    # prepended), so overwrite them with the pristine source on every run
    # rather than silently keeping an already-modified copy.
    copied <- file.copy(file_path, new_file_path, overwrite = TRUE)
    if (!copied) {
      warning(
        "Could not copy ", file_path, " to ", new_file_path,
        call. = FALSE
      )
    }
  }
}
# 6. Prepend a header line to every csv file in the 2023 folder.
# `pattern` is a regular expression: escape the dot and anchor the end so that
# only names really ending in ".csv" are listed.
csv_files2023 <- list.files(
  path = "./stations_data/2023",
  pattern = "\\.csv$",
  full.names = TRUE
)
header_row <- c(
  "GHCN identifier", "Station_name", "Latitude", "Longitude", "Elevation",
  "yearmonth", "Precipitation_value", "Measurement_flag",
  "Quality_control_flag", "Source_flag", "Source_index"
)
# Built once: the header text is the same for every file.
header_line <- paste(header_row, collapse = ",")

for (file_name in csv_files2023) {
  lines <- readLines(file_name)
  # Skip files that already start with the header so that running the script
  # again does not stack a second header line on top of the first.
  if (length(lines) > 0 && identical(lines[[1]], header_line)) {
    next
  }
  writeLines(c(header_line, lines), file_name, sep = "\n")
}
# 7. For each file in the 2023 folder, keep only the row(s) for 2023-05 and
#    collect them in a single csv dataset, 2023MayData.csv.
# output file name
output_file <- "./final/2023MayData.csv"

# write_csv() does not create missing directories, so make sure the output
# folder exists (no-op when it is already there).
dir.create(dirname(output_file), showWarnings = FALSE, recursive = TRUE)

# Start from a clean file: rows are appended below, so a leftover output from
# a previous run would otherwise end up with every row duplicated.
unlink(output_file)

for (csv_file in csv_files2023) {
  # Read the CSV file; its first line supplies the column names.
  csv_data <- read_csv(csv_file)

  # Filter for rows where yearmonth is "202305".
  filtered_data <- csv_data %>%
    filter(yearmonth == "202305")

  # Only files with at least one matching row contribute to the output.
  if (nrow(filtered_data) > 0) {
    # The first write creates the file and, because `col_names` defaults to
    # `!append`, also writes the header from the data's own column names.
    # Every later write appends the rows without a header.
    write_csv(
      filtered_data,
      output_file,
      append = file.exists(output_file)
    )
  }
}