Merge pull request #26 from ruralinnovation/dev/Update_release_J24
Add raw data for June 2024
defuneste authored Jan 24, 2025
2 parents fe69872 + 4fb7a72 commit f7578ed
Showing 10 changed files with 556 additions and 12 deletions.
DESCRIPTION (2 changes: 1 addition & 1 deletion)
@@ -1,6 +1,6 @@
Package: cori.data.fcc
Title: Process FCC data
-Version: 0.1.2
+Version: 0.2.1
Authors@R:
person(given="Olivier", family="Leroy", email="[email protected]", role = c("aut", "cre"))
Description: Functions to get and process FCC data.
NEWS.md (8 changes: 8 additions & 0 deletions)
@@ -1,3 +1,11 @@
# cori.data.fcc 0.2.1

## Major changes

* We updated the raw data with the June 2024 FCC release.

* The CORI opinionated version at the Census Block level now also uses that release.

# cori.data.fcc 0.1.2

## Minor improvements
R/get_county_nbm_raw.R (4 changes: 2 additions & 2 deletions)
@@ -12,7 +12,7 @@
#'
#' @param geoid_co a string matching a GEOID for a county
#' @param frn a string of 10 numbers matching FCC's FRN, default is "all"
-#' @param release a date, set by default to be '2023-12-01'
+#' @param release a date, set by default to be '2024-06-01'
#'
#' @return a data frame
#'
@@ -24,7 +24,7 @@
#' guilford_cty <- get_county_nbm_raw(geoid_co = "37081")
#'}

-get_county_nbm_raw <- function(geoid_co, frn = "all", release = "2023-12-01") {
+get_county_nbm_raw <- function(geoid_co, frn = "all", release = "2024-06-01") {

# do I need a look up for county?

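For context, a minimal usage sketch of the updated default (assuming the package is installed and the older release remains queryable; the county GEOID is taken from the roxygen example above):

library(cori.data.fcc)

# the June 2024 release is now the default
guilford_june24 <- get_county_nbm_raw(geoid_co = "37081")

# the previous default can still be requested explicitly,
# assuming that release stays available in the underlying data
guilford_dec23 <- get_county_nbm_raw(geoid_co = "37081", release = "2023-12-01")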
R/get_frn_nbm_bl.R (4 changes: 2 additions & 2 deletions)
@@ -36,12 +36,12 @@ get_frn_nbm_bl <- function(frn) {
DBI::dbExecute(con, "INSTALL httpfs;LOAD httpfs")
statement <- sprintf(
"select *
-from read_parquet('s3://cori.data.fcc/nbm_block/*/*.parquet')
+from read_parquet('s3://cori.data.fcc/nbm_block-J24/*/*.parquet')
where
combo_frn in (
select combo_frn
from
-read_parquet('s3://cori.data.fcc/rel_combo_frn.parquet')
+read_parquet('s3://cori.data.fcc/rel_combo_frn-J24.parquet')
where frn = '%s'
);", frn)

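For reference, the renamed '-J24' datasets can also be queried directly with DuckDB over httpfs, outside the package; a minimal sketch using the S3 paths shown in this diff (the FRN value is hypothetical, and the bucket is assumed to be publicly readable):

library(DBI)
library(duckdb)

con <- DBI::dbConnect(duckdb::duckdb())
DBI::dbExecute(con, "INSTALL httpfs;LOAD httpfs")

# count block-level records reported under a (hypothetical) FRN in the June 2024 data
DBI::dbGetQuery(con, "
  select count(*) as n_records
  from read_parquet('s3://cori.data.fcc/nbm_block-J24/*/*.parquet')
  where combo_frn in (
    select combo_frn
    from read_parquet('s3://cori.data.fcc/rel_combo_frn-J24.parquet')
    where frn = '0123456789'
  );")

DBI::dbDisconnect(con)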
R/get_nbm_bl.R (2 changes: 1 addition & 1 deletion)
@@ -31,7 +31,7 @@ get_nbm_bl <- function(geoid_co) {
DBI::dbExecute(con, "INSTALL httpfs;LOAD httpfs")
statement <- sprintf(
"select *
-from read_parquet('s3://cori.data.fcc/nbm_block/*/*.parquet')
+from read_parquet('s3://cori.data.fcc/nbm_block-J24/*/*.parquet')
where geoid_co = '%s';", geoid_co)

DBI::dbGetQuery(con, statement)
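A short usage sketch of the block-level helper against the new release (GEOID from the example above; the exact columns depend on the block-level schema, so only generic inspection is shown):

library(cori.data.fcc)

# block-level records for Guilford County, NC, now served from the June 2024 ('-J24') data
guilford_bl <- get_nbm_bl(geoid_co = "37081")

nrow(guilford_bl)     # number of rows returned
names(guilford_bl)    # inspect the available columns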
data-raw/NBM.R (71 changes: 70 additions & 1 deletion)
@@ -116,4 +116,73 @@ DBI::dbExecute(con, copy_stat)

DBI::dbDisconnect(con)

system("aws s3 nbm_raw s3://cori.data.fcc/nbm_raw")
system("aws s3 sync nbm_raw s3://cori.data.fcc/nbm_raw")

## Update January 2025: adding the June 2024 release
# assuming list of csv in data_swamp

library(duckdb)

con <- DBI::dbConnect(duckdb::duckdb(), tempfile())

# I needed to rename the files because the FCC's "J24" naming could mean June, January, ...
dir <- "data_swamp/10dec2024/"

raw_csv <- list.files(dir, pattern = "*.csv", recursive = TRUE)
raw_csv <- paste0(dir, raw_csv)

# better_fcc_name() is defined above
better_name <- vapply(raw_csv, better_fcc_name, FUN.VALUE = character(1))

file.rename(raw_csv, better_name)


## This is probably overkill and likely not needed
DBI::dbExecute(con, "PRAGMA max_temp_directory_size='10GiB'")

copy_stat <- "
COPY
(SELECT
frn,
provider_id,
brand_name,
location_id,
technology,
max_advertised_download_speed,
max_advertised_upload_speed,
low_latency,
business_residential_code,
state_usps,
block_geoid as geoid_bl,
substring(block_geoid, 1, 5) as geoid_co,
strptime(split_part(split_part(filename, '_', 8), '.', 1), '%d%b%Y')::DATE
as file_time_stamp,
strptime(split_part(filename, '_', 7), '%B%Y')::DATE as release
FROM
read_csv(
'data_swamp/10dec2024/*.csv',
types = {
'frn' : 'VARCHAR(10)',
'provider_id': 'TEXT',
'brand_name' : 'TEXT',
'location_id': 'TEXT',
'technology' : 'VARCHAR(2)',
'max_advertised_download_speed' : INTEGER,
'max_advertised_upload_speed' : INTEGER,
'low_latency' : 'BOOLEAN',
'business_residential_code': 'VARCHAR(1)',
'state_usps' : 'VARCHAR(2)',
'block_geoid': 'VARCHAR(15)'
},
ignore_errors = true,
delim=',', quote='\"',
new_line='\\n', skip=0,
header=true, filename=true))
TO 'nbm_raw' (FORMAT 'parquet', PARTITION_BY(release, state_usps, technology)
);"

DBI::dbExecute(con, copy_stat)

DBI::dbDisconnect(con)

system("aws s3 sync nbm_raw/release=2024-06-01 s3://cori.data.fcc/nbm_raw/release=2024-06-01")