Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add raw data for june 2024 #26

Merged
merged 4 commits into from
Jan 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: cori.data.fcc
Title: Process FCC data
Version: 0.1.2
Version: 0.2.1
Authors@R:
person(given="Olivier", family="Leroy", email="[email protected]", role = c("aut", "cre"))
Description: Functions to get and process FCC data.
Expand Down
8 changes: 8 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
# cori.data.fcc 0.2.1

## Major changes

* We updated raw data with the June 2024 FCC release

* The CORI opinionated version at the Census Block level now also uses that release.

# cori.data.fcc 0.1.2

## Minor improvements
Expand Down
4 changes: 2 additions & 2 deletions R/get_county_nbm_raw.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#'
#' @param geoid_co a string matching a GEOID for a county
#' @param frn a string of 10 numbers matching FCC's FRN, default is "all"
#' @param release a date, set by default to be '2023-12-01'
#' @param release a date, set by default to be '2024-06-01'
#'
#' @return a data frame
#'
Expand All @@ -24,7 +24,7 @@
#' guilford_cty <- get_county_nbm_raw(geoid_co = "37081")
#'}

get_county_nbm_raw <- function(geoid_co, frn = "all", release = "2023-12-01") {
get_county_nbm_raw <- function(geoid_co, frn = "all", release = "2024-06-01") {

# do I need a look up for county?

Expand Down
4 changes: 2 additions & 2 deletions R/get_frn_nbm_bl.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,12 @@ get_frn_nbm_bl <- function(frn) {
DBI::dbExecute(con, "INSTALL httpfs;LOAD httpfs")
statement <- sprintf(
"select *
from read_parquet('s3://cori.data.fcc/nbm_block/*/*.parquet')
from read_parquet('s3://cori.data.fcc/nbm_block-J24/*/*.parquet')
where
combo_frn in (
select combo_frn
from
read_parquet('s3://cori.data.fcc/rel_combo_frn.parquet')
read_parquet('s3://cori.data.fcc/rel_combo_frn-J24.parquet')
where frn = '%s'
);", frn)

Expand Down
2 changes: 1 addition & 1 deletion R/get_nbm_bl.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ get_nbm_bl <- function(geoid_co) {
DBI::dbExecute(con, "INSTALL httpfs;LOAD httpfs")
statement <- sprintf(
"select *
from read_parquet('s3://cori.data.fcc/nbm_block/*/*.parquet')
from read_parquet('s3://cori.data.fcc/nbm_block-J24/*/*.parquet')
where geoid_co = '%s';", geoid_co)

DBI::dbGetQuery(con, statement)
Expand Down
71 changes: 70 additions & 1 deletion data-raw/NBM.R
Original file line number Diff line number Diff line change
Expand Up @@ -116,4 +116,73 @@ DBI::dbExecute(con, copy_stat)

DBI::dbDisconnect(con)

system("aws s3 nbm_raw s3://cori.data.fcc/nbm_raw")
system("aws s3 sync nbm_raw s3://cori.data.fcc/nbm_raw")

## Update January 2025: add the June 2024 FCC release to nbm_raw.
## Assumes the raw FCC csv files are already present under data_swamp/.

library(duckdb)

# Scratch DuckDB database backed by a temporary file.
con <- DBI::dbConnect(duckdb::duckdb(), tempfile())

# FCC file names carry an ambiguous release tag ("J24" can be June,
# January, ...), so the files are renamed before loading: the query below
# parses the release date out of the file name with '%B%Y'.
dir <- "data_swamp/10dec2024/"

# `pattern` is a regular expression, not a shell glob: anchor on the
# extension so that only files ending in ".csv" are selected.
raw_csv <- list.files(dir, pattern = "\\.csv$", recursive = TRUE)
raw_csv <- paste0(dir, raw_csv)

# better_fcc_name() is defined earlier in this script.
better_name <- vapply(raw_csv, better_fcc_name, FUN.VALUE = character(1))

# file.rename() reports failures only through its return value; stop here
# rather than let the date parsing in the query trip over an old name.
renamed <- file.rename(raw_csv, better_name)
if (!all(renamed)) {
  stop(
    "Could not rename: ", paste(raw_csv[!renamed], collapse = ", "),
    call. = FALSE
  )
}

# Cap the size of DuckDB's temporary (spill) directory. This limit is
# generous and is probably not needed.
DBI::dbExecute(con, "PRAGMA max_temp_directory_size='10GiB'")

# Read every csv, derive the block/county GEOIDs and the two dates encoded
# in the file name (underscore-separated parts 7 and 8 of the path), then
# write parquet files partitioned by release, state and technology.
copy_stat <- "
COPY
(SELECT
frn,
provider_id,
brand_name,
location_id,
technology,
max_advertised_download_speed,
max_advertised_upload_speed,
low_latency,
business_residential_code,
state_usps,
block_geoid as geoid_bl,
substring(block_geoid, 1, 5) as geoid_co,
strptime(split_part(split_part(filename, '_', 8), '.', 1), '%d%b%Y')::DATE
as file_time_stamp,
strptime(split_part(filename, '_', 7), '%B%Y')::DATE as release
FROM
read_csv(
'data_swamp/10dec2024/*.csv',
types = {
'frn' : 'VARCHAR(10)',
'provider_id': 'TEXT',
'brand_name' : 'TEXT',
'location_id': 'TEXT',
'technology' : 'VARCHAR(2)',
'max_advertised_download_speed' : 'INTEGER',
'max_advertised_upload_speed' : 'INTEGER',
'low_latency' : 'BOOLEAN',
'business_residential_code': 'VARCHAR(1)',
'state_usps' : 'VARCHAR(2)',
'block_geoid': 'VARCHAR(15)'
},
ignore_errors = true,
delim=',', quote='\"',
new_line='\\n', skip=0,
header=true, filename=true))
TO 'nbm_raw' (FORMAT 'parquet', PARTITION_BY(release, state_usps, technology)
);"

DBI::dbExecute(con, copy_stat)

DBI::dbDisconnect(con)

# Upload only the new release partition; a non-zero exit status from the
# AWS CLI means the upload did not complete.
sync_status <- system(
  "aws s3 sync nbm_raw/release=2024-06-01 s3://cori.data.fcc/nbm_raw/release=2024-06-01"
)
if (sync_status != 0) {
  stop("aws s3 sync failed with exit status ", sync_status, call. = FALSE)
}
Loading
Loading