Merge pull request #26 from ruralinnovation/dev/Update_release_J24
Add raw data for June 2024
defuneste authored Jan 24, 2025
2 parents fe69872 + 4fb7a72 commit f7578ed
Showing 10 changed files with 556 additions and 12 deletions.
DESCRIPTION (2 changes: 1 addition & 1 deletion)
@@ -1,6 +1,6 @@
Package: cori.data.fcc
Title: Process FCC data
-Version: 0.1.2
+Version: 0.2.1
Authors@R:
person(given="Olivier", family="Leroy", email="[email protected]", role = c("aut", "cre"))
Description: Functions to get and process FCC data.
NEWS.md (8 changes: 8 additions & 0 deletions)
@@ -1,3 +1,11 @@
# cori.data.fcc 0.2.1

## Major changes

* We updated the raw data with the June 2024 FCC release.

* The CORI opinionated version at the Census Block level now also uses that release.

# cori.data.fcc 0.1.2

## Minor improvements
R/get_county_nbm_raw.R (4 changes: 2 additions & 2 deletions)
@@ -12,7 +12,7 @@
#'
#' @param geoid_co a string matching a GEOID for a county
#' @param frn a string of 10 numbers matching FCC's FRN, default is "all"
-#' @param release a date, set by default to be '2023-12-01'
+#' @param release a date, set by default to be '2024-06-01'
#'
#' @return a data frame
#'
@@ -24,7 +24,7 @@
#' guilford_cty <- get_county_nbm_raw(geoid_co = "37081")
#'}

-get_county_nbm_raw <- function(geoid_co, frn = "all", release = "2023-12-01") {
+get_county_nbm_raw <- function(geoid_co, frn = "all", release = "2024-06-01") {

# do I need a look up for county?

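For context, a minimal usage sketch of the updated default (assuming the package is installed and the older release remains queryable; the county GEOID is taken from the roxygen example above):

library(cori.data.fcc)

# the June 2024 release is now the default
guilford_june24 <- get_county_nbm_raw(geoid_co = "37081")

# the previous default can still be requested explicitly,
# assuming that release stays available in the underlying data
guilford_dec23 <- get_county_nbm_raw(geoid_co = "37081", release = "2023-12-01")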
R/get_frn_nbm_bl.R (4 changes: 2 additions & 2 deletions)
@@ -36,12 +36,12 @@ get_frn_nbm_bl <- function(frn) {
DBI::dbExecute(con, "INSTALL httpfs;LOAD httpfs")
statement <- sprintf(
"select *
-from read_parquet('s3://cori.data.fcc/nbm_block/*/*.parquet')
+from read_parquet('s3://cori.data.fcc/nbm_block-J24/*/*.parquet')
where
combo_frn in (
select combo_frn
from
-read_parquet('s3://cori.data.fcc/rel_combo_frn.parquet')
+read_parquet('s3://cori.data.fcc/rel_combo_frn-J24.parquet')
where frn = '%s'
);", frn)

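For reference, the renamed '-J24' datasets can also be queried directly with DuckDB over httpfs, outside the package; a minimal sketch using the S3 paths shown in this diff (the FRN value is hypothetical, and the bucket is assumed to be publicly readable):

library(DBI)
library(duckdb)

con <- DBI::dbConnect(duckdb::duckdb())
DBI::dbExecute(con, "INSTALL httpfs;LOAD httpfs")

# count block-level records reported under a (hypothetical) FRN in the June 2024 data
DBI::dbGetQuery(con, "
  select count(*) as n_records
  from read_parquet('s3://cori.data.fcc/nbm_block-J24/*/*.parquet')
  where combo_frn in (
    select combo_frn
    from read_parquet('s3://cori.data.fcc/rel_combo_frn-J24.parquet')
    where frn = '0123456789'
  );")

DBI::dbDisconnect(con)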
R/get_nbm_bl.R (2 changes: 1 addition & 1 deletion)
@@ -31,7 +31,7 @@ get_nbm_bl <- function(geoid_co) {
DBI::dbExecute(con, "INSTALL httpfs;LOAD httpfs")
statement <- sprintf(
"select *
-from read_parquet('s3://cori.data.fcc/nbm_block/*/*.parquet')
+from read_parquet('s3://cori.data.fcc/nbm_block-J24/*/*.parquet')
where geoid_co = '%s';", geoid_co)

DBI::dbGetQuery(con, statement)
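A short usage sketch of the block-level helper against the new release (GEOID from the example above; the exact columns depend on the block-level schema, so only generic inspection is shown):

library(cori.data.fcc)

# block-level records for Guilford County, NC, now served from the June 2024 ('-J24') data
guilford_bl <- get_nbm_bl(geoid_co = "37081")

nrow(guilford_bl)     # number of rows returned
names(guilford_bl)    # inspect the available columns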
data-raw/NBM.R (71 changes: 70 additions & 1 deletion)
@@ -116,4 +116,73 @@ DBI::dbExecute(con, copy_stat)

DBI::dbDisconnect(con)

system("aws s3 nbm_raw s3://cori.data.fcc/nbm_raw")
system("aws s3 sync nbm_raw s3://cori.data.fcc/nbm_raw")

## Update January 2025: adding the June 2024 release
# assuming list of csv in data_swamp

library(duckdb)

con <- DBI::dbConnect(duckdb::duckdb(), tempfile())

# I needed to rename the files because the FCC's "J24" naming could mean June, January, ...
dir <- "data_swamp/10dec2024/"

raw_csv <- list.files(dir, pattern = "*.csv", recursive = TRUE)
raw_csv <- paste0(dir, raw_csv)

# better_fcc_name() is defined above
better_name <- vapply(raw_csv, better_fcc_name, FUN.VALUE = character(1))

file.rename(raw_csv, better_name)


## This is probably overkill and likely not needed
DBI::dbExecute(con, "PRAGMA max_temp_directory_size='10GiB'")

copy_stat <- "
COPY
(SELECT
frn,
provider_id,
brand_name,
location_id,
technology,
max_advertised_download_speed,
max_advertised_upload_speed,
low_latency,
business_residential_code,
state_usps,
block_geoid as geoid_bl,
substring(block_geoid, 1, 5) as geoid_co,
strptime(split_part(split_part(filename, '_', 8), '.', 1), '%d%b%Y')::DATE
as file_time_stamp,
strptime(split_part(filename, '_', 7), '%B%Y')::DATE as release
FROM
read_csv(
'data_swamp/10dec2024/*.csv',
types = {
'frn' : 'VARCHAR(10)',
'provider_id': 'TEXT',
'brand_name' : 'TEXT',
'location_id': 'TEXT',
'technology' : 'VARCHAR(2)',
'max_advertised_download_speed' : INTEGER,
'max_advertised_upload_speed' : INTEGER,
'low_latency' : 'BOOLEAN',
'business_residential_code': 'VARCHAR(1)',
'state_usps' : 'VARCHAR(2)',
'block_geoid': 'VARCHAR(15)'
},
ignore_errors = true,
delim=',', quote='\"',
new_line='\\n', skip=0,
header=true, filename=true))
TO 'nbm_raw' (FORMAT 'parquet', PARTITION_BY(release, state_usps, technology)
);"

DBI::dbExecute(con, copy_stat)

DBI::dbDisconnect(con)

system("aws s3 sync nbm_raw/release=2024-06-01 s3://cori.data.fcc/nbm_raw/release=2024-06-01")