create constants.py and update utils

cchuong · cchuong · commit 6a002e04b62c · 2024-09-16T11:39:30.000-07:00
diff --git a/src/acquisition/rvdss/constants.py b/src/acquisition/rvdss/constants.py
@@ -0,0 +1,80 @@
+VIRUSES = {  # lowercase virus name or alias -> abbreviation; abbreviate_virus (utils.py) substitutes these
+    "parainfluenza": "hpiv",
+    "piv": "hpiv",
+    "para": "hpiv",
+    "adenovirus": "adv",
+    "adeno": "adv",
+    "human metapneumovirus": "hmpv",
+    "enterovirus/rhinovirus": "ev_rv",
+    "rhinovirus": "ev_rv",
+    "rhv": "ev_rv",
+    "entero/rhino": "ev_rv",
+    "rhino":"ev_rv",
+    "ev/rv":"ev_rv",
+    "evrv":"ev_rv",
+    "coronavirus":"hcov",
+    "coron":"hcov",
+    "coro":"hcov",
+    "respiratory syncytial virus":"rsv",
+    "influenza":"flu",  # abbreviate_virus matches keys on word boundaries, so this does not fire inside "parainfluenza"
+    "sarscov2":"sars-cov-2"
+}
+
+GEOS = {  # lowercase geography name or alias -> short code; abbreviate_geo (utils.py) substitutes these
+    "newfoundland": "nl",
+    "prince edward island":"pe",
+    "nova scotia":"ns",
+    "new brunswick":"nb",
+    "québec":"qc",
+    "province of québec":"qc",
+    "quebec":"qc",
+    "ontario":"on",
+    "province of ontario":"on",
+    "manitoba" : "mb",
+    "saskatchewan":"sk",
+    "alberta": "ab",
+    "british columbia" :"bc",
+    "yukon" : "yk",  # NOTE(review): the Canada Post code for Yukon is YT - confirm "yk" is intended
+    "northwest territories" : "nt",
+    "nunavut" : "nu",
+    "canada":"ca",
+    "can":"ca" ,
+    "at":"atlantic",  # unlike the entries above, the three region aliases expand to the full region name
+    "pr" :"prairies" ,
+    "terr" :"territories"
+ }
+
+REGIONS = ['atlantic','atl','province of québec','québec','qc','province of ontario','ontario','on',  # NOTE(review): 'atl' is listed here but the GEOS alias key is "at" - confirm
+            'prairies', 'pr', "british columbia", 'bc',"territories",'terr']  # geo values that create_geo_types (utils.py) labels "region"
+NATION = ["canada","can",'ca']  # geo values that create_geo_types (utils.py) labels "nation"
+
+BASHBOARD_BASE_URLS_2023=["https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-06-20/",  # NOTE(review): "BASHBOARD" looks like a typo for "DASHBOARD" - rename only together with every importer
+"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-06-27/",
+"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-07-04/",
+"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-07-11/",
+"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-07-18/",
+"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-01/",  # NOTE(review): no 2024-07-25 entry precedes this one - confirm the gap is intentional
+"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-08/",
+"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-15/",
+"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-22/",
+"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-29/",
+"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-09-05/"]  # every entry is a dated archive folder URL ending in "/"
+
+HISTORIC_SEASON_URL = ["https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2013-2014.html",  # one page URL per season, oldest first
+"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2014-2015.html",
+"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2015-2016.html",
+"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2016-2017.html",
+"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2017-2018.html",
+"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2018-2019.html",
+"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2019-2020.html",
+"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2020-2021.html",
+"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2021-2022.html",
+"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2022-2023.html",
+"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2023-2024.html"]  # list covers seasons 2013-2014 through 2023-2024
+
+ALTENRATIVE_SEASON_BASE_URL = "www.phac-aspc.gc.ca/bid-bmi/dsd-dsm/rvdi-divr/"  # NOTE(review): "ALTENRATIVE" looks like a typo for "ALTERNATIVE"; value has no "https://" scheme, unlike SEASON_BASE_URL - confirm callers expect that
+
+SEASON_BASE_URL = "https://www.canada.ca"  # scheme and host only, no trailing slash; presumably prefixed to relative report links - verify against callers
+
+LAST_WEEK_OF_YEAR = 35  # get_weekly_data (utils.py) assigns weeks below this to start_year+1; NOTE(review): the literal it replaces there was 34 - confirm the change is intended
+
diff --git a/src/acquisition/rvdss/rvdss_historic.py b/src/acquisition/rvdss/rvdss_historic.py
@@ -64,8 +64,6 @@ def get_report_date(week,start_year,epi=False):
         report_date =  str(epi_week)
         
     return(report_date)
-
-
        
 
 def get_table_captions(soup):
@@ -257,6 +255,17 @@ def create_percent_positive_detection_table(table,modified_date,start_year, flu=
     geo_types = [create_geo_types(g,"lab") for g in table['geo_value']]
     table.insert(3,"geo_type",geo_types)
     
+    # Calculate number of positive tests based on pct_positive and total tests
+    if flu:
+        table["flu_a_positive_tests"] = (table["flu_a_pct_positive"]/100)*table["flu_tests"]
+        table["flu_b_positive_tests"] = (table["flu_b_pct_positive"]/100)*table["flu_tests"]
+        
+        table["flu_positive_tests"] =  table["flu_a_positive_tests"] +  table["flu_b_positive_tests"]
+        table["flu_pct_positive"] =   (table["flu_positive_tests"]/table["flu_tests"])*100
+    else:
+        table[virus+"_positive_tests"] = (table[virus+"_pct_positive"]/100) *table[virus+"_tests"]
+    
+    
     table = table.set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
 
     return(table)
@@ -309,12 +318,15 @@ def get_season_reports(url):
                 if "Positive Adenovirus" in caption.text:
                     tab.select_one('td').decompose()
             
-            # Replace commas with periods
-            tab2 = re.sub(",",r".",str(tab))
-            
+            if not "number" in caption.text.lower():
+                # Replace commas with periods
+                tab = re.sub(",",r".",str(tab))
+            else:
+                tab = re.sub(",",r"",str(tab))
+                
             # Read table
             na_values = ['N.A.','N.A', 'N.C.','N.R.','Not Available','Not Tested',"N.D.","-"]
-            table =  pd.read_html(tab2,na_values=na_values)[0].dropna(how="all")
+            table =  pd.read_html(tab,na_values=na_values)[0].dropna(how="all")
             
             # Check for multiline headers
             if isinstance(table.columns, pd.MultiIndex):
@@ -405,12 +417,12 @@ def get_season_reports(url):
                 all_number_tables=pd.concat([all_number_tables,number_detections_table])
 
     # write files to csvs
-    all_respiratory_detection_table.to_csv(path+"/"+path+"_respiratory_detections.csv", index=True) 
-    all_positive_tables.to_csv(path+"/"+path+"_positive_tests.csv", index=True)
+    all_respiratory_detection_table.to_csv(path+"/respiratory_detections.csv", index=True) 
+    all_positive_tables.to_csv(path+"/positive_tests.csv", index=True)
     
     # Write the number of detections table to csv if it exists (i.e has rows)
     if len(all_number_tables) != 0:
-        all_number_tables.to_csv(path+"/"+path+"_number_of_detections.csv", index=True) 
+        all_number_tables.to_csv(path+"/number_of_detections.csv", index=True) 
 
  #%% Scrape each season
 
diff --git a/src/acquisition/rvdss/utils.py b/src/acquisition/rvdss/utils.py
@@ -6,83 +6,27 @@
 from datetime import datetime
 import math
 
+from constants import VIRUSES, GEOS, REGIONS, NATION, LAST_WEEK_OF_YEAR 
+
 def abbreviate_virus(full_name):
     lowercase=full_name.lower()
-    
-    if any(name in lowercase for name in ["parainfluenza","para","piv"]):
-        if "hpiv" not in lowercase:
-            abbrev = re.sub("parainfluenza|para|piv","hpiv",lowercase)
-        else:
-            abbrev = lowercase
-    elif any(name in lowercase for name in ["adenovirus","adeno"]):
-        abbrev =  re.sub("adenovirus|adeno","adv",lowercase)
-    elif "human metapneumovirus" in lowercase:
-        abbrev =  re.sub("human metapneumovirus","hmpv",lowercase)
-    elif any(name in lowercase for name in ["enterovirus/rhinovirus","rhinovirus","rhv","entero/rhino","rhino","ev/rv","evrv"]):
-        abbrev = re.sub("enterovirus/rhinovirus|rhinovirus|rhv|entero/rhino|rhino|ev/rv|evrv","ev_rv",lowercase)
-    elif any(name in lowercase for name in ["coronavirus","coron","coro"]):
-        abbrev = re.sub("coronavirus|coron|coro","hcov",lowercase)
-    elif "respiratory syncytial virus" in lowercase:
-        abbrev = re.sub("respiratory syncytial virus","rsv",lowercase)
-    elif "influenza" in lowercase:
-        abbrev = re.sub("influenza","flu",lowercase)       
-    elif "sarscov2" in lowercase:
-        abbrev = re.sub("sarscov2","sars-cov-2",lowercase) 
-    else:
-        abbrev=lowercase
-    return(abbrev)
+    keys = (re.escape(k) for k in VIRUSES.keys())  # escape each key so any regex metacharacters in it match literally
+    pattern = re.compile(r'\b(' + '|'.join(keys) + r')\b')  # one alternation over every VIRUSES key, anchored on word boundaries
+    result = pattern.sub(lambda x: VIRUSES[x.group()], lowercase)  # swap each matched name for its abbreviation; unmatched text is kept as-is
+    return(result)
 
 def abbreviate_geo(full_name):
     lowercase=full_name.lower()
-    
-    if "newfoundland" in lowercase:
-        abbrev =  "nl"
-    elif "prince edward island" in lowercase:
-        abbrev =  "pe"
-    elif "nova scotia" in lowercase:
-        abbrev =  "ns"
-    elif "new brunswick" in lowercase:
-        abbrev =  "nb"
-    elif "nova scotia" in lowercase:
-        abbrev =  "ns"     
-    elif re.match('|'.join(("^québec$", "province of québec","quebec")),lowercase):
-        abbrev = "qc"  
-    elif re.match('|'.join(("^ontario$", "province of ontario")),lowercase):
-        abbrev =  "on"
-    elif "manitoba" in lowercase:
-        abbrev =  "mb"
-    elif "saskatchewan" in lowercase:
-        abbrev =  "sk"
-    elif "alberta" in lowercase:
-        abbrev =  "ab"
-    elif "british columbia" in lowercase:
-        abbrev =  "bc"
-    elif "yukon" in lowercase:
-        abbrev =  "yk"
-    elif "northwest territories" in lowercase:
-        abbrev =  "nt"
-    elif "nunavut" in lowercase:
-        abbrev =  "nu"
-    elif re.match("canada|can",lowercase):
-        abbrev = "ca" 
-    elif re.match(r"^at\b",lowercase):
-        abbrev = "atlantic" 
-    elif "pr" in lowercase:
-        abbrev = "prairies" 
-    elif "terr" in lowercase:
-        abbrev = "territories" 
-    else:
-        abbrev=lowercase
-    return(abbrev)
+    keys = (re.escape(k) for k in GEOS.keys())  # escape each key so any regex metacharacters in it match literally
+    pattern = re.compile(r'\b(' + '|'.join(keys) + r')\b')  # one alternation over every GEOS key, anchored on word boundaries
+
+    result = pattern.sub(lambda x: GEOS[x.group()], lowercase)  # NOTE(review): replaces only the matched words and keeps any surrounding text; the removed if/elif chain returned the bare code - confirm inputs never carry extra words
+    return(result)
 
 def create_geo_types(geo,default_geo):
-    regions = ['atlantic','atl','province of québec','québec','qc','province of ontario','ontario','on',
-               'prairies', 'pr', "british columbia", 'bc',"territories",'terr']
-    nation = ["canada","can",'ca']
-                    
-    if geo in nation:
+    if geo in NATION:
         geo_type="nation"
-    elif geo in regions:
+    elif geo in REGIONS:
         geo_type="region"
     else:
         geo_type = default_geo
@@ -163,7 +107,7 @@ def get_weekly_data(base_url,start_year):
     week_string = week_df.iloc[0]['Text'].lower()
     current_week = int(re.search("week (.+?) ", week_string).group(1))
 
-    if current_week < 34:
+    if current_week < LAST_WEEK_OF_YEAR:
         current_year = start_year+1
     else:
         current_year = start_year