Skip to content

Commit 6a002e0

Browse files
committed
create constants.py and update utils
1 parent 01af95f commit 6a002e0

File tree

3 files changed

+115
-79
lines changed

3 files changed

+115
-79
lines changed

src/acquisition/rvdss/constants.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# Maps every spelling of a virus name seen in the source reports to the
# abbreviation used in the output tables.  Keys are lower-case because
# abbreviate_virus() lower-cases its input before looking names up here.
VIRUSES = {
    # parainfluenza
    "parainfluenza": "hpiv",
    "piv": "hpiv",
    "para": "hpiv",
    # adenovirus
    "adenovirus": "adv",
    "adeno": "adv",
    # human metapneumovirus
    "human metapneumovirus": "hmpv",
    # enterovirus / rhinovirus
    "enterovirus/rhinovirus": "ev_rv",
    "rhinovirus": "ev_rv",
    "rhv": "ev_rv",
    "entero/rhino": "ev_rv",
    "rhino": "ev_rv",
    "ev/rv": "ev_rv",
    "evrv": "ev_rv",
    # coronavirus
    "coronavirus": "hcov",
    "coron": "hcov",
    "coro": "hcov",
    # respiratory syncytial virus
    "respiratory syncytial virus": "rsv",
    # influenza
    "influenza": "flu",
    # SARS-CoV-2
    "sarscov2": "sars-cov-2",
}
22+
23+
# Maps lower-case geography names found in the source reports to the codes
# used in the output tables.  Provinces, territories and the nation map to
# two-letter codes; the regional shorthands at the end expand to the full
# region name instead.
GEOS = {
    # provinces
    "newfoundland": "nl",
    "prince edward island": "pe",
    "nova scotia": "ns",
    "new brunswick": "nb",
    "québec": "qc",
    "province of québec": "qc",
    "quebec": "qc",
    "ontario": "on",
    "province of ontario": "on",
    "manitoba": "mb",
    "saskatchewan": "sk",
    "alberta": "ab",
    "british columbia": "bc",
    # territories
    # NOTE(review): the usual postal abbreviation for Yukon is "yt"; "yk" is
    # kept as-is because existing output may depend on it -- confirm.
    "yukon": "yk",
    "northwest territories": "nt",
    "nunavut": "nu",
    # nation
    "canada": "ca",
    "can": "ca",
    # regional shorthands -> full region names
    "at": "atlantic",
    "pr": "prairies",
    "terr": "territories",
}
46+
47+
# Geo values that create_geo_types() labels as "region".
REGIONS = [
    "atlantic", "atl",
    "province of québec", "québec", "qc",
    "province of ontario", "ontario", "on",
    "prairies", "pr",
    "british columbia", "bc",
    "territories", "terr",
]

# Geo values that create_geo_types() labels as "nation".
NATION = ["canada", "can", "ca"]
50+
51+
# Base URLs of archived snapshots of the respiratory-virus-detections
# dashboard, one per archive date.
# NOTE(review): the dates are one week apart except that 2024-07-25 is
# absent -- confirm that the gap is intentional.
DASHBOARD_BASE_URLS_2023 = [
    "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-06-20/",
    "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-06-27/",
    "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-07-04/",
    "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-07-11/",
    "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-07-18/",
    "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-01/",
    "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-08/",
    "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-15/",
    "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-22/",
    "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-29/",
    "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-09-05/",
]

# Backward-compatible alias: the constant was first published under this
# misspelled name, so existing imports of it must keep working.
BASHBOARD_BASE_URLS_2023 = DASHBOARD_BASE_URLS_2023
62+
63+
# Landing pages of the historic season reports, one per season from
# 2013-2014 through 2023-2024.  The pages differ only in the
# "<start year>-<end year>" part of the file name.
HISTORIC_SEASON_URL = [
    "https://www.canada.ca/en/public-health/services/surveillance/"
    f"respiratory-virus-detections-canada/{start_year}-{start_year + 1}.html"
    for start_year in range(2013, 2024)
]
74+
75+
# Alternative base location for season report pages (no URL scheme prefix).
ALTERNATIVE_SEASON_BASE_URL = "www.phac-aspc.gc.ca/bid-bmi/dsd-dsm/rvdi-divr/"

# Backward-compatible alias: the constant was first published under this
# misspelled name, so existing imports of it must keep working.
ALTENRATIVE_SEASON_BASE_URL = ALTERNATIVE_SEASON_BASE_URL

# Site root of the season report pages.
SEASON_BASE_URL = "https://www.canada.ca"

# Week-number threshold used by get_weekly_data(): report weeks below this
# value are assigned to start_year + 1, all other weeks to start_year.
# NOTE(review): the literal this constant replaced in get_weekly_data() was
# 34 -- confirm that the change to 35 is intended.
LAST_WEEK_OF_YEAR = 35
80+

src/acquisition/rvdss/rvdss_historic.py

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,6 @@ def get_report_date(week,start_year,epi=False):
6464
report_date = str(epi_week)
6565

6666
return(report_date)
67-
68-
6967

7068

7169
def get_table_captions(soup):
@@ -257,6 +255,17 @@ def create_percent_positive_detection_table(table,modified_date,start_year, flu=
257255
geo_types = [create_geo_types(g,"lab") for g in table['geo_value']]
258256
table.insert(3,"geo_type",geo_types)
259257

258+
# Calculate number of positive tests based on pct_positive and total tests
259+
if flu:
260+
table["flu_a_positive_tests"] = (table["flu_a_pct_positive"]/100)*table["flu_tests"]
261+
table["flu_b_positive_tests"] = (table["flu_b_pct_positive"]/100)*table["flu_tests"]
262+
263+
table["flu_positive_tests"] = table["flu_a_positive_tests"] + table["flu_b_positive_tests"]
264+
table["flu_pct_positive"] = (table["flu_positive_tests"]/table["flu_tests"])*100
265+
else:
266+
table[virus+"_positive_tests"] = (table[virus+"_pct_positive"]/100) *table[virus+"_tests"]
267+
268+
260269
table = table.set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
261270

262271
return(table)
@@ -309,12 +318,15 @@ def get_season_reports(url):
309318
if "Positive Adenovirus" in caption.text:
310319
tab.select_one('td').decompose()
311320

312-
# Replace commas with periods
313-
tab2 = re.sub(",",r".",str(tab))
314-
321+
if not "number" in caption.text.lower():
322+
# Replace commas with periods
323+
tab = re.sub(",",r".",str(tab))
324+
else:
325+
tab = re.sub(",",r"",str(tab))
326+
315327
# Read table
316328
na_values = ['N.A.','N.A', 'N.C.','N.R.','Not Available','Not Tested',"N.D.","-"]
317-
table = pd.read_html(tab2,na_values=na_values)[0].dropna(how="all")
329+
table = pd.read_html(tab,na_values=na_values)[0].dropna(how="all")
318330

319331
# Check for multiline headers
320332
if isinstance(table.columns, pd.MultiIndex):
@@ -405,12 +417,12 @@ def get_season_reports(url):
405417
all_number_tables=pd.concat([all_number_tables,number_detections_table])
406418

407419
# write files to csvs
408-
all_respiratory_detection_table.to_csv(path+"/"+path+"_respiratory_detections.csv", index=True)
409-
all_positive_tables.to_csv(path+"/"+path+"_positive_tests.csv", index=True)
420+
all_respiratory_detection_table.to_csv(path+"/respiratory_detections.csv", index=True)
421+
all_positive_tables.to_csv(path+"/positive_tests.csv", index=True)
410422

411423
# Write the number of detections table to csv if it exists (i.e has rows)
412424
if len(all_number_tables) != 0:
413-
all_number_tables.to_csv(path+"/"+path+"_number_of_detections.csv", index=True)
425+
all_number_tables.to_csv(path+"/number_of_detections.csv", index=True)
414426

415427
#%% Scrape each season
416428

src/acquisition/rvdss/utils.py

Lines changed: 14 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -6,83 +6,27 @@
66
from datetime import datetime
77
import math
88

9+
from constants import VIRUSES, GEOS, REGIONS, NATION, LAST_WEEK_OF_YEAR
10+
911
def abbreviate_virus(full_name):
    """Return ``full_name`` lower-cased, with known virus names abbreviated.

    Each occurrence of a ``VIRUSES`` key that is delimited by regex word
    boundaries (``\\b``) is replaced by the value ``VIRUSES`` maps it to.
    Text that matches no key is returned unchanged apart from lower-casing.
    """
    text = full_name.lower()

    # Keys are escaped because some of them contain "/", "-" or spaces.
    alternatives = '|'.join(map(re.escape, VIRUSES))
    matcher = re.compile(r'\b(' + alternatives + r')\b')

    def _abbreviation(match):
        # The matched text is always one of the keys the pattern was built from.
        return VIRUSES[match.group()]

    return matcher.sub(_abbreviation, text)
3417

3518
def abbreviate_geo(full_name):
    """Return ``full_name`` lower-cased, with known geography names abbreviated.

    Each occurrence of a ``GEOS`` key that is delimited by regex word
    boundaries (``\\b``) is replaced by the value ``GEOS`` maps it to.
    Text that matches no key is returned unchanged apart from lower-casing.
    """
    text = full_name.lower()

    # Keys are escaped so they are matched literally, not as regex syntax.
    alternatives = '|'.join(map(re.escape, GEOS))
    matcher = re.compile(r'\b(' + alternatives + r')\b')

    def _abbreviation(match):
        # The matched text is always one of the keys the pattern was built from.
        return GEOS[match.group()]

    return matcher.sub(_abbreviation, text)
7725

7826
def create_geo_types(geo,default_geo):
79-
regions = ['atlantic','atl','province of québec','québec','qc','province of ontario','ontario','on',
80-
'prairies', 'pr', "british columbia", 'bc',"territories",'terr']
81-
nation = ["canada","can",'ca']
82-
83-
if geo in nation:
27+
if geo in NATION:
8428
geo_type="nation"
85-
elif geo in regions:
29+
elif geo in REGIONS:
8630
geo_type="region"
8731
else:
8832
geo_type = default_geo
@@ -163,7 +107,7 @@ def get_weekly_data(base_url,start_year):
163107
week_string = week_df.iloc[0]['Text'].lower()
164108
current_week = int(re.search("week (.+?) ", week_string).group(1))
165109

166-
if current_week < 34:
110+
if current_week < LAST_WEEK_OF_YEAR:
167111
current_year = start_year+1
168112
else:
169113
current_year = start_year

0 commit comments

Comments
 (0)