Commit eecc6b8

small clarifications in variable names, comments, log messages, and logic in incoming metadata processing
1 parent a356310

1 file changed (+20 -10): src/acquisition/covid_hosp/common/utils.py
@@ -192,30 +192,38 @@ def update_dataset(database, network, newer_than=None, older_than=None):
     metadata = network.fetch_metadata(logger=logger)
     datasets = []
     # daily runs specify no bounds; patching runs specify at least one bound
-    patching = any(bound is not None for bound in (newer_than, older_than))
+    is_patch_run = any(bound is not None for bound in (newer_than, older_than))
+    if is_patch_run:
+      logger.warn('running update_dataset() as a "patch" with some specific date bound[s] specified;'
+                  ' this will include and overwrite any revisions that were already collected.',
+                  newer_than=newer_than, older_than=older_than)
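(A note on the new is_patch_run flag: it is true whenever the caller supplies at least one date bound. A minimal sketch with hypothetical values:)

  import datetime

  # daily run: neither bound supplied
  newer_than, older_than = None, None
  assert not any(bound is not None for bound in (newer_than, older_than))

  # patch run: supplying at least one bound marks the run as a patch
  newer_than = datetime.date(2021, 6, 1)
  assert any(bound is not None for bound in (newer_than, older_than))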
     if older_than is None:
+      # by default, include days "older than tomorrow" which thus includes "today"
       older_than = (datetime.datetime.today().date() + datetime.timedelta(days=1))
     if newer_than is None:
+      # by default, include days "newer than the day before the last update"
+      # which thus includes the day of the last update (in case there are new updates
+      # that day which were published after the one we already ingested)
       with database.connect() as db:
         max_issue = db.get_max_issue(logger=logger)
       newer_than = (max_issue - datetime.timedelta(days=1))
+    logger.info("looking up issues in date range", newer_than=newer_than, older_than=older_than)
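(To illustrate the defaulted bounds with hypothetical dates, assuming, per the new comments, that issues_to_fetch() treats both bounds as exclusive:)

  import datetime

  today = datetime.date(2021, 6, 15)      # hypothetical "today"
  max_issue = datetime.date(2021, 6, 13)  # hypothetical most recently ingested issue

  older_than = today + datetime.timedelta(days=1)      # 2021-06-16, so "today" is in range
  newer_than = max_issue - datetime.timedelta(days=1)  # 2021-06-12, so the last-update day is in range

  # with exclusive bounds, issues 2021-06-13 through 2021-06-15 are fetched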
     daily_issues = Utils.issues_to_fetch(metadata, newer_than, older_than, logger=logger)
     if not daily_issues:
-      logger.info("no new issues; nothing to do")
+      logger.info("no issues found in date range; nothing to do")
       return False
     for issue, revisions in daily_issues.items():
       issue_int = int(issue.strftime("%Y%m%d"))
       # download dataset(s) and save associated metadata
       dataset_list = []
       all_metadata = []
       for url, index in revisions:
-        if not patching:
-          # for daily runs, we only want new datasets
-          with database.connect() as db:
-            already_in_db = db.contains_revision(url)
-          if already_in_db:
-            logger.info(f"already collected revision: {url}")
-        if patching or not already_in_db:
+        with database.connect() as db:
+          already_in_db = db.contains_revision(url)
+        if already_in_db:
+          logger.info(f"already collected revision: {url}")
+        if is_patch_run or not already_in_db:
+          logger.info(f"including dataset revision: {url}")
           dataset_list.append(network.fetch_dataset(url, logger=logger))
           all_metadata.append((url, metadata.loc[index].reset_index().to_json()))
       if not dataset_list:
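(The rewritten loop always checks and logs whether a revision is already in the database, rather than assigning already_in_db only on non-patch runs and relying on short-circuit evaluation, then applies one rule: fetch if this is a patch run, or if the revision is new. A truth-table sketch of that rule:)

  # (is_patch_run, already_in_db) -> should_fetch
  for is_patch_run, already_in_db, should_fetch in [
      (False, False, True),   # daily run, new revision: fetch
      (False, True,  False),  # daily run, already collected: skip
      (True,  False, True),   # patch run, new revision: fetch
      (True,  True,  True),   # patch run, already collected: re-fetch and overwrite
  ]:
      assert (is_patch_run or not already_in_db) == should_fetch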
@@ -230,8 +238,10 @@ def update_dataset(database, network, newer_than=None, older_than=None):
           dataset,
           all_metadata
         ))
+    tot_revs = sum(len(revisions) for revisions in daily_issues.values())
+    logger.info(f"{len(daily_issues)} issues checked w/ {tot_revs} revisions, resulting in {len(datasets)} datasets.")
     if not datasets:
-      logger.info(f"{len(daily_issues)} issues checked containing {sum(len(revisions) for revisions in daily_issues.values())} revisions; nothing to do")
+      logger.info("nothing to do, exiting")
       return False
     with database.connect() as db:
       for issue_int, dataset, all_metadata in datasets:

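(For context, a hypothetical invocation sketch. The import path is assumed from the file location, update_dataset is assumed to be exposed on Utils as the Utils.issues_to_fetch call suggests, and database/network stand in for the concrete classes that callers supply:)

  import datetime
  # hypothetical import path, inferred from src/acquisition/covid_hosp/common/utils.py
  from acquisition.covid_hosp.common.utils import Utils

  # daily run: no bounds, so defaults apply and already-collected revisions are skipped
  Utils.update_dataset(database, network)

  # patch run: at least one bound given; revisions in the date range are
  # re-collected and overwrite what was already ingested
  Utils.update_dataset(database, network,
                       newer_than=datetime.date(2021, 6, 1),
                       older_than=datetime.date(2021, 6, 8))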