@@ -192,30 +192,38 @@ def update_dataset(database, network, newer_than=None, older_than=None):
     metadata = network.fetch_metadata(logger=logger)
     datasets = []
     # daily runs specify no bounds; patching runs specify at least one bound
-    patching = any(bound is not None for bound in (newer_than, older_than))
+    is_patch_run = any(bound is not None for bound in (newer_than, older_than))
+    if is_patch_run:
+      logger.warn('running update_dataset() as a "patch" with some specific date bound[s] specified;'
+                  ' this will include and overwrite any revisions that were already collected.',
+                  newer_than=newer_than, older_than=older_than)
     if older_than is None:
+      # by default, include days "older than tomorrow" which thus includes "today"
       older_than = (datetime.datetime.today().date() + datetime.timedelta(days=1))
     if newer_than is None:
+      # by default, include days "newer than the day before the last update"
+      # which thus includes the day of the last update (in case there are new updates
+      # that day which were published after the one we already ingested)
       with database.connect() as db:
         max_issue = db.get_max_issue(logger=logger)
       newer_than = (max_issue - datetime.timedelta(days=1))
+    logger.info("looking up issues in date range", newer_than=newer_than, older_than=older_than)
     daily_issues = Utils.issues_to_fetch(metadata, newer_than, older_than, logger=logger)
     if not daily_issues:
-      logger.info("no new issues; nothing to do")
+      logger.info("no issues found in date range; nothing to do")
       return False
     for issue, revisions in daily_issues.items():
       issue_int = int(issue.strftime("%Y%m%d"))
       # download dataset(s) and save associated metadata
       dataset_list = []
       all_metadata = []
       for url, index in revisions:
-        if not patching:
-          # for daily runs, we only want new datasets
-          with database.connect() as db:
-            already_in_db = db.contains_revision(url)
-          if already_in_db:
-            logger.info(f"already collected revision: {url}")
-        if patching or not already_in_db:
+        with database.connect() as db:
+          already_in_db = db.contains_revision(url)
+        if already_in_db:
+          logger.info(f"already collected revision: {url}")
+        if is_patch_run or not already_in_db:
+          logger.info(f"including dataset revision: {url}")
           dataset_list.append(network.fetch_dataset(url, logger=logger))
           all_metadata.append((url, metadata.loc[index].reset_index().to_json()))
       if not dataset_list:
@@ -230,8 +238,10 @@ def update_dataset(database, network, newer_than=None, older_than=None):
         dataset,
         all_metadata
       ))
+    tot_revs = sum(len(revisions) for revisions in daily_issues.values())
+    logger.info(f"{len(daily_issues)} issues checked w/ {tot_revs} revisions, resulting in {len(datasets)} datasets.")
     if not datasets:
-      logger.info(f"{len(daily_issues)} issues checked containing {sum(len(revisions) for revisions in daily_issues.values())} revisions; nothing to do")
+      logger.info("nothing to do, exiting")
       return False
     with database.connect() as db:
       for issue_int, dataset, all_metadata in datasets:
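For reference, a minimal usage sketch (not part of this diff) of how the two kinds of runs exercise the new bounds. The `Utils.update_dataset(...)` call shape follows the signature in the hunk header; the concrete dates are made up for illustration, and `database`/`network` stand in for whatever objects the caller already constructs elsewhere:

```python
import datetime

# Daily run: no bounds given, so the defaults in the code above apply:
#   older_than = today + 1 day      (window includes "today")
#   newer_than = max_issue - 1 day  (window re-checks the day of the last update)
Utils.update_dataset(database, network)

# Patch run: at least one bound given, so is_patch_run is True and revisions
# already in the database inside this window are fetched again and overwritten.
Utils.update_dataset(
  database,
  network,
  newer_than=datetime.date(2021, 8, 1),   # hypothetical bounds, for illustration only
  older_than=datetime.date(2021, 8, 15),
)
```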