Pylint alerts corrections as part of an intervention experiment 1560 #1568

Open
wants to merge 37 commits into base: dev
Changes from all commits
Commits (37)
93c259e
src\maintenance\covidcast_meta_cache_updater.py superfluous-parens
evidencebp Dec 14, 2024
8282f0b
src\server\_query.py comparison-of-constants
evidencebp Dec 14, 2024
24c0557
src\acquisition\quidel\quidel.py too-many-branches
evidencebp Dec 14, 2024
bc23b26
src\acquisition\cdcp\cdc_upload.py too-many-statements
evidencebp Dec 14, 2024
f217d2f
src\server\_printer.py too-many-return-statements
evidencebp Dec 14, 2024
f20aeea
src\acquisition\covidcast\csv_importer.py too-many-return-statements
evidencebp Dec 14, 2024
d2c4147
src\acquisition\covid_hosp\common\database.py too-many-branches
evidencebp Dec 14, 2024
b25edb7
src\server\endpoints\covidcast.py too-many-statements
evidencebp Dec 14, 2024
1310e15
src\acquisition\wiki\wiki_download.py too-many-branches
evidencebp Dec 15, 2024
fa85df2
src\acquisition\ght\ght_update.py too-many-statements
evidencebp Dec 16, 2024
06b088a
src\acquisition\nidss\taiwan_update.py wildcard-import
evidencebp Dec 16, 2024
a3251c5
src\client\delphi_epidata.py broad-exception-caught
evidencebp Dec 16, 2024
6f627b9
src\acquisition\kcdc\kcdc_update.py broad-exception-caught
evidencebp Dec 16, 2024
8d1c155
src\server\endpoints\sensors.py line-too-long
evidencebp Dec 16, 2024
e229426
src\acquisition\paho\paho_db_update.py line-too-long
evidencebp Dec 16, 2024
3accb56
src\server\_limiter.py line-too-long
evidencebp Dec 16, 2024
241a518
src\server\covidcast_issues_migration\proc_db_backups_pd.py line-too-…
evidencebp Dec 16, 2024
a07e7ca
src\maintenance\remove_outdated_keys.py line-too-long
evidencebp Dec 16, 2024
42612bf
src\acquisition\wiki\wiki_util.py line-too-long
evidencebp Dec 16, 2024
18cef97
src\acquisition\fluview\impute_missing_values.py line-too-long
evidencebp Dec 16, 2024
a3099a5
scripts\report_missing_covidcast_meta.py line-too-long
evidencebp Dec 16, 2024
c28759d
src\server\endpoints\fluview_meta.py line-too-long
evidencebp Dec 16, 2024
dd809dc
src\server\endpoints\covidcast_meta.py line-too-long
evidencebp Dec 16, 2024
15402fd
src\server\utils\__init__.py line-too-long
evidencebp Dec 16, 2024
937012f
src\maintenance\update_last_usage.py line-too-long
evidencebp Dec 16, 2024
cddf0af
src\acquisition\ght\google_health_trends.py line-too-long
evidencebp Dec 16, 2024
f14852c
src\server\endpoints\covid_hosp_facility_lookup.py line-too-long
evidencebp Dec 16, 2024
a921f7b
src\server\_pandas.py line-too-long
evidencebp Dec 16, 2024
99d03a5
src\server\endpoints\covidcast_utils\meta.py line-too-long
evidencebp Dec 16, 2024
106d5c0
src\server\main.py line-too-long
evidencebp Dec 16, 2024
282188e
src\server\_exceptions.py line-too-long
evidencebp Dec 16, 2024
33b5e0b
src\acquisition\twtr\healthtweets.py line-too-long
evidencebp Dec 16, 2024
59a0ba1
src\maintenance\signal_dash_data_generator.py line-too-long
evidencebp Dec 16, 2024
35488ff
src\server\endpoints\covid_hosp_facility.py line-too-long
evidencebp Dec 16, 2024
d24a641
src\server\endpoints\delphi.py line-too-long
evidencebp Dec 16, 2024
a0f9a3b
src\common\covidcast_row.py line-too-long
evidencebp Dec 16, 2024
7f8b16c
src\acquisition\covid_hosp\common\utils.py line-too-long
evidencebp Dec 16, 2024
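The 37 commits above each address a single pylint message in a single file: superfluous-parens, comparison-of-constants, too-many-branches, too-many-statements, too-many-return-statements, wildcard-import, broad-exception-caught, and line-too-long. A minimal sketch of reproducing just these alerts locally, assuming a recent pylint is installed (the broad-exception-caught name replaced broad-except in newer releases; the target path below is only an example):

# Run only the checks touched by this PR against one of the edited files.
# Sketch only; not part of the PR.
from pylint.lint import Run

CHECKS = ",".join([
    "superfluous-parens", "comparison-of-constants", "too-many-branches",
    "too-many-statements", "too-many-return-statements", "wildcard-import",
    "broad-exception-caught", "line-too-long",
])

Run(["--disable=all", f"--enable={CHECKS}", "src/server/_printer.py"], exit=False)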
7 changes: 6 additions & 1 deletion scripts/report_missing_covidcast_meta.py
@@ -38,7 +38,12 @@ def compute_missing_signals() -> List[Tuple[Tuple[str, str], Dict]]:


def gen_row(source: str, signal: str, info: Dict) -> Dict:
is_weighted = signal.startswith('smoothed_w') and not (signal.startswith('smoothed_wa') or signal.startswith('smoothed_we') or signal.startswith('smoothed_wi') or signal.startswith('smoothed_wo') or signal.startswith('smoothed_wu'))
is_weighted = (signal.startswith('smoothed_w')
and not (signal.startswith('smoothed_wa')
or signal.startswith('smoothed_we')
or signal.startswith('smoothed_wi')
or signal.startswith('smoothed_wo')
or signal.startswith('smoothed_wu')))
base_name = signal.replace('smoothed_w', 'smoothed_') if is_weighted else signal
bool_str = lambda x: 'TRUE' if x else 'FALSE'

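On the wrapped condition above: str.startswith also accepts a tuple of prefixes, so an equivalent form stays within the line limit without the chained or-clauses (an alternative sketch, not the change made in this commit):

# Equivalent to the multi-line boolean above, using the tuple form of startswith.
NON_WEIGHTED_PREFIXES = ('smoothed_wa', 'smoothed_we', 'smoothed_wi',
                         'smoothed_wo', 'smoothed_wu')
is_weighted = (signal.startswith('smoothed_w')
               and not signal.startswith(NON_WEIGHTED_PREFIXES))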
137 changes: 69 additions & 68 deletions src/acquisition/cdcp/cdc_upload.py
@@ -161,80 +161,81 @@
`total` = %s
"""

# insert (or update) table `cdc`
def insert_cdc(cur, date, page, state, num):
cur.execute(sql_cdc, (date, page, state, num, num))

# insert (or update) table `cdc_meta`
def insert_cdc_meta(cur, date, state, total):
cur.execute(sql_cdc_meta, (date, date, state, total, total))

# loop over rows until the header row is found
def find_header(reader):
for row in reader:
if len(row) > 0 and row[0] == "Date":
return True
return False

# parse csv files for `cdc` and `cdc_meta`
def parse_csv(cur, meta):
def handler(cur, reader):
if not find_header(reader):
raise Exception("header not found")
count = 0
cols = 3 if meta else 4
for row in reader:
if len(row) != cols:
continue
if meta:
(a, c, d) = row
else:
(a, b, c, d) = row
c = c[:-16]
if c not in STATES:
continue
a = datetime.strptime(a, "%b %d, %Y").strftime("%Y-%m-%d")
c = STATES[c]
d = int(d)
if meta:
insert_cdc_meta(cur, a, c, d)
else:
insert_cdc(cur, a, b, c, d)
count += 1
return count

return handler


# recursively open zip files
def parse_zip(cur, zf, level=1):
for name in zf.namelist():
prefix = " " * level
print(prefix, name)
if name[-4:] == ".zip":
with zf.open(name) as temp:
with ZipFile(io.BytesIO(temp.read())) as zf2:
parse_zip(cur, zf2, level + 1)
elif name[-4:] == ".csv":
handler = None
if "Flu Pages by Region" in name:
handler = parse_csv(cur, False)
elif "Regions for all CDC" in name:
handler = parse_csv(cur, True)
else:
print(prefix, " (skipped)")
if handler is not None:
with zf.open(name) as temp:
count = handler(cur, csv.reader(io.StringIO(str(temp.read(), "utf-8"))))
print(prefix, f" {int(count)} rows")
else:
print(prefix, " (ignored)")

def upload(test_mode):
# connect
u, p = secrets.db.epi
cnx = mysql.connector.connect(user=u, password=p, database="epidata")
cur = cnx.cursor()

# insert (or update) table `cdc`
def insert_cdc(date, page, state, num):
cur.execute(sql_cdc, (date, page, state, num, num))

# insert (or update) table `cdc_meta`
def insert_cdc_meta(date, state, total):
cur.execute(sql_cdc_meta, (date, date, state, total, total))

# loop over rows until the header row is found
def find_header(reader):
for row in reader:
if len(row) > 0 and row[0] == "Date":
return True
return False

# parse csv files for `cdc` and `cdc_meta`
def parse_csv(meta):
def handler(reader):
if not find_header(reader):
raise Exception("header not found")
count = 0
cols = 3 if meta else 4
for row in reader:
if len(row) != cols:
continue
if meta:
(a, c, d) = row
else:
(a, b, c, d) = row
c = c[:-16]
if c not in STATES:
continue
a = datetime.strptime(a, "%b %d, %Y").strftime("%Y-%m-%d")
c = STATES[c]
d = int(d)
if meta:
insert_cdc_meta(a, c, d)
else:
insert_cdc(a, b, c, d)
count += 1
return count

return handler

# recursively open zip files
def parse_zip(zf, level=1):
for name in zf.namelist():
prefix = " " * level
print(prefix, name)
if name[-4:] == ".zip":
with zf.open(name) as temp:
with ZipFile(io.BytesIO(temp.read())) as zf2:
parse_zip(zf2, level + 1)
elif name[-4:] == ".csv":
handler = None
if "Flu Pages by Region" in name:
handler = parse_csv(False)
elif "Regions for all CDC" in name:
handler = parse_csv(True)
else:
print(prefix, " (skipped)")
if handler is not None:
with zf.open(name) as temp:
count = handler(csv.reader(io.StringIO(str(temp.read(), "utf-8"))))
print(prefix, f" {int(count)} rows")
else:
print(prefix, " (ignored)")

# find, parse, and move zip files
zip_files = glob.glob("/common/cdc_stage/*.zip")
@@ -244,7 +245,7 @@ def parse_zip(zf, level=1):
print("parsing...")
for f in zip_files:
with ZipFile(f) as zf:
parse_zip(zf)
parse_zip(cur, zf)
print("moving...")
for f in zip_files:
src = f
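The cdc_upload.py commit resolves too-many-statements on upload() by hoisting its nested helpers to module level and passing the database cursor explicitly. Condensed, upload() is then left with roughly this shape (a sketch assembled from the diff above; the subsequent move/commit steps are unchanged and elided here):

def upload(test_mode):
    # connect
    u, p = secrets.db.epi
    cnx = mysql.connector.connect(user=u, password=p, database="epidata")
    cur = cnx.cursor()

    # find and parse zip files; the module-level parse_zip now receives the cursor
    zip_files = glob.glob("/common/cdc_stage/*.zip")
    print("parsing...")
    for f in zip_files:
        with ZipFile(f) as zf:
            parse_zip(cur, zf)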
73 changes: 42 additions & 31 deletions src/acquisition/covid_hosp/common/database.py
@@ -192,37 +192,14 @@ def nan_safe_dtype(dtype, value):
num_values = len(dataframe.index)
if logger:
logger.info('updating values', count=num_values)
n = 0
rows_affected = 0
many_values = []
with self.new_cursor() as cursor:
for index, row in dataframe.iterrows():
values = []
for c in dataframe_columns_and_types:
values.append(nan_safe_dtype(c.dtype, row[c.csv_name]))
many_values.append(id_and_publication_date +
tuple(values) +
tuple(i.csv_name for i in self.additional_fields))
n += 1
# insert in batches because one at a time is slow and all at once makes
# the connection drop :(
if n % 5_000 == 0:
try:
cursor.executemany(sql, many_values)
rows_affected += cursor.rowcount
many_values = []
except Exception as e:
if logger:
logger.error('error on insert', publ_date=publication_date, in_lines=(n-5_000, n), index=index, values=values, exception=e)
raise e
# insert final batch
if many_values:
cursor.executemany(sql, many_values)
rows_affected += cursor.rowcount
if logger:
# NOTE: REPLACE INTO marks 2 rows affected for a "replace" (one for a delete and one for a re-insert)
# which allows us to count rows which were updated
logger.info('rows affected', total=rows_affected, updated=rows_affected-num_values)
self._process_rows(publication_date
, dataframe
, logger
, dataframe_columns_and_types
, nan_safe_dtype
, sql
, id_and_publication_date
, num_values)

# deal with non/seldomly updated columns used like a fk table (if this database needs it)
if hasattr(self, 'AGGREGATE_KEY_COLS'):
@@ -261,6 +238,40 @@ def nan_safe_dtype(dtype, value):
with self.new_cursor() as cur:
cur.executemany(ak_insert_sql, ak_data)

def _process_rows(self, publication_date, dataframe, logger, dataframe_columns_and_types, nan_safe_dtype, sql
, id_and_publication_date, num_values):
n = 0
rows_affected = 0
many_values = []
with self.new_cursor() as cursor:
for index, row in dataframe.iterrows():
values = []
for c in dataframe_columns_and_types:
values.append(nan_safe_dtype(c.dtype, row[c.csv_name]))
many_values.append(id_and_publication_date +
tuple(values) +
tuple(i.csv_name for i in self.additional_fields))
n += 1
# insert in batches because one at a time is slow and all at once makes
# the connection drop :(
if n % 5_000 == 0:
try:
cursor.executemany(sql, many_values)
rows_affected += cursor.rowcount
many_values = []
except Exception as e:
if logger:
logger.error('error on insert', publ_date=publication_date, in_lines=(n-5_000, n), index=index, values=values, exception=e)
raise e
# insert final batch
if many_values:
cursor.executemany(sql, many_values)
rows_affected += cursor.rowcount
if logger:
# NOTE: REPLACE INTO marks 2 rows affected for a "replace" (one for a delete and one for a re-insert)
# which allows us to count rows which were updated
logger.info('rows affected', total=rows_affected, updated=rows_affected-num_values)


def get_max_issue(self, logger=False):
"""Fetch the most recent issue.
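The database.py commit only relocates the batching loop into a _process_rows helper; the insert strategy itself is unchanged. Reduced to its essentials, the pattern looks like this (a standalone sketch assuming a DB-API cursor and pre-built row tuples, not code from this repo):

def batched_executemany(cursor, sql, rows, batch_size=5_000):
    """Insert in fixed-size batches: row-by-row is slow, and one huge
    executemany can drop the connection."""
    pending, affected = [], 0
    for row in rows:
        pending.append(row)
        if len(pending) == batch_size:
            cursor.executemany(sql, pending)
            affected += cursor.rowcount
            pending = []
    if pending:  # final partial batch
        cursor.executemany(sql, pending)
        affected += cursor.rowcount
    return affected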
8 changes: 6 additions & 2 deletions src/acquisition/covid_hosp/common/utils.py
@@ -126,7 +126,10 @@ def issues_to_fetch(metadata, newer_than, older_than, logger=False):
if logger:
if n_beyond > 0:
logger.info("issues available beyond selection", on_or_newer=older_than, count=n_beyond)
logger.info("issues selected", newer_than=str(newer_than), older_than=str(older_than), count=n_selected)
logger.info("issues selected"
, newer_than=str(newer_than)
, older_than=str(older_than)
, count=n_selected)
return daily_issues

@staticmethod
@@ -239,7 +242,8 @@ def update_dataset(database, network, newer_than=None, older_than=None):
all_metadata
))
tot_revs = sum(len(revisions) for revisions in daily_issues.values())
logger.info(f"{len(daily_issues)} issues checked w/ {tot_revs} revisions, resulting in {len(datasets)} datasets.")
logger.info(f"{len(daily_issues)} issues checked w/ {tot_revs} revisions"
+ f", resulting in {len(datasets)} datasets.")
if not datasets:
logger.info("nothing to do, exiting")
return False
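One note on the wrapped log message in update_dataset: adjacent string literals inside a call concatenate implicitly in Python, so the explicit + between the two f-strings is optional (an equivalent alternative, not the form used in the PR):

logger.info(f"{len(daily_issues)} issues checked w/ {tot_revs} revisions"
            f", resulting in {len(datasets)} datasets.")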