
More fixes to improve run #247

Closed · wants to merge 4 commits
Changes from all commits
29 changes: 24 additions & 5 deletions modules/non_summary_pipeline.py
@@ -12,7 +12,8 @@
 from modules import utils, constants, transformation
 
 # BigQuery can handle rows up to 100 MB.
-MAX_CONTENT_SIZE = 2 * 1024 * 1024
+MAX_CONTENT_SIZE = 100 * 1000000
+MAX_BODY_CONTENT_SIZE = 20 * 1000000
 # Number of times to partition the requests tables.
 NUM_PARTITIONS = 4

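A note on the new constants (my reading, not stated in the PR): both use decimal megabytes, so MAX_CONTENT_SIZE now matches the 100 MB per-row ceiling mentioned in the comment, while MAX_BODY_CONTENT_SIZE budgets 20 MB for any single response body so the rest of the row still fits. A quick sanity check using the values from the diff:

MAX_CONTENT_SIZE = 100 * 1000000      # 100 MB, BigQuery's per-row ceiling
MAX_BODY_CONTENT_SIZE = 20 * 1000000  # 20 MB cap for a single response body

# The body cap must leave headroom for the other columns in the row.
assert MAX_BODY_CONTENT_SIZE < MAX_CONTENT_SIZE
print(MAX_CONTENT_SIZE // 10**6, MAX_BODY_CONTENT_SIZE // 10**6)  # -> 100 20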
@@ -202,6 +203,12 @@ def get_response_bodies(har):
"""Parses response bodies from a HAR object."""

page_url = get_page_url(har)
if not page_url:
logging.warning(
"Skipping response bodies: unable to get page URL (see preceding warning)."
)
return None

requests = har.get("log").get("entries")

response_bodies = []
@@ -215,20 +222,21 @@ def get_response_bodies(har):
         if body is None:
             continue

-        truncated = len(body) > MAX_CONTENT_SIZE
+        truncated = len(body) > MAX_BODY_CONTENT_SIZE
         if truncated:
             logging.warning(
                 'Truncating response body for "%s". Response body size %s exceeds limit %s.'
-                % (request_url, len(body), MAX_CONTENT_SIZE)
+                % (request_url, len(body), MAX_BODY_CONTENT_SIZE)
             )
+            body = body[:MAX_BODY_CONTENT_SIZE]

         metadata = get_metadata(har)

         response_bodies.append(
             {
                 "page": page_url,
                 "url": request_url,
-                "body": body[:MAX_CONTENT_SIZE],
+                "body": body,
                 "truncated": truncated,
                 "date": har["date"],
                 "client": har["client"],
@@ -247,6 +255,13 @@ def get_technologies(har):

     page = har.get("log").get("pages")[0]
     page_url = page.get("_URL")
+
+    if not page_url:
+        logging.warning(
+            "Skipping technologies: unable to get page URL (see preceding warning)."
+        )
+        return None
+
     app_names = page.get("_detected_apps", {})
     categories = page.get("_detected", {})
     metadata = get_metadata(har)
@@ -430,7 +445,11 @@ def to_json(obj):
     if not obj:
         raise ValueError

-    return json.dumps(obj, separators=(",", ":"), ensure_ascii=False)
+    return (
+        json.dumps(obj, separators=(",", ":"), ensure_ascii=False)
+        .encode("utf-8", "surrogatepass")
+        .decode("utf-8", "replace")
+    )


 def from_json(file_name, element):
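My reading of the to_json change: json.dumps with ensure_ascii=False passes lone UTF-16 surrogates (which occasionally appear in scraped response bodies) straight through, and the resulting string cannot be encoded as strict UTF-8, which can break the write to BigQuery later on. Round-tripping through encode("utf-8", "surrogatepass") and decode("utf-8", "replace") swaps each lone surrogate for U+FFFD replacement characters while leaving the rest of the JSON untouched. A small sketch with a made-up payload:

import json

obj = {"body": "ok \ud800 end"}  # "\ud800" is an unpaired surrogate
s = json.dumps(obj, separators=(",", ":"), ensure_ascii=False)

clean = s.encode("utf-8", "surrogatepass").decode("utf-8", "replace")
clean.encode("utf-8")  # succeeds; encoding s directly would raise UnicodeEncodeError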
2 changes: 1 addition & 1 deletion modules/transformation.py
@@ -85,7 +85,7 @@ def __init__(
"create_disposition": BigQueryDisposition.CREATE_IF_NEEDED,
"write_disposition": BigQueryDisposition.WRITE_APPEND,
"additional_bq_parameters": {
"maxBadRecords": 10,
"maxBadRecords": 100,
"ignoreUnknownValues": True,
**self.additional_bq_parameters,
},
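For context on this one-line bump: these keys are BigQuery load-job settings that the pipeline forwards through Beam's WriteToBigQuery via additional_bq_parameters, so raising maxBadRecords from 10 to 100 simply lets a load job tolerate more unparseable records before failing. A rough sketch of how such a dict is typically wired up; the table spec below is a placeholder, not taken from this repo:

import apache_beam as beam

bq_params = {
    "maxBadRecords": 100,         # tolerate up to 100 bad records per load job
    "ignoreUnknownValues": True,  # drop fields that are not in the table schema
}

write = beam.io.WriteToBigQuery(
    table="my-project:my_dataset.my_table",  # placeholder
    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
    write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
    additional_bq_parameters=bq_params,
)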
4 changes: 2 additions & 2 deletions run_pipeline_all.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 python run_all.py \
---input_file=gs://httparchive/crawls_manifest/android-Sep_1_2023.txt \
+--input_file=gs://httparchive/crawls_manifest/android-Feb_1_2024.txt \
 --runner=DataflowRunner \
 --project=httparchive \
 --temp_location=gs://httparchive-staging/experimental/temp \
@@ -9,4 +9,4 @@ python run_all.py \
 --setup_file=./setup.py \
 --machine_type=n1-standard-32 \
 --worker_disk_type=compute.googleapis.com/projects//zones//diskTypes/pd-ssd \
---noauth_local_webserver
+--max_cache_memory_usage_mb=0
4 changes: 2 additions & 2 deletions run_pipeline_combined.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 # shellcheck disable=SC1143,SC2211,SC2215
 python3 run_combined.py \
---input_file=gs://httparchive/crawls_manifest/chrome-Sep_1_2023.txt \
+--input_file=gs://httparchive/crawls_manifest/chrome-Feb_1_2024.txt \
 --runner=DataflowRunner \
 --project=httparchive \
 --temp_location=gs://httparchive-staging/experimental/temp \
@@ -10,4 +10,4 @@ python3 run_combined.py \
 --setup_file=./setup.py \
 --machine_type=n1-standard-32 \
 --worker_disk_type=compute.googleapis.com/projects//zones//diskTypes/pd-ssd \
---noauth_local_webserver
+--max_cache_memory_usage_mb=0
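Both scripts also swap --noauth_local_webserver for --max_cache_memory_usage_mb=0. As I understand it, the latter is a Beam Python worker option (defined in recent SDK releases) that sizes the SDK harness cache for user state and side inputs, so 0 effectively disables that cache. A hedged sketch of how the flag surfaces through PipelineOptions, assuming an SDK version that defines it:

from apache_beam.options.pipeline_options import PipelineOptions

# Trimmed-down flag list; only the options relevant to this change are shown.
opts = PipelineOptions([
    "--runner=DataflowRunner",
    "--project=httparchive",
    "--max_cache_memory_usage_mb=0",  # 0 = do not cache user state / side inputs across bundles
])

# Prints 0 on SDKs that define the option, None otherwise.
print(opts.get_all_options().get("max_cache_memory_usage_mb"))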