
More fixes to improve run #247

Closed · wants to merge 4 commits
Changes from all commits
29 changes: 24 additions & 5 deletions modules/non_summary_pipeline.py
@@ -12,7 +12,8 @@
 from modules import utils, constants, transformation
 
 # BigQuery can handle rows up to 100 MB.
-MAX_CONTENT_SIZE = 2 * 1024 * 1024
+MAX_CONTENT_SIZE = 100 * 1000000
+MAX_BODY_CONTENT_SIZE = 20 * 1000000
 # Number of times to partition the requests tables.
 NUM_PARTITIONS = 4

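A note on the new constants (my reading, not stated in the PR): both use decimal megabytes, so MAX_CONTENT_SIZE now matches the 100 MB per-row ceiling mentioned in the comment, while MAX_BODY_CONTENT_SIZE budgets 20 MB for any single response body so the rest of the row still fits. A quick sanity check using the values from the diff:

MAX_CONTENT_SIZE = 100 * 1000000      # 100 MB, BigQuery's per-row ceiling
MAX_BODY_CONTENT_SIZE = 20 * 1000000  # 20 MB cap for a single response body

# The body cap must leave headroom for the other columns in the row.
assert MAX_BODY_CONTENT_SIZE < MAX_CONTENT_SIZE
print(MAX_CONTENT_SIZE // 10**6, MAX_BODY_CONTENT_SIZE // 10**6)  # -> 100 20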
@@ -202,6 +203,12 @@ def get_response_bodies(har):
"""Parses response bodies from a HAR object."""

page_url = get_page_url(har)
if not page_url:
logging.warning(
"Skipping response bodies: unable to get page URL (see preceding warning)."
)
return None

requests = har.get("log").get("entries")

response_bodies = []
@@ -215,20 +222,21 @@ def get_response_bodies(har):
         if body is None:
             continue

-        truncated = len(body) > MAX_CONTENT_SIZE
+        truncated = len(body) > MAX_BODY_CONTENT_SIZE
         if truncated:
             logging.warning(
                 'Truncating response body for "%s". Response body size %s exceeds limit %s.'
-                % (request_url, len(body), MAX_CONTENT_SIZE)
+                % (request_url, len(body), MAX_BODY_CONTENT_SIZE)
             )
+            body = body[:MAX_BODY_CONTENT_SIZE]

         metadata = get_metadata(har)

         response_bodies.append(
             {
                 "page": page_url,
                 "url": request_url,
-                "body": body[:MAX_CONTENT_SIZE],
+                "body": body,
                 "truncated": truncated,
                 "date": har["date"],
                 "client": har["client"],
@@ -247,6 +255,13 @@ def get_technologies(har):

     page = har.get("log").get("pages")[0]
     page_url = page.get("_URL")
+
+    if not page_url:
+        logging.warning(
+            "Skipping technologies: unable to get page URL (see preceding warning)."
+        )
+        return None
+
     app_names = page.get("_detected_apps", {})
     categories = page.get("_detected", {})
     metadata = get_metadata(har)
@@ -430,7 +445,11 @@ def to_json(obj):
     if not obj:
         raise ValueError

-    return json.dumps(obj, separators=(",", ":"), ensure_ascii=False)
+    return (
+        json.dumps(obj, separators=(",", ":"), ensure_ascii=False)
+        .encode("utf-8", "surrogatepass")
+        .decode("utf-8", "replace")
+    )


 def from_json(file_name, element):
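My reading of the to_json change: json.dumps with ensure_ascii=False passes lone UTF-16 surrogates (which occasionally appear in scraped response bodies) straight through, and the resulting string cannot be encoded as strict UTF-8, which can break the write to BigQuery later on. Round-tripping through encode("utf-8", "surrogatepass") and decode("utf-8", "replace") swaps each lone surrogate for U+FFFD replacement characters while leaving the rest of the JSON untouched. A small sketch with a made-up payload:

import json

obj = {"body": "ok \ud800 end"}  # "\ud800" is an unpaired surrogate
s = json.dumps(obj, separators=(",", ":"), ensure_ascii=False)

clean = s.encode("utf-8", "surrogatepass").decode("utf-8", "replace")
clean.encode("utf-8")  # succeeds; encoding s directly would raise UnicodeEncodeError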
2 changes: 1 addition & 1 deletion modules/transformation.py
@@ -85,7 +85,7 @@ def __init__(
"create_disposition": BigQueryDisposition.CREATE_IF_NEEDED,
"write_disposition": BigQueryDisposition.WRITE_APPEND,
"additional_bq_parameters": {
"maxBadRecords": 10,
"maxBadRecords": 100,
"ignoreUnknownValues": True,
**self.additional_bq_parameters,
},
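For context on this one-line bump: these keys are BigQuery load-job settings that the pipeline forwards through Beam's WriteToBigQuery via additional_bq_parameters, so raising maxBadRecords from 10 to 100 simply lets a load job tolerate more unparseable records before failing. A rough sketch of how such a dict is typically wired up; the table spec below is a placeholder, not taken from this repo:

import apache_beam as beam

bq_params = {
    "maxBadRecords": 100,         # tolerate up to 100 bad records per load job
    "ignoreUnknownValues": True,  # drop fields that are not in the table schema
}

write = beam.io.WriteToBigQuery(
    table="my-project:my_dataset.my_table",  # placeholder
    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
    write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
    additional_bq_parameters=bq_params,
)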
4 changes: 2 additions & 2 deletions run_pipeline_all.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 python run_all.py \
---input_file=gs://httparchive/crawls_manifest/android-Sep_1_2023.txt \
+--input_file=gs://httparchive/crawls_manifest/android-Feb_1_2024.txt \
 --runner=DataflowRunner \
 --project=httparchive \
 --temp_location=gs://httparchive-staging/experimental/temp \
@@ -9,4 +9,4 @@ python run_all.py \
 --setup_file=./setup.py \
 --machine_type=n1-standard-32 \
 --worker_disk_type=compute.googleapis.com/projects//zones//diskTypes/pd-ssd \
---noauth_local_webserver
+--max_cache_memory_usage_mb=0
4 changes: 2 additions & 2 deletions run_pipeline_combined.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 # shellcheck disable=SC1143,SC2211,SC2215
 python3 run_combined.py \
---input_file=gs://httparchive/crawls_manifest/chrome-Sep_1_2023.txt \
+--input_file=gs://httparchive/crawls_manifest/chrome-Feb_1_2024.txt \
 --runner=DataflowRunner \
 --project=httparchive \
 --temp_location=gs://httparchive-staging/experimental/temp \
@@ -10,4 +10,4 @@ python3 run_combined.py \
 --setup_file=./setup.py \
 --machine_type=n1-standard-32 \
 --worker_disk_type=compute.googleapis.com/projects//zones//diskTypes/pd-ssd \
---noauth_local_webserver
+--max_cache_memory_usage_mb=0
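Both scripts also swap --noauth_local_webserver for --max_cache_memory_usage_mb=0. As I understand it, the latter is a Beam Python worker option (defined in recent SDK releases) that sizes the SDK harness cache for user state and side inputs, so 0 effectively disables that cache. A hedged sketch of how the flag surfaces through PipelineOptions, assuming an SDK version that defines it:

from apache_beam.options.pipeline_options import PipelineOptions

# Trimmed-down flag list; only the options relevant to this change are shown.
opts = PipelineOptions([
    "--runner=DataflowRunner",
    "--project=httparchive",
    "--max_cache_memory_usage_mb=0",  # 0 = do not cache user state / side inputs across bundles
])

# Prints 0 on SDKs that define the option, None otherwise.
print(opts.get_all_options().get("max_cache_memory_usage_mb"))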