Skip to content
This repository has been archived by the owner on Dec 18, 2024. It is now read-only.

Tech report pipeline #238

Merged
merged 53 commits into from
Mar 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
53 commits
Select commit Hold shift + click to select a range
e1eb9a3
New tech_report_pipeline
giancarloaf Nov 12, 2023
449231b
clean up testing code
giancarloaf Nov 12, 2023
685116e
update to make queries more flexible
giancarloaf Nov 13, 2023
f49c940
add tech_report_deletion.py
giancarloaf Nov 14, 2023
a479045
cleanup tech_report_deletion.py
giancarloaf Nov 14, 2023
88a2fff
downgrade firestore version to solve dependency conflict
maceto Nov 17, 2023
f7bebd1
new pipeline where id is generated
maceto Nov 17, 2023
dd7de1c
add queries and keys to constants.py
giancarloaf Nov 20, 2023
834951d
updates to tech_report_pipeline
giancarloaf Nov 20, 2023
721924a
fix query age weight
maceto Nov 30, 2023
85a704b
sort keys for id hashing
giancarloaf Dec 2, 2023
f755623
parameterize query builder
giancarloaf Dec 2, 2023
b274fc5
Firestore write to retry on any Exception
giancarloaf Dec 2, 2023
61afe5d
fix pipeline argument parsing
giancarloaf Dec 2, 2023
95d2443
linting
giancarloaf Dec 2, 2023
89624d7
update page_weight query in constants
giancarloaf Dec 3, 2023
775b244
add date filtering by month
giancarloaf Dec 3, 2023
e8c77cc
WriteToFirestoreDoFn yields hash_id, element
giancarloaf Dec 3, 2023
6a47b09
extract pipeline arg parsing
giancarloaf Dec 3, 2023
01fc722
optional debug logging
giancarloaf Dec 3, 2023
546fc62
added a field in techonologies query
maceto Dec 7, 2023
a07858b
deleted test file tec_report_pipeline_inserts
maceto Dec 7, 2023
f50ff43
fix debug logging for firestore ids
giancarloaf Dec 7, 2023
b25f298
Update requirements
giancarloaf Jan 28, 2024
5f58545
Update dependency installation in workflows
giancarloaf Jan 28, 2024
468098f
Update Apache Beam version to 2.52.0
giancarloaf Jan 28, 2024
35087a3
Simplify date filtering in technology queries
giancarloaf Feb 11, 2024
75674e4
Update google-cloud-firestore version
giancarloaf Feb 11, 2024
9940e84
Add validation for missing keys in technology_hash_id function
giancarloaf Feb 11, 2024
396ec86
Add unit tests for technology_hash_id function
giancarloaf Feb 11, 2024
062d2a3
Merge branch 'main' into tech_report_pipeline
giancarloaf Feb 11, 2024
5a898f9
Update dependency installation in unittest workflow
giancarloaf Feb 11, 2024
269e191
Linting
giancarloaf Feb 11, 2024
f32b412
Remove unnecessary code for local testing
giancarloaf Feb 11, 2024
8923e9f
Linting
giancarloaf Feb 11, 2024
e99eae7
Update import statement for google.cloud.firestore in tech_report_del…
giancarloaf Feb 11, 2024
9954c47
Update import statement for google.cloud.firestore in tech_report_pip…
giancarloaf Feb 11, 2024
0f56757
Update google-cloud-firestore version to 2.14.0
giancarloaf Feb 11, 2024
94d4965
Add Python 3.8 setup and dependency installation to linting action
giancarloaf Feb 11, 2024
4ef9c8d
testing: Add check for library path
giancarloaf Feb 12, 2024
24487a4
Fix import statement for Google Cloud Firestore
giancarloaf Feb 12, 2024
7b70987
Update import statements in tech_report_deletion.py and tech_report_p…
giancarloaf Feb 12, 2024
ebd0fef
Fix import formatting in tech_report_deletion.py and tech_report_pipe…
giancarloaf Feb 12, 2024
9292a2a
Remove Python setup and dependency installation from linting action
giancarloaf Feb 12, 2024
b5fa43a
Make required parameters optional
giancarloaf Mar 9, 2024
8ee2eaa
Merge branch 'main' into tech_report_pipeline
giancarloaf Mar 9, 2024
9428e24
linting - shorten comments
giancarloaf Mar 9, 2024
ac94e8e
fix adoption query
giancarloaf Mar 9, 2024
27b8663
fix lighthouse query
giancarloaf Mar 9, 2024
6a2472e
fix core_web_vitals query
giancarloaf Mar 9, 2024
e13d267
fix page_weight query
giancarloaf Mar 9, 2024
1c567ae
update query mapping comment to note escaping
giancarloaf Mar 9, 2024
22c42fe
update dependencies
giancarloaf Mar 9, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/deploy-dataflow-flex-template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ on:
- "cloudbuild.yaml"
- "Dockerfile"
- "flex_template_metadata_*.json"
- "requirements.txt"
- "requirements*.txt"

jobs:
deploy:
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
*.pyc
/.vscode
.coverage
.tool-versions

# Ignore generated credentials from google-github-actions/auth
gha-creds-*.json
309 changes: 309 additions & 0 deletions modules/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,312 @@ class MaxContentSize(Enum):

# limit response bodies to 20MB
RESPONSE_BODIES = 20 * 1000000


# Row-identity fields for each query type. Built with the dict(...) keyword
# form; every entry gets its own list object, and insertion order matches the
# order in which the query types are listed here.
TECHNOLOGY_QUERY_ID_KEYS = dict(
    adoption=["date", "technology", "geo", "rank"],
    lighthouse=["date", "technology", "geo", "rank"],
    core_web_vitals=["date", "technology", "geo", "rank"],
    page_weight=["date", "technology", "geo", "rank"],
    technologies=["client", "technology", "category"],
    categories=["category"],
)
"""Mapping of query types to a list of fields that uniquely identify a row."""

# editorconfig-checker-disable
# BigQuery SQL templates keyed by query type. Each template contains a single
# `{date}` placeholder; literal braces inside the JavaScript UDF bodies are
# doubled (`{{` / `}}`) so that formatting leaves them as single braces.
# NOTE(review): `date` is substituted directly into the SQL text rather than
# bound as a query parameter - confirm callers only pass validated dates.
TECHNOLOGY_QUERIES = {
# adoption: one row per (date, app, rank, geo). The GET_ADOPTION UDF turns the
# aggregated (client, origins) records into an object keyed by client; the
# declared return type is STRUCT<desktop, mobile>.
"adoption": """
CREATE TEMPORARY FUNCTION GET_ADOPTION(
records ARRAY<STRUCT<
client STRING,
origins INT64
>>
) RETURNS STRUCT<
desktop INT64,
mobile INT64
> LANGUAGE js AS '''
return Object.fromEntries(records.map(({{client, origins}}) => {{
return [client, origins];
}}));
''';

SELECT
STRING(DATE(date)) as date,
app AS technology,
rank,
geo,
GET_ADOPTION(ARRAY_AGG(STRUCT(
client,
origins
))) AS adoption
FROM
`httparchive.core_web_vitals.technologies`
WHERE date = '{date}'
GROUP BY date, app, rank, geo
""",
# lighthouse: one row per (date, app, rank, geo). The GET_LIGHTHOUSE UDF emits
# one entry per metric in METRIC_MAP (accessibility, best_practices,
# performance, pwa, seo); each entry carries `name` plus a `median_score`
# object stored under each record's client value.
"lighthouse": """
CREATE TEMPORARY FUNCTION GET_LIGHTHOUSE(
records ARRAY<STRUCT<
client STRING,
median_lighthouse_score_accessibility NUMERIC,
median_lighthouse_score_best_practices NUMERIC,
median_lighthouse_score_performance NUMERIC,
median_lighthouse_score_pwa NUMERIC,
median_lighthouse_score_seo NUMERIC
>>
) RETURNS ARRAY<STRUCT<
name STRING,
desktop STRUCT<
median_score NUMERIC
>,
mobile STRUCT<
median_score NUMERIC
>
>> LANGUAGE js AS '''
const METRIC_MAP = {{
accessibility: 'median_lighthouse_score_accessibility',
best_practices: 'median_lighthouse_score_best_practices',
performance: 'median_lighthouse_score_performance',
pwa: 'median_lighthouse_score_pwa',
seo: 'median_lighthouse_score_seo',
}};

// Initialize the Lighthouse map.
const lighthouse = Object.fromEntries(Object.keys(METRIC_MAP).map(metricName => {{
return [metricName, {{name: metricName}}];
}}));

// Populate each client record.
records.forEach(record => {{
Object.entries(METRIC_MAP).forEach(([metricName, median_score]) => {{
lighthouse[metricName][record.client] = {{median_score: record[median_score]}};
}});
}});

return Object.values(lighthouse);
''';

SELECT
STRING(DATE(date)) as date,
app AS technology,
rank,
geo,
GET_LIGHTHOUSE(ARRAY_AGG(STRUCT(
client,
median_lighthouse_score_accessibility,
median_lighthouse_score_best_practices,
median_lighthouse_score_performance,
median_lighthouse_score_pwa,
median_lighthouse_score_seo

))) AS lighthouse
FROM
`httparchive.core_web_vitals.technologies`
WHERE date = '{date}'
GROUP BY date, app, rank, geo
""",
# core_web_vitals: one row per (date, app, rank, geo). The GET_VITALS UDF emits
# one entry per metric in METRIC_MAP (overall, LCP, CLS, FID, FCP, TTFB, INP);
# each maps a [good, any/eligible] column pair to `good_number` / `tested`,
# stored under each record's client value.
"core_web_vitals": """
CREATE TEMPORARY FUNCTION GET_VITALS(
records ARRAY<STRUCT<
client STRING,
origins_with_good_fid INT64,
origins_with_good_cls INT64,
origins_with_good_lcp INT64,
origins_with_good_fcp INT64,
origins_with_good_ttfb INT64,
origins_with_good_inp INT64,
origins_with_any_fid INT64,
origins_with_any_cls INT64,
origins_with_any_lcp INT64,
origins_with_any_fcp INT64,
origins_with_any_ttfb INT64,
origins_with_any_inp INT64,
origins_with_good_cwv INT64,
origins_eligible_for_cwv INT64
>>
) RETURNS ARRAY<STRUCT<
name STRING,
desktop STRUCT<
good_number INT64,
tested INT64
>,
mobile STRUCT<
good_number INT64,
tested INT64
>
>> LANGUAGE js AS '''
const METRIC_MAP = {{
overall: ['origins_with_good_cwv', 'origins_eligible_for_cwv'],
LCP: ['origins_with_good_lcp', 'origins_with_any_lcp'],
CLS: ['origins_with_good_cls', 'origins_with_any_cls'],
FID: ['origins_with_good_fid', 'origins_with_any_fid'],
FCP: ['origins_with_good_fcp', 'origins_with_any_fcp'],
TTFB: ['origins_with_good_ttfb', 'origins_with_any_ttfb'],
INP: ['origins_with_good_inp', 'origins_with_any_inp']
}};

// Initialize the vitals map.
const vitals = Object.fromEntries(Object.keys(METRIC_MAP).map(metricName => {{
return [metricName, {{name: metricName}}];
}}));

// Populate each client record.
records.forEach(record => {{
Object.entries(METRIC_MAP).forEach(([metricName, [good_number, tested]]) => {{
vitals[metricName][record.client] = {{good_number: record[good_number], tested: record[tested]}};
}});
}});

return Object.values(vitals);
''';

SELECT
STRING(DATE(date)) as date,
app AS technology,
rank,
geo,
GET_VITALS(ARRAY_AGG(STRUCT(
client,
origins_with_good_fid,
origins_with_good_cls,
origins_with_good_lcp,
origins_with_good_fcp,
origins_with_good_ttfb,
origins_with_good_inp,
origins_with_any_fid,
origins_with_any_cls,
origins_with_any_lcp,
origins_with_any_fcp,
origins_with_any_ttfb,
origins_with_any_inp,
origins_with_good_cwv,
origins_eligible_for_cwv
))) AS vitals
FROM
`httparchive.core_web_vitals.technologies`
WHERE date = '{date}'
GROUP BY date, app, rank, geo
""",
# technologies: no UDF and no aggregation. Joins the technologies table to
# technology_descriptions on app = technology, keeps only the geo = 'ALL' and
# rank = 'ALL' rows for the date, and orders by origins descending.
# `category_obj` is `category` split on ","; `similar_technologies` is always
# NULL in this query.
"technologies": """
SELECT
client,
app AS technology,
description,
category,
SPLIT(category, ",") AS category_obj,
NULL AS similar_technologies,
origins
FROM
`httparchive.core_web_vitals.technologies`
JOIN
`httparchive.core_web_vitals.technology_descriptions`
ON
app = technology
WHERE date = '{date}' AND geo = 'ALL' AND rank = 'ALL'
ORDER BY origins DESC
""",
# page_weight: one row per (date, app, rank, geo). The GET_PAGE_WEIGHT UDF
# emits one entry per metric in METRICS (total, js, images), each carrying
# `name` plus a `median_bytes` object stored under each record's client value.
# NOTE(review): the STRUCT passed in is built from median_bytes_total /
# median_bytes_js / median_bytes_image, while the UDF declares its fields as
# total / js / images - presumably matched by position; confirm.
"page_weight": """
CREATE TEMPORARY FUNCTION GET_PAGE_WEIGHT(
records ARRAY<STRUCT<
client STRING,
total INT64,
js INT64,
images INT64
>>
) RETURNS ARRAY<STRUCT<
name STRING,
mobile STRUCT<
median_bytes INT64
>,
desktop STRUCT<
median_bytes INT64
>
>> LANGUAGE js AS '''
const METRICS = ['total', 'js', 'images'];

// Initialize the page weight map.
const pageWeight = Object.fromEntries(METRICS.map(metricName => {{
return [metricName, {{name: metricName}}];
}}));

// Populate each client record.
records.forEach(record => {{
METRICS.forEach(metricName => {{
pageWeight[metricName][record.client] = {{median_bytes: record[metricName]}};
}});
}});

return Object.values(pageWeight);
''';

SELECT
STRING(DATE(date)) as date,
app AS technology,
rank,
geo,
GET_PAGE_WEIGHT(ARRAY_AGG(STRUCT(
client,
median_bytes_total,
median_bytes_js,
median_bytes_image
))) AS pageWeight
FROM
`httparchive.core_web_vitals.technologies`
WHERE date = '{date}'
GROUP BY date, app, rank, geo
""",
# categories: reads `httparchive.all.pages` for the date with client = 'mobile'.
# The `categories` CTE counts distinct root_page per category; the
# `technologies` CTE counts distinct root_page per (category, technology).
# The final SELECT returns each category, its origin count, and its
# technologies ordered by their own origin count (descending).
"categories": """
WITH categories AS (
SELECT
category,
COUNT(DISTINCT root_page) AS origins
FROM
`httparchive.all.pages`,
UNNEST(technologies) AS t,
UNNEST(t.categories) AS category
WHERE
date = '{date}' AND
client = 'mobile'
GROUP BY
category
),

technologies AS (
SELECT
category,
technology,
COUNT(DISTINCT root_page) AS origins
FROM
`httparchive.all.pages`,
UNNEST(technologies) AS t,
UNNEST(t.categories) AS category
WHERE
date = '{date}' AND
client = 'mobile'
GROUP BY
category,
technology
)

SELECT
category,
categories.origins,
ARRAY_AGG(technology ORDER BY technologies.origins DESC) AS technologies
FROM
categories
JOIN
technologies
USING
(category)
GROUP BY
category,
categories.origins
ORDER BY
categories.origins DESC
"""
}
"""Mapping of query types to BigQuery SQL queries.
The queries are formatted with the `date` parameter.
Queries containing javascript UDFs require additional curly braces to escape the braces in the UDF.
"""
# editorconfig-checker-enable
Loading
Loading