Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
e8f230e
feat: create newtab-content aggregates and combine with newtab
LiamMcFall Jul 23, 2025
3ed81ae
Merge branch 'main' into newtab-content_dev
LiamMcFall Jul 24, 2025
649a237
adds newtab_content_surface_id
LiamMcFall Jul 24, 2025
308debb
update schemas and create new view
LiamMcFall Jul 25, 2025
a690672
Merge branch 'main' into newtab-content_dev
LiamMcFall Jul 25, 2025
78b46fb
updated model with new fields
LiamMcFall Aug 4, 2025
4225449
Merge branch 'main' into newtab-content_dev
LiamMcFall Aug 4, 2025
0e4a20e
revert changes to newtab_items_daily_v1
LiamMcFall Aug 4, 2025
18f679f
Merge branch 'newtab-content_dev' of https://github.com/mozilla/bigqu…
LiamMcFall Aug 4, 2025
f88d7c9
Merge branch 'main' into newtab-content_dev
LiamMcFall Aug 11, 2025
c101a95
Merge branch 'main' into newtab-content_dev
LiamMcFall Aug 27, 2025
b446b04
newtab_content_items_daily combined view
LiamMcFall Aug 27, 2025
f195d88
Merge branch 'main' into newtab-content_dev
LiamMcFall Aug 27, 2025
4e25d46
Merge branch 'newtab-content_dev' of https://github.com/mozilla/bigqu…
LiamMcFall Aug 27, 2025
c71e200
create combined model and new view to join latest corpus item values
LiamMcFall Aug 28, 2025
4412609
Merge branch 'main' into newtab-content_dev
LiamMcFall Aug 28, 2025
0bdd02f
Merge branch 'main' into newtab-content_dev
LiamMcFall Aug 28, 2025
ba36bde
cast as nulls and remove extra field from CTE
LiamMcFall Aug 28, 2025
9c8a62b
Merge branch 'main' into newtab-content_dev
LiamMcFall Aug 28, 2025
61407b7
fix newtab_content_surface_id reference
LiamMcFall Aug 28, 2025
1cded0c
add filter to newtab-content events to ensure content_redacted newtab…
LiamMcFall Aug 28, 2025
a3e1acc
Merge branch 'main' into newtab-content_dev
LiamMcFall Aug 28, 2025
c0f5361
use udf and update country field source
LiamMcFall Sep 3, 2025
3c517e3
Merge branch 'main' into newtab-content_dev
LiamMcFall Sep 3, 2025
cdb91f4
Merge branch 'main' into newtab-content_dev
LiamMcFall Sep 3, 2025
41fee8a
make the content_redacted filter for newtab events explicit and count…
LiamMcFall Sep 3, 2025
8da180f
Merge branch 'main' into newtab-content_dev
LiamMcFall Sep 3, 2025
940f829
Merge branch 'main' into newtab-content_dev
LiamMcFall Sep 4, 2025
0ddc894
change url field name to match previous table
LiamMcFall Sep 5, 2025
b5e4277
Merge branch 'main' into newtab-content_dev
LiamMcFall Sep 9, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
friendly_name: Newtab Content Items Daily Combined
description: |-
A view of the combined (Newtab + Newtab-Content) daily aggregation of newtab content actions on content/items,
joined with the latest corpus item details from the corpus_items_current table so that the most current values for
the corpus item are available.
owners:
- [email protected]
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
CREATE OR REPLACE VIEW
`moz-fx-data-shared-prod.firefox_desktop_derived.newtab_content_items_daily_combined_v1`
AS
SELECT
content.*,
corpus_items.* EXCEPT (corpus_item_id, row_num)
FROM
`moz-fx-data-shared-prod.firefox_desktop_derived.newtab_content_items_daily_v1` AS content
LEFT OUTER JOIN
`moz-fx-data-shared-prod.snowflake_migration_derived.corpus_items_current_v1` AS corpus_items
ON content.corpus_item_id = corpus_items.corpus_item_id
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
friendly_name: Newtab Content Items Daily
description: |-
A daily aggregation of newtab content actions on content/items (example: impressions, clicks, dismissals)
for Firefox desktop, partitioned by day.
owners:
- [email protected]
labels:
application: newtab
incremental: true
schedule: daily
dag: bqetl_newtab
owner1: lmcfall
table_type: aggregate
scheduling:
dag_name: bqetl_newtab
bigquery:
time_partitioning:
type: day
field: submission_date
require_partition_filter: true
expiration_days: null
range_partitioning: null
clustering:
fields:
- channel
- country
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozsoc-ml/service
- workgroup:mozsoc-ml/developers
references: {}
require_column_descriptions: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
WITH newtab_events_unnested AS (
SELECT
DATE(submission_timestamp) AS submission_date,
mozfun.norm.browser_version_info(client_info.app_display_version).major_version AS app_version,
normalized_channel AS channel,
metrics.string.newtab_locale AS locale,
normalized_country_code AS country,
metrics.string.newtab_content_surface_id AS newtab_content_surface_id,
timestamp AS event_timestamp,
category AS event_category,
name AS event_name,
extra AS event_details,
FROM
`moz-fx-data-shared-prod.firefox_desktop_stable.newtab_v1`,
UNNEST(events)
WHERE
DATE(submission_timestamp) = @submission_date
AND category IN ('pocket')
AND name IN ('impression', 'click', 'dismiss')
AND mozfun.norm.browser_version_info(
client_info.app_display_version
).major_version >= 121 -- the [Pocket team started using Glean](https://github.com/Pocket/dbt-snowflake/pull/459) from this version on. This prevents duplicates for previous releases.
),
newtab_flattened_events AS (
SELECT
submission_date,
SAFE_CAST(app_version AS INT64) AS app_version,
channel,
locale,
country,
newtab_content_surface_id,
event_category,
event_name,
mozfun.map.get_key(event_details, 'corpus_item_id') AS corpus_item_id,
SAFE_CAST(mozfun.map.get_key(event_details, 'position') AS INT64) AS position,
SAFE_CAST(mozfun.map.get_key(event_details, 'is_sponsored') AS BOOLEAN) AS is_sponsored,
SAFE_CAST(
mozfun.map.get_key(event_details, 'is_section_followed') AS BOOLEAN
) AS is_section_followed,
mozfun.map.get_key(event_details, 'matches_selected_topic') AS matches_selected_topic,
SAFE_CAST(mozfun.map.get_key(event_details, 'received_rank') AS INT64) AS received_rank,
mozfun.map.get_key(event_details, 'section') AS section,
SAFE_CAST(mozfun.map.get_key(event_details, 'section_position') AS INT64) AS section_position,
mozfun.map.get_key(event_details, 'topic') AS topic,
IFNULL(mozfun.map.get_key(event_details, 'content_redacted'), 'false') AS content_redacted,
NULL AS newtab_content_ping_version
FROM
newtab_events_unnested
),
newtab_daily_agg AS (
SELECT
submission_date,
app_version,
channel,
country,
IFNULL(
newtab_content_surface_id,
mozfun.newtab.scheduled_surface_id_v1(country, locale)
) AS newtab_content_surface_id,
corpus_item_id,
position,
is_sponsored,
is_section_followed,
matches_selected_topic,
received_rank,
section,
section_position,
topic,
content_redacted,
newtab_content_ping_version,
COUNTIF(event_name = 'impression') AS impression_count,
COUNTIF(event_name = 'click') AS click_count,
COUNTIF(event_name = 'dismiss') AS dismiss_count
FROM
newtab_flattened_events
WHERE
-- Filters out non-redacted events. Redacted events will be counted in the newtab_content ping data.
content_redacted = 'false'
GROUP BY
submission_date,
app_version,
channel,
country,
newtab_content_surface_id,
corpus_item_id,
position,
is_sponsored,
is_section_followed,
matches_selected_topic,
received_rank,
section,
section_position,
topic,
content_redacted,
newtab_content_ping_version
),
newtab_content_events_unnested AS (
SELECT
DATE(submission_timestamp) AS submission_date,
normalized_channel AS channel,
IFNULL(metrics.string.newtab_content_country, normalized_country_code) AS country,
metrics.string.newtab_content_surface_id AS newtab_content_surface_id,
timestamp AS event_timestamp,
category AS event_category,
name AS event_name,
extra AS event_details,
metrics.quantity.newtab_content_ping_version AS newtab_content_ping_version
FROM
`moz-fx-data-shared-prod.firefox_desktop.newtab_content`,
UNNEST(events)
WHERE
DATE(submission_timestamp) = @submission_date
AND category IN ('newtab_content')
AND name IN ('impression', 'click', 'dismiss')
),
newtab_content_flattened_events AS (
SELECT
submission_date,
NULL AS app_version,
channel,
country,
newtab_content_surface_id,
event_category,
event_name,
mozfun.map.get_key(event_details, 'corpus_item_id') AS corpus_item_id,
SAFE_CAST(mozfun.map.get_key(event_details, 'position') AS INT64) AS position,
SAFE_CAST(mozfun.map.get_key(event_details, 'is_sponsored') AS BOOLEAN) AS is_sponsored,
SAFE_CAST(
mozfun.map.get_key(event_details, 'is_section_followed') AS BOOLEAN
) AS is_section_followed,
mozfun.map.get_key(event_details, 'matches_selected_topic') AS matches_selected_topic,
SAFE_CAST(mozfun.map.get_key(event_details, 'received_rank') AS INT64) AS received_rank,
mozfun.map.get_key(event_details, 'section') AS section,
SAFE_CAST(mozfun.map.get_key(event_details, 'section_position') AS INT64) AS section_position,
mozfun.map.get_key(event_details, 'topic') AS topic,
CAST(NULL AS STRING) AS content_redacted,
newtab_content_ping_version
FROM
newtab_content_events_unnested
),
newtab_content_daily_agg AS (
SELECT
submission_date,
app_version,
channel,
country,
newtab_content_surface_id,
corpus_item_id,
position,
is_sponsored,
is_section_followed,
matches_selected_topic,
received_rank,
section,
section_position,
topic,
content_redacted,
newtab_content_ping_version,
COUNTIF(event_name = 'impression') AS impression_count,
COUNTIF(event_name = 'click') AS click_count,
COUNTIF(event_name = 'dismiss') AS dismiss_count
FROM
newtab_content_flattened_events
WHERE
-- Only including events from pings with a version
-- to ensure all events coming from this CTE are from the Newtab-Content ping
newtab_content_ping_version IS NOT NULL
GROUP BY
submission_date,
app_version,
channel,
country,
newtab_content_surface_id,
corpus_item_id,
position,
is_sponsored,
is_section_followed,
matches_selected_topic,
received_rank,
section,
section_position,
topic,
content_redacted,
newtab_content_ping_version
)
SELECT
*
FROM
newtab_daily_agg
UNION ALL
SELECT
*
FROM
newtab_content_daily_agg
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
fields:
- name: submission_date
type: DATE
mode: NULLABLE
description: Date when client action took place
- name: channel
type: STRING
mode: NULLABLE
- name: country
type: STRING
mode: NULLABLE
- name: newtab_content_surface_id
type: STRING
mode: NULLABLE
- name: corpus_item_id
type: STRING
mode: NULLABLE
- name: position
type: INTEGER
mode: NULLABLE
description: The position (0-index) of the pocket tile.
- name: is_sponsored
type: BOOLEAN
mode: NULLABLE
description: Whether the pocket tile was sponsored (has an ad callback).
- name: is_section_followed
type: BOOLEAN
mode: NULLABLE
description: If click belongs in a section, if that section is followed
- name: matches_selected_topic
type: STRING
mode: NULLABLE
description: >
Returns value based on if a the topic of the pocket recommendation
matches one of the user-selected topic categories
- name: received_rank
type: INTEGER
mode: NULLABLE
description: >
The rank or order of the recommendation at the time it was sent to
the client.
- name: section
type: STRING
mode: NULLABLE
description: If click belongs in a section, the name of the section
- name: section_position
type: INTEGER
mode: NULLABLE
description: If click belongs in a section, the numberic position of the section
- name: topic
type: STRING
mode: NULLABLE
description: The topic of the recommendation. Like "entertainment".
- name: content_redacted
type: STRING
mode: NULLABLE
description: Are content details sent separately in the newtab_content ping
- name: newtab_content_ping_version
type: INTEGER
mode: NULLABLE
- name: impression_count
type: INTEGER
mode: NULLABLE
description: Count of articles impressed on Newtab
- name: click_count
type: INTEGER
mode: NULLABLE
description: Count of articles clicked on Newtab
- name: dismiss_count
type: INTEGER
mode: NULLABLE
description: Count of articles dismissed on Newtab
- name: app_version
type: INTEGER
mode: NULLABLE
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ WITH corpus_item_id_row_nums AS (
SELECT
approved_corpus_item_external_id AS corpus_item_id,
title,
url,
url AS recommendation_url,
authors,
publisher,
reviewed_corpus_item_updated_at AS corpus_item_updated_at,
Expand Down