Skip to content

Commit de144b0

Browse files
Technologies cleanup automation (#52)
* cleanup drafts
* aggregated query
* crawl pipeline patch
* staging tech assert
* simpler category rewrite
* typos
* comments
* alias
* Update definitions/output/crawl/pages.js
  Co-authored-by: Barry Pollard <[email protected]>
* Update definitions/output/crawl/pages.js
  Co-authored-by: Barry Pollard <[email protected]>
* Update definitions/output/crawl/pages.js
  Co-authored-by: Barry Pollard <[email protected]>
* lint
* Update definitions/declarations/httparchive.js
  Co-authored-by: Barry Pollard <[email protected]>
---------
Co-authored-by: Barry Pollard <[email protected]>
1 parent 68fce03 commit de144b0

File tree

2 files changed

+115
-16
lines changed

2 files changed

+115
-16
lines changed
Lines changed: 36 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1,17 +1,42 @@
1-
const stagingTables = ['pages', 'requests', 'parsed_css']
2-
for (const table of stagingTables) {
1+
// Staging tables source: https://github.com/HTTPArchive/crawl/blob/main/crawl.py
2+
['pages', 'requests', 'parsed_css'].forEach(table =>
33
declare({
44
schema: 'crawl_staging',
55
name: table
66
})
7-
}
7+
)
88

9-
declare({
10-
schema: 'wappalyzer',
11-
name: 'technologies'
12-
})
9+
// See https://github.com/HTTPArchive/dataform/issues/43
10+
assert('corrupted_technology_values')
11+
.tags(['crawl_complete'])
12+
.query(ctx => `
13+
SELECT
14+
date,
15+
client,
16+
tech,
17+
COUNT(DISTINCT page) AS cnt_pages,
18+
ARRAY_AGG(DISTINCT page LIMIT 3) AS sample_pages
19+
FROM ${ctx.ref('crawl_staging', 'pages')} AS pages
20+
LEFT JOIN pages.technologies AS tech
21+
LEFT JOIN tech.categories AS category
22+
WHERE
23+
date = '${constants.currentMonth}' AND
24+
(
25+
tech.technology NOT IN (SELECT DISTINCT name FROM wappalyzer.technologies)
26+
OR category NOT IN (SELECT DISTINCT name FROM wappalyzer.categories)
27+
OR ARRAY_LENGTH(tech.categories) = 0
28+
)
29+
GROUP BY
30+
date,
31+
client,
32+
tech
33+
ORDER BY cnt_pages DESC
34+
`);
1335

14-
declare({
15-
schema: 'wappalyzer',
16-
name: 'categories'
17-
})
36+
// Wappalyzer tables source: https://github.com/HTTPArchive/wappalyzer/blob/main/.github/workflows/upload.yml
37+
['technologies', 'categories'].forEach(table =>
38+
declare({
39+
schema: 'wappalyzer',
40+
name: table
41+
})
42+
)

definitions/output/crawl/pages.js

Lines changed: 79 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -52,23 +52,97 @@ publish('pages', {
5252
DELETE FROM ${ctx.self()}
5353
WHERE date = '${constants.currentMonth}' AND
5454
client = 'desktop';
55-
`).query(ctx => `
55+
56+
INSERT INTO ${ctx.self()}
5657
SELECT
5758
*
5859
FROM ${ctx.ref('crawl_staging', 'pages')}
5960
WHERE date = '${constants.currentMonth}' AND
6061
client = 'desktop'
61-
${constants.devRankFilter}
62-
`).postOps(ctx => `
62+
${constants.devRankFilter};
63+
6364
DELETE FROM ${ctx.self()}
6465
WHERE date = '${constants.currentMonth}' AND
6566
client = 'mobile';
66-
67-
INSERT INTO ${ctx.self()}
67+
`).query(ctx => `
6868
SELECT
6969
*
7070
FROM ${ctx.ref('crawl_staging', 'pages')}
7171
WHERE date = '${constants.currentMonth}' AND
7272
client = 'mobile'
7373
${constants.devRankFilter}
74+
`).postOps(ctx => `
75+
CREATE TEMP TABLE technologies_cleaned AS (
76+
WITH wappalyzer AS (
77+
SELECT DISTINCT
78+
name AS technology,
79+
categories
80+
FROM ${ctx.ref('wappalyzer', 'technologies')}
81+
),
82+
83+
pages AS (
84+
SELECT
85+
client,
86+
page,
87+
tech.technology,
88+
tech.categories,
89+
tech.info
90+
FROM ${ctx.self()} AS pages
91+
LEFT JOIN pages.technologies AS tech
92+
WHERE date = '${constants.currentMonth}' ${constants.devRankFilter}
93+
),
94+
95+
-- Identify impacted pages
96+
impacted_pages AS (
97+
SELECT DISTINCT
98+
client,
99+
page
100+
FROM pages
101+
LEFT JOIN pages.categories AS category
102+
WHERE
103+
-- Technology is corrupted
104+
technology NOT IN (SELECT DISTINCT technology FROM wappalyzer) OR
105+
-- Technology's category is corrupted
106+
CONCAT(technology, category) NOT IN (
107+
SELECT DISTINCT
108+
CONCAT(technology, category)
109+
FROM wappalyzer
110+
LEFT JOIN wappalyzer.categories AS category
111+
)
112+
),
113+
114+
-- Keep valid technologies and use correct categories
115+
reconstructed_technologies AS (
116+
SELECT
117+
client,
118+
page,
119+
ARRAY_AGG(STRUCT(
120+
pages.technology,
121+
wappalyzer.categories,
122+
pages.info
123+
)) AS technologies
124+
FROM pages
125+
INNER JOIN impacted_pages
126+
USING (client, page)
127+
INNER JOIN wappalyzer
128+
ON pages.technology = wappalyzer.technology
129+
GROUP BY
130+
client,
131+
page
132+
)
133+
134+
SELECT
135+
client,
136+
page,
137+
technologies
138+
FROM reconstructed_technologies
139+
);
140+
141+
-- Update the crawl.pages table with the cleaned and restored technologies
142+
UPDATE ${ctx.self()} AS pages
143+
SET technologies = technologies_cleaned.technologies
144+
FROM technologies_cleaned
145+
WHERE pages.date = '${constants.currentMonth}' AND
146+
pages.client = technologies_cleaned.client AND
147+
pages.page = technologies_cleaned.page;
74148
`)

0 commit comments

Comments
 (0)