@@ -52,23 +52,97 @@ publish('pages', {
52
52
DELETE FROM ${ ctx . self ( ) }
53
53
WHERE date = '${ constants . currentMonth } ' AND
54
54
client = 'desktop';
55
- ` ) . query ( ctx => `
55
+
56
+ INSERT INTO ${ ctx . self ( ) }
56
57
SELECT
57
58
*
58
59
FROM ${ ctx . ref ( 'crawl_staging' , 'pages' ) }
59
60
WHERE date = '${ constants . currentMonth } ' AND
60
61
client = 'desktop'
61
- ${ constants . devRankFilter }
62
- ` ) . postOps ( ctx => `
62
+ ${ constants . devRankFilter } ;
63
+
63
64
DELETE FROM ${ ctx . self ( ) }
64
65
WHERE date = '${ constants . currentMonth } ' AND
65
66
client = 'mobile';
66
-
67
- INSERT INTO ${ ctx . self ( ) }
67
+ ` ) . query ( ctx => `
68
68
SELECT
69
69
*
70
70
FROM ${ ctx . ref ( 'crawl_staging' , 'pages' ) }
71
71
WHERE date = '${ constants . currentMonth } ' AND
72
72
client = 'mobile'
73
73
${ constants . devRankFilter }
74
+ ` ) . postOps ( ctx => `
75
-- Build the replacement `technologies` array for every page of the current
-- month whose detected technologies disagree with the Wappalyzer reference
-- list. One output row per impacted (client, page).
CREATE TEMP TABLE technologies_cleaned AS (
  -- Reference list: one row per known technology with its category array.
  WITH wappalyzer AS (
    SELECT DISTINCT
      name AS technology,
      categories
    FROM ${ctx.ref('wappalyzer', 'technologies')}
  ),

  -- Flatten the page's technologies array: one row per (page, technology).
  -- LEFT JOIN keeps pages whose array is empty (tech.* is NULL for them).
  pages AS (
    SELECT
      client,
      page,
      tech.technology,
      tech.categories,
      tech.info
    FROM ${ctx.self()} AS pages
    LEFT JOIN pages.technologies AS tech
    WHERE date = '${constants.currentMonth}' ${constants.devRankFilter}
  ),

  -- Identify impacted pages
  impacted_pages AS (
    SELECT DISTINCT
      client,
      page
    FROM pages
    LEFT JOIN pages.categories AS category
    WHERE
      -- Technology is corrupted
      technology NOT IN (SELECT DISTINCT technology FROM wappalyzer) OR
      -- Technology's category is corrupted
      CONCAT(technology, category) NOT IN (
        -- CROSS JOIN UNNEST (not LEFT JOIN) so a technology without
        -- categories cannot add a NULL to this list: a single NULL would
        -- make NOT IN evaluate to NULL for every non-matching row and
        -- silently disable this check.
        SELECT DISTINCT
          CONCAT(technology, category)
        FROM wappalyzer
        CROSS JOIN UNNEST(wappalyzer.categories) AS category
      )
  ),

  -- Keep valid technologies and use correct categories
  reconstructed_technologies AS (
    SELECT
      client,
      page,
      ARRAY_AGG(STRUCT(
        pages.technology,
        wappalyzer.categories,
        pages.info
      )) AS technologies
    FROM pages
    INNER JOIN impacted_pages
      USING (client, page)
    INNER JOIN wappalyzer
      ON pages.technology = wappalyzer.technology
    GROUP BY
      client,
      page
  )

  -- Start from impacted_pages so that pages with no valid technology left
  -- (dropped by the INNER JOIN above) still get a row. Their technologies
  -- value is NULL here.
  -- NOTE(review): BigQuery is expected to persist a NULL array as an empty
  -- array, which is the intended "all entries removed" result - confirm.
  SELECT
    client,
    page,
    reconstructed_technologies.technologies
  FROM impacted_pages
  LEFT JOIN reconstructed_technologies
    USING (client, page)
);
140
+
141
-- Update the crawl.pages table with the cleaned and restored technologies.
-- Only current-month rows whose (client, page) has a row in
-- technologies_cleaned are rewritten; all other rows are left as they are.
UPDATE ${ctx.self()} AS pages
SET technologies = technologies_cleaned.technologies
FROM technologies_cleaned
WHERE pages.date = '${constants.currentMonth}' AND
  pages.client = technologies_cleaned.client AND
  pages.page = technologies_cleaned.page;
74
148
` )
0 commit comments