Skip to content

Commit c6c5b2f

Browse files
Add GPT-domain fast-path guard and fix entity-encoded URL bypass in rewriteHtmlString
1 parent 92c9cb9 commit c6c5b2f

File tree

2 files changed

+65
-7
lines changed

2 files changed

+65
-7
lines changed

crates/js/lib/src/integrations/gpt/script_guard.ts

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -204,16 +204,25 @@ function rewriteLinkHref(
204204
* `document.write` / `document.writeln`.
205205
*
206206
* Uses `DOMParser` for robust HTML parsing instead of regex so that
207-
* edge-cases (unquoted attributes, unusual spacing, mixed quote styles)
208-
* are handled by the browser's native parser. The raw `getAttribute`
209-
* value is swapped in the original HTML string so the surrounding markup
210-
* is preserved verbatim.
207+
* edge-cases (unquoted attributes, unusual spacing, mixed quote styles,
208+
* HTML-entity-encoded query parameters) are handled by the browser's
209+
* native parser. GPT script `src` attributes are mutated in the parsed
210+
* DOM and the result is serialized back to HTML.
211211
*
212212
* If the GPT domain is present in the HTML but `DOMParser` is
213213
* unavailable or throws, the function **fails closed** (returns an
214214
* empty string) rather than passing the unproxied URL through.
215+
*
216+
* Non-GPT HTML is always passed through unchanged regardless of
217+
* `DOMParser` availability.
215218
*/
216219
function rewriteHtmlString(html: string): string {
220+
// Fast-path: if the HTML does not reference the GPT domain at all,
221+
// pass it through unchanged. This avoids unnecessary DOMParser
222+
// overhead and, critically, prevents non-GPT document.write calls
223+
// from being silently dropped when DOMParser is unavailable.
224+
if (!html.includes(GPT_DOMAIN)) return html;
225+
217226
if (typeof DOMParser === 'undefined') {
218227
log.warn(
219228
`${LOG_PREFIX}: DOMParser unavailable, blocking document.write HTML that references GPT domain`
@@ -224,7 +233,7 @@ function rewriteHtmlString(html: string): string {
224233
try {
225234
const doc = new DOMParser().parseFromString(html, 'text/html');
226235
const scripts = doc.querySelectorAll('script[src]');
227-
let result = html;
236+
let didRewriteAny = false;
228237

229238
for (const script of scripts) {
230239
const rawSrc = script.getAttribute('src') ?? '';
@@ -235,10 +244,17 @@ function rewriteHtmlString(html: string): string {
235244
original: rawSrc,
236245
rewritten: rewrittenUrl,
237246
});
238-
result = result.replaceAll(rawSrc, rewrittenUrl);
247+
// Mutate the parsed DOM so that HTML-entity-encoded attribute
248+
// values (e.g. `&amp;`) are handled correctly. Serializing the
249+
// DOM back to HTML avoids the mismatch between decoded
250+
// `getAttribute()` values and the raw HTML string.
251+
script.setAttribute('src', rewrittenUrl);
252+
didRewriteAny = true;
239253
}
240254

241-
return result;
255+
// DOMParser wraps input in <html><head>…</head><body>…</body></html>.
256+
// Bare <script> tags land in <head>, so we serialize from both.
257+
return didRewriteAny ? (doc.head?.innerHTML ?? '') + (doc.body?.innerHTML ?? '') : html;
242258
} catch (err) {
243259
log.warn(
244260
`${LOG_PREFIX}: failed to parse document.write HTML containing GPT domain, blocking`,

crates/js/lib/test/integrations/gpt/script_guard.test.ts

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,4 +303,46 @@ describe('GPT script guard', () => {
303303
globalThis.DOMParser = originalDOMParser;
304304
}
305305
});
306+
307+
it('passes non-GPT HTML through unchanged when DOMParser is unavailable', () => {
308+
const nativeWriteSpy = vi.fn<(...args: string[]) => void>();
309+
document.write = nativeWriteSpy as unknown as typeof document.write;
310+
311+
const originalDOMParser = globalThis.DOMParser;
312+
// @ts-expect-error — simulating an environment without DOMParser
313+
delete globalThis.DOMParser;
314+
315+
try {
316+
installGptGuard();
317+
318+
const html = '<p>Hello, world!</p>';
319+
document.write(html);
320+
321+
expect(nativeWriteSpy).toHaveBeenCalledTimes(1);
322+
expect(nativeWriteSpy).toHaveBeenCalledWith(html);
323+
} finally {
324+
globalThis.DOMParser = originalDOMParser;
325+
}
326+
});
327+
328+
// -----------------------------------------------------------------------
329+
// HTML-entity-encoded URLs
330+
// -----------------------------------------------------------------------
331+
332+
it('rewrites GPT URLs that contain HTML-escaped entities like &amp;', () => {
333+
const nativeWriteSpy = vi.fn<(...args: string[]) => void>();
334+
document.write = nativeWriteSpy as unknown as typeof document.write;
335+
336+
installGptGuard();
337+
338+
document.write(
339+
'<script src="https://securepubads.g.doubleclick.net/pagead/managed/js/gpt/current/pubads_impl.js?x=1&amp;y=2"></script>'
340+
);
341+
342+
expect(nativeWriteSpy).toHaveBeenCalledTimes(1);
343+
const [writtenHtml] = nativeWriteSpy.mock.calls[0] ?? [];
344+
expect(writtenHtml).toContain(window.location.host);
345+
expect(writtenHtml).toContain('/integrations/gpt/pagead/managed/js/gpt/current/pubads_impl.js');
346+
expect(writtenHtml).not.toContain('securepubads.g.doubleclick.net');
347+
});
306348
});

0 commit comments

Comments
 (0)