From 7b5ba9783d44c6287559c3fc274c6a19ee76fc26 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 18 Jan 2025 00:55:50 -0800 Subject: [PATCH] additional fixes: - redirect: if page url is /path/ -> /path, don't add as extra seed - proxy: don't use global dispatcher, pass dispatcher explicitly when using proxy --- package.json | 2 +- src/crawler.ts | 2 ++ src/util/argParser.ts | 3 +++ src/util/blockrules.ts | 4 +++- src/util/file_reader.ts | 3 ++- src/util/originoverride.ts | 6 +++++- src/util/proxy.ts | 10 ++++++++-- src/util/recorder.ts | 23 ++++++++++++++--------- src/util/sitemapper.ts | 6 +++++- 9 files changed, 43 insertions(+), 16 deletions(-) diff --git a/package.json b/package.json index 8f71b907..edcca228 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "browsertrix-crawler", - "version": "1.5.0-beta.2", + "version": "1.5.0-beta.3", "main": "browsertrix-crawler", "type": "module", "repository": "https://github.com/webrecorder/browsertrix-crawler", diff --git a/src/crawler.ts b/src/crawler.ts index 63229618..a141026a 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -1498,6 +1498,7 @@ self.__bx_behaviors.selectMainBehavior(); logger.info("crawl already finished, running post-crawl tasks", { state: initState, }); + this.finalExit = true; await this.postCrawl(); return; } else if (await this.crawlState.isCrawlStopped()) { @@ -1945,6 +1946,7 @@ self.__bx_behaviors.selectMainBehavior(); depth === 0 && !isChromeError && respUrl !== url.split("#")[0] && + respUrl + "/" !== url && !downloadResponse ) { data.seedId = await this.crawlState.addExtraSeed( diff --git a/src/util/argParser.ts b/src/util/argParser.ts index 4c9db398..513657ee 100644 --- a/src/util/argParser.ts +++ b/src/util/argParser.ts @@ -700,6 +700,9 @@ class ArgParser { // background behaviors to apply const behaviorOpts: { [key: string]: string | boolean } = {}; + if (argv.blockAds) { + argv.behaviors.push("autoclick"); + } if (argv.behaviors.length > 0) { 
argv.behaviors.forEach((x: string) => { if (BEHAVIOR_TYPES.includes(x)) { diff --git a/src/util/blockrules.ts b/src/util/blockrules.ts index 5d3238fb..0e7fb511 100644 --- a/src/util/blockrules.ts +++ b/src/util/blockrules.ts @@ -5,6 +5,7 @@ import { HTTPRequest, Page } from "puppeteer-core"; import { Browser } from "./browser.js"; import { fetch } from "undici"; +import { getProxyDispatcher } from "./proxy.js"; const RULE_TYPES = ["block", "allowOnly"]; @@ -271,7 +272,7 @@ export class BlockRules { logDetails: Record<string, string>, ) { try { - const res = await fetch(reqUrl); + const res = await fetch(reqUrl, { dispatcher: getProxyDispatcher() }); const text = await res.text(); return !!text.match(frameTextMatch); @@ -302,6 +303,7 @@ export class BlockRules { method: "PUT", headers: { "Content-Type": "text/html" }, body, + dispatcher: getProxyDispatcher(), }); } } diff --git a/src/util/file_reader.ts b/src/util/file_reader.ts index 7eea4162..284f0dd8 100644 --- a/src/util/file_reader.ts +++ b/src/util/file_reader.ts @@ -6,6 +6,7 @@ import util from "util"; import { exec as execCallback } from "child_process"; import { logger } from "./logger.js"; +import { getProxyDispatcher } from "./proxy.js"; const exec = util.promisify(execCallback); @@ -85,7 +86,7 @@ async function collectOnlineBehavior(url: string): Promise<FileSources> { const behaviorFilepath = `/app/behaviors/${filename}`; try { - const res = await fetch(url); + const res = await fetch(url, { dispatcher: getProxyDispatcher() }); const fileContents = await res.text(); await fsp.writeFile(behaviorFilepath, fileContents); logger.info( diff --git a/src/util/originoverride.ts b/src/util/originoverride.ts index a00a2b54..1b2b8c41 100644 --- a/src/util/originoverride.ts +++ b/src/util/originoverride.ts @@ -3,6 +3,7 @@ import { formatErr, logger } from "./logger.js"; import { Browser } from "./browser.js"; import { fetch } from "undici"; +import { getProxyDispatcher } from "./proxy.js"; export class OriginOverride { originOverride: { 
origUrl: URL; destUrl: URL }[]; @@ -45,7 +46,10 @@ export class OriginOverride { headers.set("origin", orig.origin); } - const resp = await fetch(newUrl, { headers }); + const resp = await fetch(newUrl, { + headers, + dispatcher: getProxyDispatcher(), + }); const body = Buffer.from(await resp.arrayBuffer()); const respHeaders = Object.fromEntries(resp.headers); diff --git a/src/util/proxy.ts b/src/util/proxy.ts index cf6a3437..5b15d6e2 100644 --- a/src/util/proxy.ts +++ b/src/util/proxy.ts @@ -1,5 +1,5 @@ import net from "net"; -import { Agent, Dispatcher, ProxyAgent, setGlobalDispatcher } from "undici"; +import { Agent, Dispatcher, ProxyAgent } from "undici"; import child_process from "child_process"; @@ -13,6 +13,8 @@ const SSH_PROXY_LOCAL_PORT = 9722; const SSH_WAIT_TIMEOUT = 30000; +let proxyDispatcher: Dispatcher | undefined = undefined; + export function getEnvProxyUrl() { if (process.env.PROXY_SERVER) { return process.env.PROXY_SERVER; @@ -46,10 +48,14 @@ export async function initProxy( // set global fetch() dispatcher (with proxy, if any) const dispatcher = createDispatcher(proxy, agentOpts); - setGlobalDispatcher(dispatcher); + proxyDispatcher = dispatcher; return proxy; } +export function getProxyDispatcher() { + return proxyDispatcher; +} + export function createDispatcher( proxyUrl: string, opts: Agent.Options, diff --git a/src/util/recorder.ts b/src/util/recorder.ts index a10c8175..2e0005b6 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -8,7 +8,7 @@ import { isRedirectStatus, } from "./reqresp.js"; -import { fetch, getGlobalDispatcher, Response } from "undici"; +import { fetch, Response } from "undici"; import { getCustomRewriter, @@ -23,6 +23,7 @@ import { WARCWriter } from "./warcwriter.js"; import { RedisCrawlState, WorkerId } from "./state.js"; import { CDPSession, Protocol } from "puppeteer-core"; import { Crawler } from "../crawler.js"; +import { getProxyDispatcher } from "./proxy.js"; const MAX_BROWSER_DEFAULT_FETCH_SIZE = 
5_000_000; const MAX_TEXT_REWRITE_SIZE = 25_000_000; @@ -1588,14 +1589,18 @@ const headers = reqresp.getRequestHeadersDict(); - const dispatcher = getGlobalDispatcher().compose((dispatch) => { - return (opts, handler) => { - if (opts.headers) { - reqresp.requestHeaders = opts.headers as Record<string, string>; - } - return dispatch(opts, handler); - }; - }); + let dispatcher = getProxyDispatcher(); + + if (dispatcher) { + dispatcher = dispatcher.compose((dispatch) => { + return (opts, handler) => { + if (opts.headers) { + reqresp.requestHeaders = opts.headers as Record<string, string>; + } + return dispatch(opts, handler); + }; + }); + } const resp = await fetch(url!, { method, diff --git a/src/util/sitemapper.ts b/src/util/sitemapper.ts index a13eae16..3ffb40c7 100644 --- a/src/util/sitemapper.ts +++ b/src/util/sitemapper.ts @@ -10,6 +10,7 @@ import { DETECT_SITEMAP } from "./constants.js"; import { sleep } from "./timing.js"; import { fetch, Response } from "undici"; +import { getProxyDispatcher } from "./proxy.js"; const SITEMAP_CONCURRENCY = 5; @@ -65,7 +66,10 @@ export class SitemapReader extends EventEmitter { async _fetchWithRetry(url: string, message: string) { while (true) { - const resp = await fetch(url, { headers: this.headers }); + const resp = await fetch(url, { + headers: this.headers, + dispatcher: getProxyDispatcher(), + }); if (resp.ok) { return resp;