Skip to content

Commit

Permalink
additional fixes:
Browse files Browse the repository at this point in the history
- redirect: if page url is /path/ -> /path, don't add as extra seed
- proxy: don't use global dispatcher, pass dispatcher explicitly when using proxy
  • Loading branch information
ikreymer committed Jan 18, 2025
1 parent 5961a52 commit 7b5ba97
Show file tree
Hide file tree
Showing 9 changed files with 43 additions and 16 deletions.
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "1.5.0-beta.2",
"version": "1.5.0-beta.3",
"main": "browsertrix-crawler",
"type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
Expand Down
2 changes: 2 additions & 0 deletions src/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1498,6 +1498,7 @@ self.__bx_behaviors.selectMainBehavior();
logger.info("crawl already finished, running post-crawl tasks", {
state: initState,
});
this.finalExit = true;
await this.postCrawl();
return;
} else if (await this.crawlState.isCrawlStopped()) {
Expand Down Expand Up @@ -1945,6 +1946,7 @@ self.__bx_behaviors.selectMainBehavior();
depth === 0 &&
!isChromeError &&
respUrl !== url.split("#")[0] &&
respUrl + "/" !== url &&
!downloadResponse
) {
data.seedId = await this.crawlState.addExtraSeed(
Expand Down
3 changes: 3 additions & 0 deletions src/util/argParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -700,6 +700,9 @@ class ArgParser {

// background behaviors to apply
const behaviorOpts: { [key: string]: string | boolean } = {};
if (argv.blockAds) {
argv.behaviors.push("autoclick");
}
if (argv.behaviors.length > 0) {
argv.behaviors.forEach((x: string) => {
if (BEHAVIOR_TYPES.includes(x)) {
Expand Down
4 changes: 3 additions & 1 deletion src/util/blockrules.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import { HTTPRequest, Page } from "puppeteer-core";
import { Browser } from "./browser.js";

import { fetch } from "undici";
import { getProxyDispatcher } from "./proxy.js";

const RULE_TYPES = ["block", "allowOnly"];

Expand Down Expand Up @@ -271,7 +272,7 @@ export class BlockRules {
logDetails: Record<string, any>,
) {
try {
const res = await fetch(reqUrl);
const res = await fetch(reqUrl, { dispatcher: getProxyDispatcher() });
const text = await res.text();

return !!text.match(frameTextMatch);
Expand Down Expand Up @@ -302,6 +303,7 @@ export class BlockRules {
method: "PUT",
headers: { "Content-Type": "text/html" },
body,
dispatcher: getProxyDispatcher(),
});
}
}
Expand Down
3 changes: 2 additions & 1 deletion src/util/file_reader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import util from "util";
import { exec as execCallback } from "child_process";

import { logger } from "./logger.js";
import { getProxyDispatcher } from "./proxy.js";

const exec = util.promisify(execCallback);

Expand Down Expand Up @@ -85,7 +86,7 @@ async function collectOnlineBehavior(url: string): Promise<FileSources> {
const behaviorFilepath = `/app/behaviors/${filename}`;

try {
const res = await fetch(url);
const res = await fetch(url, { dispatcher: getProxyDispatcher() });
const fileContents = await res.text();
await fsp.writeFile(behaviorFilepath, fileContents);
logger.info(
Expand Down
6 changes: 5 additions & 1 deletion src/util/originoverride.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import { formatErr, logger } from "./logger.js";
import { Browser } from "./browser.js";

import { fetch } from "undici";
import { getProxyDispatcher } from "./proxy.js";

export class OriginOverride {
originOverride: { origUrl: URL; destUrl: URL }[];
Expand Down Expand Up @@ -45,7 +46,10 @@ export class OriginOverride {
headers.set("origin", orig.origin);
}

const resp = await fetch(newUrl, { headers });
const resp = await fetch(newUrl, {
headers,
dispatcher: getProxyDispatcher(),
});

const body = Buffer.from(await resp.arrayBuffer());
const respHeaders = Object.fromEntries(resp.headers);
Expand Down
10 changes: 8 additions & 2 deletions src/util/proxy.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import net from "net";
import { Agent, Dispatcher, ProxyAgent, setGlobalDispatcher } from "undici";
import { Agent, Dispatcher, ProxyAgent } from "undici";

import child_process from "child_process";

Expand All @@ -13,6 +13,8 @@ const SSH_PROXY_LOCAL_PORT = 9722;

const SSH_WAIT_TIMEOUT = 30000;

let proxyDispatcher: Dispatcher | undefined = undefined;

export function getEnvProxyUrl() {
if (process.env.PROXY_SERVER) {
return process.env.PROXY_SERVER;
Expand Down Expand Up @@ -46,10 +48,14 @@ export async function initProxy(

// create fetch() dispatcher (with proxy, if any), kept in proxyDispatcher for explicit use
const dispatcher = createDispatcher(proxy, agentOpts);
setGlobalDispatcher(dispatcher);
proxyDispatcher = dispatcher;
return proxy;
}

/**
 * Returns the module-level dispatcher assigned by initProxy().
 *
 * The value is `undefined` until initProxy() has assigned it. Callers pass the
 * result as the `dispatcher` option of fetch() instead of relying on a global
 * dispatcher.
 *
 * @returns the stored dispatcher, or `undefined` if none has been assigned yet
 */
export function getProxyDispatcher() {
return proxyDispatcher;
}

export function createDispatcher(
proxyUrl: string,
opts: Agent.Options,
Expand Down
23 changes: 14 additions & 9 deletions src/util/recorder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import {
isRedirectStatus,
} from "./reqresp.js";

import { fetch, getGlobalDispatcher, Response } from "undici";
import { fetch, Response } from "undici";

import {
getCustomRewriter,
Expand All @@ -23,6 +23,7 @@ import { WARCWriter } from "./warcwriter.js";
import { RedisCrawlState, WorkerId } from "./state.js";
import { CDPSession, Protocol } from "puppeteer-core";
import { Crawler } from "../crawler.js";
import { getProxyDispatcher } from "./proxy.js";

const MAX_BROWSER_DEFAULT_FETCH_SIZE = 5_000_000;
const MAX_TEXT_REWRITE_SIZE = 25_000_000;
Expand Down Expand Up @@ -1588,14 +1589,18 @@ class AsyncFetcher {

const headers = reqresp.getRequestHeadersDict();

const dispatcher = getGlobalDispatcher().compose((dispatch) => {
return (opts, handler) => {
if (opts.headers) {
reqresp.requestHeaders = opts.headers as Record<string, string>;
}
return dispatch(opts, handler);
};
});
let dispatcher = getProxyDispatcher();

if (dispatcher) {
dispatcher = dispatcher.compose((dispatch) => {
return (opts, handler) => {
if (opts.headers) {
reqresp.requestHeaders = opts.headers as Record<string, string>;
}
return dispatch(opts, handler);
};
});
}

const resp = await fetch(url!, {
method,
Expand Down
6 changes: 5 additions & 1 deletion src/util/sitemapper.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import { DETECT_SITEMAP } from "./constants.js";
import { sleep } from "./timing.js";

import { fetch, Response } from "undici";
import { getProxyDispatcher } from "./proxy.js";

const SITEMAP_CONCURRENCY = 5;

Expand Down Expand Up @@ -65,7 +66,10 @@ export class SitemapReader extends EventEmitter {

async _fetchWithRetry(url: string, message: string) {
while (true) {
const resp = await fetch(url, { headers: this.headers });
const resp = await fetch(url, {
headers: this.headers,
dispatcher: getProxyDispatcher(),
});

if (resp.ok) {
return resp;
Expand Down

0 comments on commit 7b5ba97

Please sign in to comment.