Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Autoclick Support #729

Merged
merged 15 commits into from
Jan 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ARG BROWSER_VERSION=1.73.104
ARG BROWSER_VERSION=1.74.48
ARG BROWSER_IMAGE_BASE=webrecorder/browsertrix-browser-base:brave-${BROWSER_VERSION}

FROM ${BROWSER_IMAGE_BASE}
Expand Down Expand Up @@ -39,7 +39,7 @@ ADD config/ /app/

ADD html/ /app/html/

ARG RWP_VERSION=2.2.4
ARG RWP_VERSION=2.2.5
ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/ui.js /app/html/rwp/
ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/sw.js /app/html/rwp/
ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/adblock/adblock.gz /app/html/rwp/adblock.gz
Expand Down
1 change: 1 addition & 0 deletions behaviors.js

Large diffs are not rendered by default.

29 changes: 20 additions & 9 deletions docs/docs/user-guide/cli-options.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,14 @@ Options:
e-page-application crawling or when
different hashtags load dynamic cont
ent
--selectLinks one or more selectors for extracting
--selectLinks, --linkSelector One or more selectors for extracting
links, in the format [css selector]
->[property to use],[css selector]->
@[attribute to use]
[array] [default: ["a[href]->href"]]
--clickSelector Selector for elements to click when
using the autoclick behavior
[string] [default: "a"]
--blockRules Additional rules for blocking certai
n URLs from being loaded, by URL reg
ex and optionally via text match in
Expand All @@ -75,7 +78,8 @@ Options:
[string] [default: "crawl-@ts"]
--headless Run in headless mode, otherwise star
t xvfb [boolean] [default: false]
--driver JS driver for the crawler [string]
--driver Custom driver for the crawler, if an
y [string]
--generateCDX, --generatecdx, --gene If set, generate index (CDXJ) for us
rateCdx e with pywb after crawl is done
[boolean] [default: false]
Expand Down Expand Up @@ -142,8 +146,7 @@ Options:
o crawl working directory) [string]
--behaviors Which background behaviors to enable
on each page
[array] [choices: "autoplay", "autofetch", "autoscroll", "siteSpecific"] [defa
ult: ["autoplay","autofetch","autoscroll","siteSpecific"]]
[array] [default: ["autoplay","autofetch","autoscroll","siteSpecific"]]
--behaviorTimeout If >0, timeout (in seconds) for in-p
age behavior will run on each page.
If 0, a behavior can run until finis
Expand All @@ -163,8 +166,10 @@ Options:
hich contains the browser profile di
rectory [string]
--screenshot Screenshot options for crawler, can
include: view, thumbnail, fullPage
[array] [choices: "view", "thumbnail", "fullPage"] [default: []]
include: view, thumbnail, fullPage,
fullPageFinal
[array] [choices: "view", "thumbnail", "fullPage", "fullPageFinal"] [default:
[]]
--screencastPort If set to a non-zero value, starts a
n HTTP server with screencast access
ible on this port
Expand Down Expand Up @@ -251,9 +256,15 @@ Options:
failing due to non-200 responses
[boolean] [default: false]
--customBehaviors Custom behavior files to inject. Val
ues can be URLs, paths to individual
behavior files, or paths to a direc
tory of behavior files
id values: URL to file, path to file
, path to directory of behaviors, UR
L to Git repo of behaviors (prefixed
with git+, optionally specify branc
h and relative path to a directory w
ithin repo as branch and path query
parameters, e.g. --customBehaviors "
git+https://git.example.com/repo.git
?branch=dev&path=some/dir")
[array] [default: []]
--debugAccessRedis if set, runs internal redis without
protected mode to allow external acc
Expand Down
6 changes: 3 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "1.4.2",
"version": "1.5.0-beta.2",
"main": "browsertrix-crawler",
"type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
Expand All @@ -18,7 +18,7 @@
"dependencies": {
"@novnc/novnc": "1.4.0",
"@webrecorder/wabac": "^2.20.8",
"browsertrix-behaviors": "^0.6.6",
"browsertrix-behaviors": "^0.7.0",
"client-zip": "^2.4.5",
"css-selector-parser": "^3.0.5",
"fetch-socks": "^1.3.0",
Expand All @@ -31,7 +31,7 @@
"p-queue": "^7.3.4",
"pixelmatch": "^5.3.0",
"pngjs": "^7.0.0",
"puppeteer-core": "^23.7.1",
"puppeteer-core": "^24.1.0",
"sax": "^1.3.0",
"sharp": "^0.32.6",
"tsc": "^2.0.4",
Expand Down
106 changes: 91 additions & 15 deletions src/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,7 @@ import { ScreenCaster, WSTransport } from "./util/screencaster.js";
import { Screenshots } from "./util/screenshots.js";
import { initRedis } from "./util/redis.js";
import { logger, formatErr, LogDetails } from "./util/logger.js";
import {
WorkerOpts,
WorkerState,
closeWorkers,
runWorkers,
} from "./util/worker.js";
import { WorkerState, closeWorkers, runWorkers } from "./util/worker.js";
import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
import { collectCustomBehaviors, getInfoString } from "./util/file_reader.js";

Expand Down Expand Up @@ -689,14 +684,9 @@ export class Crawler {
return !!seed.isIncluded(url, depth, extraHops, logDetails);
}

async setupPage({
page,
cdp,
workerid,
callbacks,
recorder,
frameIdToExecId,
}: WorkerOpts) {
async setupPage(opts: WorkerState) {
const { page, cdp, workerid, callbacks, frameIdToExecId, recorder } = opts;

await this.browser.setupPage({ page, cdp });

await this.setupExecContextEvents(cdp, frameIdToExecId);
Expand Down Expand Up @@ -775,6 +765,87 @@ self.__bx_behaviors.selectMainBehavior();

await this.browser.addInitScript(page, initScript);
}

// only add if running with autoclick behavior
if (this.params.behaviors.includes("autoclick")) {
// Ensure off-page navigation is canceled while behavior is running
page.on("dialog", async (dialog) => {
let accepted = true;
if (dialog.type() === "beforeunload") {
if (opts.pageBlockUnload) {
accepted = false;
await dialog.dismiss();
} else {
await dialog.accept();
}
} else {
await dialog.accept();
}
logger.debug("JS Dialog", {
accepted,
blockingUnload: opts.pageBlockUnload,
message: dialog.message(),
type: dialog.type(),
page: page.url(),
workerid,
});
});

// Close any windows opened during navigation from autoclick
await cdp.send("Target.setDiscoverTargets", { discover: true });

cdp.on("Target.targetCreated", async (params) => {
const { targetInfo } = params;
const { type, openerFrameId, targetId } = targetInfo;

try {
if (
type === "page" &&
openerFrameId &&
opts.frameIdToExecId.has(openerFrameId)
) {
await cdp.send("Target.closeTarget", { targetId });
} else {
logger.warn("Extra target not closed", { targetInfo });
}

await cdp.send("Runtime.runIfWaitingForDebugger");
} catch (e) {
// target likely already closed
}
});

void cdp.send("Target.setAutoAttach", {
autoAttach: true,
waitForDebuggerOnStart: true,
flatten: false,
});

if (this.recording) {
await cdp.send("Page.enable");

cdp.on("Page.windowOpen", async (params) => {
const { seedId, depth, extraHops = 0, url } = opts.data;

const logDetails = { page: url, workerid };

await this.queueInScopeUrls(
seedId,
[params.url],
depth,
extraHops,
false,
logDetails,
);
});
}
}

await page.exposeFunction("__bx_addSet", (data: string) =>
this.crawlState.addToUserSet(data),
);

// await page.exposeFunction("__bx_hasSet", (data: string) => this.crawlState.hasUserSet(data));
}

async setupExecContextEvents(
Expand Down Expand Up @@ -932,6 +1003,7 @@ self.__bx_behaviors.selectMainBehavior();
}

opts.markPageUsed();
opts.pageBlockUnload = false;

if (auth) {
await page.setExtraHTTPHeaders({ Authorization: auth });
Expand All @@ -955,8 +1027,12 @@ self.__bx_behaviors.selectMainBehavior();
);
data.favicon = await this.getFavicon(page, logDetails);

opts.pageBlockUnload = true;

await this.doPostLoadActions(opts);

opts.pageBlockUnload = false;

await this.awaitPageExtraDelay(opts);
}

Expand Down Expand Up @@ -1111,7 +1187,7 @@ self.__bx_behaviors.selectMainBehavior();
}
}

async teardownPage({ workerid }: WorkerOpts) {
async teardownPage({ workerid }: WorkerState) {
if (this.screencaster) {
await this.screencaster.stopById(workerid);
}
Expand Down
22 changes: 22 additions & 0 deletions src/create-login-profile.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import { CDPSession, Page, PuppeteerLifeCycleEvent } from "puppeteer-core";
import { getInfoString } from "./util/file_reader.js";
import { DISPLAY } from "./util/constants.js";
import { initProxy } from "./util/proxy.js";
//import { sleep } from "./util/timing.js";

const profileHTML = fs.readFileSync(
new URL("../html/createProfile.html", import.meta.url),
Expand Down Expand Up @@ -437,6 +438,27 @@ class InteractiveBrowser {

// attempt to keep everything to initial tab if headless
if (this.params.headless) {
void cdp.send("Target.setDiscoverTargets", { discover: true });

cdp.on("Target.targetCreated", async (params) => {
const { targetInfo } = params;
const { type, openerFrameId } = targetInfo;

if (type === "page" && openerFrameId) {
await cdp.send("Target.closeTarget", {
targetId: params.targetInfo.targetId,
});
}

await cdp.send("Runtime.runIfWaitingForDebugger");
});

void cdp.send("Target.setAutoAttach", {
autoAttach: true,
waitForDebuggerOnStart: true,
flatten: false,
});

cdp.send("Page.enable").catch((e) => logger.warn("Page.enable error", e));

cdp.on("Page.windowOpen", async (resp) => {
Expand Down
4 changes: 2 additions & 2 deletions src/replaycrawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import { Crawler } from "./crawler.js";
import { ReplayServer } from "./util/replayserver.js";
import { sleep } from "./util/timing.js";
import { logger, formatErr } from "./util/logger.js";
import { WorkerOpts, WorkerState } from "./util/worker.js";
import { WorkerState } from "./util/worker.js";
import { PageState } from "./util/state.js";
import { PageInfoRecord, PageInfoValue, Recorder } from "./util/recorder.js";

Expand Down Expand Up @@ -718,7 +718,7 @@ export class ReplayCrawler extends Crawler {
return text;
}

async teardownPage(opts: WorkerOpts) {
async teardownPage(opts: WorkerState) {
const { page } = opts;
await this.processPageInfo(page);
await super.teardownPage(opts);
Expand Down
23 changes: 21 additions & 2 deletions src/util/argParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import {
EXTRACT_TEXT_TYPES,
SERVICE_WORKER_OPTS,
DEFAULT_SELECTORS,
BEHAVIOR_TYPES,
ExtractSelector,
} from "./constants.js";
import { ScopedSeed } from "./seeds.js";
Expand Down Expand Up @@ -165,13 +166,21 @@ class ArgParser {
},

selectLinks: {
alias: "linkSelector",
describe:
"One or more selectors for extracting links, in the format [css selector]->[property to use],[css selector]->@[attribute to use]",
type: "array",
default: ["a[href]->href"],
coerce,
},

clickSelector: {
describe:
"Selector for elements to click when using the autoclick behavior",
type: "string",
default: "a",
},

blockRules: {
describe:
"Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe",
Expand Down Expand Up @@ -351,7 +360,6 @@ class ArgParser {
describe: "Which background behaviors to enable on each page",
type: "array",
default: ["autoplay", "autofetch", "autoscroll", "siteSpecific"],
choices: ["autoplay", "autofetch", "autoscroll", "siteSpecific"],
coerce,
},

Expand Down Expand Up @@ -693,9 +701,20 @@ class ArgParser {
// background behaviors to apply
const behaviorOpts: { [key: string]: string | boolean } = {};
if (argv.behaviors.length > 0) {
argv.behaviors.forEach((x: string) => (behaviorOpts[x] = true));
argv.behaviors.forEach((x: string) => {
if (BEHAVIOR_TYPES.includes(x)) {
behaviorOpts[x] = true;
} else {
logger.warn(
"Unknown behavior specified, ignoring",
{ behavior: x },
"behavior",
);
}
});
behaviorOpts.log = BEHAVIOR_LOG_FUNC;
behaviorOpts.startEarly = true;
behaviorOpts.clickSelector = argv.clickSelector;
argv.behaviorOpts = JSON.stringify(behaviorOpts);
} else {
argv.behaviorOpts = "";
Expand Down
Loading
Loading