From 94d9a1ea3399df1a1f6653a5a2f2b4784de98dde Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 30 Aug 2025 12:41:10 -0700 Subject: [PATCH 01/29] dedup work: - resource dedup via page digest - page dedup via page digest check, blocking of dupe page --- src/crawler.ts | 10 +++- src/util/recorder.ts | 129 +++++++++++++++++++++++++++++++++++++------ src/util/state.ts | 28 ++++++++++ 3 files changed, 147 insertions(+), 20 deletions(-) diff --git a/src/crawler.ts b/src/crawler.ts index 8ad585d8..b777d27e 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -1095,7 +1095,7 @@ self.__bx_behaviors.selectMainBehavior(); const { page, cdp, data, workerid, callbacks, recorder } = opts; data.callbacks = callbacks; - const { url, seedId } = data; + const { url, seedId, depth } = data; const auth = this.seeds[seedId].authHeader(); @@ -1168,6 +1168,7 @@ self.__bx_behaviors.selectMainBehavior(); if (recorder) { recorder.pageSeed = seed; + recorder.pageSeedDepth = depth; } // run custom driver here, if any @@ -1346,6 +1347,7 @@ self.__bx_behaviors.selectMainBehavior(); } else { if (pageSkipped) { await this.crawlState.markExcluded(url); + this.limitHit = false; } else { const retry = await this.crawlState.markFailed(url, noRetries); @@ -2217,7 +2219,11 @@ self.__bx_behaviors.selectMainBehavior(); if (msg.startsWith("net::ERR_BLOCKED_BY_RESPONSE")) { // excluded in recorder data.pageSkipped = true; - logger.warn("Page Load Blocked, skipping", { msg, loadState }); + logger.warn( + "Page Load Blocked, skipping", + { msg, loadState }, + "pageStatus", + ); throw new Error("logged"); } else { return this.pageFailed("Page Load Failed", retry, { diff --git a/src/util/recorder.ts b/src/util/recorder.ts index c28cbbcb..b7f6d1af 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -27,6 +27,7 @@ import { getProxyDispatcher } from "./proxy.js"; import { ScopedSeed } from "./seeds.js"; import EventEmitter from "events"; import { DEFAULT_MAX_RETRIES } from "./constants.js"; +import { createHash } from "crypto"; const MAX_BROWSER_DEFAULT_FETCH_SIZE = 5_000_000; const MAX_TEXT_REWRITE_SIZE = 25_000_000; @@ -37,7 +38,7 @@ const TAKE_STREAM_BUFF_SIZE = 1024 * 64; const ASYNC_FETCH_DUPE_KEY = "s:fetchdupe"; -const WRITE_DUPE_KEY = "s:writedupe"; +const WRITE_DUPE_KEY = "dupe"; const MIME_EVENT_STREAM = "text/event-stream"; @@ -141,6 +142,7 @@ export class Recorder extends EventEmitter { pageid!: string; pageSeed?: ScopedSeed; + pageSeedDepth = 0; frameIdToExecId: Map | null; @@ -831,6 +833,20 @@ export class Recorder extends EventEmitter { const rewritten = await this.rewriteResponse(reqresp, mimeType); + if (url === this.pageUrl && reqresp.payload && this.pageSeedDepth >= 1) { + const hash = + "sha256:" + createHash("sha256").update(reqresp.payload).digest("hex"); + const res = await this.crawlState.getHashDupe(WRITE_DUPE_KEY, hash, url); + if (res && res.dupe) { + const errorReason = "BlockedByResponse"; + await cdp.send("Fetch.failRequest", { + requestId, + errorReason, + }); + return true; + } + } + // not rewritten, and not streaming, return false to continue if (!rewritten && !streamingConsume) { if (!reqresp.payload) { @@ -1519,11 +1535,9 @@ export class Recorder extends EventEmitter { const { url } = reqresp; const { logDetails } = this; try { - let readSize = await serializer.digestRecord(); - if (serializer.httpHeadersBuff) { - readSize -= serializer.httpHeadersBuff.length; - } - reqresp.readSize = readSize; + reqresp.readSize = await serializer.digestRecord({ + includeHeadersSize: false, + }); // 
set truncated field and recompute header buff if (reqresp.truncated) { logger.warn( @@ -1609,20 +1623,20 @@ export class Recorder extends EventEmitter { return false; } - if ( - url && - method === "GET" && - !isRedirectStatus(status) && - !(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url, status)) - ) { - logNetwork("Skipping dupe", { url, status, ...this.logDetails }); - return false; - } + // if ( + // url && + // method === "GET" && + // !isRedirectStatus(status) && + // !(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url, status)) + // ) { + // logNetwork("Skipping dupe", { url, status, ...this.logDetails }); + // return false; + // } - const responseRecord = createResponse(reqresp, pageid, iter); + let responseRecord = createResponse(reqresp, pageid, iter); const requestRecord = createRequest(reqresp, responseRecord, pageid); - const serializer = new WARCSerializer(responseRecord, { + let serializer = new WARCSerializer(responseRecord, { gzip, maxMemSize: MAX_BROWSER_DEFAULT_FETCH_SIZE, }); @@ -1633,6 +1647,7 @@ export class Recorder extends EventEmitter { ) { serializer.externalBuffer?.purge(); await this.crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url, status); + //await this.crawlState.removeDupe(WRITE_DUPE_KEY, url, status); return false; } @@ -1659,7 +1674,39 @@ export class Recorder extends EventEmitter { } } } else { - await serializer.digestRecord(); + reqresp.readSize = await serializer.digestRecord({ + includeHeadersSize: false, + }); + } + + const hash = responseRecord.warcPayloadDigest || ""; + const date = responseRecord.warcDate || ""; + + const isEmpty = reqresp.readSize === 0; + + if (!isEmpty && url && method === "GET" && !isRedirectStatus(status)) { + const { dupe, origUrl, origDate } = await this.crawlState.getHashDupe( + WRITE_DUPE_KEY, + hash, + url, + ); + + if (dupe) { + // duplicate url at origTs + // skip, no need for revisit + logNetwork("Skipping dupe", { url, status, ...this.logDetails }); + return false; + } else if (origUrl && origDate) { + serializer.externalBuffer?.purge(); + ({ responseRecord, serializer } = await createRevisitForResponse( + responseRecord, + serializer, + origUrl, + origDate, + )); + } else { + // no dupe, continue + } } let modified = false; @@ -1690,6 +1737,10 @@ export class Recorder extends EventEmitter { this.addPageRecord(reqresp); + if (!isEmpty) { + await this.crawlState.addHashDupe(WRITE_DUPE_KEY, hash, url, date); + } + return true; } } @@ -2053,6 +2104,48 @@ function createResponse( ); } +// ================================================================= +// revisit +async function createRevisitForResponse( + responseRecord: WARCRecord, + serializer: WARCSerializer, + refersToUrl: string, + refersToDate: string, +) { + const origPayloadDigest = responseRecord.warcPayloadDigest; + + const warcHeaders: Record = { + "WARC-Page-ID": responseRecord.warcHeaders.headers.get("WARC-Page-ID")!, + }; + + const revisitRecord = WARCRecord.create({ + url: responseRecord.warcTargetURI!, + date: responseRecord.warcDate!, + warcVersion: "WARC/1.1", + type: "revisit", + warcHeaders, + refersToUrl, + refersToDate, + }); + revisitRecord.httpHeaders = responseRecord.httpHeaders; + + serializer = new WARCSerializer(revisitRecord, { + gzip: true, + maxMemSize: MAX_BROWSER_DEFAULT_FETCH_SIZE, + }); + + await serializer.digestRecord(); + + if (origPayloadDigest) { + revisitRecord.warcHeaders.headers.set( + "WARC-Payload-Digest", + origPayloadDigest, + ); + } + + return { serializer, responseRecord: revisitRecord }; +} + // 
================================================================= // request function createRequest( diff --git a/src/util/state.ts b/src/util/state.ts index ab9810ea..68c46dc2 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -1073,6 +1073,34 @@ return inx; return await this.redis.srem(key, normalizeDedupStatus(status) + "|" + url); } + async getHashDupe( + key: string, + hash: string, + url: string, + ): Promise<{ dupe?: boolean; origDate?: string; origUrl?: string }> { + const value = await this.redis.hget(key, hash); + if (!value) { + return {}; + } + const val = value.split("|"); + // if matches the first entry, return + if (val[1] === url) { + return { dupe: true }; + } + // otherwise, check if a revisit entry + if (await this.redis.sismember(`${key}:${hash}`, url)) { + return { dupe: true }; + } + return { origUrl: val[1], origDate: val[0] }; + } + + async addHashDupe(key: string, hash: string, url: string, date: string) { + const val = date + "|" + url; + if (!(await this.redis.hsetnx(key, hash, val))) { + await this.redis.sadd(`${key}:${hash}`, url); + } + } + async isInUserSet(value: string) { return (await this.redis.sismember(this.key + ":user", value)) === 1; } From f80fded4559eb7b4a30b15a8dfca31e6dd0464f6 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 16 Sep 2025 17:48:13 -0700 Subject: [PATCH 02/29] args: add separate --dedupIndexUrl to support separate redis for dedup indexing prep: - move WACZLoader to wacz for reuse --- src/crawler.ts | 23 ++++++++++------------- src/replaycrawler.ts | 39 +-------------------------------------- src/util/argParser.ts | 7 +++++++ src/util/redis.ts | 14 ++++++++++++++ src/util/state.ts | 11 +++++++---- src/util/wacz.ts | 40 +++++++++++++++++++++++++++++++++++++++- 6 files changed, 78 insertions(+), 56 deletions(-) diff --git a/src/crawler.ts b/src/crawler.ts index b777d27e..4cba5a38 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -31,7 +31,7 @@ import { } from "./util/storage.js"; import { ScreenCaster, WSTransport } from "./util/screencaster.js"; import { Screenshots } from "./util/screenshots.js"; -import { initRedis } from "./util/redis.js"; +import { initRedisWaitForSuccess } from "./util/redis.js"; import { logger, formatErr, LogDetails, LogContext } from "./util/logger.js"; import { WorkerState, closeWorkers, runWorkers } from "./util/worker.js"; import { sleep, timedRun, secondsElapsed } from "./util/timing.js"; @@ -342,6 +342,7 @@ export class Crawler { async initCrawlState() { const redisUrl = this.params.redisStoreUrl || "redis://localhost:6379/0"; + const dedupRedisUrl = this.params.dedupStoreUrl || redisUrl; if (!redisUrl.startsWith("redis://")) { logger.fatal( @@ -349,18 +350,7 @@ export class Crawler { ); } - let redis; - - while (true) { - try { - redis = await initRedis(redisUrl); - break; - } catch (e) { - //logger.fatal("Unable to connect to state store Redis: " + redisUrl); - logger.warn(`Waiting for redis at ${redisUrl}`, {}, "state"); - await sleep(1); - } - } + const redis = await initRedisWaitForSuccess(redisUrl); logger.debug( `Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`, @@ -368,6 +358,12 @@ export class Crawler { "state", ); + let dedupRedis = redis; + + if (redisUrl !== dedupRedisUrl) { + dedupRedis = await initRedisWaitForSuccess(dedupRedisUrl); + } + logger.debug(`Max Page Time: ${this.maxPageTime} seconds`, {}, "state"); this.crawlState = new RedisCrawlState( @@ -376,6 +372,7 @@ export class Crawler { this.maxPageTime, os.hostname(), this.params.maxPageRetries, + 
dedupRedis, ); if (this.params.logErrorsToRedis) { diff --git a/src/replaycrawler.ts b/src/replaycrawler.ts index 819bcf39..13160c60 100644 --- a/src/replaycrawler.ts +++ b/src/replaycrawler.ts @@ -10,9 +10,6 @@ import { PageInfoRecord, PageInfoValue, Recorder } from "./util/recorder.js"; import fsp from "fs/promises"; import path from "path"; -import { ZipRangeReader, createLoader } from "@webrecorder/wabac"; - -import { AsyncIterReader } from "warcio"; import { parseArgs } from "./util/argParser.js"; import { PNG } from "pngjs"; @@ -23,6 +20,7 @@ import { MAX_URL_LENGTH } from "./util/reqresp.js"; import { openAsBlob } from "fs"; import { WARCWriter } from "./util/warcwriter.js"; import { parseRx } from "./util/seeds.js"; +import { WACZLoader } from "./util/wacz.js"; // RWP Replay Prefix const REPLAY_PREFIX = "http://localhost:9990/replay/w/replay/"; @@ -784,38 +782,3 @@ export class ReplayCrawler extends Crawler { return null; } } - -class WACZLoader { - url: string; - zipreader: ZipRangeReader | null; - - constructor(url: string) { - this.url = url; - this.zipreader = null; - } - - async init() { - if (!this.url.startsWith("http://") && !this.url.startsWith("https://")) { - const blob = await openAsBlob(this.url); - this.url = URL.createObjectURL(blob); - } - - const loader = await createLoader({ url: this.url }); - - this.zipreader = new ZipRangeReader(loader); - } - - async loadFile(fileInZip: string) { - const { reader } = await this.zipreader!.loadFile(fileInZip); - - if (!reader) { - return null; - } - - if (!reader.iterLines) { - return new AsyncIterReader(reader); - } - - return reader; - } -} diff --git a/src/util/argParser.ts b/src/util/argParser.ts index 78fd35b9..ae0fd2cd 100644 --- a/src/util/argParser.ts +++ b/src/util/argParser.ts @@ -445,6 +445,13 @@ class ArgParser { default: "redis://localhost:6379/0", }, + dedupStoreUrl: { + describe: + "If set, url for remote redis server to store state. Otherwise, using local redis instance", + type: "string", + default: "redis://localhost:6379/0", + }, + saveState: { describe: "If the crawl state should be serialized to the crawls/ directory. 
Defaults to 'partial', only saved when crawl is interrupted", diff --git a/src/util/redis.ts b/src/util/redis.ts index 56b3bb27..325ce9ed 100644 --- a/src/util/redis.ts +++ b/src/util/redis.ts @@ -1,5 +1,6 @@ import { Redis } from "ioredis"; import { logger } from "./logger.js"; +import { sleep } from "./timing.js"; const error = console.error; @@ -34,6 +35,19 @@ export async function initRedis(url: string) { return redis; } +export async function initRedisWaitForSuccess(redisUrl: string, retrySecs = 1) { + while (true) { + try { + return await initRedis(redisUrl); + break; + } catch (e) { + //logger.fatal("Unable to connect to state store Redis: " + redisUrl); + logger.warn(`Waiting for redis at ${redisUrl}`, {}, "state"); + await sleep(retrySecs); + } + } +} + export function setExitOnRedisError() { exitOnError = true; } diff --git a/src/util/state.ts b/src/util/state.ts index 68c46dc2..f1d151d7 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -217,6 +217,7 @@ export type SaveState = { export class RedisCrawlState { redis: Redis; maxRetries: number; + dedupRedis: Redis; uid: string; key: string; @@ -248,8 +249,10 @@ export class RedisCrawlState { maxPageTime: number, uid: string, maxRetries?: number, + dedupRedis?: Redis, ) { this.redis = redis; + this.dedupRedis = dedupRedis || redis; this.uid = uid; this.key = key; @@ -1078,7 +1081,7 @@ return inx; hash: string, url: string, ): Promise<{ dupe?: boolean; origDate?: string; origUrl?: string }> { - const value = await this.redis.hget(key, hash); + const value = await this.dedupRedis.hget(key, hash); if (!value) { return {}; } @@ -1088,7 +1091,7 @@ return inx; return { dupe: true }; } // otherwise, check if a revisit entry - if (await this.redis.sismember(`${key}:${hash}`, url)) { + if (await this.dedupRedis.sismember(`${key}:${hash}`, url)) { return { dupe: true }; } return { origUrl: val[1], origDate: val[0] }; @@ -1096,8 +1099,8 @@ return inx; async addHashDupe(key: string, hash: string, url: string, date: string) { const val = date + "|" + url; - if (!(await this.redis.hsetnx(key, hash, val))) { - await this.redis.sadd(`${key}:${hash}`, url); + if (!(await this.dedupRedis.hsetnx(key, hash, val))) { + await this.dedupRedis.sadd(`${key}:${hash}`, url); } } diff --git a/src/util/wacz.ts b/src/util/wacz.ts index fcf4eabc..3fa28d24 100644 --- a/src/util/wacz.ts +++ b/src/util/wacz.ts @@ -1,5 +1,5 @@ import path, { basename } from "node:path"; -import fs from "node:fs"; +import fs, { openAsBlob } from "node:fs"; import fsp from "node:fs/promises"; import { Writable, Readable } from "node:stream"; import { pipeline } from "node:stream/promises"; @@ -16,6 +16,8 @@ import { makeZip, InputWithoutMeta } from "client-zip"; import { logger, formatErr } from "./logger.js"; import { streamFinish } from "./warcwriter.js"; import { getDirSize } from "./storage.js"; +import { createLoader, ZipRangeReader } from "@webrecorder/wabac"; +import { AsyncIterReader } from "warcio"; const DATAPACKAGE_JSON = "datapackage.json"; const DATAPACKAGE_DIGEST_JSON = "datapackage-digest.json"; @@ -427,3 +429,39 @@ export async function mergeCDXJ( await removeIndexFile(INDEX_CDXJ); } } + +// ============================================================================ +export class WACZLoader { + url: string; + zipreader: ZipRangeReader | null; + + constructor(url: string) { + this.url = url; + this.zipreader = null; + } + + async init() { + if (!this.url.startsWith("http://") && !this.url.startsWith("https://")) { + const blob = await openAsBlob(this.url); + 
this.url = URL.createObjectURL(blob); + } + + const loader = await createLoader({ url: this.url }); + + this.zipreader = new ZipRangeReader(loader); + } + + async loadFile(fileInZip: string) { + const { reader } = await this.zipreader!.loadFile(fileInZip); + + if (!reader) { + return null; + } + + if (!reader.iterLines) { + return new AsyncIterReader(reader); + } + + return reader; + } +} From aa8a189c0f978786173dcc49159d969de863e322 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 17 Sep 2025 19:23:32 -0700 Subject: [PATCH 03/29] add indexer entrypoint: - populate dedup index from remote wacz/multi wacz/multiwacz json refactor: - move WACZLoader to wacz to be shared with indexer - state: move hash-based dedup to RedisDedupIndex cli args: - add --minPageDedupDepth to indicate when pages are skipped for dedup - skip same URLs by same hash within same crawl --- Dockerfile | 5 +- src/indexer.ts | 180 ++++++++++++++++++++++++++++++++++++++++++ src/util/argParser.ts | 8 +- src/util/constants.ts | 2 + src/util/recorder.ts | 73 ++++++++++------- src/util/state.ts | 83 +++++++++---------- src/util/wacz.ts | 13 +++ 7 files changed, 292 insertions(+), 72 deletions(-) create mode 100644 src/indexer.ts diff --git a/Dockerfile b/Dockerfile index ec5de684..9a5cad3b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -44,11 +44,12 @@ ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/ui.js /app/html/rw ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/sw.js /app/html/rwp/ ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/adblock/adblock.gz /app/html/rwp/adblock.gz -RUN chmod a+x /app/dist/main.js /app/dist/create-login-profile.js && chmod a+r /app/html/rwp/* +RUN chmod a+x /app/dist/main.js /app/dist/create-login-profile.js /app/dist/indexer.js && chmod a+r /app/html/rwp/* RUN ln -s /app/dist/main.js /usr/bin/crawl; \ ln -s /app/dist/main.js /usr/bin/qa; \ - ln -s /app/dist/create-login-profile.js /usr/bin/create-login-profile + ln -s /app/dist/create-login-profile.js /usr/bin/create-login-profile; \ + ln -s /app/dist/indexer.js /usr/bin/indexer; RUN mkdir -p /app/behaviors diff --git a/src/indexer.ts b/src/indexer.ts new file mode 100644 index 00000000..d8b64594 --- /dev/null +++ b/src/indexer.ts @@ -0,0 +1,180 @@ +#!/usr/bin/env node + +import yargs from "yargs"; +import { logger } from "./util/logger.js"; +import { getInfoString } from "./util/file_reader.js"; +import { openAsBlob } from "node:fs"; +import { WACZLoader } from "./util/wacz.js"; +import { ExitCodes } from "./util/constants.js"; +import { initRedisWaitForSuccess } from "./util/redis.js"; +import { AsyncIterReader } from "warcio"; +import { RedisDedupIndex } from "./util/state.js"; + +export class CrawlIndexer { + constructor() {} + + initArgs() { + return yargs(process.argv) + .usage("indexer [options]") + .options({ + dedupStoreUrl: { + describe: "URL for remote redis instance to index into", + type: "string", + required: true, + }, + + sourceUrl: { + describe: "Source WACZ or Multi WACZ or Multi WACZ JSON to index", + type: "string", + required: true, + }, + }) + .parseSync(); + } + + async run() { + logger.setDebugLogging(true); + + process.on("SIGINT", () => this.handleTerminate("SIGINT")); + + process.on("SIGTERM", () => this.handleTerminate("SIGTERM")); + + logger.info(await getInfoString()); + + const params = this.initArgs(); + + const redis = await initRedisWaitForSuccess(params.dedupStoreUrl); + const dedupIndex = new RedisDedupIndex(redis); + + const allFiles = []; + + for await (const waczfile 
of this.iterWACZ(params.sourceUrl)) { + allFiles.push(waczfile); + } + + let count = 0; + const total = allFiles.length; + + for (const waczfile of allFiles) { + count += 1; + const loader = new WACZLoader(waczfile); + logger.debug(`Processing WACZ ${count} of ${total}`, { waczfile }); + for await (const file of loader.iterFiles("indexes/")) { + const filename = file.filename; + if (filename.endsWith(".cdx.gz")) { + logger.debug("Processing CDX GZ Index", { filename }); + await this.ingestCDXJ(dedupIndex, loader, filename, "gzip"); + } else if (filename.endsWith(".cdx") || filename.endsWith(".cdxj")) { + logger.debug("Processing CDX Index", { filename }); + await this.ingestCDXJ(dedupIndex, loader, filename); + } + } + } + + logger.info("Done!"); + process.exit(ExitCodes.Success); + } + + async ingestCDXJ( + dedupIndex: RedisDedupIndex, + loader: WACZLoader, + filename: string, + compression?: string, + ) { + let reader = await loader.loadFile(filename); + + if (!reader) { + logger.error("File not found, skipping!"); + return; + } + + if (compression === "gzip") { + reader = new AsyncIterReader(reader, "gzip", false); + } + + let count = 0; + + for await (const line of reader.iterLines()) { + const inx = line.indexOf(" {"); + if (inx < 0) { + logger.error("Skipping invalid CDXJ, no JSON", { line }); + continue; + } + + // eslint-disable-next-line @typescript-eslint/no-explicit-any + let cdx: Record; + + try { + cdx = JSON.parse(line.slice(inx)); + } catch (e) { + logger.error("Skipping invalid CDXJ, JSON invalid", { line }); + continue; + } + + const date = line.split(" ", 2)[1]; + const url = cdx.url; + const hash = cdx.digest; + + if (url.startsWith("urn:")) { + continue; + } + + // only adding originals to dedup against, don't want to dedup against existing revisits + if (cdx.mime === "warc/revisit") { + continue; + } + + if (url && date && hash) { + await dedupIndex.addHashDupe(hash, url, date); + } else { + logger.warn("Skipping invalid CDXJ, data missing", { + url, + date, + digest: hash, + }); + continue; + } + + count += 1; + } + + logger.debug("Processed", { count }); + } + + async *iterWACZ(url: string): AsyncIterable { + let path: string = url; + + try { + path = new URL(url).pathname; + } catch (e) { + // ignore + } + + if (path.endsWith(".wacz")) { + yield url; + } else if (path.endsWith(".json")) { + if (!url.startsWith("http://") && !url.startsWith("https://")) { + const blob = await openAsBlob(url); + url = URL.createObjectURL(blob); + } + + const resp = await fetch(url); + const json = await resp.json(); + + for (const entry of json.resources) { + if (entry.path) { + yield* this.iterWACZ(entry.path); + } + } + } else { + logger.warn("Unknown source", { url }, "replay"); + } + } + + handleTerminate(signame: string) { + logger.info(`Got signal ${signame}, exiting`); + process.exit(ExitCodes.SignalInterrupted); + } +} + +await new CrawlIndexer().run(); diff --git a/src/util/argParser.ts b/src/util/argParser.ts index ae0fd2cd..197548d3 100644 --- a/src/util/argParser.ts +++ b/src/util/argParser.ts @@ -449,7 +449,13 @@ class ArgParser { describe: "If set, url for remote redis server to store state. Otherwise, using local redis instance", type: "string", - default: "redis://localhost:6379/0", + }, + + minPageDedupDepth: { + describe: + "If set >= 0, minimum depth at which duplicate pages can be skipped. 
-1 means never skip duplicate pages", + type: "number", + default: -1, }, saveState: { diff --git a/src/util/constants.ts b/src/util/constants.ts index fa8232ce..b8565508 100644 --- a/src/util/constants.ts +++ b/src/util/constants.ts @@ -22,6 +22,8 @@ export const DETECT_SITEMAP = ""; export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"]; +export const HASH_DUPE_KEY = "dupe"; + export enum BxFunctionBindings { BehaviorLogFunc = "__bx_log", AddLinkFunc = "__bx_addLink", diff --git a/src/util/recorder.ts b/src/util/recorder.ts index b7f6d1af..404853cb 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -15,12 +15,19 @@ import { removeRangeAsQuery, rewriteDASH, rewriteHLS, + tsToDate, } from "@webrecorder/wabac"; import { WARCRecord, multiValueHeader } from "warcio"; import { TempFileBuffer, WARCSerializer } from "warcio/node"; import { WARCWriter } from "./warcwriter.js"; -import { LoadState, PageState, RedisCrawlState, WorkerId } from "./state.js"; +import { + LoadState, + normalizeDedupStatus, + PageState, + RedisCrawlState, + WorkerId, +} from "./state.js"; import { CDPSession, Protocol } from "puppeteer-core"; import { Crawler } from "../crawler.js"; import { getProxyDispatcher } from "./proxy.js"; @@ -38,7 +45,7 @@ const TAKE_STREAM_BUFF_SIZE = 1024 * 64; const ASYNC_FETCH_DUPE_KEY = "s:fetchdupe"; -const WRITE_DUPE_KEY = "dupe"; +const WRITE_DUPE_KEY = "s:writedupe"; const MIME_EVENT_STREAM = "text/event-stream"; @@ -143,6 +150,7 @@ export class Recorder extends EventEmitter { pageSeed?: ScopedSeed; pageSeedDepth = 0; + minPageDedupDepth = -1; frameIdToExecId: Map | null; @@ -166,6 +174,8 @@ export class Recorder extends EventEmitter { this.shouldSaveStorage = !!crawler.params.saveStorage; + this.minPageDedupDepth = crawler.params.minPageDedupDepth; + this.writer = writer; this.fetcherQ = new PQueue({ concurrency: 1 }); @@ -833,11 +843,16 @@ export class Recorder extends EventEmitter { const rewritten = await this.rewriteResponse(reqresp, mimeType); - if (url === this.pageUrl && reqresp.payload && this.pageSeedDepth >= 1) { + if ( + url === this.pageUrl && + reqresp.payload && + this.minPageDedupDepth >= 0 && + this.pageSeedDepth >= this.minPageDedupDepth + ) { const hash = "sha256:" + createHash("sha256").update(reqresp.payload).digest("hex"); - const res = await this.crawlState.getHashDupe(WRITE_DUPE_KEY, hash, url); - if (res && res.dupe) { + const { origUrl } = await this.crawlState.getHashDupe(hash); + if (origUrl) { const errorReason = "BlockedByResponse"; await cdp.send("Fetch.failRequest", { requestId, @@ -1518,7 +1533,11 @@ export class Recorder extends EventEmitter { if ( method === "GET" && url && - !(await this.crawlState.addIfNoDupe(ASYNC_FETCH_DUPE_KEY, url, status)) + !(await this.crawlState.addIfNoDupe( + ASYNC_FETCH_DUPE_KEY, + url, + normalizeDedupStatus(status), + )) ) { reqresp.asyncLoading = false; return true; @@ -1629,7 +1648,7 @@ export class Recorder extends EventEmitter { // !isRedirectStatus(status) && // !(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url, status)) // ) { - // logNetwork("Skipping dupe", { url, status, ...this.logDetails }); + // logNetwork("Skipping exact URL dupe in this crawl", { url, status, ...this.logDetails }); // return false; // } @@ -1646,7 +1665,11 @@ export class Recorder extends EventEmitter { !(await this.checkStreamingRecordPayload(reqresp, serializer, false)) ) { serializer.externalBuffer?.purge(); - await this.crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url, status); + await 
this.crawlState.removeDupe( + ASYNC_FETCH_DUPE_KEY, + url, + normalizeDedupStatus(status), + ); //await this.crawlState.removeDupe(WRITE_DUPE_KEY, url, status); return false; } @@ -1680,29 +1703,29 @@ export class Recorder extends EventEmitter { } const hash = responseRecord.warcPayloadDigest || ""; + + if (!(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url, hash))) { + serializer.externalBuffer?.purge(); + return false; + } + const date = responseRecord.warcDate || ""; const isEmpty = reqresp.readSize === 0; if (!isEmpty && url && method === "GET" && !isRedirectStatus(status)) { - const { dupe, origUrl, origDate } = await this.crawlState.getHashDupe( - WRITE_DUPE_KEY, - hash, - url, - ); + const { origUrl, origDate } = await this.crawlState.getHashDupe(hash); - if (dupe) { - // duplicate url at origTs - // skip, no need for revisit - logNetwork("Skipping dupe", { url, status, ...this.logDetails }); - return false; - } else if (origUrl && origDate) { + if (hash && origUrl && origDate) { + const date = tsToDate(origDate).toISOString(); + // always write revisit here + // duplicate URLs in same crawl filtered out separately serializer.externalBuffer?.purge(); ({ responseRecord, serializer } = await createRevisitForResponse( responseRecord, serializer, origUrl, - origDate, + date, )); } else { // no dupe, continue @@ -1738,7 +1761,7 @@ export class Recorder extends EventEmitter { this.addPageRecord(reqresp); if (!isEmpty) { - await this.crawlState.addHashDupe(WRITE_DUPE_KEY, hash, url, date); + await this.crawlState.addHashDupe(hash, url, date); } return true; @@ -2116,6 +2139,7 @@ async function createRevisitForResponse( const warcHeaders: Record = { "WARC-Page-ID": responseRecord.warcHeaders.headers.get("WARC-Page-ID")!, + "WARC-Payload-Digest": origPayloadDigest!, }; const revisitRecord = WARCRecord.create({ @@ -2136,13 +2160,6 @@ async function createRevisitForResponse( await serializer.digestRecord(); - if (origPayloadDigest) { - revisitRecord.warcHeaders.headers.set( - "WARC-Payload-Digest", - origPayloadDigest, - ); - } - return { serializer, responseRecord: revisitRecord }; } diff --git a/src/util/state.ts b/src/util/state.ts index f1d151d7..90f7a931 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -7,6 +7,7 @@ import { MAX_DEPTH, DEFAULT_MAX_RETRIES, ROBOTS_CACHE_LIMIT, + HASH_DUPE_KEY } from "./constants.js"; import { ScopedSeed } from "./seeds.js"; import { Frame } from "puppeteer-core"; @@ -45,11 +46,11 @@ const normalizeUrlOpts: NormamlizeUrlOptions = { // ============================================================================ // treat 0 or 206 as 200 for purposes of dedup -function normalizeDedupStatus(status: number): number { +export function normalizeDedupStatus(status: number): string { if (status === 0 || status === 206) { - return 200; + return "200"; } - return status; + return status + ""; } // ============================================================================ @@ -214,10 +215,41 @@ export type SaveState = { }; // ============================================================================ -export class RedisCrawlState { +export class RedisDedupIndex { + dedupRedis: Redis; + + constructor(dedupRedis: Redis) { + this.dedupRedis = dedupRedis; + } + + async getHashDupe( + hash: string, + key = HASH_DUPE_KEY, + //url: string, + ): Promise<{ origDate?: string; origUrl?: string }> { + const value = await this.dedupRedis.hget(key, hash); + if (!value) { + return {}; + } + const val = value.split("|"); + return { origUrl: val[1], origDate: val[0] }; + 
} + + async addHashDupe( + hash: string, + url: string, + date: string, + key = HASH_DUPE_KEY, + ) { + const val = date.replace(/[^\d]/g, "") + "|" + url; + await this.dedupRedis.hsetnx(key, hash, val); + } +} + +// ============================================================================ +export class RedisCrawlState extends RedisDedupIndex { redis: Redis; maxRetries: number; - dedupRedis: Redis; uid: string; key: string; @@ -251,8 +283,8 @@ export class RedisCrawlState { maxRetries?: number, dedupRedis?: Redis, ) { + super(dedupRedis || redis); this.redis = redis; - this.dedupRedis = dedupRedis || redis; this.uid = uid; this.key = key; @@ -1064,44 +1096,13 @@ return inx; return await this.redis.zcard(this.qkey); } - async addIfNoDupe(key: string, url: string, status: number) { + async addIfNoDupe(key: string, url: string, other_id: string) { url = normalizeUrl(url, normalizeUrlOpts); - return ( - (await this.redis.sadd(key, normalizeDedupStatus(status) + "|" + url)) === - 1 - ); - } - - async removeDupe(key: string, url: string, status: number) { - return await this.redis.srem(key, normalizeDedupStatus(status) + "|" + url); + return (await this.redis.sadd(key, other_id + "|" + url)) === 1; } - async getHashDupe( - key: string, - hash: string, - url: string, - ): Promise<{ dupe?: boolean; origDate?: string; origUrl?: string }> { - const value = await this.dedupRedis.hget(key, hash); - if (!value) { - return {}; - } - const val = value.split("|"); - // if matches the first entry, return - if (val[1] === url) { - return { dupe: true }; - } - // otherwise, check if a revisit entry - if (await this.dedupRedis.sismember(`${key}:${hash}`, url)) { - return { dupe: true }; - } - return { origUrl: val[1], origDate: val[0] }; - } - - async addHashDupe(key: string, hash: string, url: string, date: string) { - const val = date + "|" + url; - if (!(await this.dedupRedis.hsetnx(key, hash, val))) { - await this.dedupRedis.sadd(`${key}:${hash}`, url); - } + async removeDupe(key: string, url: string, other_id: string) { + return await this.redis.srem(key, other_id + "|" + url); } async isInUserSet(value: string) { diff --git a/src/util/wacz.ts b/src/util/wacz.ts index 3fa28d24..acbbfe94 100644 --- a/src/util/wacz.ts +++ b/src/util/wacz.ts @@ -464,4 +464,17 @@ export class WACZLoader { return reader; } + + async *iterFiles(prefix: string) { + if (!this.zipreader) { + await this.init(); + } + const entries = await this.zipreader!.load(); + + for (const [key, value] of Object.entries(entries)) { + if (key.startsWith(prefix)) { + yield value; + } + } + } } From af0c0701b10c2675afd7d2f9ee81229e1b781ffb Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 17 Sep 2025 20:02:01 -0700 Subject: [PATCH 04/29] keep skipping dupe URLs as before --- src/util/recorder.ts | 52 ++++++++++++++++++-------------------------- src/util/state.ts | 11 ++++++---- 2 files changed, 28 insertions(+), 35 deletions(-) diff --git a/src/util/recorder.ts b/src/util/recorder.ts index 404853cb..2ccc94af 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -21,13 +21,7 @@ import { import { WARCRecord, multiValueHeader } from "warcio"; import { TempFileBuffer, WARCSerializer } from "warcio/node"; import { WARCWriter } from "./warcwriter.js"; -import { - LoadState, - normalizeDedupStatus, - PageState, - RedisCrawlState, - WorkerId, -} from "./state.js"; +import { LoadState, PageState, RedisCrawlState, WorkerId } from "./state.js"; import { CDPSession, Protocol } from "puppeteer-core"; import { Crawler } from "../crawler.js"; 
import { getProxyDispatcher } from "./proxy.js"; @@ -1533,11 +1527,7 @@ export class Recorder extends EventEmitter { if ( method === "GET" && url && - !(await this.crawlState.addIfNoDupe( - ASYNC_FETCH_DUPE_KEY, - url, - normalizeDedupStatus(status), - )) + !(await this.crawlState.addIfNoDupe(ASYNC_FETCH_DUPE_KEY, url, status)) ) { reqresp.asyncLoading = false; return true; @@ -1642,15 +1632,19 @@ export class Recorder extends EventEmitter { return false; } - // if ( - // url && - // method === "GET" && - // !isRedirectStatus(status) && - // !(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url, status)) - // ) { - // logNetwork("Skipping exact URL dupe in this crawl", { url, status, ...this.logDetails }); - // return false; - // } + if ( + url && + method === "GET" && + !isRedirectStatus(status) && + !(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url, status)) + ) { + logNetwork("Skipping exact URL dupe in this crawl", { + url, + status, + ...this.logDetails, + }); + return false; + } let responseRecord = createResponse(reqresp, pageid, iter); const requestRecord = createRequest(reqresp, responseRecord, pageid); @@ -1665,12 +1659,8 @@ export class Recorder extends EventEmitter { !(await this.checkStreamingRecordPayload(reqresp, serializer, false)) ) { serializer.externalBuffer?.purge(); - await this.crawlState.removeDupe( - ASYNC_FETCH_DUPE_KEY, - url, - normalizeDedupStatus(status), - ); - //await this.crawlState.removeDupe(WRITE_DUPE_KEY, url, status); + await this.crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url, status); + await this.crawlState.removeDupe(WRITE_DUPE_KEY, url, status); return false; } @@ -1704,10 +1694,10 @@ export class Recorder extends EventEmitter { const hash = responseRecord.warcPayloadDigest || ""; - if (!(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url, hash))) { - serializer.externalBuffer?.purge(); - return false; - } + // if (!(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url, hash))) { + // serializer.externalBuffer?.purge(); + // return false; + // } const date = responseRecord.warcDate || ""; diff --git a/src/util/state.ts b/src/util/state.ts index 90f7a931..71c091b0 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -1096,13 +1096,16 @@ return inx; return await this.redis.zcard(this.qkey); } - async addIfNoDupe(key: string, url: string, other_id: string) { + async addIfNoDupe(key: string, url: string, status: number) { url = normalizeUrl(url, normalizeUrlOpts); - return (await this.redis.sadd(key, other_id + "|" + url)) === 1; + return ( + (await this.redis.sadd(key, normalizeDedupStatus(status) + "|" + url)) === + 1 + ); } - async removeDupe(key: string, url: string, other_id: string) { - return await this.redis.srem(key, other_id + "|" + url); + async removeDupe(key: string, url: string, status: number) { + return await this.redis.srem(key, normalizeDedupStatus(status) + "|" + url); } async isInUserSet(value: string) { From 60ff421782ce4cd518dfd402ef1a753e28601a8d Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 17 Sep 2025 20:48:32 -0700 Subject: [PATCH 05/29] warc writing: - update to warcio 2.4.6, write WARC-Payload-Digest along with WARC-Block-Digest for revisists - copy additional custom WARC headers to revisit from response --- src/util/recorder.ts | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/src/util/recorder.ts b/src/util/recorder.ts index 2ccc94af..5a6d65f0 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -2095,14 +2095,14 @@ function 
createResponse( warcHeaders["WARC-Resource-Type"] = reqresp.resourceType; } - if (!contentIter) { - contentIter = [reqresp.payload] as Iterable; - } - if (Object.keys(reqresp.extraOpts).length) { warcHeaders["WARC-JSON-Metadata"] = JSON.stringify(reqresp.extraOpts); } + if (!contentIter) { + contentIter = [reqresp.payload] as Iterable; + } + return WARCRecord.create( { url, @@ -2117,6 +2117,14 @@ function createResponse( ); } +// ================================================================= +const REVISIT_COPY_HEADERS = [ + "WARC-Page-ID", + "WARC-Protocol", + "WARC-Resource-Type", + "WARC-JSON-Metadata", +]; + // ================================================================= // revisit async function createRevisitForResponse( @@ -2125,12 +2133,17 @@ async function createRevisitForResponse( refersToUrl: string, refersToDate: string, ) { - const origPayloadDigest = responseRecord.warcPayloadDigest; + const payloadDigestForRevisit = responseRecord.warcPayloadDigest || ""; - const warcHeaders: Record = { - "WARC-Page-ID": responseRecord.warcHeaders.headers.get("WARC-Page-ID")!, - "WARC-Payload-Digest": origPayloadDigest!, - }; + const warcHeaders: Record = {}; + + const origWarcHeaders = responseRecord.warcHeaders.headers; + + for (const header in REVISIT_COPY_HEADERS) { + if (origWarcHeaders.has(header)) { + warcHeaders[header] = origWarcHeaders.get(header)!; + } + } const revisitRecord = WARCRecord.create({ url: responseRecord.warcTargetURI!, @@ -2148,7 +2161,7 @@ async function createRevisitForResponse( maxMemSize: MAX_BROWSER_DEFAULT_FETCH_SIZE, }); - await serializer.digestRecord(); + await serializer.digestRecord({ payloadDigestForRevisit }); return { serializer, responseRecord: revisitRecord }; } From 3995629e0d096581dddb4ad81e5212699fc3e678 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 17 Sep 2025 23:36:25 -0700 Subject: [PATCH 06/29] rename --dedupStoreUrl -> redisDedupUrl bump version to 1.9.0 fix typo --- src/crawler.ts | 2 +- src/indexer.ts | 4 ++-- src/util/argParser.ts | 2 +- src/util/recorder.ts | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/crawler.ts b/src/crawler.ts index 4cba5a38..380f6fd4 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -342,7 +342,7 @@ export class Crawler { async initCrawlState() { const redisUrl = this.params.redisStoreUrl || "redis://localhost:6379/0"; - const dedupRedisUrl = this.params.dedupStoreUrl || redisUrl; + const dedupRedisUrl = this.params.redisDedupUrl || redisUrl; if (!redisUrl.startsWith("redis://")) { logger.fatal( diff --git a/src/indexer.ts b/src/indexer.ts index d8b64594..a04cd2e2 100644 --- a/src/indexer.ts +++ b/src/indexer.ts @@ -17,7 +17,7 @@ export class CrawlIndexer { return yargs(process.argv) .usage("indexer [options]") .options({ - dedupStoreUrl: { + redisDedupUrl: { describe: "URL for remote redis instance to index into", type: "string", required: true, @@ -43,7 +43,7 @@ export class CrawlIndexer { const params = this.initArgs(); - const redis = await initRedisWaitForSuccess(params.dedupStoreUrl); + const redis = await initRedisWaitForSuccess(params.redisDedupUrl); const dedupIndex = new RedisDedupIndex(redis); const allFiles = []; diff --git a/src/util/argParser.ts b/src/util/argParser.ts index 197548d3..22c15243 100644 --- a/src/util/argParser.ts +++ b/src/util/argParser.ts @@ -445,7 +445,7 @@ class ArgParser { default: "redis://localhost:6379/0", }, - dedupStoreUrl: { + redisDedupUrl: { describe: "If set, url for remote redis server to store state. 
Otherwise, using local redis instance", type: "string", diff --git a/src/util/recorder.ts b/src/util/recorder.ts index 5a6d65f0..42f14bbb 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -2139,7 +2139,7 @@ async function createRevisitForResponse( const origWarcHeaders = responseRecord.warcHeaders.headers; - for (const header in REVISIT_COPY_HEADERS) { + for (const header of REVISIT_COPY_HEADERS) { if (origWarcHeaders.has(header)) { warcHeaders[header] = origWarcHeaders.get(header)!; } From 77dff861b73afd8d4e2c906a23be153fb4e4701b Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 18 Sep 2025 02:04:28 -0700 Subject: [PATCH 07/29] update to latest warcio (2.4.7) to fix issus when returning payload only size --- Dockerfile | 2 +- src/util/recorder.ts | 4 ++-- yarn.lock | 13 +++++++++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 9a5cad3b..6111211a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -28,7 +28,7 @@ RUN mkdir -p /tmp/ads && cd /tmp/ads && \ cat ad-hosts.txt | grep '^0.0.0.0 '| awk '{ print $2; }' | grep -v '0.0.0.0' | jq --raw-input --slurp 'split("\n")' > /app/ad-hosts.json && \ rm /tmp/ads/ad-hosts.txt -RUN yarn install --network-timeout 1000000 +RUN yarn install --network-timeout 1000000 --network-concurrency 1 ADD tsconfig.json /app/ ADD src /app/src diff --git a/src/util/recorder.ts b/src/util/recorder.ts index 42f14bbb..9d580bb5 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -1545,7 +1545,7 @@ export class Recorder extends EventEmitter { const { logDetails } = this; try { reqresp.readSize = await serializer.digestRecord({ - includeHeadersSize: false, + returnPayloadOnlySize: true, }); // set truncated field and recompute header buff if (reqresp.truncated) { @@ -1688,7 +1688,7 @@ export class Recorder extends EventEmitter { } } else { reqresp.readSize = await serializer.digestRecord({ - includeHeadersSize: false, + returnPayloadOnlySize: true, }); } diff --git a/yarn.lock b/yarn.lock index e09d37b5..322a2e1d 100644 --- a/yarn.lock +++ b/yarn.lock @@ -5581,6 +5581,19 @@ warcio@^2.4.7: uuid-random "^1.3.2" yargs "^17.7.2" +"warcio@git+https://github.com/webrecorder/warcio.js#fix-digestrecord-no-headers": + version "2.4.7" + resolved "git+https://github.com/webrecorder/warcio.js#0e95227bb6af3b089dc397827ac2db175a5b3b9e" + dependencies: + "@types/pako" "^1.0.7" + "@types/stream-buffers" "^3.0.7" + base32-encode "^2.0.0" + hash-wasm "^4.9.0" + pako "^1.0.11" + tempy "^3.1.0" + uuid-random "^1.3.2" + yargs "^17.7.2" + web-encoding@^1.1.5: version "1.1.5" resolved "https://registry.yarnpkg.com/web-encoding/-/web-encoding-1.1.5.tgz#fc810cf7667364a6335c939913f5051d3e0c4864" From 5c02c0a18cd7c15335bf12ce8704fc44e60b0406 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 18 Sep 2025 12:17:33 -0700 Subject: [PATCH 08/29] bump to 2.4.7 --- yarn.lock | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/yarn.lock b/yarn.lock index 322a2e1d..e09d37b5 100644 --- a/yarn.lock +++ b/yarn.lock @@ -5581,19 +5581,6 @@ warcio@^2.4.7: uuid-random "^1.3.2" yargs "^17.7.2" -"warcio@git+https://github.com/webrecorder/warcio.js#fix-digestrecord-no-headers": - version "2.4.7" - resolved "git+https://github.com/webrecorder/warcio.js#0e95227bb6af3b089dc397827ac2db175a5b3b9e" - dependencies: - "@types/pako" "^1.0.7" - "@types/stream-buffers" "^3.0.7" - base32-encode "^2.0.0" - hash-wasm "^4.9.0" - pako "^1.0.11" - tempy "^3.1.0" - uuid-random "^1.3.2" - yargs "^17.7.2" - web-encoding@^1.1.5: version "1.1.5" 
resolved "https://registry.yarnpkg.com/web-encoding/-/web-encoding-1.1.5.tgz#fc810cf7667364a6335c939913f5051d3e0c4864" From 94ac0584883509a8c9cd7a4850adbbc466fd8b37 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 18 Sep 2025 13:10:53 -0700 Subject: [PATCH 09/29] tests: add dedup-basic.test for simple dedup, ensure number of revisit records === number of response records --- tests/dedup-basic.test.js | 97 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 tests/dedup-basic.test.js diff --git a/tests/dedup-basic.test.js b/tests/dedup-basic.test.js new file mode 100644 index 00000000..9c06cb1c --- /dev/null +++ b/tests/dedup-basic.test.js @@ -0,0 +1,97 @@ +import {exec, execSync} from "child_process"; +import fs from "fs"; +import path from "path"; +import { Redis } from "ioredis"; +import { WARCParser } from "warcio"; + +function sleep(ms) { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + + +let redisId; +//let crawler1, crawler2; + +beforeAll(() => { + execSync("docker network create dedup"); + + redisId = execSync("docker run --rm --network=dedup -p 37379:6379 --name dedup-redis -d redis"); +}); + +afterAll(async () => { + execSync(`docker kill ${redisId}`); + + await sleep(3000); + + //await Promise.allSettled([crawler1, crawler2]); + + execSync("docker network rm dedup"); +}); + +function runCrawl(name) { + fs.rmSync(`./test-crawls/collections/${name}`, { recursive: true, force: true }); + + const crawler = exec(`docker run --rm -v $PWD/test-crawls:/crawls --network=dedup webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --limit 4 --exclude community --collection ${name} --redisDedupUrl redis://dedup-redis:6379`); + + return new Promise((resolve) => { + crawler.on("exit", (code) => { + resolve(code); + }); + }); +} + +function loadFirstWARC(name) { + const archiveWarcLists = fs.readdirSync( + `test-crawls/collections/${name}/archive`, + ); + + const warcName = path.join(`test-crawls/collections/${name}/archive`, archiveWarcLists[0]); + + const nodeStream = fs.createReadStream(warcName); + + const parser = new WARCParser(nodeStream); + + return parser; +} + +test("check revisit records written on duplicate crawl", async () => { + + expect(await runCrawl("dedup-test-orig")).toBe(0); + expect(await runCrawl("dedup-test-dupe")).toBe(0); + + let statusCode = -1; + + let response = 0; + let revisit = 0; + + const parserOrig = loadFirstWARC("dedup-test-orig"); + + for await (const record of parserOrig) { + if (record.warcTargetURI && record.warcTargetURI.startsWith("urn:")) { + continue; + } + + if (record.warcType === "response") { + response++; + } + } + + const dupeOrig = loadFirstWARC("dedup-test-dupe"); + + for await (const record of dupeOrig) { + if (record.warcTargetURI && record.warcTargetURI.startsWith("urn:")) { + continue; + } + + if (record.warcType === "revisit") { + revisit++; + } + } + + expect(response).toBeGreaterThan(0); + + // revisits should match number of responses for non urn: + expect(response).toBe(revisit); +}); + + From 7e553b6a8725c4aab5205184237ff6071cbf3096 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 19 Sep 2025 20:54:52 -0700 Subject: [PATCH 10/29] deps update --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 6111211a..9a5cad3b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -28,7 +28,7 @@ RUN mkdir -p /tmp/ads && cd /tmp/ads && \ cat ad-hosts.txt | grep '^0.0.0.0 '| awk '{ print $2; }' | grep -v '0.0.0.0' | jq 
--raw-input --slurp 'split("\n")' > /app/ad-hosts.json && \ rm /tmp/ads/ad-hosts.txt -RUN yarn install --network-timeout 1000000 --network-concurrency 1 +RUN yarn install --network-timeout 1000000 ADD tsconfig.json /app/ ADD src /app/src From 81d7848a79518534225ffc5e714d3fea7e2a6b38 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 22 Sep 2025 17:46:19 -0700 Subject: [PATCH 11/29] dedup indexing: strip hash prefix from digest, as cdx does not have it tests: add index import + dedup crawl to ensure digests match fully --- src/util/state.ts | 2 ++ tests/dedup-basic.test.js | 44 +++++++++++++++++++++++++++++++++++---- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/src/util/state.ts b/src/util/state.ts index 71c091b0..eb15b10c 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -227,6 +227,7 @@ export class RedisDedupIndex { key = HASH_DUPE_KEY, //url: string, ): Promise<{ origDate?: string; origUrl?: string }> { + hash = hash.split(":").at(-1)!; const value = await this.dedupRedis.hget(key, hash); if (!value) { return {}; @@ -242,6 +243,7 @@ export class RedisDedupIndex { key = HASH_DUPE_KEY, ) { const val = date.replace(/[^\d]/g, "") + "|" + url; + hash = hash.split(":").at(-1)!; await this.dedupRedis.hsetnx(key, hash, val); } } diff --git a/tests/dedup-basic.test.js b/tests/dedup-basic.test.js index 9c06cb1c..85a03f9d 100644 --- a/tests/dedup-basic.test.js +++ b/tests/dedup-basic.test.js @@ -1,7 +1,7 @@ import {exec, execSync} from "child_process"; import fs from "fs"; import path from "path"; -import { Redis } from "ioredis"; +import Redis from "ioredis"; import { WARCParser } from "warcio"; function sleep(ms) { @@ -10,7 +10,7 @@ function sleep(ms) { let redisId; -//let crawler1, crawler2; +let numResponses = 0; beforeAll(() => { execSync("docker network create dedup"); @@ -28,10 +28,10 @@ afterAll(async () => { execSync("docker network rm dedup"); }); -function runCrawl(name) { +function runCrawl(name, db="0") { fs.rmSync(`./test-crawls/collections/${name}`, { recursive: true, force: true }); - const crawler = exec(`docker run --rm -v $PWD/test-crawls:/crawls --network=dedup webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --limit 4 --exclude community --collection ${name} --redisDedupUrl redis://dedup-redis:6379`); + const crawler = exec(`docker run --rm -v $PWD/test-crawls:/crawls --network=dedup webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --limit 4 --exclude community --collection ${name} --redisDedupUrl redis://dedup-redis:6379/${db} --generateWACZ`); return new Promise((resolve) => { crawler.on("exit", (code) => { @@ -92,6 +92,42 @@ test("check revisit records written on duplicate crawl", async () => { // revisits should match number of responses for non urn: expect(response).toBe(revisit); + + numResponses = response; +}); + + +test("import index and crawl dupe", async () => { + + execSync(`docker run --rm -v $PWD/test-crawls:/crawls --network=dedup webrecorder/browsertrix-crawler indexer --sourceUrl /crawls/collections/dedup-test-orig/dedup-test-orig.wacz --redisDedupUrl redis://dedup-redis:6379/1`); + + const redis = new Redis("redis://127.0.0.1:37379/1", { lazyConnect: true, retryStrategy: () => null }); + + await redis.connect({maxRetriesPerRequest: 50}); + + expect(await redis.hlen("dupe")).toBe(numResponses); +}); + + +test("imported crawl dupe matches previous dupe count", async () => { + expect(await runCrawl("dedup-test-dupe-2", 1)).toBe(0); + + const dupeOrig = 
loadFirstWARC("dedup-test-dupe-2"); + + let revisit = 0; + + for await (const record of dupeOrig) { + if (record.warcTargetURI && record.warcTargetURI.startsWith("urn:")) { + continue; + } + + if (record.warcType === "revisit") { + revisit++; + } + } + + // matches same number of revisits as original + expect(revisit).toBe(numResponses); }); From 76737b72fda6225013b065463153abf1bf6c674c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 22 Sep 2025 22:30:08 -0700 Subject: [PATCH 12/29] use dedup redis for queue up wacz files that need to be updated use pending queue to support retries in case of failure store both id and actual URL in case URL changes in subsequent retries --- src/indexer.ts | 22 +++++++++--------- src/util/state.ts | 58 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 11 deletions(-) diff --git a/src/indexer.ts b/src/indexer.ts index a04cd2e2..2390becc 100644 --- a/src/indexer.ts +++ b/src/indexer.ts @@ -46,19 +46,18 @@ export class CrawlIndexer { const redis = await initRedisWaitForSuccess(params.redisDedupUrl); const dedupIndex = new RedisDedupIndex(redis); - const allFiles = []; - - for await (const waczfile of this.iterWACZ(params.sourceUrl)) { - allFiles.push(waczfile); + for await (const [name, waczfile] of this.iterWACZ(params.sourceUrl)) { + await dedupIndex.addHashSource(name, waczfile); } let count = 0; - const total = allFiles.length; + let res; - for (const waczfile of allFiles) { + while ((res = await dedupIndex.nextQueuedHashSource())) { + const { id, url, total } = res; count += 1; - const loader = new WACZLoader(waczfile); - logger.debug(`Processing WACZ ${count} of ${total}`, { waczfile }); + const loader = new WACZLoader(url); + logger.debug(`Processing WACZ ${count} of ${total}`, { waczfile: url }); for await (const file of loader.iterFiles("indexes/")) { const filename = file.filename; if (filename.endsWith(".cdx.gz")) { @@ -69,6 +68,7 @@ export class CrawlIndexer { await this.ingestCDXJ(dedupIndex, loader, filename); } } + await dedupIndex.addDoneSource(id); } logger.info("Done!"); @@ -141,7 +141,7 @@ export class CrawlIndexer { logger.debug("Processed", { count }); } - async *iterWACZ(url: string): AsyncIterable { + async *iterWACZ(url: string, name?: string): AsyncIterable<[string, string]> { let path: string = url; try { @@ -151,7 +151,7 @@ export class CrawlIndexer { } if (path.endsWith(".wacz")) { - yield url; + yield [name || url, url]; } else if (path.endsWith(".json")) { if (!url.startsWith("http://") && !url.startsWith("https://")) { const blob = await openAsBlob(url); @@ -163,7 +163,7 @@ export class CrawlIndexer { for (const entry of json.resources) { if (entry.path) { - yield* this.iterWACZ(entry.path); + yield* this.iterWACZ(entry.path, entry.name); } } } else { diff --git a/src/util/state.ts b/src/util/state.ts index eb15b10c..3b331ef5 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -218,6 +218,12 @@ export type SaveState = { export class RedisDedupIndex { dedupRedis: Redis; + sourceDone = "src:d"; + sourceQ = "src:q"; + pendingQ = "pending:q"; + sourceP = "src:p"; + pendingPrefix = "pending:q:"; + constructor(dedupRedis: Redis) { this.dedupRedis = dedupRedis; } @@ -246,6 +252,58 @@ export class RedisDedupIndex { hash = hash.split(":").at(-1)!; await this.dedupRedis.hsetnx(key, hash, val); } + + async addHashSource(id: string, url: string) { + // already handled this source + if (await this.dedupRedis.sismember(this.sourceDone, id)) { + return; + } + await 
this.dedupRedis.lpush(this.sourceQ, JSON.stringify({ id, url })); + } + + async addDoneSource(id: string) { + await this.dedupRedis.sadd(this.sourceDone, id); + } + + async nextQueuedHashSource() { + let res: string | null = await this.dedupRedis.lmove( + this.sourceQ, + this.pendingQ, + "RIGHT", + "LEFT", + ); + // use circular pending Q to support retries + if (!res) { + const len = await this.dedupRedis.llen(this.pendingQ); + for (let i = 0; i < len; i++) { + res = await this.dedupRedis.lmove( + this.pendingQ, + this.pendingQ, + "RIGHT", + "LEFT", + ); + if (res) { + const { id } = JSON.parse(res); + if (await this.dedupRedis.get(this.pendingPrefix + id)) { + res = null; + continue; + } else { + break; + } + } + } + } + + if (!res) { + return null; + } + + await this.dedupRedis.lrem(this.pendingQ, 1, res); + const { id, url } = JSON.parse(res); + const total = await this.dedupRedis.llen(this.sourceQ); + await this.dedupRedis.setex(this.pendingPrefix + id, "1", 300); + return { id, url, total }; + } } // ============================================================================ From dc04923c49da09e7921b09b301461fc0042bb5ac Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 25 Sep 2025 10:40:57 -0700 Subject: [PATCH 13/29] dedup post requests and non-404s as well! update timestamp after import --- src/indexer.ts | 1 + src/util/recorder.ts | 2 +- src/util/state.ts | 4 ++++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/indexer.ts b/src/indexer.ts index 2390becc..07974bd7 100644 --- a/src/indexer.ts +++ b/src/indexer.ts @@ -72,6 +72,7 @@ export class CrawlIndexer { } logger.info("Done!"); + await dedupIndex.markDoneImport(); process.exit(ExitCodes.Success); } diff --git a/src/util/recorder.ts b/src/util/recorder.ts index 9d580bb5..6b6428ab 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -1703,7 +1703,7 @@ export class Recorder extends EventEmitter { const isEmpty = reqresp.readSize === 0; - if (!isEmpty && url && method === "GET" && !isRedirectStatus(status)) { + if (!isEmpty && url) { const { origUrl, origDate } = await this.crawlState.getHashDupe(hash); if (hash && origUrl && origDate) { diff --git a/src/util/state.ts b/src/util/state.ts index 3b331ef5..0f3e38e4 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -304,6 +304,10 @@ export class RedisDedupIndex { await this.dedupRedis.setex(this.pendingPrefix + id, "1", 300); return { id, url, total }; } + + async markDoneImport() { + await this.dedupRedis.set("last_update_ts", new Date().toISOString()); + } } // ============================================================================ From d620e2199154af4847454e3a15276ef556c091bf Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 17 Oct 2025 18:08:38 -0700 Subject: [PATCH 14/29] - track source index for each hash, so entry becomes ' ' - entry for source index can contain the crawl id (or possibly wacz and crawl id) - also store dependent sources in relation.requires in datapackage.json - tests: update tests to check for relation.requires --- src/crawler.ts | 3 ++ src/indexer.ts | 13 +++++++- src/util/constants.ts | 1 + src/util/recorder.ts | 11 +++++-- src/util/state.ts | 64 +++++++++++++++++++++++++++++++++------ src/util/wacz.ts | 7 +++++ tests/dedup-basic.test.js | 30 ++++++++++++++++-- 7 files changed, 113 insertions(+), 16 deletions(-) diff --git a/src/crawler.ts b/src/crawler.ts index 380f6fd4..ea27f83f 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -2000,6 +2000,8 @@ self.__bx_behaviors.selectMainBehavior(); await 
this.closeLog(); + const requires = await this.crawlState.getDupeDependentSources(); + const waczOpts: WACZInitOpts = { input: warcFileList.map((x) => path.join(this.archivesDir, x)), output: waczPath, @@ -2008,6 +2010,7 @@ self.__bx_behaviors.selectMainBehavior(); warcCdxDir: this.warcCdxDir, indexesDir: this.indexesDir, softwareString: this.infoString, + requires, }; if (process.env.WACZ_SIGN_URL) { diff --git a/src/indexer.ts b/src/indexer.ts index 07974bd7..31b8225d 100644 --- a/src/indexer.ts +++ b/src/indexer.ts @@ -28,6 +28,12 @@ export class CrawlIndexer { type: "string", required: true, }, + + sourceId: { + describe: "If single WACZ, use this id as source id", + type: "string", + required: false, + }, }) .parseSync(); } @@ -44,7 +50,7 @@ export class CrawlIndexer { const params = this.initArgs(); const redis = await initRedisWaitForSuccess(params.redisDedupUrl); - const dedupIndex = new RedisDedupIndex(redis); + const dedupIndex = new RedisDedupIndex(redis, ""); for await (const [name, waczfile] of this.iterWACZ(params.sourceUrl)) { await dedupIndex.addHashSource(name, waczfile); @@ -58,6 +64,11 @@ export class CrawlIndexer { count += 1; const loader = new WACZLoader(url); logger.debug(`Processing WACZ ${count} of ${total}`, { waczfile: url }); + + const sourceId = params.sourceId && total === 1 ? params.sourceId : url; + + dedupIndex.dedupKeyIndex = await dedupIndex.addToSourcesList(sourceId); + for await (const file of loader.iterFiles("indexes/")) { const filename = file.filename; if (filename.endsWith(".cdx.gz")) { diff --git a/src/util/constants.ts b/src/util/constants.ts index b8565508..05369705 100644 --- a/src/util/constants.ts +++ b/src/util/constants.ts @@ -23,6 +23,7 @@ export const DETECT_SITEMAP = ""; export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"]; export const HASH_DUPE_KEY = "dupe"; +export const HASH_DUPE_SOURCE_LIST_KEY = "sources"; export enum BxFunctionBindings { BehaviorLogFunc = "__bx_log", diff --git a/src/util/recorder.ts b/src/util/recorder.ts index 6b6428ab..58ae4ce0 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -845,13 +845,16 @@ export class Recorder extends EventEmitter { ) { const hash = "sha256:" + createHash("sha256").update(reqresp.payload).digest("hex"); - const { origUrl } = await this.crawlState.getHashDupe(hash); + const { origUrl, origId } = await this.crawlState.getHashDupe(hash); if (origUrl) { const errorReason = "BlockedByResponse"; await cdp.send("Fetch.failRequest", { requestId, errorReason, }); + if (origId) { + await this.crawlState.addDupeCrawlRef(origId); + } return true; } } @@ -1704,9 +1707,10 @@ export class Recorder extends EventEmitter { const isEmpty = reqresp.readSize === 0; if (!isEmpty && url) { - const { origUrl, origDate } = await this.crawlState.getHashDupe(hash); + const { origUrl, origDate, origId } = + await this.crawlState.getHashDupe(hash); - if (hash && origUrl && origDate) { + if (hash && origUrl && origDate && origId) { const date = tsToDate(origDate).toISOString(); // always write revisit here // duplicate URLs in same crawl filtered out separately @@ -1717,6 +1721,7 @@ export class Recorder extends EventEmitter { origUrl, date, )); + await this.crawlState.addDupeCrawlRef(origId); } else { // no dupe, continue } diff --git a/src/util/state.ts b/src/util/state.ts index 0f3e38e4..41583bde 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -7,7 +7,8 @@ import { MAX_DEPTH, DEFAULT_MAX_RETRIES, ROBOTS_CACHE_LIMIT, - HASH_DUPE_KEY + HASH_DUPE_KEY, + 
HASH_DUPE_SOURCE_LIST_KEY, } from "./constants.js"; import { ScopedSeed } from "./seeds.js"; import { Frame } from "puppeteer-core"; @@ -217,6 +218,8 @@ export type SaveState = { // ============================================================================ export class RedisDedupIndex { dedupRedis: Redis; + key: string; + dedupKeyIndex = -1; sourceDone = "src:d"; sourceQ = "src:q"; @@ -224,22 +227,42 @@ export class RedisDedupIndex { sourceP = "src:p"; pendingPrefix = "pending:q:"; - constructor(dedupRedis: Redis) { + constructor(dedupRedis: Redis, key: string) { this.dedupRedis = dedupRedis; + this.key = key; + } + + private async getKeyIndex() { + if (!this.key) { + return; + } + const res = await this.dedupRedis.lpos(HASH_DUPE_SOURCE_LIST_KEY, this.key); + if (res) { + this.dedupKeyIndex = res; + } else { + this.dedupKeyIndex = await this.addToSourcesList(this.key); + } + return this.dedupKeyIndex; + } + + async addToSourcesList(crawlId: string) { + return ( + (await this.dedupRedis.rpush(HASH_DUPE_SOURCE_LIST_KEY, crawlId)) - 1 + ); } async getHashDupe( hash: string, key = HASH_DUPE_KEY, //url: string, - ): Promise<{ origDate?: string; origUrl?: string }> { + ): Promise<{ origDate?: string; origUrl?: string; origId?: string }> { hash = hash.split(":").at(-1)!; const value = await this.dedupRedis.hget(key, hash); if (!value) { return {}; } - const val = value.split("|"); - return { origUrl: val[1], origDate: val[0] }; + const val = value.split(" "); + return { origUrl: val[2], origDate: val[1], origId: val[0] }; } async addHashDupe( @@ -248,8 +271,12 @@ export class RedisDedupIndex { date: string, key = HASH_DUPE_KEY, ) { - const val = date.replace(/[^\d]/g, "") + "|" + url; + date = date.replace(/[^\d]/g, ""); hash = hash.split(":").at(-1)!; + if (this.dedupKeyIndex < 0) { + await this.getKeyIndex(); + } + const val = `${this.dedupKeyIndex} ${date} ${url}`; await this.dedupRedis.hsetnx(key, hash, val); } @@ -300,7 +327,7 @@ export class RedisDedupIndex { await this.dedupRedis.lrem(this.pendingQ, 1, res); const { id, url } = JSON.parse(res); - const total = await this.dedupRedis.llen(this.sourceQ); + const total = (await this.dedupRedis.llen(this.sourceQ)) + 1; await this.dedupRedis.setex(this.pendingPrefix + id, "1", 300); return { id, url, total }; } @@ -316,7 +343,6 @@ export class RedisCrawlState extends RedisDedupIndex { maxRetries: number; uid: string; - key: string; maxPageTime: number; qkey: string; @@ -347,11 +373,10 @@ export class RedisCrawlState extends RedisDedupIndex { maxRetries?: number, dedupRedis?: Redis, ) { - super(dedupRedis || redis); + super(dedupRedis || redis, key); this.redis = redis; this.uid = uid; - this.key = key; this.maxPageTime = maxPageTime; this.maxRetries = maxRetries ?? 
DEFAULT_MAX_RETRIES; @@ -1301,4 +1326,23 @@ return inx; result.modified = this._timestamp(); await this.redis.set(`${this.key}:profileUploaded`, JSON.stringify(result)); } + + async addDupeCrawlRef(id: string) { + await this.redis.sadd(`${this.key}:dindex`, id); + } + + async getDupeDependentSources(): Promise { + const dependIndexes = await this.redis.smembers(`${this.key}:dindex`); + const crawlIds = []; + for (const inx of dependIndexes) { + const crawlId = await this.dedupRedis.lindex( + HASH_DUPE_SOURCE_LIST_KEY, + Number(inx), + ); + if (crawlId && crawlId !== this.key) { + crawlIds.push(crawlId); + } + } + return crawlIds; + } } diff --git a/src/util/wacz.ts b/src/util/wacz.ts index acbbfe94..5e85e06a 100644 --- a/src/util/wacz.ts +++ b/src/util/wacz.ts @@ -45,6 +45,7 @@ export type WACZInitOpts = { signingToken?: string; title?: string; description?: string; + requires?: string[]; }; export type WACZResourceEntry = { @@ -61,6 +62,7 @@ export type WACZDataPackage = { software: string; title?: string; description?: string; + relation?: { requires: string[] }; }; type WACZDigest = { @@ -131,6 +133,11 @@ export class WACZ { this.datapackage.description = config.description; } + if (config.requires && config.requires.length) { + this.datapackage.relation = { requires: config.requires }; + } + console.log("REQUIRES", config.requires); + this.signingUrl = config.signingUrl || null; this.signingToken = config.signingToken || null; } diff --git a/tests/dedup-basic.test.js b/tests/dedup-basic.test.js index 85a03f9d..9f0d068a 100644 --- a/tests/dedup-basic.test.js +++ b/tests/dedup-basic.test.js @@ -31,7 +31,7 @@ afterAll(async () => { function runCrawl(name, db="0") { fs.rmSync(`./test-crawls/collections/${name}`, { recursive: true, force: true }); - const crawler = exec(`docker run --rm -v $PWD/test-crawls:/crawls --network=dedup webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --limit 4 --exclude community --collection ${name} --redisDedupUrl redis://dedup-redis:6379/${db} --generateWACZ`); + const crawler = exec(`docker run --rm -v $PWD/test-crawls:/crawls --network=dedup -e CRAWL_ID=${name} webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --limit 4 --exclude community --collection ${name} --redisDedupUrl redis://dedup-redis:6379/${db} --generateWACZ`); return new Promise((resolve) => { crawler.on("exit", (code) => { @@ -54,6 +54,20 @@ function loadFirstWARC(name) { return parser; } +function loadDataPackageRelated(name) { + execSync( + `unzip test-crawls/collections/${name}/${name}.wacz -d test-crawls/collections/${name}/wacz`, + ); + + const data = fs.readFileSync( + `test-crawls/collections/${name}/wacz/datapackage.json`, + "utf8", + ); + const dataPackageJSON = JSON.parse(data); + return dataPackageJSON.relation; +} + + test("check revisit records written on duplicate crawl", async () => { expect(await runCrawl("dedup-test-orig")).toBe(0); @@ -99,7 +113,7 @@ test("check revisit records written on duplicate crawl", async () => { test("import index and crawl dupe", async () => { - execSync(`docker run --rm -v $PWD/test-crawls:/crawls --network=dedup webrecorder/browsertrix-crawler indexer --sourceUrl /crawls/collections/dedup-test-orig/dedup-test-orig.wacz --redisDedupUrl redis://dedup-redis:6379/1`); + execSync(`docker run --rm -v $PWD/test-crawls:/crawls --network=dedup webrecorder/browsertrix-crawler indexer --sourceUrl /crawls/collections/dedup-test-orig/dedup-test-orig.wacz --sourceId dedup-test-orig --redisDedupUrl 
redis://dedup-redis:6379/1`); const redis = new Redis("redis://127.0.0.1:37379/1", { lazyConnect: true, retryStrategy: () => null }); @@ -130,4 +144,16 @@ test("imported crawl dupe matches previous dupe count", async () => { expect(revisit).toBe(numResponses); }); +test("test requires in wacz 1", () => { + + const expected = {"requires": ["dedup-test-orig"]}; + + const res1 = loadDataPackageRelated("dedup-test-dupe"); + const res2 = loadDataPackageRelated("dedup-test-dupe-2"); + + expect(res1).toEqual(expected); + expect(res1).toEqual(res2); + +}); + From e0244391f12234f569f76a159b4d668fc3d4edbd Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 24 Oct 2025 10:38:36 -0700 Subject: [PATCH 15/29] update to new data model: - hashes stored in separate crawl specific entries, h: - wacz files stored in crawl specific list, c::wacz - hashes committed to 'alldupes' hashset when crawl is complete, crawls added to 'allcrawls' set - store filename, crawlId in related.requires list entries for each wacz --- package.json | 2 +- src/crawler.ts | 4 + src/indexer.ts | 58 +++++++--- src/util/constants.ts | 4 +- src/util/recorder.ts | 22 ++-- src/util/state.ts | 231 ++++++++++++++++++++++++-------------- src/util/wacz.ts | 7 +- tests/dedup-basic.test.js | 28 +++-- yarn.lock | 18 +-- 9 files changed, 244 insertions(+), 130 deletions(-) diff --git a/package.json b/package.json index 2908fa49..f2ec9cb4 100644 --- a/package.json +++ b/package.json @@ -25,7 +25,7 @@ "fetch-socks": "^1.3.0", "get-folder-size": "^4.0.0", "husky": "^8.0.3", - "ioredis": "^5.3.2", + "ioredis": "^5.8.2", "iso-639-1": "^3.1.5", "js-levenshtein": "^1.1.6", "js-yaml": "^4.1.0", diff --git a/src/crawler.ts b/src/crawler.ts index ea27f83f..10124395 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -2041,9 +2041,13 @@ self.__bx_behaviors.selectMainBehavior(); await this.storage.uploadCollWACZ(wacz, targetFilename, isFinished); + await this.crawlState.updateDedupSource(wacz); + await this.crawlState.clearWACZFilename(); return true; + } else { + await this.crawlState.updateDedupSource(wacz); } return false; diff --git a/src/indexer.ts b/src/indexer.ts index 31b8225d..353d5385 100644 --- a/src/indexer.ts +++ b/src/indexer.ts @@ -1,5 +1,4 @@ #!/usr/bin/env node - import yargs from "yargs"; import { logger } from "./util/logger.js"; import { getInfoString } from "./util/file_reader.js"; @@ -9,6 +8,15 @@ import { ExitCodes } from "./util/constants.js"; import { initRedisWaitForSuccess } from "./util/redis.js"; import { AsyncIterReader } from "warcio"; import { RedisDedupIndex } from "./util/state.js"; +import { basename } from "node:path"; + +export type DedupIndexEntry = { + name: string; + url: string; + crawlId?: string; + size?: number; + hash?: string; +}; export class CrawlIndexer { constructor() {} @@ -29,7 +37,7 @@ export class CrawlIndexer { required: true, }, - sourceId: { + sourceCrawlId: { describe: "If single WACZ, use this id as source id", type: "string", required: false, @@ -52,38 +60,50 @@ export class CrawlIndexer { const redis = await initRedisWaitForSuccess(params.redisDedupUrl); const dedupIndex = new RedisDedupIndex(redis, ""); - for await (const [name, waczfile] of this.iterWACZ(params.sourceUrl)) { - await dedupIndex.addHashSource(name, waczfile); + for await (const entry of this.iterWACZ(params.sourceUrl)) { + await dedupIndex.queueImportSource(entry.name, JSON.stringify(entry)); } let count = 0; let res; - while ((res = await dedupIndex.nextQueuedHashSource())) { - const { id, url, total } = res; + while ((res = 
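// Import loop (this patch): each queued entry is the JSON produced by
// iterWACZ(), i.e. { name, url, crawlId?, size?, hash? }. For every WACZ the
// indexer registers a source entry under c:<crawlId>:wacz, streams the CDX/CDXJ
// files found under indexes/ inside the archive, records the hash of every
// original (non-revisit) capture, and then marks the source done, so an
// interrupted import can resume from the pending queue without re-reading
// finished WACZ files. markImportFinishedTS() finally stamps last_update_ts
// when the whole import run completes.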
await dedupIndex.nextQueuedImportSource())) { + const { name, entry, total } = res; + const { url, crawlId, size, hash } = JSON.parse(entry) as DedupIndexEntry; count += 1; const loader = new WACZLoader(url); logger.debug(`Processing WACZ ${count} of ${total}`, { waczfile: url }); - const sourceId = params.sourceId && total === 1 ? params.sourceId : url; + const crawlIdReal = crawlId || params.sourceCrawlId || url; - dedupIndex.dedupKeyIndex = await dedupIndex.addToSourcesList(sourceId); + await dedupIndex.addImportedSourceForDedup(crawlIdReal, { + filename: name, + size, + hash, + }); for await (const file of loader.iterFiles("indexes/")) { const filename = file.filename; if (filename.endsWith(".cdx.gz")) { logger.debug("Processing CDX GZ Index", { filename }); - await this.ingestCDXJ(dedupIndex, loader, filename, "gzip"); + await this.ingestCDXJ( + dedupIndex, + loader, + filename, + crawlIdReal, + "gzip", + ); } else if (filename.endsWith(".cdx") || filename.endsWith(".cdxj")) { logger.debug("Processing CDX Index", { filename }); - await this.ingestCDXJ(dedupIndex, loader, filename); + await this.ingestCDXJ(dedupIndex, loader, filename, crawlIdReal); } } - await dedupIndex.addDoneSource(id); + + await dedupIndex.markImportSourceDone(name, crawlIdReal); } logger.info("Done!"); - await dedupIndex.markDoneImport(); + await dedupIndex.markImportFinishedTS(); process.exit(ExitCodes.Success); } @@ -91,6 +111,7 @@ export class CrawlIndexer { dedupIndex: RedisDedupIndex, loader: WACZLoader, filename: string, + crawlId: string, compression?: string, ) { let reader = await loader.loadFile(filename); @@ -137,7 +158,8 @@ export class CrawlIndexer { } if (url && date && hash) { - await dedupIndex.addHashDupe(hash, url, date); + await dedupIndex.addHashDupe(hash, url, date, crawlId); + await dedupIndex.addImportedForCrawl(hash, crawlId); } else { logger.warn("Skipping invalid CDXJ, data missing", { url, @@ -153,7 +175,7 @@ export class CrawlIndexer { logger.debug("Processed", { count }); } - async *iterWACZ(url: string, name?: string): AsyncIterable<[string, string]> { + async *iterWACZ(url: string, name?: string): AsyncIterable { let path: string = url; try { @@ -163,7 +185,7 @@ export class CrawlIndexer { } if (path.endsWith(".wacz")) { - yield [name || url, url]; + yield { name: basename(name || url), url }; } else if (path.endsWith(".json")) { if (!url.startsWith("http://") && !url.startsWith("https://")) { const blob = await openAsBlob(url); @@ -174,7 +196,11 @@ export class CrawlIndexer { const json = await resp.json(); for (const entry of json.resources) { - if (entry.path) { + const url = entry.path; + if (url && url.endsWith(".wacz")) { + const { size, hash, crawlId, name } = entry; + yield { crawlId, name, url, size, hash }; + } else { yield* this.iterWACZ(entry.path, entry.name); } } diff --git a/src/util/constants.ts b/src/util/constants.ts index 05369705..4784d9b6 100644 --- a/src/util/constants.ts +++ b/src/util/constants.ts @@ -22,8 +22,8 @@ export const DETECT_SITEMAP = ""; export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"]; -export const HASH_DUPE_KEY = "dupe"; -export const HASH_DUPE_SOURCE_LIST_KEY = "sources"; +export const DUPE_ALL_HASH_KEY = "alldupes"; +export const DUPE_ALL_CRAWLS = "allcrawls"; export enum BxFunctionBindings { BehaviorLogFunc = "__bx_log", diff --git a/src/util/recorder.ts b/src/util/recorder.ts index 58ae4ce0..6e53e0a7 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -845,16 +845,15 @@ export class Recorder extends 
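// Recorder-side changes in this patch: getHashDupe() now returns the owning
// crawlId together with the index of the source WACZ inside that crawl. A hash
// hit on the page response itself is blocked with Fetch.failRequest
// ("BlockedByResponse") so the duplicate page is skipped, while a hit on any
// other response is written as a WARC revisit record pointing at the original
// URL and date. In both cases addDupeCrawlRef(crawlId, index) records the
// dependency, and new hashes are only added when the payload is non-empty and
// not itself a duplicate.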
EventEmitter { ) { const hash = "sha256:" + createHash("sha256").update(reqresp.payload).digest("hex"); - const { origUrl, origId } = await this.crawlState.getHashDupe(hash); - if (origUrl) { + const res = await this.crawlState.getHashDupe(hash); + if (res) { + const { index, crawlId } = res; const errorReason = "BlockedByResponse"; await cdp.send("Fetch.failRequest", { requestId, errorReason, }); - if (origId) { - await this.crawlState.addDupeCrawlRef(origId); - } + await this.crawlState.addDupeCrawlRef(crawlId, index); return true; } } @@ -1706,11 +1705,13 @@ export class Recorder extends EventEmitter { const isEmpty = reqresp.readSize === 0; + let isDupe = false; + if (!isEmpty && url) { - const { origUrl, origDate, origId } = - await this.crawlState.getHashDupe(hash); + const res = await this.crawlState.getHashDupe(hash); - if (hash && origUrl && origDate && origId) { + if (res) { + const { origUrl, origDate, crawlId, index } = res; const date = tsToDate(origDate).toISOString(); // always write revisit here // duplicate URLs in same crawl filtered out separately @@ -1721,7 +1722,8 @@ export class Recorder extends EventEmitter { origUrl, date, )); - await this.crawlState.addDupeCrawlRef(origId); + await this.crawlState.addDupeCrawlRef(crawlId, index); + isDupe = true; } else { // no dupe, continue } @@ -1755,7 +1757,7 @@ export class Recorder extends EventEmitter { this.addPageRecord(reqresp); - if (!isEmpty) { + if (!isEmpty && !isDupe) { await this.crawlState.addHashDupe(hash, url, date); } diff --git a/src/util/state.ts b/src/util/state.ts index 41583bde..7eda7497 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -7,13 +7,14 @@ import { MAX_DEPTH, DEFAULT_MAX_RETRIES, ROBOTS_CACHE_LIMIT, - HASH_DUPE_KEY, - HASH_DUPE_SOURCE_LIST_KEY, + DUPE_ALL_HASH_KEY, + DUPE_ALL_CRAWLS, } from "./constants.js"; import { ScopedSeed } from "./seeds.js"; import { Frame } from "puppeteer-core"; import { interpolateFilename, UploadResult } from "./storage.js"; import normalizeUrl, { Options as NormamlizeUrlOptions } from "normalize-url"; +import { WACZ } from "./wacz.js"; // ============================================================================ export enum LoadState { @@ -215,11 +216,27 @@ export type SaveState = { excluded?: string[]; }; +// ============================================================================ +export type DedupeEntry = { + origDate: string; + origUrl: string; + index: string; + crawlId: string; +}; + +// ============================================================================ +export type DedupSourceEntry = { + filename: string; + size?: number; + hash?: string; +}; + // ============================================================================ export class RedisDedupIndex { dedupRedis: Redis; - key: string; + crawlId: string; dedupKeyIndex = -1; + dedupCurrFilename = ""; sourceDone = "src:d"; sourceQ = "src:q"; @@ -227,72 +244,118 @@ export class RedisDedupIndex { sourceP = "src:p"; pendingPrefix = "pending:q:"; - constructor(dedupRedis: Redis, key: string) { + constructor(dedupRedis: Redis, crawlId: string) { this.dedupRedis = dedupRedis; - this.key = key; + this.crawlId = crawlId; } - private async getKeyIndex() { - if (!this.key) { + // DEDUP SOURCE + + async addSourceForDedup(filename: string) { + //const count = await this.dedupRedis.incr(`c:${key}:count`) - 1; + const count = + (await this.dedupRedis.rpush( + `c:${this.crawlId}:wacz`, + JSON.stringify({ filename }), + )) - 1; + this.dedupCurrFilename = filename; + this.dedupKeyIndex = count; + } + + 
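// Per-crawl data model introduced in this patch:
//   c:<crawlId>:wacz - list of WACZ entries for a crawl ({ filename, size, hash });
//                      addSourceForDedup() above pushes a placeholder holding
//                      only the filename and remembers its index, and
//                      updateDedupSource() below overwrites that slot once the
//                      WACZ has actually been written.
//   h:<crawlId>      - per-crawl hash map; each value is "<index> <date> <url>",
//                      e.g. "0 20251024183822 https://old.webrecorder.net/"
//                      (illustrative values only).
//   alldupes         - shared hash -> crawlId map, filled in when a crawl commits
//                      its hashes (commitDedupDone()) or when an index import
//                      adds them directly.
//   allcrawls        - set of crawl ids known to the dedupe index.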
async updateDedupSource(wacz: WACZ) { + if (this.dedupKeyIndex < 0) { return; } - const res = await this.dedupRedis.lpos(HASH_DUPE_SOURCE_LIST_KEY, this.key); - if (res) { - this.dedupKeyIndex = res; - } else { - this.dedupKeyIndex = await this.addToSourcesList(this.key); - } - return this.dedupKeyIndex; - } - async addToSourcesList(crawlId: string) { - return ( - (await this.dedupRedis.rpush(HASH_DUPE_SOURCE_LIST_KEY, crawlId)) - 1 + const value: DedupSourceEntry = { + filename: wacz.getLocalFilename() || this.dedupCurrFilename, + hash: wacz.getHash(), + size: wacz.getSize(), + }; + + await this.dedupRedis.lset( + `c:${this.crawlId}:wacz`, + this.dedupKeyIndex, + JSON.stringify(value), ); + + await this.commitDedupDone(); + } + + // COMMIT DEDUP TO SHARED INDEX + + async commitDedupDone() { + for await (const hashes of this.dedupRedis.hscanStream( + `h:${this.crawlId}`, + { + noValues: true, + }, + )) { + for (const hash of hashes) { + await this.dedupRedis.hset(DUPE_ALL_HASH_KEY, hash, this.crawlId); + } + } + + // add to crawls list + await this.dedupRedis.sadd(DUPE_ALL_CRAWLS, this.crawlId); } + // GET OR ADD INDIVIDUAL HASHES + async getHashDupe( hash: string, - key = HASH_DUPE_KEY, + key = DUPE_ALL_HASH_KEY, //url: string, - ): Promise<{ origDate?: string; origUrl?: string; origId?: string }> { + ): Promise { hash = hash.split(":").at(-1)!; - const value = await this.dedupRedis.hget(key, hash); + + // first, check the shared key + let crawlId = await this.dedupRedis.hget(key, hash); + if (!crawlId) { + // otherwise, try current crawl + crawlId = this.crawlId; + } + const value = await this.dedupRedis.hget(`h:${crawlId}`, hash); if (!value) { - return {}; + return null; } const val = value.split(" "); - return { origUrl: val[2], origDate: val[1], origId: val[0] }; + return { origUrl: val[2], origDate: val[1], index: val[0], crawlId }; } - async addHashDupe( - hash: string, - url: string, - date: string, - key = HASH_DUPE_KEY, - ) { + async addHashDupe(hash: string, url: string, date: string, crawlId?: string) { date = date.replace(/[^\d]/g, ""); hash = hash.split(":").at(-1)!; - if (this.dedupKeyIndex < 0) { - await this.getKeyIndex(); - } const val = `${this.dedupKeyIndex} ${date} ${url}`; - await this.dedupRedis.hsetnx(key, hash, val); + await this.dedupRedis.hsetnx(`h:${crawlId || this.crawlId}`, hash, val); } - async addHashSource(id: string, url: string) { + // IMPORT + + async queueImportSource(id: string, data: string) { // already handled this source if (await this.dedupRedis.sismember(this.sourceDone, id)) { return; } - await this.dedupRedis.lpush(this.sourceQ, JSON.stringify({ id, url })); + await this.dedupRedis.lpush(this.sourceQ, data); + } + + async addImportedForCrawl(hash: string, crawlId: string) { + await this.dedupRedis.hset(DUPE_ALL_HASH_KEY, hash, crawlId); + } + + async addImportedSourceForDedup(key: string, entry: DedupSourceEntry) { + return ( + (await this.dedupRedis.rpush(`c:${key}:wacz`, JSON.stringify(entry))) - 1 + ); } - async addDoneSource(id: string) { + async markImportSourceDone(id: string, crawlId: string) { await this.dedupRedis.sadd(this.sourceDone, id); + await this.dedupRedis.sadd(DUPE_ALL_CRAWLS, crawlId); } - async nextQueuedHashSource() { + async nextQueuedImportSource() { let res: string | null = await this.dedupRedis.lmove( this.sourceQ, this.pendingQ, @@ -326,13 +389,13 @@ export class RedisDedupIndex { } await this.dedupRedis.lrem(this.pendingQ, 1, res); - const { id, url } = JSON.parse(res); + const { name } = JSON.parse(res); const total 
= (await this.dedupRedis.llen(this.sourceQ)) + 1; - await this.dedupRedis.setex(this.pendingPrefix + id, "1", 300); - return { id, url, total }; + await this.dedupRedis.setex(this.pendingPrefix + name, "1", 300); + return { name, entry: res, total }; } - async markDoneImport() { + async markImportFinishedTS() { await this.dedupRedis.set("last_update_ts", new Date().toISOString()); } } @@ -380,32 +443,32 @@ export class RedisCrawlState extends RedisDedupIndex { this.maxPageTime = maxPageTime; this.maxRetries = maxRetries ?? DEFAULT_MAX_RETRIES; - this.qkey = this.key + ":q"; - this.pkey = this.key + ":p"; - this.skey = this.key + ":s"; + this.qkey = this.crawlId + ":q"; + this.pkey = this.crawlId + ":p"; + this.skey = this.crawlId + ":s"; // done (integer) - this.dkey = this.key + ":d"; + this.dkey = this.crawlId + ":d"; // failed final, no more retry - this.fkey = this.key + ":f"; + this.fkey = this.crawlId + ":f"; // crawler errors - this.ekey = this.key + ":e"; + this.ekey = this.crawlId + ":e"; // crawler behavior script messages - this.bkey = this.key + ":b"; + this.bkey = this.crawlId + ":b"; // cached robots.txt bodies (per-origin) - this.rkey = this.key + ":r"; + this.rkey = this.crawlId + ":r"; // LRU cache of robots.txt keys - this.lkey = this.key + ":l"; + this.lkey = this.crawlId + ":l"; // pages - this.pageskey = this.key + ":pages"; + this.pageskey = this.crawlId + ":pages"; - this.esKey = this.key + ":extraSeeds"; - this.esMap = this.key + ":esMap"; + this.esKey = this.crawlId + ":extraSeeds"; + this.esMap = this.crawlId + ":esMap"; // stores URLs that have been seen but excluded // (eg. redirect-to-excluded or trimmed) - this.exKey = this.key + ":excluded"; + this.exKey = this.crawlId + ":excluded"; - this.sitemapDoneKey = this.key + ":sitemapDone"; + this.sitemapDoneKey = this.crawlId + ":sitemapDone"; this._initLuaCommands(this.redis); } @@ -658,29 +721,29 @@ return inx; } async setFailReason(reason: string) { - await this.redis.set(`${this.key}:failReason`, reason); + await this.redis.set(`${this.crawlId}:failReason`, reason); } async setStatus(status_: string) { - await this.redis.hset(`${this.key}:status`, this.uid, status_); + await this.redis.hset(`${this.crawlId}:status`, this.uid, status_); } async getStatus(): Promise { - return (await this.redis.hget(`${this.key}:status`, this.uid)) || ""; + return (await this.redis.hget(`${this.crawlId}:status`, this.uid)) || ""; } async setWACZFilename(): Promise { const filename = process.env.STORE_FILENAME || "@ts-@id.wacz"; - this.waczFilename = interpolateFilename(filename, this.key); + this.waczFilename = interpolateFilename(filename, this.crawlId); if ( !(await this.redis.hsetnx( - `${this.key}:nextWacz`, + `${this.crawlId}:nextWacz`, this.uid, this.waczFilename, )) ) { this.waczFilename = await this.redis.hget( - `${this.key}:nextWacz`, + `${this.crawlId}:nextWacz`, this.uid, ); logger.debug( @@ -695,6 +758,7 @@ return inx; "state", ); } + await this.addSourceForDedup(this.waczFilename!); return this.waczFilename!; } @@ -706,20 +770,20 @@ return inx; } async clearWACZFilename(): Promise { - await this.redis.hdel(`${this.key}:nextWacz`, this.uid); + await this.redis.hdel(`${this.crawlId}:nextWacz`, this.uid); this.waczFilename = null; } async setArchiveSize(size: number) { - return await this.redis.hset(`${this.key}:size`, this.uid, size); + return await this.redis.hset(`${this.crawlId}:size`, this.uid, size); } async isCrawlStopped() { - if ((await this.redis.get(`${this.key}:stopping`)) === "1") { + if ((await 
this.redis.get(`${this.crawlId}:stopping`)) === "1") { return true; } - if ((await this.redis.hget(`${this.key}:stopone`, this.uid)) === "1") { + if ((await this.redis.hget(`${this.crawlId}:stopone`, this.uid)) === "1") { return true; } @@ -727,7 +791,7 @@ return inx; } async isCrawlPaused() { - if ((await this.redis.get(`${this.key}:paused`)) === "1") { + if ((await this.redis.get(`${this.crawlId}:paused`)) === "1") { return true; } @@ -735,13 +799,13 @@ return inx; } async isCrawlCanceled() { - return (await this.redis.get(`${this.key}:canceled`)) === "1"; + return (await this.redis.get(`${this.crawlId}:canceled`)) === "1"; } // note: not currently called in crawler, but could be // crawl may be stopped by setting this elsewhere in shared redis async stopCrawl() { - await this.redis.set(`${this.key}:stopping`, "1"); + await this.redis.set(`${this.crawlId}:stopping`, "1"); } async processMessage(seeds: ScopedSeed[]) { @@ -831,7 +895,7 @@ return inx; } async incFailCount() { - const key = `${this.key}:status:failcount:${this.uid}`; + const key = `${this.crawlId}:status:failcount:${this.uid}`; const res = await this.redis.incr(key); // consider failed if 3 failed retries in 60 secs @@ -1198,11 +1262,11 @@ return inx; } async isInUserSet(value: string) { - return (await this.redis.sismember(this.key + ":user", value)) === 1; + return (await this.redis.sismember(this.crawlId + ":user", value)) === 1; } async addToUserSet(value: string) { - return (await this.redis.sadd(this.key + ":user", value)) === 1; + return (await this.redis.sadd(this.crawlId + ":user", value)) === 1; } async logError(error: string) { @@ -1324,23 +1388,26 @@ return inx; async markProfileUploaded(result: UploadResult & { modified?: string }) { result.modified = this._timestamp(); - await this.redis.set(`${this.key}:profileUploaded`, JSON.stringify(result)); + await this.redis.set(`${this.crawlId}:profileUploaded`, JSON.stringify(result)); } - async addDupeCrawlRef(id: string) { - await this.redis.sadd(`${this.key}:dindex`, id); + async addDupeCrawlRef(crawlId: string, index: string) { + await this.redis.sadd(`${this.crawlId}:dindex`, crawlId + " " + index); } - async getDupeDependentSources(): Promise { - const dependIndexes = await this.redis.smembers(`${this.key}:dindex`); + async getDupeDependentSources() { + const dependIndexes = await this.redis.smembers(`${this.crawlId}:dindex`); const crawlIds = []; - for (const inx of dependIndexes) { - const crawlId = await this.dedupRedis.lindex( - HASH_DUPE_SOURCE_LIST_KEY, - Number(inx), + for (const value of dependIndexes) { + const [crawlId, index] = value.split(" "); + const source = await this.dedupRedis.lindex( + `c:${crawlId}:wacz`, + Number(index), ); - if (crawlId && crawlId !== this.key) { - crawlIds.push(crawlId); + if (crawlId && crawlId !== this.crawlId && source) { + const entry = JSON.parse(source); + entry.crawlId = crawlId; + crawlIds.push(entry); } } return crawlIds; diff --git a/src/util/wacz.ts b/src/util/wacz.ts index 5e85e06a..d31e2ae2 100644 --- a/src/util/wacz.ts +++ b/src/util/wacz.ts @@ -109,6 +109,7 @@ export class WACZ { private size = 0; private hash: string = ""; + private localFilename = ""; constructor(config: WACZInitOpts, collDir: string) { this.warcs = config.input; @@ -136,7 +137,6 @@ export class WACZ { if (config.requires && config.requires.length) { this.datapackage.relation = { requires: config.requires }; } - console.log("REQUIRES", config.requires); this.signingUrl = config.signingUrl || null; this.signingToken = config.signingToken || 
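// relation.requires (set a few lines above when config.requires is non-empty)
// lands in datapackage.json and lists the WACZ sources this archive was
// deduplicated against. Each entry is the stored source record resolved by
// getDupeDependentSources(), e.g.
//   { "crawlId": "dedup-test-orig", "filename": "dedup-test-orig.wacz", "size": ..., "hash": "..." }
// size and hash are present when the source WACZ was produced by a crawl, and
// stay undefined when the index was imported from a bare WACZ file, which is
// exactly what the updated tests below check.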
null; @@ -201,7 +201,12 @@ export class WACZ { return this.size; } + getLocalFilename() { + return this.localFilename; + } + async generateToFile(filename: string) { + this.localFilename = path.basename(filename); await pipeline(this.generate(), fs.createWriteStream(filename)); } diff --git a/tests/dedup-basic.test.js b/tests/dedup-basic.test.js index 9f0d068a..f0d6b7c6 100644 --- a/tests/dedup-basic.test.js +++ b/tests/dedup-basic.test.js @@ -113,13 +113,13 @@ test("check revisit records written on duplicate crawl", async () => { test("import index and crawl dupe", async () => { - execSync(`docker run --rm -v $PWD/test-crawls:/crawls --network=dedup webrecorder/browsertrix-crawler indexer --sourceUrl /crawls/collections/dedup-test-orig/dedup-test-orig.wacz --sourceId dedup-test-orig --redisDedupUrl redis://dedup-redis:6379/1`); + execSync(`docker run --rm -v $PWD/test-crawls:/crawls --network=dedup webrecorder/browsertrix-crawler indexer --sourceUrl /crawls/collections/dedup-test-orig/dedup-test-orig.wacz --sourceCrawlId dedup-test-orig --redisDedupUrl redis://dedup-redis:6379/1`); const redis = new Redis("redis://127.0.0.1:37379/1", { lazyConnect: true, retryStrategy: () => null }); await redis.connect({maxRetriesPerRequest: 50}); - expect(await redis.hlen("dupe")).toBe(numResponses); + expect(await redis.hlen("alldupes")).toBe(numResponses); }); @@ -144,16 +144,26 @@ test("imported crawl dupe matches previous dupe count", async () => { expect(revisit).toBe(numResponses); }); -test("test requires in wacz 1", () => { - - const expected = {"requires": ["dedup-test-orig"]}; - +test("test requires in datapackage.json of wacz deduped against previous crawl", () => { const res1 = loadDataPackageRelated("dedup-test-dupe"); - const res2 = loadDataPackageRelated("dedup-test-dupe-2"); - expect(res1).toEqual(expected); - expect(res1).toEqual(res2); + expect(res1.requires.length).toBe(1); + const entry = res1.requires[0]; + expect(entry.crawlId).toBe("dedup-test-orig"); + expect(entry.filename).toBe("dedup-test-orig.wacz"); + expect(entry.size).toBeDefined(); + expect(entry.hash).toBeDefined(); +}); +test("test requires in datapackage.json of wacz deduped against import from wacz", () => { + const res2 = loadDataPackageRelated("dedup-test-dupe-2"); + expect(res2.requires.length).toBe(1); + const entry2 = res2.requires[0]; + expect(entry2.crawlId).toBe("dedup-test-orig"); + expect(entry2.filename).toBe("dedup-test-orig.wacz"); + // undefined as importing from single WACZ and not computing + expect(entry2.size).toBeUndefined(); + expect(entry2.hash).toBeUndefined(); }); diff --git a/yarn.lock b/yarn.lock index e09d37b5..3725fadd 100644 --- a/yarn.lock +++ b/yarn.lock @@ -370,10 +370,10 @@ resolved "https://registry.yarnpkg.com/@humanwhocodes/object-schema/-/object-schema-2.0.3.tgz#4a2868d75d6d6963e423bcf90b7fd1be343409d3" integrity sha512-93zYdMES/c1D69yZiKDBj0V24vqNzB/koF26KPaagAfd3P/4gUlh3Dys5ogAK+Exi9QyzlD8x/08Zt7wIKcDcA== -"@ioredis/commands@^1.1.1": - version "1.2.0" - resolved "https://registry.yarnpkg.com/@ioredis/commands/-/commands-1.2.0.tgz#6d61b3097470af1fdbbe622795b8921d42018e11" - integrity sha512-Sx1pU8EM64o2BrqNpEO1CNLtKQwyhuXuqyfH7oGKCk+1a33d2r5saW8zNwm3j6BTExtjrv2BxTgzzkMwts6vGg== +"@ioredis/commands@1.4.0": + version "1.4.0" + resolved "https://registry.yarnpkg.com/@ioredis/commands/-/commands-1.4.0.tgz#9f657d51cdd5d2fdb8889592aa4a355546151f25" + integrity sha512-aFT2yemJJo+TZCmieA7qnYGQooOS7QfNmYrzGtsYd3g9j5iDP8AimYYAesf79ohjbLG12XxC4nG5DyEnC88AsQ== 
"@istanbuljs/load-nyc-config@^1.0.0": version "1.1.0" @@ -3014,12 +3014,12 @@ intl-messageformat@^10.5.3: "@formatjs/icu-messageformat-parser" "2.11.2" tslib "^2.8.0" -ioredis@^5.3.2: - version "5.4.1" - resolved "https://registry.yarnpkg.com/ioredis/-/ioredis-5.4.1.tgz#1c56b70b759f01465913887375ed809134296f40" - integrity sha512-2YZsvl7jopIa1gaePkeMtd9rAcSjOOjPtpcLlOeusyO+XH2SK5ZcT+UCrElPP+WVIInh2TzeI4XW9ENaSLVVHA== +ioredis@^5.8.2: + version "5.8.2" + resolved "https://registry.yarnpkg.com/ioredis/-/ioredis-5.8.2.tgz#c7a228a26cf36f17a5a8011148836877780e2e14" + integrity sha512-C6uC+kleiIMmjViJINWk80sOQw5lEzse1ZmvD+S/s8p8CWapftSaC+kocGTx6xrbrJ4WmYQGC08ffHLr6ToR6Q== dependencies: - "@ioredis/commands" "^1.1.1" + "@ioredis/commands" "1.4.0" cluster-key-slot "^1.1.0" debug "^4.3.4" denque "^2.1.0" From b5157ae3b564f9b3df269826a5c750ddbbe94fc4 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 24 Oct 2025 13:24:53 -0700 Subject: [PATCH 16/29] cleanup, keep compatibility with redis 6 still set to 'post-crawl' state after uploading --- package.json | 2 +- src/crawler.ts | 35 +++++++++++++++++++++-------------- src/util/state.ts | 10 ++++++---- yarn.lock | 18 +++++++++--------- 4 files changed, 37 insertions(+), 28 deletions(-) diff --git a/package.json b/package.json index f2ec9cb4..2908fa49 100644 --- a/package.json +++ b/package.json @@ -25,7 +25,7 @@ "fetch-socks": "^1.3.0", "get-folder-size": "^4.0.0", "husky": "^8.0.3", - "ioredis": "^5.8.2", + "ioredis": "^5.3.2", "iso-639-1": "^3.1.5", "js-levenshtein": "^1.1.6", "js-yaml": "^4.1.0", diff --git a/src/crawler.ts b/src/crawler.ts index 10124395..5171569e 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -203,6 +203,7 @@ export class Crawler { | null = null; recording: boolean; + deduping = false; constructor() { const args = this.parseArgs(); @@ -344,6 +345,8 @@ export class Crawler { const redisUrl = this.params.redisStoreUrl || "redis://localhost:6379/0"; const dedupRedisUrl = this.params.redisDedupUrl || redisUrl; + this.deduping = dedupRedisUrl !== redisUrl; + if (!redisUrl.startsWith("redis://")) { logger.fatal( "stateStoreUrl must start with redis:// -- Only redis-based store currently supported", @@ -1910,10 +1913,20 @@ self.__bx_behaviors.selectMainBehavior(); } if (this.params.generateWACZ && generateFiles) { - const uploaded = await this.generateWACZ(); + const wacz = await this.generateWACZ(); + + if (wacz) { + if (this.deduping) { + await this.crawlState.setStatus("post-crawl"); + await this.crawlState.updateDedupSource(wacz); + } - if (uploaded && this.uploadAndDeleteLocal) { + await this.crawlState.clearWACZFilename(); + } + + if (wacz && this.uploadAndDeleteLocal) { await this.crawlState.setArchiveSize(0); + logger.info( `Uploaded WACZ, deleting local data to free up space: ${this.collDir}`, ); @@ -1962,7 +1975,7 @@ self.__bx_behaviors.selectMainBehavior(); await streamFinish(logFH); } - async generateWACZ() { + async generateWACZ(): Promise { logger.info("Generating WACZ"); await this.crawlState.setStatus("generate-wacz"); @@ -1976,11 +1989,11 @@ self.__bx_behaviors.selectMainBehavior(); if (!warcFileList.length) { // if finished, just return if (isFinished || (await this.crawlState.isCrawlCanceled())) { - return; + return null; } // possibly restarted after committing, so assume done here! 
if ((await this.crawlState.numDone()) > 0) { - return; + return null; } // fail crawl otherwise logger.fatal("No WARC Files, assuming crawl failed"); @@ -2041,16 +2054,8 @@ self.__bx_behaviors.selectMainBehavior(); await this.storage.uploadCollWACZ(wacz, targetFilename, isFinished); - await this.crawlState.updateDedupSource(wacz); - - await this.crawlState.clearWACZFilename(); - - return true; - } else { - await this.crawlState.updateDedupSource(wacz); + return wacz; } - - return false; } catch (e) { logger.error("Error creating WACZ", e); if (!streaming) { @@ -2059,6 +2064,8 @@ self.__bx_behaviors.selectMainBehavior(); await this.setStatusAndExit(ExitCodes.UploadFailed, "interrupted"); } } + + return null; } logMemory() { diff --git a/src/util/state.ts b/src/util/state.ts index 7eda7497..ca9b619f 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -287,12 +287,13 @@ export class RedisDedupIndex { async commitDedupDone() { for await (const hashes of this.dedupRedis.hscanStream( `h:${this.crawlId}`, - { - noValues: true, - }, )) { + let value = false; for (const hash of hashes) { - await this.dedupRedis.hset(DUPE_ALL_HASH_KEY, hash, this.crawlId); + if (!value) { + await this.dedupRedis.hset(DUPE_ALL_HASH_KEY, hash, this.crawlId); + } + value = !value; } } @@ -1391,6 +1392,7 @@ return inx; await this.redis.set(`${this.crawlId}:profileUploaded`, JSON.stringify(result)); } + // DEPENDENT CRAWLS FOR DEDUPE async addDupeCrawlRef(crawlId: string, index: string) { await this.redis.sadd(`${this.crawlId}:dindex`, crawlId + " " + index); } diff --git a/yarn.lock b/yarn.lock index 3725fadd..e09d37b5 100644 --- a/yarn.lock +++ b/yarn.lock @@ -370,10 +370,10 @@ resolved "https://registry.yarnpkg.com/@humanwhocodes/object-schema/-/object-schema-2.0.3.tgz#4a2868d75d6d6963e423bcf90b7fd1be343409d3" integrity sha512-93zYdMES/c1D69yZiKDBj0V24vqNzB/koF26KPaagAfd3P/4gUlh3Dys5ogAK+Exi9QyzlD8x/08Zt7wIKcDcA== -"@ioredis/commands@1.4.0": - version "1.4.0" - resolved "https://registry.yarnpkg.com/@ioredis/commands/-/commands-1.4.0.tgz#9f657d51cdd5d2fdb8889592aa4a355546151f25" - integrity sha512-aFT2yemJJo+TZCmieA7qnYGQooOS7QfNmYrzGtsYd3g9j5iDP8AimYYAesf79ohjbLG12XxC4nG5DyEnC88AsQ== +"@ioredis/commands@^1.1.1": + version "1.2.0" + resolved "https://registry.yarnpkg.com/@ioredis/commands/-/commands-1.2.0.tgz#6d61b3097470af1fdbbe622795b8921d42018e11" + integrity sha512-Sx1pU8EM64o2BrqNpEO1CNLtKQwyhuXuqyfH7oGKCk+1a33d2r5saW8zNwm3j6BTExtjrv2BxTgzzkMwts6vGg== "@istanbuljs/load-nyc-config@^1.0.0": version "1.1.0" @@ -3014,12 +3014,12 @@ intl-messageformat@^10.5.3: "@formatjs/icu-messageformat-parser" "2.11.2" tslib "^2.8.0" -ioredis@^5.8.2: - version "5.8.2" - resolved "https://registry.yarnpkg.com/ioredis/-/ioredis-5.8.2.tgz#c7a228a26cf36f17a5a8011148836877780e2e14" - integrity sha512-C6uC+kleiIMmjViJINWk80sOQw5lEzse1ZmvD+S/s8p8CWapftSaC+kocGTx6xrbrJ4WmYQGC08ffHLr6ToR6Q== +ioredis@^5.3.2: + version "5.4.1" + resolved "https://registry.yarnpkg.com/ioredis/-/ioredis-5.4.1.tgz#1c56b70b759f01465913887375ed809134296f40" + integrity sha512-2YZsvl7jopIa1gaePkeMtd9rAcSjOOjPtpcLlOeusyO+XH2SK5ZcT+UCrElPP+WVIInh2TzeI4XW9ENaSLVVHA== dependencies: - "@ioredis/commands" "1.4.0" + "@ioredis/commands" "^1.1.1" cluster-key-slot "^1.1.0" debug "^4.3.4" denque "^2.1.0" From cb9367460f88dbabd191982f29cc93a600efc90b Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 24 Oct 2025 15:01:00 -0700 Subject: [PATCH 17/29] always return wacz, store wacz depends only for current wacz store crawlid depends for entire crawl --- src/crawler.ts 
| 7 ++++--- src/util/state.ts | 9 +++++++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/crawler.ts b/src/crawler.ts index 5171569e..caf5aa42 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -1919,12 +1919,14 @@ self.__bx_behaviors.selectMainBehavior(); if (this.deduping) { await this.crawlState.setStatus("post-crawl"); await this.crawlState.updateDedupSource(wacz); + + await this.crawlState.clearDupeFileRef(); } await this.crawlState.clearWACZFilename(); } - if (wacz && this.uploadAndDeleteLocal) { + if (wacz && this.storage && this.uploadAndDeleteLocal) { await this.crawlState.setArchiveSize(0); logger.info( @@ -2053,9 +2055,8 @@ self.__bx_behaviors.selectMainBehavior(); const targetFilename = await this.crawlState.getWACZFilename(); await this.storage.uploadCollWACZ(wacz, targetFilename, isFinished); - - return wacz; } + return wacz; } catch (e) { logger.error("Error creating WACZ", e); if (!streaming) { diff --git a/src/util/state.ts b/src/util/state.ts index ca9b619f..3f851463 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -1394,11 +1394,16 @@ return inx; // DEPENDENT CRAWLS FOR DEDUPE async addDupeCrawlRef(crawlId: string, index: string) { - await this.redis.sadd(`${this.crawlId}:dindex`, crawlId + " " + index); + await this.redis.sadd(`${this.uid}:dindex`, crawlId + " " + index); + await this.redis.sadd(`${this.crawlId}:depCrawls`, crawlId); + } + + async clearDupeFileRef() { + await this.redis.del(`${this.uid}:dindex`); } async getDupeDependentSources() { - const dependIndexes = await this.redis.smembers(`${this.crawlId}:dindex`); + const dependIndexes = await this.redis.smembers(`${this.uid}:dindex`); const crawlIds = []; for (const value of dependIndexes) { const [crawlId, index] = value.split(" "); From 7a5b3b2c180dba1fb00f6488eecdfbced034f936 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 25 Oct 2025 09:33:37 -0700 Subject: [PATCH 18/29] rename 'dedup' -> 'dedupe' for consistency --- src/crawler.ts | 14 +- src/indexer.ts | 38 +++--- src/util/argParser.ts | 4 +- src/util/recorder.ts | 8 +- src/util/state.ts | 121 +++++++++--------- ...dup-basic.test.js => dedupe-basic.test.js} | 34 ++--- 6 files changed, 113 insertions(+), 106 deletions(-) rename tests/{dedup-basic.test.js => dedupe-basic.test.js} (72%) diff --git a/src/crawler.ts b/src/crawler.ts index caf5aa42..dfa4f62c 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -343,9 +343,9 @@ export class Crawler { async initCrawlState() { const redisUrl = this.params.redisStoreUrl || "redis://localhost:6379/0"; - const dedupRedisUrl = this.params.redisDedupUrl || redisUrl; + const dedupeRedisUrl = this.params.redisDedupeUrl || redisUrl; - this.deduping = dedupRedisUrl !== redisUrl; + this.deduping = dedupeRedisUrl !== redisUrl; if (!redisUrl.startsWith("redis://")) { logger.fatal( @@ -361,10 +361,10 @@ export class Crawler { "state", ); - let dedupRedis = redis; + let dedupeRedis = redis; - if (redisUrl !== dedupRedisUrl) { - dedupRedis = await initRedisWaitForSuccess(dedupRedisUrl); + if (redisUrl !== dedupeRedisUrl) { + dedupeRedis = await initRedisWaitForSuccess(dedupeRedisUrl); } logger.debug(`Max Page Time: ${this.maxPageTime} seconds`, {}, "state"); @@ -375,7 +375,7 @@ export class Crawler { this.maxPageTime, os.hostname(), this.params.maxPageRetries, - dedupRedis, + dedupeRedis, ); if (this.params.logErrorsToRedis) { @@ -1918,7 +1918,7 @@ self.__bx_behaviors.selectMainBehavior(); if (wacz) { if (this.deduping) { await this.crawlState.setStatus("post-crawl"); - await 
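// How dedupe dependencies are tracked at this point in the series: duplicate
// references are kept per worker under <uid>:dindex (renamed to <uid>:duperef
// below) and cleared with clearDupeFileRef() after each WACZ is finalized, so
// relation.requires only lists what that particular WACZ depends on, while
// <crawlId>:depCrawls (renamed to <crawlId>:reqCrawls) accumulates the
// dependencies of the crawl as a whole. generateWACZ() now returns the WACZ
// object even when no upload happens, so this post-crawl bookkeeping always
// runs.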
this.crawlState.updateDedupSource(wacz); + await this.crawlState.updateDedupeSource(wacz); await this.crawlState.clearDupeFileRef(); } diff --git a/src/indexer.ts b/src/indexer.ts index 353d5385..74dfaaa2 100644 --- a/src/indexer.ts +++ b/src/indexer.ts @@ -7,10 +7,10 @@ import { WACZLoader } from "./util/wacz.js"; import { ExitCodes } from "./util/constants.js"; import { initRedisWaitForSuccess } from "./util/redis.js"; import { AsyncIterReader } from "warcio"; -import { RedisDedupIndex } from "./util/state.js"; +import { RedisDedupeIndex } from "./util/state.js"; import { basename } from "node:path"; -export type DedupIndexEntry = { +export type DedupeIndexEntry = { name: string; url: string; crawlId?: string; @@ -25,7 +25,7 @@ export class CrawlIndexer { return yargs(process.argv) .usage("indexer [options]") .options({ - redisDedupUrl: { + redisDedupeUrl: { describe: "URL for remote redis instance to index into", type: "string", required: true, @@ -57,26 +57,28 @@ export class CrawlIndexer { const params = this.initArgs(); - const redis = await initRedisWaitForSuccess(params.redisDedupUrl); - const dedupIndex = new RedisDedupIndex(redis, ""); + const redis = await initRedisWaitForSuccess(params.redisDedupeUrl); + const dedupeIndex = new RedisDedupeIndex(redis, ""); for await (const entry of this.iterWACZ(params.sourceUrl)) { - await dedupIndex.queueImportSource(entry.name, JSON.stringify(entry)); + await dedupeIndex.queueImportSource(entry.name, JSON.stringify(entry)); } let count = 0; let res; - while ((res = await dedupIndex.nextQueuedImportSource())) { + while ((res = await dedupeIndex.nextQueuedImportSource())) { const { name, entry, total } = res; - const { url, crawlId, size, hash } = JSON.parse(entry) as DedupIndexEntry; + const { url, crawlId, size, hash } = JSON.parse( + entry, + ) as DedupeIndexEntry; count += 1; const loader = new WACZLoader(url); logger.debug(`Processing WACZ ${count} of ${total}`, { waczfile: url }); const crawlIdReal = crawlId || params.sourceCrawlId || url; - await dedupIndex.addImportedSourceForDedup(crawlIdReal, { + await dedupeIndex.addImportedSourceForDedupe(crawlIdReal, { filename: name, size, hash, @@ -87,7 +89,7 @@ export class CrawlIndexer { if (filename.endsWith(".cdx.gz")) { logger.debug("Processing CDX GZ Index", { filename }); await this.ingestCDXJ( - dedupIndex, + dedupeIndex, loader, filename, crawlIdReal, @@ -95,20 +97,20 @@ export class CrawlIndexer { ); } else if (filename.endsWith(".cdx") || filename.endsWith(".cdxj")) { logger.debug("Processing CDX Index", { filename }); - await this.ingestCDXJ(dedupIndex, loader, filename, crawlIdReal); + await this.ingestCDXJ(dedupeIndex, loader, filename, crawlIdReal); } } - await dedupIndex.markImportSourceDone(name, crawlIdReal); + await dedupeIndex.markImportSourceDone(name, crawlIdReal); } logger.info("Done!"); - await dedupIndex.markImportFinishedTS(); + await dedupeIndex.markImportFinishedTS(); process.exit(ExitCodes.Success); } async ingestCDXJ( - dedupIndex: RedisDedupIndex, + dedupeIndex: RedisDedupeIndex, loader: WACZLoader, filename: string, crawlId: string, @@ -152,14 +154,14 @@ export class CrawlIndexer { continue; } - // only adding originals to dedup against, don't want to dedup against existing revisits + // only adding originals to dedupe against, don't want to dedupe against existing revisits if (cdx.mime === "warc/revisit") { continue; } if (url && date && hash) { - await dedupIndex.addHashDupe(hash, url, date, crawlId); - await dedupIndex.addImportedForCrawl(hash, crawlId); + 
await dedupeIndex.addHashDupe(hash, url, date, crawlId); + await dedupeIndex.addImportedForCrawl(hash, crawlId); } else { logger.warn("Skipping invalid CDXJ, data missing", { url, @@ -175,7 +177,7 @@ export class CrawlIndexer { logger.debug("Processed", { count }); } - async *iterWACZ(url: string, name?: string): AsyncIterable { + async *iterWACZ(url: string, name?: string): AsyncIterable { let path: string = url; try { diff --git a/src/util/argParser.ts b/src/util/argParser.ts index 22c15243..e1a9c3d6 100644 --- a/src/util/argParser.ts +++ b/src/util/argParser.ts @@ -445,13 +445,13 @@ class ArgParser { default: "redis://localhost:6379/0", }, - redisDedupUrl: { + redisDedupeUrl: { describe: "If set, url for remote redis server to store state. Otherwise, using local redis instance", type: "string", }, - minPageDedupDepth: { + minPageDedupeDepth: { describe: "If set >= 0, minimum depth at which duplicate pages can be skipped. -1 means never skip duplicate pages", type: "number", diff --git a/src/util/recorder.ts b/src/util/recorder.ts index 6e53e0a7..db3e4447 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -144,7 +144,7 @@ export class Recorder extends EventEmitter { pageSeed?: ScopedSeed; pageSeedDepth = 0; - minPageDedupDepth = -1; + minPageDedupeDepth = -1; frameIdToExecId: Map | null; @@ -168,7 +168,7 @@ export class Recorder extends EventEmitter { this.shouldSaveStorage = !!crawler.params.saveStorage; - this.minPageDedupDepth = crawler.params.minPageDedupDepth; + this.minPageDedupeDepth = crawler.params.minPageDedupeDepth; this.writer = writer; @@ -840,8 +840,8 @@ export class Recorder extends EventEmitter { if ( url === this.pageUrl && reqresp.payload && - this.minPageDedupDepth >= 0 && - this.pageSeedDepth >= this.minPageDedupDepth + this.minPageDedupeDepth >= 0 && + this.pageSeedDepth >= this.minPageDedupeDepth ) { const hash = "sha256:" + createHash("sha256").update(reqresp.payload).digest("hex"); diff --git a/src/util/state.ts b/src/util/state.ts index 3f851463..f63d050b 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -48,7 +48,7 @@ const normalizeUrlOpts: NormamlizeUrlOptions = { // ============================================================================ // treat 0 or 206 as 200 for purposes of dedup -export function normalizeDedupStatus(status: number): string { +export function normalizeDedupeStatus(status: number): string { if (status === 0 || status === 206) { return "200"; } @@ -225,18 +225,18 @@ export type DedupeEntry = { }; // ============================================================================ -export type DedupSourceEntry = { +export type DedupeSourceEntry = { filename: string; size?: number; hash?: string; }; // ============================================================================ -export class RedisDedupIndex { - dedupRedis: Redis; +export class RedisDedupeIndex { + dedupeRedis: Redis; crawlId: string; - dedupKeyIndex = -1; - dedupCurrFilename = ""; + dedupeKeyIndex = -1; + dedupeCurrFilename = ""; sourceDone = "src:d"; sourceQ = "src:q"; @@ -244,61 +244,61 @@ export class RedisDedupIndex { sourceP = "src:p"; pendingPrefix = "pending:q:"; - constructor(dedupRedis: Redis, crawlId: string) { - this.dedupRedis = dedupRedis; + constructor(dedupeRedis: Redis, crawlId: string) { + this.dedupeRedis = dedupeRedis; this.crawlId = crawlId; } - // DEDUP SOURCE + // DEDUPE SOURCE - async addSourceForDedup(filename: string) { - //const count = await this.dedupRedis.incr(`c:${key}:count`) - 1; + async addSourceForDedupe(filename: string) 
{ + //const count = await this.dedupeRedis.incr(`c:${key}:count`) - 1; const count = - (await this.dedupRedis.rpush( + (await this.dedupeRedis.rpush( `c:${this.crawlId}:wacz`, JSON.stringify({ filename }), )) - 1; - this.dedupCurrFilename = filename; - this.dedupKeyIndex = count; + this.dedupeCurrFilename = filename; + this.dedupeKeyIndex = count; } - async updateDedupSource(wacz: WACZ) { - if (this.dedupKeyIndex < 0) { + async updateDedupeSource(wacz: WACZ) { + if (this.dedupeKeyIndex < 0) { return; } - const value: DedupSourceEntry = { - filename: wacz.getLocalFilename() || this.dedupCurrFilename, + const value: DedupeSourceEntry = { + filename: wacz.getLocalFilename() || this.dedupeCurrFilename, hash: wacz.getHash(), size: wacz.getSize(), }; - await this.dedupRedis.lset( + await this.dedupeRedis.lset( `c:${this.crawlId}:wacz`, - this.dedupKeyIndex, + this.dedupeKeyIndex, JSON.stringify(value), ); - await this.commitDedupDone(); + await this.commitDedupeDone(); } - // COMMIT DEDUP TO SHARED INDEX + // COMMIT DEDUPE TO SHARED INDEX - async commitDedupDone() { - for await (const hashes of this.dedupRedis.hscanStream( + async commitDedupeDone() { + for await (const hashes of this.dedupeRedis.hscanStream( `h:${this.crawlId}`, )) { let value = false; for (const hash of hashes) { if (!value) { - await this.dedupRedis.hset(DUPE_ALL_HASH_KEY, hash, this.crawlId); + await this.dedupeRedis.hsetnx(DUPE_ALL_HASH_KEY, hash, this.crawlId); } value = !value; } } // add to crawls list - await this.dedupRedis.sadd(DUPE_ALL_CRAWLS, this.crawlId); + await this.dedupeRedis.sadd(DUPE_ALL_CRAWLS, this.crawlId); } // GET OR ADD INDIVIDUAL HASHES @@ -311,12 +311,12 @@ export class RedisDedupIndex { hash = hash.split(":").at(-1)!; // first, check the shared key - let crawlId = await this.dedupRedis.hget(key, hash); + let crawlId = await this.dedupeRedis.hget(key, hash); if (!crawlId) { // otherwise, try current crawl crawlId = this.crawlId; } - const value = await this.dedupRedis.hget(`h:${crawlId}`, hash); + const value = await this.dedupeRedis.hget(`h:${crawlId}`, hash); if (!value) { return null; } @@ -327,37 +327,37 @@ export class RedisDedupIndex { async addHashDupe(hash: string, url: string, date: string, crawlId?: string) { date = date.replace(/[^\d]/g, ""); hash = hash.split(":").at(-1)!; - const val = `${this.dedupKeyIndex} ${date} ${url}`; - await this.dedupRedis.hsetnx(`h:${crawlId || this.crawlId}`, hash, val); + const val = `${this.dedupeKeyIndex} ${date} ${url}`; + await this.dedupeRedis.hsetnx(`h:${crawlId || this.crawlId}`, hash, val); } // IMPORT async queueImportSource(id: string, data: string) { // already handled this source - if (await this.dedupRedis.sismember(this.sourceDone, id)) { + if (await this.dedupeRedis.sismember(this.sourceDone, id)) { return; } - await this.dedupRedis.lpush(this.sourceQ, data); + await this.dedupeRedis.lpush(this.sourceQ, data); } async addImportedForCrawl(hash: string, crawlId: string) { - await this.dedupRedis.hset(DUPE_ALL_HASH_KEY, hash, crawlId); + await this.dedupeRedis.hset(DUPE_ALL_HASH_KEY, hash, crawlId); } - async addImportedSourceForDedup(key: string, entry: DedupSourceEntry) { + async addImportedSourceForDedupe(key: string, entry: DedupeSourceEntry) { return ( - (await this.dedupRedis.rpush(`c:${key}:wacz`, JSON.stringify(entry))) - 1 + (await this.dedupeRedis.rpush(`c:${key}:wacz`, JSON.stringify(entry))) - 1 ); } async markImportSourceDone(id: string, crawlId: string) { - await this.dedupRedis.sadd(this.sourceDone, id); - await 
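// Completing an import: recording the source name in src:d prevents the same
// WACZ from being queued again on a rerun, while allcrawls keeps a registry of
// every crawl id the dedupe index knows about, whether it committed its own
// hashes after a crawl or was imported here.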
this.dedupRedis.sadd(DUPE_ALL_CRAWLS, crawlId); + await this.dedupeRedis.sadd(this.sourceDone, id); + await this.dedupeRedis.sadd(DUPE_ALL_CRAWLS, crawlId); } async nextQueuedImportSource() { - let res: string | null = await this.dedupRedis.lmove( + let res: string | null = await this.dedupeRedis.lmove( this.sourceQ, this.pendingQ, "RIGHT", @@ -365,9 +365,9 @@ export class RedisDedupIndex { ); // use circular pending Q to support retries if (!res) { - const len = await this.dedupRedis.llen(this.pendingQ); + const len = await this.dedupeRedis.llen(this.pendingQ); for (let i = 0; i < len; i++) { - res = await this.dedupRedis.lmove( + res = await this.dedupeRedis.lmove( this.pendingQ, this.pendingQ, "RIGHT", @@ -375,7 +375,7 @@ export class RedisDedupIndex { ); if (res) { const { id } = JSON.parse(res); - if (await this.dedupRedis.get(this.pendingPrefix + id)) { + if (await this.dedupeRedis.get(this.pendingPrefix + id)) { res = null; continue; } else { @@ -389,20 +389,20 @@ export class RedisDedupIndex { return null; } - await this.dedupRedis.lrem(this.pendingQ, 1, res); + await this.dedupeRedis.lrem(this.pendingQ, 1, res); const { name } = JSON.parse(res); - const total = (await this.dedupRedis.llen(this.sourceQ)) + 1; - await this.dedupRedis.setex(this.pendingPrefix + name, "1", 300); + const total = (await this.dedupeRedis.llen(this.sourceQ)) + 1; + await this.dedupeRedis.setex(this.pendingPrefix + name, "1", 300); return { name, entry: res, total }; } async markImportFinishedTS() { - await this.dedupRedis.set("last_update_ts", new Date().toISOString()); + await this.dedupeRedis.set("last_update_ts", new Date().toISOString()); } } // ============================================================================ -export class RedisCrawlState extends RedisDedupIndex { +export class RedisCrawlState extends RedisDedupeIndex { redis: Redis; maxRetries: number; @@ -435,9 +435,9 @@ export class RedisCrawlState extends RedisDedupIndex { maxPageTime: number, uid: string, maxRetries?: number, - dedupRedis?: Redis, + dedupeRedis?: Redis, ) { - super(dedupRedis || redis, key); + super(dedupeRedis || redis, key); this.redis = redis; this.uid = uid; @@ -759,7 +759,7 @@ return inx; "state", ); } - await this.addSourceForDedup(this.waczFilename!); + await this.addSourceForDedupe(this.waczFilename!); return this.waczFilename!; } @@ -1253,13 +1253,18 @@ return inx; async addIfNoDupe(key: string, url: string, status: number) { url = normalizeUrl(url, normalizeUrlOpts); return ( - (await this.redis.sadd(key, normalizeDedupStatus(status) + "|" + url)) === - 1 + (await this.redis.sadd( + key, + normalizeDedupeStatus(status) + "|" + url, + )) === 1 ); } async removeDupe(key: string, url: string, status: number) { - return await this.redis.srem(key, normalizeDedupStatus(status) + "|" + url); + return await this.redis.srem( + key, + normalizeDedupeStatus(status) + "|" + url, + ); } async isInUserSet(value: string) { @@ -1394,20 +1399,20 @@ return inx; // DEPENDENT CRAWLS FOR DEDUPE async addDupeCrawlRef(crawlId: string, index: string) { - await this.redis.sadd(`${this.uid}:dindex`, crawlId + " " + index); - await this.redis.sadd(`${this.crawlId}:depCrawls`, crawlId); + await this.redis.sadd(`${this.uid}:duperef`, crawlId + " " + index); + await this.redis.sadd(`${this.crawlId}:reqCrawls`, crawlId); } async clearDupeFileRef() { - await this.redis.del(`${this.uid}:dindex`); + await this.redis.del(`${this.uid}:duperef`); } async getDupeDependentSources() { - const dependIndexes = await 
this.redis.smembers(`${this.uid}:dindex`); + const dependRefs = await this.redis.smembers(`${this.uid}:duperef`); const crawlIds = []; - for (const value of dependIndexes) { + for (const value of dependRefs) { const [crawlId, index] = value.split(" "); - const source = await this.dedupRedis.lindex( + const source = await this.dedupeRedis.lindex( `c:${crawlId}:wacz`, Number(index), ); diff --git a/tests/dedup-basic.test.js b/tests/dedupe-basic.test.js similarity index 72% rename from tests/dedup-basic.test.js rename to tests/dedupe-basic.test.js index f0d6b7c6..759b1c89 100644 --- a/tests/dedup-basic.test.js +++ b/tests/dedupe-basic.test.js @@ -13,9 +13,9 @@ let redisId; let numResponses = 0; beforeAll(() => { - execSync("docker network create dedup"); + execSync("docker network create dedupe"); - redisId = execSync("docker run --rm --network=dedup -p 37379:6379 --name dedup-redis -d redis"); + redisId = execSync("docker run --rm --network=dedupe -p 37379:6379 --name dedupe-redis -d redis"); }); afterAll(async () => { @@ -25,13 +25,13 @@ afterAll(async () => { //await Promise.allSettled([crawler1, crawler2]); - execSync("docker network rm dedup"); + execSync("docker network rm dedupe"); }); function runCrawl(name, db="0") { fs.rmSync(`./test-crawls/collections/${name}`, { recursive: true, force: true }); - const crawler = exec(`docker run --rm -v $PWD/test-crawls:/crawls --network=dedup -e CRAWL_ID=${name} webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --limit 4 --exclude community --collection ${name} --redisDedupUrl redis://dedup-redis:6379/${db} --generateWACZ`); + const crawler = exec(`docker run --rm -v $PWD/test-crawls:/crawls --network=dedupe -e CRAWL_ID=${name} webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --limit 4 --exclude community --collection ${name} --redisDedupeUrl redis://dedupe-redis:6379/${db} --generateWACZ`); return new Promise((resolve) => { crawler.on("exit", (code) => { @@ -70,15 +70,15 @@ function loadDataPackageRelated(name) { test("check revisit records written on duplicate crawl", async () => { - expect(await runCrawl("dedup-test-orig")).toBe(0); - expect(await runCrawl("dedup-test-dupe")).toBe(0); + expect(await runCrawl("dedupe-test-orig")).toBe(0); + expect(await runCrawl("dedupe-test-dupe")).toBe(0); let statusCode = -1; let response = 0; let revisit = 0; - const parserOrig = loadFirstWARC("dedup-test-orig"); + const parserOrig = loadFirstWARC("dedupe-test-orig"); for await (const record of parserOrig) { if (record.warcTargetURI && record.warcTargetURI.startsWith("urn:")) { @@ -90,7 +90,7 @@ test("check revisit records written on duplicate crawl", async () => { } } - const dupeOrig = loadFirstWARC("dedup-test-dupe"); + const dupeOrig = loadFirstWARC("dedupe-test-dupe"); for await (const record of dupeOrig) { if (record.warcTargetURI && record.warcTargetURI.startsWith("urn:")) { @@ -113,7 +113,7 @@ test("check revisit records written on duplicate crawl", async () => { test("import index and crawl dupe", async () => { - execSync(`docker run --rm -v $PWD/test-crawls:/crawls --network=dedup webrecorder/browsertrix-crawler indexer --sourceUrl /crawls/collections/dedup-test-orig/dedup-test-orig.wacz --sourceCrawlId dedup-test-orig --redisDedupUrl redis://dedup-redis:6379/1`); + execSync(`docker run --rm -v $PWD/test-crawls:/crawls --network=dedupe webrecorder/browsertrix-crawler indexer --sourceUrl /crawls/collections/dedupe-test-orig/dedupe-test-orig.wacz --sourceCrawlId dedupe-test-orig --redisDedupeUrl 
redis://dedupe-redis:6379/1`); const redis = new Redis("redis://127.0.0.1:37379/1", { lazyConnect: true, retryStrategy: () => null }); @@ -124,9 +124,9 @@ test("import index and crawl dupe", async () => { test("imported crawl dupe matches previous dupe count", async () => { - expect(await runCrawl("dedup-test-dupe-2", 1)).toBe(0); + expect(await runCrawl("dedupe-test-dupe-2", 1)).toBe(0); - const dupeOrig = loadFirstWARC("dedup-test-dupe-2"); + const dupeOrig = loadFirstWARC("dedupe-test-dupe-2"); let revisit = 0; @@ -145,22 +145,22 @@ test("imported crawl dupe matches previous dupe count", async () => { }); test("test requires in datapackage.json of wacz deduped against previous crawl", () => { - const res1 = loadDataPackageRelated("dedup-test-dupe"); + const res1 = loadDataPackageRelated("dedupe-test-dupe"); expect(res1.requires.length).toBe(1); const entry = res1.requires[0]; - expect(entry.crawlId).toBe("dedup-test-orig"); - expect(entry.filename).toBe("dedup-test-orig.wacz"); + expect(entry.crawlId).toBe("dedupe-test-orig"); + expect(entry.filename).toBe("dedupe-test-orig.wacz"); expect(entry.size).toBeDefined(); expect(entry.hash).toBeDefined(); }); test("test requires in datapackage.json of wacz deduped against import from wacz", () => { - const res2 = loadDataPackageRelated("dedup-test-dupe-2"); + const res2 = loadDataPackageRelated("dedupe-test-dupe-2"); expect(res2.requires.length).toBe(1); const entry2 = res2.requires[0]; - expect(entry2.crawlId).toBe("dedup-test-orig"); - expect(entry2.filename).toBe("dedup-test-orig.wacz"); + expect(entry2.crawlId).toBe("dedupe-test-orig"); + expect(entry2.filename).toBe("dedupe-test-orig.wacz"); // undefined as importing from single WACZ and not computing expect(entry2.size).toBeUndefined(); expect(entry2.hash).toBeUndefined(); From 7c9317e3dc14ccb45baace8bcfe6240cfe4aa8d7 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 25 Oct 2025 13:17:01 -0700 Subject: [PATCH 19/29] indexer optimize: commit only if added --- src/indexer.ts | 26 +++++++++++++------------- src/util/state.ts | 17 +++++++++++------ 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/src/indexer.ts b/src/indexer.ts index 74dfaaa2..f08e59ca 100644 --- a/src/indexer.ts +++ b/src/indexer.ts @@ -60,7 +60,10 @@ export class CrawlIndexer { const redis = await initRedisWaitForSuccess(params.redisDedupeUrl); const dedupeIndex = new RedisDedupeIndex(redis, ""); - for await (const entry of this.iterWACZ(params.sourceUrl)) { + for await (const entry of this.iterWACZ({ + url: params.sourceUrl, + name: params.sourceCrawlId || params.sourceUrl, + })) { await dedupeIndex.queueImportSource(entry.name, JSON.stringify(entry)); } @@ -160,8 +163,7 @@ export class CrawlIndexer { } if (url && date && hash) { - await dedupeIndex.addHashDupe(hash, url, date, crawlId); - await dedupeIndex.addImportedForCrawl(hash, crawlId); + await dedupeIndex.addHashDupe(hash, url, date, crawlId, true); } else { logger.warn("Skipping invalid CDXJ, data missing", { url, @@ -177,8 +179,10 @@ export class CrawlIndexer { logger.debug("Processed", { count }); } - async *iterWACZ(url: string, name?: string): AsyncIterable { - let path: string = url; + async *iterWACZ(entry: DedupeIndexEntry): AsyncIterable { + const { name } = entry; + let { url } = entry; + let path = url; try { path = new URL(url).pathname; @@ -187,7 +191,8 @@ export class CrawlIndexer { } if (path.endsWith(".wacz")) { - yield { name: basename(name || url), url }; + console.log({ ...entry, name: basename(name || url) }); + yield { 
...entry, name: basename(name || url) }; } else if (path.endsWith(".json")) { if (!url.startsWith("http://") && !url.startsWith("https://")) { const blob = await openAsBlob(url); @@ -198,13 +203,8 @@ export class CrawlIndexer { const json = await resp.json(); for (const entry of json.resources) { - const url = entry.path; - if (url && url.endsWith(".wacz")) { - const { size, hash, crawlId, name } = entry; - yield { crawlId, name, url, size, hash }; - } else { - yield* this.iterWACZ(entry.path, entry.name); - } + entry.url = entry.path; + yield* this.iterWACZ(entry); } } else { logger.warn("Unknown source", { url }, "replay"); diff --git a/src/util/state.ts b/src/util/state.ts index f63d050b..bcfe537b 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -324,11 +324,20 @@ export class RedisDedupeIndex { return { origUrl: val[2], origDate: val[1], index: val[0], crawlId }; } - async addHashDupe(hash: string, url: string, date: string, crawlId?: string) { + async addHashDupe( + hash: string, + url: string, + date: string, + crawlId?: string, + commit = false, + ) { date = date.replace(/[^\d]/g, ""); hash = hash.split(":").at(-1)!; const val = `${this.dedupeKeyIndex} ${date} ${url}`; - await this.dedupeRedis.hsetnx(`h:${crawlId || this.crawlId}`, hash, val); + crawlId = crawlId || this.crawlId; + if ((await this.dedupeRedis.hsetnx(`h:${crawlId}`, hash, val)) && commit) { + await this.dedupeRedis.hsetnx(DUPE_ALL_HASH_KEY, hash, crawlId); + } } // IMPORT @@ -341,10 +350,6 @@ export class RedisDedupeIndex { await this.dedupeRedis.lpush(this.sourceQ, data); } - async addImportedForCrawl(hash: string, crawlId: string) { - await this.dedupeRedis.hset(DUPE_ALL_HASH_KEY, hash, crawlId); - } - async addImportedSourceForDedupe(key: string, entry: DedupeSourceEntry) { return ( (await this.dedupeRedis.rpush(`c:${key}:wacz`, JSON.stringify(entry))) - 1 From 460badf8c7af49d27d1d03dca93ed57899d88aea Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 25 Oct 2025 15:41:31 -0700 Subject: [PATCH 20/29] add removing option to also remove unused crawls if doing a full sync, disable by default --- src/indexer.ts | 32 +++++++++++++++++++++---- src/util/state.ts | 59 ++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 80 insertions(+), 11 deletions(-) diff --git a/src/indexer.ts b/src/indexer.ts index f08e59ca..ac31f6fb 100644 --- a/src/indexer.ts +++ b/src/indexer.ts @@ -9,6 +9,7 @@ import { initRedisWaitForSuccess } from "./util/redis.js"; import { AsyncIterReader } from "warcio"; import { RedisDedupeIndex } from "./util/state.js"; import { basename } from "node:path"; +import { sleep } from "./util/timing.js"; export type DedupeIndexEntry = { name: string; @@ -42,6 +43,13 @@ export class CrawlIndexer { type: "string", required: false, }, + + removing: { + describe: "If set, also remove unsued crawls/hashes from index", + type: "boolean", + required: false, + default: false, + }, }) .parseSync(); } @@ -62,16 +70,24 @@ export class CrawlIndexer { for await (const entry of this.iterWACZ({ url: params.sourceUrl, - name: params.sourceCrawlId || params.sourceUrl, + name: basename(params.sourceUrl), + crawlId: params.sourceCrawlId, })) { await dedupeIndex.queueImportSource(entry.name, JSON.stringify(entry)); + if (params.removing && entry.crawlId) { + await dedupeIndex.markNotRemoved(entry.crawlId); + } } let count = 0; + let total = 0; let res; while ((res = await dedupeIndex.nextQueuedImportSource())) { - const { name, entry, total } = res; + const { name, entry, remaining } = res; + if (!total) { 
+ total = remaining; + } const { url, crawlId, size, hash } = JSON.parse( entry, ) as DedupeIndexEntry; @@ -107,7 +123,15 @@ export class CrawlIndexer { await dedupeIndex.markImportSourceDone(name, crawlIdReal); } + if (params.removing) { + const removeset = await dedupeIndex.getRemoveSet(); + if (removeset.size > 0) { + await dedupeIndex.removeCrawlIds(removeset); + } + } + logger.info("Done!"); + await sleep(30); await dedupeIndex.markImportFinishedTS(); process.exit(ExitCodes.Success); } @@ -180,7 +204,6 @@ export class CrawlIndexer { } async *iterWACZ(entry: DedupeIndexEntry): AsyncIterable { - const { name } = entry; let { url } = entry; let path = url; @@ -191,8 +214,7 @@ export class CrawlIndexer { } if (path.endsWith(".wacz")) { - console.log({ ...entry, name: basename(name || url) }); - yield { ...entry, name: basename(name || url) }; + yield entry; } else if (path.endsWith(".json")) { if (!url.startsWith("http://") && !url.startsWith("https://")) { const blob = await openAsBlob(url); diff --git a/src/util/state.ts b/src/util/state.ts index bcfe537b..72f05b4c 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -288,12 +288,12 @@ export class RedisDedupeIndex { for await (const hashes of this.dedupeRedis.hscanStream( `h:${this.crawlId}`, )) { - let value = false; + let isValue = false; for (const hash of hashes) { - if (!value) { + if (!isValue) { await this.dedupeRedis.hsetnx(DUPE_ALL_HASH_KEY, hash, this.crawlId); } - value = !value; + isValue = !isValue; } } @@ -396,14 +396,58 @@ export class RedisDedupeIndex { await this.dedupeRedis.lrem(this.pendingQ, 1, res); const { name } = JSON.parse(res); - const total = (await this.dedupeRedis.llen(this.sourceQ)) + 1; + const remaining = (await this.dedupeRedis.llen(this.sourceQ)) + 1; await this.dedupeRedis.setex(this.pendingPrefix + name, "1", 300); - return { name, entry: res, total }; + return { name, entry: res, remaining }; } async markImportFinishedTS() { await this.dedupeRedis.set("last_update_ts", new Date().toISOString()); } + + // REMOVE ON IMPORT + + async markNotRemoved(crawlId: string) { + await this.dedupeRedis.sadd("noremove", crawlId); + } + + async getRemoveSet() { + const removeSet = await this.dedupeRedis.sdiff(DUPE_ALL_CRAWLS, "noremove"); + await this.dedupeRedis.del("noremove"); + return new Set(removeSet); + } + + async removeCrawlIds(toRemove: Set) { + for await (const hashes of this.dedupeRedis.hscanStream( + DUPE_ALL_HASH_KEY, + )) { + let isValue = false; + let key = ""; + for (const hash of hashes) { + if (!isValue) { + key = hash; + } + if (key && isValue && toRemove.has(hash)) { + await this.dedupeRedis.hdel(DUPE_ALL_HASH_KEY, key); + } + isValue = !isValue; + } + } + + for (const crawlId of toRemove) { + const allWACZ = await this.dedupeRedis.lrange(`c:${crawlId}:wacz`, 0, -1); + for (const waczdata of allWACZ) { + try { + const { filename } = JSON.parse(waczdata); + await this.dedupeRedis.srem(this.sourceDone, filename); + } catch (e) { + // ignore + } + } + await this.dedupeRedis.del(`h:${crawlId}`, `c:${crawlId}:wacz`); + await this.dedupeRedis.srem(DUPE_ALL_CRAWLS, crawlId); + } + } } // ============================================================================ @@ -1399,7 +1443,10 @@ return inx; async markProfileUploaded(result: UploadResult & { modified?: string }) { result.modified = this._timestamp(); - await this.redis.set(`${this.crawlId}:profileUploaded`, JSON.stringify(result)); + await this.redis.set( + `${this.crawlId}:profileUploaded`, + JSON.stringify(result), + ); } // DEPENDENT 
CRAWLS FOR DEDUPE From c401c4871e291bd0dbd5ddd2f0dcd1f16ad9c23a Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 27 Nov 2025 23:40:02 -0800 Subject: [PATCH 21/29] generate wacz filename if deduping --- src/crawler.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/crawler.ts b/src/crawler.ts index dfa4f62c..5d0cb42b 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -1713,7 +1713,7 @@ self.__bx_behaviors.selectMainBehavior(); this.storage = initStorage(); } - if (this.params.generateWACZ && this.storage) { + if (this.params.generateWACZ && (this.storage || this.deduping)) { await this.crawlState.setWACZFilename(); } From aa44e5491c181133ab5e9f10ec7e0adf0d00c79c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 28 Nov 2025 01:16:58 -0800 Subject: [PATCH 22/29] cleanup pass: - support dedupe without requiring wacz, no crawl dependency tracking stored - add dedupe test w/o wacz - cleanup dedupe related naming --- src/crawler.ts | 43 ++++++++++++--------- src/util/recorder.ts | 4 +- src/util/state.ts | 60 ++++++++++++++--------------- tests/dedupe-basic.test.js | 79 +++++++++++++++++++++++++++++++++----- 4 files changed, 126 insertions(+), 60 deletions(-) diff --git a/src/crawler.ts b/src/crawler.ts index 5d0cb42b..dead6c93 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -1714,7 +1714,10 @@ self.__bx_behaviors.selectMainBehavior(); } if (this.params.generateWACZ && (this.storage || this.deduping)) { - await this.crawlState.setWACZFilename(); + const filename = await this.crawlState.setWACZFilename(); + if (this.deduping) { + await this.crawlState.addSourceWACZForDedupe(filename); + } } if (POST_CRAWL_STATES.includes(initState)) { @@ -1916,30 +1919,34 @@ self.__bx_behaviors.selectMainBehavior(); const wacz = await this.generateWACZ(); if (wacz) { - if (this.deduping) { - await this.crawlState.setStatus("post-crawl"); - await this.crawlState.updateDedupeSource(wacz); + await this.crawlState.clearWACZFilename(); - await this.crawlState.clearDupeFileRef(); + if (this.deduping) { + await this.crawlState.updateDedupeSourceWACZ(wacz); } - await this.crawlState.clearWACZFilename(); - } - - if (wacz && this.storage && this.uploadAndDeleteLocal) { - await this.crawlState.setArchiveSize(0); + if (this.storage && this.uploadAndDeleteLocal) { + await this.crawlState.setArchiveSize(0); - logger.info( - `Uploaded WACZ, deleting local data to free up space: ${this.collDir}`, - ); - try { - fs.rmSync(this.collDir, { recursive: true, force: true }); - } catch (e) { - logger.warn(`Unable to clear ${this.collDir} before exit`, e); + logger.info( + `Uploaded WACZ, deleting local data to free up space: ${this.collDir}`, + ); + try { + fs.rmSync(this.collDir, { recursive: true, force: true }); + } catch (e) { + logger.warn(`Unable to clear ${this.collDir} before exit`, e); + } } } } + if (this.deduping) { + //await this.crawlState.clearDupeCrawlRef(); + + // commit crawl data to main index + await this.crawlState.commitDedupeDone(); + } + if (this.finalExit && generateFiles && this.params.saveProfile) { const resource = await this.browser.saveProfile( this.params.saveProfile, @@ -2015,7 +2022,7 @@ self.__bx_behaviors.selectMainBehavior(); await this.closeLog(); - const requires = await this.crawlState.getDupeDependentSources(); + const requires = await this.crawlState.getDupeDependentCrawls(); const waczOpts: WACZInitOpts = { input: warcFileList.map((x) => path.join(this.archivesDir, x)), diff --git a/src/util/recorder.ts b/src/util/recorder.ts index db3e4447..4d9a72f5 100644 
--- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -853,7 +853,7 @@ export class Recorder extends EventEmitter { requestId, errorReason, }); - await this.crawlState.addDupeCrawlRef(crawlId, index); + await this.crawlState.addDupeCrawlDependency(crawlId, index); return true; } } @@ -1722,7 +1722,7 @@ export class Recorder extends EventEmitter { origUrl, date, )); - await this.crawlState.addDupeCrawlRef(crawlId, index); + await this.crawlState.addDupeCrawlDependency(crawlId, index); isDupe = true; } else { // no dupe, continue diff --git a/src/util/state.ts b/src/util/state.ts index 72f05b4c..013856fb 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -235,7 +235,7 @@ export type DedupeSourceEntry = { export class RedisDedupeIndex { dedupeRedis: Redis; crawlId: string; - dedupeKeyIndex = -1; + dedupeKeyIndex = 0; dedupeCurrFilename = ""; sourceDone = "src:d"; @@ -249,37 +249,32 @@ export class RedisDedupeIndex { this.crawlId = crawlId; } - // DEDUPE SOURCE + // DEDUPE SOURCE WACZ (to track dependencies) - async addSourceForDedupe(filename: string) { - //const count = await this.dedupeRedis.incr(`c:${key}:count`) - 1; + async addSourceWACZForDedupe(filename: string) { + const crawlId = this.crawlId; const count = (await this.dedupeRedis.rpush( - `c:${this.crawlId}:wacz`, + `c:${crawlId}:wacz`, JSON.stringify({ filename }), )) - 1; this.dedupeCurrFilename = filename; this.dedupeKeyIndex = count; } - async updateDedupeSource(wacz: WACZ) { - if (this.dedupeKeyIndex < 0) { - return; - } - + async updateDedupeSourceWACZ(wacz: WACZ) { const value: DedupeSourceEntry = { filename: wacz.getLocalFilename() || this.dedupeCurrFilename, hash: wacz.getHash(), size: wacz.getSize(), }; + const crawlId = this.crawlId; await this.dedupeRedis.lset( - `c:${this.crawlId}:wacz`, + `c:${crawlId}:wacz`, this.dedupeKeyIndex, JSON.stringify(value), ); - - await this.commitDedupeDone(); } // COMMIT DEDUPE TO SHARED INDEX @@ -350,9 +345,12 @@ export class RedisDedupeIndex { await this.dedupeRedis.lpush(this.sourceQ, data); } - async addImportedSourceForDedupe(key: string, entry: DedupeSourceEntry) { + async addImportedSourceForDedupe(crawlId: string, entry: DedupeSourceEntry) { return ( - (await this.dedupeRedis.rpush(`c:${key}:wacz`, JSON.stringify(entry))) - 1 + (await this.dedupeRedis.rpush( + `c:${crawlId}:wacz`, + JSON.stringify(entry), + )) - 1 ); } @@ -808,7 +806,6 @@ return inx; "state", ); } - await this.addSourceForDedupe(this.waczFilename!); return this.waczFilename!; } @@ -1449,29 +1446,32 @@ return inx; ); } - // DEPENDENT CRAWLS FOR DEDUPE - async addDupeCrawlRef(crawlId: string, index: string) { + // DEPENDENT CRAWLS FOR DEDUPE (requires WACZ) + async addDupeCrawlDependency(crawlId: string, index: string) { await this.redis.sadd(`${this.uid}:duperef`, crawlId + " " + index); await this.redis.sadd(`${this.crawlId}:reqCrawls`, crawlId); } - async clearDupeFileRef() { - await this.redis.del(`${this.uid}:duperef`); - } + // async clearDupeCrawlDependency() { + // await this.redis.del(`${this.uid}:duperef`); + // } - async getDupeDependentSources() { + // Requires crawling with WACZ to match dependencies + async getDupeDependentCrawls() { const dependRefs = await this.redis.smembers(`${this.uid}:duperef`); const crawlIds = []; for (const value of dependRefs) { const [crawlId, index] = value.split(" "); - const source = await this.dedupeRedis.lindex( - `c:${crawlId}:wacz`, - Number(index), - ); - if (crawlId && crawlId !== this.crawlId && source) { - const entry = JSON.parse(source); - 
entry.crawlId = crawlId; - crawlIds.push(entry); + if (crawlId && crawlId !== this.crawlId) { + const source = await this.dedupeRedis.lindex( + `c:${crawlId}:wacz`, + Number(index), + ); + if (source) { + const entry = JSON.parse(source); + entry.crawlId = crawlId; + crawlIds.push(entry); + } } } return crawlIds; diff --git a/tests/dedupe-basic.test.js b/tests/dedupe-basic.test.js index 759b1c89..2c2526d1 100644 --- a/tests/dedupe-basic.test.js +++ b/tests/dedupe-basic.test.js @@ -28,10 +28,10 @@ afterAll(async () => { execSync("docker network rm dedupe"); }); -function runCrawl(name, db="0") { +function runCrawl(name, {db = 0, limit = 4, wacz = true} = {}) { fs.rmSync(`./test-crawls/collections/${name}`, { recursive: true, force: true }); - const crawler = exec(`docker run --rm -v $PWD/test-crawls:/crawls --network=dedupe -e CRAWL_ID=${name} webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --limit 4 --exclude community --collection ${name} --redisDedupeUrl redis://dedupe-redis:6379/${db} --generateWACZ`); + const crawler = exec(`docker run --rm -v $PWD/test-crawls:/crawls --network=dedupe -e CRAWL_ID=${name} webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --limit ${limit} --exclude community --collection ${name} --redisDedupeUrl redis://dedupe-redis:6379/${db} ${wacz ? "--generateWACZ" : ""}`); return new Promise((resolve) => { crawler.on("exit", (code) => { @@ -54,6 +54,16 @@ function loadFirstWARC(name) { return parser; } +function deleteFirstWARC(name) { + const archiveWarcLists = fs.readdirSync( + `test-crawls/collections/${name}/archive`, + ); + + const warcName = path.join(`test-crawls/collections/${name}/archive`, archiveWarcLists[0]); + + fs.unlinkSync(warcName); +} + function loadDataPackageRelated(name) { execSync( `unzip test-crawls/collections/${name}/${name}.wacz -d test-crawls/collections/${name}/wacz`, @@ -67,11 +77,60 @@ function loadDataPackageRelated(name) { return dataPackageJSON.relation; } +test("check revisit records written on duplicate crawl, same collection, no wacz", async () => { + + const collName = "dedupe-test-same-coll"; + + expect(await runCrawl(collName, {limit: 1, wacz: false})).toBe(0); + + let statusCode = -1; + + let response = 0; + let revisit = 0; + + const parserOrig = loadFirstWARC(collName); + + for await (const record of parserOrig) { + if (record.warcTargetURI && record.warcTargetURI.startsWith("urn:")) { + continue; + } + + if (record.warcType === "response") { + response++; + } + } + + deleteFirstWARC(collName); + + expect(await runCrawl(collName, {limit: 1, wacz: false})).toBe(0); + + const dupeOrig = loadFirstWARC(collName); + + for await (const record of dupeOrig) { + if (record.warcTargetURI && record.warcTargetURI.startsWith("urn:")) { + continue; + } + + if (record.warcType === "revisit") { + revisit++; + } + } + + expect(response).toBeGreaterThan(0); + + // revisits should match number of responses for non urn: + expect(response).toBe(revisit); + + numResponses = response; +}); + + + -test("check revisit records written on duplicate crawl", async () => { +test("check revisit records written on duplicate crawl, different collections, with wacz", async () => { - expect(await runCrawl("dedupe-test-orig")).toBe(0); - expect(await runCrawl("dedupe-test-dupe")).toBe(0); + expect(await runCrawl("dedupe-test-orig", {db: 1})).toBe(0); + expect(await runCrawl("dedupe-test-dupe", {db: 1})).toBe(0); let statusCode = -1; @@ -111,11 +170,11 @@ test("check revisit records written on duplicate crawl", 
async () => { }); -test("import index and crawl dupe", async () => { +test("import dupe index from wacz", async () => { - execSync(`docker run --rm -v $PWD/test-crawls:/crawls --network=dedupe webrecorder/browsertrix-crawler indexer --sourceUrl /crawls/collections/dedupe-test-orig/dedupe-test-orig.wacz --sourceCrawlId dedupe-test-orig --redisDedupeUrl redis://dedupe-redis:6379/1`); + execSync(`docker run --rm -v $PWD/test-crawls:/crawls --network=dedupe webrecorder/browsertrix-crawler indexer --sourceUrl /crawls/collections/dedupe-test-orig/dedupe-test-orig.wacz --sourceCrawlId dedupe-test-orig --redisDedupeUrl redis://dedupe-redis:6379/2`); - const redis = new Redis("redis://127.0.0.1:37379/1", { lazyConnect: true, retryStrategy: () => null }); + const redis = new Redis("redis://127.0.0.1:37379/2", { lazyConnect: true, retryStrategy: () => null }); await redis.connect({maxRetriesPerRequest: 50}); @@ -123,8 +182,8 @@ test("import index and crawl dupe", async () => { }); -test("imported crawl dupe matches previous dupe count", async () => { - expect(await runCrawl("dedupe-test-dupe-2", 1)).toBe(0); +test("verify crawl with imported dupe index has same dupes as dedupe against original", async () => { + expect(await runCrawl("dedupe-test-dupe-2", {db: 2})).toBe(0); const dupeOrig = loadFirstWARC("dedupe-test-dupe-2"); From f68175f74a0ad5bc05bfe4ee17fb16dbc47d5e5a Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 9 Dec 2025 16:20:19 -0800 Subject: [PATCH 23/29] don't include current crawl as self-reference dependency --- src/util/state.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/util/state.ts b/src/util/state.ts index 013856fb..951421e2 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -1448,8 +1448,10 @@ return inx; // DEPENDENT CRAWLS FOR DEDUPE (requires WACZ) async addDupeCrawlDependency(crawlId: string, index: string) { - await this.redis.sadd(`${this.uid}:duperef`, crawlId + " " + index); - await this.redis.sadd(`${this.crawlId}:reqCrawls`, crawlId); + if (crawlId !== this.crawlId) { + await this.redis.sadd(`${this.uid}:duperef`, crawlId + " " + index); + await this.redis.sadd(`${this.crawlId}:reqCrawls`, crawlId); + } } // async clearDupeCrawlDependency() { From 36d0020354f6e43555e5029446bcab0fc6ee18b8 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 10 Dec 2025 12:40:44 -0800 Subject: [PATCH 24/29] stats: - compute totalUrls, totalSize, uniqSize (uniqUrls = number of hashes) in per crawl key - add stats on crawl commit, remove on crawl remove - tests: update tests to check stats --- src/indexer.ts | 2 ++ src/util/constants.ts | 1 + src/util/recorder.ts | 16 +++++++++++- src/util/state.ts | 51 +++++++++++++++++++++++++++++++++++--- src/util/warcwriter.ts | 13 +++++++++- tests/dedupe-basic.test.js | 26 +++++++++++++++++++ 6 files changed, 104 insertions(+), 5 deletions(-) diff --git a/src/indexer.ts b/src/indexer.ts index ac31f6fb..1f15ff6f 100644 --- a/src/indexer.ts +++ b/src/indexer.ts @@ -183,11 +183,13 @@ export class CrawlIndexer { // only adding originals to dedupe against, don't want to dedupe against existing revisits if (cdx.mime === "warc/revisit") { + await dedupeIndex.addStats(true, cdx.length, crawlId, true); continue; } if (url && date && hash) { await dedupeIndex.addHashDupe(hash, url, date, crawlId, true); + await dedupeIndex.addStats(false, cdx.length, crawlId, true); } else { logger.warn("Skipping invalid CDXJ, data missing", { url, diff --git a/src/util/constants.ts b/src/util/constants.ts index 4784d9b6..f2898803 
100644 --- a/src/util/constants.ts +++ b/src/util/constants.ts @@ -24,6 +24,7 @@ export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"]; export const DUPE_ALL_HASH_KEY = "alldupes"; export const DUPE_ALL_CRAWLS = "allcrawls"; +export const DUPE_ALL_COUNTS = "allcounts"; export enum BxFunctionBindings { BehaviorLogFunc = "__bx_log", diff --git a/src/util/recorder.ts b/src/util/recorder.ts index 4d9a72f5..50b8674f 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -854,6 +854,7 @@ export class Recorder extends EventEmitter { errorReason, }); await this.crawlState.addDupeCrawlDependency(crawlId, index); + await this.crawlState.addStats(true, reqresp.payload.length); return true; } } @@ -1753,7 +1754,20 @@ export class Recorder extends EventEmitter { ); } - this.writer.writeRecordPair(responseRecord, requestRecord, serializer); + const addStatsCallback = async (size: number) => { + try { + await this.crawlState.addStats(isDupe, size); + } catch (e) { + logger.warn("Error updating dedupe size", e, "recorder"); + } + }; + + this.writer.writeRecordPair( + responseRecord, + requestRecord, + serializer, + addStatsCallback, + ); this.addPageRecord(reqresp); diff --git a/src/util/state.ts b/src/util/state.ts index 951421e2..1b825d87 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -9,6 +9,7 @@ import { ROBOTS_CACHE_LIMIT, DUPE_ALL_HASH_KEY, DUPE_ALL_CRAWLS, + DUPE_ALL_COUNTS, } from "./constants.js"; import { ScopedSeed } from "./seeds.js"; import { Frame } from "puppeteer-core"; @@ -294,6 +295,9 @@ export class RedisDedupeIndex { // add to crawls list await this.dedupeRedis.sadd(DUPE_ALL_CRAWLS, this.crawlId); + + // add counts + await this.addRemoveCrawlCounts(this.crawlId); } // GET OR ADD INDIVIDUAL HASHES @@ -324,14 +328,52 @@ export class RedisDedupeIndex { url: string, date: string, crawlId?: string, - commit = false, + commitToAllKey = false, ) { date = date.replace(/[^\d]/g, ""); hash = hash.split(":").at(-1)!; const val = `${this.dedupeKeyIndex} ${date} ${url}`; crawlId = crawlId || this.crawlId; - if ((await this.dedupeRedis.hsetnx(`h:${crawlId}`, hash, val)) && commit) { - await this.dedupeRedis.hsetnx(DUPE_ALL_HASH_KEY, hash, crawlId); + if (await this.dedupeRedis.hsetnx(`h:${crawlId}`, hash, val)) { + // first time seeing hash + if (commitToAllKey) { + await this.dedupeRedis.hsetnx(DUPE_ALL_HASH_KEY, hash, crawlId); + } + } + } + + // COUNT STATS + async addStats( + isDupe: boolean, + size: number, + crawlId?: string, + commitToAllKey = false, + ) { + crawlId = crawlId || this.crawlId; + if (isDupe) { + await this.dedupeRedis.hincrby(`h:${crawlId}:counts`, "uniqSize", size); + if (commitToAllKey) { + await this.dedupeRedis.hincrby(DUPE_ALL_COUNTS, "uniqSize", size); + } + } + await this.dedupeRedis.hincrby(`h:${crawlId}:counts`, "totalSize", size); + await this.dedupeRedis.hincrby(`h:${crawlId}:counts`, "totalUrls", 1); + if (commitToAllKey) { + await this.dedupeRedis.hincrby(DUPE_ALL_COUNTS, "totalSize", size); + await this.dedupeRedis.hincrby(DUPE_ALL_COUNTS, "totalUrls", 1); + } + } + + async addRemoveCrawlCounts(crawlId: string, remove = false) { + // add or remove counts + const factor = remove ? 
-1 : 1; + const counts = await this.dedupeRedis.hgetall(`h:${crawlId}:counts`); + for (const [key, value] of Object.entries(counts)) { + await this.dedupeRedis.hincrby( + DUPE_ALL_COUNTS, + key, + Number(value) * factor, + ); } } @@ -444,6 +486,9 @@ export class RedisDedupeIndex { } await this.dedupeRedis.del(`h:${crawlId}`, `c:${crawlId}:wacz`); await this.dedupeRedis.srem(DUPE_ALL_CRAWLS, crawlId); + + // remove counts + await this.addRemoveCrawlCounts(crawlId, true); } } } diff --git a/src/util/warcwriter.ts b/src/util/warcwriter.ts index 7c26b60c..7bfdb7f8 100644 --- a/src/util/warcwriter.ts +++ b/src/util/warcwriter.ts @@ -138,9 +138,15 @@ export class WARCWriter implements IndexerOffsetLength { responseRecord: WARCRecord, requestRecord: WARCRecord, responseSerializer: WARCSerializer | undefined = undefined, + callback: ((length: number, offset: number) => void) | undefined, ) { this.addToQueue(() => - this._writeRecordPair(responseRecord, requestRecord, responseSerializer), + this._writeRecordPair( + responseRecord, + requestRecord, + responseSerializer, + callback, + ), ); } @@ -148,6 +154,7 @@ export class WARCWriter implements IndexerOffsetLength { responseRecord: WARCRecord, requestRecord: WARCRecord, responseSerializer: WARCSerializer | undefined = undefined, + callback: ((length: number, offset: number) => void) | undefined, ) { const opts = this.useSHA1 ? { @@ -171,6 +178,10 @@ export class WARCWriter implements IndexerOffsetLength { responseSerializer, ); + if (callback) { + callback(this.recordLength, this.offset); + } + this._writeCDX(responseRecord); if (requestRecord.httpHeaders?.method !== "GET") { diff --git a/tests/dedupe-basic.test.js b/tests/dedupe-basic.test.js index 2c2526d1..6531ce57 100644 --- a/tests/dedupe-basic.test.js +++ b/tests/dedupe-basic.test.js @@ -77,6 +77,26 @@ function loadDataPackageRelated(name) { return dataPackageJSON.relation; } +async function redisGetHash(key, db=0) { + const redis = new Redis(`redis://127.0.0.1:37379/${db}`, { lazyConnect: true, retryStrategy: () => null }); + + await redis.connect({maxRetriesPerRequest: 50}); + + return await redis.hgetall(key); +} + +async function checkSizeStats(numUniq, key, db, minSizeDiff) { + const result = await redisGetHash(key, db); + console.log(numUniq, result); + expect(numUniq).toBeLessThan(Number(result.totalUrls)); + + const uniqSize = Number(result.uniqSize); + const totalSize = Number(result.totalSize); + + expect(uniqSize).toBeLessThan(totalSize); + expect(totalSize - uniqSize).toBeGreaterThan(minSizeDiff); +} + test("check revisit records written on duplicate crawl, same collection, no wacz", async () => { const collName = "dedupe-test-same-coll"; @@ -122,6 +142,8 @@ test("check revisit records written on duplicate crawl, same collection, no wacz expect(response).toBe(revisit); numResponses = response; + + await checkSizeStats(numResponses, "allcounts", 0, 180000); }); @@ -167,6 +189,8 @@ test("check revisit records written on duplicate crawl, different collections, w expect(response).toBe(revisit); numResponses = response; + + await checkSizeStats(numResponses, "allcounts", 1, 48400000); }); @@ -201,6 +225,8 @@ test("verify crawl with imported dupe index has same dupes as dedupe against ori // matches same number of revisits as original expect(revisit).toBe(numResponses); + + await checkSizeStats(numResponses, "allcounts", 2, 48400000); }); test("test requires in datapackage.json of wacz deduped against previous crawl", () => { From 1a8fa632dda448b929effb7264ca0fd9c5d01e8f Mon Sep 17 
00:00:00 2001 From: Ilya Kreymer Date: Wed, 10 Dec 2025 15:18:59 -0800 Subject: [PATCH 25/29] uniq -> unique add 'removable' count for number of crawls that can be removed from the index --- src/indexer.ts | 7 +++---- src/util/state.ts | 24 +++++++++++++++++++----- tests/dedupe-basic.test.js | 6 +++--- 3 files changed, 25 insertions(+), 12 deletions(-) diff --git a/src/indexer.ts b/src/indexer.ts index 1f15ff6f..70094129 100644 --- a/src/indexer.ts +++ b/src/indexer.ts @@ -124,10 +124,9 @@ export class CrawlIndexer { } if (params.removing) { - const removeset = await dedupeIndex.getRemoveSet(); - if (removeset.size > 0) { - await dedupeIndex.removeCrawlIds(removeset); - } + await dedupeIndex.purgeUnusedCrawls(); + } else { + await dedupeIndex.countUnusedCrawls(); } logger.info("Done!"); diff --git a/src/util/state.ts b/src/util/state.ts index 1b825d87..baaaa9d6 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -351,9 +351,9 @@ export class RedisDedupeIndex { ) { crawlId = crawlId || this.crawlId; if (isDupe) { - await this.dedupeRedis.hincrby(`h:${crawlId}:counts`, "uniqSize", size); + await this.dedupeRedis.hincrby(`h:${crawlId}:counts`, "uniqueSize", size); if (commitToAllKey) { - await this.dedupeRedis.hincrby(DUPE_ALL_COUNTS, "uniqSize", size); + await this.dedupeRedis.hincrby(DUPE_ALL_COUNTS, "uniqueSize", size); } } await this.dedupeRedis.hincrby(`h:${crawlId}:counts`, "totalSize", size); @@ -451,10 +451,24 @@ export class RedisDedupeIndex { await this.dedupeRedis.sadd("noremove", crawlId); } - async getRemoveSet() { - const removeSet = await this.dedupeRedis.sdiff(DUPE_ALL_CRAWLS, "noremove"); + async purgeUnusedCrawls() { + const removeSet = new Set( + await this.dedupeRedis.sdiff(DUPE_ALL_CRAWLS, "noremove"), + ); + + if (removeSet.size > 0) { + await this.removeCrawlIds(removeSet); + } + await this.dedupeRedis.del("noremove"); - return new Set(removeSet); + await this.dedupeRedis.hset(DUPE_ALL_COUNTS, "removable", 0); + } + + async countUnusedCrawls() { + const removeSet = new Set( + await this.dedupeRedis.sdiff(DUPE_ALL_CRAWLS, "noremove"), + ); + await this.dedupeRedis.hset(DUPE_ALL_COUNTS, "removable", removeSet.size); } async removeCrawlIds(toRemove: Set) { diff --git a/tests/dedupe-basic.test.js b/tests/dedupe-basic.test.js index 6531ce57..91a57c2c 100644 --- a/tests/dedupe-basic.test.js +++ b/tests/dedupe-basic.test.js @@ -90,11 +90,11 @@ async function checkSizeStats(numUniq, key, db, minSizeDiff) { console.log(numUniq, result); expect(numUniq).toBeLessThan(Number(result.totalUrls)); - const uniqSize = Number(result.uniqSize); + const uniqueSize = Number(result.uniqueSize); const totalSize = Number(result.totalSize); - expect(uniqSize).toBeLessThan(totalSize); - expect(totalSize - uniqSize).toBeGreaterThan(minSizeDiff); + expect(uniqueSize).toBeLessThan(totalSize); + expect(totalSize - uniqueSize).toBeGreaterThan(minSizeDiff); } test("check revisit records written on duplicate crawl, same collection, no wacz", async () => { From 60c9b7d5d067a795d3180b8726d0787628fc0d2b Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 10 Dec 2025 19:01:37 -0800 Subject: [PATCH 26/29] update purging of crawls to readd/recommit from added crawls, instead of removing hashes from removed crawls, as hashes may be present in other crawls remove crawl-specific keys for removed crawls --- src/indexer.ts | 2 +- src/util/state.ts | 83 ++++++++++++++++++++++++----------------------- 2 files changed, 44 insertions(+), 41 deletions(-) diff --git a/src/indexer.ts b/src/indexer.ts index 
70094129..6416fd25 100644 --- a/src/indexer.ts +++ b/src/indexer.ts @@ -74,7 +74,7 @@ export class CrawlIndexer { crawlId: params.sourceCrawlId, })) { await dedupeIndex.queueImportSource(entry.name, JSON.stringify(entry)); - if (params.removing && entry.crawlId) { + if (entry.crawlId) { await dedupeIndex.markNotRemoved(entry.crawlId); } } diff --git a/src/util/state.ts b/src/util/state.ts index baaaa9d6..ed8fed26 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -280,24 +280,23 @@ export class RedisDedupeIndex { // COMMIT DEDUPE TO SHARED INDEX - async commitDedupeDone() { - for await (const hashes of this.dedupeRedis.hscanStream( - `h:${this.crawlId}`, - )) { + async commitDedupeDone(crawlId?: string) { + crawlId = crawlId || this.crawlId; + for await (const hashes of this.dedupeRedis.hscanStream(`h:${crawlId}`)) { let isValue = false; for (const hash of hashes) { if (!isValue) { - await this.dedupeRedis.hsetnx(DUPE_ALL_HASH_KEY, hash, this.crawlId); + await this.dedupeRedis.hsetnx(DUPE_ALL_HASH_KEY, hash, crawlId); } isValue = !isValue; } } // add to crawls list - await this.dedupeRedis.sadd(DUPE_ALL_CRAWLS, this.crawlId); + await this.dedupeRedis.sadd(DUPE_ALL_CRAWLS, crawlId); // add counts - await this.addRemoveCrawlCounts(this.crawlId); + await this.addRemoveCrawlCounts(crawlId); } // GET OR ADD INDIVIDUAL HASHES @@ -452,45 +451,47 @@ export class RedisDedupeIndex { } async purgeUnusedCrawls() { - const removeSet = new Set( - await this.dedupeRedis.sdiff(DUPE_ALL_CRAWLS, "noremove"), + const noRemoveSet = new Set( + await this.dedupeRedis.smembers("noremove"), ); - if (removeSet.size > 0) { - await this.removeCrawlIds(removeSet); - } + await this.clearAndReadd(noRemoveSet); await this.dedupeRedis.del("noremove"); - await this.dedupeRedis.hset(DUPE_ALL_COUNTS, "removable", 0); } async countUnusedCrawls() { - const removeSet = new Set( - await this.dedupeRedis.sdiff(DUPE_ALL_CRAWLS, "noremove"), - ); - await this.dedupeRedis.hset(DUPE_ALL_COUNTS, "removable", removeSet.size); + const removable = + (await this.dedupeRedis.scard(DUPE_ALL_CRAWLS)) - + (await this.dedupeRedis.scard("noremove")); + await this.dedupeRedis.del("noremove"); + await this.dedupeRedis.hset(DUPE_ALL_COUNTS, "removable", removable); } - async removeCrawlIds(toRemove: Set) { - for await (const hashes of this.dedupeRedis.hscanStream( - DUPE_ALL_HASH_KEY, - )) { - let isValue = false; - let key = ""; - for (const hash of hashes) { - if (!isValue) { - key = hash; - } - if (key && isValue && toRemove.has(hash)) { - await this.dedupeRedis.hdel(DUPE_ALL_HASH_KEY, key); - } - isValue = !isValue; - } + async clearAndReadd(readdCrawls: Set) { + const TO_REMOVE_CRAWLS = "to-remove-crawls"; + + await this.dedupeRedis.rename(DUPE_ALL_CRAWLS, TO_REMOVE_CRAWLS); + await this.dedupeRedis.del(DUPE_ALL_HASH_KEY); + await this.dedupeRedis.del(DUPE_ALL_COUNTS); + + // readd all crawls that should be kept + for (const crawlId of readdCrawls) { + await this.commitDedupeDone(crawlId); + await this.dedupeRedis.srem(TO_REMOVE_CRAWLS, crawlId); } - for (const crawlId of toRemove) { - const allWACZ = await this.dedupeRedis.lrange(`c:${crawlId}:wacz`, 0, -1); - for (const waczdata of allWACZ) { + // clear data for remaining + while (true) { + const crawlId = await this.dedupeRedis.spop(TO_REMOVE_CRAWLS); + if (!crawlId) { + break; + } + while (true) { + const waczdata = await this.dedupeRedis.lpop(`c:${crawlId}:wacz`); + if (!waczdata) { + break; + } try { const { filename } = JSON.parse(waczdata); await 
this.dedupeRedis.srem(this.sourceDone, filename); @@ -498,12 +499,14 @@ export class RedisDedupeIndex { // ignore } } - await this.dedupeRedis.del(`h:${crawlId}`, `c:${crawlId}:wacz`); - await this.dedupeRedis.srem(DUPE_ALL_CRAWLS, crawlId); - - // remove counts - await this.addRemoveCrawlCounts(crawlId, true); + await this.dedupeRedis.del( + `h:${crawlId}`, + `c:${crawlId}:wacz`, + `h:${crawlId}:counts`, + ); } + + await this.dedupeRedis.del(TO_REMOVE_CRAWLS); } } From 1eba37aea7a9e6981d0c265119ea369b02759529 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 10 Dec 2025 23:50:56 -0800 Subject: [PATCH 27/29] don't commit to all if will be purged anyway --- src/indexer.ts | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/indexer.ts b/src/indexer.ts index 6416fd25..8061293e 100644 --- a/src/indexer.ts +++ b/src/indexer.ts @@ -113,10 +113,18 @@ export class CrawlIndexer { filename, crawlIdReal, "gzip", + !params.removing, ); } else if (filename.endsWith(".cdx") || filename.endsWith(".cdxj")) { logger.debug("Processing CDX Index", { filename }); - await this.ingestCDXJ(dedupeIndex, loader, filename, crawlIdReal); + await this.ingestCDXJ( + dedupeIndex, + loader, + filename, + crawlIdReal, + "", + !params.removing, + ); } } @@ -140,7 +148,8 @@ export class CrawlIndexer { loader: WACZLoader, filename: string, crawlId: string, - compression?: string, + compression: string, + commitToAllkey: boolean, ) { let reader = await loader.loadFile(filename); @@ -182,12 +191,12 @@ export class CrawlIndexer { // only adding originals to dedupe against, don't want to dedupe against existing revisits if (cdx.mime === "warc/revisit") { - await dedupeIndex.addStats(true, cdx.length, crawlId, true); + await dedupeIndex.addStats(true, cdx.length, crawlId, commitToAllkey); continue; } if (url && date && hash) { - await dedupeIndex.addHashDupe(hash, url, date, crawlId, true); + await dedupeIndex.addHashDupe(hash, url, date, crawlId, commitToAllkey); await dedupeIndex.addStats(false, cdx.length, crawlId, true); } else { logger.warn("Skipping invalid CDXJ, data missing", { From f00d791e1bff55dedf4f9cf8fcbee92d9d86778c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 11 Dec 2025 10:37:53 -0800 Subject: [PATCH 28/29] fix size count typo, unique == not dupe! 
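The bug being fixed: addStats() incremented uniqueSize when isDupe was true, so the
"unique" counter was actually tracking duplicated bytes. After this change uniqueSize
only grows for records whose payload hash has not been seen before, while totalSize and
totalUrls grow for every record, so (totalSize - uniqueSize) is roughly the bytes saved
by dedupe (this is what the updated tests assert).

A minimal sketch of the intended accounting, for illustration only — a standalone
function rather than the real RedisDedupeIndex.addStats() method, and it omits the
optional mirroring of counts into the shared "allcounts" hash:

    import { Redis } from "ioredis";

    // per-crawl counts live in the hash `h:<crawlId>:counts`
    async function addStats(
      redis: Redis,
      crawlId: string,
      isDupe: boolean,
      size: number,
    ) {
      const key = `h:${crawlId}:counts`;
      if (!isDupe) {
        // only payloads not already in the index count toward uniqueSize
        await redis.hincrby(key, "uniqueSize", size);
      }
      await redis.hincrby(key, "totalSize", size); // every written record
      await redis.hincrby(key, "totalUrls", 1);
    }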
--- src/util/state.ts | 3 ++- tests/dedupe-basic.test.js | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/util/state.ts b/src/util/state.ts index ed8fed26..7165b5c2 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -349,7 +349,8 @@ export class RedisDedupeIndex { commitToAllKey = false, ) { crawlId = crawlId || this.crawlId; - if (isDupe) { + // if not a dupe, add to unique size count + if (!isDupe) { await this.dedupeRedis.hincrby(`h:${crawlId}:counts`, "uniqueSize", size); if (commitToAllKey) { await this.dedupeRedis.hincrby(DUPE_ALL_COUNTS, "uniqueSize", size); diff --git a/tests/dedupe-basic.test.js b/tests/dedupe-basic.test.js index 91a57c2c..52e8240b 100644 --- a/tests/dedupe-basic.test.js +++ b/tests/dedupe-basic.test.js @@ -143,7 +143,7 @@ test("check revisit records written on duplicate crawl, same collection, no wacz numResponses = response; - await checkSizeStats(numResponses, "allcounts", 0, 180000); + await checkSizeStats(numResponses, "allcounts", 0, 10000); }); @@ -190,7 +190,7 @@ test("check revisit records written on duplicate crawl, different collections, w numResponses = response; - await checkSizeStats(numResponses, "allcounts", 1, 48400000); + await checkSizeStats(numResponses, "allcounts", 1, 27000); }); @@ -226,7 +226,7 @@ test("verify crawl with imported dupe index has same dupes as dedupe against ori // matches same number of revisits as original expect(revisit).toBe(numResponses); - await checkSizeStats(numResponses, "allcounts", 2, 48400000); + await checkSizeStats(numResponses, "allcounts", 2, 27000); }); test("test requires in datapackage.json of wacz deduped against previous crawl", () => { From 40983f1670bea4eb2f966e5f76f522afeb8e754f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 11 Dec 2025 10:46:23 -0800 Subject: [PATCH 29/29] add urlNormalize to addHashDupe --- src/util/state.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/src/util/state.ts b/src/util/state.ts index 7165b5c2..5ba01a24 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -329,6 +329,7 @@ export class RedisDedupeIndex { crawlId?: string, commitToAllKey = false, ) { + url = normalizeUrl(url, normalizeUrlOpts); date = date.replace(/[^\d]/g, ""); hash = hash.split(":").at(-1)!; const val = `${this.dedupeKeyIndex} ${date} ${url}`;