From 7a0dda676336502ec613b5e22f3a79227646096c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 18 Jan 2025 17:11:48 -0800 Subject: [PATCH] don't count retried pages multiple times --- src/crawler.ts | 6 +++--- src/util/state.ts | 24 +++++++++++++----------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/src/crawler.ts b/src/crawler.ts index 1e84d6de..638bcba5 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -1156,12 +1156,12 @@ self.__bx_behaviors.selectMainBehavior(); // if page loaded, considered page finished successfully // (even if behaviors timed out) - const { loadState, logDetails, depth, url } = data; + const { loadState, logDetails, depth, url, retry } = data; if (data.loadState >= LoadState.FULL_PAGE_LOADED) { logger.info("Page Finished", { loadState, ...logDetails }, "pageStatus"); - await this.crawlState.markFinished(url); + await this.crawlState.markFinished(url, retry); if (this.healthChecker) { this.healthChecker.resetErrors(); @@ -1171,7 +1171,7 @@ self.__bx_behaviors.selectMainBehavior(); await this.checkLimits(); } else { - await this.crawlState.markFailed(url); + await this.crawlState.markFailed(url, retry); if (this.healthChecker) { this.healthChecker.incError(); diff --git a/src/util/state.ts b/src/util/state.ts index 3f5ad807..37d595d8 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -35,6 +35,7 @@ export type QueueEntry = { extraHops: number; ts?: number; pageid?: string; + retry?: number; }; // ============================================================================ @@ -54,6 +55,7 @@ export class PageState { seedId: number; depth: number; extraHops: number; + retry: number; status: number; @@ -87,6 +89,7 @@ export class PageState { } this.pageid = redisData.pageid || uuidv4(); this.status = 0; + this.retry = redisData.retry || 0; } } @@ -115,13 +118,7 @@ declare module "ioredis" { uid: string, ): Result; - movefailed( - pkey: string, - fkey: string, - url: string, - value: string, - state: string, - ): Result; + movefailed(pkey: string, fkey: string, url: string): Result; requeuefailed( fkey: string, @@ -283,7 +280,6 @@ local json = redis.call('hget', KEYS[1], ARGV[1]); if json then local data = cjson.decode(json); - data[ARGV[3]] = ARGV[2]; json = cjson.encode(data); redis.call('lpush', KEYS[2], json); @@ -375,15 +371,21 @@ return inx; ); } - async markFinished(url: string) { + async markFinished(url: string, retry: number) { await this.redis.hdel(this.pkey, url); + if (retry) { + return 1; + } return await this.redis.incr(this.dkey); } - async markFailed(url: string) { - await this.redis.movefailed(this.pkey, this.fkey, url, "1", "failed"); + async markFailed(url: string, retry: number) { + await this.redis.movefailed(this.pkey, this.fkey, url); + if (retry) { + return 1; + } return await this.redis.incr(this.dkey); }