Skip to content

Commit

Permalink
don't count retried pages multiple times
Browse files Browse the repository at this point in the history
  • Loading branch information
ikreymer committed Jan 19, 2025
1 parent 3539755 commit 7a0dda6
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 14 deletions.
6 changes: 3 additions & 3 deletions src/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1156,12 +1156,12 @@ self.__bx_behaviors.selectMainBehavior();

// if page loaded, considered page finished successfully
// (even if behaviors timed out)
const { loadState, logDetails, depth, url } = data;
const { loadState, logDetails, depth, url, retry } = data;

if (data.loadState >= LoadState.FULL_PAGE_LOADED) {
logger.info("Page Finished", { loadState, ...logDetails }, "pageStatus");

await this.crawlState.markFinished(url);
await this.crawlState.markFinished(url, retry);

if (this.healthChecker) {
this.healthChecker.resetErrors();
Expand All @@ -1171,7 +1171,7 @@ self.__bx_behaviors.selectMainBehavior();

await this.checkLimits();
} else {
await this.crawlState.markFailed(url);
await this.crawlState.markFailed(url, retry);

if (this.healthChecker) {
this.healthChecker.incError();
Expand Down
24 changes: 13 additions & 11 deletions src/util/state.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ export type QueueEntry = {
extraHops: number;
ts?: number;
pageid?: string;
retry?: number;
};

// ============================================================================
Expand All @@ -54,6 +55,7 @@ export class PageState {
seedId: number;
depth: number;
extraHops: number;
retry: number;

status: number;

Expand Down Expand Up @@ -87,6 +89,7 @@ export class PageState {
}
this.pageid = redisData.pageid || uuidv4();
this.status = 0;
this.retry = redisData.retry || 0;
}
}

Expand Down Expand Up @@ -115,13 +118,7 @@ declare module "ioredis" {
uid: string,
): Result<void, Context>;

movefailed(
pkey: string,
fkey: string,
url: string,
value: string,
state: string,
): Result<void, Context>;
movefailed(pkey: string, fkey: string, url: string): Result<void, Context>;

requeuefailed(
fkey: string,
Expand Down Expand Up @@ -283,7 +280,6 @@ local json = redis.call('hget', KEYS[1], ARGV[1]);
if json then
local data = cjson.decode(json);
data[ARGV[3]] = ARGV[2];
json = cjson.encode(data);
redis.call('lpush', KEYS[2], json);
Expand Down Expand Up @@ -375,15 +371,21 @@ return inx;
);
}

async markFinished(url: string) {
// Clears `url` from the hash stored at `this.pkey` and reports a count.
//
// When `retry` is truthy (non-zero), the counter at `this.dkey` is left
// untouched and the literal 1 is returned, so a page that is being retried
// is not counted a second time. Otherwise the counter at `this.dkey` is
// incremented and its new value is returned.
async markFinished(url: string, retry: number) {
  await this.redis.hdel(this.pkey, url);

  const isRetry = Boolean(retry);
  return isRetry ? 1 : await this.redis.incr(this.dkey);
}

async markFailed(url: string) {
await this.redis.movefailed(this.pkey, this.fkey, url, "1", "failed");
// Hands `url` to the custom `movefailed` Redis command together with the
// `this.pkey` and `this.fkey` keys, then reports a count.
// NOTE(review): presumably `movefailed` moves the entry from the pending hash
// to the failed list; confirm against the Lua script definition.
//
// Only a first attempt (falsy `retry`) increments the counter at `this.dkey`
// and returns its new value; for a retry the counter is not touched and the
// literal 1 is returned instead.
async markFailed(url: string, retry: number) {
  await this.redis.movefailed(this.pkey, this.fkey, url);

  if (!retry) {
    return await this.redis.incr(this.dkey);
  }
  return 1;
}

Expand Down

0 comments on commit 7a0dda6

Please sign in to comment.