Commit 5d9c62e

Retry Failed Pages + Ignore Hashtags in Redirect Check (#739)
- Retry pages that are marked as failed once, at the end of the crawl, in case the failure was due to a timeout.
- Don't treat a difference in hashtag between the seed page as loaded and the actual URL as a redirect (e.g. don't add it as a new seed).
1 parent bc4a958 commit 5d9c62e

File tree: 2 files changed (+65, −6 lines)

src/crawler.ts

Lines changed: 7 additions & 2 deletions
@@ -1938,10 +1938,15 @@ self.__bx_behaviors.selectMainBehavior();
       throw new Error("no response for page load, assuming failed");
     }
 
-    const respUrl = resp.url();
+    const respUrl = resp.url().split("#")[0];
     const isChromeError = page.url().startsWith("chrome-error://");
 
-    if (depth === 0 && !isChromeError && respUrl !== url && !downloadResponse) {
+    if (
+      depth === 0 &&
+      !isChromeError &&
+      respUrl !== url.split("#")[0] &&
+      !downloadResponse
+    ) {
       data.seedId = await this.crawlState.addExtraSeed(
         this.seeds,
         this.numOriginalSeeds,
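The crawler.ts change strips the URL fragment from both the response URL and the seed URL before comparing them, so a page that differs from its seed only by hashtag is no longer treated as a redirect (and no extra seed is added). A minimal standalone sketch of the same comparison; the helper name is hypothetical and not part of the crawler:

// Hypothetical helper mirroring the comparison above: the fragment ("#...") is
// dropped from both URLs, so a hashtag-only difference does not count as a redirect.
function isSameIgnoringFragment(seedUrl: string, respUrl: string): boolean {
  return seedUrl.split("#")[0] === respUrl.split("#")[0];
}

isSameIgnoringFragment("https://example.com/page", "https://example.com/page#section"); // true: not a redirect
isSameIgnoringFragment("https://example.com/page", "https://example.com/other");        // false: still a redirect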

src/util/state.ts

Lines changed: 58 additions & 4 deletions
@@ -123,6 +123,13 @@ declare module "ioredis" {
     state: string,
   ): Result<void, Context>;
 
+  requeuefailed(
+    fkey: string,
+    qkey: string,
+    maxRetryPending: number,
+    maxRegularDepth: number,
+  ): Result<number, Context>;
+
   unlockpending(
     pkeyUrl: string,
     uid: string,
@@ -283,6 +290,27 @@ if json then
   redis.call('hdel', KEYS[1], ARGV[1]);
 end
 
+`,
+});
+
+redis.defineCommand("requeuefailed", {
+  numberOfKeys: 2,
+  lua: `
+local json = redis.call('rpop', KEYS[1]);
+
+if json then
+  local data = cjson.decode(json);
+  data['retry'] = (data['retry'] or 0) + 1;
+  if tonumber(data['retry']) <= tonumber(ARGV[1]) then
+    json = cjson.encode(data);
+    local score = (data['depth'] or 0) + ((data['extraHops'] or 0) * ARGV[2]);
+    redis.call('zadd', KEYS[2], score, json);
+    return 1;
+  else
+    return 2;
+  end
+end
+return 0;
 `,
 });
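The new requeuefailed script pops one entry off the failed list (KEYS[1]), increments its retry counter, and, if the counter is still within maxRetryPending (ARGV[1]), re-adds it to the priority queue (KEYS[2]) with the same depth/extraHops-based score used for regular queueing. It returns 1 when a URL was requeued, 2 when the retry limit was exceeded, and 0 when the failed list is empty. A rough TypeScript restatement of the scoring, with an assumed maxRegularDepth value for illustration (the actual MAX_DEPTH constant is not shown in this diff):

// Illustrative restatement of the Lua scoring: score = depth + extraHops * maxRegularDepth,
// so pages reached via extra hops always sort after all regular-depth pages.
function requeueScore(depth: number, extraHops: number, maxRegularDepth: number): number {
  return (depth || 0) + (extraHops || 0) * maxRegularDepth;
}

requeueScore(2, 0, 1_000_000); // 2: a retried depth-2 page keeps its normal priority
requeueScore(0, 1, 1_000_000); // 1_000_000: an extra-hop page sorts after every regular page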

@@ -543,18 +571,44 @@ return inx;
   }
 
   async nextFromQueue() {
-    const json = await this._getNext();
+    let json = await this._getNext();
+    let retryFailed = false;
+
+    if (!json) {
+      const res = await this.redis.requeuefailed(
+        this.fkey,
+        this.qkey,
+        this.maxRetryPending,
+        MAX_DEPTH,
+      );
+
+      switch (res) {
+        case 1:
+          json = await this._getNext();
+          retryFailed = true;
+          break;
+
+        case 2:
+          logger.debug("Did not retry failed, already retried", {}, "state");
+          return null;
+      }
+    }
+
+    if (!json) {
+      return null;
+    }
+
     let data;
 
     try {
       data = JSON.parse(json);
     } catch (e) {
-      logger.error("Invalid queued json", json, "redis");
+      logger.error("Invalid queued json", json, "state");
       return null;
     }
 
-    if (!data) {
-      return null;
+    if (retryFailed) {
+      logger.debug("Retrying failed URL", { url: data.url }, "state");
     }
 
     await this.markStarted(data.url);
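Taken together, nextFromQueue() now consults the failed list only once the regular queue is empty, so failed pages are retried at the end of the crawl, up to maxRetryPending additional attempts each (once, per this commit's description). When the limit is exceeded the script returns 2 and the URL is not requeued again, and a retried URL is logged at debug level so the retry is visible in the crawl log.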
