From 449553260648b09ac2b9950adcfcb997ee175a7c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 26 Jun 2024 09:16:24 -0700 Subject: [PATCH] Always download PDF + non HTML page cleanup + enterprise policy cleanup (#629) Adds enterprise policy to always download PDF and sets download dir to /dev/null Moves policies to chromium.json and brave.json for clarity Further cleanup of non-HTML loading path: - sets downloadResponse when page load is aborted but response is actually download - sets firstResponse when first response finishes, but page doesn't fully load - logs that non-HTML pages skip all post-crawl behaviors in one place - move page extra delay to separate awaitPageExtraDelay() function, applied for all pages (while post-load delay only applied to HTML pages) --------- Co-authored-by: Tessa Walsh --- .../{brave-default.json => brave.json} | 3 +- ...down-profilebrowser.json => chromium.json} | 5 +- src/crawler.ts | 342 ++++++++++-------- src/replaycrawler.ts | 2 + src/util/recorder.ts | 21 +- src/util/reqresp.ts | 19 +- tests/non-html-crawl.test.js | 174 +++++++++ tests/pdf-crawl.test.js | 61 ---- 8 files changed, 393 insertions(+), 234 deletions(-) rename config/policies/{brave-default.json => brave.json} (63%) rename config/policies/{lockdown-profilebrowser.json => chromium.json} (70%) create mode 100644 tests/non-html-crawl.test.js delete mode 100644 tests/pdf-crawl.test.js diff --git a/config/policies/brave-default.json b/config/policies/brave.json similarity index 63% rename from config/policies/brave-default.json rename to config/policies/brave.json index aac2fc245..fec2906b1 100644 --- a/config/policies/brave-default.json +++ b/config/policies/brave.json @@ -2,5 +2,6 @@ "BraveRewardsDisabled": true, "BraveWalletDisabled": true, "BraveVPNDisabled": 1, - "BraveAIChatEnabled": false + "BraveAIChatEnabled": false, + "TorDisabled": true } diff --git a/config/policies/lockdown-profilebrowser.json b/config/policies/chromium.json similarity index 70% rename from config/policies/lockdown-profilebrowser.json rename to config/policies/chromium.json index 0ef3f4aa1..2b7695641 100644 --- a/config/policies/lockdown-profilebrowser.json +++ b/config/policies/chromium.json @@ -1,10 +1,11 @@ { + "AlwaysOpenPdfExternally": true, "NewTabPageLocation": "about:blank", "RestoreOnStartup": 5, "IncognitoModeAvailability": 1, - "TorDisabled": true, "AllowFileSelectionDialogs": false, "URLBlocklist": [ "file://*" - ] + ], + "DownloadDirectory": "/dev/null" } diff --git a/src/crawler.ts b/src/crawler.ts index c7a11abc2..26a28f970 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -51,12 +51,19 @@ import { import { AdBlockRules, BlockRules } from "./util/blockrules.js"; import { OriginOverride } from "./util/originoverride.js"; -import { CDPSession, Frame, HTTPRequest, Page, Protocol } from "puppeteer-core"; +import { + CDPSession, + Frame, + HTTPRequest, + HTTPResponse, + Page, + Protocol, +} from "puppeteer-core"; import { Recorder } from "./util/recorder.js"; import { SitemapReader } from "./util/sitemapper.js"; import { ScopedSeed } from "./util/seeds.js"; import { WARCWriter, createWARCInfo, setWARCInfo } from "./util/warcwriter.js"; -import { isHTMLContentType } from "./util/reqresp.js"; +import { isHTMLMime, isRedirectStatus } from "./util/reqresp.js"; import { initProxy } from "./util/proxy.js"; const behaviors = fs.readFileSync( @@ -842,7 +849,7 @@ self.__bx_behaviors.selectMainBehavior(); ); if (mime) { data.mime = mime; - data.isHTMLPage = isHTMLContentType(mime); + data.isHTMLPage = 
isHTMLMime(mime); } if (fetched) { data.loadState = LoadState.FULL_PAGE_LOADED; @@ -872,18 +879,21 @@ self.__bx_behaviors.selectMainBehavior(); data.favicon = await this.getFavicon(page, logDetails); await this.doPostLoadActions(opts); + + await this.awaitPageExtraDelay(opts); } async doPostLoadActions(opts: WorkerState, saveOutput = false) { const { page, cdp, data, workerid } = opts; const { url } = data; + if (!data.isHTMLPage) { + return; + } + const logDetails = { page: url, workerid }; if (this.params.screenshot && this.screenshotWriter) { - if (!data.isHTMLPage) { - logger.debug("Skipping screenshots for non-HTML page", logDetails); - } const screenshots = new Screenshots({ browser: this.browser, page, @@ -903,7 +913,7 @@ self.__bx_behaviors.selectMainBehavior(); let textextract = null; - if (data.isHTMLPage && this.textWriter) { + if (this.textWriter) { textextract = new TextExtractViaSnapshot(cdp, { writer: this.textWriter, url, @@ -923,13 +933,7 @@ self.__bx_behaviors.selectMainBehavior(); data.loadState = LoadState.EXTRACTION_DONE; if (this.params.behaviorOpts && data.status < 400) { - if (!data.isHTMLPage) { - logger.debug( - "Skipping behaviors for non-HTML page", - logDetails, - "behavior", - ); - } else if (data.skipBehaviors) { + if (data.skipBehaviors) { logger.info("Skipping behaviors for slow page", logDetails, "behavior"); } else { const res = await timedRun( @@ -958,8 +962,17 @@ self.__bx_behaviors.selectMainBehavior(); } } } + } + async awaitPageExtraDelay(opts: WorkerState) { if (this.params.pageExtraDelay) { + const { + data: { url: page }, + workerid, + } = opts; + + const logDetails = { page, workerid }; + logger.info( `Waiting ${this.params.pageExtraDelay} seconds before moving on to next page`, logDetails, @@ -1704,109 +1717,71 @@ self.__bx_behaviors.selectMainBehavior(); const failCrawlOnError = depth === 0 && this.params.failOnFailedSeed; - let ignoreAbort = false; - - // Detect if ERR_ABORTED is actually caused by trying to load a non-page (eg. downloadable PDF), - // if so, don't report as an error + // Attempt to load the page: + // - Already tried direct fetch w/o browser before getting here, and that resulted in an HTML page or non-200 response + // so now loading using the browser + // - If page.load() fails, but downloadResponse is set, then its a download, consider successful + // set page status to FULL_PAGE_LOADED (2) + // - If page.load() fails, but firstResponse is set to CONTENT_LOADED (1) state, + // consider a slow page, proceed to link extraction, but skip behaviors, issue warning + // - If page.load() fails otherwise and if failOnFailedSeed is set, fail crawl, otherwise fail page + // - If page.load() succeeds, check if page url is a chrome-error:// page, fail page (and or crawl if failOnFailedSeed and seed) + // - If at least one response, check if HTML, proceed with post-crawl actions only if HTML. + + let downloadResponse: HTTPResponse | null = null; + let firstResponse: HTTPResponse | null = null; + let fullLoadedResponse: HTTPResponse | null = null; + + // Detect if failure is actually caused by trying to load a non-page (eg. 
downloadable PDF), + // store the downloadResponse, if any page.once("requestfailed", (req: HTTPRequest) => { - ignoreAbort = shouldIgnoreAbort(req, data); + downloadResponse = getDownloadResponse(req); }); - let isHTMLPage = data.isHTMLPage; + // store the first successful non-redirect response, even if page doesn't load fully + const waitFirstResponse = (resp: HTTPResponse) => { + firstResponse = resp; + if (!isRedirectStatus(firstResponse.status())) { + // don't listen to any additional responses + page.off("response", waitFirstResponse); + } + }; - if (isHTMLPage) { - page.once("domcontentloaded", () => { - data.loadState = LoadState.CONTENT_LOADED; - }); - } + page.on("response", waitFirstResponse); + + // store that domcontentloaded was finished + page.once("domcontentloaded", () => { + data.loadState = LoadState.CONTENT_LOADED; + }); - const gotoOpts = isHTMLPage + const gotoOpts = data.isHTMLPage ? this.gotoOpts : { waitUntil: "domcontentloaded" }; logger.info("Awaiting page load", logDetails); try { - const resp = await page.goto(url, gotoOpts); - - if (!resp) { - throw new Error("page response missing"); - } - - const respUrl = resp.url(); - const isChromeError = page.url().startsWith("chrome-error://"); - - if (depth === 0 && !isChromeError && respUrl !== url) { - data.seedId = await this.crawlState.addExtraSeed( - this.seeds, - this.numOriginalSeeds, - data.seedId, - respUrl, - ); - logger.info("Seed page redirected, adding redirected seed", { - origUrl: url, - newUrl: respUrl, - seedId: data.seedId, - }); - } - - const status = resp.status(); - data.status = status; - - let failed = isChromeError; - - if (this.params.failOnInvalidStatus && status >= 400) { - // Handle 4xx or 5xx response as a page load error - failed = true; - } - - if (failed) { - if (failCrawlOnError) { - logger.fatal( - "Seed Page Load Error, failing crawl", - { - status, - ...logDetails, - }, - "general", - 1, - ); - } else { - logger.error( - isChromeError ? 
"Page Crashed on Load" : "Page Invalid Status", - { - status, - ...logDetails, - }, - ); - throw new Error("logged"); - } - } - - const contentType = resp.headers()["content-type"]; - - isHTMLPage = isHTMLContentType(contentType); - - if (contentType) { - data.mime = contentType.split(";")[0]; - } + // store the page load response when page fully loads + fullLoadedResponse = await page.goto(url, gotoOpts); } catch (e) { if (!(e instanceof Error)) { throw e; } const msg = e.message || ""; - if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) { + + // got firstResponse and content loaded, not a failure + if (firstResponse && data.loadState == LoadState.CONTENT_LOADED) { // if timeout error, and at least got to content loaded, continue on - if ( - e.name === "TimeoutError" && - data.loadState == LoadState.CONTENT_LOADED - ) { - logger.warn("Page Loading Slowly, skipping behaviors", { + logger.warn( + "Page load timed out, loading but slowly, skipping behaviors", + { msg, ...logDetails, - }); - data.skipBehaviors = true; - } else if (failCrawlOnError) { + }, + ); + data.skipBehaviors = true; + } else if (!downloadResponse) { + if (failCrawlOnError) { // if fail on error, immediately fail here logger.fatal( "Page Load Timeout, failing crawl", @@ -1817,64 +1792,127 @@ self.__bx_behaviors.selectMainBehavior(); "general", 1, ); - } else { - // log if not already log and rethrow - if (msg !== "logged") { - const loadState = data.loadState; - if (loadState >= LoadState.CONTENT_LOADED) { - logger.warn("Page Load Timeout, skipping further processing", { - msg, - loadState, - ...logDetails, - }); - } else { - logger.error("Page Load Failed, skipping page", { - msg, - loadState, - ...logDetails, - }); - } - e.message = "logged"; - } - throw e; + // log if not already log and rethrow, consider page failed + } else if (msg !== "logged") { + logger.error("Page Load Failed, skipping page", { + msg, + loadState: data.loadState, + ...logDetails, + }); + e.message = "logged"; } + throw e; } } - data.loadState = LoadState.FULL_PAGE_LOADED; + const resp = fullLoadedResponse || downloadResponse || firstResponse; - data.isHTMLPage = isHTMLPage; + if (!resp) { + throw new Error("no response for page load, assuming failed"); + } - if (isHTMLPage) { - const frames = await page.frames(); + const respUrl = resp.url(); + const isChromeError = page.url().startsWith("chrome-error://"); - const filteredFrames = await Promise.allSettled( - frames.map((frame) => this.shouldIncludeFrame(frame, logDetails)), + if (depth === 0 && !isChromeError && respUrl !== url && !downloadResponse) { + data.seedId = await this.crawlState.addExtraSeed( + this.seeds, + this.numOriginalSeeds, + data.seedId, + respUrl, ); + logger.info("Seed page redirected, adding redirected seed", { + origUrl: url, + newUrl: respUrl, + seedId: data.seedId, + }); + } - data.filteredFrames = filteredFrames - .filter((x: PromiseSettledResult) => { - if (x.status === "fulfilled") { - return !!x.value; - } - logger.warn("Error in iframe check", { - reason: x.reason, + const status = resp.status(); + data.status = status; + + let failed = isChromeError; + + if (this.params.failOnInvalidStatus && status >= 400) { + // Handle 4xx or 5xx response as a page load error + failed = true; + } + + if (failed) { + if (failCrawlOnError) { + logger.fatal( + "Seed Page Load Error, failing crawl", + { + status, ...logDetails, - }); - return false; - }) - .map((x) => (x as PromiseFulfilledResult).value); + }, + "general", + 1, + ); + } else { + logger.error( + isChromeError ? 
"Page Crashed on Load" : "Page Invalid Status", + { + status, + ...logDetails, + }, + ); + throw new Error("logged"); + } + } + + const contentType = resp.headers()["content-type"]; - //data.filteredFrames = await page.frames().filter(frame => this.shouldIncludeFrame(frame, logDetails)); + if (contentType) { + data.mime = contentType.split(";")[0]; + data.isHTMLPage = isHTMLMime(data.mime); } else { - data.filteredFrames = []; + // guess that its html if it fully loaded as a page + data.isHTMLPage = !!fullLoadedResponse; + } + + // Full Page Loaded if: + // - it was a download response + // - page.load() succeeded + // but not: + // - if first response was received, but not fully loaded + if (fullLoadedResponse || downloadResponse) { + data.loadState = LoadState.FULL_PAGE_LOADED; } - if (!isHTMLPage) { - logger.debug("Skipping link extraction for non-HTML page", logDetails); + if (!data.isHTMLPage) { + data.filteredFrames = []; + + logger.info( + "Non-HTML Page URL, skipping all post-crawl actions", + { isDownload: !!downloadResponse, mime: data.mime, ...logDetails }, + "pageStatus", + ); return; } + // HTML Pages Only here + const frames = await page.frames(); + + const filteredFrames = await Promise.allSettled( + frames.map((frame) => this.shouldIncludeFrame(frame, logDetails)), + ); + + data.filteredFrames = filteredFrames + .filter((x: PromiseSettledResult) => { + if (x.status === "fulfilled") { + return !!x.value; + } + logger.warn("Error in iframe check", { + reason: x.reason, + ...logDetails, + }); + return false; + }) + .map((x) => (x as PromiseFulfilledResult).value); + + //data.filteredFrames = await page.frames().filter(frame => this.shouldIncludeFrame(frame, logDetails)); + const { seedId } = data; const seed = await this.crawlState.getSeedAt( @@ -2565,35 +2603,39 @@ self.__bx_behaviors.selectMainBehavior(); } } -function shouldIgnoreAbort(req: HTTPRequest, data: PageState) { +function getDownloadResponse(req: HTTPRequest) { try { + if (!req.isNavigationRequest()) { + return null; + } + const failure = req.failure(); const failureText = (failure && failure.errorText) || ""; if ( failureText !== "net::ERR_ABORTED" || req.resourceType() !== "document" ) { - return false; + return null; } const resp = req.response(); - const headers = resp && resp.headers(); - if (!headers) { - return false; + if (!resp) { + return null; } + const headers = resp.headers(); + if ( headers["content-disposition"] || - (headers["content-type"] && !headers["content-type"].startsWith("text/")) + (headers["content-type"] && !isHTMLMime(headers["content-type"])) ) { - data.status = resp.status(); - data.mime = headers["content-type"].split(";")[0]; - return true; + return resp; } } catch (e) { - return false; + console.log(e); + // ignore } - return false; + return null; } diff --git a/src/replaycrawler.ts b/src/replaycrawler.ts index 8f4611ee4..29feff9b1 100644 --- a/src/replaycrawler.ts +++ b/src/replaycrawler.ts @@ -457,6 +457,8 @@ export class ReplayCrawler extends Crawler { await this.doPostLoadActions(opts, true); + await this.awaitPageExtraDelay(opts); + await this.compareScreenshots(page, data, url, date, workerid); await this.compareText(page, data, url, date); diff --git a/src/util/recorder.ts b/src/util/recorder.ts index 7fb1865e3..58c777efb 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -6,7 +6,7 @@ import PQueue from "p-queue"; import { logger, formatErr } from "./logger.js"; import { sleep, timedRun, timestampNow } from "./timing.js"; -import { RequestResponseInfo, 
isHTMLContentType } from "./reqresp.js"; +import { RequestResponseInfo, isHTMLMime } from "./reqresp.js"; import { fetch, Response } from "undici"; @@ -90,6 +90,13 @@ export type DirectFetchRequest = { cdp: CDPSession; }; +// ================================================================= +export type DirectFetchResponse = { + fetched: boolean; + mime: string; + ts: Date; +}; + // ================================================================= export type NetworkLoadAsyncFetchOptions = AsyncFetchOptions & { cdp: CDPSession; @@ -1088,11 +1095,11 @@ export class Recorder { this.writer.writeRecordPair(responseRecord, requestRecord); } - async directFetchCapture({ url, headers, cdp }: DirectFetchRequest): Promise<{ - fetched: boolean; - mime: string; - ts: Date; - }> { + async directFetchCapture({ + url, + headers, + cdp, + }: DirectFetchRequest): Promise { const reqresp = new RequestResponseInfo("0"); const ts = new Date(); @@ -1125,7 +1132,7 @@ export class Recorder { mime = ct.split(";")[0]; } - return !isHTMLContentType(mime); + return !isHTMLMime(mime); }; // ignore dupes: if previous URL was not a page, still load as page. if previous was page, diff --git a/src/util/reqresp.ts b/src/util/reqresp.ts index 0d7e4e541..933b8da36 100644 --- a/src/util/reqresp.ts +++ b/src/util/reqresp.ts @@ -151,7 +151,7 @@ export class RequestResponseInfo { } isRedirectStatus() { - return this.status >= 300 && this.status < 400 && this.status !== 304; + return isRedirectStatus(this.status); } isSelfRedirect() { @@ -375,17 +375,10 @@ export class RequestResponseInfo { } } -export function isHTMLContentType(contentType: string | null) { - // just load if no content-type - if (!contentType) { - return true; - } - - const mime = contentType.split(";")[0]; - - if (HTML_TYPES.includes(mime)) { - return true; - } +export function isHTMLMime(mime: string) { + return HTML_TYPES.includes(mime); +} - return false; +export function isRedirectStatus(status: number) { + return status >= 300 && status < 400 && status !== 304; } diff --git a/tests/non-html-crawl.test.js b/tests/non-html-crawl.test.js new file mode 100644 index 000000000..83da93357 --- /dev/null +++ b/tests/non-html-crawl.test.js @@ -0,0 +1,174 @@ +import child_process from "child_process"; +import fs from "fs"; +import path from "path"; +import { WARCParser } from "warcio"; + +const PDF = "https://specs.webrecorder.net/wacz/1.1.1/wacz-2021.pdf"; +const PDF_HTTP = PDF.replace("https", "http"); + +const XML = "https://webrecorder.net/feed.xml"; +const XML_REDIR = "https://www.webrecorder.net/feed.xml"; + +test("PDF: ensure pdf is crawled", () => { + child_process.execSync( + `docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url "${PDF}" --collection crawl-pdf` + ); +}); + +test("PDF: check that individual WARCs have PDF written as 200 response", async () => { + const archiveWarcLists = fs.readdirSync( + "test-crawls/collections/crawl-pdf/archive", + ); + + const warcName = path.join("test-crawls/collections/crawl-pdf/archive", archiveWarcLists[0]); + + const nodeStream = fs.createReadStream(warcName); + + const parser = new WARCParser(nodeStream); + + let statusCode = -1; + + for await (const record of parser) { + if (record.warcType !== "response") { + continue; + } + + if (record.warcTargetURI === PDF) { + statusCode = record.httpHeaders.statusCode; + } + } + + expect(statusCode).toBe(200); +}); + +test("PDF: ensure pdf with redirect is crawled", () => { + child_process.execSync( + `docker run -v $PWD/test-crawls:/crawls 
webrecorder/browsertrix-crawler crawl --url "${PDF_HTTP}" --collection crawl-pdf --generateCDX` + ); +}); + +test("PDF: check that the pages.jsonl file entry contains status code and mime type", () => { + expect( + fs.existsSync("test-crawls/collections/crawl-pdf/pages/pages.jsonl"), + ).toBe(true); + + + const pages = fs + .readFileSync( + "test-crawls/collections/crawl-pdf/pages/pages.jsonl", + "utf8", + ) + .trim() + .split("\n"); + + expect(pages.length).toBe(3); + + const page = JSON.parse(pages[1]); + expect(page.url).toBe(PDF); + expect(page.status).toBe(200); + expect(page.mime).toBe("application/pdf"); + expect(page.loadState).toBe(2); + + const pageH = JSON.parse(pages[2]); + expect(pageH.url).toBe(PDF_HTTP); + expect(pageH.status).toBe(200); + expect(pageH.mime).toBe("application/pdf"); + expect(pageH.loadState).toBe(2); +}); + +test("PDF: check that CDX contains one pdf 200, one 301 and one 200, two pageinfo entries", () => { + const filedata = fs.readFileSync( + "test-crawls/collections/crawl-pdf/indexes/index.cdxj", + { encoding: "utf-8" }, + ); + + const lines = filedata.trim().split("\n"); + const cdxj = lines.map(line => JSON.parse(line.split(" ").slice(2).join(" "))).sort((a, b) => a.url < b.url ? -1 : 1); + + expect(cdxj.length).toBe(5); + + expect(cdxj[0].url).toBe(PDF_HTTP); + expect(cdxj[0].status).toBe("301"); + + expect(cdxj[1].url).toBe(PDF); + expect(cdxj[1].status).toBe("200"); + expect(cdxj[1].mime).toBe("application/pdf"); + + expect(cdxj[2].url).toBe(PDF); + expect(cdxj[2].status).toBe("200"); + expect(cdxj[2].mime).toBe("application/pdf"); + + expect(cdxj[3].url).toBe("urn:pageinfo:" + PDF_HTTP); + expect(cdxj[3].mime).toBe("application/json"); + + expect(cdxj[4].url).toBe("urn:pageinfo:" + PDF); + expect(cdxj[4].mime).toBe("application/json"); +}); + +test("XML: ensure with and without redirect is crawled", () => { + child_process.execSync( + `docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url "${XML}" --url "${XML_REDIR}" --collection crawl-xml --generateCDX` + ); +}); + +test("XML: check pages.jsonl file entry contains status code and mime type", () => { + expect( + fs.existsSync("test-crawls/collections/crawl-xml/pages/pages.jsonl"), + ).toBe(true); + + + const pages = fs + .readFileSync( + "test-crawls/collections/crawl-xml/pages/pages.jsonl", + "utf8", + ) + .trim() + .split("\n"); + + expect(pages.length).toBe(3); + + const page = JSON.parse(pages[1]); + expect(page.url).toBe(XML); + expect(page.status).toBe(200); + expect(page.mime).toBe("application/xml"); + expect(page.loadState).toBe(2); + + const pageH = JSON.parse(pages[2]); + expect(pageH.url).toBe(XML_REDIR); + expect(pageH.status).toBe(200); + expect(pageH.mime).toBe("application/xml"); + expect(pageH.loadState).toBe(2); +}); + +test("XML: check that CDX contains one xml 200, one 301 and one 200, two pageinfo entries", () => { + const filedata = fs.readFileSync( + "test-crawls/collections/crawl-xml/indexes/index.cdxj", + { encoding: "utf-8" }, + ); + + const lines = filedata.trim().split("\n"); + const cdxj = lines.map(line => JSON.parse(line.split(" ").slice(2).join(" "))).sort((a, b) => a.url < b.url ? 
-1 : 1); + + expect(cdxj.length).toBe(6); + + expect(cdxj[0].url).toBe("https://webrecorder.net/favicon.ico"); + + expect(cdxj[1].url).toBe(XML); + expect(cdxj[1].status).toBe("200"); + expect(cdxj[1].mime).toBe("application/xml"); + + expect(cdxj[2].url).toBe(XML); + expect(cdxj[2].status).toBe("200"); + expect(cdxj[2].mime).toBe("application/xml"); + + expect(cdxj[3].url).toBe(XML_REDIR); + expect(cdxj[3].status).toBe("301"); + + expect(cdxj[4].url).toBe("urn:pageinfo:" + XML); + expect(cdxj[4].mime).toBe("application/json"); + + expect(cdxj[5].url).toBe("urn:pageinfo:" + XML_REDIR); + expect(cdxj[5].mime).toBe("application/json"); +}); + + diff --git a/tests/pdf-crawl.test.js b/tests/pdf-crawl.test.js deleted file mode 100644 index 3bc6c0770..000000000 --- a/tests/pdf-crawl.test.js +++ /dev/null @@ -1,61 +0,0 @@ -import child_process from "child_process"; -import fs from "fs"; -import path from "path"; -import { WARCParser } from "warcio"; - -const PDF = "https://specs.webrecorder.net/wacz/1.1.1/wacz-2021.pdf"; - -test("ensure pdf is crawled", async () => { - child_process.execSync( - `docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url "${PDF}" --collection crawl-pdf` - ); -}); - -test("check that individual WARCs have PDF written as 200 response", async () => { - const archiveWarcLists = fs.readdirSync( - "test-crawls/collections/crawl-pdf/archive", - ); - - const warcName = path.join("test-crawls/collections/crawl-pdf/archive", archiveWarcLists[0]); - - const nodeStream = fs.createReadStream(warcName); - - const parser = new WARCParser(nodeStream); - - let statusCode = -1; - - for await (const record of parser) { - if (record.warcType !== "response") { - continue; - } - - if (record.warcTargetURI === PDF) { - statusCode = record.httpHeaders.statusCode; - } - } - - expect(statusCode).toBe(200); -}); - - -test("check that the pages.jsonl file entry contains status code and mime type", () => { - expect( - fs.existsSync("test-crawls/collections/crawl-pdf/pages/pages.jsonl"), - ).toBe(true); - - - const pages = fs - .readFileSync( - "test-crawls/collections/crawl-pdf/pages/pages.jsonl", - "utf8", - ) - .trim() - .split("\n"); - - expect(pages.length).toBe(2); - - const page = JSON.parse(pages[1]); - expect(page.url).toBe(PDF); - expect(page.status).toBe(200); - expect(page.mime).toBe("application/pdf"); -});
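
For clarity, below is a minimal standalone TypeScript sketch of the content-type handling this patch consolidates into `isHTMLMime()` / `isRedirectStatus()`. The two helper definitions mirror the new code in `src/util/reqresp.ts`; the `HTML_TYPES` list and the `classifyResponse()` wrapper are assumptions added here for illustration only (the real caller is the `crawler.ts` hunk above, which now splits the content-type header itself and only treats a response with no content-type as HTML when the page fully loaded in the browser).

```ts
// Helpers as added to src/util/reqresp.ts by this patch.
export function isHTMLMime(mime: string): boolean {
  // HTML_TYPES is assumed here for self-containment; the real list lives in reqresp.ts.
  const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
  return HTML_TYPES.includes(mime);
}

export function isRedirectStatus(status: number): boolean {
  // 3xx, excluding 304 Not Modified
  return status >= 300 && status < 400 && status !== 304;
}

// Illustrative wrapper (not part of the patch): callers now split the
// content-type header and decide the missing-header case themselves,
// matching the crawler.ts logic above.
function classifyResponse(
  contentType: string | undefined,
  fullyLoaded: boolean,
): { mime?: string; isHTMLPage: boolean } {
  if (contentType) {
    const mime = contentType.split(";")[0];
    return { mime, isHTMLPage: isHTMLMime(mime) };
  }
  // guess that it's HTML only if the page fully loaded in the browser
  return { isHTMLPage: fullyLoaded };
}

// A PDF download is no longer treated as an HTML page:
console.log(classifyResponse("application/pdf", false)); // { mime: "application/pdf", isHTMLPage: false }
console.log(classifyResponse("text/html; charset=utf-8", true)); // { mime: "text/html", isHTMLPage: true }
console.log(classifyResponse(undefined, true)); // { isHTMLPage: true }
```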