From 449553260648b09ac2b9950adcfcb997ee175a7c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 26 Jun 2024 09:16:24 -0700 Subject: [PATCH] Always download PDF + non HTML page cleanup + enterprise policy cleanup (#629) Adds enterprise policy to always download PDF and sets download dir to /dev/null Moves policies to chromium.json and brave.json for clarity Further cleanup of non-HTML loading path: - sets downloadResponse when page load is aborted but response is actually download - sets firstResponse when first response finishes, but page doesn't fully load - logs that non-HTML pages skip all post-crawl behaviors in one place - move page extra delay to separate awaitPageExtraDelay() function, applied for all pages (while post-load delay only applied to HTML pages) --------- Co-authored-by: Tessa Walsh --- .../{brave-default.json => brave.json} | 3 +- ...down-profilebrowser.json => chromium.json} | 5 +- src/crawler.ts | 342 ++++++++++-------- src/replaycrawler.ts | 2 + src/util/recorder.ts | 21 +- src/util/reqresp.ts | 19 +- tests/non-html-crawl.test.js | 174 +++++++++ tests/pdf-crawl.test.js | 61 ---- 8 files changed, 393 insertions(+), 234 deletions(-) rename config/policies/{brave-default.json => brave.json} (63%) rename config/policies/{lockdown-profilebrowser.json => chromium.json} (70%) create mode 100644 tests/non-html-crawl.test.js delete mode 100644 tests/pdf-crawl.test.js diff --git a/config/policies/brave-default.json b/config/policies/brave.json similarity index 63% rename from config/policies/brave-default.json rename to config/policies/brave.json index aac2fc245..fec2906b1 100644 --- a/config/policies/brave-default.json +++ b/config/policies/brave.json @@ -2,5 +2,6 @@ "BraveRewardsDisabled": true, "BraveWalletDisabled": true, "BraveVPNDisabled": 1, - "BraveAIChatEnabled": false + "BraveAIChatEnabled": false, + "TorDisabled": true } diff --git a/config/policies/lockdown-profilebrowser.json b/config/policies/chromium.json similarity index 70% rename from config/policies/lockdown-profilebrowser.json rename to config/policies/chromium.json index 0ef3f4aa1..2b7695641 100644 --- a/config/policies/lockdown-profilebrowser.json +++ b/config/policies/chromium.json @@ -1,10 +1,11 @@ { + "AlwaysOpenPdfExternally": true, "NewTabPageLocation": "about:blank", "RestoreOnStartup": 5, "IncognitoModeAvailability": 1, - "TorDisabled": true, "AllowFileSelectionDialogs": false, "URLBlocklist": [ "file://*" - ] + ], + "DownloadDirectory": "/dev/null" } diff --git a/src/crawler.ts b/src/crawler.ts index c7a11abc2..26a28f970 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -51,12 +51,19 @@ import { import { AdBlockRules, BlockRules } from "./util/blockrules.js"; import { OriginOverride } from "./util/originoverride.js"; -import { CDPSession, Frame, HTTPRequest, Page, Protocol } from "puppeteer-core"; +import { + CDPSession, + Frame, + HTTPRequest, + HTTPResponse, + Page, + Protocol, +} from "puppeteer-core"; import { Recorder } from "./util/recorder.js"; import { SitemapReader } from "./util/sitemapper.js"; import { ScopedSeed } from "./util/seeds.js"; import { WARCWriter, createWARCInfo, setWARCInfo } from "./util/warcwriter.js"; -import { isHTMLContentType } from "./util/reqresp.js"; +import { isHTMLMime, isRedirectStatus } from "./util/reqresp.js"; import { initProxy } from "./util/proxy.js"; const behaviors = fs.readFileSync( @@ -842,7 +849,7 @@ self.__bx_behaviors.selectMainBehavior(); ); if (mime) { data.mime = mime; - data.isHTMLPage = isHTMLContentType(mime); + data.isHTMLPage = 
isHTMLMime(mime); } if (fetched) { data.loadState = LoadState.FULL_PAGE_LOADED; @@ -872,18 +879,21 @@ self.__bx_behaviors.selectMainBehavior(); data.favicon = await this.getFavicon(page, logDetails); await this.doPostLoadActions(opts); + + await this.awaitPageExtraDelay(opts); } async doPostLoadActions(opts: WorkerState, saveOutput = false) { const { page, cdp, data, workerid } = opts; const { url } = data; + if (!data.isHTMLPage) { + return; + } + const logDetails = { page: url, workerid }; if (this.params.screenshot && this.screenshotWriter) { - if (!data.isHTMLPage) { - logger.debug("Skipping screenshots for non-HTML page", logDetails); - } const screenshots = new Screenshots({ browser: this.browser, page, @@ -903,7 +913,7 @@ self.__bx_behaviors.selectMainBehavior(); let textextract = null; - if (data.isHTMLPage && this.textWriter) { + if (this.textWriter) { textextract = new TextExtractViaSnapshot(cdp, { writer: this.textWriter, url, @@ -923,13 +933,7 @@ self.__bx_behaviors.selectMainBehavior(); data.loadState = LoadState.EXTRACTION_DONE; if (this.params.behaviorOpts && data.status < 400) { - if (!data.isHTMLPage) { - logger.debug( - "Skipping behaviors for non-HTML page", - logDetails, - "behavior", - ); - } else if (data.skipBehaviors) { + if (data.skipBehaviors) { logger.info("Skipping behaviors for slow page", logDetails, "behavior"); } else { const res = await timedRun( @@ -958,8 +962,17 @@ self.__bx_behaviors.selectMainBehavior(); } } } + } + async awaitPageExtraDelay(opts: WorkerState) { if (this.params.pageExtraDelay) { + const { + data: { url: page }, + workerid, + } = opts; + + const logDetails = { page, workerid }; + logger.info( `Waiting ${this.params.pageExtraDelay} seconds before moving on to next page`, logDetails, @@ -1704,109 +1717,71 @@ self.__bx_behaviors.selectMainBehavior(); const failCrawlOnError = depth === 0 && this.params.failOnFailedSeed; - let ignoreAbort = false; - - // Detect if ERR_ABORTED is actually caused by trying to load a non-page (eg. downloadable PDF), - // if so, don't report as an error + // Attempt to load the page: + // - Already tried direct fetch w/o browser before getting here, and that resulted in an HTML page or non-200 response + // so now loading using the browser + // - If page.load() fails, but downloadResponse is set, then its a download, consider successful + // set page status to FULL_PAGE_LOADED (2) + // - If page.load() fails, but firstResponse is set to CONTENT_LOADED (1) state, + // consider a slow page, proceed to link extraction, but skip behaviors, issue warning + // - If page.load() fails otherwise and if failOnFailedSeed is set, fail crawl, otherwise fail page + // - If page.load() succeeds, check if page url is a chrome-error:// page, fail page (and or crawl if failOnFailedSeed and seed) + // - If at least one response, check if HTML, proceed with post-crawl actions only if HTML. + + let downloadResponse: HTTPResponse | null = null; + let firstResponse: HTTPResponse | null = null; + let fullLoadedResponse: HTTPResponse | null = null; + + // Detect if failure is actually caused by trying to load a non-page (eg. 
downloadable PDF), + // store the downloadResponse, if any page.once("requestfailed", (req: HTTPRequest) => { - ignoreAbort = shouldIgnoreAbort(req, data); + downloadResponse = getDownloadResponse(req); }); - let isHTMLPage = data.isHTMLPage; + // store the first successful non-redirect response, even if page doesn't load fully + const waitFirstResponse = (resp: HTTPResponse) => { + firstResponse = resp; + if (!isRedirectStatus(firstResponse.status())) { + // don't listen to any additional responses + page.off("response", waitFirstResponse); + } + }; - if (isHTMLPage) { - page.once("domcontentloaded", () => { - data.loadState = LoadState.CONTENT_LOADED; - }); - } + page.on("response", waitFirstResponse); + + // store that domcontentloaded was finished + page.once("domcontentloaded", () => { + data.loadState = LoadState.CONTENT_LOADED; + }); - const gotoOpts = isHTMLPage + const gotoOpts = data.isHTMLPage ? this.gotoOpts : { waitUntil: "domcontentloaded" }; logger.info("Awaiting page load", logDetails); try { - const resp = await page.goto(url, gotoOpts); - - if (!resp) { - throw new Error("page response missing"); - } - - const respUrl = resp.url(); - const isChromeError = page.url().startsWith("chrome-error://"); - - if (depth === 0 && !isChromeError && respUrl !== url) { - data.seedId = await this.crawlState.addExtraSeed( - this.seeds, - this.numOriginalSeeds, - data.seedId, - respUrl, - ); - logger.info("Seed page redirected, adding redirected seed", { - origUrl: url, - newUrl: respUrl, - seedId: data.seedId, - }); - } - - const status = resp.status(); - data.status = status; - - let failed = isChromeError; - - if (this.params.failOnInvalidStatus && status >= 400) { - // Handle 4xx or 5xx response as a page load error - failed = true; - } - - if (failed) { - if (failCrawlOnError) { - logger.fatal( - "Seed Page Load Error, failing crawl", - { - status, - ...logDetails, - }, - "general", - 1, - ); - } else { - logger.error( - isChromeError ? 
"Page Crashed on Load" : "Page Invalid Status", - { - status, - ...logDetails, - }, - ); - throw new Error("logged"); - } - } - - const contentType = resp.headers()["content-type"]; - - isHTMLPage = isHTMLContentType(contentType); - - if (contentType) { - data.mime = contentType.split(";")[0]; - } + // store the page load response when page fully loads + fullLoadedResponse = await page.goto(url, gotoOpts); } catch (e) { if (!(e instanceof Error)) { throw e; } const msg = e.message || ""; - if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) { + + // got firstResponse and content loaded, not a failure + if (firstResponse && data.loadState == LoadState.CONTENT_LOADED) { // if timeout error, and at least got to content loaded, continue on - if ( - e.name === "TimeoutError" && - data.loadState == LoadState.CONTENT_LOADED - ) { - logger.warn("Page Loading Slowly, skipping behaviors", { + logger.warn( + "Page load timed out, loading but slowly, skipping behaviors", + { msg, ...logDetails, - }); - data.skipBehaviors = true; - } else if (failCrawlOnError) { + }, + ); + data.skipBehaviors = true; + } else if (!downloadResponse) { + if (failCrawlOnError) { // if fail on error, immediately fail here logger.fatal( "Page Load Timeout, failing crawl", @@ -1817,64 +1792,127 @@ self.__bx_behaviors.selectMainBehavior(); "general", 1, ); - } else { - // log if not already log and rethrow - if (msg !== "logged") { - const loadState = data.loadState; - if (loadState >= LoadState.CONTENT_LOADED) { - logger.warn("Page Load Timeout, skipping further processing", { - msg, - loadState, - ...logDetails, - }); - } else { - logger.error("Page Load Failed, skipping page", { - msg, - loadState, - ...logDetails, - }); - } - e.message = "logged"; - } - throw e; + // log if not already log and rethrow, consider page failed + } else if (msg !== "logged") { + logger.error("Page Load Failed, skipping page", { + msg, + loadState: data.loadState, + ...logDetails, + }); + e.message = "logged"; } + throw e; } } - data.loadState = LoadState.FULL_PAGE_LOADED; + const resp = fullLoadedResponse || downloadResponse || firstResponse; - data.isHTMLPage = isHTMLPage; + if (!resp) { + throw new Error("no response for page load, assuming failed"); + } - if (isHTMLPage) { - const frames = await page.frames(); + const respUrl = resp.url(); + const isChromeError = page.url().startsWith("chrome-error://"); - const filteredFrames = await Promise.allSettled( - frames.map((frame) => this.shouldIncludeFrame(frame, logDetails)), + if (depth === 0 && !isChromeError && respUrl !== url && !downloadResponse) { + data.seedId = await this.crawlState.addExtraSeed( + this.seeds, + this.numOriginalSeeds, + data.seedId, + respUrl, ); + logger.info("Seed page redirected, adding redirected seed", { + origUrl: url, + newUrl: respUrl, + seedId: data.seedId, + }); + } - data.filteredFrames = filteredFrames - .filter((x: PromiseSettledResult) => { - if (x.status === "fulfilled") { - return !!x.value; - } - logger.warn("Error in iframe check", { - reason: x.reason, + const status = resp.status(); + data.status = status; + + let failed = isChromeError; + + if (this.params.failOnInvalidStatus && status >= 400) { + // Handle 4xx or 5xx response as a page load error + failed = true; + } + + if (failed) { + if (failCrawlOnError) { + logger.fatal( + "Seed Page Load Error, failing crawl", + { + status, ...logDetails, - }); - return false; - }) - .map((x) => (x as PromiseFulfilledResult).value); + }, + "general", + 1, + ); + } else { + logger.error( + isChromeError ? 
"Page Crashed on Load" : "Page Invalid Status", + { + status, + ...logDetails, + }, + ); + throw new Error("logged"); + } + } + + const contentType = resp.headers()["content-type"]; - //data.filteredFrames = await page.frames().filter(frame => this.shouldIncludeFrame(frame, logDetails)); + if (contentType) { + data.mime = contentType.split(";")[0]; + data.isHTMLPage = isHTMLMime(data.mime); } else { - data.filteredFrames = []; + // guess that its html if it fully loaded as a page + data.isHTMLPage = !!fullLoadedResponse; + } + + // Full Page Loaded if: + // - it was a download response + // - page.load() succeeded + // but not: + // - if first response was received, but not fully loaded + if (fullLoadedResponse || downloadResponse) { + data.loadState = LoadState.FULL_PAGE_LOADED; } - if (!isHTMLPage) { - logger.debug("Skipping link extraction for non-HTML page", logDetails); + if (!data.isHTMLPage) { + data.filteredFrames = []; + + logger.info( + "Non-HTML Page URL, skipping all post-crawl actions", + { isDownload: !!downloadResponse, mime: data.mime, ...logDetails }, + "pageStatus", + ); return; } + // HTML Pages Only here + const frames = await page.frames(); + + const filteredFrames = await Promise.allSettled( + frames.map((frame) => this.shouldIncludeFrame(frame, logDetails)), + ); + + data.filteredFrames = filteredFrames + .filter((x: PromiseSettledResult) => { + if (x.status === "fulfilled") { + return !!x.value; + } + logger.warn("Error in iframe check", { + reason: x.reason, + ...logDetails, + }); + return false; + }) + .map((x) => (x as PromiseFulfilledResult).value); + + //data.filteredFrames = await page.frames().filter(frame => this.shouldIncludeFrame(frame, logDetails)); + const { seedId } = data; const seed = await this.crawlState.getSeedAt( @@ -2565,35 +2603,39 @@ self.__bx_behaviors.selectMainBehavior(); } } -function shouldIgnoreAbort(req: HTTPRequest, data: PageState) { +function getDownloadResponse(req: HTTPRequest) { try { + if (!req.isNavigationRequest()) { + return null; + } + const failure = req.failure(); const failureText = (failure && failure.errorText) || ""; if ( failureText !== "net::ERR_ABORTED" || req.resourceType() !== "document" ) { - return false; + return null; } const resp = req.response(); - const headers = resp && resp.headers(); - if (!headers) { - return false; + if (!resp) { + return null; } + const headers = resp.headers(); + if ( headers["content-disposition"] || - (headers["content-type"] && !headers["content-type"].startsWith("text/")) + (headers["content-type"] && !isHTMLMime(headers["content-type"])) ) { - data.status = resp.status(); - data.mime = headers["content-type"].split(";")[0]; - return true; + return resp; } } catch (e) { - return false; + console.log(e); + // ignore } - return false; + return null; } diff --git a/src/replaycrawler.ts b/src/replaycrawler.ts index 8f4611ee4..29feff9b1 100644 --- a/src/replaycrawler.ts +++ b/src/replaycrawler.ts @@ -457,6 +457,8 @@ export class ReplayCrawler extends Crawler { await this.doPostLoadActions(opts, true); + await this.awaitPageExtraDelay(opts); + await this.compareScreenshots(page, data, url, date, workerid); await this.compareText(page, data, url, date); diff --git a/src/util/recorder.ts b/src/util/recorder.ts index 7fb1865e3..58c777efb 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -6,7 +6,7 @@ import PQueue from "p-queue"; import { logger, formatErr } from "./logger.js"; import { sleep, timedRun, timestampNow } from "./timing.js"; -import { RequestResponseInfo, 
isHTMLContentType } from "./reqresp.js"; +import { RequestResponseInfo, isHTMLMime } from "./reqresp.js"; import { fetch, Response } from "undici"; @@ -90,6 +90,13 @@ export type DirectFetchRequest = { cdp: CDPSession; }; +// ================================================================= +export type DirectFetchResponse = { + fetched: boolean; + mime: string; + ts: Date; +}; + // ================================================================= export type NetworkLoadAsyncFetchOptions = AsyncFetchOptions & { cdp: CDPSession; @@ -1088,11 +1095,11 @@ export class Recorder { this.writer.writeRecordPair(responseRecord, requestRecord); } - async directFetchCapture({ url, headers, cdp }: DirectFetchRequest): Promise<{ - fetched: boolean; - mime: string; - ts: Date; - }> { + async directFetchCapture({ + url, + headers, + cdp, + }: DirectFetchRequest): Promise { const reqresp = new RequestResponseInfo("0"); const ts = new Date(); @@ -1125,7 +1132,7 @@ export class Recorder { mime = ct.split(";")[0]; } - return !isHTMLContentType(mime); + return !isHTMLMime(mime); }; // ignore dupes: if previous URL was not a page, still load as page. if previous was page, diff --git a/src/util/reqresp.ts b/src/util/reqresp.ts index 0d7e4e541..933b8da36 100644 --- a/src/util/reqresp.ts +++ b/src/util/reqresp.ts @@ -151,7 +151,7 @@ export class RequestResponseInfo { } isRedirectStatus() { - return this.status >= 300 && this.status < 400 && this.status !== 304; + return isRedirectStatus(this.status); } isSelfRedirect() { @@ -375,17 +375,10 @@ export class RequestResponseInfo { } } -export function isHTMLContentType(contentType: string | null) { - // just load if no content-type - if (!contentType) { - return true; - } - - const mime = contentType.split(";")[0]; - - if (HTML_TYPES.includes(mime)) { - return true; - } +export function isHTMLMime(mime: string) { + return HTML_TYPES.includes(mime); +} - return false; +export function isRedirectStatus(status: number) { + return status >= 300 && status < 400 && status !== 304; } diff --git a/tests/non-html-crawl.test.js b/tests/non-html-crawl.test.js new file mode 100644 index 000000000..83da93357 --- /dev/null +++ b/tests/non-html-crawl.test.js @@ -0,0 +1,174 @@ +import child_process from "child_process"; +import fs from "fs"; +import path from "path"; +import { WARCParser } from "warcio"; + +const PDF = "https://specs.webrecorder.net/wacz/1.1.1/wacz-2021.pdf"; +const PDF_HTTP = PDF.replace("https", "http"); + +const XML = "https://webrecorder.net/feed.xml"; +const XML_REDIR = "https://www.webrecorder.net/feed.xml"; + +test("PDF: ensure pdf is crawled", () => { + child_process.execSync( + `docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url "${PDF}" --collection crawl-pdf` + ); +}); + +test("PDF: check that individual WARCs have PDF written as 200 response", async () => { + const archiveWarcLists = fs.readdirSync( + "test-crawls/collections/crawl-pdf/archive", + ); + + const warcName = path.join("test-crawls/collections/crawl-pdf/archive", archiveWarcLists[0]); + + const nodeStream = fs.createReadStream(warcName); + + const parser = new WARCParser(nodeStream); + + let statusCode = -1; + + for await (const record of parser) { + if (record.warcType !== "response") { + continue; + } + + if (record.warcTargetURI === PDF) { + statusCode = record.httpHeaders.statusCode; + } + } + + expect(statusCode).toBe(200); +}); + +test("PDF: ensure pdf with redirect is crawled", () => { + child_process.execSync( + `docker run -v $PWD/test-crawls:/crawls 
webrecorder/browsertrix-crawler crawl --url "${PDF_HTTP}" --collection crawl-pdf --generateCDX` + ); +}); + +test("PDF: check that the pages.jsonl file entry contains status code and mime type", () => { + expect( + fs.existsSync("test-crawls/collections/crawl-pdf/pages/pages.jsonl"), + ).toBe(true); + + + const pages = fs + .readFileSync( + "test-crawls/collections/crawl-pdf/pages/pages.jsonl", + "utf8", + ) + .trim() + .split("\n"); + + expect(pages.length).toBe(3); + + const page = JSON.parse(pages[1]); + expect(page.url).toBe(PDF); + expect(page.status).toBe(200); + expect(page.mime).toBe("application/pdf"); + expect(page.loadState).toBe(2); + + const pageH = JSON.parse(pages[2]); + expect(pageH.url).toBe(PDF_HTTP); + expect(pageH.status).toBe(200); + expect(pageH.mime).toBe("application/pdf"); + expect(pageH.loadState).toBe(2); +}); + +test("PDF: check that CDX contains one pdf 200, one 301 and one 200, two pageinfo entries", () => { + const filedata = fs.readFileSync( + "test-crawls/collections/crawl-pdf/indexes/index.cdxj", + { encoding: "utf-8" }, + ); + + const lines = filedata.trim().split("\n"); + const cdxj = lines.map(line => JSON.parse(line.split(" ").slice(2).join(" "))).sort((a, b) => a.url < b.url ? -1 : 1); + + expect(cdxj.length).toBe(5); + + expect(cdxj[0].url).toBe(PDF_HTTP); + expect(cdxj[0].status).toBe("301"); + + expect(cdxj[1].url).toBe(PDF); + expect(cdxj[1].status).toBe("200"); + expect(cdxj[1].mime).toBe("application/pdf"); + + expect(cdxj[2].url).toBe(PDF); + expect(cdxj[2].status).toBe("200"); + expect(cdxj[2].mime).toBe("application/pdf"); + + expect(cdxj[3].url).toBe("urn:pageinfo:" + PDF_HTTP); + expect(cdxj[3].mime).toBe("application/json"); + + expect(cdxj[4].url).toBe("urn:pageinfo:" + PDF); + expect(cdxj[4].mime).toBe("application/json"); +}); + +test("XML: ensure with and without redirect is crawled", () => { + child_process.execSync( + `docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url "${XML}" --url "${XML_REDIR}" --collection crawl-xml --generateCDX` + ); +}); + +test("XML: check pages.jsonl file entry contains status code and mime type", () => { + expect( + fs.existsSync("test-crawls/collections/crawl-xml/pages/pages.jsonl"), + ).toBe(true); + + + const pages = fs + .readFileSync( + "test-crawls/collections/crawl-xml/pages/pages.jsonl", + "utf8", + ) + .trim() + .split("\n"); + + expect(pages.length).toBe(3); + + const page = JSON.parse(pages[1]); + expect(page.url).toBe(XML); + expect(page.status).toBe(200); + expect(page.mime).toBe("application/xml"); + expect(page.loadState).toBe(2); + + const pageH = JSON.parse(pages[2]); + expect(pageH.url).toBe(XML_REDIR); + expect(pageH.status).toBe(200); + expect(pageH.mime).toBe("application/xml"); + expect(pageH.loadState).toBe(2); +}); + +test("XML: check that CDX contains one xml 200, one 301 and one 200, two pageinfo entries", () => { + const filedata = fs.readFileSync( + "test-crawls/collections/crawl-xml/indexes/index.cdxj", + { encoding: "utf-8" }, + ); + + const lines = filedata.trim().split("\n"); + const cdxj = lines.map(line => JSON.parse(line.split(" ").slice(2).join(" "))).sort((a, b) => a.url < b.url ? 
-1 : 1); + + expect(cdxj.length).toBe(6); + + expect(cdxj[0].url).toBe("https://webrecorder.net/favicon.ico"); + + expect(cdxj[1].url).toBe(XML); + expect(cdxj[1].status).toBe("200"); + expect(cdxj[1].mime).toBe("application/xml"); + + expect(cdxj[2].url).toBe(XML); + expect(cdxj[2].status).toBe("200"); + expect(cdxj[2].mime).toBe("application/xml"); + + expect(cdxj[3].url).toBe(XML_REDIR); + expect(cdxj[3].status).toBe("301"); + + expect(cdxj[4].url).toBe("urn:pageinfo:" + XML); + expect(cdxj[4].mime).toBe("application/json"); + + expect(cdxj[5].url).toBe("urn:pageinfo:" + XML_REDIR); + expect(cdxj[5].mime).toBe("application/json"); +}); + + diff --git a/tests/pdf-crawl.test.js b/tests/pdf-crawl.test.js deleted file mode 100644 index 3bc6c0770..000000000 --- a/tests/pdf-crawl.test.js +++ /dev/null @@ -1,61 +0,0 @@ -import child_process from "child_process"; -import fs from "fs"; -import path from "path"; -import { WARCParser } from "warcio"; - -const PDF = "https://specs.webrecorder.net/wacz/1.1.1/wacz-2021.pdf"; - -test("ensure pdf is crawled", async () => { - child_process.execSync( - `docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url "${PDF}" --collection crawl-pdf` - ); -}); - -test("check that individual WARCs have PDF written as 200 response", async () => { - const archiveWarcLists = fs.readdirSync( - "test-crawls/collections/crawl-pdf/archive", - ); - - const warcName = path.join("test-crawls/collections/crawl-pdf/archive", archiveWarcLists[0]); - - const nodeStream = fs.createReadStream(warcName); - - const parser = new WARCParser(nodeStream); - - let statusCode = -1; - - for await (const record of parser) { - if (record.warcType !== "response") { - continue; - } - - if (record.warcTargetURI === PDF) { - statusCode = record.httpHeaders.statusCode; - } - } - - expect(statusCode).toBe(200); -}); - - -test("check that the pages.jsonl file entry contains status code and mime type", () => { - expect( - fs.existsSync("test-crawls/collections/crawl-pdf/pages/pages.jsonl"), - ).toBe(true); - - - const pages = fs - .readFileSync( - "test-crawls/collections/crawl-pdf/pages/pages.jsonl", - "utf8", - ) - .trim() - .split("\n"); - - expect(pages.length).toBe(2); - - const page = JSON.parse(pages[1]); - expect(page.url).toBe(PDF); - expect(page.status).toBe(200); - expect(page.mime).toBe("application/pdf"); -});
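
For clarity, below is a minimal standalone TypeScript sketch of the content-type handling this patch consolidates into `isHTMLMime()` / `isRedirectStatus()`. The two helper definitions mirror the new code in `src/util/reqresp.ts`; the `HTML_TYPES` list and the `classifyResponse()` wrapper are assumptions added here for illustration only (the real caller is the `crawler.ts` hunk above, which now splits the content-type header itself and only treats a response with no content-type as HTML when the page fully loaded in the browser).

```ts
// Helpers as added to src/util/reqresp.ts by this patch.
export function isHTMLMime(mime: string): boolean {
  // HTML_TYPES is assumed here for self-containment; the real list lives in reqresp.ts.
  const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
  return HTML_TYPES.includes(mime);
}

export function isRedirectStatus(status: number): boolean {
  // 3xx, excluding 304 Not Modified
  return status >= 300 && status < 400 && status !== 304;
}

// Illustrative wrapper (not part of the patch): callers now split the
// content-type header and decide the missing-header case themselves,
// matching the crawler.ts logic above.
function classifyResponse(
  contentType: string | undefined,
  fullyLoaded: boolean,
): { mime?: string; isHTMLPage: boolean } {
  if (contentType) {
    const mime = contentType.split(";")[0];
    return { mime, isHTMLPage: isHTMLMime(mime) };
  }
  // guess that it's HTML only if the page fully loaded in the browser
  return { isHTMLPage: fullyLoaded };
}

// A PDF download is no longer treated as an HTML page:
console.log(classifyResponse("application/pdf", false)); // { mime: "application/pdf", isHTMLPage: false }
console.log(classifyResponse("text/html; charset=utf-8", true)); // { mime: "text/html", isHTMLPage: true }
console.log(classifyResponse(undefined, true)); // { isHTMLPage: true }
```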