Upgrade dependencies, require Node 14+, add experimental Firefox support (#148)

* Upgrade dependencies, minimum Node version 14
* Add support for Firefox PDF rendering via the `--browser=firefox` option
* Default style: improve formatting of code blocks
* WIP support for Firefox page size
* update node-fetch@2
* Use 'node:' prefixes for importing built-in modules. Use 'node:crypto' instead of 'uuid' package.
* Passing the content as ArrayBuffer from 'node-fetch' to 'jsdom' performs encoding sniffing. Fixes #149. See: https://github.com/jsdom/jsdom#encoding-sniffing This allows us to get rid of encoding sniffing at the 'node-fetch' level, and lets us upgrade to 'node-fetch' v3.x without any issues.
* Remove encoding test referencing external url
* In Firefox, provide explicit page size to Puppeteer, extracted from the "@page/size" CSS property.
* Fix integration test: .originalContent is now an ArrayBuffer
* fetchContent() should return { buffer, contentType } in all cases (url, file, stdin)
* Linting; add test for slurp
* Display header & footer only in chrome
danburzo · Feb 17, 2023 · c45b36b · c45b36b
1 parent 2b2c306
commit c45b36b
Show file tree

Hide file tree

Showing 22 changed files with 1,320 additions and 556 deletions.
diff --git a/.github/workflows/nodejs.yml b/.github/workflows/nodejs.yml
@@ -8,7 +8,7 @@ jobs:
 
         strategy:
             matrix:
-                node-version: [12.x, 14.x, 16.x]
+                node-version: [14.x, 16.x, 18.x]
 
         steps:
             - uses: actions/checkout@v2

diff --git a/README.md b/README.md
@@ -33,7 +33,7 @@ Percollate is a command-line tool that turns web pages into beautifully formatte
 npm install -g percollate
 ```
 
-Percollate and its dependencies **require Node.js 12.20.0** or later.
+Percollate and its dependencies **require Node.js 14.17.0** or later.
 
 #### Community-maintained packages
 

diff --git a/cli.js b/cli.js
@@ -1,6 +1,6 @@
 #!/usr/bin/env node
 
-import { readFileSync } from 'fs';
+import { readFileSync } from 'node:fs';
 import cliopts from './src/cli-opts.js';
 import { pdf, epub, html } from './index.js';
 
@@ -64,55 +64,59 @@ Commands:
 
 Commmon options:
 
-  -h, --help         Output usage information.
-  -V, --version      Output program version.
-  --debug            Print more detailed information.
+  -h, --help           Output usage information.
+  -V, --version        Output program version.
+  --debug              Print more detailed information.
 
-  -o <output>,       Path for the generated bundle.
+  -o <output>,         Path for the generated bundle.
   --output=<path>  
 
-  --template=<path>  Path to a custom HTML template.
+  --template=<path>    Path to a custom HTML template.
   
-  --style=<path>     Path to a custom CSS file.
+  --style=<path>       Path to a custom CSS file.
   
-  --css=<style>      Additional inline CSS style.
+  --css=<style>        Additional inline CSS style.
   
-  -u, --url=<url>    Sets the base URL when HTML is provided on stdin.
-                     Multiple URL options can be specified.
+  -u, --url=<url>      Sets the base URL when HTML is provided on stdin.
+                       Multiple URL options can be specified.
 
-  -w, --wait=<sec>   Process the provided URLs sequentially, 
-                     pausing a number of seconds between items.
+  -w, --wait=<sec>     Process the provided URLs sequentially, 
+                       pausing a number of seconds between items.
   
-  -t <title>,        The bundle title.
+  -t <title>,          The bundle title.
   --title=<title>
 
-  -a <author>,       The bundle author.
+  -a <author>,         The bundle author.
   --author=<author>
   
-  --individual       Export each web page as an individual file.
+  --individual         Export each web page as an individual file.
   
-  --toc              Generate a Table of Contents.
-                     Implicitly enabled when bundling more than one item.
+  --toc                Generate a Table of Contents.
+                       Implicitly enabled when bundling more than one item.
   
-  --cover            Generate a cover for the PDF / EPUB.
-                     Implicitly enabled when bundling more than one item
-                     or the --title option is provided.
+  --cover              Generate a cover for the PDF / EPUB.
+                       Implicitly enabled when bundling more than one item
+                       or the --title option is provided.
+
+  --browser=<browser>  One of 'chrome' (default), 'firefox'.
+                       Used for producing PDF and the cover image for EPUB.
   
-  --hyphenate        Enable hyphenation. Enabled by default for PDF.
+  --hyphenate          Enable hyphenation. Enabled by default for PDF.
 
-  --inline           Embed images inline with the content.
-                     Fetches and converts images to Base64 'data:' URLs.
+  --inline             Embed images inline with the content.
+                       Fetches and converts images to Base64 'data:' URLs.
 
 Options to disable features:
 
-  --no-amp           Don't prefer the AMP version of the web page.
-  --no-toc           Don't generate a table of contents.
-  --no-cover         Don't generate a cover.
-  --no-hyphenate     Disable hyphenation.
+  --no-amp             Don't prefer the AMP version of the web page.
+  --no-toc             Don't generate a table of contents.
+  --no-cover           Don't generate a cover.
+  --no-hyphenate       Disable hyphenation.
 
 PDF options: 
 
-  --no-sandbox       Passed to Puppeteer.
+  --no-sandbox         Passed to Puppeteer.
+
 
 Operands:
 

diff --git a/index.js b/index.js
@@ -1,20 +1,24 @@
+import fs from 'node:fs';
+import stream from 'node:stream';
+import path from 'node:path';
+import { randomUUID as uuid } from 'node:crypto';
+
 import pup from 'puppeteer';
 import archiver from 'archiver';
 import fetch from 'node-fetch';
 import { JSDOM } from 'jsdom';
 import nunjucks from 'nunjucks';
-import fs from 'fs';
-import stream from 'stream';
-import path from 'path';
 import css from 'css';
 import { Readability } from '@mozilla/readability';
-import { v1 as uuid } from 'uuid';
 import createDOMPurify from 'dompurify';
+import MimeType from 'whatwg-mimetype';
+
 import slurp from './src/util/slurp.js';
-import mimetype from './src/util/mimetype.js';
+import fileMimetype from './src/util/file-mimetype.js';
 import epubDate from './src/util/epub-date.js';
 import humanDate from './src/util/human-date.js';
 import outputPath from './src/util/output-path.js';
+import getCssPageFormat from './src/util/get-css-page-format.js';
 import { resolveSequence, resolveParallel } from './src/util/promises.js';
 import addExif from './src/exif.js';
 import { hyphenateDom } from './src/hyphenate.js';
@@ -101,6 +105,7 @@ function launch(options, size) {
 
 	return pup.launch({
 		headless: true,
+		product: options.browser || 'chrome',
 		args,
 		defaultViewport: {
 			// Emulate retina display (@2x)...
@@ -129,15 +134,12 @@ function isURL(ref) {
 	return false;
 }
 
-const accepted_content_types = new Set([
-	'text/html',
-	'application/xhtml+xml',
-	'application/xml'
-]);
-
 async function fetchContent(ref, fetchOptions = {}) {
 	if (ref instanceof stream.Readable) {
-		return slurp(ref);
+		return {
+			buffer: await slurp(ref),
+			contentType: undefined
+		};
 	}
 
 	let url;
@@ -148,12 +150,18 @@ async function fetchContent(ref, fetchOptions = {}) {
 	}
 
 	if (!url) {
-		return readFile(ref, 'utf8');
+		return {
+			buffer: await readFile(ref),
+			contentType: fileMimetype(ref)
+		};
 	}
 
 	if (url && url.protocol === 'file:') {
 		url = decodeURI(url.href.replace(/^file:\/\//, ''));
-		return readFile(url, 'utf8');
+		return {
+			buffer: await readFile(url),
+			contentType: fileMimetype(url)
+		};
 	}
 
 	/*
@@ -166,17 +174,20 @@ async function fetchContent(ref, fetchOptions = {}) {
 			...fetchOptions.headers,
 			'user-agent': UA
 		}
-	}).then(response => {
-		let ct = (response.headers.get('Content-Type') || '').trim();
-		if (ct.indexOf(';') > -1) {
-			ct = ct.split(';')[0].trim();
-		}
-		if (!accepted_content_types.has(ct)) {
+	}).then(async response => {
+		let contentType = response.headers.get('Content-Type');
+		let mt = new MimeType(contentType);
+
+		if (!mt.isHTML() && !mt.isXML()) {
 			throw new Error(
-				`URL ${url.href} has unsupported content type: ${ct}`
+				`URL ${url.href} has unsupported content type: ${contentType}`
 			);
 		}
-		return response.textConverted();
+
+		return {
+			buffer: await response.arrayBuffer(),
+			contentType
+		};
 	});
 }
 
@@ -187,7 +198,7 @@ async function cleanup(url, options) {
 	try {
 		out.write(`Fetching: ${url}`);
 
-		const content = await fetchContent(
+		const { buffer, contentType } = await fetchContent(
 			url === '-' ? process.stdin : url,
 			options.fetch || {}
 		);
@@ -203,7 +214,10 @@ async function cleanup(url, options) {
 				? url
 				: 'file://' + path.resolve(url);
 
-		const dom = new JSDOM(content, { url: final_url });
+		const dom = new JSDOM(buffer, {
+			contentType,
+			url: final_url
+		});
 
 		// Force relative URL resolution
 		dom.window.document.body.setAttribute(null, null);
@@ -322,7 +336,10 @@ async function cleanup(url, options) {
 			length: parsed.length,
 			siteName: sanitizer.sanitize(parsed.siteName),
 			remoteResources,
-			originalContent: content
+			originalContent: {
+				buffer,
+				contentType
+			}
 		};
 	} catch (error) {
 		console.error(`${url}:`, error.message);
@@ -436,13 +453,38 @@ async function bundlePdf(items, options) {
 
 	const output_path = outputPath(items, options, '.pdf', options.slugCache);
 
-	let buffer = await page.pdf({
+	const pdfOptions = {
 		preferCSSPageSize: true,
-		displayHeaderFooter: true,
+		displayHeaderFooter: options.browser !== 'firefox',
 		headerTemplate: header.body.innerHTML,
 		footerTemplate: footer.body.innerHTML,
 		printBackground: true
-	});
+	};
+
+	/*
+		Currently, Firefox does not produce PDFs 
+		with the page format specified by the author 
+		with the `@page/size` CSS declaration.
+
+		We need to extract that value ourselves and produce
+		the appropriate Puppeteer config.
+
+		Once these tasks get done in Firefox,
+		the EXPLICIT_PAGE_SIZE_FROM_CSS code path can be removed:
+
+		https://bugzilla.mozilla.org/show_bug.cgi?id=1793220
+		https://bugzilla.mozilla.org/show_bug.cgi?id=1815565
+	 */
+	const EXPLICIT_PAGE_SIZE_FROM_CSS = options.browser === 'firefox';
+
+	let buffer = await page.pdf(
+		EXPLICIT_PAGE_SIZE_FROM_CSS
+			? {
+					...pdfOptions,
+					...getCssPageFormat(doc)
+			  }
+			: pdfOptions
+	);
 
 	await browser.close();
 
@@ -774,7 +816,7 @@ async function epubgen(data, output_path, options) {
 			remoteResources: remoteResources.map(entry => ({
 				id: entry[1].replace(/[^a-z0-9]/gi, ''),
 				href: entry[1],
-				mimetype: mimetype(entry[1])
+				mimetype: fileMimetype(entry[1])
 			}))
 		});