From d7b655b6b18a265e17384b31b94898d05c095c32 Mon Sep 17 00:00:00 2001
From: Dan Burzo <dan@danburzo.ro>
Date: Sun, 11 Aug 2024 20:28:49 +0300
Subject: [PATCH] =?UTF-8?q?Also=20bundle=20with=20EPUBs=20images=20that=20?=
 =?UTF-8?q?don=E2=80=99t=20have=20a=20characteristic=20file=20extension=20?=
 =?UTF-8?q?as=20`image`=20MIME=20type=20and=20`.image`=20file=20extension.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 index.js                  |  2 +-
 src/constants/regex.js    | 10 ----------
 src/enhancements.js       |  5 ++---
 src/inline-images.js      | 16 +++-------------
 src/remote-resources.js   | 27 +++++++++++++++------------
 src/util/file-mimetype.js | 23 +++++++++++++++--------
 6 files changed, 36 insertions(+), 47 deletions(-)
 delete mode 100644 src/constants/regex.js
diff --git a/index.js b/index.js
index 4d86d2c..4614cb3 100755
--- a/index.js
+++ b/index.js
@@ -978,7 +978,7 @@ async function epubgen(data, output_path, options) {
 			remoteResources: remoteResources.map(entry => ({
 				id: entry.mapped.replace(/[^a-z0-9]/gi, ''),
 				href: entry.mapped,
-				mimetype: lookupMimetype(entry.mapped)
+				mimetype: entry.mimetype
 			}))
 		});
 
diff --git a/src/constants/regex.js b/src/constants/regex.js
deleted file mode 100644
index 26d5a81..0000000
--- a/src/constants/regex.js
+++ /dev/null
@@ -1,10 +0,0 @@
-/*
-	Regex to match URLs pointing to image files in the most common formats.
-
-	Note: it is unfortunate that we use two separate mechanisms
-	to discern when an URL points to an image, but here we are.
-	
-	`REGEX_IMAGE_URL` here needs to be kept in sync 
-	with the `imageMimetypes` set defined in `file-mimetype.js`.
-*/
-export const REGEX_IMAGE_URL = /\.(jpe?g|png|svg|gif|bmp|webp|avif|tiff?)$/i;
diff --git a/src/enhancements.js b/src/enhancements.js
index 554de89..07fc962 100644
--- a/src/enhancements.js
+++ b/src/enhancements.js
@@ -1,6 +1,6 @@
 import { parseSrcset, stringifySrcset } from 'srcset';
 import replaceElementType from './replace-element-type.js';
-import { REGEX_IMAGE_URL } from './constants/regex.js';
+import { isImageURL } from './util/file-mimetype.js';
 
 /* 
 	Convert AMP markup to HMTL markup
@@ -52,7 +52,6 @@ function fixLazyLoadedImages(doc) {
 		<img src='original-size.png'/>
 */
 function imagesAtFullSize(doc) {
-	let include_pattern = REGEX_IMAGE_URL;
 	let exclude_patterns = [
 		/*
 			Exclude Wikipedia links to image file pages
@@ -85,7 +84,7 @@ function imagesAtFullSize(doc) {
 
 		// Only replace if the `href` matches an image file
 		if (
-			include_pattern.test(href) &&
+			isImageURL(href, doc) &&
 			!exclude_patterns.some(pattern => pattern.test(href))
 		) {
 			img.setAttribute('src', anchor.href);
diff --git a/src/inline-images.js b/src/inline-images.js
index dcf5b5e..32d26d5 100644
--- a/src/inline-images.js
+++ b/src/inline-images.js
@@ -1,17 +1,7 @@
 import { parseSrcset, stringifySrcset } from 'srcset';
-import { lookupMimetype, imageMimetypes } from './util/file-mimetype.js';
+import { getMimetypeFromURL, imageMimetypes } from './util/file-mimetype.js';
 import fetchBase64 from './util/fetch-base64.js';
 
-function get_mime(src, doc) {
-	let pathname = src;
-	try {
-		pathname = new URL(src, doc.baseURI).pathname;
-	} catch (err) {
-		// no-op, probably due to bad `doc.baseURI`
-	}
-	return lookupMimetype(pathname);
-}
-
 export default async function inlineImages(doc, fetchOptions = {}, out) {
 	if (out) {
 		out.write('Inlining images...\n');
@@ -19,7 +9,7 @@ export default async function inlineImages(doc, fetchOptions = {}, out) {
 	let src_promises = Array.from(
 		doc.querySelectorAll('picture source[src], img[src]')
 	).map(async el => {
-		let mime = get_mime(el.src, doc);
+		let mime = getMimetypeFromURL(el.src, doc);
 		/*
 			For web pages using atypical URLs for images
 			let’s just use a generic MIME type and hope it works.
@@ -61,7 +51,7 @@ export default async function inlineImages(doc, fetchOptions = {}, out) {
 				stringifySrcset(
 					await Promise.all(
 						items.map(async item => {
-							let mime = get_mime(item.url, doc);
+							let mime = getMimetypeFromURL(item.url, doc);
 
 							/*
 								For web pages using atypical URLs for images
diff --git a/src/remote-resources.js b/src/remote-resources.js
index aebf91d..6c928d3 100644
--- a/src/remote-resources.js
+++ b/src/remote-resources.js
@@ -1,6 +1,10 @@
 import { randomUUID as uuid } from 'node:crypto';
 import { parseSrcset, stringifySrcset } from 'srcset';
-import { REGEX_IMAGE_URL } from './constants/regex.js';
+import {
+	getMimetypeFromURL,
+	imageMimetypes,
+	extForMimetype
+} from './util/file-mimetype.js';
 import { getUrlOrigin } from './util/url-origin.js';
 
 export default function remoteResources(doc) {
@@ -11,21 +15,20 @@ export default function remoteResources(doc) {
 		and return a uniquely generated file name instead.
 	 */
 	function collectAndReplace(src) {
-		let pathname = src;
-		try {
-			pathname = new URL(src, doc.baseURI).pathname;
-		} catch (err) {
-			// no-op, probably due to bad `doc.baseURI`.
-		}
-		let match = pathname.match(REGEX_IMAGE_URL);
-		if (!match) {
-			return src;
+		let ext;
+		let mime = getMimetypeFromURL(src, doc); // resolve against doc.baseURI
+		if (mime && imageMimetypes.has(mime)) {
+			ext = extForMimetype(mime);
+		} else {
+			ext = '.image';
+			mime = 'image';
+		}
 		if (!srcs.has(src)) {
 			srcs.set(src, {
 				original: src,
-				mapped: `rr-${uuid()}.${match[1]}`,
-				origin: getUrlOrigin(doc.baseURI)
+				mapped: `rr-${uuid()}${ext}`,
+				origin: getUrlOrigin(doc.baseURI),
+				mimetype: mime
 			});
 		}
 		return `./${srcs.get(src).mapped}`;
diff --git a/src/util/file-mimetype.js b/src/util/file-mimetype.js
index 1836fe9..174e7e2 100644
--- a/src/util/file-mimetype.js
+++ b/src/util/file-mimetype.js
@@ -10,17 +10,24 @@ export function lookupMimetype(filepath) {
 	return mimetype.lookup(filepath);
 }
 
-export function extForMimetype(mimetype) {
-	return Object.entries(mimetype.catalog).find(it => it[1] === mimetype)?.[0];
+export function extForMimetype(type) {
+	return Object.entries(mimetype.catalog).find(it => it[1] === type)?.[0];
 }
 
-/* 
-	Note: it is unfortunate that we use two separate mechanisms
-	to discern when an URL points to an image, but here we are.
+export function getMimetypeFromURL(src, doc) {
+	let pathname = src;
+	try {
+		pathname = new URL(src, doc.baseURI).pathname;
+	} catch (err) {
+		// no-op: keep the raw `src` (e.g. missing `doc` or bad `doc.baseURI`)
+	}
+	return lookupMimetype(pathname);
+}
+
+export function isImageURL(src, doc) {
+	return imageMimetypes.has(getMimetypeFromURL(src, doc));
+}
 
-	`imageMimetypes` here needs to be kept in sync with the
-	`REGEX_IMAGE_URL` constant!
-*/
 export const imageMimetypes = new Set([
 	'image/avif',
 	'image/bmp',