From d7b655b6b18a265e17384b31b94898d05c095c32 Mon Sep 17 00:00:00 2001 From: Dan Burzo Date: Sun, 11 Aug 2024 20:28:49 +0300 Subject: [PATCH] =?UTF-8?q?Also=20bundle=20with=20EPUBs=20images=20that=20?= =?UTF-8?q?don=E2=80=99t=20have=20a=20characteristic=20file=20extension=20?= =?UTF-8?q?as=20`image`=20MIME=20type=20and=20`.image`=20file=20extension.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- index.js | 2 +- src/constants/regex.js | 10 ---------- src/enhancements.js | 5 ++--- src/inline-images.js | 16 +++------------- src/remote-resources.js | 27 +++++++++++++++------------ src/util/file-mimetype.js | 23 +++++++++++++++-------- 6 files changed, 36 insertions(+), 47 deletions(-) delete mode 100644 src/constants/regex.js diff --git a/index.js b/index.js index 4d86d2c..4614cb3 100755 --- a/index.js +++ b/index.js @@ -978,7 +978,7 @@ async function epubgen(data, output_path, options) { remoteResources: remoteResources.map(entry => ({ id: entry.mapped.replace(/[^a-z0-9]/gi, ''), href: entry.mapped, - mimetype: lookupMimetype(entry.mapped) + mimetype: entry.mimetype })) }); diff --git a/src/constants/regex.js b/src/constants/regex.js deleted file mode 100644 index 26d5a81..0000000 --- a/src/constants/regex.js +++ /dev/null @@ -1,10 +0,0 @@ -/* - Regex to match URLs pointing to image files in the most common formats. - - Note: it is unfortunate that we use two separate mechanisms - to discern when an URL points to an image, but here we are. - - `REGEX_IMAGE_URL` here needs to be kept in sync - with the `imageMimetypes` set defined in `file-mimetype.js`. -*/ -export const REGEX_IMAGE_URL = /\.(jpe?g|png|svg|gif|bmp|webp|avif|tiff?)$/i; diff --git a/src/enhancements.js b/src/enhancements.js index 554de89..07fc962 100644 --- a/src/enhancements.js +++ b/src/enhancements.js @@ -1,6 +1,6 @@ import { parseSrcset, stringifySrcset } from 'srcset'; import replaceElementType from './replace-element-type.js'; -import { REGEX_IMAGE_URL } from './constants/regex.js'; +import { isImageURL } from './util/file-mimetype.js'; /* Convert AMP markup to HMTL markup @@ -52,7 +52,6 @@ function fixLazyLoadedImages(doc) { */ function imagesAtFullSize(doc) { - let include_pattern = REGEX_IMAGE_URL; let exclude_patterns = [ /* Exclude Wikipedia links to image file pages @@ -85,7 +84,7 @@ function imagesAtFullSize(doc) { // Only replace if the `href` matches an image file if ( - include_pattern.test(href) && + isImageURL(href, doc) && !exclude_patterns.some(pattern => pattern.test(href)) ) { img.setAttribute('src', anchor.href); diff --git a/src/inline-images.js b/src/inline-images.js index dcf5b5e..32d26d5 100644 --- a/src/inline-images.js +++ b/src/inline-images.js @@ -1,17 +1,7 @@ import { parseSrcset, stringifySrcset } from 'srcset'; -import { lookupMimetype, imageMimetypes } from './util/file-mimetype.js'; +import { getMimetypeFromURL, imageMimetypes } from './util/file-mimetype.js'; import fetchBase64 from './util/fetch-base64.js'; -function get_mime(src, doc) { - let pathname = src; - try { - pathname = new URL(src, doc.baseURI).pathname; - } catch (err) { - // no-op, probably due to bad `doc.baseURI` - } - return lookupMimetype(pathname); -} - export default async function inlineImages(doc, fetchOptions = {}, out) { if (out) { out.write('Inlining images...\n'); @@ -19,7 +9,7 @@ export default async function inlineImages(doc, fetchOptions = {}, out) { let src_promises = Array.from( doc.querySelectorAll('picture source[src], img[src]') ).map(async el => { - let mime = get_mime(el.src, doc); + let mime = getMimetypeFromURL(el.src, doc); /* For web pages using atypical URLs for images let’s just use a generic MIME type and hope it works. @@ -61,7 +51,7 @@ export default async function inlineImages(doc, fetchOptions = {}, out) { stringifySrcset( await Promise.all( items.map(async item => { - let mime = get_mime(item.url, doc); + let mime = getMimetypeFromURL(item.url, doc); /* For web pages using atypical URLs for images diff --git a/src/remote-resources.js b/src/remote-resources.js index aebf91d..6c928d3 100644 --- a/src/remote-resources.js +++ b/src/remote-resources.js @@ -1,6 +1,10 @@ import { randomUUID as uuid } from 'node:crypto'; import { parseSrcset, stringifySrcset } from 'srcset'; -import { REGEX_IMAGE_URL } from './constants/regex.js'; +import { + getMimetypeFromURL, + imageMimetypes, + extForMimetype +} from './util/file-mimetype.js'; import { getUrlOrigin } from './util/url-origin.js'; export default function remoteResources(doc) { @@ -11,21 +15,20 @@ export default function remoteResources(doc) { and return a uniquely generated file name instead. */ function collectAndReplace(src) { - let pathname = src; - try { - pathname = new URL(src, doc.baseURI).pathname; - } catch (err) { - // no-op, probably due to bad `doc.baseURI`. - } - let match = pathname.match(REGEX_IMAGE_URL); - if (!match) { - return src; + let ext; + let mime = getMimetypeFromURL(src); + if (mime && imageMimetypes.has(mime)) { + ext = extForMimetype(mime); + } else { + ext = '.image'; + mime = 'image'; } if (!srcs.has(src)) { srcs.set(src, { original: src, - mapped: `rr-${uuid()}.${match[1]}`, - origin: getUrlOrigin(doc.baseURI) + mapped: `rr-${uuid()}${ext}`, + origin: getUrlOrigin(doc.baseURI), + mimetype: mime }); } return `./${srcs.get(src).mapped}`; diff --git a/src/util/file-mimetype.js b/src/util/file-mimetype.js index 1836fe9..174e7e2 100644 --- a/src/util/file-mimetype.js +++ b/src/util/file-mimetype.js @@ -10,17 +10,24 @@ export function lookupMimetype(filepath) { return mimetype.lookup(filepath); } -export function extForMimetype(mimetype) { - return Object.entries(mimetype.catalog).find(it => it[1] === mimetype)?.[0]; +export function extForMimetype(type) { + return Object.entries(mimetype.catalog).find(it => it[1] === type)?.[0]; } -/* - Note: it is unfortunate that we use two separate mechanisms - to discern when an URL points to an image, but here we are. +export function getMimetypeFromURL(src, doc) { + let pathname = src; + try { + pathname = new URL(src, doc.baseURI).pathname; + } catch (err) { + // no-op, probably due to bad `doc.baseURI` + } + return lookupMimetype(pathname); +} + +export function isImageURL(src, doc) { + return imageMimetypes.has(getMimetypeFromURL(src, doc)); +} - `imageMimetypes` here needs to be kept in sync with the - `REGEX_IMAGE_URL` constant! -*/ export const imageMimetypes = new Set([ 'image/avif', 'image/bmp',