Skip to content

Commit 654af65

Browse files
Soxasora and huumn authored
enhance: improve media type recognition with HEAD or magic bytes (#2599)
* enhance: improve media type recognition by fetching HEAD or reading its first (magic) bytes
* add origin protection, handle links behind basic auth
* rollback export from createImgproxyPath
* fix api return statements, protect url swap
* light cleanup
* fix wrong fetch url light cleanup
* amend this
* do media checks with the capture microservice
* fix media check import
* we don't need to abort on imgproxy
* bail on auth; handle abort via exception; affirm HEAD
* ensure url is correct for sndev
* add CORS, add comment about Express automatic decodeURIComponent, integrate capture in the images compose profile, address useEffect bug
* best practice: use startsWith for sndev URL replacement
* remove capture from dmenu profile, restrict CORS to the /media endpoint
* revert removal of capture profile, start capture if capture/images profile is passed
* add NEXT_PUBLIC_URL to capture manifest (+ do media check in imgproxy worker)

---------

Co-authored-by: k00b <[email protected]>
1 parent 626c5b6 commit 654af65

File tree

12 files changed

+216
-25
lines changed

12 files changed

+216
-25
lines changed

.env.development

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,10 @@ NEXT_PUBLIC_EXTRA_LONG_POLL_INTERVAL_MS=300000
116116
IMGPROXY_URL_DOCKER=http://imgproxy:8080
117117
MEDIA_URL_DOCKER=http://s3:4566/uploads
118118

119+
# media check with capture container
120+
MEDIA_CHECK_URL_DOCKER=http://capture:5678/media
121+
NEXT_PUBLIC_MEDIA_CHECK_URL=http://localhost:5678/media
122+
119123
# postgres container stuff
120124
POSTGRES_PASSWORD=password
121125
POSTGRES_USER=sn

.env.production

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ NEXTAUTH_URL=https://stacker.news
77
NEXTAUTH_URL_INTERNAL=http://127.0.0.1:8080/api/auth
88
NEXT_PUBLIC_AWS_UPLOAD_BUCKET=snuploads
99
NEXT_PUBLIC_IMGPROXY_URL=https://imgprxy.stacker.news/
10+
NEXT_PUBLIC_MEDIA_CHECK_URL=https://capture.stacker.news/media
1011
NEXT_PUBLIC_MEDIA_DOMAIN=m.stacker.news
1112
PUBLIC_URL=https://stacker.news
1213
SELF_URL=http://127.0.0.1:8080

capture/index.js

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import express from 'express'
22
import puppeteer from 'puppeteer'
3+
import mediaCheck from './media-check.js'
4+
import cors from 'cors'
35

46
const captureUrl = process.env.CAPTURE_URL || 'http://host.docker.internal:3000/'
57
const port = process.env.PORT || 5678
@@ -55,6 +57,12 @@ app.get('/health', (req, res) => {
5557
res.status(200).end()
5658
})
5759

60+
app.get('/media/:url', cors({
61+
origin: process.env.NEXT_PUBLIC_URL,
62+
methods: ['GET', 'OPTIONS'],
63+
credentials: false
64+
}), mediaCheck)
65+
5866
app.get('/*', async (req, res) => {
5967
const url = new URL(req.originalUrl, captureUrl)
6068
const timeLabel = `${Date.now()}-${url.href}`

capture/media-check.js

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
import { filetypemime } from 'magic-bytes.js'
2+
3+
const TIMEOUT_HEAD = 2000
4+
const TIMEOUT_GET = 10000
5+
const BYTE_LIMIT = 8192
6+
7+
export function isImageMime (mime) { return typeof mime === 'string' && mime.startsWith('image/') }
8+
9+
export function isVideoMime (mime) { return typeof mime === 'string' && mime.startsWith('video/') }
10+
11+
// adapted from lib/time.js
12+
function timeoutSignal (timeout) {
13+
const controller = new AbortController()
14+
15+
if (timeout) {
16+
setTimeout(() => {
17+
controller.abort(new Error(`timeout after ${timeout / 1000}s`))
18+
}, timeout)
19+
}
20+
21+
return controller.signal
22+
}
23+
24+
const requiresAuth = (res) => res.status === 401 || res.status === 403
25+
26+
async function headMime (url, timeout = TIMEOUT_HEAD) {
27+
const res = await fetch(url, { method: 'HEAD', signal: timeoutSignal(timeout) })
28+
// bail on auth or forbidden
29+
if (requiresAuth(res)) return null
30+
31+
return res.headers.get('content-type')
32+
}
33+
34+
async function readMagicBytes (url, { timeout = TIMEOUT_GET, byteLimit = BYTE_LIMIT } = {}) {
35+
const res = await fetch(url, {
36+
method: 'GET',
37+
// accept image and video, but not other types
38+
headers: { Range: `bytes=0-${byteLimit - 1}`, Accept: 'image/*,video/*;q=0.9,*/*;q=0.8' },
39+
signal: timeoutSignal(timeout)
40+
})
41+
// bail on auth or forbidden
42+
if (requiresAuth(res)) return { bytes: null, headers: res.headers }
43+
44+
// stream a small chunk if possible, otherwise read buffer
45+
if (res.body?.getReader) {
46+
const reader = res.body.getReader()
47+
let received = 0
48+
const chunks = []
49+
try {
50+
while (received < byteLimit) {
51+
const { done, value } = await reader.read()
52+
if (done) break
53+
chunks.push(value)
54+
received += value.byteLength
55+
}
56+
} finally {
57+
try { reader.releaseLock?.() } catch {}
58+
try { res.body?.cancel?.() } catch {}
59+
}
60+
const buf = new Uint8Array(received)
61+
let offset = 0
62+
for (const c of chunks) {
63+
buf.set(c, offset)
64+
offset += c.byteLength
65+
}
66+
return { bytes: buf, headers: res.headers }
67+
} else {
68+
const ab = await res.arrayBuffer()
69+
const buf = new Uint8Array(ab.slice(0, byteLimit))
70+
return { bytes: buf, headers: res.headers }
71+
}
72+
}
73+
74+
export default async function mediaCheck (req, res) {
75+
// express automatically decodes the values in req.params (using decodeURIComponent)
76+
let url = req.params.url
77+
if (typeof url !== 'string' || !/^(https?:\/\/)/.test(url)) {
78+
return res.status(400).json({ error: 'Invalid URL' })
79+
}
80+
81+
try {
82+
// in development, the capture container can't reach the public media url,
83+
// so we need to replace it with its docker equivalent, e.g. http://s3:4566/uploads
84+
if (url.startsWith(process.env.NEXT_PUBLIC_MEDIA_URL) && process.env.NODE_ENV === 'development') {
85+
url = url.replace(process.env.NEXT_PUBLIC_MEDIA_URL, process.env.MEDIA_URL_DOCKER)
86+
}
87+
88+
// trying with HEAD first, as it's the cheapest option
89+
try {
90+
const ct = await headMime(url)
91+
if (isImageMime(ct) || isVideoMime(ct)) {
92+
return res.status(200).json({ mime: ct, isImage: isImageMime(ct), isVideo: isVideoMime(ct) })
93+
}
94+
} catch {}
95+
96+
// otherwise, read the first bytes
97+
const { bytes, headers } = await readMagicBytes(url)
98+
const mimes = bytes ? filetypemime(bytes) : null
99+
const mime = mimes?.[0] ?? headers.get('content-type') ?? null
100+
return res.status(200).json({ mime, isImage: isImageMime(mime), isVideo: isVideoMime(mime) })
101+
} catch (err) {
102+
console.log('media check error:', err)
103+
return res.status(500).json({ mime: null, isImage: false, isVideo: false })
104+
}
105+
}

capture/package-lock.json

Lines changed: 49 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

capture/package.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99
"author": "",
1010
"license": "ISC",
1111
"dependencies": {
12+
"cors": "^2.8.5",
1213
"express": "^4.20.0",
14+
"magic-bytes.js": "^1.12.1",
1315
"puppeteer": "^20.8.2"
1416
},
1517
"type": "module"

components/media-or-link.js

Lines changed: 24 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import styles from './text.module.css'
22
import { useState, useEffect, useMemo, useCallback, memo, useRef } from 'react'
33
import { decodeProxyUrl, IMGPROXY_URL_REGEXP, MEDIA_DOMAIN_REGEXP } from '@/lib/url'
44
import { useMe } from './me'
5-
import { UNKNOWN_LINK_REL } from '@/lib/constants'
5+
import { UNKNOWN_LINK_REL, PUBLIC_MEDIA_CHECK_URL } from '@/lib/constants'
66
import classNames from 'classnames'
77
import { useCarousel } from './carousel'
88

@@ -130,31 +130,33 @@ export const useMediaHelper = ({ src, srcSet: srcSetIntital, topLevel, tab }) =>
130130
// don't load the video at all if user doesn't want these
131131
if (!showMedia || isVideo || isImage) return
132132

133-
// check if it's a video by trying to load it
134-
const video = document.createElement('video')
135-
video.onloadedmetadata = () => {
136-
setIsVideo(true)
137-
setIsImage(false)
138-
}
139-
video.onerror = () => {
140-
// hack
141-
// if it's not a video it will throw an error, so we can assume it's an image
142-
const img = new window.Image()
143-
img.src = src
144-
img.decode().then(() => { // decoding beforehand to prevent wrong image cropping
145-
setIsImage(true)
146-
}).catch((e) => {
147-
console.warn('Cannot decode image:', src, e)
148-
})
133+
const controller = new AbortController()
134+
135+
const checkMedia = async () => {
136+
try {
137+
const res = await fetch(`${PUBLIC_MEDIA_CHECK_URL}/${encodeURIComponent(src)}`, { signal: controller.signal })
138+
if (!res.ok) return
139+
140+
const data = await res.json()
141+
142+
if (data.isVideo) {
143+
setIsVideo(true)
144+
setIsImage(false)
145+
} else if (data.isImage) {
146+
setIsImage(true)
147+
}
148+
} catch (error) {
149+
if (error.name === 'AbortError') return
150+
console.error('cannot check media type', error)
151+
}
149152
}
150-
video.src = src
153+
checkMedia()
151154

152155
return () => {
153-
video.onloadedmetadata = null
154-
video.onerror = null
155-
video.src = ''
156+
// abort the fetch
157+
try { controller.abort() } catch {}
156158
}
157-
}, [src, setIsImage, setIsVideo, showMedia, isImage])
159+
}, [src, setIsImage, setIsVideo, showMedia])
158160

159161
const srcSet = useMemo(() => {
160162
if (Object.keys(srcSetObj).length === 0) return undefined

copilot/capture/manifest.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ variables: # Pass environment variables as key value pairs.
4545
CAPTURE_URL: https://stacker.news/
4646
MAX_PAGES: 10
4747
TIMEOUT: 3000
48+
NEXT_PUBLIC_URL: https://stacker.news
4849

4950
#secrets: # Pass secrets from AWS Systems Manager (SSM) Parameter Store.
5051
# GITHUB_TOKEN: GITHUB_TOKEN # The key is the name of the environment variable, the value is the name of the SSM parameter.

docker-compose.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ services:
8787
restart: unless-stopped
8888
depends_on: *depends_on_app
8989
profiles:
90+
- images
9091
- capture
9192
healthcheck:
9293
<<: *healthcheck

lib/constants.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ export const BOOST_MAX = 1_000_000
2323
export const IMAGE_PIXELS_MAX = 35000000
2424
// backwards compatible with old media domain env var and precedence for docker url if set
2525
export const MEDIA_URL = process.env.MEDIA_URL_DOCKER || process.env.NEXT_PUBLIC_MEDIA_URL || `https://${process.env.NEXT_PUBLIC_MEDIA_DOMAIN}`
26+
export const PUBLIC_MEDIA_CHECK_URL = process.env.NEXT_PUBLIC_MEDIA_CHECK_URL
2627
export const AWS_S3_URL_REGEXP = new RegExp(`${process.env.NEXT_PUBLIC_MEDIA_URL || `https://${process.env.NEXT_PUBLIC_MEDIA_DOMAIN}`}/([0-9]+)`, 'g')
2728
export const UPLOAD_TYPES_ALLOW = [
2829
'image/gif',

0 commit comments

Comments
 (0)