Skip to content

Commit dca2e85

Browse files
committed
pdf_utils: send browser User-Agent on PDF downloads
Three different submissions failed at the download step because publisher servers reject the default python-requests User-Agent.

Confirmed cases:
- werbos.com → HTTP 465 without a browser UA, 200 OK with browser UA
- royalsocietypublishing.org → 403 (CF bot management; not fixable without session cookies, needs PDF upload)
- qwen.ai/blog → not a PDF, blog HTML; user error

Send a real Chrome User-Agent and Accept: application/pdf on every download. This unblocks werbos and the broader class of sites that filter on UA.

Royal Society and the Qwen blog are not bugs we can fix in our code:
- Royal Society needs an admin PDF-upload (already shipped)
- Qwen blog isn't a paper

Add a test asserting the User-Agent contains "Mozilla" and that the Accept header carries application/pdf.

Generated-by: Claude AI
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
1 parent cd83798 commit dca2e85

2 files changed

Lines changed: 45 additions & 3 deletions

File tree

pdf_utils.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,16 @@ def _normalize_pdf_url(url):
3434
return raw
3535

3636

37+
# Many publisher and academic servers reject the default python-requests
38+
# User-Agent (e.g. werbos.com returns 465, others return 403/451).
39+
# Sending a real browser UA fixes the majority of these without
40+
# affecting servers that don't care.
41+
_BROWSER_UA = (
42+
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
43+
"(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
44+
)
45+
46+
3747
def download_pdf(url, timeout=60):
3848
"""Download a PDF from a URL to a temporary file.
3949
@@ -49,8 +59,11 @@ def download_pdf(url, timeout=60):
4959
print(f"[PDF] Normalized {url} -> {resolved_url}", file=sys.stderr)
5060

5161
print(f"[PDF] Downloading {resolved_url}...", file=sys.stderr)
62+
headers = {"User-Agent": _BROWSER_UA, "Accept": "application/pdf,*/*"}
5263
try:
53-
resp = requests.get(resolved_url, timeout=timeout, stream=True)
64+
resp = requests.get(
65+
resolved_url, timeout=timeout, stream=True, headers=headers,
66+
)
5467
except requests.exceptions.SSLError as exc:
5568
# Many academic servers (university personal pages, preprint
5669
# mirrors) ship incomplete TLS chains that Python's certifi
@@ -66,6 +79,7 @@ def download_pdf(url, timeout=60):
6679
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
6780
resp = requests.get(
6881
resolved_url, timeout=timeout, stream=True, verify=False,
82+
headers=headers,
6983
)
7084
resp.raise_for_status()
7185

tests/test_pdf_utils.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,9 @@ def test_normalize_pdf_url_converts_openreview_forum_to_pdf():
5454
def test_download_pdf_normalizes_arxiv_abs_before_request(monkeypatch):
5555
seen = {}
5656

57-
def fake_get(url, timeout=60, stream=True):
57+
def fake_get(url, **kwargs):
5858
seen["url"] = url
59+
seen["headers"] = kwargs.get("headers") or {}
5960
return DummyResponse(url, b"%PDF-1.5\n1 0 obj\n")
6061

6162
monkeypatch.setattr("pdf_utils.requests.get", fake_get)
@@ -69,7 +70,7 @@ def fake_get(url, timeout=60, stream=True):
6970

7071

7172
def test_download_pdf_rejects_non_pdf_response(monkeypatch):
72-
def fake_get(url, timeout=60, stream=True):
73+
def fake_get(url, **kwargs):
7374
return DummyResponse(url, b"<!DOCTYPE html>", "text/html; charset=utf-8")
7475

7576
monkeypatch.setattr("pdf_utils.requests.get", fake_get)
@@ -78,6 +79,33 @@ def fake_get(url, timeout=60, stream=True):
7879
download_pdf("https://example.com/not-a-pdf")
7980

8081

82+
def test_download_pdf_sends_browser_user_agent(monkeypatch):
83+
"""Many publisher servers (werbos.com 465, others 403) reject
84+
the default python-requests User-Agent. We must send a real browser UA on
85+
every PDF fetch — not just when retrying after an error.
86+
"""
87+
seen = {}
88+
89+
def fake_get(url, **kwargs):
90+
seen["headers"] = kwargs.get("headers") or {}
91+
return DummyResponse(url, b"%PDF-1.5\nbody")
92+
93+
monkeypatch.setattr("pdf_utils.requests.get", fake_get)
94+
95+
path = download_pdf("https://example.com/paper.pdf")
96+
try:
97+
ua = seen["headers"].get("User-Agent", "")
98+
assert "Mozilla" in ua, (
99+
f"download_pdf must send a browser User-Agent; got {ua!r}"
100+
)
101+
# Servers that 403 without an Accept header for PDFs are also
102+
# common; we send Accept: application/pdf as a hint.
103+
accept = seen["headers"].get("Accept", "")
104+
assert "application/pdf" in accept
105+
finally:
106+
Path(path).unlink(missing_ok=True)
107+
108+
81109
def test_download_and_extract_reads_local_text_sources(tmp_path):
82110
src = tmp_path / "source.txt"
83111
src.write_text("Recovered fallback source text.\n", encoding="utf-8")

0 commit comments

Comments
 (0)