pdf_utils: send browser User-Agent on PDF downloads

mcgrof · mcgrof · commit dca2e85058b4 · 2026-04-29T05:45:23.000-07:00
Three different submissions failed at the download step
because publisher servers reject the default python-requests
User-Agent that requests.get() sends. Confirmed cases:
  - werbos.com → HTTP 465 without UA, 200 OK with browser UA
  - royalsocietypublishing.org → 403 (CF bot management;
    not fixable without session cookies, needs PDF upload)
  - qwen.ai/blog → not a PDF, blog HTML; user error

Send a real Chrome User-Agent and Accept: application/pdf on
every download. This unblocks werbos and the broader class of
sites that filter on UA. The Royal Society and Qwen blog
failures are not bugs we can fix in the downloader:
  - Royal Society needs an admin PDF-upload (already shipped)
  - Qwen blog isn't a paper

Add a test asserting the User-Agent contains "Mozilla" and
that the Accept header carries application/pdf.

Generated-by: Claude AI
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
diff --git a/pdf_utils.py b/pdf_utils.py
@@ -34,6 +34,16 @@ def _normalize_pdf_url(url):
     return raw
 
 
+# Many publisher and academic servers reject Python's default
+# User-Agent (e.g. werbos.com returns 465, others return 403/451).
+# Sending a real browser UA fixes the majority of these without
+# affecting servers that don't care.
+_BROWSER_UA = (
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
+    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+)
+
+
 def download_pdf(url, timeout=60):
     """Download a PDF from a URL to a temporary file.
 
@@ -49,8 +59,11 @@ def download_pdf(url, timeout=60):
         print(f"[PDF] Normalized {url} -> {resolved_url}", file=sys.stderr)
 
     print(f"[PDF] Downloading {resolved_url}...", file=sys.stderr)
+    headers = {"User-Agent": _BROWSER_UA, "Accept": "application/pdf,*/*"}
     try:
-        resp = requests.get(resolved_url, timeout=timeout, stream=True)
+        resp = requests.get(
+            resolved_url, timeout=timeout, stream=True, headers=headers,
+        )
     except requests.exceptions.SSLError as exc:
         # Many academic servers (university personal pages, preprint
         # mirrors) ship incomplete TLS chains that Python's certifi
@@ -66,6 +79,7 @@ def download_pdf(url, timeout=60):
         urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
         resp = requests.get(
             resolved_url, timeout=timeout, stream=True, verify=False,
+            headers=headers,
         )
     resp.raise_for_status()
 
diff --git a/tests/test_pdf_utils.py b/tests/test_pdf_utils.py
@@ -54,8 +54,9 @@ def test_normalize_pdf_url_converts_openreview_forum_to_pdf():
 def test_download_pdf_normalizes_arxiv_abs_before_request(monkeypatch):
     seen = {}
 
-    def fake_get(url, timeout=60, stream=True):
+    def fake_get(url, **kwargs):
         seen["url"] = url
+        seen["headers"] = kwargs.get("headers") or {}
         return DummyResponse(url, b"%PDF-1.5\n1 0 obj\n")
 
     monkeypatch.setattr("pdf_utils.requests.get", fake_get)
@@ -69,7 +70,7 @@ def fake_get(url, timeout=60, stream=True):
 
 
 def test_download_pdf_rejects_non_pdf_response(monkeypatch):
-    def fake_get(url, timeout=60, stream=True):
+    def fake_get(url, **kwargs):
         return DummyResponse(url, b"<!DOCTYPE html>", "text/html; charset=utf-8")
 
     monkeypatch.setattr("pdf_utils.requests.get", fake_get)
@@ -78,6 +79,33 @@ def fake_get(url, timeout=60, stream=True):
         download_pdf("https://example.com/not-a-pdf")
 
 
+def test_download_pdf_sends_browser_user_agent(monkeypatch):
+    """Many publisher servers (werbos.com 465, others 403) reject
+    Python's default urllib UA. We must send a real browser UA on
+    every PDF fetch — not just when retrying after an error.
+    """
+    seen = {}
+
+    def fake_get(url, **kwargs):
+        seen["headers"] = kwargs.get("headers") or {}
+        return DummyResponse(url, b"%PDF-1.5\nbody")
+
+    monkeypatch.setattr("pdf_utils.requests.get", fake_get)
+
+    path = download_pdf("https://example.com/paper.pdf")
+    try:
+        ua = seen["headers"].get("User-Agent", "")
+        assert "Mozilla" in ua, (
+            f"download_pdf must send a browser User-Agent; got {ua!r}"
+        )
+        # Servers that 403 without an Accept header for PDFs are also
+        # common; we send Accept: application/pdf as a hint.
+        accept = seen["headers"].get("Accept", "")
+        assert "application/pdf" in accept
+    finally:
+        Path(path).unlink(missing_ok=True)
+
+
 def test_download_and_extract_reads_local_text_sources(tmp_path):
     src = tmp_path / "source.txt"
     src.write_text("Recovered fallback source text.\n", encoding="utf-8")