From ab62667969cb4212f7a7cb0a23b226c4148e363b Mon Sep 17 00:00:00 2001
From: sphinx <133899485+protosphinx@users.noreply.github.com>
Date: Thu, 30 Apr 2026 20:07:16 -0700
Subject: [PATCH] feat(v0.1): dataset fetch + sha256 verification + cache
 resolution
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- cache.py — `$PM_BENCH_CACHE` → `~/.cache/pm-bench/` with per-dataset
  paths; rejects synthetic and unknown formats
- fetch.py — `ensure_cached(dataset)` covers cached+match,
  cached+mismatch (loud HashMismatchError), cached+unpinned (returns
  actual hash), not-cached (auto-download if URL set, otherwise
  ManualFetchRequired with the precise landing URL + on-disk path).
  Streams in 1 MiB chunks; atomic .part-then-rename writes
- CLI: `pm-bench fetch <name> [--pin]` — prints status, emits a
  pasteable registry.yml sha256 patch when `--pin` is set against an
  unpinned-but-present cached file (the path the TOS-gated workflow
  takes)
- 13 new tests (test_cache.py, test_fetch.py); 37 total, ruff clean
- STATUS / GOALS / README updated: v0.1 marked partial — machinery
  shipped, per-dataset hash pins pending one-time manual downloads
---
 GOALS.md            |   6 +-
 README.md           |  25 +++++++--
 STATUS.md           |  95 +++++++++++++++++++++++---------
 pm_bench/cache.py   |  58 ++++++++++++++++++++
 pm_bench/cli.py     |  68 +++++++++++++++++++++++
 pm_bench/fetch.py   | 130 ++++++++++++++++++++++++++++++++++++++++++++
 tests/test_cache.py |  60 ++++++++++++++++++++
 tests/test_fetch.py |  92 +++++++++++++++++++++++++++++++
 8 files changed, 502 insertions(+), 32 deletions(-)
 create mode 100644 pm_bench/cache.py
 create mode 100644 pm_bench/fetch.py
 create mode 100644 tests/test_cache.py
 create mode 100644 tests/test_fetch.py
diff --git a/GOALS.md b/GOALS.md
index 4ed7ae6..8b88dae 100644
--- a/GOALS.md
+++ b/GOALS.md
@@ -5,11 +5,13 @@ Be the default benchmark for new process-mining methods. Within 18 months,
 ≥10 external papers report `pm-bench` numbers in their abstract.
 
 ## v0 success criteria
-- 7 datasets fetchable + hash-verified
+- 7 datasets fetchable + hash-verified — fetch/hash machinery shipped
+  (`pm-bench fetch <name> [--pin]`); per-dataset hash pins pending
+  the one-time TOS-gated downloads
 - 5 tasks with fixed scoring scripts (next-event ✅; remaining-time, outcome,
   conformance, bottleneck pending)
 - `gnn` runs end-to-end as the reference baseline (Markov reference ✅;
-  `gnn` integration pending v0.1 dataset machinery)
+  `gnn` integration pending the first pinned dataset)
 - End-to-end loop runs on `synthetic-toy` ✅ — split → prefixes →
   predict → score, covered by `tests/test_e2e.py`
 
diff --git a/README.md b/README.md
index 3071239..e369897 100644
--- a/README.md
+++ b/README.md
@@ -118,10 +118,22 @@ pm-bench score predictions.csv \
 
 The full loop (`split → prefixes → predict → score`) runs end-to-end on
 `synthetic-toy` today; it's covered by `tests/test_e2e.py` and locks
-the file formats the leaderboard depends on. BPI / Sepsis / Helpdesk
-will use the same commands once v0.1's fetch+cache machinery lands —
-4TU's interactive TOS makes the download itself a one-time manual
-step, but everything downstream is automated.
+the file formats the leaderboard depends on.
+
+For the public datasets, the fetch + hash machinery is in place:
+
+```bash
+pm-bench fetch bpi2020                    # auto-downloads if URL is set
+pm-bench fetch bpi2020 --pin              # after manual TOS-gated download,
+                                          # emits a registry.yml sha256 patch
+```
+
+`pm-bench fetch` resolves a cache directory (`$PM_BENCH_CACHE`, else
+`~/.cache/pm-bench/`), verifies the registry sha256 if pinned, and —
+for TOS-gated 4TU / Mendeley datasets — prints the precise landing URL
+and the on-disk path where the archive must be saved. The per-dataset
+hash pins are the last manual step before BPI / Sepsis / Helpdesk run
+through the same loop as `synthetic-toy`.
 
 The full pipeline:
 
@@ -204,7 +216,10 @@ honesty. The point of the benchmark is to make the comparison real.
 - [x] v0.0.1 — end-to-end loop on `synthetic-toy`: split → prefixes →
       predict (Markov) → score, with a smoke test that locks the file
       formats
-- [ ] v0.1 — fetch + cache + hash for all 7 datasets
+- [🟡] v0.1 — fetch + cache + hash for all 7 datasets. Machinery
+      shipped (`pm-bench fetch <name> [--pin]`, sha256 verification,
+      `$PM_BENCH_CACHE` resolution); per-dataset hash-pinning PRs
+      pending the one-time TOS-gated downloads from 4TU and Mendeley.
 - [ ] v0.2 — splits: next-event, remaining-time
 - [ ] v0.3 — scoring scripts for all 5 tasks
 - [ ] v0.4 — leaderboard CI + landing page
diff --git a/STATUS.md b/STATUS.md
index bc837ee..92518d0 100644
--- a/STATUS.md
+++ b/STATUS.md
@@ -4,11 +4,24 @@ _Last updated: 2026-04-30._
 
 ## Where we are
 
-Pre-v0. The end-to-end loop runs on the bundled `synthetic-toy`
-dataset; the seven public datasets are still pending v0.1's fetch +
-hash machinery.
+Pre-v0. Two pieces shipped on top of v0.0:
 
-A submission today looks like:
+1. The end-to-end loop runs on the bundled `synthetic-toy` dataset
+   (split → prefixes → predict → score; Markov reference baseline
+   gets top-1 0.976, top-3 1.000).
+2. The fetch + hash + cache machinery is in place. `pm-bench fetch
+   <name>` resolves a dataset to a local path, verifies the registry
+   sha256, and prints precise instructions for the TOS-gated download
+   step on 4TU / Mendeley. `--pin` emits the `registry.yml` patch a
+   contributor pastes into a PR after the manual download.
+
+What's still left in v0.1 is purely a per-dataset operational task: do
+the one-time download, run `--pin`, open seven small PRs to pin the
+hashes, then wire the XES parser to `_load_events` so `split`/
+`prefixes`/`predict` work on real BPI data. None of it requires
+further code design.
+
+A submission today on the bundled toy:
 
 ```bash
 pm-bench split synthetic-toy > split.json
@@ -16,15 +29,45 @@ pm-bench prefixes synthetic-toy --split split.json --out prefixes.csv
 pm-bench predict synthetic-toy --split split.json \
   --prefixes prefixes.csv --out predictions.csv --baseline markov
 pm-bench score predictions.csv --prefixes prefixes.csv --task next-event
-# → top1 0.976, top3 1.000 (Markov on synthetic-toy)
+# → top1 0.976, top3 1.000
 ```
 
-That sequence is the contract — it's what `tests/test_e2e.py` runs in
-CI, and it's what the leaderboard CI will run once datasets are pinned.
+The fetch flow on a TOS-gated dataset:
+
+```bash
+pm-bench fetch bpi2020
+# → bpi2020: no download_url (TOS-gated). Visit https://data.4tu.nl/...,
+#   accept the terms, and save the archive to ~/.cache/pm-bench/bpi2020.xes.gz.
+#   Then re-run `pm-bench fetch bpi2020 --pin` to compute the sha256.
+
+# (manual download + place in cache dir)
+
+pm-bench fetch bpi2020 --pin
+# → bpi2020: cached at ~/.cache/pm-bench/bpi2020.xes.gz (unpinned)
+#   sha256: <hex>
+#
+#   # paste under the matching dataset entry in pm_bench/registry.yml:
+#     - name: bpi2020
+#       sha256: <hex>
+```
 
 ## Recently shipped
 
-- **End-to-end loop on synthetic-toy** (`end-to-end-loop` branch).
+- **v0.1 fetch + hash machinery** (`dataset-fetch` branch).
+  - `pm_bench/cache.py` — cache root resolution
+    (`$PM_BENCH_CACHE` → `~/.cache/pm-bench/`), per-dataset path with
+    correct extension by format.
+  - `pm_bench/fetch.py` — `ensure_cached(dataset)` covers the four
+    cases: cached+match, cached+mismatch (loud failure),
+    cached+unpinned (returns actual hash), not-cached (auto-download
+    if URL set, otherwise raise `ManualFetchRequired`). Streams in
+    1 MiB chunks; atomic `.part`-then-rename writes; sha256 verified
+    against the registry pin.
+  - CLI `pm-bench fetch <name> [--pin]` — prints status, emits a
+    pasteable `registry.yml` patch when `--pin` is set.
+  - 13 new tests across `test_cache.py` and `test_fetch.py`. 37 total.
+- **End-to-end loop on synthetic-toy** (`end-to-end-loop` branch,
+  PR #2).
   - `pm_bench/prefixes.py` — extract prediction targets from a split,
     write/read CSV. Skips length-1 cases.
   - `pm_bench/predictions.py` — predictions CSV format
@@ -32,9 +75,7 @@ CI, and it's what the leaderboard CI will run once datasets are pinned.
   - `pm_bench/baselines/markov.py` — first-order Markov reference
     baseline. Trained on the train partition only; falls back to
     unigram for unseen last-activities.
-  - CLI gained `prefixes`, `predict`, `score`. The full
-    `split → prefixes → predict → score` loop now matches what the
-    README advertises.
+  - CLI gained `prefixes`, `predict`, `score`.
   - `tests/test_e2e.py` covers the loop end-to-end via the click
     runner; format changes will trip it.
 - **v0.0** (initial release): scaffold, registry, case-chrono split,
@@ -42,23 +83,27 @@ CI, and it's what the leaderboard CI will run once datasets are pinned.
 
 ## Next up
 
-- **v0.1 — dataset fetch + hash** for the seven public logs. The 4TU
-  portal needs interactive TOS acceptance per dataset, so the fetch
-  itself is a one-time manual step; the rest (cache → verify hash →
-  parse XES → run the same loop) is automated. This is the work that
-  unblocks every downstream milestone.
-- **`gnn` as the second reference baseline** once v0.1 lands. `gnn`'s
-  v0.5 milestone is symmetrical with this — it's been waiting for a
-  pinned dataset registry, which `pm-bench` is meant to provide.
+- **One-time dataset pinning.** Per dataset (BPI 2012/2017/2018/2019/
+  2020 collection, Sepsis, Helpdesk): accept the TOS, save to the
+  cache, run `pm-bench fetch <name> --pin`, open the registry PR.
+  This is the gate on every downstream milestone.
+- **XES parser wiring.** `_load_events` currently rejects everything
+  except `synthetic-toy`. Once a dataset is pinned, swap that branch
+  for a pm4py-backed XES read (move pm4py to `[bpi]` extras so the
+  base install stays light).
+- **`gnn` as the second reference baseline.** `gnn`'s v0.5 milestone
+  has been waiting for a pinned dataset registry, which `pm-bench`
+  will provide as soon as any single dataset is pinned.
 - Additional tasks beyond next-event (remaining-time, outcome,
   conformance, bottleneck). The split + prefixes machinery is shared;
   scoring is the per-task piece.
 
 ## Known gaps
 
-- No `pm-bench fetch` yet. README still hints at it; the install &
-  use section now shows the loop that actually works (synthetic-toy
-  only) so the doc and the CLI line up.
-- `predict` currently only knows `markov`. The `--baseline` flag is a
-  click choice so adding a second is a one-liner, but the second one
-  worth adding is `gnn`, which depends on v0.1.
+- The base install does not pull `pm4py`, so XES parsing isn't wired
+  yet. Adding a `[bpi]` extra is the right move when we pin the
+  first dataset — keeps `pip install pm-bench` fast for users who
+  only need scoring.
+- No leaderboard CI yet (v0.4). The file formats are stable, so this
+  is "wire up a workflow that runs `pm-bench score`" — orthogonal to
+  the dataset work.
diff --git a/pm_bench/cache.py b/pm_bench/cache.py
new file mode 100644
index 0000000..99b287b
--- /dev/null
+++ b/pm_bench/cache.py
@@ -0,0 +1,58 @@
+"""Local cache directory for downloaded event logs.
+
+Datasets land in `$PM_BENCH_CACHE` if set, else `~/.cache/pm-bench/`.
+We never write inside the install tree — the cache survives uninstalls
+and wheel rebuilds, and a single cache can be shared across virtualenvs.
+
+The on-disk layout is one file per dataset:
+
+    <cache_root>/<name>.<ext>
+
+where `<ext>` is `xes.gz` for XES logs (the canonical 4TU
+distribution form) and `csv` for CSV. The synthetic-toy dataset is
+generated on demand and never touches the cache.
+"""
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+from pm_bench.registry import Dataset
+
+
+def cache_root(override: str | None = None) -> Path:
+    """Return the cache root, creating it if needed.
+
+    Resolution order: explicit `override`, then `$PM_BENCH_CACHE`, then
+    `~/.cache/pm-bench/`. The directory is created if missing, so
+    callers don't have to. Empty `override` / env values are ignored.
+    """
+    if override:
+        root = Path(override).expanduser()
+    elif env := os.environ.get("PM_BENCH_CACHE"):
+        root = Path(env).expanduser()
+    else:
+        root = Path.home() / ".cache" / "pm-bench"
+    root.mkdir(parents=True, exist_ok=True)
+    return root
+
+
+_EXT_BY_FORMAT = {
+    "xes": "xes.gz",
+    "csv": "csv",
+}
+
+
+def cache_path(dataset: Dataset, override_root: str | None = None) -> Path:
+    """Return the on-disk path where this dataset's archive lives.
+
+    The cache root is created if missing, but we do not check whether
+    the file itself exists; callers should test `path.exists()` before
+    reading. Raises `ValueError` for synthetic or unknown formats.
+    """
+    if dataset.format == "synthetic":
+        raise ValueError(f"{dataset.name} is generated on demand, not cached")
+    ext = _EXT_BY_FORMAT.get(dataset.format)
+    if ext is None:
+        raise ValueError(f"unknown dataset format: {dataset.format}")
+    return cache_root(override_root) / f"{dataset.name}.{ext}"
diff --git a/pm_bench/cli.py b/pm_bench/cli.py
index 3af96e0..bf45ff7 100644
--- a/pm_bench/cli.py
+++ b/pm_bench/cli.py
@@ -8,6 +8,12 @@
 
 from pm_bench import _synth
 from pm_bench.baselines.markov import fit_markov, predict_markov
+from pm_bench.fetch import (
+    FetchError,
+    ManualFetchRequired,
+    ensure_cached,
+    sha256_file,
+)
 from pm_bench.predictions import read_predictions_csv, write_predictions_csv
 from pm_bench.prefixes import extract_prefixes, read_prefixes_csv, write_prefixes_csv
 from pm_bench.registry import get_dataset, load_registry
@@ -71,6 +77,68 @@ def info(name: str) -> None:
     )
 
 
+@main.command()
+@click.argument("name")
+@click.option(
+    "--pin",
+    is_flag=True,
+    default=False,
+    help="After locating the cached file, print a registry.yml patch with its sha256.",
+)
+def fetch(name: str, pin: bool) -> None:
+    """Make a dataset available locally and verify its hash.
+
+    Auto-downloads when `download_url` is set; otherwise prints
+    instructions for the manual TOS-gated download path (4TU / Mendeley).
+    """
+    try:
+        d = get_dataset(name)
+    except KeyError:
+        click.echo(f"unknown dataset: {name}", err=True)
+        sys.exit(1)
+
+    if d.format == "synthetic":
+        click.echo(f"{name}: generated on demand, no fetch needed")
+        return
+
+    try:
+        result = ensure_cached(d)
+    except ManualFetchRequired as exc:
+        # Only raised when the file was absent; exists() covers a just-placed file.
+        path = exc.expected_path
+        if path.exists():
+            actual = sha256_file(path)
+            click.echo(f"{name}: cached at {path}")
+            click.echo(f"  sha256: {actual}")
+            if pin:
+                _print_pin_patch(name, actual)
+            elif d.sha256 is None:
+                click.echo("  (registry hash unset — re-run with --pin to emit a patch)")
+            return
+        click.echo(str(exc), err=True)
+        sys.exit(2)
+    except (FetchError, OSError, ValueError) as exc:
+        # FetchError messages already name the dataset; URL/IO/format errors don't.
+        prefix = "" if isinstance(exc, FetchError) else f"{name}: "
+        click.echo(f"{prefix}{exc}", err=True)
+        sys.exit(2)
+
+    state = "downloaded" if result.downloaded else "cached"
+    pinned = "verified" if result.pinned else "unpinned"
+    click.echo(f"{name}: {state} at {result.path} ({pinned})")
+    click.echo(f"  sha256: {result.sha256}")
+    if pin and not result.pinned:
+        _print_pin_patch(name, result.sha256)
+
+
+def _print_pin_patch(name: str, digest: str) -> None:
+    """Print a YAML snippet the user can paste into registry.yml."""
+    click.echo("")
+    click.echo("# paste under the matching dataset entry in pm_bench/registry.yml:")
+    click.echo(f"  - name: {name}")
+    click.echo(f"    sha256: {digest}")
+
+
 @main.command()
 @click.argument("name")
 @click.option("--task", default="next-event", show_default=True)
diff --git a/pm_bench/fetch.py b/pm_bench/fetch.py
new file mode 100644
index 0000000..7ab4a45
--- /dev/null
+++ b/pm_bench/fetch.py
@@ -0,0 +1,130 @@
+"""Dataset fetch + hash verification.
+
+The four cases that matter:
+
+1. **Cached + hash matches.** Nothing to do — return the cached path.
+2. **Cached + hash mismatch.** Loud failure: someone modified the
+   archive on disk, or the registry hash is wrong. Either way we
+   refuse to proceed silently.
+3. **Cached + registry hash unset.** First-time pin path. The caller
+   can compute the hash via `--pin` and PR a registry update.
+4. **Not cached.** If the dataset has a `download_url` we fetch it
+   into the cache and verify the hash. If not (the BPI / Sepsis case,
+   gated behind 4TU's interactive TOS) we raise `ManualFetchRequired`
+   with precise manual-fetch instructions; the CLI exits non-zero.
+
+We deliberately do not auto-write the registry. Hash pins must land
+via PR so the provenance is reviewable.
+"""
+from __future__ import annotations
+
+import hashlib
+import urllib.request
+from dataclasses import dataclass
+from pathlib import Path
+
+from pm_bench.cache import cache_path
+from pm_bench.registry import Dataset
+
+CHUNK_BYTES = 1 << 20  # 1 MiB streaming reads
+
+
+@dataclass(frozen=True)
+class FetchResult:
+    """Outcome of an `ensure_cached` call."""
+
+    dataset: str
+    path: Path
+    sha256: str
+    pinned: bool  # True iff registry already had a hash and it matched
+    downloaded: bool  # True iff we just fetched it (not present before)
+
+
+class FetchError(RuntimeError):
+    """Raised when a dataset can't be made available locally."""
+
+
+class HashMismatchError(FetchError):
+    """Cached file is on disk but its hash doesn't match the registry."""
+
+
+class ManualFetchRequired(FetchError):
+    """Dataset has no `download_url` — user must download + place manually."""
+
+    def __init__(self, dataset: Dataset, expected_path: Path):
+        self.dataset = dataset
+        self.expected_path = expected_path
+        super().__init__(
+            f"{dataset.name}: no download_url (TOS-gated). Visit "
+            f"{dataset.landing_url or 'the dataset landing page'}, accept the terms, and "
+            f"save the archive to {expected_path}. Then re-run `pm-bench fetch "
+            f"{dataset.name} --pin` to compute the sha256 and PR it into registry.yml."
+        )
+
+
+def sha256_file(path: Path) -> str:
+    """Stream a file and return its hex sha256."""
+    h = hashlib.sha256()
+    with open(path, "rb") as f:
+        while chunk := f.read(CHUNK_BYTES):
+            h.update(chunk)
+    return h.hexdigest()
+
+
+def _download(url: str, dest: Path) -> None:
+    """Stream a URL into `dest` atomically (write to .part, rename)."""
+    tmp = dest.with_suffix(dest.suffix + ".part")
+    tmp.parent.mkdir(parents=True, exist_ok=True)
+    with urllib.request.urlopen(url) as resp, open(tmp, "wb") as out:  # noqa: S310
+        while chunk := resp.read(CHUNK_BYTES):
+            out.write(chunk)
+    tmp.replace(dest)
+
+
+def ensure_cached(dataset: Dataset, override_root: str | None = None) -> FetchResult:
+    """Make `dataset` available on disk and return a `FetchResult`.
+
+    Verifies the sha256 when the registry pins one. Synthetic datasets
+    are rejected — they're generated on demand and have no on-disk form.
+    """
+    if dataset.format == "synthetic":
+        raise FetchError(f"{dataset.name} is generated on demand, not cached")
+
+    path = cache_path(dataset, override_root=override_root)
+    downloaded = False
+
+    if not path.exists():
+        if not dataset.download_url:
+            raise ManualFetchRequired(dataset, path)
+        _download(dataset.download_url, path)
+        downloaded = True
+
+    actual = sha256_file(path)
+
+    if dataset.sha256 is None:
+        # Registry hash unset; nothing to verify against. Caller decides
+        # whether to pin (--pin) or proceed unverified.
+        return FetchResult(
+            dataset=dataset.name,
+            path=path,
+            sha256=actual,
+            pinned=False,
+            downloaded=downloaded,
+        )
+
+    if actual != dataset.sha256:
+        raise HashMismatchError(
+            f"{dataset.name}: sha256 mismatch at {path}\n"
+            f"  expected: {dataset.sha256}\n"
+            f"  actual:   {actual}\n"
+            f"Either the archive is corrupt or the pinned hash is wrong. "
+            f"Delete the cached file to re-fetch, or open a PR to update the pin."
+        )
+
+    return FetchResult(
+        dataset=dataset.name,
+        path=path,
+        sha256=actual,
+        pinned=True,
+        downloaded=downloaded,
+    )
diff --git a/tests/test_cache.py b/tests/test_cache.py
new file mode 100644
index 0000000..27a19d2
--- /dev/null
+++ b/tests/test_cache.py
@@ -0,0 +1,60 @@
+
+import pytest
+
+from pm_bench.cache import cache_path, cache_root
+from pm_bench.registry import Dataset
+
+
+def _ds(name: str = "demo", fmt: str = "xes") -> Dataset:
+    return Dataset(
+        name=name,
+        title="demo",
+        cases=10,
+        events=50,
+        landing_url=None,
+        download_url=None,
+        sha256=None,
+        license="CC BY 4.0",
+        format=fmt,
+        bundled=False,
+    )
+
+
+def test_cache_root_respects_explicit_override(tmp_path) -> None:
+    root = cache_root(str(tmp_path / "explicit"))
+    assert root == tmp_path / "explicit"
+    assert root.is_dir()
+
+
+def test_cache_root_respects_env_var(tmp_path, monkeypatch) -> None:
+    target = tmp_path / "env"
+    monkeypatch.setenv("PM_BENCH_CACHE", str(target))
+    assert cache_root() == target
+    assert target.is_dir()
+
+
+def test_cache_root_default_when_unset(tmp_path, monkeypatch) -> None:
+    monkeypatch.delenv("PM_BENCH_CACHE", raising=False)
+    monkeypatch.setenv("HOME", str(tmp_path))
+    root = cache_root()
+    assert root == tmp_path / ".cache" / "pm-bench"
+
+
+def test_cache_path_uses_xes_gz_for_xes(tmp_path) -> None:
+    p = cache_path(_ds(fmt="xes"), override_root=str(tmp_path))
+    assert p.name == "demo.xes.gz"
+
+
+def test_cache_path_uses_csv_for_csv(tmp_path) -> None:
+    p = cache_path(_ds(fmt="csv"), override_root=str(tmp_path))
+    assert p.name == "demo.csv"
+
+
+def test_cache_path_rejects_synthetic(tmp_path) -> None:
+    with pytest.raises(ValueError, match="generated on demand"):
+        cache_path(_ds(fmt="synthetic"), override_root=str(tmp_path))
+
+
+def test_cache_path_rejects_unknown_format(tmp_path) -> None:
+    with pytest.raises(ValueError, match="unknown dataset format"):
+        cache_path(_ds(fmt="parquet"), override_root=str(tmp_path))
diff --git a/tests/test_fetch.py b/tests/test_fetch.py
new file mode 100644
index 0000000..51142f6
--- /dev/null
+++ b/tests/test_fetch.py
@@ -0,0 +1,92 @@
+import hashlib
+
+import pytest
+
+from pm_bench.cache import cache_path
+from pm_bench.fetch import (
+    HashMismatchError,
+    ManualFetchRequired,
+    ensure_cached,
+    sha256_file,
+)
+from pm_bench.registry import Dataset
+
+
+def _ds(*, sha256: str | None = None, download_url: str | None = None) -> Dataset:
+    return Dataset(
+        name="demo",
+        title="demo",
+        cases=10,
+        events=50,
+        landing_url="https://example.invalid/landing",
+        download_url=download_url,
+        sha256=sha256,
+        license="CC BY 4.0",
+        format="xes",
+        bundled=False,
+    )
+
+
+def _seed_cache(root, dataset: Dataset, payload: bytes) -> str:
+    """Plant a fake archive in the cache; return its hex digest."""
+    p = cache_path(dataset, override_root=str(root))
+    p.write_bytes(payload)
+    return hashlib.sha256(payload).hexdigest()
+
+
+def test_sha256_file_matches_hashlib(tmp_path) -> None:
+    f = tmp_path / "x"
+    f.write_bytes(b"hello world")
+    assert sha256_file(f) == hashlib.sha256(b"hello world").hexdigest()
+
+
+def test_ensure_cached_no_url_no_file_raises_manual(tmp_path) -> None:
+    d = _ds()
+    with pytest.raises(ManualFetchRequired) as exc:
+        ensure_cached(d, override_root=str(tmp_path))
+    assert exc.value.dataset is d
+    assert exc.value.expected_path.name == "demo.xes.gz"
+
+
+def test_ensure_cached_unpinned_returns_actual_hash(tmp_path) -> None:
+    d = _ds(sha256=None)
+    digest = _seed_cache(tmp_path, d, b"fake archive contents")
+    r = ensure_cached(d, override_root=str(tmp_path))
+    assert r.sha256 == digest
+    assert r.pinned is False
+    assert r.downloaded is False
+
+
+def test_ensure_cached_pinned_match_returns_pinned(tmp_path) -> None:
+    payload = b"fake archive contents"
+    digest = hashlib.sha256(payload).hexdigest()
+    d = _ds(sha256=digest)
+    _seed_cache(tmp_path, d, payload)
+    r = ensure_cached(d, override_root=str(tmp_path))
+    assert r.pinned is True
+    assert r.sha256 == digest
+
+
+def test_ensure_cached_pinned_mismatch_raises(tmp_path) -> None:
+    d = _ds(sha256="0" * 64)
+    _seed_cache(tmp_path, d, b"different contents")
+    with pytest.raises(HashMismatchError) as exc:
+        ensure_cached(d, override_root=str(tmp_path))
+    assert "expected: " + ("0" * 64) in str(exc.value)
+
+
+def test_ensure_cached_synthetic_rejected(tmp_path) -> None:
+    d = Dataset(
+        name="syn",
+        title="synthetic",
+        cases=1,
+        events=1,
+        landing_url=None,
+        download_url=None,
+        sha256=None,
+        license="MIT",
+        format="synthetic",
+        bundled=True,
+    )
+    with pytest.raises(Exception, match="generated on demand"):
+        ensure_cached(d, override_root=str(tmp_path))