From ab62667969cb4212f7a7cb0a23b226c4148e363b Mon Sep 17 00:00:00 2001 From: sphinx <133899485+protosphinx@users.noreply.github.com> Date: Thu, 30 Apr 2026 20:07:16 -0700 Subject: [PATCH] feat(v0.1): dataset fetch + sha256 verification + cache resolution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - cache.py — `$PM_BENCH_CACHE` → `~/.cache/pm-bench/` with per-dataset paths; rejects synthetic and unknown formats - fetch.py — `ensure_cached(dataset)` covers cached+match, cached+mismatch (loud HashMismatchError), cached+unpinned (returns actual hash), not-cached (auto-download if URL set, otherwise ManualFetchRequired with the precise landing URL + on-disk path). Streams in 1 MiB chunks; atomic .part-then-rename writes - CLI: `pm-bench fetch [--pin]` — prints status, emits a pasteable registry.yml sha256 patch when `--pin` is set against an unpinned-but-present cached file (the path the TOS-gated workflow takes) - 13 new tests (test_cache.py, test_fetch.py); 37 total, ruff clean - STATUS / GOALS / README updated: v0.1 marked partial — machinery shipped, per-dataset hash pins pending one-time manual downloads --- GOALS.md | 6 +- README.md | 25 +++++++-- STATUS.md | 95 +++++++++++++++++++++++--------- pm_bench/cache.py | 58 ++++++++++++++++++++ pm_bench/cli.py | 68 +++++++++++++++++++++++ pm_bench/fetch.py | 130 ++++++++++++++++++++++++++++++++++++++++++++ tests/test_cache.py | 60 ++++++++++++++++++++ tests/test_fetch.py | 92 +++++++++++++++++++++++++++++++ 8 files changed, 502 insertions(+), 32 deletions(-) create mode 100644 pm_bench/cache.py create mode 100644 pm_bench/fetch.py create mode 100644 tests/test_cache.py create mode 100644 tests/test_fetch.py diff --git a/GOALS.md b/GOALS.md index 4ed7ae6..8b88dae 100644 --- a/GOALS.md +++ b/GOALS.md @@ -5,11 +5,13 @@ Be the default benchmark for new process-mining methods. Within 18 months, ≥10 external papers report `pm-bench` numbers in their abstract. 
## v0 success criteria -- 7 datasets fetchable + hash-verified +- 7 datasets fetchable + hash-verified — fetch/hash machinery shipped + (`pm-bench fetch [--pin]`); per-dataset hash pins pending + the one-time TOS-gated downloads - 5 tasks with fixed scoring scripts (next-event ✅; remaining-time, outcome, conformance, bottleneck pending) - `gnn` runs end-to-end as the reference baseline (Markov reference ✅; - `gnn` integration pending v0.1 dataset machinery) + `gnn` integration pending the first pinned dataset) - End-to-end loop runs on `synthetic-toy` ✅ — split → prefixes → predict → score, covered by `tests/test_e2e.py` diff --git a/README.md b/README.md index 3071239..e369897 100644 --- a/README.md +++ b/README.md @@ -118,10 +118,22 @@ pm-bench score predictions.csv \ The full loop (`split → prefixes → predict → score`) runs end-to-end on `synthetic-toy` today; it's covered by `tests/test_e2e.py` and locks -the file formats the leaderboard depends on. BPI / Sepsis / Helpdesk -will use the same commands once v0.1's fetch+cache machinery lands — -4TU's interactive TOS makes the download itself a one-time manual -step, but everything downstream is automated. +the file formats the leaderboard depends on. + +For the public datasets, the fetch + hash machinery is in place: + +```bash +pm-bench fetch bpi2020 # auto-downloads if URL is set +pm-bench fetch bpi2020 --pin # after manual TOS-gated download, + # emits a registry.yml sha256 patch +``` + +`pm-bench fetch` resolves a cache directory (`$PM_BENCH_CACHE`, else +`~/.cache/pm-bench/`), verifies the registry sha256 if pinned, and — +for TOS-gated 4TU / Mendeley datasets — prints the precise landing URL +and on-disk path you need to fill in. The per-dataset hash pins are the +last manual step before BPI / Sepsis / Helpdesk run through the same +loop as `synthetic-toy`. The full pipeline: @@ -204,7 +216,10 @@ honesty. The point of the benchmark is to make the comparison real. 
- [x] v0.0.1 — end-to-end loop on `synthetic-toy`: split → prefixes → predict (Markov) → score, with a smoke test that locks the file formats -- [ ] v0.1 — fetch + cache + hash for all 7 datasets +- [🟡] v0.1 — fetch + cache + hash for all 7 datasets. Machinery + shipped (`pm-bench fetch [--pin]`, sha256 verification, + `$PM_BENCH_CACHE` resolution); per-dataset hash-pinning PRs + pending the one-time TOS-gated downloads from 4TU and Mendeley. - [ ] v0.2 — splits: next-event, remaining-time - [ ] v0.3 — scoring scripts for all 5 tasks - [ ] v0.4 — leaderboard CI + landing page diff --git a/STATUS.md b/STATUS.md index bc837ee..92518d0 100644 --- a/STATUS.md +++ b/STATUS.md @@ -4,11 +4,24 @@ _Last updated: 2026-04-30._ ## Where we are -Pre-v0. The end-to-end loop runs on the bundled `synthetic-toy` -dataset; the seven public datasets are still pending v0.1's fetch + -hash machinery. +Pre-v0. Two pieces shipped on top of v0.0: -A submission today looks like: +1. The end-to-end loop runs on the bundled `synthetic-toy` dataset + (split → prefixes → predict → score; Markov reference baseline + gets top-1 0.976, top-3 1.000). +2. The fetch + hash + cache machinery is in place. `pm-bench fetch + <name>` resolves a dataset to a local path, verifies the registry + sha256, and prints precise instructions for the TOS-gated download + step on 4TU / Mendeley. `--pin` emits the `registry.yml` patch a + contributor pastes into a PR after the manual download. + +What's still left in v0.1 is purely a per-dataset operational task: do +the one-time download, run `--pin`, open seven small PRs to pin the +hashes, then wire the XES parser to `_load_events` so `split`/ +`prefixes`/`predict` work on real BPI data. None of it requires +further code design. 
+ +A submission today on the bundled toy: ```bash pm-bench split synthetic-toy > split.json @@ -16,15 +29,45 @@ pm-bench prefixes synthetic-toy --split split.json --out prefixes.csv pm-bench predict synthetic-toy --split split.json \ --prefixes prefixes.csv --out predictions.csv --baseline markov pm-bench score predictions.csv --prefixes prefixes.csv --task next-event -# → top1 0.976, top3 1.000 (Markov on synthetic-toy) +# → top1 0.976, top3 1.000 ``` -That sequence is the contract — it's what `tests/test_e2e.py` runs in -CI, and it's what the leaderboard CI will run once datasets are pinned. +The fetch flow on a TOS-gated dataset: + +```bash +pm-bench fetch bpi2020 +# → bpi2020: no download_url (TOS-gated). Visit https://data.4tu.nl/..., +# accept the terms, and save the archive to ~/.cache/pm-bench/bpi2020.xes.gz. +# Then re-run `pm-bench fetch bpi2020 --pin` to compute the sha256. + +# (manual download + place in cache dir) + +pm-bench fetch bpi2020 --pin +# → bpi2020: cached at ~/.cache/pm-bench/bpi2020.xes.gz (unpinned) +# sha256: <sha256-hex> +# +# # paste under the matching dataset entry in pm_bench/registry.yml: +# - name: bpi2020 +# sha256: <sha256-hex> +``` ## Recently shipped -- **End-to-end loop on synthetic-toy** (`end-to-end-loop` branch). +- **v0.1 fetch + hash machinery** (`dataset-fetch` branch). + - `pm_bench/cache.py` — cache root resolution + (`$PM_BENCH_CACHE` → `~/.cache/pm-bench/`), per-dataset path with + correct extension by format. + - `pm_bench/fetch.py` — `ensure_cached(dataset)` covers the four + cases: cached+match, cached+mismatch (loud failure), + cached+unpinned (returns actual hash), not-cached (auto-download + if URL set, otherwise raise `ManualFetchRequired`). Streams in + 1 MiB chunks; atomic `.part`-then-rename writes; sha256 verified + against the registry pin. + - CLI `pm-bench fetch [--pin]` — prints status, emits a + pasteable `registry.yml` patch when `--pin` is set. + - 13 new tests across `test_cache.py` and `test_fetch.py`. 37 total. 
+- **End-to-end loop on synthetic-toy** (`end-to-end-loop` branch, + PR #2). - `pm_bench/prefixes.py` — extract prediction targets from a split, write/read CSV. Skips length-1 cases. - `pm_bench/predictions.py` — predictions CSV format @@ -32,9 +75,7 @@ CI, and it's what the leaderboard CI will run once datasets are pinned. - `pm_bench/baselines/markov.py` — first-order Markov reference baseline. Trained on the train partition only; falls back to unigram for unseen last-activities. - - CLI gained `prefixes`, `predict`, `score`. The full - `split → prefixes → predict → score` loop now matches what the - README advertises. + - CLI gained `prefixes`, `predict`, `score`. - `tests/test_e2e.py` covers the loop end-to-end via the click runner; format changes will trip it. - **v0.0** (initial release): scaffold, registry, case-chrono split, @@ -42,23 +83,27 @@ CI, and it's what the leaderboard CI will run once datasets are pinned. ## Next up -- **v0.1 — dataset fetch + hash** for the seven public logs. The 4TU - portal needs interactive TOS acceptance per dataset, so the fetch - itself is a one-time manual step; the rest (cache → verify hash → - parse XES → run the same loop) is automated. This is the work that - unblocks every downstream milestone. -- **`gnn` as the second reference baseline** once v0.1 lands. `gnn`'s - v0.5 milestone is symmetrical with this — it's been waiting for a - pinned dataset registry, which `pm-bench` is meant to provide. +- **One-time dataset pinning.** Per dataset (BPI 2012/2017/2018/2019/ + 2020 collection, Sepsis, Helpdesk): accept the TOS, save to the + cache, run `pm-bench fetch <name> --pin`, open the registry PR. + This is the gate on every downstream milestone. +- **XES parser wiring.** `_load_events` currently rejects everything + except `synthetic-toy`. Once a dataset is pinned, swap that branch + for a pm4py-backed XES read (move pm4py to `[bpi]` extras so the + base install stays light). 
+- **`gnn` as the second reference baseline.** `gnn`'s v0.5 milestone + has been waiting for a pinned dataset registry, which `pm-bench` + now provides the moment any single dataset is pinned. - Additional tasks beyond next-event (remaining-time, outcome, conformance, bottleneck). The split + prefixes machinery is shared; scoring is the per-task piece. ## Known gaps -- No `pm-bench fetch` yet. README still hints at it; the install & - use section now shows the loop that actually works (synthetic-toy - only) so the doc and the CLI line up. -- `predict` currently only knows `markov`. The `--baseline` flag is a - click choice so adding a second is a one-liner, but the second one - worth adding is `gnn`, which depends on v0.1. +- The base install does not pull `pm4py`, so XES parsing isn't wired + yet. Adding a `[bpi]` extra is the right move when we pin the + first dataset — keeps `pip install pm-bench` fast for users who + only need scoring. +- No leaderboard CI yet (v0.4). The file formats are stable, so this + is "wire up a workflow that runs `pm-bench score`" — orthogonal to + the dataset work. diff --git a/pm_bench/cache.py b/pm_bench/cache.py new file mode 100644 index 0000000..99b287b --- /dev/null +++ b/pm_bench/cache.py @@ -0,0 +1,58 @@ +"""Local cache directory for downloaded event logs. + +Datasets land in `$PM_BENCH_CACHE` if set, else `~/.cache/pm-bench/`. +We never write inside the install tree — the cache survives uninstalls +and wheel rebuilds, and a single cache can be shared across virtualenvs. + +The on-disk layout is one file per dataset: + + <cache_root>/<name>.<ext> + +where `<ext>` is `xes.gz` for XES logs (the canonical 4TU +distribution form) and `csv` for CSV. The synthetic-toy +dataset is generated on demand and never touches the cache. 
+""" +from __future__ import annotations + +import os +from pathlib import Path + +from pm_bench.registry import Dataset + + +def cache_root(override: str | None = None) -> Path: + """Return the cache root, creating it if needed. + + Resolution order: explicit `override`, then `$PM_BENCH_CACHE`, then + `~/.cache/pm-bench/`. The directory is created on first call so + callers don't have to. + """ + if override: + root = Path(override).expanduser() + elif env := os.environ.get("PM_BENCH_CACHE"): + root = Path(env).expanduser() + else: + root = Path.home() / ".cache" / "pm-bench" + root.mkdir(parents=True, exist_ok=True) + return root + + +_EXT_BY_FORMAT = { + "xes": "xes.gz", + "csv": "csv", +} + + +def cache_path(dataset: Dataset, override_root: str | None = None) -> Path: + """Return the on-disk path where this dataset's archive lives. + + The path is purely a function of `(cache_root, name, format)`; we + do not check whether the file actually exists. Callers should test + `path.exists()` before reading. 
+ """ + if dataset.format == "synthetic": + raise ValueError(f"{dataset.name} is generated on demand, not cached") + ext = _EXT_BY_FORMAT.get(dataset.format) + if ext is None: + raise ValueError(f"unknown dataset format: {dataset.format}") + return cache_root(override_root) / f"{dataset.name}.{ext}" diff --git a/pm_bench/cli.py b/pm_bench/cli.py index 3af96e0..bf45ff7 100644 --- a/pm_bench/cli.py +++ b/pm_bench/cli.py @@ -8,6 +8,12 @@ from pm_bench import _synth from pm_bench.baselines.markov import fit_markov, predict_markov +from pm_bench.fetch import ( + FetchError, + ManualFetchRequired, + ensure_cached, + sha256_file, +) from pm_bench.predictions import read_predictions_csv, write_predictions_csv from pm_bench.prefixes import extract_prefixes, read_prefixes_csv, write_prefixes_csv from pm_bench.registry import get_dataset, load_registry @@ -71,6 +77,68 @@ def info(name: str) -> None: ) +@main.command() +@click.argument("name") +@click.option( + "--pin", + is_flag=True, + default=False, + help="After locating the cached file, print a registry.yml patch with its sha256.", +) +def fetch(name: str, pin: bool) -> None: + """Make a dataset available locally and verify its hash. + + Auto-downloads when `download_url` is set; otherwise prints + instructions for the manual TOS-gated download path (4TU / Mendeley). + """ + try: + d = get_dataset(name) + except KeyError: + click.echo(f"unknown dataset: {name}", err=True) + sys.exit(1) + + if d.format == "synthetic": + click.echo(f"{name}: generated on demand, no fetch needed") + return + + try: + result = ensure_cached(d) + except ManualFetchRequired as exc: + # Special-cased only so we can also handle --pin against a file the + # user just placed by hand. If the file is now there, recurse via + # ensure_cached; otherwise print the instructions and bail. 
+ path = exc.expected_path + if path.exists(): + actual = sha256_file(path) + click.echo(f"{name}: cached at {path}") + click.echo(f" sha256: {actual}") + if pin: + _print_pin_patch(name, actual) + elif d.sha256 is None: + click.echo(" (registry hash unset — re-run with --pin to emit a patch)") + return + click.echo(str(exc), err=True) + sys.exit(2) + except FetchError as exc: + click.echo(f"{name}: {exc}", err=True) + sys.exit(2) + + state = "downloaded" if result.downloaded else "cached" + pinned = "verified" if result.pinned else "unpinned" + click.echo(f"{name}: {state} at {result.path} ({pinned})") + click.echo(f" sha256: {result.sha256}") + if pin and not result.pinned: + _print_pin_patch(name, result.sha256) + + +def _print_pin_patch(name: str, digest: str) -> None: + """Print a YAML snippet the user can paste into registry.yml.""" + click.echo("") + click.echo("# paste under the matching dataset entry in pm_bench/registry.yml:") + click.echo(f" - name: {name}") + click.echo(f" sha256: {digest}") + + @main.command() @click.argument("name") @click.option("--task", default="next-event", show_default=True) diff --git a/pm_bench/fetch.py b/pm_bench/fetch.py new file mode 100644 index 0000000..7ab4a45 --- /dev/null +++ b/pm_bench/fetch.py @@ -0,0 +1,130 @@ +"""Dataset fetch + hash verification. + +The four cases that matter: + +1. **Cached + hash matches.** Nothing to do — return the cached path. +2. **Cached + hash mismatch.** Loud failure: someone modified the + archive on disk, or the registry hash is wrong. Either way we + refuse to proceed silently. +3. **Cached + registry hash unset.** First-time pin path. The caller + can compute the hash via `--pin` and PR a registry update. +4. **Not cached.** If the dataset has a `download_url` we fetch it, + verify the hash, and cache. If not (the BPI / Sepsis case, gated + behind 4TU's interactive TOS) we print precise manual-fetch + instructions and exit non-zero. + +We deliberately do not auto-write the registry. 
Hash pins must land +via PR so the provenance is reviewable. +""" +from __future__ import annotations + +import hashlib +import urllib.request +from dataclasses import dataclass +from pathlib import Path + +from pm_bench.cache import cache_path +from pm_bench.registry import Dataset + +CHUNK_BYTES = 1 << 20 # 1 MiB streaming reads + + +@dataclass(frozen=True) +class FetchResult: + """Outcome of a `ensure_cached` call.""" + + dataset: str + path: Path + sha256: str + pinned: bool # True iff registry already had a hash and it matched + downloaded: bool # True iff we just fetched it (not present before) + + +class FetchError(RuntimeError): + """Raised when a dataset can't be made available locally.""" + + +class HashMismatchError(FetchError): + """Cached file is on disk but its hash doesn't match the registry.""" + + +class ManualFetchRequired(FetchError): + """Dataset has no `download_url` — user must download + place manually.""" + + def __init__(self, dataset: Dataset, expected_path: Path): + self.dataset = dataset + self.expected_path = expected_path + super().__init__( + f"{dataset.name}: no download_url (TOS-gated). Visit " + f"{dataset.landing_url}, accept the terms, and save the archive to " + f"{expected_path}. Then re-run `pm-bench fetch {dataset.name} --pin` " + f"to compute the sha256 and PR it into registry.yml." 
+ ) + + +def sha256_file(path: Path) -> str: + """Stream a file and return its hex sha256.""" + h = hashlib.sha256() + with open(path, "rb") as f: + while chunk := f.read(CHUNK_BYTES): + h.update(chunk) + return h.hexdigest() + + +def _download(url: str, dest: Path) -> None: + """Stream a URL into `dest` atomically (write to .part, rename).""" + tmp = dest.with_suffix(dest.suffix + ".part") + tmp.parent.mkdir(parents=True, exist_ok=True) + with urllib.request.urlopen(url) as resp, open(tmp, "wb") as out: # noqa: S310 + while chunk := resp.read(CHUNK_BYTES): + out.write(chunk) + tmp.replace(dest) + + +def ensure_cached(dataset: Dataset, override_root: str | None = None) -> FetchResult: + """Make `dataset` available on disk, verify hash, return its path. + + Synthetic datasets are rejected — they're generated on demand and + have no on-disk form. + """ + if dataset.format == "synthetic": + raise FetchError(f"{dataset.name} is generated on demand, not cached") + + path = cache_path(dataset, override_root=override_root) + downloaded = False + + if not path.exists(): + if not dataset.download_url: + raise ManualFetchRequired(dataset, path) + _download(dataset.download_url, path) + downloaded = True + + actual = sha256_file(path) + + if dataset.sha256 is None: + # First-time-on-disk; nothing to verify against. Caller decides + # whether to pin (--pin) or proceed unverified. + return FetchResult( + dataset=dataset.name, + path=path, + sha256=actual, + pinned=False, + downloaded=downloaded, + ) + + if actual != dataset.sha256: + raise HashMismatchError( + f"{dataset.name}: sha256 mismatch at {path}\n" + f" expected: {dataset.sha256}\n" + f" actual: {actual}\n" + f"Either the archive is corrupt or the pinned hash is wrong. " + f"Delete the cached file to re-fetch, or open a PR to update the pin." 
+ ) + + return FetchResult( + dataset=dataset.name, + path=path, + sha256=actual, + pinned=True, + downloaded=downloaded, + ) diff --git a/tests/test_cache.py b/tests/test_cache.py new file mode 100644 index 0000000..27a19d2 --- /dev/null +++ b/tests/test_cache.py @@ -0,0 +1,60 @@ + +import pytest + +from pm_bench.cache import cache_path, cache_root +from pm_bench.registry import Dataset + + +def _ds(name: str = "demo", fmt: str = "xes") -> Dataset: + return Dataset( + name=name, + title="demo", + cases=10, + events=50, + landing_url=None, + download_url=None, + sha256=None, + license="CC BY 4.0", + format=fmt, + bundled=False, + ) + + +def test_cache_root_respects_explicit_override(tmp_path) -> None: + root = cache_root(str(tmp_path / "explicit")) + assert root == tmp_path / "explicit" + assert root.is_dir() + + +def test_cache_root_respects_env_var(tmp_path, monkeypatch) -> None: + target = tmp_path / "env" + monkeypatch.setenv("PM_BENCH_CACHE", str(target)) + assert cache_root() == target + assert target.is_dir() + + +def test_cache_root_default_when_unset(tmp_path, monkeypatch) -> None: + monkeypatch.delenv("PM_BENCH_CACHE", raising=False) + monkeypatch.setenv("HOME", str(tmp_path)) + root = cache_root() + assert root == tmp_path / ".cache" / "pm-bench" + + +def test_cache_path_uses_xes_gz_for_xes(tmp_path) -> None: + p = cache_path(_ds(fmt="xes"), override_root=str(tmp_path)) + assert p.name == "demo.xes.gz" + + +def test_cache_path_uses_csv_for_csv(tmp_path) -> None: + p = cache_path(_ds(fmt="csv"), override_root=str(tmp_path)) + assert p.name == "demo.csv" + + +def test_cache_path_rejects_synthetic(tmp_path) -> None: + with pytest.raises(ValueError, match="generated on demand"): + cache_path(_ds(fmt="synthetic"), override_root=str(tmp_path)) + + +def test_cache_path_rejects_unknown_format(tmp_path) -> None: + with pytest.raises(ValueError, match="unknown dataset format"): + cache_path(_ds(fmt="parquet"), override_root=str(tmp_path)) diff --git 
a/tests/test_fetch.py b/tests/test_fetch.py new file mode 100644 index 0000000..51142f6 --- /dev/null +++ b/tests/test_fetch.py @@ -0,0 +1,92 @@ +import hashlib + +import pytest + +from pm_bench.cache import cache_path +from pm_bench.fetch import ( + HashMismatchError, + ManualFetchRequired, + ensure_cached, + sha256_file, +) +from pm_bench.registry import Dataset + + +def _ds(*, sha256: str | None = None, download_url: str | None = None) -> Dataset: + return Dataset( + name="demo", + title="demo", + cases=10, + events=50, + landing_url="https://example.invalid/landing", + download_url=download_url, + sha256=sha256, + license="CC BY 4.0", + format="xes", + bundled=False, + ) + + +def _seed_cache(root, dataset: Dataset, payload: bytes) -> str: + """Plant a fake archive in the cache; return its hex digest.""" + p = cache_path(dataset, override_root=str(root)) + p.write_bytes(payload) + return hashlib.sha256(payload).hexdigest() + + +def test_sha256_file_matches_hashlib(tmp_path) -> None: + f = tmp_path / "x" + f.write_bytes(b"hello world") + assert sha256_file(f) == hashlib.sha256(b"hello world").hexdigest() + + +def test_ensure_cached_no_url_no_file_raises_manual(tmp_path) -> None: + d = _ds() + with pytest.raises(ManualFetchRequired) as exc: + ensure_cached(d, override_root=str(tmp_path)) + assert exc.value.dataset is d + assert exc.value.expected_path.name == "demo.xes.gz" + + +def test_ensure_cached_unpinned_returns_actual_hash(tmp_path) -> None: + d = _ds(sha256=None) + digest = _seed_cache(tmp_path, d, b"fake archive contents") + r = ensure_cached(d, override_root=str(tmp_path)) + assert r.sha256 == digest + assert r.pinned is False + assert r.downloaded is False + + +def test_ensure_cached_pinned_match_returns_pinned(tmp_path) -> None: + payload = b"fake archive contents" + digest = hashlib.sha256(payload).hexdigest() + d = _ds(sha256=digest) + _seed_cache(tmp_path, d, payload) + r = ensure_cached(d, override_root=str(tmp_path)) + assert r.pinned is True + 
assert r.sha256 == digest + + +def test_ensure_cached_pinned_mismatch_raises(tmp_path) -> None: + d = _ds(sha256="0" * 64) + _seed_cache(tmp_path, d, b"different contents") + with pytest.raises(HashMismatchError) as exc: + ensure_cached(d, override_root=str(tmp_path)) + assert "expected: " + ("0" * 64) in str(exc.value) + + +def test_ensure_cached_synthetic_rejected(tmp_path) -> None: + d = Dataset( + name="syn", + title="synthetic", + cases=1, + events=1, + landing_url=None, + download_url=None, + sha256=None, + license="MIT", + format="synthetic", + bundled=True, + ) + with pytest.raises(Exception, match="generated on demand"): + ensure_cached(d, override_root=str(tmp_path))