Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions TODO.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# v0.1 TODO

## Real download URLs + fetch implementation

### Registry (`pm_bench/registry.yml`)
- [ ] bpi2012: resolve 4TU direct download URL and pin sha256
- [ ] bpi2017: resolve 4TU direct download URL and pin sha256
- [ ] bpi2018: resolve 4TU direct download URL and pin sha256
- [ ] bpi2019: resolve 4TU direct download URL and pin sha256
- [ ] bpi2020: decide which sub-files to include; resolve individual URLs and pin sha256 per file
- [ ] sepsis: resolve 4TU direct download URL and pin sha256
- [ ] helpdesk: resolve Mendeley direct CSV download URL and pin sha256

### Fetch implementation (`pm_bench/fetch.py`)
- [ ] Implement HTTP download with resume support (Range header)
- [ ] Implement sha256 verification after download
- [ ] Implement atomic move from `.tmp` to final path
- [ ] Wire `_cache.cache_dir()` as the default cache root
- [ ] Handle the bundled `synthetic-toy` case (return path without HTTP)

### Cache (`pm_bench/_cache.py`)
- [ ] Verify `cache_dir()` creates the directory if it doesn't exist
- [ ] Verify `PM_BENCH_CACHE` env var override works end-to-end

### Tests (`tests/test_fetch.py`)
- [ ] Fill in all TODO tests once `fetch_dataset` is implemented
29 changes: 29 additions & 0 deletions pm_bench/_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""Local cache directory management for pm-bench."""
from __future__ import annotations

import os
from pathlib import Path

_DEFAULT_CACHE = Path.home() / ".cache" / "pm-bench"


def cache_dir(override: Path | None = None) -> Path:
"""Return the cache root, creating it if it doesn't exist.

Priority: *override* > ``$PM_BENCH_CACHE`` env var > ``~/.cache/pm-bench``.
"""
if override is not None:
root = override
elif env := os.environ.get("PM_BENCH_CACHE"):
root = Path(env)
else:
root = _DEFAULT_CACHE
root.mkdir(parents=True, exist_ok=True)
return root


def dataset_path(name: str, filename: str, override: Path | None = None) -> Path:
    """Return (and create) the per-dataset cache subdirectory path.

    The returned path is ``<cache root>/<name>/<filename>``; the directory
    part is created eagerly so callers can write to the path directly.
    """
    target_dir = cache_dir(override) / name
    target_dir.mkdir(parents=True, exist_ok=True)
    return target_dir / filename
35 changes: 35 additions & 0 deletions pm_bench/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import json
import sys
from pathlib import Path

import click

Expand Down Expand Up @@ -83,5 +84,39 @@ def split(name: str, task: str) -> None:
)


@main.command(name="fetch")
@click.argument("name")
@click.option(
    "--cache-dir",
    "cache_dir_str",
    default=None,
    type=click.Path(),
    help="Override cache directory (default: ~/.cache/pm-bench).",
)
def cmd_fetch(name: str, cache_dir_str: str | None) -> None:
    """Download dataset NAME to the local cache.

    NOTE: not yet implemented — see TODO.md for the v0.1 plan.
    """
    # Imported lazily so `pm-bench --help` stays fast even if fetch grows deps.
    from pm_bench.fetch import fetch_dataset

    override = Path(cache_dir_str) if cache_dir_str else None
    try:
        result = fetch_dataset(name, override)
    except KeyError:
        # Name missing from the registry.
        click.echo(f"unknown dataset: {name}", err=True)
        sys.exit(1)
    except ValueError as exc:
        # Registered, but no download_url pinned yet.
        click.echo(str(exc), err=True)
        sys.exit(1)
    except NotImplementedError:
        click.echo(
            "fetch_dataset is not yet implemented — see TODO.md for v0.1 plan",
            err=True,
        )
        sys.exit(1)
    else:
        click.echo(str(result))


if __name__ == "__main__":
main()
41 changes: 41 additions & 0 deletions pm_bench/fetch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""Fetch a registered dataset to the local cache."""
from __future__ import annotations

from pathlib import Path

# TODO (v0.1): implement HTTP download using requests or urllib.request:
# - Resume support via Range header
# - sha256 verification after download
# - Atomic move from .tmp to final path


def fetch_dataset(name: str, cache_dir: Path | None = None) -> Path:
"""Download dataset *name* to *cache_dir* and return its local path.

Parameters
----------
name:
Registry key, e.g. ``"bpi2012"``.
cache_dir:
Override for the cache root. Defaults to ``~/.cache/pm-bench``.

Returns
-------
Path
Path to the downloaded file (XES or CSV).

Raises
------
KeyError
If *name* is not in the registry.
ValueError
If the dataset has no ``download_url`` yet.
NotImplementedError
Until this function is implemented.
"""
# TODO: call _cache.cache_dir() when cache_dir is None
# TODO: look up dataset via registry.get_dataset(name); raise KeyError if unknown
# TODO: raise ValueError if dataset.download_url is None
# TODO: download to cache_dir / name / filename
# TODO: verify sha256 if dataset.sha256 is set
raise NotImplementedError("fetch_dataset is not yet implemented — see TODO.md")
22 changes: 22 additions & 0 deletions pm_bench/registry.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ datasets:
cases: 13087
events: 262200
landing_url: "https://data.4tu.nl/articles/dataset/BPI_Challenge_2012/12689204"
# TODO (v0.1): resolve the direct .xes.gz download URL via the 4TU.ResearchData API.
# e.g. GET https://data.4tu.nl/v3/articles/12689204/files
# Pin sha256 after manual download + verification.
download_url: null
sha256: null
license: "CC BY 4.0"
Expand All @@ -20,6 +23,9 @@ datasets:
cases: 31509
events: 1202267
landing_url: "https://data.4tu.nl/articles/dataset/BPI_Challenge_2017/12696884"
# TODO (v0.1): resolve the direct .xes.gz download URL via the 4TU.ResearchData API.
# e.g. GET https://data.4tu.nl/v3/articles/12696884/files
# Pin sha256 after manual download + verification.
download_url: null
sha256: null
license: "CC BY 4.0"
Expand All @@ -30,6 +36,9 @@ datasets:
cases: 43809
events: 2514266
landing_url: "https://data.4tu.nl/articles/dataset/BPI_Challenge_2018/12688355"
# TODO (v0.1): resolve the direct .xes.gz download URL via the 4TU.ResearchData API.
# e.g. GET https://data.4tu.nl/v3/articles/12688355/files
# Pin sha256 after manual download + verification.
download_url: null
sha256: null
license: "CC BY 4.0"
Expand All @@ -40,6 +49,9 @@ datasets:
cases: 251734
events: 1595923
landing_url: "https://data.4tu.nl/articles/dataset/BPI_Challenge_2019/12715853"
# TODO (v0.1): resolve the direct .xes.gz download URL via the 4TU.ResearchData API.
# e.g. GET https://data.4tu.nl/v3/articles/12715853/files
# Pin sha256 after manual download + verification.
download_url: null
sha256: null
license: "CC BY 4.0"
Expand All @@ -50,6 +62,10 @@ datasets:
cases: 10500
events: 76000
landing_url: "https://data.4tu.nl/collections/_/5065541/1"
# TODO (v0.1): the 2020 collection contains multiple sub-datasets; decide
# which sub-files to include and resolve individual download URLs.
# Collection API: GET https://data.4tu.nl/v3/collections/5065541/articles
# Pin sha256 per file after manual download + verification.
download_url: null
sha256: null
license: "CC BY 4.0"
Expand All @@ -60,6 +76,9 @@ datasets:
cases: 1050
events: 15214
landing_url: "https://data.4tu.nl/articles/dataset/Sepsis_Cases_-_Event_Log/12707639"
# TODO (v0.1): resolve the direct .xes.gz download URL via the 4TU.ResearchData API.
# e.g. GET https://data.4tu.nl/v3/articles/12707639/files
# Pin sha256 after manual download + verification.
download_url: null
sha256: null
license: "CC BY 4.0"
Expand All @@ -70,6 +89,9 @@ datasets:
cases: 4580
events: 21348
landing_url: "https://data.mendeley.com/datasets/39bp3vv62t/1"
# TODO (v0.1): resolve the Mendeley direct CSV download URL.
# Mendeley Data API v1: GET https://data.mendeley.com/api/datasets/39bp3vv62t/versions/1
# Look for the file with mimetype text/csv, then pin sha256.
download_url: null
sha256: null
license: "CC BY 4.0"
Expand Down
36 changes: 36 additions & 0 deletions tests/test_fetch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""TODO tests for pm_bench.fetch and pm_bench._cache (v0.1 milestone)."""
from __future__ import annotations

import pytest


def test_fetch_unknown_dataset_raises_key_error() -> None:
    # Planned: once fetch_dataset exists, assert that
    # fetch_dataset("nonexistent_dataset") raises KeyError.
    pytest.skip("TODO (v0.1): implement fetch_dataset first")


def test_fetch_dataset_without_url_raises_value_error() -> None:
    # Planned: use any registry entry whose download_url is null
    # (e.g. "bpi2012") and assert fetch_dataset raises ValueError.
    pytest.skip("TODO (v0.1): implement fetch_dataset first")


def test_fetch_synthetic_toy_bundled() -> None:
    # Planned: synthetic-toy ships with the package, so
    # fetch_dataset("synthetic-toy") should return the bundled path
    # without any HTTP traffic — behaviour still to be decided.
    pytest.skip("TODO (v0.1): decide behaviour for bundled datasets")


def test_cache_dir_default_is_under_home() -> None:
    # Planned: with no override and no env var,
    # _cache.cache_dir() should end in a "pm-bench" directory.
    pytest.skip("TODO (v0.1): implement _cache.cache_dir first")


def test_cache_dir_env_override(tmp_path, monkeypatch) -> None:
    # Planned: set PM_BENCH_CACHE to tmp_path via monkeypatch and
    # assert _cache.cache_dir() resolves to exactly that directory.
    pytest.skip("TODO (v0.1): implement _cache.cache_dir first")
Loading