Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions TODO.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# v0.1 TODO

## Real download URLs + fetch implementation

### Registry (`pm_bench/registry.yml`)
- [ ] bpi2012: resolve 4TU direct download URL and pin sha256
- [ ] bpi2017: resolve 4TU direct download URL and pin sha256
- [ ] bpi2018: resolve 4TU direct download URL and pin sha256
- [ ] bpi2019: resolve 4TU direct download URL and pin sha256
- [ ] bpi2020: decide which sub-files to include; resolve individual URLs and pin sha256 per file
- [ ] sepsis: resolve 4TU direct download URL and pin sha256
- [ ] helpdesk: resolve Mendeley direct CSV download URL and pin sha256

### Fetch implementation (`pm_bench/fetch.py`)
- [ ] Implement HTTP download with resume support (Range header)
- [ ] Implement sha256 verification after download
- [ ] Implement atomic move from `.tmp` to final path
- [ ] Wire `_cache.cache_dir()` as the default cache root
- [ ] Handle the bundled `synthetic-toy` case (return path without HTTP)

### Cache (`pm_bench/_cache.py`)
- [ ] Verify `cache_dir()` creates the directory if it doesn't exist
- [ ] Verify `PM_BENCH_CACHE` env var override works end-to-end

### Tests (`tests/test_fetch.py`)
- [ ] Fill in all TODO tests once `fetch_dataset` is implemented
29 changes: 29 additions & 0 deletions pm_bench/_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""Local cache directory management for pm-bench."""
from __future__ import annotations

import os
from pathlib import Path

_DEFAULT_CACHE = Path.home() / ".cache" / "pm-bench"


def cache_dir(override: Path | None = None) -> Path:
"""Return the cache root, creating it if it doesn't exist.

Priority: *override* > ``$PM_BENCH_CACHE`` env var > ``~/.cache/pm-bench``.
"""
if override is not None:
root = override
elif env := os.environ.get("PM_BENCH_CACHE"):
root = Path(env)
else:
root = _DEFAULT_CACHE
root.mkdir(parents=True, exist_ok=True)
return root


def dataset_path(name: str, filename: str, override: Path | None = None) -> Path:
    """Return (and create) the per-dataset cache subdirectory path.

    The returned path is ``<cache root>/<name>/<filename>``; the directory
    part is created eagerly so callers can write to the path directly.
    """
    target_dir = cache_dir(override) / name
    target_dir.mkdir(parents=True, exist_ok=True)
    return target_dir / filename
35 changes: 35 additions & 0 deletions pm_bench/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import json
import sys
from pathlib import Path

import click

Expand Down Expand Up @@ -83,5 +84,39 @@ def split(name: str, task: str) -> None:
)


@main.command(name="fetch")
@click.argument("name")
@click.option(
    "--cache-dir",
    "cache_dir_str",
    default=None,
    type=click.Path(),
    help="Override cache directory (default: ~/.cache/pm-bench).",
)
def cmd_fetch(name: str, cache_dir_str: str | None) -> None:
    """Download dataset NAME to the local cache.

    NOTE: not yet implemented — see TODO.md for the v0.1 plan.
    """
    # Imported lazily so `pm-bench --help` stays fast even if fetch grows deps.
    from pm_bench.fetch import fetch_dataset

    override = Path(cache_dir_str) if cache_dir_str else None
    try:
        result = fetch_dataset(name, override)
    except KeyError:
        # Name missing from the registry.
        click.echo(f"unknown dataset: {name}", err=True)
        sys.exit(1)
    except ValueError as exc:
        # Registered, but no download_url pinned yet.
        click.echo(str(exc), err=True)
        sys.exit(1)
    except NotImplementedError:
        click.echo(
            "fetch_dataset is not yet implemented — see TODO.md for v0.1 plan",
            err=True,
        )
        sys.exit(1)
    else:
        click.echo(str(result))


if __name__ == "__main__":
main()
41 changes: 41 additions & 0 deletions pm_bench/fetch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""Fetch a registered dataset to the local cache."""
from __future__ import annotations

from pathlib import Path

# TODO (v0.1): implement HTTP download using requests or urllib.request:
# - Resume support via Range header
# - sha256 verification after download
# - Atomic move from .tmp to final path


def fetch_dataset(name: str, cache_dir: Path | None = None) -> Path:
"""Download dataset *name* to *cache_dir* and return its local path.

Parameters
----------
name:
Registry key, e.g. ``"bpi2012"``.
cache_dir:
Override for the cache root. Defaults to ``~/.cache/pm-bench``.

Returns
-------
Path
Path to the downloaded file (XES or CSV).

Raises
------
KeyError
If *name* is not in the registry.
ValueError
If the dataset has no ``download_url`` yet.
NotImplementedError
Until this function is implemented.
"""
# TODO: call _cache.cache_dir() when cache_dir is None
# TODO: look up dataset via registry.get_dataset(name); raise KeyError if unknown
# TODO: raise ValueError if dataset.download_url is None
# TODO: download to cache_dir / name / filename
# TODO: verify sha256 if dataset.sha256 is set
raise NotImplementedError("fetch_dataset is not yet implemented — see TODO.md")
22 changes: 22 additions & 0 deletions pm_bench/registry.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ datasets:
cases: 13087
events: 262200
landing_url: "https://data.4tu.nl/articles/dataset/BPI_Challenge_2012/12689204"
# TODO (v0.1): resolve the direct .xes.gz download URL via the 4TU.ResearchData API.
# e.g. GET https://data.4tu.nl/v3/articles/12689204/files
# Pin sha256 after manual download + verification.
download_url: null
sha256: null
license: "CC BY 4.0"
Expand All @@ -20,6 +23,9 @@ datasets:
cases: 31509
events: 1202267
landing_url: "https://data.4tu.nl/articles/dataset/BPI_Challenge_2017/12696884"
# TODO (v0.1): resolve the direct .xes.gz download URL via the 4TU.ResearchData API.
# e.g. GET https://data.4tu.nl/v3/articles/12696884/files
# Pin sha256 after manual download + verification.
download_url: null
sha256: null
license: "CC BY 4.0"
Expand All @@ -30,6 +36,9 @@ datasets:
cases: 43809
events: 2514266
landing_url: "https://data.4tu.nl/articles/dataset/BPI_Challenge_2018/12688355"
# TODO (v0.1): resolve the direct .xes.gz download URL via the 4TU.ResearchData API.
# e.g. GET https://data.4tu.nl/v3/articles/12688355/files
# Pin sha256 after manual download + verification.
download_url: null
sha256: null
license: "CC BY 4.0"
Expand All @@ -40,6 +49,9 @@ datasets:
cases: 251734
events: 1595923
landing_url: "https://data.4tu.nl/articles/dataset/BPI_Challenge_2019/12715853"
# TODO (v0.1): resolve the direct .xes.gz download URL via the 4TU.ResearchData API.
# e.g. GET https://data.4tu.nl/v3/articles/12715853/files
# Pin sha256 after manual download + verification.
download_url: null
sha256: null
license: "CC BY 4.0"
Expand All @@ -50,6 +62,10 @@ datasets:
cases: 10500
events: 76000
landing_url: "https://data.4tu.nl/collections/_/5065541/1"
# TODO (v0.1): the 2020 collection contains multiple sub-datasets; decide
# which sub-files to include and resolve individual download URLs.
# Collection API: GET https://data.4tu.nl/v3/collections/5065541/articles
# Pin sha256 per file after manual download + verification.
download_url: null
sha256: null
license: "CC BY 4.0"
Expand All @@ -60,6 +76,9 @@ datasets:
cases: 1050
events: 15214
landing_url: "https://data.4tu.nl/articles/dataset/Sepsis_Cases_-_Event_Log/12707639"
# TODO (v0.1): resolve the direct .xes.gz download URL via the 4TU.ResearchData API.
# e.g. GET https://data.4tu.nl/v3/articles/12707639/files
# Pin sha256 after manual download + verification.
download_url: null
sha256: null
license: "CC BY 4.0"
Expand All @@ -70,6 +89,9 @@ datasets:
cases: 4580
events: 21348
landing_url: "https://data.mendeley.com/datasets/39bp3vv62t/1"
# TODO (v0.1): resolve the Mendeley direct CSV download URL.
# Mendeley Data API v1: GET https://data.mendeley.com/api/datasets/39bp3vv62t/versions/1
# Look for the file with mimetype text/csv, then pin sha256.
download_url: null
sha256: null
license: "CC BY 4.0"
Expand Down
36 changes: 36 additions & 0 deletions tests/test_fetch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""TODO tests for pm_bench.fetch and pm_bench._cache (v0.1 milestone)."""
from __future__ import annotations

import pytest


def test_fetch_unknown_dataset_raises_key_error() -> None:
    # Planned: once fetch_dataset exists, assert that
    # fetch_dataset("nonexistent_dataset") raises KeyError.
    pytest.skip("TODO (v0.1): implement fetch_dataset first")


def test_fetch_dataset_without_url_raises_value_error() -> None:
    # Planned: use any registry entry whose download_url is null
    # (e.g. "bpi2012") and assert fetch_dataset raises ValueError.
    pytest.skip("TODO (v0.1): implement fetch_dataset first")


def test_fetch_synthetic_toy_bundled() -> None:
    # Planned: synthetic-toy ships with the package, so
    # fetch_dataset("synthetic-toy") should return the bundled path
    # without any HTTP traffic — behaviour still to be decided.
    pytest.skip("TODO (v0.1): decide behaviour for bundled datasets")


def test_cache_dir_default_is_under_home() -> None:
    # Planned: with no override and no env var,
    # _cache.cache_dir() should end in a "pm-bench" directory.
    pytest.skip("TODO (v0.1): implement _cache.cache_dir first")


def test_cache_dir_env_override(tmp_path, monkeypatch) -> None:
    # Planned: set PM_BENCH_CACHE to tmp_path via monkeypatch and
    # assert _cache.cache_dir() resolves to exactly that directory.
    pytest.skip("TODO (v0.1): implement _cache.cache_dir first")
Loading