diff --git a/README.md b/README.md
index f2f4afc..d236335 100644
--- a/README.md
+++ b/README.md
@@ -243,9 +243,9 @@ honesty. The point of the benchmark is to make the comparison real.
 - [x] v0.2 - splits + targets for next-event ✅ and remaining-time ✅
 - [x] v0.3 - scoring scripts for all 5 tasks ✅. next-event,
   remaining-time, outcome, bottleneck, conformance - every task
-  ships with a CPython baseline and a leaderboard entry on
-  synthetic-toy (the outcome row waits on a dataset whose test
-  split has both classes).
+  ships with a CPython baseline and a real leaderboard entry on
+  synthetic-toy. All five entries are verified by
+  `pm-bench leaderboard --all --verify` in CI.
 - [x] v0.4 - leaderboard CI + landing page. Standings JSON,
   reference entries, `pm-bench leaderboard [--all] [--verify] [--markdown]`,
   the dedicated `leaderboard.yml` GitHub workflow,
diff --git a/STANDINGS.md b/STANDINGS.md
index ac62caf..85fe295 100644
--- a/STANDINGS.md
+++ b/STANDINGS.md
@@ -7,25 +7,32 @@ _NDCG@10 over per-transition wait times (higher is better)_

 | Model | NDCG@k | k | n_transitions |
 |---|---:|---:|---:|
-| `mean-wait-ref` | 0.9786 | 10 | 6 |
+| `mean-wait-ref` | 0.9911 | 10 | 9 |

 ### conformance · synthetic-toy
 _DFG fitness × precision → F-score (higher is better)_

 | Model | F | Fitness | Precision | n_test | n_model |
 |---|---:|---:|---:|---:|---:|
-| `dfg-ref` | 0.8571 | 1.0000 | 0.7500 | 6 | 8 |
+| `dfg-ref` | 1.0000 | 1.0000 | 1.0000 | 9 | 9 |

 ### next-event · synthetic-toy
 _top1 / top3 accuracy_

 | Model | top1 | top3 | n |
 |---|---:|---:|---:|
-| `markov-ref` | 0.9756 | 1.0000 | 41 |
+| `markov-ref` | 0.9304 | 1.0000 | 158 |
+
+### outcome · synthetic-toy
+_ROC AUC (higher is better)_
+
+| Model | AUC | n | n_pos |
+|---|---:|---:|---:|
+| `prior-ref` | 0.6319 | 158 | 45 |

 ### remaining-time · synthetic-toy
 _MAE in days (lower is better)_

 | Model | mae_days | n |
 |---|---:|---:|
-| `mean-ref` | 1.2546 | 41 |
+| `mean-ref` | 1.3481 | 158 |
diff --git a/STATUS.md b/STATUS.md
index b4a8a0d..1e37334 100644
--- a/STATUS.md
+++ b/STATUS.md
@@ -60,6 +60,22 @@ pm-bench fetch bpi2020 --pin

 ## Recently shipped

+- **Synthetic-toy bumped to 200 cases — outcome row finally lands**
+  (`synthetic-200` branch).
+  - `synthetic_log()` default `n_cases` = 200 (was 50). Test partition
+    now has 45 positive cases (`delivery_confirmed`) so AUC is
+    meaningful instead of degenerating to 0.5.
+  - All 4 existing reference predictions regenerated and re-scored.
+    New numbers: markov-ref top-1 0.9304 (was 0.9756 on 50 cases),
+    mean-ref MAE 1.3481, mean-wait-ref NDCG@10 0.9911,
+    dfg-ref F=1.0 (both partitions now cover the full path graph).
+  - **5th board added**: `outcome/synthetic-toy.json` with a
+    `prior-ref` entry — AUC 0.6319, n_pos 45 / 158. The floor any
+    real model on the outcome task has to clear.
+  - `_rescore_outcome` + `_outcome_truth_for_dataset` added to
+    `leaderboard.py`. `pm-bench leaderboard --all --verify` now
+    walks all 5 boards.
+  - STANDINGS.md regenerated. 109 tests, ruff clean.
 - **Conformance task - v0.3 closed** (`conformance-task` branch).
   - `score_conformance` - DFG fitness × precision → F-score. Pure
     CPython; no pm4py dep.
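For reference, the conformance numbers in this change fall out of plain set arithmetic over directly-follows edges, with the F-score being the harmonic mean of fitness and precision. A minimal sketch of that arithmetic (illustrative only; `dfg_fscore` is a hypothetical name, the shipped scorer is `score_conformance`):

```python
# Hypothetical sketch of the DFG scoring arithmetic described above,
# not the pm-bench implementation. Fitness: share of test edges the
# model covers. Precision: share of model edges the test log uses.
def dfg_fscore(
    model_edges: set[tuple[str, str]],
    test_edges: set[tuple[str, str]],
) -> tuple[float, float, float]:
    covered = test_edges & model_edges
    fitness = len(covered) / len(test_edges)
    precision = len(covered) / len(model_edges)
    total = fitness + precision
    fscore = 2 * fitness * precision / total if total else 0.0
    return fitness, precision, fscore

# Old board: all 6 test edges covered, 6 of 8 model edges used
# -> fitness 1.0, precision 0.75, F = 2*0.75/1.75 ≈ 0.8571.
# New board: 9 edges on both sides -> fitness = precision = F = 1.0.
```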
diff --git a/leaderboard/bottleneck/synthetic-toy.json b/leaderboard/bottleneck/synthetic-toy.json
index a5711e5..e9e16c8 100644
--- a/leaderboard/bottleneck/synthetic-toy.json
+++ b/leaderboard/bottleneck/synthetic-toy.json
@@ -16,9 +16,9 @@
       "code": "https://github.com/erphq/pm-bench/blob/main/pm_bench/baselines/mean_wait.py",
       "paper": null,
       "score": {
-        "ndcg_at_k": 0.9786469611053435,
+        "ndcg_at_k": 0.9911470371722683,
         "k": 10,
-        "n_transitions": 6
+        "n_transitions": 9
       },
       "scored_at": "2026-04-30T00:00:00Z",
       "notes": "Per-transition mean wait time fitted on training cases, with global-mean fallback for unseen transitions. The dumbest model that uses any time information; a real model has to do strictly better."
diff --git a/leaderboard/conformance/synthetic-toy.json b/leaderboard/conformance/synthetic-toy.json
index 14f9df2..6d55b97 100644
--- a/leaderboard/conformance/synthetic-toy.json
+++ b/leaderboard/conformance/synthetic-toy.json
@@ -17,13 +17,13 @@
       "paper": null,
       "score": {
         "fitness": 1.0,
-        "precision": 0.75,
-        "fscore": 0.8571428571428571,
-        "n_test_transitions": 6,
-        "n_model_transitions": 8
+        "precision": 1.0,
+        "fscore": 1.0,
+        "n_test_transitions": 9,
+        "n_model_transitions": 9
       },
       "scored_at": "2026-04-30T00:00:00Z",
-      "notes": "Directly-follows graph extracted from training cases. Perfect fitness on synthetic-toy (every test transition was seen at training time); precision drops because the model carries two transitions the test never uses."
+      "notes": "Directly-follows graph extracted from training cases. With 200 cases the train and test partitions both observe every path-graph edge, so fitness = precision = 1.0. Any future submission has to keep this floor while generalizing to a different test split."
     }
   ]
 }
diff --git a/leaderboard/next-event/synthetic-toy.json b/leaderboard/next-event/synthetic-toy.json
index 110c50a..c9c3bb4 100644
--- a/leaderboard/next-event/synthetic-toy.json
+++ b/leaderboard/next-event/synthetic-toy.json
@@ -16,9 +16,9 @@
       "code": "https://github.com/erphq/pm-bench/blob/main/pm_bench/baselines/markov.py",
       "paper": null,
       "score": {
-        "top1": 0.975609756097561,
+        "top1": 0.930379746835443,
         "top3": 1.0,
-        "n": 41
+        "n": 158
       },
       "scored_at": "2026-04-30T00:00:00Z",
       "notes": "First-order Markov reference baseline shipped with pm-bench. Trained on the train partition only; falls back to unigram for unseen last-activities. The floor any 'real' sequence model has to clear."
diff --git a/leaderboard/outcome/synthetic-toy.json b/leaderboard/outcome/synthetic-toy.json
new file mode 100644
index 0000000..0c0188f
--- /dev/null
+++ b/leaderboard/outcome/synthetic-toy.json
@@ -0,0 +1,28 @@
+{
+  "task": "outcome",
+  "dataset": "synthetic-toy",
+  "metric": "ROC AUC (higher is better)",
+  "scored_with": "pm_bench.score.score_outcome",
+  "split": {
+    "kind": "case-chrono",
+    "train_frac": 0.7,
+    "val_frac": 0.1
+  },
+  "outcome_rule": "case ends with `delivery_confirmed` (synthetic-toy happy path)",
+  "entries": [
+    {
+      "model": "prior-ref",
+      "version": "0.1.0",
+      "predictions_path": "leaderboard/predictions/outcome/synthetic-toy/prior-ref.csv.gz",
+      "code": "https://github.com/erphq/pm-bench/blob/main/pm_bench/baselines/prior_outcome.py",
+      "paper": null,
+      "score": {
+        "auc": 0.631858407079646,
+        "n": 158,
+        "n_pos": 45
+      },
+      "scored_at": "2026-04-30T00:00:00Z",
+      "notes": "Last-activity-conditioned positive rate (with global-rate fallback for unseen activities). The dumbest baseline that uses any prefix signal; a real model has to clear AUC 0.63 to claim it's reading the trace."
+    }
+  ]
+}
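The `prior-ref` notes above describe the entire model. A minimal sketch of the idea, assuming prefixes arrive as activity lists with 0/1 labels (illustrative; the shipped baseline lives in `pm_bench/baselines/prior_outcome.py`, and `fit_prior` is a hypothetical name):

```python
# Hypothetical sketch of a last-activity-conditioned prior, as the
# notes describe: score a prefix with the training positive rate of
# its last activity, with a global-rate fallback for unseen activities.
from collections import Counter


def fit_prior(prefixes: list[list[str]], labels: list[int]):
    pos: Counter[str] = Counter()
    tot: Counter[str] = Counter()
    for prefix, y in zip(prefixes, labels):
        tot[prefix[-1]] += 1
        pos[prefix[-1]] += y
    global_rate = sum(labels) / len(labels)

    def score(prefix: list[str]) -> float:
        last = prefix[-1]
        return pos[last] / tot[last] if tot[last] else global_rate

    return score
```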
diff --git a/leaderboard/predictions/bottleneck/synthetic-toy/mean-wait-ref.csv.gz b/leaderboard/predictions/bottleneck/synthetic-toy/mean-wait-ref.csv.gz
index 1de80bd..68b7809 100644
Binary files a/leaderboard/predictions/bottleneck/synthetic-toy/mean-wait-ref.csv.gz and b/leaderboard/predictions/bottleneck/synthetic-toy/mean-wait-ref.csv.gz differ
diff --git a/leaderboard/predictions/conformance/synthetic-toy/dfg-ref.json b/leaderboard/predictions/conformance/synthetic-toy/dfg-ref.json
index 9717447..86e432f 100644
--- a/leaderboard/predictions/conformance/synthetic-toy/dfg-ref.json
+++ b/leaderboard/predictions/conformance/synthetic-toy/dfg-ref.json
@@ -31,6 +31,10 @@
     [
       "received",
       "payment_pending"
+    ],
+    [
+      "ship_order",
+      "delivery_confirmed"
     ]
   ]
 }
\ No newline at end of file
diff --git a/leaderboard/predictions/next-event/synthetic-toy/markov-ref.csv.gz b/leaderboard/predictions/next-event/synthetic-toy/markov-ref.csv.gz
index 45d77cf..7d8d07b 100644
Binary files a/leaderboard/predictions/next-event/synthetic-toy/markov-ref.csv.gz and b/leaderboard/predictions/next-event/synthetic-toy/markov-ref.csv.gz differ
diff --git a/leaderboard/predictions/outcome/synthetic-toy/prior-ref.csv.gz b/leaderboard/predictions/outcome/synthetic-toy/prior-ref.csv.gz
new file mode 100644
index 0000000..32aa1d5
Binary files /dev/null and b/leaderboard/predictions/outcome/synthetic-toy/prior-ref.csv.gz differ
diff --git a/leaderboard/predictions/remaining-time/synthetic-toy/mean-ref.csv.gz b/leaderboard/predictions/remaining-time/synthetic-toy/mean-ref.csv.gz
index a4d2673..a6d7cf7 100644
Binary files a/leaderboard/predictions/remaining-time/synthetic-toy/mean-ref.csv.gz and b/leaderboard/predictions/remaining-time/synthetic-toy/mean-ref.csv.gz differ
diff --git a/leaderboard/remaining-time/synthetic-toy.json b/leaderboard/remaining-time/synthetic-toy.json
index 5bce918..48247a6 100644
--- a/leaderboard/remaining-time/synthetic-toy.json
+++ b/leaderboard/remaining-time/synthetic-toy.json
@@ -16,11 +16,11 @@
       "code": "https://github.com/erphq/pm-bench/blob/main/pm_bench/baselines/mean_time.py",
       "paper": null,
       "score": {
-        "mae_days": 1.2546469315499607,
-        "n": 41
+        "mae_days": 1.348129855278205,
+        "n": 158
       },
       "scored_at": "2026-04-30T00:00:00Z",
-      "notes": "Constant prediction = mean remaining-time observed on training prefixes. The dumbest model that still respects the train/test split - the floor any temporal model must clear."
+      "notes": "Constant prediction = mean remaining-time observed on training prefixes. The dumbest model that still respects the train/test split — the floor any temporal model must clear."
     }
   ]
 }
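The `mean-ref` floor these numbers come from is just as small. A sketch of the constant baseline and the MAE it is scored with (hypothetical helper names, not the shipped `mean_time.py`):

```python
# Illustrative only: a single constant prediction equal to the mean
# remaining time observed on training prefixes, scored by MAE in days.
def fit_mean_ref(train_remaining_days: list[float]) -> float:
    return sum(train_remaining_days) / len(train_remaining_days)


def mae_days(preds: list[float], truth: list[float]) -> float:
    return sum(abs(p - t) for p, t in zip(preds, truth)) / len(truth)
```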
diff --git a/pm_bench/_synth.py b/pm_bench/_synth.py
index 9827e66..63bc5b0 100644
--- a/pm_bench/_synth.py
+++ b/pm_bench/_synth.py
@@ -36,7 +36,7 @@ WEIGHTS: list[float] = [0.50, 0.15, 0.15, 0.10, 0.10]


-def synthetic_log(n_cases: int = 50, seed: int = 42) -> Iterator[Event]:
+def synthetic_log(n_cases: int = 200, seed: int = 42) -> Iterator[Event]:
     """Yield `(case_id, activity, timestamp)` tuples deterministically."""
     rng = random.Random(seed)
     start = dt.datetime(2024, 1, 1)
diff --git a/pm_bench/leaderboard.py b/pm_bench/leaderboard.py
index ba2a998..bce6913 100644
--- a/pm_bench/leaderboard.py
+++ b/pm_bench/leaderboard.py
@@ -26,8 +26,10 @@ from pm_bench.predictions import Prediction
 from pm_bench.prefixes import (
     PREFIX_SEP,
+    OutcomeTarget,
     Prefix,
     TimeTarget,
+    extract_outcome_targets,
     extract_prefixes,
     extract_remaining_time_targets,
 )
@@ -35,6 +37,7 @@ from pm_bench.score import (
     score_bottleneck,
     score_conformance,
     score_next_event,
+    score_outcome,
     score_remaining_time,
 )
@@ -139,6 +142,23 @@ def _bottleneck_truth_for_dataset(name: str) -> list[BottleneckTarget]:
     return list(extract_bottleneck_targets(events, test_cases))

+
+def _outcome_truth_for_dataset(name: str) -> list[OutcomeTarget]:
+    """Canonical outcome truth set for a known dataset.
+
+    The per-dataset positive-outcome rule lives in `pm_bench._synth` for
+    `synthetic-toy`; other datasets register their own rule when pinned.
+    """
+    if name == "synthetic-toy":
+        from pm_bench._synth import is_positive_outcome
+
+        events, test_cases = _events_and_test_cases(name)
+        return list(extract_outcome_targets(events, test_cases, is_positive_outcome))
+    raise ValueError(
+        f"outcome truth for dataset {name!r} not yet wired; register an outcome "
+        "rule alongside the dataset"
+    )
+

 def _rescore_next_event(board: Board, repo_root: Path) -> list[tuple[Entry, dict]]:
     truth = _truth_for_dataset(board.dataset)
     truth_keys = [(t.case_id, t.prefix_idx) for t in truth]
@@ -244,6 +264,35 @@ def _rescore_conformance(board: Board, repo_root: Path) -> list[tuple[Entry, dic
     return out

+
+def _rescore_outcome(board: Board, repo_root: Path) -> list[tuple[Entry, dict]]:
+    import csv
+    import gzip
+
+    truth = _outcome_truth_for_dataset(board.dataset)
+    truth_keys = [(t.case_id, t.prefix_idx) for t in truth]
+    truth_int = [t.outcome for t in truth]
+
+    out: list[tuple[Entry, dict]] = []
+    for entry in board.entries:
+        pred_path = repo_root / entry.predictions_path
+        opener = gzip.open if str(pred_path).endswith(".gz") else open
+        pred_lookup: dict[tuple[str, int], float] = {}
+        with opener(pred_path, "rt", newline="") as f:
+            reader = csv.DictReader(f)
+            for row in reader:
+                pred_lookup[(row["case_id"], int(row["prefix_idx"]))] = float(row["score"])
+        missing = [k for k in truth_keys if k not in pred_lookup]
+        if missing:
+            raise ValueError(
+                f"{entry.model}: predictions missing {len(missing)} target(s); "
+                f"first missing {missing[0]}"
+            )
+        preds = [pred_lookup[k] for k in truth_keys]
+        s = score_outcome(preds, truth_int)
+        out.append((entry, {"auc": s.auc, "n": s.n, "n_pos": s.n_pos}))
+    return out
+

 def rescore(board: Board, repo_root: str | Path = ".") -> list[tuple[Entry, dict]]:
     """Re-run scoring for every entry; return (entry, fresh_score) pairs."""
     root = Path(repo_root)
@@ -255,6 +304,8 @@
         return _rescore_bottleneck(board, root)
     if board.task == "conformance":
         return _rescore_conformance(board, root)
+    if board.task == "outcome":
+        return _rescore_outcome(board, root)
     raise ValueError(f"unknown task: {board.task}")
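`_rescore_outcome` above pins down the on-disk contract for outcome submissions: a CSV (optionally gzipped) with `case_id`, `prefix_idx`, and `score` columns covering every truth target. A hypothetical submission file could be produced like this (the path and values are made up):

```python
# Writes a prediction file in the shape _rescore_outcome reads back:
# one row per (case_id, prefix_idx) target with a real-valued score.
import csv
import gzip

rows = [
    {"case_id": "case-0042", "prefix_idx": 3, "score": 0.81},
    {"case_id": "case-0042", "prefix_idx": 4, "score": 0.93},
]
with gzip.open("my-model.csv.gz", "wt", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["case_id", "prefix_idx", "score"])
    writer.writeheader()
    writer.writerows(rows)
```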
ValueError(f"unknown task: {board.task}") diff --git a/pm_bench/registry.yml b/pm_bench/registry.yml index 77dff27..5c71fb1 100644 --- a/pm_bench/registry.yml +++ b/pm_bench/registry.yml @@ -77,8 +77,8 @@ datasets: - name: synthetic-toy title: "Synthetic toy event log (deterministic, generated)" - cases: 50 - events: 250 + cases: 200 + events: 965 landing_url: null download_url: null sha256: null diff --git a/tests/test_leaderboard.py b/tests/test_leaderboard.py index 998e18c..49e71ad 100644 --- a/tests/test_leaderboard.py +++ b/tests/test_leaderboard.py @@ -81,6 +81,14 @@ def test_cli_leaderboard_verify_passes() -> None: assert "no drift" in r.output +def test_outcome_board_loads_and_verifies() -> None: + p = REPO_ROOT / "leaderboard" / "outcome" / "synthetic-toy.json" + board = load_board(p) + assert board.task == "outcome" + drifts = verify(board, repo_root=REPO_ROOT) + assert drifts == [], drifts + + def test_remaining_time_board_loads_and_verifies() -> None: board = load_board(TIME_BOARD_PATH) assert board.task == "remaining-time" @@ -141,7 +149,7 @@ def test_board_to_markdown_includes_model_and_score() -> None: board = load_board(BOARD_PATH) md = board_to_markdown(board) assert "markov-ref" in md - assert "0.9756" in md + assert "0.9304" in md assert "next-event" in md @@ -150,6 +158,8 @@ def test_all_standings_markdown_lists_every_board() -> None: assert "next-event · synthetic-toy" in md assert "remaining-time · synthetic-toy" in md assert "bottleneck · synthetic-toy" in md + assert "outcome · synthetic-toy" in md + assert "conformance · synthetic-toy" in md def test_checked_in_standings_md_is_up_to_date() -> None: diff --git a/tests/test_outcome.py b/tests/test_outcome.py index 0e1ed03..990e3c6 100644 --- a/tests/test_outcome.py +++ b/tests/test_outcome.py @@ -1,11 +1,9 @@ """End-to-end + targeted tests for the outcome task. -Synthetic-toy with seed=42 doesn't put any `delivery_confirmed` cases -in the test split (path-4 is 10% of cases and the chronological tail -happens to have none), so we test the outcome machinery on a hand-built -event set with controlled class balance instead. The CLI smoke test -runs against synthetic-toy and asserts the pipeline executes cleanly, -even though the AUC degenerates to 0.5 (n_pos=0 in test). +Synthetic-toy at n_cases=200 places ~45 positives in the test partition, +so the outcome AUC is meaningful (prior-ref ≈ 0.63). The targeted unit +tests still use a hand-built event set with controlled class balance so +the AUC math is checkable by inspection. """ from __future__ import annotations @@ -120,8 +118,7 @@ def test_score_outcome_round_trip_via_writer(tmp_path) -> None: def test_full_outcome_pipeline_on_synthetic_toy(tmp_path) -> None: - """Pipeline runs cleanly end-to-end. AUC degenerates because seed=42's - test partition has no positives, but the contract still holds.""" + """Full pipeline runs and AUC is meaningful (n_cases=200 default).""" runner = CliRunner() split_path = tmp_path / "split.json" prefixes_path = tmp_path / "prefixes.csv" @@ -155,6 +152,6 @@ def test_full_outcome_pipeline_on_synthetic_toy(tmp_path) -> None: result = json.loads(r.output) assert result["task"] == "outcome" assert result["n"] > 0 - # synthetic-toy with seed=42 happens to have n_pos=0 in test - # → degenerate AUC = 0.5 by convention. The pipeline still runs. - assert 0.0 <= result["auc"] <= 1.0 + assert result["n_pos"] > 0 # both classes present in test partition + # The prior baseline should beat the trivial 0.5 floor with both classes. 
+    assert result["auc"] > 0.5
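The final `auc > 0.5` assertion leans on the rank definition of ROC AUC. A self-contained sketch consistent with the conventions above (illustrative, not `pm_bench.score.score_outcome`):

```python
# Probability that a random positive outscores a random negative;
# ties get half credit; 0.5 by convention when a class is missing
# (the degenerate pre-200-case situation the old docstring described).
def roc_auc(scores: list[float], labels: list[int]) -> float:
    pos = [s for s, y in zip(scores, labels) if y == 1]
    neg = [s for s, y in zip(scores, labels) if y == 0]
    if not pos or not neg:
        return 0.5
    wins = sum((p > n) + 0.5 * (p == n) for p in pos for n in neg)
    return wins / (len(pos) * len(neg))
```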