Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion GOALS.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Be the default benchmark for new process-mining methods. Within 18 months,
(`pm-bench fetch <name> [--pin]`); per-dataset hash pins pending
the one-time TOS-gated downloads
- 5 tasks with fixed scoring scripts (next-event ✅, remaining-time ✅,
outcome ✅; conformance, bottleneck pending)
outcome ✅, bottleneck ✅; conformance pending)
- `gnn` runs end-to-end as the reference baseline (Markov reference ✅;
`gnn` integration pending the first pinned dataset)
- End-to-end loop runs on `synthetic-toy` ✅ — split → prefixes →
Expand Down
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,8 @@ honesty. The point of the benchmark is to make the comparison real.
- [🟡] v0.3 — scoring scripts for all 5 tasks. next-event ✅,
remaining-time ✅, outcome ✅ (AUC scoring + prior baseline +
pipeline; leaderboard entry waits on a dataset whose test split
has both classes); conformance / bottleneck remain.
has both classes), bottleneck ✅ (NDCG@10 + mean-wait baseline +
leaderboard entry); conformance remains.
- [🟡] v0.4 — leaderboard CI + landing page. Standings format,
reference Markov entry, `pm-bench leaderboard [--all] --verify`,
and the dedicated `leaderboard.yml` GitHub workflow shipped;
Expand Down
14 changes: 14 additions & 0 deletions STATUS.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,20 @@ pm-bench fetch bpi2020 --pin

## Recently shipped

- **Bottleneck task (NDCG@10 over transitions)** (`bottleneck-task` branch).
- `score_bottleneck` — pure-CPython NDCG@k with average DCG/IDCG
discounting. Missing predictions sink to the bottom of the
ranking (a model that refuses to predict can't claim credit).
- `pm_bench/bottleneck.py` — per-transition mean-wait targets.
Truth shape is `(activity_a, activity_b, mean_wait_seconds,
n_observations)` — different from the per-prefix tasks.
- `pm_bench/baselines/mean_wait.py` — train-mean-per-transition
with global-mean fallback. On synthetic-toy: NDCG@10 0.9786 over
6 transitions. Strong floor for any temporal model.
- CLI: `--task bottleneck`, `--baseline mean-wait`, end-to-end.
- `leaderboard/bottleneck/synthetic-toy.json` with the mean-wait-ref
entry; `pm-bench leaderboard --all --verify` now walks 3 boards.
- 7 new tests (`test_bottleneck.py`); 86 total, ruff clean.
- **Outcome task (binary AUC)** (`outcome-task` branch).
- `score_outcome` — pure-CPython rank-sum AUC, with average-rank
tie-breaking; degenerate single-class case returns 0.5 by
Expand Down
27 changes: 27 additions & 0 deletions leaderboard/bottleneck/synthetic-toy.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{
"task": "bottleneck",
"dataset": "synthetic-toy",
"metric": "NDCG@10 over per-transition wait times (higher is better)",
"scored_with": "pm_bench.score.score_bottleneck",
"split": {
"kind": "case-chrono",
"train_frac": 0.7,
"val_frac": 0.1
},
"entries": [
{
"model": "mean-wait-ref",
"version": "0.1.0",
"predictions_path": "leaderboard/predictions/bottleneck/synthetic-toy/mean-wait-ref.csv.gz",
"code": "https://github.com/erphq/pm-bench/blob/main/pm_bench/baselines/mean_wait.py",
"paper": null,
"score": {
"ndcg_at_k": 0.9786469611053435,
"k": 10,
"n_transitions": 6
},
"scored_at": "2026-04-30T00:00:00Z",
"notes": "Per-transition mean wait time fitted on training cases, with global-mean fallback for unseen transitions. The dumbest model that uses any time information; a real model has to do strictly better."
}
]
}
Binary file not shown.
4 changes: 4 additions & 0 deletions pm_bench/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,19 @@
)
from pm_bench.registry import Dataset, get_dataset, load_registry
from pm_bench.score import (
BottleneckScore,
NextEventScore,
OutcomeScore,
RemainingTimeScore,
score_bottleneck,
score_next_event,
score_outcome,
score_remaining_time,
)
from pm_bench.split import Event, Split, case_chrono_split

__all__ = [
"BottleneckScore",
"Dataset",
"Event",
"NextEventScore",
Expand All @@ -50,6 +53,7 @@
"read_predictions_csv",
"read_prefixes_csv",
"read_time_targets_csv",
"score_bottleneck",
"score_next_event",
"score_outcome",
"score_remaining_time",
Expand Down
4 changes: 4 additions & 0 deletions pm_bench/baselines/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
read_time_predictions_csv,
write_time_predictions_csv,
)
from pm_bench.baselines.mean_wait import MeanWaitBaseline, fit_mean_wait, predict_mean_wait
from pm_bench.baselines.prior_outcome import (
OutcomePrediction,
PriorOutcomeBaseline,
Expand All @@ -28,13 +29,16 @@
__all__ = [
"MarkovBaseline",
"MeanTimeBaseline",
"MeanWaitBaseline",
"OutcomePrediction",
"PriorOutcomeBaseline",
"TimePrediction",
"fit_mean_time",
"fit_mean_wait",
"fit_prior_outcome",
"predict_markov",
"predict_mean_time",
"predict_mean_wait",
"predict_prior_outcome",
"read_outcome_predictions_csv",
"read_time_predictions_csv",
Expand Down
62 changes: 62 additions & 0 deletions pm_bench/baselines/mean_wait.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
"""Train-mean-wait reference baseline for bottleneck detection.

For every (activity_a, activity_b) transition observed in the training
cases, store the mean wait time. At test time, predict that mean. Falls
back to the global training mean for transitions never seen during
training.

Identifies the "obvious" bottlenecks — transitions that were already
slow in training. A model that ties this isn't using any new
information from the test set.
"""
from __future__ import annotations

from collections.abc import Iterable
from dataclasses import dataclass

from pm_bench.bottleneck import BottleneckPrediction, BottleneckTarget, extract_bottleneck_targets
from pm_bench.split import Activity, CaseId, Event


@dataclass(frozen=True)
class MeanWaitBaseline:
    """Fitted state of the train-mean-wait reference baseline.

    Immutable (frozen dataclass). ``fit_mean_wait`` builds instances and
    ``predict_mean_wait`` reads them.
    """

    # Mean wait in seconds, keyed by the ordered (activity_a, activity_b) pair.
    by_transition: dict[tuple[Activity, Activity], float]
    # Value predicted for any transition that is absent from ``by_transition``.
    global_mean_seconds: float


def fit_mean_wait(
    events: Iterable[Event],
    train_case_ids: Iterable[CaseId],
) -> MeanWaitBaseline:
    """Fit the mean-wait baseline on the training cases.

    Args:
        events: Event log, forwarded unchanged to
            ``extract_bottleneck_targets``.
        train_case_ids: Ids of the cases to fit on.

    Returns:
        A ``MeanWaitBaseline`` holding the mean wait of every transition
        found in the training cases plus an observation-weighted global
        mean. When no transition is found the mapping is empty and the
        global mean is 0.0.
    """
    fitted = list(extract_bottleneck_targets(events, train_case_ids))
    if len(fitted) == 0:
        return MeanWaitBaseline(by_transition={}, global_mean_seconds=0.0)

    per_pair: dict[tuple[Activity, Activity], float] = {}
    for target in fitted:
        per_pair[(target.activity_a, target.activity_b)] = target.mean_wait_seconds

    # Each per-transition mean is weighted by its observation count, so the
    # fallback reflects the average over observations rather than over pairs.
    n_obs = sum(target.n_observations for target in fitted)
    weighted_wait = sum(
        target.mean_wait_seconds * target.n_observations for target in fitted
    )
    if n_obs:
        overall = weighted_wait / n_obs
    else:
        overall = 0.0
    return MeanWaitBaseline(by_transition=per_pair, global_mean_seconds=overall)


def predict_mean_wait(
    model: MeanWaitBaseline,
    targets: Iterable[BottleneckTarget],
) -> list[BottleneckPrediction]:
    """Predict a wait time for every transition listed in ``targets``.

    Args:
        model: Fitted baseline supplying the per-transition means and the
            global fallback.
        targets: Transitions to predict for; only ``activity_a`` and
            ``activity_b`` are read from each item.

    Returns:
        One ``BottleneckPrediction`` per target, in input order. The value
        is the stored mean for that transition, or the model's global mean
        when the transition is not in ``model.by_transition``.
    """
    return [
        BottleneckPrediction(
            activity_a=target.activity_a,
            activity_b=target.activity_b,
            predicted_wait_seconds=model.by_transition.get(
                (target.activity_a, target.activity_b),
                model.global_mean_seconds,
            ),
        )
        for target in targets
    ]
142 changes: 142 additions & 0 deletions pm_bench/bottleneck.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
"""Bottleneck-detection targets — per-transition mean wait time.

Bottleneck is the only v0 task that's *per-transition* rather than
per-prefix: there's one truth row per ordered (activity_a, activity_b)
pair observed in the partition, with the mean wait time (seconds)
between them across all cases. Models predict a value per transition;
NDCG@10 over the ranking is the score.

Truth file columns:

activity_a,activity_b,mean_wait_seconds,n_observations

Predictions file columns:

activity_a,activity_b,predicted_wait_seconds
"""
from __future__ import annotations

from collections.abc import Iterable, Iterator
from dataclasses import dataclass

from pm_bench.split import Activity, CaseId, Event


@dataclass(frozen=True)
class BottleneckTarget:
    """One truth row: the observed mean wait for a single transition.

    Immutable (frozen dataclass). The fields mirror the truth-file columns
    ``activity_a,activity_b,mean_wait_seconds,n_observations``.
    """

    # Source activity of the ordered (activity_a, activity_b) transition.
    activity_a: Activity
    # Destination activity of the transition.
    activity_b: Activity
    # Mean wait between the two activities, in seconds.
    mean_wait_seconds: float
    # Number of wait observations that went into the mean.
    n_observations: int


@dataclass(frozen=True)
class BottleneckPrediction:
    """One prediction row: a model's wait estimate for a single transition.

    Immutable (frozen dataclass). The fields mirror the predictions-file
    columns ``activity_a,activity_b,predicted_wait_seconds``.
    """

    # Source activity of the ordered (activity_a, activity_b) transition.
    activity_a: Activity
    # Destination activity of the transition.
    activity_b: Activity
    # Predicted wait between the two activities, in seconds.
    predicted_wait_seconds: float


def extract_bottleneck_targets(
events: Iterable[Event],
case_ids: Iterable[CaseId],
) -> Iterator[BottleneckTarget]:
"""Yield per-transition mean wait time for the given case ids.

For each pair of chronologically-consecutive activities within a
case, we record the wait time. The yielded targets aggregate
across all cases in `case_ids` — one row per distinct (a, b) pair.
"""
keep = set(case_ids)
by_case: dict[CaseId, list[tuple[Activity, object]]] = {}
for case_id, activity, ts in events:
if case_id not in keep:
continue
by_case.setdefault(case_id, []).append((activity, ts))

sums: dict[tuple[Activity, Activity], float] = {}
counts: dict[tuple[Activity, Activity], int] = {}
for rows in by_case.values():
rows.sort(key=lambda r: r[1])
for (a, ta), (b, tb) in zip(rows, rows[1:], strict=False):
key = (a, b)
wait = (tb - ta).total_seconds() # type: ignore[operator]
sums[key] = sums.get(key, 0.0) + wait
counts[key] = counts.get(key, 0) + 1

for key in sorted(sums.keys()):
yield BottleneckTarget(
activity_a=key[0],
activity_b=key[1],
mean_wait_seconds=sums[key] / counts[key],
n_observations=counts[key],
)


def write_bottleneck_targets_csv(
    targets: Iterable[BottleneckTarget], path: str
) -> int:
    """Write bottleneck targets to a UTF-8 encoded CSV file.

    The file starts with the header
    ``activity_a,activity_b,mean_wait_seconds,n_observations`` followed by
    one row per target. Wait times are written with ``repr`` so the float
    round-trips exactly through ``float()``.

    Args:
        targets: Targets to serialise, written in iteration order.
        path: Destination file; an existing file is overwritten.

    Returns:
        Number of data rows written (the header is not counted).
    """
    import csv

    n = 0
    # Pin the encoding: without it open() uses the host locale, so activity
    # names outside ASCII would produce different bytes on different machines.
    with open(path, "w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(["activity_a", "activity_b", "mean_wait_seconds", "n_observations"])
        for t in targets:
            w.writerow([t.activity_a, t.activity_b, repr(t.mean_wait_seconds), t.n_observations])
            n += 1
    return n


def read_bottleneck_targets_csv(path: str) -> list[BottleneckTarget]:
    """Read a UTF-8 encoded bottleneck-targets CSV.

    The header must provide the columns ``activity_a``, ``activity_b``,
    ``mean_wait_seconds`` and ``n_observations``.

    Args:
        path: File to read.

    Returns:
        One ``BottleneckTarget`` per data row, in file order.

    Raises:
        KeyError: A required column is missing from the header.
        ValueError: A numeric cell cannot be parsed.
    """
    import csv

    # Pin the encoding: without it open() decodes with the host locale, so
    # activity names outside ASCII would be read differently per machine.
    with open(path, newline="", encoding="utf-8") as f:
        return [
            BottleneckTarget(
                activity_a=row["activity_a"],
                activity_b=row["activity_b"],
                mean_wait_seconds=float(row["mean_wait_seconds"]),
                n_observations=int(row["n_observations"]),
            )
            for row in csv.DictReader(f)
        ]


def write_bottleneck_predictions_csv(
    predictions: Iterable[BottleneckPrediction], path: str
) -> int:
    """Write bottleneck predictions to a UTF-8 encoded CSV file.

    The file starts with the header
    ``activity_a,activity_b,predicted_wait_seconds`` followed by one row per
    prediction. Wait times are written with ``repr`` so the float
    round-trips exactly through ``float()``.

    Args:
        predictions: Predictions to serialise, written in iteration order.
        path: Destination file; an existing file is overwritten.

    Returns:
        Number of data rows written (the header is not counted).
    """
    import csv

    n = 0
    # Pin the encoding: without it open() uses the host locale, so activity
    # names outside ASCII would produce different bytes on different machines.
    with open(path, "w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(["activity_a", "activity_b", "predicted_wait_seconds"])
        for p in predictions:
            w.writerow([p.activity_a, p.activity_b, repr(p.predicted_wait_seconds)])
            n += 1
    return n


def read_bottleneck_predictions_csv(path: str) -> list[BottleneckPrediction]:
    """Read a UTF-8 encoded bottleneck-predictions CSV.

    The header must provide the columns ``activity_a``, ``activity_b`` and
    ``predicted_wait_seconds``.

    Args:
        path: File to read.

    Returns:
        One ``BottleneckPrediction`` per data row, in file order.

    Raises:
        KeyError: A required column is missing from the header.
        ValueError: A numeric cell cannot be parsed.
    """
    import csv

    # Pin the encoding: without it open() decodes with the host locale, so
    # activity names outside ASCII would be read differently per machine.
    with open(path, newline="", encoding="utf-8") as f:
        return [
            BottleneckPrediction(
                activity_a=row["activity_a"],
                activity_b=row["activity_b"],
                predicted_wait_seconds=float(row["predicted_wait_seconds"]),
            )
            for row in csv.DictReader(f)
        ]
Loading