erphq · protosphinx · May 1, 2026
diff --git a/GOALS.md b/GOALS.md
@@ -9,7 +9,7 @@ Be the default benchmark for new process-mining methods. Within 18 months,
   (`pm-bench fetch <name> [--pin]`); per-dataset hash pins pending
   the one-time TOS-gated downloads
 - 5 tasks with fixed scoring scripts (next-event ✅, remaining-time ✅,
-  outcome ✅; conformance, bottleneck pending)
+  outcome ✅, bottleneck ✅; conformance pending)
 - `gnn` runs end-to-end as the reference baseline (Markov reference ✅;
   `gnn` integration pending the first pinned dataset)
 - End-to-end loop runs on `synthetic-toy` ✅ — split → prefixes →

diff --git a/README.md b/README.md
@@ -224,7 +224,8 @@ honesty. The point of the benchmark is to make the comparison real.
 - [🟡] v0.3 — scoring scripts for all 5 tasks. next-event ✅,
       remaining-time ✅, outcome ✅ (AUC scoring + prior baseline +
       pipeline; leaderboard entry waits on a dataset whose test split
-      has both classes); conformance / bottleneck remain.
+      has both classes), bottleneck ✅ (NDCG@10 + mean-wait baseline +
+      leaderboard entry); conformance remains.
 - [🟡] v0.4 — leaderboard CI + landing page. Standings format,
       reference Markov entry, `pm-bench leaderboard [--all] --verify`,
       and the dedicated `leaderboard.yml` GitHub workflow shipped;

diff --git a/STATUS.md b/STATUS.md
@@ -60,6 +60,20 @@ pm-bench fetch bpi2020 --pin
 
 ## Recently shipped
 
+- **Bottleneck task (NDCG@10 over transitions)** (`bottleneck-task` branch).
+  - `score_bottleneck` — pure-CPython NDCG@k with average DCG/IDCG
+    discounting. Missing predictions sink to the bottom of the
+    ranking (a model that refuses to predict can't claim credit).
+  - `pm_bench/bottleneck.py` — per-transition mean-wait targets.
+    Truth shape is `(activity_a, activity_b, mean_wait_seconds,
+    n_observations)` — different from the per-prefix tasks.
+  - `pm_bench/baselines/mean_wait.py` — train-mean-per-transition
+    with global-mean fallback. On synthetic-toy: NDCG@10 0.9786 over
+    6 transitions. Strong floor for any temporal model.
+  - CLI: `--task bottleneck`, `--baseline mean-wait`, end-to-end.
+  - `leaderboard/bottleneck/synthetic-toy.json` with the mean-wait-ref
+    entry; `pm-bench leaderboard --all --verify` now walks 3 boards.
+  - 7 new tests (`test_bottleneck.py`); 86 total, ruff clean.
 - **Outcome task (binary AUC)** (`outcome-task` branch).
   - `score_outcome` — pure-CPython rank-sum AUC, with average-rank
     tie-breaking; degenerate single-class case returns 0.5 by

diff --git a/leaderboard/bottleneck/synthetic-toy.json b/leaderboard/bottleneck/synthetic-toy.json
@@ -0,0 +1,27 @@
+{
+  "task": "bottleneck",
+  "dataset": "synthetic-toy",
+  "metric": "NDCG@10 over per-transition wait times (higher is better)",
+  "scored_with": "pm_bench.score.score_bottleneck",
+  "split": {
+    "kind": "case-chrono",
+    "train_frac": 0.7,
+    "val_frac": 0.1
+  },
+  "entries": [
+    {
+      "model": "mean-wait-ref",
+      "version": "0.1.0",
+      "predictions_path": "leaderboard/predictions/bottleneck/synthetic-toy/mean-wait-ref.csv.gz",
+      "code": "https://github.com/erphq/pm-bench/blob/main/pm_bench/baselines/mean_wait.py",
+      "paper": null,
+      "score": {
+        "ndcg_at_k": 0.9786469611053435,
+        "k": 10,
+        "n_transitions": 6
+      },
+      "scored_at": "2026-04-30T00:00:00Z",
+      "notes": "Per-transition mean wait time fitted on training cases, with global-mean fallback for unseen transitions. The dumbest model that uses any time information; a real model has to do strictly better."
+    }
+  ]
+}
diff --git a/leaderboard/predictions/bottleneck/synthetic-toy/mean-wait-ref.csv.gz b/leaderboard/predictions/bottleneck/synthetic-toy/mean-wait-ref.csv.gz
diff --git a/pm_bench/__init__.py b/pm_bench/__init__.py
@@ -20,16 +20,19 @@
 )
 from pm_bench.registry import Dataset, get_dataset, load_registry
 from pm_bench.score import (
+    BottleneckScore,
     NextEventScore,
     OutcomeScore,
     RemainingTimeScore,
+    score_bottleneck,
     score_next_event,
     score_outcome,
     score_remaining_time,
 )
 from pm_bench.split import Event, Split, case_chrono_split
 
 __all__ = [
+    "BottleneckScore",
     "Dataset",
     "Event",
     "NextEventScore",
@@ -50,6 +53,7 @@
     "read_predictions_csv",
     "read_prefixes_csv",
     "read_time_targets_csv",
+    "score_bottleneck",
     "score_next_event",
     "score_outcome",
     "score_remaining_time",

diff --git a/pm_bench/baselines/__init__.py b/pm_bench/baselines/__init__.py
@@ -16,6 +16,7 @@
     read_time_predictions_csv,
     write_time_predictions_csv,
 )
+from pm_bench.baselines.mean_wait import MeanWaitBaseline, fit_mean_wait, predict_mean_wait
 from pm_bench.baselines.prior_outcome import (
     OutcomePrediction,
     PriorOutcomeBaseline,
@@ -28,13 +29,16 @@
 __all__ = [
     "MarkovBaseline",
     "MeanTimeBaseline",
+    "MeanWaitBaseline",
     "OutcomePrediction",
     "PriorOutcomeBaseline",
     "TimePrediction",
     "fit_mean_time",
+    "fit_mean_wait",
     "fit_prior_outcome",
     "predict_markov",
     "predict_mean_time",
+    "predict_mean_wait",
     "predict_prior_outcome",
     "read_outcome_predictions_csv",
     "read_time_predictions_csv",

diff --git a/pm_bench/baselines/mean_wait.py b/pm_bench/baselines/mean_wait.py
@@ -0,0 +1,62 @@
+"""Train-mean-wait reference baseline for bottleneck detection.
+
+For every (activity_a, activity_b) transition observed in the training
+cases, store the mean wait time. At test time, predict that mean. Falls
+back to the global training mean for transitions never seen during
+training.
+
+Identifies the "obvious" bottlenecks — transitions that were already
+slow in training. A model that ties this isn't using any new
+information from the test set.
+"""
+from __future__ import annotations
+
+from collections.abc import Iterable
+from dataclasses import dataclass
+
+from pm_bench.bottleneck import BottleneckPrediction, BottleneckTarget, extract_bottleneck_targets
+from pm_bench.split import Activity, CaseId, Event
+
+
+@dataclass(frozen=True)
+class MeanWaitBaseline:  # fitted state of the baseline; built by fit_mean_wait
+    by_transition: dict[tuple[Activity, Activity], float]  # (a, b) -> mean wait in seconds
+    global_mean_seconds: float  # observation-weighted mean wait; fallback for unseen (a, b)
+
+
+def fit_mean_wait(
+    events: Iterable[Event],
+    train_case_ids: Iterable[CaseId],
+) -> MeanWaitBaseline:
+    """Fit the mean-wait baseline on the training cases.
+
+    Keeps one mean wait per observed transition plus a global fallback.
+    """
+    fitted = tuple(extract_bottleneck_targets(events, train_case_ids))
+    if not fitted:
+        return MeanWaitBaseline(by_transition={}, global_mean_seconds=0.0)
+    # The fallback is observation-weighted, so frequent transitions dominate it.
+    n_total = sum(row.n_observations for row in fitted)
+    wait_total = sum(row.mean_wait_seconds * row.n_observations for row in fitted)
+    fallback = (wait_total / n_total) if n_total else 0.0
+    lookup = {(row.activity_a, row.activity_b): row.mean_wait_seconds for row in fitted}
+    return MeanWaitBaseline(by_transition=lookup, global_mean_seconds=fallback)
+
+
+def predict_mean_wait(
+    model: MeanWaitBaseline,
+    targets: Iterable[BottleneckTarget],
+) -> list[BottleneckPrediction]:
+    """Predict a wait for every target transition, in input order.
+
+    Known transitions get their trained mean; others get the global mean.
+    """
+    known, fallback = model.by_transition, model.global_mean_seconds
+    return [
+        BottleneckPrediction(
+            activity_a=tgt.activity_a,
+            activity_b=tgt.activity_b,
+            predicted_wait_seconds=known.get((tgt.activity_a, tgt.activity_b), fallback),
+        )
+        for tgt in targets
+    ]
diff --git a/pm_bench/bottleneck.py b/pm_bench/bottleneck.py
@@ -0,0 +1,142 @@
+"""Bottleneck-detection targets — per-transition mean wait time.
+
+Bottleneck is the only v0 task that's *per-transition* rather than
+per-prefix: one truth row per ordered (activity_a, activity_b) pair of
+directly consecutive activities observed in the partition, with the
+mean wait time (seconds) between them across all cases. Models predict
+a value per transition; NDCG@10 over the ranking is the score.
+
+Truth file columns:
+
+    activity_a,activity_b,mean_wait_seconds,n_observations
+
+Predictions file columns:
+
+    activity_a,activity_b,predicted_wait_seconds
+"""
+from __future__ import annotations
+
+from collections.abc import Iterable, Iterator
+from dataclasses import dataclass
+
+from pm_bench.split import Activity, CaseId, Event
+
+
+@dataclass(frozen=True)
+class BottleneckTarget:  # one truth row: an (a, b) transition and its aggregated wait
+    activity_a: Activity  # earlier activity of the consecutive pair
+    activity_b: Activity  # activity that directly follows activity_a
+    mean_wait_seconds: float  # summed wait / n_observations, in seconds
+    n_observations: int  # number of (a, b) occurrences aggregated
+
+
+@dataclass(frozen=True)
+class BottleneckPrediction:  # one predictions-file row for an (a, b) transition
+    activity_a: Activity  # first activity of the transition
+    activity_b: Activity  # second activity of the transition
+    predicted_wait_seconds: float  # model's predicted wait for the transition, in seconds
+
+
+def extract_bottleneck_targets(
+    events: Iterable[Event],
+    case_ids: Iterable[CaseId],
+) -> Iterator[BottleneckTarget]:
+    """Yield one aggregated target per distinct (a, b) transition.
+
+    Events of the selected cases are ordered by timestamp within each
+    case; every adjacent pair contributes its wait in seconds. Targets
+    are yielded in sorted (a, b) order with the mean wait and the count.
+    """
+    wanted = set(case_ids)
+    traces: dict[CaseId, list[tuple[Activity, object]]] = {}
+    for case_id, activity, ts in events:
+        if case_id in wanted:
+            traces.setdefault(case_id, []).append((activity, ts))
+
+    # (a, b) -> [total wait in seconds, number of observed waits]
+    stats: dict[tuple[Activity, Activity], list] = {}
+    for trace in traces.values():
+        trace.sort(key=lambda step: step[1])
+        for (src, t_src), (dst, t_dst) in zip(trace, trace[1:], strict=False):
+            wait = (t_dst - t_src).total_seconds()  # type: ignore[operator]
+            acc = stats.setdefault((src, dst), [0.0, 0])
+            acc[0] += wait
+            acc[1] += 1
+
+    for src, dst in sorted(stats):
+        total, count = stats[(src, dst)]
+        yield BottleneckTarget(
+            activity_a=src,
+            activity_b=dst,
+            mean_wait_seconds=total / count,
+            n_observations=count,
+        )
+
+
+def write_bottleneck_targets_csv(
+    targets: Iterable[BottleneckTarget], path: str
+) -> int:
+    """Write a header plus one CSV row per target to `path`; return the row count."""
+    import csv
+
+    written = 0
+    with open(path, "w", newline="") as handle:
+        rows = csv.writer(handle)
+        rows.writerow(["activity_a", "activity_b", "mean_wait_seconds", "n_observations"])
+        for written, tgt in enumerate(targets, start=1):
+            wait = repr(tgt.mean_wait_seconds)  # repr round-trips the float
+            rows.writerow((tgt.activity_a, tgt.activity_b, wait, tgt.n_observations))
+    return written
+
+
+def read_bottleneck_targets_csv(path: str) -> list[BottleneckTarget]:
+    """Load bottleneck targets from a CSV file that has a header row.
+
+    The wait column is parsed with `float` and the count with `int`.
+    """
+    import csv
+
+    with open(path, newline="") as handle:
+        return [
+            BottleneckTarget(
+                activity_a=record["activity_a"],
+                activity_b=record["activity_b"],
+                mean_wait_seconds=float(record["mean_wait_seconds"]),
+                n_observations=int(record["n_observations"]),
+            )
+            for record in csv.DictReader(handle)
+        ]
+
+
+def write_bottleneck_predictions_csv(
+    predictions: Iterable[BottleneckPrediction], path: str
+) -> int:
+    """Write a header plus one CSV row per prediction to `path`; return the row count."""
+    import csv
+
+    written = 0
+    with open(path, "w", newline="") as handle:
+        rows = csv.writer(handle)
+        rows.writerow(["activity_a", "activity_b", "predicted_wait_seconds"])
+        for written, pred in enumerate(predictions, start=1):
+            wait = repr(pred.predicted_wait_seconds)  # repr round-trips the float
+            rows.writerow((pred.activity_a, pred.activity_b, wait))
+    return written
+
+
+def read_bottleneck_predictions_csv(path: str) -> list[BottleneckPrediction]:
+    """Load bottleneck predictions from a CSV file that has a header row.
+
+    The predicted wait column is parsed with `float`.
+    """
+    import csv
+
+    with open(path, newline="") as handle:
+        return [
+            BottleneckPrediction(
+                activity_a=record["activity_a"],
+                activity_b=record["activity_b"],
+                predicted_wait_seconds=float(record["predicted_wait_seconds"]),
+            )
+            for record in csv.DictReader(handle)
+        ]