6 changes: 3 additions & 3 deletions README.md
@@ -243,9 +243,9 @@ honesty. The point of the benchmark is to make the comparison real.
- [x] v0.2 - splits + targets for next-event ✅ and remaining-time ✅
- [x] v0.3 - scoring scripts for all 5 tasks ✅. next-event,
remaining-time, outcome, bottleneck, conformance - every task
ships with a CPython baseline and a leaderboard entry on
synthetic-toy (the outcome row waits on a dataset whose test
split has both classes).
ships with a CPython baseline and a real leaderboard entry on
synthetic-toy. All five entries are verified by
`pm-bench leaderboard --all --verify` in CI.
- [x] v0.4 - leaderboard CI + landing page. Standings JSON,
reference entries, `pm-bench leaderboard [--all] [--verify]
[--markdown]`, the dedicated `leaderboard.yml` GitHub workflow,
15 changes: 11 additions & 4 deletions STANDINGS.md
@@ -7,25 +7,32 @@ _NDCG@10 over per-transition wait times (higher is better)_

| Model | NDCG@k | k | n_transitions |
|---|---:|---:|---:|
| `mean-wait-ref` | 0.9786 | 10 | 6 |
| `mean-wait-ref` | 0.9911 | 10 | 9 |
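
A quick sketch of how NDCG@k applies here, assuming linear gain (gain = true mean wait, ranking by predicted wait); `pm_bench.score.score_bottleneck` is the authoritative implementation and may differ in detail:

```python
# Assumed linear-gain NDCG@k over transitions; treat as an illustration.
import math

def ndcg_at_k(
    pred_wait: dict[tuple[str, str], float],
    true_wait: dict[tuple[str, str], float],
    k: int = 10,
) -> float:
    # Rank transitions by predicted wait; gain is the true wait, log2 discount.
    by_pred = sorted(true_wait, key=lambda t: pred_wait.get(t, 0.0), reverse=True)[:k]
    dcg = sum(true_wait[t] / math.log2(i + 2) for i, t in enumerate(by_pred))
    ideal = sorted(true_wait.values(), reverse=True)[:k]
    idcg = sum(g / math.log2(i + 2) for i, g in enumerate(ideal))
    return dcg / idcg if idcg else 0.0
```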

### conformance · synthetic-toy
_DFG fitness × precision → F-score (higher is better)_

| Model | F | Fitness | Precision | n_test | n_model |
|---|---:|---:|---:|---:|---:|
| `dfg-ref` | 0.8571 | 1.0000 | 0.7500 | 6 | 8 |
| `dfg-ref` | 1.0000 | 1.0000 | 1.0000 | 9 | 9 |

### next-event · synthetic-toy
_top1 / top3 accuracy_

| Model | top1 | top3 | n |
|---|---:|---:|---:|
| `markov-ref` | 0.9756 | 1.0000 | 41 |
| `markov-ref` | 0.9304 | 1.0000 | 158 |

### outcome · synthetic-toy
_ROC AUC (higher is better)_

| Model | AUC | n | n_pos |
|---|---:|---:|---:|
| `prior-ref` | 0.6319 | 158 | 45 |
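
For reference, ROC AUC in its rank-based (Mann-Whitney) form, which also shows why a one-class test split degenerates to 0.5 by convention; this is the standard math, not a claim about `pm_bench.score.score_outcome` internals:

```python
def roc_auc(scores: list[float], labels: list[int]) -> float:
    order = sorted(range(len(scores)), key=lambda i: scores[i])
    ranks = [0.0] * len(scores)
    i = 0
    while i < len(order):
        j = i
        while j + 1 < len(order) and scores[order[j + 1]] == scores[order[i]]:
            j += 1
        avg_rank = (i + j) / 2 + 1  # average 1-based rank over the tie group
        for t in range(i, j + 1):
            ranks[order[t]] = avg_rank
        i = j + 1
    n_pos = sum(labels)
    n_neg = len(labels) - n_pos
    if n_pos == 0 or n_neg == 0:
        return 0.5  # one-class convention (the old seed=42 degenerate case)
    pos_rank_sum = sum(r for r, y in zip(ranks, labels) if y == 1)
    return (pos_rank_sum - n_pos * (n_pos + 1) / 2) / (n_pos * n_neg)
```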

### remaining-time · synthetic-toy
_MAE in days (lower is better)_

| Model | mae_days | n |
|---|---:|---:|
| `mean-ref` | 1.2546 | 41 |
| `mean-ref` | 1.3481 | 158 |
16 changes: 16 additions & 0 deletions STATUS.md
@@ -60,6 +60,22 @@ pm-bench fetch bpi2020 --pin

## Recently shipped

- **Synthetic-toy bumped to 200 cases — outcome row finally lands**
(`synthetic-200` branch).
- `synthetic_log()` default `n_cases` = 200 (was 50). Test partition
now has ~45 positive cases (`delivery_confirmed`) so AUC is
meaningful instead of degenerating to 0.5.
- All 4 existing reference predictions regenerated and re-scored.
New numbers: markov-ref top-1 0.9304 (was 0.9756 on 50 cases),
mean-ref MAE 1.3481, mean-wait-ref NDCG@10 0.9911,
dfg-ref F=1.0 (both partitions now cover the full path graph).
- **5th board added to the leaderboard**: `outcome/synthetic-toy.json`
with `prior-ref` entry — AUC 0.6319, n_pos 45 / 158. Real floor
for any temporal model on the outcome task.
- `_rescore_outcome` + `_outcome_truth_for_dataset` added to
`leaderboard.py`. `pm-bench leaderboard --all --verify` now
walks all 5 boards.
- STANDINGS.md regenerated. 109 tests, ruff clean.
- **Conformance task - v0.3 closed** (`conformance-task` branch).
- `score_conformance` - DFG fitness × precision → F-score. Pure
CPython; no pm4py dep.
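
The `pm-bench leaderboard --all --verify` walk mentioned in the bullets above boils down to re-scoring every board and diffing against the checked-in JSON. A minimal sketch, assuming the `load_board`/`verify` helpers exercised in `tests/test_leaderboard.py` further down this diff (the import path and glob pattern are illustrative):

```python
# Hedged sketch of the --all --verify loop; load_board and verify are the
# helpers the tests use, but the import path and glob here are assumptions.
from pathlib import Path

from pm_bench.leaderboard import load_board, verify

repo_root = Path(".")
for board_path in sorted(repo_root.glob("leaderboard/*/*.json")):
    board = load_board(board_path)               # parsed task/dataset/entries
    drifts = verify(board, repo_root=repo_root)  # re-score, diff vs checked-in JSON
    if drifts:
        raise SystemExit(f"{board.task}: score drift detected: {drifts}")
```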
4 changes: 2 additions & 2 deletions leaderboard/bottleneck/synthetic-toy.json
@@ -16,9 +16,9 @@
"code": "https://github.com/erphq/pm-bench/blob/main/pm_bench/baselines/mean_wait.py",
"paper": null,
"score": {
"ndcg_at_k": 0.9786469611053435,
"ndcg_at_k": 0.9911470371722683,
"k": 10,
"n_transitions": 6
"n_transitions": 9
},
"scored_at": "2026-04-30T00:00:00Z",
"notes": "Per-transition mean wait time fitted on training cases, with global-mean fallback for unseen transitions. The dumbest model that uses any time information; a real model has to do strictly better."
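
The notes above describe the baseline precisely enough to sketch it; the shipped version is `pm_bench/baselines/mean_wait.py`, and the names here are illustrative:

```python
# Per-transition mean wait fitted on training cases, global-mean fallback.
from collections import defaultdict

def fit_mean_waits(train_waits: list[tuple[tuple[str, str], float]]):
    """train_waits: ((src_activity, dst_activity), wait_seconds) pairs."""
    sums: dict[tuple[str, str], float] = defaultdict(float)
    counts: dict[tuple[str, str], int] = defaultdict(int)
    for transition, wait in train_waits:
        sums[transition] += wait
        counts[transition] += 1
    global_mean = sum(sums.values()) / max(sum(counts.values()), 1)
    means = {t: sums[t] / counts[t] for t in sums}
    # Transitions never seen at training time fall back to the global mean.
    return lambda transition: means.get(transition, global_mean)
```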
10 changes: 5 additions & 5 deletions leaderboard/conformance/synthetic-toy.json
@@ -17,13 +17,13 @@
"paper": null,
"score": {
"fitness": 1.0,
"precision": 0.75,
"fscore": 0.8571428571428571,
"n_test_transitions": 6,
"n_model_transitions": 8
"precision": 1.0,
"fscore": 1.0,
"n_test_transitions": 9,
"n_model_transitions": 9
},
"scored_at": "2026-04-30T00:00:00Z",
"notes": "Directly-follows graph extracted from training cases. Perfect fitness on synthetic-toy (every test transition was seen at training time); precision drops because the model carries two transitions the test never uses."
"notes": "Directly-follows graph extracted from training cases. With 200 cases the train and test partitions both observe every path-graph edge, so fitness = precision = 1.0. Any future submission has to keep this floor while generalizing to a different test split."
}
]
}
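
How the fitness/precision/F numbers decompose, as a sketch over directly-follows edge sets. The exact definitions are an assumption, but they reproduce both entries: the old 6-of-8 precision gives F = 2·1.0·0.75/1.75 ≈ 0.8571, and 9-of-9 gives F = 1.0:

```python
# Assumed edge-set reading of DFG fitness x precision -> F-score;
# pm_bench.score.score_conformance is the source of truth.
def dfg_fscore(
    model_edges: set[tuple[str, str]], test_edges: set[tuple[str, str]]
) -> tuple[float, float, float]:
    fitness = len(test_edges & model_edges) / len(test_edges)     # test edges covered
    precision = len(model_edges & test_edges) / len(model_edges)  # model edges used
    denom = fitness + precision
    f = 2 * fitness * precision / denom if denom else 0.0         # harmonic mean
    return f, fitness, precision
```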
4 changes: 2 additions & 2 deletions leaderboard/next-event/synthetic-toy.json
@@ -16,9 +16,9 @@
"code": "https://github.com/erphq/pm-bench/blob/main/pm_bench/baselines/markov.py",
"paper": null,
"score": {
"top1": 0.975609756097561,
"top1": 0.930379746835443,
"top3": 1.0,
"n": 41
"n": 158
},
"scored_at": "2026-04-30T00:00:00Z",
"notes": "First-order Markov reference baseline shipped with pm-bench. Trained on the train partition only; falls back to unigram for unseen last-activities. The floor any 'real' sequence model has to clear."
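
An illustrative version of the first-order Markov baseline the notes describe, with the unigram fallback; the shipped code is `pm_bench/baselines/markov.py`:

```python
# Sketch only: bigram counts over training traces, unigram fallback for
# last-activities never seen in training.
from collections import Counter, defaultdict

def fit_markov(train_traces: list[list[str]]):
    bigrams: dict[str, Counter] = defaultdict(Counter)
    unigram: Counter = Counter()
    for trace in train_traces:
        unigram.update(trace)
        for prev, nxt in zip(trace, trace[1:]):
            bigrams[prev][nxt] += 1

    def top_k(last_activity: str, k: int = 3) -> list[str]:
        dist = bigrams[last_activity] if last_activity in bigrams else unigram
        return [a for a, _ in dist.most_common(k)]

    return top_k
```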
28 changes: 28 additions & 0 deletions leaderboard/outcome/synthetic-toy.json
@@ -0,0 +1,28 @@
{
"task": "outcome",
"dataset": "synthetic-toy",
"metric": "ROC AUC (higher is better)",
"scored_with": "pm_bench.score.score_outcome",
"split": {
"kind": "case-chrono",
"train_frac": 0.7,
"val_frac": 0.1
},
"outcome_rule": "case ends with `delivery_confirmed` (synthetic-toy happy path)",
"entries": [
{
"model": "prior-ref",
"version": "0.1.0",
"predictions_path": "leaderboard/predictions/outcome/synthetic-toy/prior-ref.csv.gz",
"code": "https://github.com/erphq/pm-bench/blob/main/pm_bench/baselines/prior_outcome.py",
"paper": null,
"score": {
"auc": 0.631858407079646,
"n": 158,
"n_pos": 45
},
"scored_at": "2026-04-30T00:00:00Z",
"notes": "Last-activity-conditioned positive rate (with global-rate fallback for unseen activities). The dumbest baseline that uses any prefix signal; a real model has to clear AUC 0.63 to claim it's reading the trace."
}
]
}
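
A sketch of the `prior-ref` scorer as the notes describe it; the checked-in baseline is `pm_bench/baselines/prior_outcome.py`, and the helper names here are hypothetical:

```python
# Last-activity-conditioned positive rate, global-rate fallback for
# activities unseen at training time. Illustrative, not the shipped code.
from collections import Counter

def fit_prior(train_prefixes: list[tuple[str, int]]):
    """train_prefixes: (last_activity_of_prefix, outcome 0 or 1) pairs."""
    pos: Counter = Counter()
    tot: Counter = Counter()
    for last_act, y in train_prefixes:
        pos[last_act] += y
        tot[last_act] += 1
    global_rate = sum(pos.values()) / max(sum(tot.values()), 1)
    return lambda last_act: pos[last_act] / tot[last_act] if tot[last_act] else global_rate
```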
Binary file not shown.
@@ -31,6 +31,10 @@
[
"received",
"payment_pending"
],
[
"ship_order",
"delivery_confirmed"
]
]
}
Binary file modified leaderboard/predictions/next-event/synthetic-toy/markov-ref.csv.gz
Binary file not shown.
Binary file not shown.
Binary file not shown.
6 changes: 3 additions & 3 deletions leaderboard/remaining-time/synthetic-toy.json
@@ -16,11 +16,11 @@
"code": "https://github.com/erphq/pm-bench/blob/main/pm_bench/baselines/mean_time.py",
"paper": null,
"score": {
"mae_days": 1.2546469315499607,
"n": 41
"mae_days": 1.348129855278205,
"n": 158
},
"scored_at": "2026-04-30T00:00:00Z",
"notes": "Constant prediction = mean remaining-time observed on training prefixes. The dumbest model that still respects the train/test split - the floor any temporal model must clear."
"notes": "Constant prediction = mean remaining-time observed on training prefixes. The dumbest model that still respects the train/test split the floor any temporal model must clear."
}
]
}
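
The constant-mean reference and its MAE scoring fit in a few lines; a sketch of the idea, not `pm_bench/baselines/mean_time.py` verbatim:

```python
def fit_mean_remaining(train_remaining_days: list[float]) -> float:
    # One constant: the mean remaining time over all training prefixes.
    return sum(train_remaining_days) / len(train_remaining_days)

def mae_days(preds: list[float], truth: list[float]) -> float:
    return sum(abs(p - t) for p, t in zip(preds, truth)) / len(truth)
```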
2 changes: 1 addition & 1 deletion pm_bench/_synth.py
@@ -36,7 +36,7 @@
WEIGHTS: list[float] = [0.50, 0.15, 0.15, 0.10, 0.10]


def synthetic_log(n_cases: int = 50, seed: int = 42) -> Iterator[Event]:
def synthetic_log(n_cases: int = 200, seed: int = 42) -> Iterator[Event]:
"""Yield `(case_id, activity, timestamp)` tuples deterministically."""
rng = random.Random(seed)
start = dt.datetime(2024, 1, 1)
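
To sanity-check the new default end-to-end, one can count positive cases straight off the generator; the `delivery_confirmed` ending rule comes from the outcome board's `outcome_rule` above, and this snippet is illustrative, not part of the PR:

```python
# Counts positives under the documented rule: case ends with delivery_confirmed.
from collections import defaultdict

from pm_bench._synth import synthetic_log

traces: dict[str, list[str]] = defaultdict(list)
for case_id, activity, _ts in synthetic_log():  # n_cases now defaults to 200
    traces[case_id].append(activity)
positives = sum(t[-1] == "delivery_confirmed" for t in traces.values())
print(f"{positives} positive / {len(traces)} total cases")
```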
51 changes: 51 additions & 0 deletions pm_bench/leaderboard.py
@@ -26,15 +26,18 @@
from pm_bench.predictions import Prediction
from pm_bench.prefixes import (
PREFIX_SEP,
OutcomeTarget,
Prefix,
TimeTarget,
extract_outcome_targets,
extract_prefixes,
extract_remaining_time_targets,
)
from pm_bench.score import (
score_bottleneck,
score_conformance,
score_next_event,
score_outcome,
score_remaining_time,
)

@@ -139,6 +142,23 @@ def _bottleneck_truth_for_dataset(name: str) -> list[BottleneckTarget]:
return list(extract_bottleneck_targets(events, test_cases))


def _outcome_truth_for_dataset(name: str) -> list[OutcomeTarget]:
"""Canonical outcome truth set for a known dataset.

The per-dataset positive-outcome rule lives in `pm_bench._synth` for
`synthetic-toy`; other datasets register their own rule when pinned.
"""
if name == "synthetic-toy":
from pm_bench._synth import is_positive_outcome

events, test_cases = _events_and_test_cases(name)
return list(extract_outcome_targets(events, test_cases, is_positive_outcome))
raise ValueError(
f"outcome truth for dataset {name!r} not yet wired; register an outcome "
"rule alongside the dataset"
)


def _rescore_next_event(board: Board, repo_root: Path) -> list[tuple[Entry, dict]]:
truth = _truth_for_dataset(board.dataset)
truth_keys = [(t.case_id, t.prefix_idx) for t in truth]
@@ -244,6 +264,35 @@ def _rescore_conformance(board: Board, repo_root: Path) -> list[tuple[Entry, dic
return out


def _rescore_outcome(board: Board, repo_root: Path) -> list[tuple[Entry, dict]]:
import csv
import gzip

truth = _outcome_truth_for_dataset(board.dataset)
truth_keys = [(t.case_id, t.prefix_idx) for t in truth]
truth_int = [t.outcome for t in truth]

out: list[tuple[Entry, dict]] = []
for entry in board.entries:
pred_path = repo_root / entry.predictions_path
opener = gzip.open if str(pred_path).endswith(".gz") else open
pred_lookup: dict[tuple[str, int], float] = {}
with opener(pred_path, "rt", newline="") as f:
reader = csv.DictReader(f)
for row in reader:
pred_lookup[(row["case_id"], int(row["prefix_idx"]))] = float(row["score"])
missing = [k for k in truth_keys if k not in pred_lookup]
if missing:
raise ValueError(
f"{entry.model}: predictions missing {len(missing)} target(s); "
f"first missing {missing[0]}"
)
preds = [pred_lookup[k] for k in truth_keys]
s = score_outcome(preds, truth_int)
out.append((entry, {"auc": s.auc, "n": s.n, "n_pos": s.n_pos}))
return out


def rescore(board: Board, repo_root: str | Path = ".") -> list[tuple[Entry, dict]]:
"""Re-run scoring for every entry; return (entry, fresh_score) pairs."""
root = Path(repo_root)
@@ -255,6 +304,8 @@ def rescore(board: Board, repo_root: str | Path = ".") -> list[tuple[Entry, dict
return _rescore_bottleneck(board, root)
if board.task == "conformance":
return _rescore_conformance(board, root)
if board.task == "outcome":
return _rescore_outcome(board, root)
raise ValueError(f"unknown task: {board.task}")


4 changes: 2 additions & 2 deletions pm_bench/registry.yml
@@ -77,8 +77,8 @@ datasets:

- name: synthetic-toy
title: "Synthetic toy event log (deterministic, generated)"
cases: 50
events: 250
cases: 200
events: 965
landing_url: null
download_url: null
sha256: null
12 changes: 11 additions & 1 deletion tests/test_leaderboard.py
@@ -81,6 +81,14 @@ def test_cli_leaderboard_verify_passes() -> None:
assert "no drift" in r.output


def test_outcome_board_loads_and_verifies() -> None:
p = REPO_ROOT / "leaderboard" / "outcome" / "synthetic-toy.json"
board = load_board(p)
assert board.task == "outcome"
drifts = verify(board, repo_root=REPO_ROOT)
assert drifts == [], drifts


def test_remaining_time_board_loads_and_verifies() -> None:
board = load_board(TIME_BOARD_PATH)
assert board.task == "remaining-time"
@@ -141,7 +149,7 @@ def test_board_to_markdown_includes_model_and_score() -> None:
board = load_board(BOARD_PATH)
md = board_to_markdown(board)
assert "markov-ref" in md
assert "0.9756" in md
assert "0.9304" in md
assert "next-event" in md


@@ -150,6 +158,8 @@ def test_all_standings_markdown_lists_every_board() -> None:
assert "next-event · synthetic-toy" in md
assert "remaining-time · synthetic-toy" in md
assert "bottleneck · synthetic-toy" in md
assert "outcome · synthetic-toy" in md
assert "conformance · synthetic-toy" in md


def test_checked_in_standings_md_is_up_to_date() -> None:
19 changes: 8 additions & 11 deletions tests/test_outcome.py
@@ -1,11 +1,9 @@
"""End-to-end + targeted tests for the outcome task.

Synthetic-toy with seed=42 doesn't put any `delivery_confirmed` cases
in the test split (path-4 is 10% of cases and the chronological tail
happens to have none), so we test the outcome machinery on a hand-built
event set with controlled class balance instead. The CLI smoke test
runs against synthetic-toy and asserts the pipeline executes cleanly,
even though the AUC degenerates to 0.5 (n_pos=0 in test).
Synthetic-toy at n_cases=200 places ~45 positives in the test partition,
so the outcome AUC is meaningful (prior-ref ≈ 0.63). The targeted unit
tests still use a hand-built event set with controlled class balance so
the AUC math is checkable by inspection.
"""
from __future__ import annotations

@@ -120,8 +118,7 @@ def test_score_outcome_round_trip_via_writer(tmp_path) -> None:


def test_full_outcome_pipeline_on_synthetic_toy(tmp_path) -> None:
"""Pipeline runs cleanly end-to-end. AUC degenerates because seed=42's
test partition has no positives, but the contract still holds."""
"""Full pipeline runs and AUC is meaningful (n_cases=200 default)."""
runner = CliRunner()
split_path = tmp_path / "split.json"
prefixes_path = tmp_path / "prefixes.csv"
@@ -155,6 +152,6 @@ def test_full_outcome_pipeline_on_synthetic_toy(tmp_path) -> None:
result = json.loads(r.output)
assert result["task"] == "outcome"
assert result["n"] > 0
# synthetic-toy with seed=42 happens to have n_pos=0 in test
# → degenerate AUC = 0.5 by convention. The pipeline still runs.
assert 0.0 <= result["auc"] <= 1.0
assert result["n_pos"] > 0 # both classes present in test partition
# The prior baseline should beat the trivial 0.5 floor with both classes.
assert result["auc"] > 0.5