4 changes: 2 additions & 2 deletions GOALS.md
@@ -8,8 +8,8 @@ Be the default benchmark for new process-mining methods. Within 18 months,
 - 7 datasets fetchable + hash-verified — fetch/hash machinery shipped
   (`pm-bench fetch <name> [--pin]`); per-dataset hash pins pending
   the one-time TOS-gated downloads
-- 5 tasks with fixed scoring scripts (next-event ✅, remaining-time ✅;
-  outcome, conformance, bottleneck pending)
+- 5 tasks with fixed scoring scripts (next-event ✅, remaining-time ✅,
+  outcome ✅; conformance, bottleneck pending)
 - `gnn` runs end-to-end as the reference baseline (Markov reference ✅;
   `gnn` integration pending the first pinned dataset)
 - End-to-end loop runs on `synthetic-toy` ✅ — split → prefixes →
6 changes: 4 additions & 2 deletions README.md
@@ -221,8 +221,10 @@ honesty. The point of the benchmark is to make the comparison real.
   `$PM_BENCH_CACHE` resolution); per-dataset hash-pinning PRs
   pending the one-time TOS-gated downloads from 4TU and Mendeley.
 - [x] v0.2 — splits + targets for next-event ✅ and remaining-time ✅
-- [🟡] v0.3 — scoring scripts for all 5 tasks. next-event ✅ and
-  remaining-time ✅; outcome / conformance / bottleneck remain.
+- [🟡] v0.3 — scoring scripts for all 5 tasks. next-event ✅,
+  remaining-time ✅, outcome ✅ (AUC scoring + prior baseline +
+  pipeline; leaderboard entry waits on a dataset whose test split
+  has both classes); conformance / bottleneck remain.
 - [🟡] v0.4 — leaderboard CI + landing page. Standings format,
   reference Markov entry, `pm-bench leaderboard [--all] --verify`,
   and the dedicated `leaderboard.yml` GitHub workflow shipped;
16 changes: 16 additions & 0 deletions STATUS.md
@@ -60,6 +60,22 @@ pm-bench fetch bpi2020 --pin

## Recently shipped

- **Outcome task (binary AUC)** (`outcome-task` branch).
- `score_outcome` — pure-CPython rank-sum AUC with average-rank
tie-breaking; the degenerate single-class case returns 0.5 by
convention rather than NaN (see the sketch after this diff).
- `pm_bench/baselines/prior_outcome.py` — last-activity-conditioned
positive rate (with global-rate fallback for unseen activities).
The dumbest baseline that uses *any* prefix signal.
- CLI: `--task outcome`, `--baseline prior`, end-to-end through
`prefixes / predict / score`.
- Per-dataset outcome rule registered for synthetic-toy
(`is_positive_outcome`: case ends with `delivery_confirmed`).
- **No leaderboard entry yet** — synthetic-toy with seed=42 happens
to have zero positives in the test partition, so AUC degenerates.
The pipeline runs end-to-end and the test asserts it; a real
leaderboard entry waits on a pinned BPI dataset.
- 8 new tests; 73 total, ruff clean.
- **Remaining-time task** (`remaining-time` branch).
- `score_remaining_time` (MAE in days), prefixes/predictions
formats parallel to next-event so models share a loader.
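
For reference, the rank-sum AUC that `score_outcome` implements fits in a few
lines of pure CPython. The sketch below is illustrative, not the shipped
function — the name and the tie-handling loop are assumptions; only the
conventions (average ranks for ties, 0.5 for a single-class split) come from
this PR:

```python
def rank_sum_auc(labels: list[int], scores: list[float]) -> float:
    """Mann-Whitney AUC: (R_pos - n_pos*(n_pos+1)/2) / (n_pos * n_neg),
    where R_pos sums the positives' 1-based ranks over all scores."""
    n = len(scores)
    n_pos = sum(labels)
    n_neg = n - n_pos
    if n_pos == 0 or n_neg == 0:
        return 0.5  # degenerate single-class case: 0.5 by convention
    order = sorted(range(n), key=lambda i: scores[i])
    ranks = [0.0] * n
    i = 0
    while i < n:  # tied scores share the average of the ranks they span
        j = i
        while j + 1 < n and scores[order[j + 1]] == scores[order[i]]:
            j += 1
        avg_rank = (i + j) / 2 + 1.0  # mean of 1-based ranks i+1 .. j+1
        for k in range(i, j + 1):
            ranks[order[k]] = avg_rank
        i = j + 1
    rank_pos = sum(r for r, y in zip(ranks, labels) if y)
    return (rank_pos - n_pos * (n_pos + 1) / 2) / (n_pos * n_neg)
```

Note that a model scoring every prefix identically makes all ranks tie and
lands at exactly 0.5, which is what makes the prior baseline's per-activity
conditioning measurable.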
12 changes: 12 additions & 0 deletions pm_bench/__init__.py
@@ -5,20 +5,26 @@

from pm_bench.predictions import Prediction, read_predictions_csv, write_predictions_csv
from pm_bench.prefixes import (
OutcomeTarget,
Prefix,
TimeTarget,
extract_outcome_targets,
extract_prefixes,
extract_remaining_time_targets,
read_outcome_targets_csv,
read_prefixes_csv,
read_time_targets_csv,
write_outcome_targets_csv,
write_prefixes_csv,
write_time_targets_csv,
)
from pm_bench.registry import Dataset, get_dataset, load_registry
from pm_bench.score import (
NextEventScore,
OutcomeScore,
RemainingTimeScore,
score_next_event,
score_outcome,
score_remaining_time,
)
from pm_bench.split import Event, Split, case_chrono_split
@@ -27,21 +33,27 @@
"Dataset",
"Event",
"NextEventScore",
"OutcomeScore",
"OutcomeTarget",
"Prediction",
"Prefix",
"RemainingTimeScore",
"Split",
"TimeTarget",
"case_chrono_split",
"extract_outcome_targets",
"extract_prefixes",
"extract_remaining_time_targets",
"get_dataset",
"load_registry",
"read_outcome_targets_csv",
"read_predictions_csv",
"read_prefixes_csv",
"read_time_targets_csv",
"score_next_event",
"score_outcome",
"score_remaining_time",
"write_outcome_targets_csv",
"write_predictions_csv",
"write_prefixes_csv",
"write_time_targets_csv",
10 changes: 10 additions & 0 deletions pm_bench/_synth.py
@@ -48,3 +48,13 @@ def synthetic_log(n_cases: int = 50, seed: int = 42) -> Iterator[Event]:
for activity in path:
yield (str(case_id), activity, t)
t += dt.timedelta(hours=rng.randint(1, 48))


def is_positive_outcome(activities: list[str]) -> bool:
"""Synthetic-toy outcome rule: case ends with `delivery_confirmed`.

This corresponds to the happy path (PATHS[4]) — a fully delivered
order. Cancelled, refunded, and shipped-but-unconfirmed cases are
all negative.
"""
return bool(activities) and activities[-1] == "delivery_confirmed"
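
A quick sanity check of the rule (the traces here are made up for
illustration; they are not the actual `PATHS` entries):

```python
from pm_bench._synth import is_positive_outcome

assert is_positive_outcome(["order_received", "shipped", "delivery_confirmed"])
assert not is_positive_outcome(["order_received", "cancelled"])  # ends negative
assert not is_positive_outcome([])  # empty trace is negative via the bool() guard
```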
14 changes: 14 additions & 0 deletions pm_bench/baselines/__init__.py
@@ -16,14 +16,28 @@
read_time_predictions_csv,
write_time_predictions_csv,
)
from pm_bench.baselines.prior_outcome import (
OutcomePrediction,
PriorOutcomeBaseline,
fit_prior_outcome,
predict_prior_outcome,
read_outcome_predictions_csv,
write_outcome_predictions_csv,
)

__all__ = [
"MarkovBaseline",
"MeanTimeBaseline",
"OutcomePrediction",
"PriorOutcomeBaseline",
"TimePrediction",
"fit_mean_time",
"fit_prior_outcome",
"predict_markov",
"predict_mean_time",
"predict_prior_outcome",
"read_outcome_predictions_csv",
"read_time_predictions_csv",
"write_outcome_predictions_csv",
"write_time_predictions_csv",
]
132 changes: 132 additions & 0 deletions pm_bench/baselines/prior_outcome.py
@@ -0,0 +1,132 @@
"""Last-activity-conditioned reference baseline for outcome prediction.

For every (last_activity_in_prefix → case_outcome) pair observed on the
training cases, store the empirical positive rate. At test time, look
up the prefix's last activity and return its rate. This is the dumbest
baseline that uses *any* prefix information — a model that ties this
isn't conditioning on the trace at all.

Falls back to the global positive rate when a prefix ends in an
activity unseen during training.
"""
from __future__ import annotations

from collections import defaultdict
from collections.abc import Callable, Iterable
from dataclasses import dataclass

from pm_bench.prefixes import OutcomeTarget
from pm_bench.split import Activity, CaseId, Event


@dataclass(frozen=True)
class PriorOutcomeBaseline:
"""Last-activity → positive rate, plus a global fallback."""

by_last: dict[Activity, float]
global_rate: float


@dataclass(frozen=True)
class OutcomePrediction:
case_id: CaseId
prefix_idx: int
score: float


def fit_prior_outcome(
events: Iterable[Event],
train_case_ids: Iterable[CaseId],
is_positive: Callable[[list[Activity]], bool],
) -> PriorOutcomeBaseline:
"""Aggregate per-last-activity outcome rates over training prefixes."""
keep = set(train_case_ids)
by_case: dict[CaseId, list[tuple[Activity, object]]] = {}
for case_id, activity, ts in events:
if case_id not in keep:
continue
by_case.setdefault(case_id, []).append((activity, ts))

counts: dict[Activity, list[int]] = defaultdict(lambda: [0, 0]) # [pos, total]
pos_cases = 0
total_cases = 0
for rows in by_case.values():
rows.sort(key=lambda r: r[1])
activities = [a for a, _ in rows]
if len(activities) < 2:
continue
total_cases += 1
outcome = 1 if is_positive(activities) else 0
if outcome:
pos_cases += 1
for k in range(1, len(activities)):
last = activities[k - 1]
counts[last][1] += 1
if outcome:
counts[last][0] += 1

by_last = {
last: (pos / total) if total else 0.0
for last, (pos, total) in counts.items()
}
global_rate = (pos_cases / total_cases) if total_cases else 0.0
return PriorOutcomeBaseline(by_last=by_last, global_rate=global_rate)


def predict_prior_outcome(
model: PriorOutcomeBaseline,
targets: Iterable[OutcomeTarget],
events_by_case: dict[CaseId, list[Activity]] | None = None,
) -> list[OutcomePrediction]:
"""Score each target by its prefix's last-activity training rate.

`events_by_case` maps every case_id we'll be asked about to its
full ordered activity list (test cases included). Looking up the
prefix's last activity needs the full sequence; the targets file
by itself only carries `(case_id, prefix_idx)`.
"""
out: list[OutcomePrediction] = []
for t in targets:
score = model.global_rate
if events_by_case is not None:
seq = events_by_case.get(t.case_id)
            # Guard the lower bound too: prefix_idx == 0 would silently alias seq[-1].
            if seq is not None and 1 <= t.prefix_idx <= len(seq):
last = seq[t.prefix_idx - 1]
score = model.by_last.get(last, model.global_rate)
out.append(OutcomePrediction(case_id=t.case_id, prefix_idx=t.prefix_idx, score=score))
return out


def write_outcome_predictions_csv(
predictions: Iterable[OutcomePrediction],
path: str,
) -> int:
"""Write outcome predictions to CSV. Returns row count."""
import csv

n = 0
with open(path, "w", newline="") as f:
w = csv.writer(f)
w.writerow(["case_id", "prefix_idx", "score"])
for p in predictions:
            w.writerow([p.case_id, p.prefix_idx, repr(p.score)])  # repr() round-trips full float precision
n += 1
return n


def read_outcome_predictions_csv(path: str) -> list[OutcomePrediction]:
"""Read an outcome predictions CSV."""
import csv

out: list[OutcomePrediction] = []
with open(path, newline="") as f:
r = csv.DictReader(f)
for row in r:
out.append(
OutcomePrediction(
case_id=row["case_id"],
prefix_idx=int(row["prefix_idx"]),
score=float(row["score"]),
)
)
return out
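
Taken together, the pieces wire up like this on the synthetic log. Everything
below is a sketch: the 80/20 split and the `Target` namedtuple are stand-ins
(the shipped pipeline goes through `pm-bench prefixes / predict / score`, and
`predict_prior_outcome` only reads `case_id` and `prefix_idx` off each target):

```python
from collections import namedtuple

from pm_bench._synth import is_positive_outcome, synthetic_log
from pm_bench.baselines.prior_outcome import fit_prior_outcome, predict_prior_outcome

events = list(synthetic_log(n_cases=50, seed=7))

# Group each case's activities in event order.
activities_by_case: dict[str, list[str]] = {}
for case_id, activity, _ts in events:
    activities_by_case.setdefault(case_id, []).append(activity)

# Naive 80/20 case split, for illustration only — case_chrono_split
# is the real splitter.
case_ids = sorted(activities_by_case, key=int)
train_ids, test_ids = case_ids[:40], case_ids[40:]

model = fit_prior_outcome(events, train_ids, is_positive_outcome)

# predict_prior_outcome only touches .case_id and .prefix_idx, so a
# namedtuple stands in for OutcomeTarget in this sketch.
Target = namedtuple("Target", ["case_id", "prefix_idx"])
targets = [
    Target(cid, k)
    for cid in test_ids
    for k in range(1, len(activities_by_case[cid]))
]
preds = predict_prior_outcome(model, targets, events_by_case=activities_by_case)
print(f"{len(preds)} predictions, global positive rate {model.global_rate:.3f}")
```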