4 changes: 2 additions & 2 deletions GOALS.md
@@ -8,8 +8,8 @@ Be the default benchmark for new process-mining methods. Within 18 months,
 - 7 datasets fetchable + hash-verified — fetch/hash machinery shipped
   (`pm-bench fetch <name> [--pin]`); per-dataset hash pins pending
   the one-time TOS-gated downloads
-- 5 tasks with fixed scoring scripts (next-event ✅, remaining-time ✅;
-  outcome, conformance, bottleneck pending)
+- 5 tasks with fixed scoring scripts (next-event ✅, remaining-time ✅,
+  outcome ✅; conformance, bottleneck pending)
 - `gnn` runs end-to-end as the reference baseline (Markov reference ✅;
   `gnn` integration pending the first pinned dataset)
 - End-to-end loop runs on `synthetic-toy` ✅ — split → prefixes →
6 changes: 4 additions & 2 deletions README.md
@@ -221,8 +221,10 @@ honesty. The point of the benchmark is to make the comparison real.
   `$PM_BENCH_CACHE` resolution); per-dataset hash-pinning PRs
   pending the one-time TOS-gated downloads from 4TU and Mendeley.
 - [x] v0.2 — splits + targets for next-event ✅ and remaining-time ✅
-- [🟡] v0.3 — scoring scripts for all 5 tasks. next-event ✅ and
-  remaining-time ✅; outcome / conformance / bottleneck remain.
+- [🟡] v0.3 — scoring scripts for all 5 tasks. next-event ✅,
+  remaining-time ✅, outcome ✅ (AUC scoring + prior baseline +
+  pipeline; leaderboard entry waits on a dataset whose test split
+  has both classes); conformance / bottleneck remain.
 - [🟡] v0.4 — leaderboard CI + landing page. Standings format,
   reference Markov entry, `pm-bench leaderboard [--all] --verify`,
   and the dedicated `leaderboard.yml` GitHub workflow shipped;
16 changes: 16 additions & 0 deletions STATUS.md
@@ -60,6 +60,22 @@ pm-bench fetch bpi2020 --pin

## Recently shipped

- **Outcome task (binary AUC)** (`outcome-task` branch).
- `score_outcome` — pure-CPython rank-sum AUC with average-rank
tie-breaking; the degenerate single-class case returns 0.5 by
convention rather than NaN (see the sketch after this diff).
- `pm_bench/baselines/prior_outcome.py` — last-activity-conditioned
positive rate (with global-rate fallback for unseen activities).
The dumbest baseline that uses *any* prefix signal.
- CLI: `--task outcome`, `--baseline prior`, end-to-end through
`prefixes / predict / score`.
- Per-dataset outcome rule registered for synthetic-toy
(`is_positive_outcome`: case ends with `delivery_confirmed`).
- **No leaderboard entry yet** — synthetic-toy with seed=42 happens
to have zero positives in the test partition, so AUC degenerates.
The pipeline runs end-to-end and the test asserts it; a real
leaderboard entry waits on a pinned BPI dataset.
- 8 new tests; 73 total, ruff clean.
- **Remaining-time task** (`remaining-time` branch).
- `score_remaining_time` (MAE in days), prefixes/predictions
formats parallel to next-event so models share a loader.
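
For reference, the rank-sum AUC that `score_outcome` implements fits in a few
lines of pure CPython. The sketch below is illustrative, not the shipped
function — the name and the tie-handling loop are assumptions; only the
conventions (average ranks for ties, 0.5 for a single-class split) come from
this PR:

```python
def rank_sum_auc(labels: list[int], scores: list[float]) -> float:
    """Mann-Whitney AUC: (R_pos - n_pos*(n_pos+1)/2) / (n_pos * n_neg),
    where R_pos sums the positives' 1-based ranks over all scores."""
    n = len(scores)
    n_pos = sum(labels)
    n_neg = n - n_pos
    if n_pos == 0 or n_neg == 0:
        return 0.5  # degenerate single-class case: 0.5 by convention
    order = sorted(range(n), key=lambda i: scores[i])
    ranks = [0.0] * n
    i = 0
    while i < n:  # tied scores share the average of the ranks they span
        j = i
        while j + 1 < n and scores[order[j + 1]] == scores[order[i]]:
            j += 1
        avg_rank = (i + j) / 2 + 1.0  # mean of 1-based ranks i+1 .. j+1
        for k in range(i, j + 1):
            ranks[order[k]] = avg_rank
        i = j + 1
    rank_pos = sum(r for r, y in zip(ranks, labels) if y)
    return (rank_pos - n_pos * (n_pos + 1) / 2) / (n_pos * n_neg)
```

Note that a model scoring every prefix identically makes all ranks tie and
lands at exactly 0.5, which is what makes the prior baseline's per-activity
conditioning measurable.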
12 changes: 12 additions & 0 deletions pm_bench/__init__.py
@@ -5,20 +5,26 @@

from pm_bench.predictions import Prediction, read_predictions_csv, write_predictions_csv
from pm_bench.prefixes import (
OutcomeTarget,
Prefix,
TimeTarget,
extract_outcome_targets,
extract_prefixes,
extract_remaining_time_targets,
read_outcome_targets_csv,
read_prefixes_csv,
read_time_targets_csv,
write_outcome_targets_csv,
write_prefixes_csv,
write_time_targets_csv,
)
from pm_bench.registry import Dataset, get_dataset, load_registry
from pm_bench.score import (
NextEventScore,
OutcomeScore,
RemainingTimeScore,
score_next_event,
score_outcome,
score_remaining_time,
)
from pm_bench.split import Event, Split, case_chrono_split
@@ -27,21 +33,27 @@
"Dataset",
"Event",
"NextEventScore",
"OutcomeScore",
"OutcomeTarget",
"Prediction",
"Prefix",
"RemainingTimeScore",
"Split",
"TimeTarget",
"case_chrono_split",
"extract_outcome_targets",
"extract_prefixes",
"extract_remaining_time_targets",
"get_dataset",
"load_registry",
"read_outcome_targets_csv",
"read_predictions_csv",
"read_prefixes_csv",
"read_time_targets_csv",
"score_next_event",
"score_outcome",
"score_remaining_time",
"write_outcome_targets_csv",
"write_predictions_csv",
"write_prefixes_csv",
"write_time_targets_csv",
10 changes: 10 additions & 0 deletions pm_bench/_synth.py
@@ -48,3 +48,13 @@ def synthetic_log(n_cases: int = 50, seed: int = 42) -> Iterator[Event]:
for activity in path:
yield (str(case_id), activity, t)
t += dt.timedelta(hours=rng.randint(1, 48))


def is_positive_outcome(activities: list[str]) -> bool:
"""Synthetic-toy outcome rule: case ends with `delivery_confirmed`.

This corresponds to the happy path (PATHS[4]) — a fully delivered
order. Cancelled, refunded, and shipped-but-unconfirmed cases are
all negative.
"""
return bool(activities) and activities[-1] == "delivery_confirmed"
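
A quick sanity check of the rule (the traces here are made up for
illustration; they are not the actual `PATHS` entries):

```python
from pm_bench._synth import is_positive_outcome

assert is_positive_outcome(["order_received", "shipped", "delivery_confirmed"])
assert not is_positive_outcome(["order_received", "cancelled"])  # ends negative
assert not is_positive_outcome([])  # empty trace is negative via the bool() guard
```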
14 changes: 14 additions & 0 deletions pm_bench/baselines/__init__.py
@@ -16,14 +16,28 @@
read_time_predictions_csv,
write_time_predictions_csv,
)
from pm_bench.baselines.prior_outcome import (
OutcomePrediction,
PriorOutcomeBaseline,
fit_prior_outcome,
predict_prior_outcome,
read_outcome_predictions_csv,
write_outcome_predictions_csv,
)

__all__ = [
"MarkovBaseline",
"MeanTimeBaseline",
"OutcomePrediction",
"PriorOutcomeBaseline",
"TimePrediction",
"fit_mean_time",
"fit_prior_outcome",
"predict_markov",
"predict_mean_time",
"predict_prior_outcome",
"read_outcome_predictions_csv",
"read_time_predictions_csv",
"write_outcome_predictions_csv",
"write_time_predictions_csv",
]
132 changes: 132 additions & 0 deletions pm_bench/baselines/prior_outcome.py
@@ -0,0 +1,132 @@
"""Last-activity-conditioned reference baseline for outcome prediction.

For every (last_activity_in_prefix → case_outcome) pair observed on the
training cases, store the empirical positive rate. At test time, look
up the prefix's last activity and return its rate. This is the dumbest
baseline that uses *any* prefix information — a model that ties this
isn't conditioning on the trace at all.

Falls back to the global positive rate when a prefix ends in an
activity unseen during training.
"""
from __future__ import annotations

from collections import defaultdict
from collections.abc import Callable, Iterable
from dataclasses import dataclass

from pm_bench.prefixes import OutcomeTarget
from pm_bench.split import Activity, CaseId, Event


@dataclass(frozen=True)
class PriorOutcomeBaseline:
"""Last-activity → positive rate, plus a global fallback."""

by_last: dict[Activity, float]
global_rate: float


@dataclass(frozen=True)
class OutcomePrediction:
case_id: CaseId
prefix_idx: int
score: float


def fit_prior_outcome(
events: Iterable[Event],
train_case_ids: Iterable[CaseId],
is_positive: Callable[[list[Activity]], bool],
) -> PriorOutcomeBaseline:
"""Aggregate per-last-activity outcome rates over training prefixes."""
keep = set(train_case_ids)
by_case: dict[CaseId, list[tuple[Activity, object]]] = {}
for case_id, activity, ts in events:
if case_id not in keep:
continue
by_case.setdefault(case_id, []).append((activity, ts))

counts: dict[Activity, list[int]] = defaultdict(lambda: [0, 0]) # [pos, total]
pos_cases = 0
total_cases = 0
for rows in by_case.values():
rows.sort(key=lambda r: r[1])
activities = [a for a, _ in rows]
if len(activities) < 2:
continue
total_cases += 1
outcome = 1 if is_positive(activities) else 0
if outcome:
pos_cases += 1
for k in range(1, len(activities)):
last = activities[k - 1]
counts[last][1] += 1
if outcome:
counts[last][0] += 1

by_last = {
last: (pos / total) if total else 0.0
for last, (pos, total) in counts.items()
}
global_rate = (pos_cases / total_cases) if total_cases else 0.0
return PriorOutcomeBaseline(by_last=by_last, global_rate=global_rate)


def predict_prior_outcome(
model: PriorOutcomeBaseline,
targets: Iterable[OutcomeTarget],
events_by_case: dict[CaseId, list[Activity]] | None = None,
) -> list[OutcomePrediction]:
"""Score each target by its prefix's last-activity training rate.

`events_by_case` maps every case_id we'll be asked about to its
full ordered activity list (test cases included). Looking up the
prefix's last activity needs the full sequence; the targets file
by itself only carries `(case_id, prefix_idx)`.
"""
out: list[OutcomePrediction] = []
for t in targets:
score = model.global_rate
if events_by_case is not None:
seq = events_by_case.get(t.case_id)
            # Guard the lower bound too: prefix_idx == 0 would silently alias seq[-1].
            if seq is not None and 1 <= t.prefix_idx <= len(seq):
last = seq[t.prefix_idx - 1]
score = model.by_last.get(last, model.global_rate)
out.append(OutcomePrediction(case_id=t.case_id, prefix_idx=t.prefix_idx, score=score))
return out


def write_outcome_predictions_csv(
predictions: Iterable[OutcomePrediction],
path: str,
) -> int:
"""Write outcome predictions to CSV. Returns row count."""
import csv

n = 0
with open(path, "w", newline="") as f:
w = csv.writer(f)
w.writerow(["case_id", "prefix_idx", "score"])
for p in predictions:
            w.writerow([p.case_id, p.prefix_idx, repr(p.score)])  # repr() round-trips full float precision
n += 1
return n


def read_outcome_predictions_csv(path: str) -> list[OutcomePrediction]:
"""Read an outcome predictions CSV."""
import csv

out: list[OutcomePrediction] = []
with open(path, newline="") as f:
r = csv.DictReader(f)
for row in r:
out.append(
OutcomePrediction(
case_id=row["case_id"],
prefix_idx=int(row["prefix_idx"]),
score=float(row["score"]),
)
)
return out
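
Taken together, the pieces wire up like this on the synthetic log. Everything
below is a sketch: the 80/20 split and the `Target` namedtuple are stand-ins
(the shipped pipeline goes through `pm-bench prefixes / predict / score`, and
`predict_prior_outcome` only reads `case_id` and `prefix_idx` off each target):

```python
from collections import namedtuple

from pm_bench._synth import is_positive_outcome, synthetic_log
from pm_bench.baselines.prior_outcome import fit_prior_outcome, predict_prior_outcome

events = list(synthetic_log(n_cases=50, seed=7))

# Group each case's activities in event order.
activities_by_case: dict[str, list[str]] = {}
for case_id, activity, _ts in events:
    activities_by_case.setdefault(case_id, []).append(activity)

# Naive 80/20 case split, for illustration only — case_chrono_split
# is the real splitter.
case_ids = sorted(activities_by_case, key=int)
train_ids, test_ids = case_ids[:40], case_ids[40:]

model = fit_prior_outcome(events, train_ids, is_positive_outcome)

# predict_prior_outcome only touches .case_id and .prefix_idx, so a
# namedtuple stands in for OutcomeTarget in this sketch.
Target = namedtuple("Target", ["case_id", "prefix_idx"])
targets = [
    Target(cid, k)
    for cid in test_ids
    for k in range(1, len(activities_by_case[cid]))
]
preds = predict_prior_outcome(model, targets, events_by_case=activities_by_case)
print(f"{len(preds)} predictions, global positive rate {model.global_rate:.3f}")
```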