Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,9 @@ The full loop (`split → prefixes → predict → score`) runs end-to-end on
`synthetic-toy` today; it's covered by `tests/test_e2e.py` and locks
the file formats the leaderboard depends on.

**Inspect any log.** `pm-bench stats <name-or-path>` prints a JSON summary in one shot: n_cases /
n_events / n_activities / time span / mean and median case length / top activities and transitions (`--top-n`, default 10).

**Bring your own CSV.** Any path-like argument is loaded as an event
log directly, no registry plumbing needed:

Expand Down
7 changes: 7 additions & 0 deletions STATUS.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,13 @@ pm-bench fetch bpi2020 --pin

## Recently shipped

- **`pm-bench stats <name>`** (`stats-command` branch).
- One-shot summary stats for any log: n_cases, n_events,
n_activities, time span, mean/median case length, top-N
activities and transitions.
- Pure CPython; works on synthetic-toy and any CSV path the
existing `_load_events` accepts.
- 7 new tests; 116 total.
- **Synthetic-toy bumped to 200 cases — outcome row finally lands**
(`synthetic-200` branch).
- `synthetic_log()` default `n_cases` = 200 (was 50). Test partition
Expand Down
38 changes: 38 additions & 0 deletions pm_bench/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@
score_remaining_time,
)
from pm_bench.split import case_chrono_split
from pm_bench.stats import summarize


def _load_events(name: str) -> list:
Expand Down Expand Up @@ -209,6 +210,43 @@ def _print_pin_patch(name: str, digest: str) -> None:
click.echo(f" sha256: {digest}")


@main.command()
@click.argument("name")
@click.option(
    "--top-n",
    "top_n",
    # A negative count is meaningless for a top-N ranking; reject it at the
    # CLI boundary so the user gets a usage error instead of odd output.
    type=click.IntRange(min=0),
    default=10,
    show_default=True,
    help="How many top activities / transitions to include in the output.",
)
def stats(name: str, top_n: int) -> None:
    """Summary stats for a log: cases, events, activities, span, top-N."""
    # NOTE: the docstring above doubles as the command's --help text, so the
    # longer description lives in comments instead.
    #
    # `name` is handed to `_load_events` unchanged; the resulting events are
    # summarized and printed to stdout as one indented JSON object.
    events = _load_events(name)
    summary = summarize(events, top_n=top_n)

    # Timestamps are None for an empty log; otherwise emit ISO-8601 strings
    # because datetime objects are not JSON-serializable.
    earliest = summary.earliest.isoformat() if summary.earliest else None
    latest = summary.latest.isoformat() if summary.latest else None

    payload = {
        "n_events": summary.n_events,
        "n_cases": summary.n_cases,
        "n_activities": summary.n_activities,
        "span_days": summary.span_days,
        "earliest": earliest,
        "latest": latest,
        "mean_case_length": summary.mean_case_length,
        "median_case_length": summary.median_case_length,
        "top_activities": [
            {"activity": activity, "count": count}
            for activity, count in summary.top_activities
        ],
        # Each transition key is an (a, b) pair: activity `a` directly
        # followed by activity `b` within the same case.
        "top_transitions": [
            {"a": pair[0], "b": pair[1], "count": count}
            for pair, count in summary.top_transitions
        ],
    }
    click.echo(json.dumps(payload, indent=2))


@main.command()
@click.argument("name")
@click.option("--task", default="next-event", show_default=True)
Expand Down
89 changes: 89 additions & 0 deletions pm_bench/stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
"""Quick summary stats for an event log.

Useful when inspecting a new dataset — n_cases, n_events, distinct
activity count, time span, top-N most-frequent activities and
transitions, mean / median case length. Pure CPython; runs in the
same process as the rest of pm-bench so it works on `synthetic-toy`,
any CSV path, and (eventually) any cached BPI log.
"""
from __future__ import annotations

import statistics
from collections import Counter
from collections.abc import Iterable
from dataclasses import dataclass
from datetime import datetime

from pm_bench.split import Activity, CaseId, Event


@dataclass(frozen=True)
class LogStats:
    """Immutable bundle of summary statistics for one event log."""

    # Size of the log.
    n_events: int
    n_cases: int
    n_activities: int  # number of distinct activities

    # Time covered by the log; earliest/latest are None for an empty log.
    span_days: float
    earliest: datetime | None
    latest: datetime | None

    # Number of events per case.
    mean_case_length: float
    median_case_length: float

    # Frequency rankings as (key, count) pairs; a transition key is an
    # (activity, next_activity) pair.
    top_activities: list[tuple[Activity, int]]
    top_transitions: list[tuple[tuple[Activity, Activity], int]]


def summarize(events: Iterable[Event], *, top_n: int = 10) -> LogStats:
    """Build a ``LogStats`` summary from a stream of events.

    Each event is unpacked as ``(case_id, activity, timestamp)``. The
    iterable is walked exactly once, so a generator is acceptable. Within a
    case, events are ordered by timestamp before each consecutive pair is
    counted as a transition.

    An empty input yields zero counts, a zero span, and ``None`` for
    ``earliest`` / ``latest``. Both top-N lists are ranked by count
    descending, with ties broken by key order.
    """
    traces: dict[CaseId, list[tuple[Activity, datetime]]] = {}
    activity_freq: Counter[Activity] = Counter()
    first_ts: datetime | None = None
    last_ts: datetime | None = None

    # Single pass: group events per case while tracking the global time range.
    for case_id, activity, ts in events:
        if case_id in traces:
            traces[case_id].append((activity, ts))
        else:
            traces[case_id] = [(activity, ts)]
        activity_freq[activity] += 1
        if first_ts is None or ts < first_ts:
            first_ts = ts
        if last_ts is None or ts > last_ts:
            last_ts = ts

    transition_freq: Counter[tuple[Activity, Activity]] = Counter()
    lengths: list[int] = []
    for trace in traces.values():
        # Input order is not trusted; the sort is stable, so events sharing
        # a timestamp keep their arrival order.
        trace.sort(key=lambda row: row[1])
        lengths.append(len(trace))
        transition_freq.update(
            (src, dst)
            for (src, _), (dst, _) in zip(trace, trace[1:], strict=False)
        )

    if first_ts is None or last_ts is None:
        span_days = 0.0
    else:
        span_days = (last_ts - first_ts).total_seconds() / 86400.0

    if lengths:
        mean_len = statistics.fmean(lengths)
        median_len = statistics.median(lengths)
    else:
        mean_len = 0.0
        median_len = 0.0

    return LogStats(
        n_events=sum(lengths),
        n_cases=len(traces),
        n_activities=len(activity_freq),
        span_days=span_days,
        earliest=first_ts,
        latest=last_ts,
        mean_case_length=mean_len,
        median_case_length=median_len,
        top_activities=_top_n_sorted(activity_freq, top_n),
        top_transitions=_top_n_sorted(transition_freq, top_n),
    )


def _top_n_sorted(counter: Counter, n: int) -> list:
    """Return the ``n`` most frequent ``(key, count)`` pairs of ``counter``.

    Pairs are ordered by count descending; equal counts are ordered by key
    ascending, so the result does not depend on insertion order.

    A non-positive ``n`` yields an empty list. Without this guard a negative
    ``n`` would reach the ``[:n]`` slice and return everything *except* the
    last ``|n|`` entries, which is never what a "top N" caller means.
    """
    if n <= 0:
        return []
    ranked = sorted(counter.items(), key=lambda kv: (-kv[1], kv[0]))
    return ranked[:n]
72 changes: 72 additions & 0 deletions tests/test_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"""Tests for the summary-stats helper and CLI."""
from __future__ import annotations

import datetime as dt
import json

from click.testing import CliRunner

from pm_bench.cli import main
from pm_bench.stats import summarize


def _events() -> list[tuple[str, str, dt.datetime]]:
    """Return a small fixture log: three cases with 3, 2 and 1 events."""
    start = dt.datetime(2024, 1, 1)
    hour = dt.timedelta(hours=1)
    day = dt.timedelta(days=1)
    return [
        # c1: a -> b -> c, one hour apart.
        ("c1", "a", start),
        ("c1", "b", start + hour),
        ("c1", "c", start + 2 * hour),
        # c2: a -> b, starting one day later.
        ("c2", "a", start + day),
        ("c2", "b", start + day + hour),
        # c3: a single event, two days after the start.
        ("c3", "x", start + 2 * day),
    ]


def test_summarize_basic_counts() -> None:
    """Event, case and distinct-activity totals match the fixture."""
    result = summarize(_events())
    observed = (result.n_events, result.n_cases, result.n_activities)
    # 6 events across 3 cases; the distinct activities are a, b, c, x.
    assert observed == (6, 3, 4)


def test_summarize_case_lengths() -> None:
    """Mean and median are taken over per-case event counts."""
    result = summarize(_events())
    lengths = [3, 2, 1]  # events in c1, c2, c3
    assert result.mean_case_length == sum(lengths) / len(lengths)
    assert result.median_case_length == 2


def test_summarize_top_activities_sorted_by_count_desc() -> None:
    """The activity ranking is non-increasing in count and starts with 'a'."""
    ranking = summarize(_events(), top_n=10).top_activities
    counts = [count for _, count in ranking]
    # Every count must be >= the one that follows it.
    assert all(
        higher >= lower for higher, lower in zip(counts, counts[1:], strict=False)
    )
    assert ranking[0] == ("a", 2)


def test_summarize_top_transitions() -> None:
    """Directly-follows pairs are counted per case and summed over the log."""
    by_pair = dict(summarize(_events()).top_transitions)
    # a→b occurs once in c1 and once in c2.
    assert by_pair[("a", "b")] == 2
    # b→c occurs only in c1.
    assert by_pair[("b", "c")] == 1


def test_summarize_top_n_caps() -> None:
    """`top_n` caps both rankings and keeps the highest-ranked entries."""
    s = summarize(_events(), top_n=2)
    assert len(s.top_activities) == 2
    # a and b both occur twice; the tie is broken by key order.
    assert s.top_activities == [("a", 2), ("b", 2)]

    # The fixture has only two distinct transitions, so use top_n=1 to
    # prove the transition ranking is capped as well.
    s = summarize(_events(), top_n=1)
    assert s.top_activities == [("a", 2)]
    assert s.top_transitions == [(("a", "b"), 2)]

    # Asking for zero entries yields empty rankings.
    s = summarize(_events(), top_n=0)
    assert s.top_activities == []
    assert s.top_transitions == []


def test_summarize_empty_log_is_safe() -> None:
    """An empty log yields zeroed stats instead of raising."""
    s = summarize([])
    assert s.n_events == 0
    assert s.n_cases == 0
    assert s.n_activities == 0
    assert s.span_days == 0.0
    assert s.earliest is None
    assert s.latest is None
    # The statistics helpers raise on empty data, so the fallbacks to 0.0
    # need to be pinned explicitly.
    assert s.mean_case_length == 0.0
    assert s.median_case_length == 0.0
    assert s.top_activities == []
    assert s.top_transitions == []


def test_cli_stats_synthetic_toy() -> None:
    """`pm-bench stats synthetic-toy` exits cleanly and prints valid JSON."""
    result = CliRunner().invoke(main, ["stats", "synthetic-toy", "--top-n", "3"])
    # Include the captured output in the failure message for easier debugging.
    assert result.exit_code == 0, result.output
    payload = json.loads(result.output)
    assert payload["n_cases"] == 200
    assert payload["n_events"] == 965
    assert len(payload["top_activities"]) == 3