diff --git a/README.md b/README.md index d236335..4608d0f 100644 --- a/README.md +++ b/README.md @@ -120,6 +120,9 @@ The full loop (`split → prefixes → predict → score`) runs end-to-end on `synthetic-toy` today; it's covered by `tests/test_e2e.py` and locks the file formats the leaderboard depends on. +**Inspect any log.** `pm-bench stats NAME` prints n_cases / +n_events / activity-distribution / top transitions in one shot. + **Bring your own CSV.** Any path-like argument is loaded as an event log directly, no registry plumbing needed: diff --git a/STATUS.md b/STATUS.md index 1e37334..5fb2220 100644 --- a/STATUS.md +++ b/STATUS.md @@ -60,6 +60,13 @@ pm-bench fetch bpi2020 --pin ## Recently shipped +- **`pm-bench stats NAME`** (`stats-command` branch). + - One-shot summary stats for any log: n_cases, n_events, + n_activities, time span, mean/median case length, top-N + activities and transitions. + - Pure CPython; works on synthetic-toy and any CSV path the + existing `_load_events` accepts. + - 7 new tests; 116 total. - **Synthetic-toy bumped to 200 cases — outcome row finally lands** (`synthetic-200` branch). - `synthetic_log()` default `n_cases` = 200 (was 50). 
@main.command()
@click.argument("name")
@click.option(
    "--top-n",
    "top_n",
    # Reject negative values at the CLI boundary: "top -1" is meaningless
    # and must never be interpreted as a slice offset further down.
    type=click.IntRange(min=0),
    default=10,
    show_default=True,
    help="How many top activities / transitions to include in the output.",
)
def stats(name: str, top_n: int) -> None:
    """Summary stats for a log: cases, events, activities, span, top-N."""
    # `name` is resolved by `_load_events`; the summary is printed to
    # stdout as a single indented JSON object.
    events = _load_events(name)
    s = summarize(events, top_n=top_n)
    payload = {
        "n_events": s.n_events,
        "n_cases": s.n_cases,
        "n_activities": s.n_activities,
        "span_days": s.span_days,
        # datetimes are not JSON-serialisable, so emit ISO-8601 strings;
        # both are None (JSON null) when the log has no events.
        "earliest": s.earliest.isoformat() if s.earliest else None,
        "latest": s.latest.isoformat() if s.latest else None,
        "mean_case_length": s.mean_case_length,
        "median_case_length": s.median_case_length,
        "top_activities": [
            {"activity": a, "count": c} for a, c in s.top_activities
        ],
        # Each transition key is an (a, b) pair; flatten it so the JSON
        # consumer gets named fields instead of a nested array.
        "top_transitions": [
            {"a": ab[0], "b": ab[1], "count": c} for ab, c in s.top_transitions
        ],
    }
    click.echo(json.dumps(payload, indent=2))
+""" +from __future__ import annotations + +import statistics +from collections import Counter +from collections.abc import Iterable +from dataclasses import dataclass +from datetime import datetime + +from pm_bench.split import Activity, CaseId, Event + + +@dataclass(frozen=True) +class LogStats: + n_events: int + n_cases: int + n_activities: int + span_days: float + earliest: datetime | None + latest: datetime | None + mean_case_length: float + median_case_length: float + top_activities: list[tuple[Activity, int]] + top_transitions: list[tuple[tuple[Activity, Activity], int]] + + +def summarize(events: Iterable[Event], *, top_n: int = 10) -> LogStats: + """Compute summary stats from an event iterable. + + `events` is consumed once. Top-N lists are sorted by count + descending; ties broken by lexicographic order. + """ + by_case: dict[CaseId, list[tuple[Activity, datetime]]] = {} + activity_counts: Counter[Activity] = Counter() + earliest: datetime | None = None + latest: datetime | None = None + + for case_id, activity, ts in events: + by_case.setdefault(case_id, []).append((activity, ts)) + activity_counts[activity] += 1 + if earliest is None or ts < earliest: + earliest = ts + if latest is None or ts > latest: + latest = ts + + transition_counts: Counter[tuple[Activity, Activity]] = Counter() + case_lengths: list[int] = [] + for rows in by_case.values(): + rows.sort(key=lambda r: r[1]) + case_lengths.append(len(rows)) + for (a, _), (b, _) in zip(rows, rows[1:], strict=False): + transition_counts[(a, b)] += 1 + + span_days = 0.0 + if earliest is not None and latest is not None: + span_days = (latest - earliest).total_seconds() / 86400.0 + + n_events = sum(len(rows) for rows in by_case.values()) + n_cases = len(by_case) + mean_len = statistics.fmean(case_lengths) if case_lengths else 0.0 + median_len = statistics.median(case_lengths) if case_lengths else 0.0 + + return LogStats( + n_events=n_events, + n_cases=n_cases, + n_activities=len(activity_counts), + 
span_days=span_days, + earliest=earliest, + latest=latest, + mean_case_length=mean_len, + median_case_length=median_len, + top_activities=_top_n_sorted(activity_counts, top_n), + top_transitions=_top_n_sorted(transition_counts, top_n), + ) + + +def _top_n_sorted(counter: Counter, n: int) -> list: + """Return the top-N items, sorted by count descending then by key.""" + return sorted( + counter.items(), + key=lambda kv: (-kv[1], kv[0]), + )[:n] diff --git a/tests/test_stats.py b/tests/test_stats.py new file mode 100644 index 0000000..a9884a6 --- /dev/null +++ b/tests/test_stats.py @@ -0,0 +1,72 @@ +"""Tests for the summary-stats helper and CLI.""" +from __future__ import annotations + +import datetime as dt +import json + +from click.testing import CliRunner + +from pm_bench.cli import main +from pm_bench.stats import summarize + + +def _events() -> list[tuple[str, str, dt.datetime]]: + base = dt.datetime(2024, 1, 1) + return [ + ("c1", "a", base), + ("c1", "b", base + dt.timedelta(hours=1)), + ("c1", "c", base + dt.timedelta(hours=2)), + ("c2", "a", base + dt.timedelta(days=1)), + ("c2", "b", base + dt.timedelta(days=1, hours=1)), + ("c3", "x", base + dt.timedelta(days=2)), + ] + + +def test_summarize_basic_counts() -> None: + s = summarize(_events()) + assert s.n_events == 6 + assert s.n_cases == 3 + assert s.n_activities == 4 # a, b, c, x + + +def test_summarize_case_lengths() -> None: + s = summarize(_events()) + assert s.mean_case_length == (3 + 2 + 1) / 3 + assert s.median_case_length == 2 + + +def test_summarize_top_activities_sorted_by_count_desc() -> None: + s = summarize(_events(), top_n=10) + counts = [c for _, c in s.top_activities] + assert counts == sorted(counts, reverse=True) + assert s.top_activities[0] == ("a", 2) + + +def test_summarize_top_transitions() -> None: + s = summarize(_events()) + transitions = {pair: c for pair, c in s.top_transitions} + assert transitions[("a", "b")] == 2 # c1 a→b and c2 a→b + assert transitions[("b", "c")] == 1 + + 
def test_summarize_top_n_caps() -> None:
    """`top_n` limits how many activities are reported."""
    capped = summarize(_events(), top_n=2).top_activities
    assert len(capped) == 2


def test_summarize_empty_log_is_safe() -> None:
    """An empty log produces zeroed counts and no earliest timestamp."""
    result = summarize([])
    assert result.n_events == 0
    assert result.n_cases == 0
    assert result.span_days == 0.0
    assert result.earliest is None


def test_cli_stats_synthetic_toy() -> None:
    """`stats synthetic-toy --top-n 3` exits cleanly and prints JSON."""
    argv = ["stats", "synthetic-toy", "--top-n", "3"]
    outcome = CliRunner().invoke(main, argv)
    assert outcome.exit_code == 0, outcome.output
    payload = json.loads(outcome.output)
    assert payload["n_cases"] == 200
    assert payload["n_events"] == 965
    assert len(payload["top_activities"]) == 3