erphq · protosphinx · May 1, 2026
diff --git a/README.md b/README.md
@@ -120,6 +120,9 @@ The full loop (`split → prefixes → predict → score`) runs end-to-end on
 `synthetic-toy` today; it's covered by `tests/test_e2e.py` and locks
 the file formats the leaderboard depends on.
 
+**Inspect any log.** `pm-bench stats <name-or-path>` prints n_cases /
+n_events / top activities / top transitions as JSON in one shot.
+
 **Bring your own CSV.** Any path-like argument is loaded as an event
 log directly, no registry plumbing needed:
 

diff --git a/STATUS.md b/STATUS.md
@@ -60,6 +60,13 @@ pm-bench fetch bpi2020 --pin
 
 ## Recently shipped
 
+- **`pm-bench stats <name>`** (`stats-command` branch).
+  - One-shot summary stats for any log: n_cases, n_events,
+    n_activities, time span, mean/median case length, top-N
+    activities and transitions.
+  - Pure CPython; works on synthetic-toy and any CSV path the
+    existing `_load_events` accepts.
+  - 7 new tests; 116 total.
 - **Synthetic-toy bumped to 200 cases — outcome row finally lands**
   (`synthetic-200` branch).
   - `synthetic_log()` default `n_cases` = 200 (was 50). Test partition

diff --git a/pm_bench/cli.py b/pm_bench/cli.py
@@ -63,6 +63,7 @@
     score_remaining_time,
 )
 from pm_bench.split import case_chrono_split
+from pm_bench.stats import summarize
 
 
 def _load_events(name: str) -> list:
@@ -209,6 +210,43 @@ def _print_pin_patch(name: str, digest: str) -> None:
     click.echo(f"    sha256: {digest}")
 
 
+@main.command()
+@click.argument("name")
+@click.option(
+    "--top-n",
+    "top_n",
+    # Reject negative values at parse time; a negative N is not a
+    # meaningful cap for a top-N list.
+    type=click.IntRange(min=0),
+    default=10,
+    show_default=True,
+    help="How many top activities / transitions to include in the output.",
+)
+def stats(name: str, top_n: int) -> None:
+    """Summary stats for a log: cases, events, activities, span, top-N."""
+    # NAME is resolved by `_load_events`; the summary is written to stdout
+    # as one JSON document, with timestamps as ISO-8601 strings (or null).
+    s = summarize(_load_events(name), top_n=top_n)
+    # Count tuples are expanded into objects so each field is labelled.
+    payload = {
+        "n_events": s.n_events,
+        "n_cases": s.n_cases,
+        "n_activities": s.n_activities,
+        "span_days": s.span_days,
+        "earliest": s.earliest.isoformat() if s.earliest else None,
+        "latest": s.latest.isoformat() if s.latest else None,
+        "mean_case_length": s.mean_case_length,
+        "median_case_length": s.median_case_length,
+        "top_activities": [
+            {"activity": a, "count": c} for a, c in s.top_activities
+        ],
+        "top_transitions": [
+            {"a": a, "b": b, "count": c} for (a, b), c in s.top_transitions
+        ],
+    }
+    click.echo(json.dumps(payload, indent=2))
+
+
 @main.command()
 @click.argument("name")
 @click.option("--task", default="next-event", show_default=True)

diff --git a/pm_bench/stats.py b/pm_bench/stats.py
@@ -0,0 +1,89 @@
+"""Quick summary stats for an event log.
+
+Useful when inspecting a new dataset — n_cases, n_events, distinct
+activity count, time span, top-N most-frequent activities and
+transitions, mean / median case length. Pure CPython; runs in the
+same process as the rest of pm-bench so it works on `synthetic-toy`,
+any CSV path, and (eventually) any cached BPI log.
+"""
+from __future__ import annotations
+
+import statistics
+from collections import Counter
+from collections.abc import Iterable
+from dataclasses import dataclass
+from datetime import datetime
+
+from pm_bench.split import Activity, CaseId, Event
+
+
+@dataclass(frozen=True)
+class LogStats:
+    n_events: int  # total events across all cases
+    n_cases: int  # distinct case ids
+    n_activities: int  # distinct activity labels
+    span_days: float  # latest - earliest, in days; 0.0 for an empty log
+    earliest: datetime | None  # smallest timestamp seen; None for an empty log
+    latest: datetime | None  # largest timestamp seen; None for an empty log
+    mean_case_length: float  # mean events per case; 0.0 for an empty log
+    median_case_length: float  # median events per case; 0.0 for an empty log
+    top_activities: list[tuple[Activity, int]]  # (activity, count) pairs
+    top_transitions: list[tuple[tuple[Activity, Activity], int]]  # ((a, b), count)
+
+
+def summarize(events: Iterable[Event], *, top_n: int = 10) -> LogStats:
+    """Build a `LogStats` summary from one pass over `events`.
+
+    Each event unpacks as `(case_id, activity, timestamp)`. Transitions are
+    adjacent activity pairs within a case after a stable sort by timestamp.
+    Top-N lists are ordered by count descending, then by key; an empty log
+    yields zero counts, a zero span and `None` for both time bounds.
+    """
+    traces: dict[CaseId, list[tuple[Activity, datetime]]] = {}
+    per_activity: Counter[Activity] = Counter()
+    first_ts: datetime | None = None
+    last_ts: datetime | None = None
+
+    # Single pass: group rows by case while tracking counts and time bounds.
+    for case_id, activity, ts in events:
+        traces.setdefault(case_id, []).append((activity, ts))
+        per_activity[activity] += 1
+        if first_ts is None or ts < first_ts:
+            first_ts = ts
+        if last_ts is None or ts > last_ts:
+            last_ts = ts
+
+    per_transition: Counter[tuple[Activity, Activity]] = Counter()
+    lengths: list[int] = []
+    for trace in traces.values():
+        # list.sort is stable, so same-timestamp events keep input order.
+        trace.sort(key=lambda row: row[1])
+        lengths.append(len(trace))
+        labels = [label for label, _ in trace]
+        per_transition.update(zip(labels, labels[1:], strict=False))
+
+    if first_ts is None or last_ts is None:
+        span_days = 0.0
+    else:
+        span_days = (last_ts - first_ts).total_seconds() / 86400.0
+
+    return LogStats(
+        n_events=sum(lengths),
+        n_cases=len(traces),
+        n_activities=len(per_activity),
+        span_days=span_days,
+        earliest=first_ts,
+        latest=last_ts,
+        mean_case_length=statistics.fmean(lengths) if lengths else 0.0,
+        median_case_length=statistics.median(lengths) if lengths else 0.0,
+        top_activities=_top_n_sorted(per_activity, top_n),
+        top_transitions=_top_n_sorted(per_transition, top_n),
+    )
+
+
+def _top_n_sorted(counter: Counter, n: int) -> list:
+    """Return at most `n` items, by count descending then by key; [] if n <= 0."""
+    ranked = sorted(counter.items(), key=lambda kv: (-kv[1], kv[0]))
+    # Clamp the stop: a negative `n` would otherwise slice from the tail
+    # (dropping the |n| least-frequent items) instead of capping the list.
+    return ranked[: max(n, 0)]
diff --git a/tests/test_stats.py b/tests/test_stats.py
@@ -0,0 +1,72 @@
+"""Tests for the summary-stats helper and CLI."""
+from __future__ import annotations
+
+import datetime as dt
+import json
+
+from click.testing import CliRunner
+
+from pm_bench.cli import main
+from pm_bench.stats import summarize
+
+
+def _events() -> list[tuple[str, str, dt.datetime]]:
+    """Fixture log: cases c1/c2/c3 with 3/2/1 events, started a day apart."""
+    start = dt.datetime(2024, 1, 1)
+    c1 = [("c1", "a", 0), ("c1", "b", 1), ("c1", "c", 2)]
+    c2 = [("c2", "a", 24), ("c2", "b", 25)]
+    c3 = [("c3", "x", 48)]
+    # Third field is the offset from `start` in hours.
+    return [
+        (case, act, start + dt.timedelta(hours=h)) for case, act, h in c1 + c2 + c3
+    ]
+
+
+def test_summarize_basic_counts() -> None:
+    """Events, cases and distinct activities are each counted once."""
+    summary = summarize(_events())
+    assert (summary.n_events, summary.n_cases) == (6, 3)
+    assert summary.n_activities == len({"a", "b", "c", "x"})
+
+
+def test_summarize_case_lengths() -> None:
+    """Case lengths 3, 2, 1 give a mean of 2.0 and a median of 2."""
+    summary = summarize(_events())
+    assert (summary.mean_case_length, summary.median_case_length) == (2.0, 2)
+
+
+def test_summarize_top_activities_sorted_by_count_desc() -> None:
+    """Counts never increase down the list; the a/b tie at 2 puts `a` first."""
+    top = summarize(_events(), top_n=10).top_activities
+    assert [c for _, c in top] == sorted((c for _, c in top), reverse=True)
+    assert top[0] == ("a", 2)
+
+
+def test_summarize_top_transitions() -> None:
+    """a→b is seen in both c1 and c2; b→c only in c1."""
+    by_pair = dict(summarize(_events()).top_transitions)
+    assert by_pair[("a", "b")] == 2
+    assert by_pair[("b", "c")] == 1
+
+
+def test_summarize_top_n_caps() -> None:
+    # Four distinct activities in the fixture, so top_n=2 must truncate.
+    assert len(summarize(_events(), top_n=2).top_activities) == 2
+
+
+def test_summarize_empty_log_is_safe() -> None:
+    """An empty iterable yields zero counts, a zero span and no earliest."""
+    empty = summarize([])
+    assert (empty.n_events, empty.n_cases) == (0, 0)
+    assert empty.span_days == 0.0
+    assert empty.earliest is None
+
+
+def test_cli_stats_synthetic_toy() -> None:
+    """`stats synthetic-toy --top-n 3` exits 0 and prints parseable JSON."""
+    result = CliRunner().invoke(main, ["stats", "synthetic-toy", "--top-n", "3"])
+    assert result.exit_code == 0, result.output
+    payload = json.loads(result.output)
+    # Snapshot values for the synthetic-toy log.
+    assert (payload["n_cases"], payload["n_events"]) == (200, 965)
+    assert len(payload["top_activities"]) == 3