Skip to content

Commit 70fa2fa

Browse files
authored
feat(stats): add std_dev_case_length to LogStats and CLI output
Population standard deviation of case lengths gives a direct measure of process variability: a log where every case has the same length scores 0, and the score grows as case lengths spread further from the mean. The value is included in the JSON emitted by 'pm-bench stats' alongside mean, median, min and max, rounding out the case-length summary statistics. Uses statistics.pstdev (population std dev), consistent with the fmean and median calls already in summarize(). Empty logs return 0.0 via an explicit guard; single-case logs return 0.0 because the population std dev of a single value is 0.
1 parent 0500f0f commit 70fa2fa

3 files changed

Lines changed: 53 additions & 3 deletions

File tree

pm_bench/cli.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -392,6 +392,7 @@ def stats(name: str, top_n: int) -> None:
392392
"latest": s.latest.isoformat() if s.latest else None,
393393
"mean_case_length": s.mean_case_length,
394394
"median_case_length": s.median_case_length,
395+
"std_dev_case_length": s.std_dev_case_length,
395396
"min_case_length": s.min_case_length,
396397
"max_case_length": s.max_case_length,
397398
"top_activities": [

pm_bench/stats.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22
33
Useful when inspecting a new dataset - n_cases, n_events, distinct
44
activity count, time span, top-N most-frequent activities and
5-
transitions, mean / median / min / max case length. Pure CPython; runs
6-
in the same process as the rest of pm-bench so it works on
7-
`synthetic-toy`, any CSV path, and (eventually) any cached BPI log.
5+
transitions, mean / median / min / max / std-dev case length. Pure
6+
CPython; runs in the same process as the rest of pm-bench so it works
7+
on `synthetic-toy`, any CSV path, and (eventually) any cached BPI log.
88
"""
99
from __future__ import annotations
1010

@@ -27,6 +27,7 @@ class LogStats:
2727
latest: datetime | None
2828
mean_case_length: float
2929
median_case_length: float
30+
std_dev_case_length: float
3031
min_case_length: int
3132
max_case_length: int
3233
top_activities: list[tuple[Activity, int]]
@@ -68,6 +69,7 @@ def summarize(events: Iterable[Event], *, top_n: int = 10) -> LogStats:
6869
n_cases = len(by_case)
6970
mean_len = statistics.fmean(case_lengths) if case_lengths else 0.0
7071
median_len = statistics.median(case_lengths) if case_lengths else 0.0
72+
std_dev_len = statistics.pstdev(case_lengths) if case_lengths else 0.0
7173
min_len = min(case_lengths) if case_lengths else 0
7274
max_len = max(case_lengths) if case_lengths else 0
7375

@@ -80,6 +82,7 @@ def summarize(events: Iterable[Event], *, top_n: int = 10) -> LogStats:
8082
latest=latest,
8183
mean_case_length=mean_len,
8284
median_case_length=median_len,
85+
std_dev_case_length=std_dev_len,
8386
min_case_length=min_len,
8487
max_case_length=max_len,
8588
top_activities=_top_n_sorted(activity_counts, top_n),

tests/test_stats.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,3 +115,49 @@ def test_cli_stats_synthetic_toy() -> None:
115115
assert len(data["top_activities"]) == 3
116116
assert "min_case_length" in data
117117
assert "max_case_length" in data
118+
119+
120+
def test_summarize_std_dev_empty_log() -> None:
121+
s = summarize([])
122+
assert s.std_dev_case_length == 0.0
123+
124+
125+
def test_summarize_std_dev_single_case() -> None:
126+
ts = dt.datetime(2024, 6, 1)
127+
s = summarize([("c1", "a", ts), ("c1", "b", ts + dt.timedelta(hours=1))])
128+
# Only one case; population std dev of [2] is 0.
129+
assert s.std_dev_case_length == 0.0
130+
131+
132+
def test_summarize_std_dev_uniform_cases() -> None:
133+
# Three cases, each with exactly 2 events: std dev must be 0.
134+
base = dt.datetime(2024, 1, 1)
135+
events = [
136+
("c1", "a", base),
137+
("c1", "b", base + dt.timedelta(hours=1)),
138+
("c2", "a", base + dt.timedelta(days=1)),
139+
("c2", "b", base + dt.timedelta(days=1, hours=1)),
140+
("c3", "a", base + dt.timedelta(days=2)),
141+
("c3", "b", base + dt.timedelta(days=2, hours=1)),
142+
]
143+
s = summarize(events)
144+
assert s.std_dev_case_length == 0.0
145+
146+
147+
def test_summarize_std_dev_case_length_value() -> None:
148+
import statistics
149+
150+
# _events() gives case lengths [3, 2, 1] (order stable inside cases).
151+
s = summarize(_events())
152+
expected = statistics.pstdev([3, 2, 1])
153+
assert abs(s.std_dev_case_length - expected) < 1e-9
154+
155+
156+
def test_cli_stats_std_dev_present() -> None:
157+
runner = CliRunner()
158+
r = runner.invoke(main, ["stats", "synthetic-toy"])
159+
assert r.exit_code == 0, r.output
160+
data = json.loads(r.output)
161+
assert "std_dev_case_length" in data
162+
assert isinstance(data["std_dev_case_length"], float)
163+
assert data["std_dev_case_length"] >= 0.0

0 commit comments

Comments
 (0)