diff --git a/STATUS.md b/STATUS.md index f502ae2..e3daad4 100644 --- a/STATUS.md +++ b/STATUS.md @@ -60,6 +60,13 @@ pm-bench fetch bpi2020 --pin ## Recently shipped +- **`pm-bench compare board_a.json board_b.json`** (`compare-command` + branch). + - Diff two leaderboard JSON files. Per-model score deltas as JSON; + models unique to one side surfaced separately. + - Tasks/datasets must match (errors loudly otherwise) — prevents + accidental cross-task comparisons. + - 6 new tests; 123 total, ruff clean. - **Floor baselines for time + conformance** (`floor-baselines` branch). - `zero-time` for remaining-time: predicts 0 days for every prefix. MAE 2.741 on synthetic-toy - exactly twice mean-ref's 1.348, as diff --git a/pm_bench/cli.py b/pm_bench/cli.py index b56af74..e845f1e 100644 --- a/pm_bench/cli.py +++ b/pm_bench/cli.py @@ -40,6 +40,7 @@ from pm_bench.leaderboard import ( all_standings_markdown, board_to_markdown, + compare_boards, load_board, standings, verify, @@ -777,5 +778,24 @@ def leaderboard( ) +@main.command() +@click.argument("board_a", type=click.Path(exists=True, dir_okay=False)) +@click.argument("board_b", type=click.Path(exists=True, dir_okay=False)) +def compare(board_a: str, board_b: str) -> None: + """Diff two leaderboard JSON files. Per-model score deltas as JSON. + + Use case: snapshot today's standings, change something, run again, + diff. Models that exist on only one side are surfaced separately. 
def _is_number(value: object) -> bool:
    """Return True for real ints/floats, excluding bool (an int subclass)."""
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def compare_boards(a: Board, b: Board) -> dict:
    """Return per-model score deltas between two leaderboard snapshots.

    Both boards must be on the same (task, dataset). Entries are matched by
    model name; if a board lists the same model name more than once, the
    last such entry is the one that gets compared.

    Args:
        a: First board; its values are reported under ``"a"``.
        b: Second board; its values are reported under ``"b"``.

    Returns:
        A dict with the keys ``task``, ``dataset``, ``compared``,
        ``only_in_a`` and ``only_in_b``. ``compared`` is a list, sorted by
        model name, of ``{"model": name, "scores": {key: {...}}}`` where each
        per-key dict holds ``"a"`` and ``"b"`` (``None`` when the key is
        missing on that side) plus ``"delta"`` (``b - a``) only when both
        values are numbers. ``only_in_a`` / ``only_in_b`` are sorted lists of
        model names present on just one side.

    Raises:
        ValueError: If the boards differ in task or dataset.
    """
    if a.task != b.task or a.dataset != b.dataset:
        raise ValueError(
            f"can't compare different boards: {a.task}/{a.dataset} vs "
            f"{b.task}/{b.dataset}"
        )

    a_by_model = {e.model: e for e in a.entries}
    b_by_model = {e.model: e for e in b.entries}

    compared: list[dict] = []
    for model in sorted(a_by_model.keys() & b_by_model.keys()):
        score_a = a_by_model[model].score
        score_b = b_by_model[model].score
        per_key: dict[str, dict] = {}
        for key in sorted(set(score_a) | set(score_b)):
            va, vb = score_a.get(key), score_b.get(key)
            entry: dict = {"a": va, "b": vb}
            # Booleans pass isinstance(..., int); subtracting flags would
            # produce a misleading numeric "delta", so they are skipped.
            if _is_number(va) and _is_number(vb):
                entry["delta"] = vb - va
            per_key[key] = entry
        compared.append({"model": model, "scores": per_key})

    return {
        "task": a.task,
        "dataset": a.dataset,
        "compared": compared,
        "only_in_a": sorted(a_by_model.keys() - b_by_model.keys()),
        "only_in_b": sorted(b_by_model.keys() - a_by_model.keys()),
    }
diff --git a/tests/test_compare.py b/tests/test_compare.py new file mode 100644 index 0000000..b53d549 --- /dev/null +++ b/tests/test_compare.py @@ -0,0 +1,98 @@ +"""Tests for `pm-bench compare` and `compare_boards`.""" +from __future__ import annotations + +import json +from pathlib import Path + +import pytest +from click.testing import CliRunner + +from pm_bench.cli import main +from pm_bench.leaderboard import compare_boards, load_board + +REPO_ROOT = Path(__file__).resolve().parent.parent +NEXT_EVENT_BOARD = REPO_ROOT / "leaderboard" / "next-event" / "synthetic-toy.json" + + +def test_compare_identical_boards_has_zero_deltas() -> None: + a = load_board(NEXT_EVENT_BOARD) + b = load_board(NEXT_EVENT_BOARD) + result = compare_boards(a, b) + assert result["only_in_a"] == [] + assert result["only_in_b"] == [] + for entry in result["compared"]: + for v in entry["scores"].values(): + if "delta" in v: + assert v["delta"] == 0 + + +def test_compare_detects_score_change(tmp_path: Path) -> None: + raw = json.loads(NEXT_EVENT_BOARD.read_text()) + # Bump markov-ref's top1 by 0.05 in the b copy + b_path = tmp_path / "b.json" + raw["entries"][0]["score"]["top1"] += 0.05 + b_path.write_text(json.dumps(raw)) + + a = load_board(NEXT_EVENT_BOARD) + b = load_board(b_path) + result = compare_boards(a, b) + + markov_entry = next(e for e in result["compared"] if e["model"] == "markov-ref") + assert abs(markov_entry["scores"]["top1"]["delta"] - 0.05) < 1e-9 + + +def test_compare_surfaces_only_in_b(tmp_path: Path) -> None: + raw = json.loads(NEXT_EVENT_BOARD.read_text()) + raw["entries"].append({ + "model": "newcomer", + "version": "0.1.0", + "predictions_path": "x", + "score": {"top1": 0.5, "top3": 0.7, "n": 1}, + }) + b_path = tmp_path / "b.json" + b_path.write_text(json.dumps(raw)) + + a = load_board(NEXT_EVENT_BOARD) + b = load_board(b_path) + result = compare_boards(a, b) + assert "newcomer" in result["only_in_b"] + assert "newcomer" not in result["only_in_a"] + + +def 
test_compare_rejects_different_tasks(tmp_path: Path) -> None: + a = load_board(NEXT_EVENT_BOARD) + other = load_board(REPO_ROOT / "leaderboard" / "outcome" / "synthetic-toy.json") + with pytest.raises(ValueError, match="can't compare"): + compare_boards(a, other) + + +def test_cli_compare_smoke(tmp_path: Path) -> None: + raw = json.loads(NEXT_EVENT_BOARD.read_text()) + raw["entries"][0]["score"]["top1"] += 0.01 + b_path = tmp_path / "b.json" + b_path.write_text(json.dumps(raw)) + + runner = CliRunner() + r = runner.invoke(main, ["compare", str(NEXT_EVENT_BOARD), str(b_path)]) + assert r.exit_code == 0, r.output + out = json.loads(r.output) + assert out["task"] == "next-event" + assert any( + abs(e["scores"]["top1"].get("delta", 0) - 0.01) < 1e-9 + for e in out["compared"] + if e["model"] == "markov-ref" + ) + + +def test_cli_compare_different_tasks_exits_nonzero() -> None: + runner = CliRunner() + r = runner.invoke( + main, + [ + "compare", + str(NEXT_EVENT_BOARD), + str(REPO_ROOT / "leaderboard" / "outcome" / "synthetic-toy.json"), + ], + ) + assert r.exit_code == 1 + assert "can't compare" in r.output