Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions STATUS.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,13 @@ pm-bench fetch bpi2020 --pin

## Recently shipped

- **`pm-bench compare board_a.json board_b.json`** (`compare-command`
  branch).
  - Diff two leaderboard JSON files. Per-model score deltas are emitted as
    JSON; models unique to one side are surfaced separately.
  - Tasks/datasets must match (errors loudly otherwise) — prevents
    accidental cross-task comparisons.
  - 6 new tests; 123 total, ruff clean.
- **Floor baselines for time + conformance** (`floor-baselines` branch).
- `zero-time` for remaining-time: predicts 0 days for every prefix.
MAE 2.741 on synthetic-toy - exactly twice mean-ref's 1.348, as
Expand Down
20 changes: 20 additions & 0 deletions pm_bench/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
from pm_bench.leaderboard import (
all_standings_markdown,
board_to_markdown,
compare_boards,
load_board,
standings,
verify,
Expand Down Expand Up @@ -777,5 +778,24 @@ def leaderboard(
)


@main.command()
@click.argument("board_a", type=click.Path(exists=True, dir_okay=False))
@click.argument("board_b", type=click.Path(exists=True, dir_okay=False))
def compare(board_a: str, board_b: str) -> None:
    """Diff two leaderboard JSON files. Per-model score deltas as JSON.

    Use case: snapshot today's standings, change something, run again,
    diff. Models that exist on only one side are surfaced separately.
    """
    # Load both snapshots up front, in argument order.
    before = load_board(board_a)
    after = load_board(board_b)
    try:
        report = compare_boards(before, after)
    except ValueError as exc:
        # Report the comparison failure on stderr and signal it through a
        # non-zero exit status instead of a traceback.
        click.echo(str(exc), err=True)
        sys.exit(1)
    else:
        # Success path: the diff goes to stdout as indented JSON.
        click.echo(json.dumps(report, indent=2))


if __name__ == "__main__":
main()
40 changes: 40 additions & 0 deletions pm_bench/leaderboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,46 @@ def verify(board: Board, repo_root: str | Path = ".", *, tol: float = 1e-9) -> l
return drifts


def _is_numeric_score(value: object) -> bool:
    """Return True for int/float score values, excluding booleans.

    `bool` is a subclass of `int`, so a plain `isinstance(value, (int, float))`
    check would treat flag-like fields as numbers and produce a meaningless
    arithmetic delta for them.
    """
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def compare_boards(a: Board, b: Board) -> dict:
    """Return per-model score deltas between two leaderboard snapshots.

    Both boards must be on the same (task, dataset). Models are matched by
    name; entries unique to one side are surfaced separately.

    Args:
        a: The baseline ("before") board.
        b: The board to compare against the baseline ("after").

    Returns:
        A dict with the keys:

        - ``"task"`` / ``"dataset"``: copied from ``a``.
        - ``"compared"``: one ``{"model": ..., "scores": ...}`` item per model
          present on both boards, sorted by model name. ``"scores"`` maps
          every score key found on either side (sorted) to
          ``{"a": value_or_None, "b": value_or_None}``, plus
          ``"delta": b - a`` when both values are non-boolean numbers.
        - ``"only_in_a"`` / ``"only_in_b"``: sorted model names that appear on
          just one board.

    Raises:
        ValueError: If the boards differ in task or dataset.
    """
    if a.task != b.task or a.dataset != b.dataset:
        raise ValueError(
            f"can't compare different boards: {a.task}/{a.dataset} vs "
            f"{b.task}/{b.dataset}"
        )

    # Index entries by model name; if a board lists a model more than once,
    # the last occurrence wins.
    a_by_model = {e.model: e for e in a.entries}
    b_by_model = {e.model: e for e in b.entries}
    # dict key views support set algebra directly.
    shared = sorted(a_by_model.keys() & b_by_model.keys())
    only_a = sorted(a_by_model.keys() - b_by_model.keys())
    only_b = sorted(b_by_model.keys() - a_by_model.keys())

    deltas: list[dict] = []
    for model in shared:
        ea, eb = a_by_model[model], b_by_model[model]
        per_key: dict[str, dict] = {}
        # Union of keys so a metric added or removed between snapshots still
        # shows up (with None on the side that lacks it).
        for k in sorted(set(ea.score) | set(eb.score)):
            va = ea.score.get(k)
            vb = eb.score.get(k)
            entry: dict = {"a": va, "b": vb}
            # Only genuine numbers get a delta; None, strings and booleans
            # are reported side by side without arithmetic.
            if _is_numeric_score(va) and _is_numeric_score(vb):
                entry["delta"] = vb - va
            per_key[k] = entry
        deltas.append({"model": model, "scores": per_key})

    return {
        "task": a.task,
        "dataset": a.dataset,
        "compared": deltas,
        "only_in_a": only_a,
        "only_in_b": only_b,
    }


def board_to_markdown(board: Board) -> str:
"""Render a single board as a fenced markdown table.

Expand Down
98 changes: 98 additions & 0 deletions tests/test_compare.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
"""Tests for `pm-bench compare` and `compare_boards`."""
from __future__ import annotations

import json
from pathlib import Path

import pytest
from click.testing import CliRunner

from pm_bench.cli import main
from pm_bench.leaderboard import compare_boards, load_board

# Two levels above this file.
REPO_ROOT = Path(__file__).resolve().parent.parent
# Leaderboard JSON used as the baseline fixture throughout these tests.
NEXT_EVENT_BOARD = REPO_ROOT.joinpath("leaderboard", "next-event", "synthetic-toy.json")


def test_compare_identical_boards_has_zero_deltas() -> None:
    """A board compared with itself has no one-sided models and only zero deltas."""
    snapshot = load_board(NEXT_EVENT_BOARD)
    same_snapshot = load_board(NEXT_EVENT_BOARD)
    result = compare_boards(snapshot, same_snapshot)

    assert result["only_in_a"] == []
    assert result["only_in_b"] == []
    # Collect every delta that was computed; score keys without a delta
    # are deliberately ignored.
    observed = [
        cell["delta"]
        for row in result["compared"]
        for cell in row["scores"].values()
        if "delta" in cell
    ]
    assert all(delta == 0 for delta in observed)


def test_compare_detects_score_change(tmp_path: Path) -> None:
    """A bumped `top1` for markov-ref in board b shows up as that model's delta."""
    raw = json.loads(NEXT_EVENT_BOARD.read_text(encoding="utf-8"))
    # Bump markov-ref's top1 by 0.05 in the b copy. Select the entry by model
    # name so the test does not depend on the order of entries in the JSON.
    target = next(e for e in raw["entries"] if e["model"] == "markov-ref")
    target["score"]["top1"] += 0.05
    b_path = tmp_path / "b.json"
    b_path.write_text(json.dumps(raw), encoding="utf-8")

    a = load_board(NEXT_EVENT_BOARD)
    b = load_board(b_path)
    result = compare_boards(a, b)

    markov_entry = next(e for e in result["compared"] if e["model"] == "markov-ref")
    # Tolerance absorbs floating-point error from the add and the subtraction.
    assert abs(markov_entry["scores"]["top1"]["delta"] - 0.05) < 1e-9


def test_compare_surfaces_only_in_b(tmp_path: Path) -> None:
    """A model added only to board b is listed under `only_in_b`, not `only_in_a`."""
    newcomer = {
        "model": "newcomer",
        "version": "0.1.0",
        "predictions_path": "x",
        "score": {"top1": 0.5, "top3": 0.7, "n": 1},
    }
    payload = json.loads(NEXT_EVENT_BOARD.read_text())
    payload["entries"].append(newcomer)
    modified = tmp_path / "b.json"
    modified.write_text(json.dumps(payload))

    baseline = load_board(NEXT_EVENT_BOARD)
    extended = load_board(modified)
    result = compare_boards(baseline, extended)

    assert "newcomer" in result["only_in_b"]
    assert "newcomer" not in result["only_in_a"]


def test_compare_rejects_different_tasks(tmp_path: Path) -> None:
    """Comparing a next-event board with an outcome board raises ValueError."""
    outcome_path = REPO_ROOT / "leaderboard" / "outcome" / "synthetic-toy.json"
    next_event = load_board(NEXT_EVENT_BOARD)
    outcome = load_board(outcome_path)
    with pytest.raises(ValueError, match="can't compare"):
        compare_boards(next_event, outcome)


def test_cli_compare_smoke(tmp_path: Path) -> None:
    """`pm-bench compare` exits 0 and prints JSON containing the expected delta."""
    raw = json.loads(NEXT_EVENT_BOARD.read_text(encoding="utf-8"))
    # Bump markov-ref by name rather than by position so a reordering of the
    # board's entries cannot silently change which model is modified.
    target = next(e for e in raw["entries"] if e["model"] == "markov-ref")
    target["score"]["top1"] += 0.01
    b_path = tmp_path / "b.json"
    b_path.write_text(json.dumps(raw), encoding="utf-8")

    runner = CliRunner()
    r = runner.invoke(main, ["compare", str(NEXT_EVENT_BOARD), str(b_path)])
    assert r.exit_code == 0, r.output
    out = json.loads(r.output)
    assert out["task"] == "next-event"

    markov_entry = next(e for e in out["compared"] if e["model"] == "markov-ref")
    # Index "delta" directly: a missing delta should fail loudly instead of
    # being masked by a default value.
    assert abs(markov_entry["scores"]["top1"]["delta"] - 0.01) < 1e-9


def test_cli_compare_different_tasks_exits_nonzero() -> None:
    """The CLI exits with status 1 and prints the error for mismatched tasks."""
    outcome_board = REPO_ROOT / "leaderboard" / "outcome" / "synthetic-toy.json"
    argv = ["compare", str(NEXT_EVENT_BOARD), str(outcome_board)]

    outcome = CliRunner().invoke(main, argv)

    assert outcome.exit_code == 1
    assert "can't compare" in outcome.output