4 changes: 4 additions & 0 deletions tests/test_eval_cli.py
@@ -69,6 +69,8 @@ def __init__(self, api_key=None, base_url=None):
save_dataset=False,
save_to_hf_hub=False,
hf_hub_dataset_name="",
group_by_task=False,
grouping_keys=None,
)

sa = captured["sampling_args"]
@@ -116,6 +118,8 @@ def __init__(self, api_key=None, base_url=None):
save_dataset=False,
save_to_hf_hub=False,
hf_hub_dataset_name="",
group_by_task=False,
grouping_keys=None,
)

sa = captured["sampling_args"]
85 changes: 84 additions & 1 deletion verifiers/scripts/eval.py
@@ -13,6 +13,7 @@
import verifiers as vf
from verifiers.utils.client_utils import setup_client
from verifiers.utils.message_utils import messages_to_printable, sanitize_tool_calls
from verifiers.utils.report_utils import compute_grouped_summary

# Setup logger for eval script using verifiers logging format
logger = logging.getLogger("verifiers.scripts.eval")
@@ -36,6 +37,9 @@ def eval_environment(
save_dataset: bool,
save_to_hf_hub: bool,
hf_hub_dataset_name: str,
group_by_task: bool,
grouping_keys: list[str] | None,
grouped_html_report: bool = False,
):
logger.setLevel("DEBUG" if verbose else "INFO")
try:
@@ -144,6 +148,36 @@
out = f"r{i + 1}: {trials}"
logger.info(out)

# Group rewards by task/subset if available and requested
if group_by_task and results.task and len(set(results.task)) > 1:
logger.info("--- Grouped Rewards by Task ---")
grouped_summary = compute_grouped_summary(results)
group_key = "grouped_by_task"
if group_key in grouped_summary:
for task, summary in grouped_summary[group_key].items():
reward_stats = summary["reward"]
logger.info(f"Task '{task}': avg - {reward_stats['mean']:.3f}, std - {reward_stats['std']:.3f}, count - {reward_stats['n']}")

# Group metrics by task as well
for metric_name, metric_stats in summary["metrics"].items():
logger.info(f" {metric_name}: avg - {metric_stats['mean']:.3f}, std - {metric_stats['std']:.3f}")

# Enhanced grouping by multiple keys if specified
if grouping_keys:
logger.info("--- Grouped Rewards by Specified Keys ---")
grouped_summary = compute_grouped_summary(results, grouping_keys)
for grouping_key in grouping_keys:
group_key = f"grouped_by_{grouping_key}"
if group_key in grouped_summary:
logger.info(f"--- Grouped by {grouping_key} ---")
for group_value, summary in grouped_summary[group_key].items():
reward_stats = summary["reward"]
logger.info(f"{grouping_key} '{group_value}': avg - {reward_stats['mean']:.3f}, std - {reward_stats['std']:.3f}, count - {reward_stats['n']}")

# Group metrics by the same key
for metric_name, metric_stats in summary["metrics"].items():
logger.info(f" {metric_name}: avg - {metric_stats['mean']:.3f}, std - {metric_stats['std']:.3f}")

if save_dataset or save_to_hf_hub:
ids = [i // rollouts_per_example for i in range(n * rollouts_per_example)]
rewards = results.reward
@@ -193,7 +227,32 @@ def eval_environment(
with open(results_path / "metadata.json", "w") as f:
json.dump(metadata, f)

logger.info(f"Saved dataset to {results_path}")
# Generate HTML report
try:
from verifiers.utils.report_utils import ReportMeta, write_html_report

report_meta = ReportMeta(
env_id=env,
env_version="0.0.0", # TODO: Get actual version
model=model,
num_examples=n,
rollouts_per_example=rollouts_per_example,
api_base_url=api_base_url,
sampling_args=merged_sampling_args or {},
env_args=env_args,
)

write_html_report(
report_dir=results_path / "reports",
meta=report_meta,
results=results,
group_by_task=grouped_html_report,
grouping_keys=grouping_keys,
)
logger.info(f"Saved dataset to {results_path}")
except Exception as e:
logger.warning(f"Failed to generate HTML report: {e}")

if save_to_hf_hub:
if hf_hub_dataset_name == "":
dataset_name = (
@@ -317,6 +376,27 @@ def main():
default="",
help="Name of dataset to save to Hugging Face Hub",
)
parser.add_argument(
"--group-by-task",
"-g",
default=False,
action="store_true",
help="Group rewards by task/subset when displaying results",
)
parser.add_argument(
"--grouping-keys",
"-G",
type=str,
nargs="+",
default=None,
help="Group rewards by specified keys (e.g., task difficulty category)",
)
parser.add_argument(
"--grouped-html-report",
default=False,
action="store_true",
help="Generate HTML report with grouped rewards by task/subset",
)
args = parser.parse_args()

eval_environment(
@@ -337,6 +417,9 @@
save_dataset=args.save_dataset,
save_to_hf_hub=args.save_to_hf_hub,
hf_hub_dataset_name=args.hf_hub_dataset_name,
group_by_task=args.group_by_task,
grouping_keys=args.grouping_keys,
grouped_html_report=args.grouped_html_report,
)


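As a minimal sketch of the report path added above, the snippet below builds a ReportMeta and calls write_html_report with grouping_keys, mirroring the new block in eval_environment. The verifiers.types import path for GenerateOutputs and every placeholder value (environment id, model name, metric name, rollout fields) are assumptions; the ReportMeta fields and the write_html_report signature come from this diff.

from pathlib import Path

from verifiers.types import GenerateOutputs  # assumed import path
from verifiers.utils.report_utils import ReportMeta, write_html_report

# Four toy rollouts across two tasks; the field names mirror the
# GenerateOutputs construction used inside compute_grouped_summary.
results = GenerateOutputs(
    prompt=["p1", "p2", "p3", "p4"],
    completion=["c1", "c2", "c3", "c4"],
    answer=["a1", "a2", "a3", "a4"],
    state=[{}, {}, {}, {}],
    info=[{}, {}, {}, {}],
    task=["math", "math", "code", "code"],
    reward=[0.8, 0.6, 0.3, 0.5],
    metrics={"correct_answer": [1.0, 0.0, 0.0, 1.0]},  # placeholder metric name
)

meta = ReportMeta(
    env_id="example-env",    # placeholder
    env_version="0.0.0",
    model="example-model",   # placeholder
    num_examples=2,
    rollouts_per_example=2,
    api_base_url="http://localhost:8000/v1",  # placeholder
    sampling_args={},
    env_args={},
)

report_path = write_html_report(
    report_dir=Path("outputs/reports"),
    meta=meta,
    results=results,
    grouping_keys=["task"],  # adds the grouped-reward tables to the report
)
print(f"wrote {report_path}")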
57 changes: 57 additions & 0 deletions verifiers/trainers/grpo_trainer.py
@@ -1342,6 +1342,18 @@ def evaluate(
metrics["eval_reward"] = rewards.mean().item()
metrics["eval_reward_std"] = rewards.std().item()

# Group rewards by unique subset keys for more detailed analysis
# This allows calculating averages for different groups without multiple evaluations
grouping_keys = ["task"] # Can be extended to include other keys like difficulty, category, etc.

for grouping_key in grouping_keys:
if hasattr(eval_results, grouping_key) and getattr(eval_results, grouping_key):
key_values = getattr(eval_results, grouping_key)
grouped_metrics = self._compute_grouped_metrics(
key_values, eval_results.reward, eval_results.metrics
)
metrics.update(grouped_metrics)

# Log individual reward function scores
non_reward_metric_keys = [
"reward",
@@ -1439,6 +1451,51 @@ def evaluate(
# Return metrics dict to match base class signature
return metrics

def _compute_grouped_metrics(self, grouping_values, rewards, all_metrics):
"""
Compute grouped metrics by unique subset keys.

Args:
grouping_values: List of values to group by (e.g., task names, difficulty levels)
rewards: List of reward values
all_metrics: Dictionary of all metric values

Returns:
Dictionary of grouped metrics
"""
metrics = {}

# Create groups based on unique values
groups = {}
for i, group_key in enumerate(grouping_values):
if group_key not in groups:
groups[group_key] = {"indices": [], "rewards": []}
groups[group_key]["indices"].append(i)
groups[group_key]["rewards"].append(rewards[i])

# Compute metrics for each group
for group_key, group_data in groups.items():
group_rewards = torch.tensor(group_data["rewards"])
metrics[f"eval_reward_{group_key}"] = group_rewards.mean().item()
metrics[f"eval_reward_std_{group_key}"] = group_rewards.std().item()

# Compute grouped metrics for each metric type
for metric_key, metric_values in all_metrics.items():
if metric_key in ["reward", "prompt", "completion", "info", "answer", "state", "task"]:
continue # Skip non-reward metrics or metadata

group_metric_values = [metric_values[i] for i in group_data["indices"]]
if isinstance(group_metric_values, list):
metrics[f"eval_rewards/{metric_key}_{group_key}"] = float(np.mean(group_metric_values))
else:
try:
tensor_values = torch.tensor(group_metric_values)
metrics[f"eval_rewards/{metric_key}_{group_key}"] = tensor_values.mean().item()
except Exception:
continue

return metrics

def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
mode = "train" if self.model is not None and self.model.training else "eval" # type: ignore
metrics = {
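A minimal, self-contained sketch (plain Python, not the trainer's actual code) of the key-naming scheme _compute_grouped_metrics adds to the eval metrics: one eval_reward_<group> / eval_reward_std_<group> pair per unique value of the grouping column, plus eval_rewards/<metric>_<group> for each reward-function metric. The toy rewards and the correct_answer metric name are illustrative only.

from statistics import mean, stdev

tasks = ["math", "math", "code", "code"]
rewards = [0.8, 0.6, 0.3, 0.5]
metric_values = {"correct_answer": [1.0, 0.0, 0.0, 1.0]}

# Group rollout indices by task, as the grouping loop above does.
groups: dict[str, list[int]] = {}
for i, task in enumerate(tasks):
    groups.setdefault(task, []).append(i)

grouped: dict[str, float] = {}
for task, indices in groups.items():
    group_rewards = [rewards[i] for i in indices]
    grouped[f"eval_reward_{task}"] = mean(group_rewards)
    grouped[f"eval_reward_std_{task}"] = stdev(group_rewards) if len(group_rewards) > 1 else 0.0
    for name, values in metric_values.items():
        grouped[f"eval_rewards/{name}_{task}"] = mean(values[i] for i in indices)

print(grouped)
# e.g. {'eval_reward_math': ~0.7, 'eval_reward_std_math': ~0.141, 'eval_rewards/correct_answer_math': 0.5, ...}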
141 changes: 120 additions & 21 deletions verifiers/utils/report_utils.py
@@ -199,26 +199,49 @@ def build_report_filename(meta: ReportMeta) -> str:
</tr>
</table>

{% if metrics %}
<h2>Metrics</h2>
<table>
<tr>
<th>metric</th><th>mean</th><th>std</th><th>n</th><th>p5</th><th>p25</th><th>p50</th><th>p75</th><th>p95</th>
</tr>
{% for name, m in metrics.items() %}
<tr>
<td>{{ name }}</td>
<td>{{ m.mean | round(4) }}</td>
<td>{{ m.std | round(4) }}</td>
<td>{{ m.n }}</td>
<td>{{ m.p5 | round(4) }}</td>
<td>{{ m.p25 | round(4) }}</td>
<td>{{ m.p50 | round(4) }}</td>
<td>{{ m.p75 | round(4) }}</td>
<td>{{ m.p95 | round(4) }}</td>
</tr>
{% endfor %}
</table>
{% if grouped_summary %}
<h2>Grouped Rewards</h2>
{% for group_type, groups in grouped_summary.items() %}
{% if group_type != "overall" %}
<h3>Grouped by {{ group_type.replace('grouped_by_', '')|title }}</h3>
{% for group_name, group_summary in groups.items() %}
<h4>{{ group_type.replace('grouped_by_', '')|title }}: {{ group_name }}</h4>
<table>
<tr><th>mean</th><th>std</th><th>n</th><th>p5</th><th>p25</th><th>p50</th><th>p75</th><th>p95</th></tr>
<tr>
<td>{{ group_summary.reward.mean | round(4) }}</td>
<td>{{ group_summary.reward.std | round(4) }}</td>
<td>{{ group_summary.reward.n }}</td>
<td>{{ group_summary.reward.p5 | round(4) }}</td>
<td>{{ group_summary.reward.p25 | round(4) }}</td>
<td>{{ group_summary.reward.p50 | round(4) }}</td>
<td>{{ group_summary.reward.p75 | round(4) }}</td>
<td>{{ group_summary.reward.p95 | round(4) }}</td>
</tr>
</table>
{% if group_summary.metrics %}
<table>
<tr>
<th>metric</th><th>mean</th><th>std</th><th>n</th><th>p5</th><th>p25</th><th>p50</th><th>p75</th><th>p95</th>
</tr>
{% for name, m in group_summary.metrics.items() %}
<tr>
<td>{{ name }}</td>
<td>{{ m.mean | round(4) }}</td>
<td>{{ m.std | round(4) }}</td>
<td>{{ m.n }}</td>
<td>{{ m.p5 | round(4) }}</td>
<td>{{ m.p25 | round(4) }}</td>
<td>{{ m.p50 | round(4) }}</td>
<td>{{ m.p75 | round(4) }}</td>
<td>{{ m.p95 | round(4) }}</td>
</tr>
{% endfor %}
</table>
{% endif %}
{% endfor %}
{% endif %}
{% endfor %}
{% endif %}

<h2>Examples <span class="muted">(showing up to {{ examples|length }} of {{ total_examples }})</span></h2>
@@ -252,6 +275,7 @@ def render_html(
summary: Dict[str, Any],
examples: List[Dict[str, Any]],
total_examples: int,
grouped_summary: Dict[str, Any] | None = None,
) -> str:
template = _env.from_string(_TEMPLATE)
return template.render(
@@ -271,29 +295,104 @@
metrics=summary.get("metrics", {}),
examples=examples,
total_examples=total_examples,
grouped_summary=grouped_summary,
)


def write_html_report(
report_dir: Path,
meta: ReportMeta,
results: GenerateOutputs,
group_by_task: bool = False,
grouping_keys: List[str] | None = None,
) -> Path:
"""Render and write the HTML report next to the environment under `reports/`.

Returns the path to the written HTML file.
Args:
report_dir: Directory to write the report to
meta: Report metadata
results: GenerateOutputs containing evaluation results
group_by_task: Whether to group by task (backward compatibility)
grouping_keys: List of column names to group by (e.g., ['task', 'difficulty'])

Returns:
Path to the written HTML file.
"""
report_dir.mkdir(parents=True, exist_ok=True)

summary = compute_summary(results)
examples = build_examples(results, cap=DETAILED_EXAMPLES_CAP)

# Compute grouped summary if requested
grouped_summary = None
if group_by_task or grouping_keys:
# For backward compatibility, if group_by_task is True but no grouping_keys provided,
# default to grouping by task
if group_by_task and not grouping_keys:
grouping_keys = ["task"]
grouped_summary = compute_grouped_summary(results, grouping_keys)

html = render_html(
meta=meta,
summary=summary,
examples=examples,
total_examples=len(results.reward),
grouped_summary=grouped_summary,
)
filename = build_report_filename(meta)
out_path = report_dir / filename
out_path.write_text(html, encoding="utf-8")
return out_path


def compute_grouped_summary(results: GenerateOutputs, grouping_keys: List[str] | None = None) -> Dict[str, Any]:
"""Compute grouped aggregated statistics from GenerateOutputs by specified subset keys.

Args:
results: GenerateOutputs containing evaluation results
grouping_keys: List of column names to group by (e.g., ['task', 'difficulty'])
If None, defaults to ['task'] for backward compatibility

Returns:
Dictionary with overall summary and grouped summaries by specified keys.
"""
summary: Dict[str, Any] = {}

# Overall summary
summary["overall"] = compute_summary(results)

# Default to task grouping for backward compatibility
if grouping_keys is None:
grouping_keys = ["task"]

# Grouped summaries by specified keys
for grouping_key in grouping_keys:
if hasattr(results, grouping_key) and getattr(results, grouping_key):
key_values = getattr(results, grouping_key)
# Only proceed if we have multiple unique values
if len(set(key_values)) > 1:
key_groups = {}
for i, key_value in enumerate(key_values):
if key_value not in key_groups:
key_groups[key_value] = []
key_groups[key_value].append(i)

grouped_summaries = {}
for key_value, indices in key_groups.items():
# Create a subset of results for this group
subset_results = GenerateOutputs(
prompt=[results.prompt[i] for i in indices],
completion=[results.completion[i] for i in indices],
answer=[results.answer[i] for i in indices],
state=[results.state[i] for i in indices],
info=[results.info[i] for i in indices],
task=[results.task[i] for i in indices],
reward=[results.reward[i] for i in indices],
metrics={k: [results.metrics[k][i] for i in indices] for k in results.metrics}
)
grouped_summaries[key_value] = compute_summary(subset_results)

# Add grouped summaries with the key name to avoid conflicts
summary[f"grouped_by_{grouping_key}"] = grouped_summaries

return summary
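
A short usage sketch for compute_grouped_summary, reading the per-group mean/std/n the same way the eval script's logging block does. As in the earlier sketch, the verifiers.types import path and all field values are assumptions; the field names and the grouped_by_<key> output layout come from this function.

from verifiers.types import GenerateOutputs  # assumed import path
from verifiers.utils.report_utils import compute_grouped_summary

results = GenerateOutputs(
    prompt=["p1", "p2", "p3", "p4"],
    completion=["c1", "c2", "c3", "c4"],
    answer=["a1", "a2", "a3", "a4"],
    state=[{}, {}, {}, {}],
    info=[{}, {}, {}, {}],
    task=["math", "math", "code", "code"],
    reward=[0.8, 0.6, 0.3, 0.5],
    metrics={"correct_answer": [1.0, 0.0, 0.0, 1.0]},  # placeholder metric name
)

summary = compute_grouped_summary(results, grouping_keys=["task"])
for task, stats in summary["grouped_by_task"].items():
    reward_stats = stats["reward"]
    print(f"{task}: mean={reward_stats['mean']:.3f} "
          f"std={reward_stats['std']:.3f} n={reward_stats['n']}")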