diff --git a/tests/test_eval_cli.py b/tests/test_eval_cli.py
index ccb87d4c2..9735813f6 100644
--- a/tests/test_eval_cli.py
+++ b/tests/test_eval_cli.py
@@ -69,6 +69,8 @@ def __init__(self, api_key=None, base_url=None):
save_dataset=False,
save_to_hf_hub=False,
hf_hub_dataset_name="",
+ group_by_task=False,
+ grouping_keys=None,
)
sa = captured["sampling_args"]
@@ -116,6 +118,8 @@ def __init__(self, api_key=None, base_url=None):
save_dataset=False,
save_to_hf_hub=False,
hf_hub_dataset_name="",
+ group_by_task=False,
+ grouping_keys=None,
)
sa = captured["sampling_args"]
diff --git a/verifiers/scripts/eval.py b/verifiers/scripts/eval.py
index a20a9d725..5123a73b7 100644
--- a/verifiers/scripts/eval.py
+++ b/verifiers/scripts/eval.py
@@ -13,6 +13,7 @@
import verifiers as vf
from verifiers.utils.client_utils import setup_client
from verifiers.utils.message_utils import messages_to_printable, sanitize_tool_calls
+from verifiers.utils.report_utils import compute_grouped_summary
# Setup logger for eval script using verifiers logging format
logger = logging.getLogger("verifiers.scripts.eval")
@@ -36,6 +37,9 @@ def eval_environment(
save_dataset: bool,
save_to_hf_hub: bool,
hf_hub_dataset_name: str,
+ group_by_task: bool,
+ grouping_keys: list[str] | None,
+ grouped_html_report: bool = False,
):
logger.setLevel("DEBUG" if verbose else "INFO")
try:
@@ -144,6 +148,36 @@ def eval_environment(
out = f"r{i + 1}: {trials}"
logger.info(out)
+ # Group rewards by task/subset if available and requested
+ if group_by_task and results.task and len(set(results.task)) > 1:
+ logger.info("--- Grouped Rewards by Task ---")
+ grouped_summary = compute_grouped_summary(results)
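+        # grouped_summary maps "grouped_by_task" -> {task_name: {"reward": stats, "metrics": {...}}}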
+ group_key = "grouped_by_task"
+ if group_key in grouped_summary:
+ for task, summary in grouped_summary[group_key].items():
+ reward_stats = summary["reward"]
+ logger.info(f"Task '{task}': avg - {reward_stats['mean']:.3f}, std - {reward_stats['std']:.3f}, count - {reward_stats['n']}")
+
+ # Group metrics by task as well
+ for metric_name, metric_stats in summary["metrics"].items():
+ logger.info(f" {metric_name}: avg - {metric_stats['mean']:.3f}, std - {metric_stats['std']:.3f}")
+
+ # Enhanced grouping by multiple keys if specified
+ if grouping_keys:
+ logger.info("--- Grouped Rewards by Specified Keys ---")
+ grouped_summary = compute_grouped_summary(results, grouping_keys)
+ for grouping_key in grouping_keys:
+ group_key = f"grouped_by_{grouping_key}"
+ if group_key in grouped_summary:
+ logger.info(f"--- Grouped by {grouping_key} ---")
+ for group_value, summary in grouped_summary[group_key].items():
+ reward_stats = summary["reward"]
+ logger.info(f"{grouping_key} '{group_value}': avg - {reward_stats['mean']:.3f}, std - {reward_stats['std']:.3f}, count - {reward_stats['n']}")
+
+ # Group metrics by the same key
+ for metric_name, metric_stats in summary["metrics"].items():
+ logger.info(f" {metric_name}: avg - {metric_stats['mean']:.3f}, std - {metric_stats['std']:.3f}")
+
if save_dataset or save_to_hf_hub:
ids = [i // rollouts_per_example for i in range(n * rollouts_per_example)]
rewards = results.reward
@@ -193,7 +227,32 @@ def eval_environment(
with open(results_path / "metadata.json", "w") as f:
json.dump(metadata, f)
- logger.info(f"Saved dataset to {results_path}")
+ # Generate HTML report
+ try:
+ from verifiers.utils.report_utils import ReportMeta, write_html_report
+
+ report_meta = ReportMeta(
+ env_id=env,
+ env_version="0.0.0", # TODO: Get actual version
+ model=model,
+ num_examples=n,
+ rollouts_per_example=rollouts_per_example,
+ api_base_url=api_base_url,
+ sampling_args=merged_sampling_args or {},
+ env_args=env_args,
+ )
+
+ write_html_report(
+ report_dir=results_path / "reports",
+ meta=report_meta,
+ results=results,
+ group_by_task=grouped_html_report,
+ grouping_keys=grouping_keys,
+ )
+ logger.info(f"Saved dataset to {results_path}")
+ except Exception as e:
+ logger.warning(f"Failed to generate HTML report: {e}")
+
if save_to_hf_hub:
if hf_hub_dataset_name == "":
dataset_name = (
@@ -317,6 +376,27 @@ def main():
default="",
help="Name of dataset to save to Hugging Face Hub",
)
+ parser.add_argument(
+ "--group-by-task",
+ "-g",
+ default=False,
+ action="store_true",
+ help="Group rewards by task/subset when displaying results",
+ )
+ parser.add_argument(
+ "--grouping-keys",
+ "-G",
+ type=str,
+ nargs="+",
+ default=None,
+ help="Group rewards by specified keys (e.g., task difficulty category)",
+ )
+ parser.add_argument(
+ "--grouped-html-report",
+ default=False,
+ action="store_true",
+ help="Generate HTML report with grouped rewards by task/subset",
+ )
args = parser.parse_args()
eval_environment(
@@ -337,6 +417,9 @@ def main():
save_dataset=args.save_dataset,
save_to_hf_hub=args.save_to_hf_hub,
hf_hub_dataset_name=args.hf_hub_dataset_name,
+ group_by_task=args.group_by_task,
+ grouping_keys=args.grouping_keys,
+ grouped_html_report=args.grouped_html_report,
)
diff --git a/verifiers/trainers/grpo_trainer.py b/verifiers/trainers/grpo_trainer.py
index 792cc16db..341f711ce 100644
--- a/verifiers/trainers/grpo_trainer.py
+++ b/verifiers/trainers/grpo_trainer.py
@@ -1342,6 +1342,18 @@ def evaluate(
metrics["eval_reward"] = rewards.mean().item()
metrics["eval_reward_std"] = rewards.std().item()
+ # Group rewards by unique subset keys for more detailed analysis
+ # This allows calculating averages for different groups without multiple evaluations
+ grouping_keys = ["task"] # Can be extended to include other keys like difficulty, category, etc.
+
+ for grouping_key in grouping_keys:
+ if hasattr(eval_results, grouping_key) and getattr(eval_results, grouping_key):
+ key_values = getattr(eval_results, grouping_key)
+ grouped_metrics = self._compute_grouped_metrics(
+ key_values, eval_results.reward, eval_results.metrics
+ )
+ metrics.update(grouped_metrics)
+
# Log individual reward function scores
non_reward_metric_keys = [
"reward",
@@ -1439,6 +1451,51 @@ def evaluate(
# Return metrics dict to match base class signature
return metrics
+ def _compute_grouped_metrics(self, grouping_values, rewards, all_metrics):
+ """
+ Compute grouped metrics by unique subset keys.
+
+ Args:
+ grouping_values: List of values to group by (e.g., task names, difficulty levels)
+ rewards: List of reward values
+ all_metrics: Dictionary of all metric values
+
+ Returns:
+ Dictionary of grouped metrics
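+
+        Example of produced keys (for an illustrative group value "math"):
+            eval_reward_math, eval_reward_std_math, eval_rewards/<metric_name>_math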
+ """
+ metrics = {}
+
+ # Create groups based on unique values
+ groups = {}
+ for i, group_key in enumerate(grouping_values):
+ if group_key not in groups:
+ groups[group_key] = {"indices": [], "rewards": []}
+ groups[group_key]["indices"].append(i)
+ groups[group_key]["rewards"].append(rewards[i])
+
+ # Compute metrics for each group
+ for group_key, group_data in groups.items():
+ group_rewards = torch.tensor(group_data["rewards"])
+ metrics[f"eval_reward_{group_key}"] = group_rewards.mean().item()
+ metrics[f"eval_reward_std_{group_key}"] = group_rewards.std().item()
+
+ # Compute grouped metrics for each metric type
+ for metric_key, metric_values in all_metrics.items():
+ if metric_key in ["reward", "prompt", "completion", "info", "answer", "state", "task"]:
+ continue # Skip non-reward metrics or metadata
+
+                group_metric_values = [metric_values[i] for i in group_data["indices"]]
+                try:
+                    tensor_values = torch.tensor(group_metric_values, dtype=torch.float32)
+                except (TypeError, ValueError):
+                    # Skip metrics whose values are not numeric
+                    continue
+                metrics[f"eval_rewards/{metric_key}_{group_key}"] = tensor_values.mean().item()
+
+ return metrics
+
def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
mode = "train" if self.model is not None and self.model.training else "eval" # type: ignore
metrics = {
diff --git a/verifiers/utils/report_utils.py b/verifiers/utils/report_utils.py
index f679ea83c..98a4b1f9c 100644
--- a/verifiers/utils/report_utils.py
+++ b/verifiers/utils/report_utils.py
@@ -199,26 +199,49 @@ def build_report_filename(meta: ReportMeta) -> str:
-    {% if metrics %}
-    <h2>Metrics</h2>
-    <table>
-      <tr>
-        <th>metric</th><th>mean</th><th>std</th><th>n</th><th>p5</th><th>p25</th><th>p50</th><th>p75</th><th>p95</th>
-      </tr>
-      {% for name, m in metrics.items() %}
-      <tr>
-        <td>{{ name }}</td>
-        <td>{{ m.mean | round(4) }}</td>
-        <td>{{ m.std | round(4) }}</td>
-        <td>{{ m.n }}</td>
-        <td>{{ m.p5 | round(4) }}</td>
-        <td>{{ m.p25 | round(4) }}</td>
-        <td>{{ m.p50 | round(4) }}</td>
-        <td>{{ m.p75 | round(4) }}</td>
-        <td>{{ m.p95 | round(4) }}</td>
-      </tr>
-      {% endfor %}
-    </table>
+    {% if grouped_summary %}
+    <h2>Grouped Rewards</h2>
+    {% for group_type, groups in grouped_summary.items() %}
+    {% if group_type != "overall" %}
+    <h3>Grouped by {{ group_type.replace('grouped_by_', '')|title }}</h3>
+    {% for group_name, group_summary in groups.items() %}
+    <h4>{{ group_type.replace('grouped_by_', '')|title }}: {{ group_name }}</h4>
+    <table>
+      <tr>
+        <th>mean</th><th>std</th><th>n</th><th>p5</th><th>p25</th><th>p50</th><th>p75</th><th>p95</th>
+      </tr>
+      <tr>
+        <td>{{ group_summary.reward.mean | round(4) }}</td>
+        <td>{{ group_summary.reward.std | round(4) }}</td>
+        <td>{{ group_summary.reward.n }}</td>
+        <td>{{ group_summary.reward.p5 | round(4) }}</td>
+        <td>{{ group_summary.reward.p25 | round(4) }}</td>
+        <td>{{ group_summary.reward.p50 | round(4) }}</td>
+        <td>{{ group_summary.reward.p75 | round(4) }}</td>
+        <td>{{ group_summary.reward.p95 | round(4) }}</td>
+      </tr>
+    </table>
+    {% if group_summary.metrics %}
+    <table>
+      <tr>
+        <th>metric</th><th>mean</th><th>std</th><th>n</th><th>p5</th><th>p25</th><th>p50</th><th>p75</th><th>p95</th>
+      </tr>
+      {% for name, m in group_summary.metrics.items() %}
+      <tr>
+        <td>{{ name }}</td>
+        <td>{{ m.mean | round(4) }}</td>
+        <td>{{ m.std | round(4) }}</td>
+        <td>{{ m.n }}</td>
+        <td>{{ m.p5 | round(4) }}</td>
+        <td>{{ m.p25 | round(4) }}</td>
+        <td>{{ m.p50 | round(4) }}</td>
+        <td>{{ m.p75 | round(4) }}</td>
+        <td>{{ m.p95 | round(4) }}</td>
+      </tr>
+      {% endfor %}
+    </table>
+    {% endif %}
+    {% endfor %}
+    {% endif %}
+    {% endfor %}
{% endif %}
    <h2>Examples (showing up to {{ examples|length }} of {{ total_examples }})</h2>
@@ -252,6 +275,7 @@ def render_html(
summary: Dict[str, Any],
examples: List[Dict[str, Any]],
total_examples: int,
+ grouped_summary: Dict[str, Any] | None = None,
) -> str:
template = _env.from_string(_TEMPLATE)
return template.render(
@@ -271,6 +295,7 @@ def render_html(
metrics=summary.get("metrics", {}),
examples=examples,
total_examples=total_examples,
+ grouped_summary=grouped_summary,
)
@@ -278,22 +303,96 @@ def write_html_report(
report_dir: Path,
meta: ReportMeta,
results: GenerateOutputs,
+ group_by_task: bool = False,
+    grouping_keys: List[str] | None = None,
) -> Path:
"""Render and write the HTML report next to the environment under `reports/`.
- Returns the path to the written HTML file.
+ Args:
+ report_dir: Directory to write the report to
+ meta: Report metadata
+ results: GenerateOutputs containing evaluation results
+ group_by_task: Whether to group by task (backward compatibility)
+ grouping_keys: List of column names to group by (e.g., ['task', 'difficulty'])
+
+ Returns:
+ Path to the written HTML file.
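+
+    Example (a sketch; ``out_dir``, ``meta`` and ``results`` are assumed to already exist):
+        write_html_report(out_dir / "reports", meta, results, grouping_keys=["task"])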
"""
report_dir.mkdir(parents=True, exist_ok=True)
summary = compute_summary(results)
examples = build_examples(results, cap=DETAILED_EXAMPLES_CAP)
+
+ # Compute grouped summary if requested
+ grouped_summary = None
+ if group_by_task or grouping_keys:
+ # For backward compatibility, if group_by_task is True but no grouping_keys provided,
+ # default to grouping by task
+ if group_by_task and not grouping_keys:
+ grouping_keys = ["task"]
+ grouped_summary = compute_grouped_summary(results, grouping_keys)
+
html = render_html(
meta=meta,
summary=summary,
examples=examples,
total_examples=len(results.reward),
+ grouped_summary=grouped_summary,
)
filename = build_report_filename(meta)
out_path = report_dir / filename
out_path.write_text(html, encoding="utf-8")
return out_path
+
+
+def compute_grouped_summary(
+    results: GenerateOutputs, grouping_keys: List[str] | None = None
+) -> Dict[str, Any]:
+ """Compute grouped aggregated statistics from GenerateOutputs by specified subset keys.
+
+ Args:
+ results: GenerateOutputs containing evaluation results
+ grouping_keys: List of column names to group by (e.g., ['task', 'difficulty'])
+ If None, defaults to ['task'] for backward compatibility
+
+ Returns:
+ Dictionary with overall summary and grouped summaries by specified keys.
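+
+        Example of the returned structure (with illustrative task values "math" and "code"):
+            {
+                "overall": {"reward": {...}, "metrics": {...}},
+                "grouped_by_task": {
+                    "math": {"reward": {"mean": ..., "std": ..., "n": ..., ...}, "metrics": {...}},
+                    "code": {"reward": {...}, "metrics": {...}},
+                },
+            }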
+ """
+ summary: Dict[str, Any] = {}
+
+ # Overall summary
+ summary["overall"] = compute_summary(results)
+
+ # Default to task grouping for backward compatibility
+ if grouping_keys is None:
+ grouping_keys = ["task"]
+
+ # Grouped summaries by specified keys
+ for grouping_key in grouping_keys:
+ if hasattr(results, grouping_key) and getattr(results, grouping_key):
+ key_values = getattr(results, grouping_key)
+ # Only proceed if we have multiple unique values
+ if len(set(key_values)) > 1:
+ key_groups = {}
+ for i, key_value in enumerate(key_values):
+ if key_value not in key_groups:
+ key_groups[key_value] = []
+ key_groups[key_value].append(i)
+
+ grouped_summaries = {}
+ for key_value, indices in key_groups.items():
+ # Create a subset of results for this group
+ subset_results = GenerateOutputs(
+ prompt=[results.prompt[i] for i in indices],
+ completion=[results.completion[i] for i in indices],
+ answer=[results.answer[i] for i in indices],
+ state=[results.state[i] for i in indices],
+ info=[results.info[i] for i in indices],
+ task=[results.task[i] for i in indices],
+ reward=[results.reward[i] for i in indices],
+ metrics={k: [results.metrics[k][i] for i in indices] for k in results.metrics}
+ )
+ grouped_summaries[key_value] = compute_summary(subset_results)
+
+ # Add grouped summaries with the key name to avoid conflicts
+ summary[f"grouped_by_{grouping_key}"] = grouped_summaries
+
+ return summary