diff --git a/tests/test_eval_cli.py b/tests/test_eval_cli.py
index ccb87d4c2..9735813f6 100644
--- a/tests/test_eval_cli.py
+++ b/tests/test_eval_cli.py
@@ -69,6 +69,8 @@ def __init__(self, api_key=None, base_url=None):
         save_dataset=False,
         save_to_hf_hub=False,
         hf_hub_dataset_name="",
+        group_by_task=False,
+        grouping_keys=None,
     )
 
     sa = captured["sampling_args"]
@@ -116,6 +118,8 @@ def __init__(self, api_key=None, base_url=None):
         save_dataset=False,
         save_to_hf_hub=False,
         hf_hub_dataset_name="",
+        group_by_task=False,
+        grouping_keys=None,
     )
 
     sa = captured["sampling_args"]
diff --git a/verifiers/scripts/eval.py b/verifiers/scripts/eval.py
index a20a9d725..5123a73b7 100644
--- a/verifiers/scripts/eval.py
+++ b/verifiers/scripts/eval.py
@@ -13,6 +13,7 @@
 import verifiers as vf
 from verifiers.utils.client_utils import setup_client
 from verifiers.utils.message_utils import messages_to_printable, sanitize_tool_calls
+from verifiers.utils.report_utils import compute_grouped_summary
 
 # Setup logger for eval script using verifiers logging format
 logger = logging.getLogger("verifiers.scripts.eval")
@@ -36,6 +37,9 @@ def eval_environment(
     save_dataset: bool,
     save_to_hf_hub: bool,
     hf_hub_dataset_name: str,
+    group_by_task: bool,
+    grouping_keys: list[str] | None,
+    grouped_html_report: bool = False,
 ):
     logger.setLevel("DEBUG" if verbose else "INFO")
     try:
@@ -144,6 +148,36 @@ def eval_environment(
         out = f"r{i + 1}: {trials}"
         logger.info(out)
 
+    # Group rewards by task/subset if available and requested
+    if group_by_task and results.task and len(set(results.task)) > 1:
+        logger.info("--- Grouped Rewards by Task ---")
+        grouped_summary = compute_grouped_summary(results)
+        group_key = "grouped_by_task"
+        if group_key in grouped_summary:
+            for task, summary in grouped_summary[group_key].items():
+                reward_stats = summary["reward"]
+                logger.info(f"Task '{task}': avg - {reward_stats['mean']:.3f}, std - {reward_stats['std']:.3f}, count - {reward_stats['n']}")
+
+                # Group metrics by task as well
+                for metric_name, metric_stats in summary["metrics"].items():
+                    logger.info(f"  {metric_name}: avg - {metric_stats['mean']:.3f}, std - {metric_stats['std']:.3f}")
+
+    # Enhanced grouping by multiple keys if specified
+    if grouping_keys:
+        logger.info("--- Grouped Rewards by Specified Keys ---")
+        grouped_summary = compute_grouped_summary(results, grouping_keys)
+        for grouping_key in grouping_keys:
+            group_key = f"grouped_by_{grouping_key}"
+            if group_key in grouped_summary:
+                logger.info(f"--- Grouped by {grouping_key} ---")
+                for group_value, summary in grouped_summary[group_key].items():
+                    reward_stats = summary["reward"]
+                    logger.info(f"{grouping_key} '{group_value}': avg - {reward_stats['mean']:.3f}, std - {reward_stats['std']:.3f}, count - {reward_stats['n']}")
+
+                    # Group metrics by the same key
+                    for metric_name, metric_stats in summary["metrics"].items():
+                        logger.info(f"  {metric_name}: avg - {metric_stats['mean']:.3f}, std - {metric_stats['std']:.3f}")
+
     if save_dataset or save_to_hf_hub:
         ids = [i // rollouts_per_example for i in range(n * rollouts_per_example)]
         rewards = results.reward
@@ -193,7 +227,32 @@ def eval_environment(
         with open(results_path / "metadata.json", "w") as f:
             json.dump(metadata, f)
         logger.info(f"Saved dataset to {results_path}")
+
+        # Generate HTML report
+        try:
+            from verifiers.utils.report_utils import ReportMeta, write_html_report
+
+            report_meta = ReportMeta(
+                env_id=env,
+                env_version="0.0.0",  # TODO: Get actual version
+                model=model,
+                num_examples=n,
+                rollouts_per_example=rollouts_per_example,
+                api_base_url=api_base_url,
+                sampling_args=merged_sampling_args or {},
+                env_args=env_args,
+            )
+
+            write_html_report(
+                report_dir=results_path / "reports",
+                meta=report_meta,
+                results=results,
+                group_by_task=grouped_html_report,
+                grouping_keys=grouping_keys,
+            )
+        except Exception as e:
+            logger.warning(f"Failed to generate HTML report: {e}")
 
     if save_to_hf_hub:
         if hf_hub_dataset_name == "":
            dataset_name = (
@@ -317,6 +376,27 @@ def main():
         default="",
         help="Name of dataset to save to Hugging Face Hub",
     )
+    parser.add_argument(
+        "--group-by-task",
+        "-g",
+        default=False,
+        action="store_true",
+        help="Group rewards by task/subset when displaying results",
+    )
+    parser.add_argument(
+        "--grouping-keys",
+        "-G",
+        type=str,
+        nargs="+",
+        default=None,
+        help="Group rewards by specified keys (e.g., task difficulty category)",
+    )
+    parser.add_argument(
+        "--grouped-html-report",
+        default=False,
+        action="store_true",
+        help="Generate HTML report with grouped rewards by task/subset",
+    )
     args = parser.parse_args()
 
     eval_environment(
@@ -337,6 +417,9 @@ def main():
         save_dataset=args.save_dataset,
         save_to_hf_hub=args.save_to_hf_hub,
         hf_hub_dataset_name=args.hf_hub_dataset_name,
+        group_by_task=args.group_by_task,
+        grouping_keys=args.grouping_keys,
+        grouped_html_report=args.grouped_html_report,
     )
 
 
diff --git a/verifiers/trainers/grpo_trainer.py b/verifiers/trainers/grpo_trainer.py
index 792cc16db..341f711ce 100644
--- a/verifiers/trainers/grpo_trainer.py
+++ b/verifiers/trainers/grpo_trainer.py
@@ -1342,6 +1342,18 @@ def evaluate(
         metrics["eval_reward"] = rewards.mean().item()
         metrics["eval_reward_std"] = rewards.std().item()
 
+        # Group rewards by unique subset keys for more detailed analysis
+        # This allows calculating averages for different groups without multiple evaluations
+        grouping_keys = ["task"]  # Can be extended to include other keys like difficulty, category, etc.
+
+        for grouping_key in grouping_keys:
+            if hasattr(eval_results, grouping_key) and getattr(eval_results, grouping_key):
+                key_values = getattr(eval_results, grouping_key)
+                grouped_metrics = self._compute_grouped_metrics(
+                    key_values, eval_results.reward, eval_results.metrics
+                )
+                metrics.update(grouped_metrics)
+
         # Log individual reward function scores
         non_reward_metric_keys = [
             "reward",
@@ -1439,6 +1451,51 @@ def evaluate(
         # Return metrics dict to match base class signature
         return metrics
 
+    def _compute_grouped_metrics(self, grouping_values, rewards, all_metrics):
+        """
+        Compute grouped metrics by unique subset keys.
+
+        Args:
+            grouping_values: List of values to group by (e.g., task names, difficulty levels)
+            rewards: List of reward values
+            all_metrics: Dictionary of all metric values
+
+        Returns:
+            Dictionary of grouped metrics
+        """
+        metrics = {}
+
+        # Create groups based on unique values
+        groups = {}
+        for i, group_key in enumerate(grouping_values):
+            if group_key not in groups:
+                groups[group_key] = {"indices": [], "rewards": []}
+            groups[group_key]["indices"].append(i)
+            groups[group_key]["rewards"].append(rewards[i])
+
+        # Compute metrics for each group
+        for group_key, group_data in groups.items():
+            group_rewards = torch.tensor(group_data["rewards"])
+            metrics[f"eval_reward_{group_key}"] = group_rewards.mean().item()
+            metrics[f"eval_reward_std_{group_key}"] = group_rewards.std().item()
+
+            # Compute grouped metrics for each metric type
+            for metric_key, metric_values in all_metrics.items():
+                if metric_key in ["reward", "prompt", "completion", "info", "answer", "state", "task"]:
+                    continue  # Skip non-reward metrics or metadata
+
+                group_metric_values = [metric_values[i] for i in group_data["indices"]]
+                # Average numeric metric values for this group; skip anything
+                # that cannot be reduced to a float (e.g., non-numeric values)
+                try:
+                    metrics[f"eval_rewards/{metric_key}_{group_key}"] = float(
+                        np.mean(group_metric_values)
+                    )
+                except (TypeError, ValueError):
+                    continue
+
+        return metrics
+
     def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
         mode = "train" if self.model is not None and self.model.training else "eval"  # type: ignore
         metrics = {
diff --git a/verifiers/utils/report_utils.py b/verifiers/utils/report_utils.py
index f679ea83c..98a4b1f9c 100644
--- a/verifiers/utils/report_utils.py
+++ b/verifiers/utils/report_utils.py
@@ -199,26 +199,49 @@ def build_report_filename(meta: ReportMeta) -> str:
       </tr>
     </table>
 
-    {% if metrics %}
-    <h2>Metrics</h2>
-    <table>
-      <tr>
-        <th>metric</th><th>mean</th><th>std</th><th>n</th><th>p5</th><th>p25</th><th>p50</th><th>p75</th><th>p95</th>
-      </tr>
-      {% for name, m in metrics.items() %}
-      <tr>
-        <td>{{ name }}</td>
-        <td>{{ m.mean | round(4) }}</td>
-        <td>{{ m.std | round(4) }}</td>
-        <td>{{ m.n }}</td>
-        <td>{{ m.p5 | round(4) }}</td>
-        <td>{{ m.p25 | round(4) }}</td>
-        <td>{{ m.p50 | round(4) }}</td>
-        <td>{{ m.p75 | round(4) }}</td>
-        <td>{{ m.p95 | round(4) }}</td>
-      </tr>
-      {% endfor %}
-    </table>
+    {% if grouped_summary %}
+    <h2>Grouped Rewards</h2>
+    {% for group_type, groups in grouped_summary.items() %}
+    {% if group_type != "overall" %}
+    <h3>Grouped by {{ group_type.replace('grouped_by_', '')|title }}</h3>
+    {% for group_name, group_summary in groups.items() %}
+    <h4>{{ group_type.replace('grouped_by_', '')|title }}: {{ group_name }}</h4>
+    <table>
+      <tr><th>mean</th><th>std</th><th>n</th><th>p5</th><th>p25</th><th>p50</th><th>p75</th><th>p95</th></tr>
+      <tr>
+        <td>{{ group_summary.reward.mean | round(4) }}</td>
+        <td>{{ group_summary.reward.std | round(4) }}</td>
+        <td>{{ group_summary.reward.n }}</td>
+        <td>{{ group_summary.reward.p5 | round(4) }}</td>
+        <td>{{ group_summary.reward.p25 | round(4) }}</td>
+        <td>{{ group_summary.reward.p50 | round(4) }}</td>
+        <td>{{ group_summary.reward.p75 | round(4) }}</td>
+        <td>{{ group_summary.reward.p95 | round(4) }}</td>
+      </tr>
+    </table>
+    {% if group_summary.metrics %}
+    <table>
+      <tr>
+        <th>metric</th><th>mean</th><th>std</th><th>n</th><th>p5</th><th>p25</th><th>p50</th><th>p75</th><th>p95</th>
+      </tr>
+      {% for name, m in group_summary.metrics.items() %}
+      <tr>
+        <td>{{ name }}</td>
+        <td>{{ m.mean | round(4) }}</td>
+        <td>{{ m.std | round(4) }}</td>
+        <td>{{ m.n }}</td>
+        <td>{{ m.p5 | round(4) }}</td>
+        <td>{{ m.p25 | round(4) }}</td>
+        <td>{{ m.p50 | round(4) }}</td>
+        <td>{{ m.p75 | round(4) }}</td>
+        <td>{{ m.p95 | round(4) }}</td>
+      </tr>
+      {% endfor %}
+    </table>
+    {% endif %}
+    {% endfor %}
+    {% endif %}
+    {% endfor %}
     {% endif %}
 
     <h2>Examples (showing up to {{ examples|length }} of {{ total_examples }})</h2>
@@ -252,6 +275,7 @@ def render_html(
     summary: Dict[str, Any],
     examples: List[Dict[str, Any]],
     total_examples: int,
+    grouped_summary: Dict[str, Any] | None = None,
 ) -> str:
     template = _env.from_string(_TEMPLATE)
     return template.render(
@@ -271,6 +295,7 @@
         metrics=summary.get("metrics", {}),
         examples=examples,
         total_examples=total_examples,
+        grouped_summary=grouped_summary,
     )
 
 
@@ -278,22 +303,96 @@ def write_html_report(
     report_dir: Path,
     meta: ReportMeta,
     results: GenerateOutputs,
+    group_by_task: bool = False,
+    grouping_keys: List[str] | None = None,
 ) -> Path:
     """Render and write the HTML report next to the environment under `reports/`.
 
-    Returns the path to the written HTML file.
+    Args:
+        report_dir: Directory to write the report to
+        meta: Report metadata
+        results: GenerateOutputs containing evaluation results
+        group_by_task: Whether to group by task (backward compatibility)
+        grouping_keys: List of column names to group by (e.g., ['task', 'difficulty'])
+
+    Returns:
+        Path to the written HTML file.
     """
     report_dir.mkdir(parents=True, exist_ok=True)
     summary = compute_summary(results)
     examples = build_examples(results, cap=DETAILED_EXAMPLES_CAP)
+
+    # Compute grouped summary if requested
+    grouped_summary = None
+    if group_by_task or grouping_keys:
+        # For backward compatibility, if group_by_task is True but no grouping_keys
+        # are provided, default to grouping by task
+        if group_by_task and not grouping_keys:
+            grouping_keys = ["task"]
+        grouped_summary = compute_grouped_summary(results, grouping_keys)
+
     html = render_html(
         meta=meta,
         summary=summary,
         examples=examples,
         total_examples=len(results.reward),
+        grouped_summary=grouped_summary,
     )
     filename = build_report_filename(meta)
     out_path = report_dir / filename
     out_path.write_text(html, encoding="utf-8")
     return out_path
+
+
+def compute_grouped_summary(results: GenerateOutputs, grouping_keys: List[str] | None = None) -> Dict[str, Any]:
+    """Compute grouped aggregated statistics from GenerateOutputs by specified subset keys.
+
+    Args:
+        results: GenerateOutputs containing evaluation results
+        grouping_keys: List of column names to group by (e.g., ['task', 'difficulty']).
+            If None, defaults to ['task'] for backward compatibility.
+
+    Returns:
+        Dictionary with overall summary and grouped summaries by specified keys.
+    """
+    summary: Dict[str, Any] = {}
+
+    # Overall summary
+    summary["overall"] = compute_summary(results)
+
+    # Default to task grouping for backward compatibility
+    if grouping_keys is None:
+        grouping_keys = ["task"]
+
+    # Grouped summaries by specified keys
+    for grouping_key in grouping_keys:
+        if hasattr(results, grouping_key) and getattr(results, grouping_key):
+            key_values = getattr(results, grouping_key)
+            # Only proceed if we have multiple unique values
+            if len(set(key_values)) > 1:
+                key_groups = {}
+                for i, key_value in enumerate(key_values):
+                    if key_value not in key_groups:
+                        key_groups[key_value] = []
+                    key_groups[key_value].append(i)
+
+                grouped_summaries = {}
+                for key_value, indices in key_groups.items():
+                    # Create a subset of results for this group
+                    subset_results = GenerateOutputs(
+                        prompt=[results.prompt[i] for i in indices],
+                        completion=[results.completion[i] for i in indices],
+                        answer=[results.answer[i] for i in indices],
+                        state=[results.state[i] for i in indices],
+                        info=[results.info[i] for i in indices],
+                        task=[results.task[i] for i in indices],
+                        reward=[results.reward[i] for i in indices],
+                        metrics={k: [results.metrics[k][i] for i in indices] for k in results.metrics},
+                    )
+                    grouped_summaries[key_value] = compute_summary(subset_results)
+
+                # Add grouped summaries with the key name to avoid conflicts
+                summary[f"grouped_by_{grouping_key}"] = grouped_summaries
+
+    return summary