4 changes: 4 additions & 0 deletions tests/test_eval_cli.py
@@ -69,6 +69,8 @@ def __init__(self, api_key=None, base_url=None):
save_dataset=False,
save_to_hf_hub=False,
hf_hub_dataset_name="",
group_by_task=False,
grouping_keys=None,
)

sa = captured["sampling_args"]
@@ -116,6 +118,8 @@ def __init__(self, api_key=None, base_url=None):
save_dataset=False,
save_to_hf_hub=False,
hf_hub_dataset_name="",
group_by_task=False,
grouping_keys=None,
)

sa = captured["sampling_args"]
85 changes: 84 additions & 1 deletion verifiers/scripts/eval.py
@@ -13,6 +13,7 @@
import verifiers as vf
from verifiers.utils.client_utils import setup_client
from verifiers.utils.message_utils import messages_to_printable, sanitize_tool_calls
from verifiers.utils.report_utils import compute_grouped_summary

# Setup logger for eval script using verifiers logging format
logger = logging.getLogger("verifiers.scripts.eval")
@@ -36,6 +37,9 @@ def eval_environment(
save_dataset: bool,
save_to_hf_hub: bool,
hf_hub_dataset_name: str,
group_by_task: bool,
grouping_keys: list[str] | None,
grouped_html_report: bool = False,
):
logger.setLevel("DEBUG" if verbose else "INFO")
try:
@@ -144,6 +148,36 @@
out = f"r{i + 1}: {trials}"
logger.info(out)

# Group rewards by task/subset if available and requested
if group_by_task and results.task and len(set(results.task)) > 1:
logger.info("--- Grouped Rewards by Task ---")
grouped_summary = compute_grouped_summary(results)
group_key = "grouped_by_task"
if group_key in grouped_summary:
for task, summary in grouped_summary[group_key].items():
reward_stats = summary["reward"]
logger.info(f"Task '{task}': avg - {reward_stats['mean']:.3f}, std - {reward_stats['std']:.3f}, count - {reward_stats['n']}")

# Group metrics by task as well
for metric_name, metric_stats in summary["metrics"].items():
logger.info(f" {metric_name}: avg - {metric_stats['mean']:.3f}, std - {metric_stats['std']:.3f}")

# Enhanced grouping by multiple keys if specified
if grouping_keys:
logger.info("--- Grouped Rewards by Specified Keys ---")
grouped_summary = compute_grouped_summary(results, grouping_keys)
for grouping_key in grouping_keys:
group_key = f"grouped_by_{grouping_key}"
if group_key in grouped_summary:
logger.info(f"--- Grouped by {grouping_key} ---")
for group_value, summary in grouped_summary[group_key].items():
reward_stats = summary["reward"]
logger.info(f"{grouping_key} '{group_value}': avg - {reward_stats['mean']:.3f}, std - {reward_stats['std']:.3f}, count - {reward_stats['n']}")

# Group metrics by the same key
for metric_name, metric_stats in summary["metrics"].items():
logger.info(f" {metric_name}: avg - {metric_stats['mean']:.3f}, std - {metric_stats['std']:.3f}")

if save_dataset or save_to_hf_hub:
ids = [i // rollouts_per_example for i in range(n * rollouts_per_example)]
rewards = results.reward
@@ -193,7 +227,32 @@ def eval_environment(
with open(results_path / "metadata.json", "w") as f:
json.dump(metadata, f)

logger.info(f"Saved dataset to {results_path}")
# Generate HTML report
try:
from verifiers.utils.report_utils import ReportMeta, write_html_report

report_meta = ReportMeta(
env_id=env,
env_version="0.0.0", # TODO: Get actual version
model=model,
num_examples=n,
rollouts_per_example=rollouts_per_example,
api_base_url=api_base_url,
sampling_args=merged_sampling_args or {},
env_args=env_args,
)

write_html_report(
report_dir=results_path / "reports",
meta=report_meta,
results=results,
group_by_task=grouped_html_report,
grouping_keys=grouping_keys,
)
logger.info(f"Saved dataset to {results_path}")
except Exception as e:
logger.warning(f"Failed to generate HTML report: {e}")

if save_to_hf_hub:
if hf_hub_dataset_name == "":
dataset_name = (
@@ -317,6 +376,27 @@ def main():
default="",
help="Name of dataset to save to Hugging Face Hub",
)
parser.add_argument(
"--group-by-task",
"-g",
default=False,
action="store_true",
help="Group rewards by task/subset when displaying results",
)
parser.add_argument(
"--grouping-keys",
"-G",
type=str,
nargs="+",
default=None,
help="Group rewards by specified keys (e.g., task difficulty category)",
)
parser.add_argument(
"--grouped-html-report",
default=False,
action="store_true",
help="Generate HTML report with grouped rewards by task/subset",
)
args = parser.parse_args()

eval_environment(
@@ -337,6 +417,9 @@
save_dataset=args.save_dataset,
save_to_hf_hub=args.save_to_hf_hub,
hf_hub_dataset_name=args.hf_hub_dataset_name,
group_by_task=args.group_by_task,
grouping_keys=args.grouping_keys,
grouped_html_report=args.grouped_html_report,
)


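As a minimal sketch of the report path added above, the snippet below builds a ReportMeta and calls write_html_report with grouping_keys, mirroring the new block in eval_environment. The verifiers.types import path for GenerateOutputs and every placeholder value (environment id, model name, metric name, rollout fields) are assumptions; the ReportMeta fields and the write_html_report signature come from this diff.

from pathlib import Path

from verifiers.types import GenerateOutputs  # assumed import path
from verifiers.utils.report_utils import ReportMeta, write_html_report

# Four toy rollouts across two tasks; the field names mirror the
# GenerateOutputs construction used inside compute_grouped_summary.
results = GenerateOutputs(
    prompt=["p1", "p2", "p3", "p4"],
    completion=["c1", "c2", "c3", "c4"],
    answer=["a1", "a2", "a3", "a4"],
    state=[{}, {}, {}, {}],
    info=[{}, {}, {}, {}],
    task=["math", "math", "code", "code"],
    reward=[0.8, 0.6, 0.3, 0.5],
    metrics={"correct_answer": [1.0, 0.0, 0.0, 1.0]},  # placeholder metric name
)

meta = ReportMeta(
    env_id="example-env",    # placeholder
    env_version="0.0.0",
    model="example-model",   # placeholder
    num_examples=2,
    rollouts_per_example=2,
    api_base_url="http://localhost:8000/v1",  # placeholder
    sampling_args={},
    env_args={},
)

report_path = write_html_report(
    report_dir=Path("outputs/reports"),
    meta=meta,
    results=results,
    grouping_keys=["task"],  # adds the grouped-reward tables to the report
)
print(f"wrote {report_path}")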
57 changes: 57 additions & 0 deletions verifiers/trainers/grpo_trainer.py
@@ -1342,6 +1342,18 @@ def evaluate(
metrics["eval_reward"] = rewards.mean().item()
metrics["eval_reward_std"] = rewards.std().item()

# Group rewards by unique subset keys for more detailed analysis
# This allows calculating averages for different groups without multiple evaluations
grouping_keys = ["task"] # Can be extended to include other keys like difficulty, category, etc.

for grouping_key in grouping_keys:
if hasattr(eval_results, grouping_key) and getattr(eval_results, grouping_key):
key_values = getattr(eval_results, grouping_key)
grouped_metrics = self._compute_grouped_metrics(
key_values, eval_results.reward, eval_results.metrics
)
metrics.update(grouped_metrics)

# Log individual reward function scores
non_reward_metric_keys = [
"reward",
@@ -1439,6 +1451,51 @@ def evaluate(
# Return metrics dict to match base class signature
return metrics

def _compute_grouped_metrics(self, grouping_values, rewards, all_metrics):
"""
Compute grouped metrics by unique subset keys.

Args:
grouping_values: List of values to group by (e.g., task names, difficulty levels)
rewards: List of reward values
all_metrics: Dictionary of all metric values

Returns:
Dictionary of grouped metrics
"""
metrics = {}

# Create groups based on unique values
groups = {}
for i, group_key in enumerate(grouping_values):
if group_key not in groups:
groups[group_key] = {"indices": [], "rewards": []}
groups[group_key]["indices"].append(i)
groups[group_key]["rewards"].append(rewards[i])

# Compute metrics for each group
for group_key, group_data in groups.items():
group_rewards = torch.tensor(group_data["rewards"])
metrics[f"eval_reward_{group_key}"] = group_rewards.mean().item()
metrics[f"eval_reward_std_{group_key}"] = group_rewards.std().item()

# Compute grouped metrics for each metric type
for metric_key, metric_values in all_metrics.items():
if metric_key in ["reward", "prompt", "completion", "info", "answer", "state", "task"]:
continue # Skip non-reward metrics or metadata

group_metric_values = [metric_values[i] for i in group_data["indices"]]
if isinstance(group_metric_values, list):
metrics[f"eval_rewards/{metric_key}_{group_key}"] = float(np.mean(group_metric_values))
else:
try:
tensor_values = torch.tensor(group_metric_values)
metrics[f"eval_rewards/{metric_key}_{group_key}"] = tensor_values.mean().item()
except Exception:
continue

return metrics

def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
mode = "train" if self.model is not None and self.model.training else "eval" # type: ignore
metrics = {
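A minimal, self-contained sketch (plain Python, not the trainer's actual code) of the key-naming scheme _compute_grouped_metrics adds to the eval metrics: one eval_reward_<group> / eval_reward_std_<group> pair per unique value of the grouping column, plus eval_rewards/<metric>_<group> for each reward-function metric. The toy rewards and the correct_answer metric name are illustrative only.

from statistics import mean, stdev

tasks = ["math", "math", "code", "code"]
rewards = [0.8, 0.6, 0.3, 0.5]
metric_values = {"correct_answer": [1.0, 0.0, 0.0, 1.0]}

# Group rollout indices by task, as the grouping loop above does.
groups: dict[str, list[int]] = {}
for i, task in enumerate(tasks):
    groups.setdefault(task, []).append(i)

grouped: dict[str, float] = {}
for task, indices in groups.items():
    group_rewards = [rewards[i] for i in indices]
    grouped[f"eval_reward_{task}"] = mean(group_rewards)
    grouped[f"eval_reward_std_{task}"] = stdev(group_rewards) if len(group_rewards) > 1 else 0.0
    for name, values in metric_values.items():
        grouped[f"eval_rewards/{name}_{task}"] = mean(values[i] for i in indices)

print(grouped)
# e.g. {'eval_reward_math': ~0.7, 'eval_reward_std_math': ~0.141, 'eval_rewards/correct_answer_math': 0.5, ...}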
141 changes: 120 additions & 21 deletions verifiers/utils/report_utils.py
@@ -199,26 +199,49 @@ def build_report_filename(meta: ReportMeta) -> str:
</tr>
</table>

{% if metrics %}
<h2>Metrics</h2>
<table>
<tr>
<th>metric</th><th>mean</th><th>std</th><th>n</th><th>p5</th><th>p25</th><th>p50</th><th>p75</th><th>p95</th>
</tr>
{% for name, m in metrics.items() %}
<tr>
<td>{{ name }}</td>
<td>{{ m.mean | round(4) }}</td>
<td>{{ m.std | round(4) }}</td>
<td>{{ m.n }}</td>
<td>{{ m.p5 | round(4) }}</td>
<td>{{ m.p25 | round(4) }}</td>
<td>{{ m.p50 | round(4) }}</td>
<td>{{ m.p75 | round(4) }}</td>
<td>{{ m.p95 | round(4) }}</td>
</tr>
{% endfor %}
</table>
{% if grouped_summary %}
<h2>Grouped Rewards</h2>
{% for group_type, groups in grouped_summary.items() %}
{% if group_type != "overall" %}
<h3>Grouped by {{ group_type.replace('grouped_by_', '')|title }}</h3>
{% for group_name, group_summary in groups.items() %}
<h4>{{ group_type.replace('grouped_by_', '')|title }}: {{ group_name }}</h4>
<table>
<tr><th>mean</th><th>std</th><th>n</th><th>p5</th><th>p25</th><th>p50</th><th>p75</th><th>p95</th></tr>
<tr>
<td>{{ group_summary.reward.mean | round(4) }}</td>
<td>{{ group_summary.reward.std | round(4) }}</td>
<td>{{ group_summary.reward.n }}</td>
<td>{{ group_summary.reward.p5 | round(4) }}</td>
<td>{{ group_summary.reward.p25 | round(4) }}</td>
<td>{{ group_summary.reward.p50 | round(4) }}</td>
<td>{{ group_summary.reward.p75 | round(4) }}</td>
<td>{{ group_summary.reward.p95 | round(4) }}</td>
</tr>
</table>
{% if group_summary.metrics %}
<table>
<tr>
<th>metric</th><th>mean</th><th>std</th><th>n</th><th>p5</th><th>p25</th><th>p50</th><th>p75</th><th>p95</th>
</tr>
{% for name, m in group_summary.metrics.items() %}
<tr>
<td>{{ name }}</td>
<td>{{ m.mean | round(4) }}</td>
<td>{{ m.std | round(4) }}</td>
<td>{{ m.n }}</td>
<td>{{ m.p5 | round(4) }}</td>
<td>{{ m.p25 | round(4) }}</td>
<td>{{ m.p50 | round(4) }}</td>
<td>{{ m.p75 | round(4) }}</td>
<td>{{ m.p95 | round(4) }}</td>
</tr>
{% endfor %}
</table>
{% endif %}
{% endfor %}
{% endif %}
{% endfor %}
{% endif %}

<h2>Examples <span class="muted">(showing up to {{ examples|length }} of {{ total_examples }})</span></h2>
@@ -252,6 +275,7 @@ def render_html(
summary: Dict[str, Any],
examples: List[Dict[str, Any]],
total_examples: int,
grouped_summary: Dict[str, Any] | None = None,
) -> str:
template = _env.from_string(_TEMPLATE)
return template.render(
@@ -271,29 +295,104 @@
metrics=summary.get("metrics", {}),
examples=examples,
total_examples=total_examples,
grouped_summary=grouped_summary,
)


def write_html_report(
report_dir: Path,
meta: ReportMeta,
results: GenerateOutputs,
group_by_task: bool = False,
grouping_keys: List[str] | None = None,
) -> Path:
"""Render and write the HTML report next to the environment under `reports/`.

Returns the path to the written HTML file.
Args:
report_dir: Directory to write the report to
meta: Report metadata
results: GenerateOutputs containing evaluation results
group_by_task: Whether to group by task (backward compatibility)
grouping_keys: List of column names to group by (e.g., ['task', 'difficulty'])

Returns:
Path to the written HTML file.
"""
report_dir.mkdir(parents=True, exist_ok=True)

summary = compute_summary(results)
examples = build_examples(results, cap=DETAILED_EXAMPLES_CAP)

# Compute grouped summary if requested
grouped_summary = None
if group_by_task or grouping_keys:
# For backward compatibility, if group_by_task is True but no grouping_keys provided,
# default to grouping by task
if group_by_task and not grouping_keys:
grouping_keys = ["task"]
grouped_summary = compute_grouped_summary(results, grouping_keys)

html = render_html(
meta=meta,
summary=summary,
examples=examples,
total_examples=len(results.reward),
grouped_summary=grouped_summary,
)
filename = build_report_filename(meta)
out_path = report_dir / filename
out_path.write_text(html, encoding="utf-8")
return out_path


def compute_grouped_summary(results: GenerateOutputs, grouping_keys: List[str] | None = None) -> Dict[str, Any]:
"""Compute grouped aggregated statistics from GenerateOutputs by specified subset keys.

Args:
results: GenerateOutputs containing evaluation results
grouping_keys: List of column names to group by (e.g., ['task', 'difficulty'])
If None, defaults to ['task'] for backward compatibility

Returns:
Dictionary with overall summary and grouped summaries by specified keys.
"""
summary: Dict[str, Any] = {}

# Overall summary
summary["overall"] = compute_summary(results)

# Default to task grouping for backward compatibility
if grouping_keys is None:
grouping_keys = ["task"]

# Grouped summaries by specified keys
for grouping_key in grouping_keys:
if hasattr(results, grouping_key) and getattr(results, grouping_key):
key_values = getattr(results, grouping_key)
# Only proceed if we have multiple unique values
if len(set(key_values)) > 1:
key_groups = {}
for i, key_value in enumerate(key_values):
if key_value not in key_groups:
key_groups[key_value] = []
key_groups[key_value].append(i)

grouped_summaries = {}
for key_value, indices in key_groups.items():
# Create a subset of results for this group
subset_results = GenerateOutputs(
prompt=[results.prompt[i] for i in indices],
completion=[results.completion[i] for i in indices],
answer=[results.answer[i] for i in indices],
state=[results.state[i] for i in indices],
info=[results.info[i] for i in indices],
task=[results.task[i] for i in indices],
reward=[results.reward[i] for i in indices],
metrics={k: [results.metrics[k][i] for i in indices] for k in results.metrics}
)
grouped_summaries[key_value] = compute_summary(subset_results)

# Add grouped summaries with the key name to avoid conflicts
summary[f"grouped_by_{grouping_key}"] = grouped_summaries

return summary
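
A short usage sketch for compute_grouped_summary, reading the per-group mean/std/n the same way the eval script's logging block does. As in the earlier sketch, the verifiers.types import path and all field values are assumptions; the field names and the grouped_by_<key> output layout come from this function.

from verifiers.types import GenerateOutputs  # assumed import path
from verifiers.utils.report_utils import compute_grouped_summary

results = GenerateOutputs(
    prompt=["p1", "p2", "p3", "p4"],
    completion=["c1", "c2", "c3", "c4"],
    answer=["a1", "a2", "a3", "a4"],
    state=[{}, {}, {}, {}],
    info=[{}, {}, {}, {}],
    task=["math", "math", "code", "code"],
    reward=[0.8, 0.6, 0.3, 0.5],
    metrics={"correct_answer": [1.0, 0.0, 0.0, 1.0]},  # placeholder metric name
)

summary = compute_grouped_summary(results, grouping_keys=["task"])
for task, stats in summary["grouped_by_task"].items():
    reward_stats = stats["reward"]
    print(f"{task}: mean={reward_stats['mean']:.3f} "
          f"std={reward_stats['std']:.3f} n={reward_stats['n']}")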