fix(eval): handle unevaluated final response v2 results

pragnyanramtha · haranrk · copybara-github · commit 5cfef0173d35 · 2026-06-17T11:02:09.000-07:00
Merge #5728

## Summary

Fixes a small aggregation edge case in `FinalResponseMatchV2Evaluator`: when every per-invocation result is skipped or not evaluated, the evaluator currently divides by zero while computing the overall score.

## Root Cause

`aggregate_invocation_results()` filters out results whose `score` is `None` or whose `eval_status` is `NOT_EVALUATED`, but it unconditionally computes:

```python
overall_score = num_valid / num_evaluated
```

If all judge samples fail to produce a usable score, `num_evaluated` remains `0` and evaluation crashes instead of returning a not-evaluated aggregate result. Other ADK evaluators handle this condition by returning `overall_score=None` and `overall_eval_status=NOT_EVALUATED`.

## Change

- Return an `EvaluationResult` with `overall_score=None` and `overall_eval_status=NOT_EVALUATED` when no FinalResponseMatchV2 invocation results are evaluable.
- Add a focused regression test for all-skipped/all-not-evaluated invocation results.

## Validation

```bash
uv sync --extra test
uv run pytest tests/unittests/evaluation/test_final_response_match_v2.py
```

Result: `18 passed, 20 warnings`. Full unit suite was not run; this patch is limited to FinalResponseMatchV2 aggregation and its targeted unit test file.

Co-authored-by: Haran Rajkumar <haranrk@google.com>
COPYBARA_INTEGRATE_REVIEW=#5728 from pragnyanramtha:pragnyan/final-response-v2-no-eval-guard 3d5ab73
PiperOrigin-RevId: 933818272
diff --git a/src/google/adk/evaluation/final_response_match_v2.py b/src/google/adk/evaluation/final_response_match_v2.py
@@ -237,6 +237,14 @@ def aggregate_invocation_results(
         continue
       num_evaluated += 1
       num_valid += result.score
+
+    if num_evaluated == 0:
+      return EvaluationResult(
+          overall_score=None,
+          overall_eval_status=EvalStatus.NOT_EVALUATED,
+          per_invocation_results=per_invocation_results,
+      )
+
     overall_score = num_valid / num_evaluated
     return EvaluationResult(
         overall_score=overall_score,
diff --git a/tests/unittests/evaluation/test_final_response_match_v2.py b/tests/unittests/evaluation/test_final_response_match_v2.py
@@ -561,3 +561,34 @@ def test_aggregate_invocation_results():
   # Only 4 / 8 invocations are evaluated, and 2 / 4 are valid.
   assert aggregated_result.overall_score == 0.5
   assert aggregated_result.overall_eval_status == EvalStatus.PASSED
+
+
+def test_aggregate_invocation_results_none_evaluated():
+  evaluator = _create_test_evaluator_gemini(threshold=0.5)
+  # Regression case: both results below are filtered out by aggregation.
+  actual_invocation, expected_invocation = _create_test_invocations(
+      "candidate text", "reference text"
+  )
+
+  per_invocation_results = [
+      PerInvocationResult(
+          actual_invocation=actual_invocation,
+          expected_invocation=expected_invocation,
+          score=None,  # Excluded from aggregation: no score.
+          eval_status=EvalStatus.NOT_EVALUATED,
+      ),
+      PerInvocationResult(
+          actual_invocation=actual_invocation,
+          expected_invocation=expected_invocation,
+          score=1.0,  # Scored, but the status below still excludes it.
+          eval_status=EvalStatus.NOT_EVALUATED,
+      ),
+  ]
+
+  aggregated_result = evaluator.aggregate_invocation_results(
+      per_invocation_results
+  )
+  # The aggregate must be NOT_EVALUATED rather than a division by zero.
+  assert aggregated_result.overall_score is None
+  assert aggregated_result.overall_eval_status == EvalStatus.NOT_EVALUATED
+  assert aggregated_result.per_invocation_results == per_invocation_results