
Commit bab3be2

ankursharmas authored and copybara-github committed
feat: Add support for persisting eval run results
If an EvalSetResultsManager is provided to LocalEvalService, persist the eval run results through it.

PiperOrigin-RevId: 782196848
1 parent: 33eec34
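For orientation, here is a minimal sketch of the wiring this commit enables, based on the constructor keyword arguments visible in the test diff below. The Agent import reflects common ADK usage but is an assumption, and the autospec'd mocks stand in for real manager implementations:

    from unittest import mock

    from google.adk.agents import Agent  # assumed import path
    from google.adk.evaluation.eval_set_results_manager import EvalSetResultsManager
    from google.adk.evaluation.eval_sets_manager import EvalSetsManager
    from google.adk.evaluation.local_eval_service import LocalEvalService

    service = LocalEvalService(
        root_agent=Agent(name="root_agent", model="gemini-2.0-flash"),
        eval_sets_manager=mock.create_autospec(EvalSetsManager),
        # New in this commit: when provided, every finished EvalCaseResult is
        # persisted through this manager before being yielded to the caller.
        eval_set_results_manager=mock.create_autospec(EvalSetResultsManager),
    )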

2 files changed (+46 −11 lines)

src/google/adk/evaluation/local_eval_service.py (20 additions, 3 deletions)

@@ -159,12 +159,22 @@ async def run_evaluation(inference_result):
         run_evaluation(inference_result)
         for inference_result in evaluate_request.inference_results
     ]
+
     for evaluation_task in asyncio.as_completed(evaluation_tasks):
-      yield await evaluation_task
+      inference_result, eval_case_result = await evaluation_task
+
+      if self._eval_set_results_manager:
+        self._eval_set_results_manager.save_eval_set_result(
+            app_name=inference_result.app_name,
+            eval_set_id=inference_result.eval_set_id,
+            eval_case_results=[eval_case_result],
+        )
+
+      yield eval_case_result
 
   async def _evaluate_single_inference_result(
       self, inference_result: InferenceResult, evaluate_config: EvaluateConfig
-  ) -> EvalCaseResult:
+  ) -> tuple[InferenceResult, EvalCaseResult]:
     """Returns EvalCaseResult for the given inference result.
 
     A single inference result can have multiple invocations. For each
@@ -267,17 +277,24 @@ async def _evaluate_single_inference_result(
         else 'test_user_id'
     )
 
-    return EvalCaseResult(
+    eval_case_result = EvalCaseResult(
         eval_set_file=inference_result.eval_set_id,
         eval_set_id=inference_result.eval_set_id,
         eval_id=inference_result.eval_case_id,
         final_eval_status=final_eval_status,
         overall_eval_metric_results=overall_eval_metric_results,
         eval_metric_result_per_invocation=eval_metric_result_per_invocation,
         session_id=inference_result.session_id,
+        session_details=await self._session_service.get_session(
+            app_name=inference_result.app_name,
+            user_id=user_id,
+            session_id=inference_result.session_id,
+        ),
         user_id=user_id,
     )
 
+    return (inference_result, eval_case_result)
+
   async def _evaluate_metric(
       self,
       eval_metric: EvalMetric,
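Taken together, the service-side changes keep the streaming contract but add persistence as a side effect: each finished evaluation task now yields an (InferenceResult, EvalCaseResult) pair internally, the result is saved through the configured EvalSetResultsManager (one save_eval_set_result call per case), and only the EvalCaseResult is surfaced to the caller. A consumer-side sketch, assuming the public entry point is an async generator named evaluate that takes the evaluate_request object seen above (both names are inferred from the tests, not confirmed):

    async def consume_results(eval_service, evaluate_request):
      # Results arrive in completion order, not submission order, because the
      # service drains tasks with asyncio.as_completed. Each result has
      # already been persisted by the time it is yielded here.
      async for eval_case_result in eval_service.evaluate(evaluate_request):
        print(eval_case_result.eval_id, eval_case_result.final_eval_status)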

tests/unittests/evaluation/test_local_eval_service.py (26 additions, 8 deletions)

@@ -27,6 +27,7 @@
 from google.adk.evaluation.eval_result import EvalCaseResult
 from google.adk.evaluation.eval_set import EvalCase
 from google.adk.evaluation.eval_set import EvalSet
+from google.adk.evaluation.eval_set_results_manager import EvalSetResultsManager
 from google.adk.evaluation.eval_sets_manager import EvalSetsManager
 from google.adk.evaluation.evaluator import EvalStatus
 from google.adk.evaluation.evaluator import EvaluationResult
@@ -51,13 +52,21 @@ def dummy_agent():
 
 
 @pytest.fixture
-def eval_service(dummy_agent, mock_eval_sets_manager):
+def mock_eval_set_results_manager():
+  return mock.create_autospec(EvalSetResultsManager)
+
+
+@pytest.fixture
+def eval_service(
+    dummy_agent, mock_eval_sets_manager, mock_eval_set_results_manager
+):
   DEFAULT_METRIC_EVALUATOR_REGISTRY.register_evaluator(
       metric_name="fake_metric", evaluator=FakeEvaluator
   )
   return LocalEvalService(
       root_agent=dummy_agent,
       eval_sets_manager=mock_eval_sets_manager,
+      eval_set_results_manager=mock_eval_set_results_manager,
   )
 
 
@@ -90,7 +99,9 @@ def evaluate_invocations(
 
 @pytest.mark.asyncio
 async def test_perform_inference_success(
-    eval_service, dummy_agent, mock_eval_sets_manager
+    eval_service,
+    dummy_agent,
+    mock_eval_sets_manager,
 ):
   eval_set = EvalSet(
       eval_set_id="test_eval_set",
@@ -127,7 +138,9 @@ async def test_perform_inference_success(
 
 @pytest.mark.asyncio
 async def test_perform_inference_with_case_ids(
-    eval_service, dummy_agent, mock_eval_sets_manager
+    eval_service,
+    dummy_agent,
+    mock_eval_sets_manager,
 ):
   eval_set = EvalSet(
       eval_set_id="test_eval_set",
@@ -172,7 +185,8 @@ async def test_perform_inference_with_case_ids(
 
 @pytest.mark.asyncio
 async def test_perform_inference_eval_set_not_found(
-    eval_service, mock_eval_sets_manager
+    eval_service,
+    mock_eval_sets_manager,
 ):
   mock_eval_sets_manager.get_eval_set.return_value = None
 
@@ -188,7 +202,9 @@ async def test_perform_inference_eval_set_not_found(
 
 
 @pytest.mark.asyncio
-async def test_evaluate_success(eval_service, mock_eval_sets_manager):
+async def test_evaluate_success(
+    eval_service, mock_eval_sets_manager, mock_eval_set_results_manager
+):
   inference_results = [
       InferenceResult(
           app_name="test_app",
@@ -224,11 +240,13 @@ async def test_evaluate_success(eval_service, mock_eval_sets_manager):
   assert isinstance(results[0], EvalCaseResult)
   assert isinstance(results[1], EvalCaseResult)
   assert mock_eval_sets_manager.get_eval_case.call_count == 2
+  assert mock_eval_set_results_manager.save_eval_set_result.call_count == 2
 
 
 @pytest.mark.asyncio
 async def test_evaluate_eval_case_not_found(
-    eval_service, mock_eval_sets_manager
+    eval_service,
+    mock_eval_sets_manager,
 ):
   inference_results = [
       InferenceResult(
@@ -256,7 +274,7 @@ async def test_evaluate_eval_case_not_found(
 
 @pytest.mark.asyncio
 async def test_evaluate_single_inference_result(
-    eval_service, mock_eval_sets_manager
+    eval_service, mock_eval_sets_manager, mock_eval_set_results_manager
 ):
   invocation = Invocation(
       user_content=genai_types.Content(
@@ -289,7 +307,7 @@ async def test_evaluate_single_inference_result(
   mock_eval_case.session_input = None
   mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
 
-  result = await eval_service._evaluate_single_inference_result(
+  _, result = await eval_service._evaluate_single_inference_result(
      inference_result=inference_result, evaluate_config=evaluate_config
   )
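One detail worth noting in the test diff: the new fixture uses mock.create_autospec(EvalSetResultsManager), so the mock enforces the real method signatures rather than accepting any call. A standalone illustration of why that matters (the literal argument values here are made up):

    from unittest import mock

    from google.adk.evaluation.eval_set_results_manager import EvalSetResultsManager

    manager = mock.create_autospec(EvalSetResultsManager)
    manager.save_eval_set_result(
        app_name="test_app",
        eval_set_id="test_eval_set",
        eval_case_results=[],
    )
    assert manager.save_eval_set_result.call_count == 1
    # Unlike a bare MagicMock, an autospec'd mock raises TypeError on a call
    # that does not match the real save_eval_set_result signature.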
