Skip to content

Commit b5b3475

Browse files
committed
Added unit tests and fixed handling of a null (None) expected retrieved_data
1 parent d88ef07 commit b5b3475

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

43 files changed

+34012
-20
lines changed

.github/workflows/lint.yml

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3,8 +3,6 @@ name: Lint
33
on:
44
pull_request:
55
branches: [main]
6-
push:
7-
branches: [main]
86

97
jobs:
108
lint:

.github/workflows/test-env-ctrl.yml

Lines changed: 0 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -7,12 +7,6 @@ on:
77
- "packages/environment_control/**"
88
- "tests/integration/environment_control/**"
99
- "src/webarena_verified/environments/env_ctrl_client/**"
10-
push:
11-
branches: [main]
12-
paths:
13-
- "packages/environment_control/**"
14-
- "tests/integration/environment_control/**"
15-
- "src/webarena_verified/environments/env_ctrl_client/**"
1610

1711
jobs:
1812
test:

src/webarena_verified/core/evaluation/evaluators/agent_response_evaluator.py

Lines changed: 28 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -96,6 +96,10 @@ def _normalized_actual_value(
9696
k = "performed_operation" # Support legacy field name
9797

9898
if k not in value:
99+
if k == "retrieved_data":
100+
# When retrieved_data key is missing, treat it as None for comparison
101+
# This allows missing key to match expected None (e.g., NOT_FOUND_ERROR tasks)
102+
_normalized_values[k] = None
99103
continue
100104

101105
if k == "retrieved_data":
@@ -179,11 +183,22 @@ def _compare_values( # type: ignore[override]
179183
return assertions
180184

181185
expected_retrieved_data = expected_normalized.get("retrieved_data", None)
182-
if expected_retrieved_data is None:
183-
raise ValueError("Expected retrieved_data must be set in config for retrieve tasks.")
184186

185-
# Handle None actual_retrieved_data - should fail if expected is not None
186-
if actual_retrieved_data is None and expected_retrieved_data is not None:
187+
if expected_retrieved_data is None and actual_retrieved_data is None:
188+
# Both None - success, no data expected and none provided
189+
return assertions
190+
elif expected_retrieved_data is None and actual_retrieved_data is not None:
191+
# Expected None but got data - failure
192+
assertions.append(
193+
EvalAssertion.create(
194+
assertion_name="retrieved_data_unexpected",
195+
status=EvalStatus.FAILURE,
196+
assertion_msgs=[f"Expected no retrieved_data, but got {actual_retrieved_data}"],
197+
)
198+
)
199+
return assertions
200+
elif actual_retrieved_data is None:
201+
# Expected data but got None - failure
187202
assertions.append(
188203
EvalAssertion.create(
189204
assertion_name="retrieved_data_missing_or_null",
@@ -192,14 +207,15 @@ def _compare_values( # type: ignore[override]
192207
)
193208
)
194209
return assertions
195-
196-
assertions.extend(
197-
self.value_comparator.compare(
198-
expected=tuple(expected_retrieved_data),
199-
actual=tuple(actual_retrieved_data),
200-
value_name="retrieved_data",
201-
ordered=config.ordered,
210+
else:
211+
# Both have data - compare them
212+
assertions.extend(
213+
self.value_comparator.compare(
214+
expected=tuple(expected_retrieved_data),
215+
actual=tuple(actual_retrieved_data),
216+
value_name="retrieved_data",
217+
ordered=config.ordered,
218+
)
202219
)
203-
)
204220

205221
return assertions

tests/api/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
"""Tests for webarena_verified.api module."""

0 commit comments

Comments
 (0)