Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions environments/math/math.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@


def load_environment(**kwargs) -> vf.Environment:
'''
"""
Loads a custom environment.
'''
"""
raise NotImplementedError("Implement your custom environment here.")
19 changes: 19 additions & 0 deletions tests/test_eval_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ def _run_cli(monkeypatch, overrides):
"save_every": -1,
"save_to_hf_hub": False,
"hf_hub_dataset_name": "",
"include_ids": None,
"exclude_ids": None,
}
base_args.update(overrides)
args_namespace = SimpleNamespace(**base_args)
Expand All @@ -65,7 +67,10 @@ def _run_cli(monkeypatch, overrides):
monkeypatch.setattr(vf_eval, "load_endpoints", lambda *_: {})

async def fake_run_evaluation(config):
# capture sampling args and id selection fields passed into EvalConfig
captured["sampling_args"] = dict(config.sampling_args)
captured["include_ids"] = list(config.include_ids or [])
captured["exclude_ids"] = list(config.exclude_ids or [])
metadata = _make_metadata(config)
return GenerateOutputs(
prompt=[[{"role": "user", "content": "p"}]],
Expand Down Expand Up @@ -126,3 +131,17 @@ def test_cli_sampling_args_fill_from_flags_when_missing(monkeypatch):
assert sa["max_tokens"] == 55
assert sa["temperature"] == 0.8
assert sa["enable_thinking"] is True


def test_cli_include_ids_parsing(monkeypatch):
    """A JSON-array ``include_ids`` value is forwarded as a list of ID strings."""
    overrides = {"include_ids": '["0","1","2"]'}
    result = _run_cli(monkeypatch, overrides)

    expected_included = ["0", "1", "2"]
    assert result["include_ids"] == expected_included
    # Nothing was passed for exclude_ids, so the captured list stays empty.
    assert result["exclude_ids"] == []


def test_cli_exclude_ids_parsing(monkeypatch):
    """A comma-separated ``exclude_ids`` value is forwarded as a list of ID strings."""
    overrides = {"exclude_ids": "3,4,5"}
    result = _run_cli(monkeypatch, overrides)

    # Nothing was passed for include_ids, so the captured list stays empty.
    assert result["include_ids"] == []
    expected_excluded = ["3", "4", "5"]
    assert result["exclude_ids"] == expected_excluded
41 changes: 41 additions & 0 deletions verifiers/scripts/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,19 @@ def get_env_eval_defaults(env_id: str) -> Dict[str, Any]:
return defaults


def _parse_ids(s: str) -> list[str]:
if not s:
return []
s = s.strip()
if s.startswith("["):
try:
vals = json.loads(s)
return [str(v) for v in vals]
except json.JSONDecodeError as e:
raise ValueError(f"Could not parse IDs JSON array: {e}") from e
return [p.strip() for p in s.split(",") if p.strip()]


def main():
parser = argparse.ArgumentParser()
parser.add_argument(
Expand Down Expand Up @@ -218,6 +231,24 @@ def main():
default="",
help="Name of dataset to save to Hugging Face Hub",
)
parser.add_argument(
"--include-ids",
type=str,
default="",
help=(
"Comma-separated or JSON array of example IDs to include. "
"If provided, only these IDs will be evaluated."
),
)
parser.add_argument(
"--exclude-ids",
type=str,
default="",
help=(
"Comma-separated or JSON array of example IDs to exclude. "
"If provided, these IDs will be skipped during evaluation."
),
)
args = parser.parse_args()

setup_logging("DEBUG" if args.verbose else os.getenv("VF_LOG_LEVEL", "INFO"))
Expand Down Expand Up @@ -284,6 +315,13 @@ def main():
raise ValueError("--header name cannot be empty")
merged_headers[k] = v

include_ids = _parse_ids(args.include_ids)
exclude_ids = _parse_ids(args.exclude_ids)
if include_ids and exclude_ids:
logger.warning(
"--include-ids and --exclude-ids both provided; exclude_ids will be ignored"
)

client_config = ClientConfig(
api_key_var=api_key_var,
api_base_url=api_base_url,
Expand All @@ -305,6 +343,9 @@ def main():
max_concurrent=args.max_concurrent,
max_concurrent_generation=args.max_concurrent_generation,
max_concurrent_scoring=args.max_concurrent_scoring,
# id selection
include_ids=include_ids,
exclude_ids=exclude_ids,
# logging
print_results=True,
verbose=args.verbose,
Expand Down
2 changes: 2 additions & 0 deletions verifiers/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,3 +234,5 @@ class EvalConfig(BaseModel):
save_every: int = -1
save_to_hf_hub: bool = False
hf_hub_dataset_name: str | None = None
include_ids: list[str] | None = None
exclude_ids: list[str] | None = None
Loading