From ca472581c51358109baa9132233d0be0becd4eb5 Mon Sep 17 00:00:00 2001
From: paramthakkar123
Date: Fri, 28 Nov 2025 09:28:09 +0530
Subject: [PATCH 1/2] Added functionality in vf-eval to include or exclude
 specific IDs from dataset

---
 environments/math/math.py |  4 ++--
 verifiers/scripts/eval.py | 51 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/environments/math/math.py b/environments/math/math.py
index f63e6cc02..cff3579d0 100644
--- a/environments/math/math.py
+++ b/environments/math/math.py
@@ -2,7 +2,7 @@
 
 
 def load_environment(**kwargs) -> vf.Environment:
-    '''
+    """
     Loads a custom environment.
-    '''
+    """
     raise NotImplementedError("Implement your custom environment here.")
diff --git a/verifiers/scripts/eval.py b/verifiers/scripts/eval.py
index f4ca29638..30d230343 100644
--- a/verifiers/scripts/eval.py
+++ b/verifiers/scripts/eval.py
@@ -66,6 +66,27 @@ def get_env_eval_defaults(env_id: str) -> Dict[str, Any]:
     return defaults
 
 
+def _parse_ids(s: str | None) -> list[str]:
+    """Parse a CLI ID selector into a list of string IDs.
+
+    Accepts either a JSON array (e.g. '["0", "1"]') or a comma-separated
+    string (e.g. "0,1"). Empty or missing input yields an empty list.
+
+    Raises:
+        ValueError: If the input starts with "[" but is not a valid JSON array.
+    """
+    if not s:
+        return []
+    s = s.strip()
+    if s.startswith("["):
+        try:
+            vals = json.loads(s)
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Could not parse IDs JSON array: {e}") from e
+        return [str(v) for v in vals]
+    return [p.strip() for p in s.split(",") if p.strip()]
+
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
@@ -218,6 +239,24 @@ def main():
         default="",
         help="Name of dataset to save to Hugging Face Hub",
     )
+    parser.add_argument(
+        "--include-ids",
+        type=str,
+        default="",
+        help=(
+            "Comma-separated or JSON array of example IDs to include. "
+            "If provided, only these IDs will be evaluated."
+        ),
+    )
+    parser.add_argument(
+        "--exclude-ids",
+        type=str,
+        default="",
+        help=(
+            "Comma-separated or JSON array of example IDs to exclude. "
+            "If provided, these IDs will be skipped during evaluation."
+        ),
+    )
     args = parser.parse_args()
 
     setup_logging("DEBUG" if args.verbose else os.getenv("VF_LOG_LEVEL", "INFO"))
@@ -284,6 +323,15 @@ def main():
             raise ValueError("--header name cannot be empty")
         merged_headers[k] = v
 
+    include_ids = _parse_ids(args.include_ids)
+    exclude_ids = _parse_ids(args.exclude_ids)
+    if include_ids and exclude_ids:
+        logger.warning(
+            "--include-ids and --exclude-ids both provided; exclude_ids will be ignored"
+        )
+        # Make the warning true: drop exclude_ids so only include_ids is forwarded.
+        exclude_ids = []
+
     client_config = ClientConfig(
         api_key_var=api_key_var,
         api_base_url=api_base_url,
@@ -305,6 +353,9 @@ def main():
         max_concurrent=args.max_concurrent,
         max_concurrent_generation=args.max_concurrent_generation,
         max_concurrent_scoring=args.max_concurrent_scoring,
+        # id selection
+        include_ids=include_ids,
+        exclude_ids=exclude_ids,
         # logging
         print_results=True,
         verbose=args.verbose,

From 634a633c53b4e9ebcc513e9925afa6b3728df241 Mon Sep 17 00:00:00 2001
From: paramthakkar123
Date: Fri, 28 Nov 2025 09:48:19 +0530
Subject: [PATCH 2/2] Add include_ids/exclude_ids to EvalConfig and test CLI
 parsing

---
 tests/test_eval_cli.py | 26 ++++++++++++++++++++++++++
 verifiers/types.py     |  3 +++
 2 files changed, 29 insertions(+)

diff --git a/tests/test_eval_cli.py b/tests/test_eval_cli.py
index 53337d077..4359a5777 100644
--- a/tests/test_eval_cli.py
+++ b/tests/test_eval_cli.py
@@ -50,6 +50,8 @@ def _run_cli(monkeypatch, overrides):
         "save_every": -1,
         "save_to_hf_hub": False,
         "hf_hub_dataset_name": "",
+        "include_ids": None,
+        "exclude_ids": None,
     }
     base_args.update(overrides)
     args_namespace = SimpleNamespace(**base_args)
@@ -65,7 +67,10 @@ def _run_cli(monkeypatch, overrides):
     monkeypatch.setattr(vf_eval, "load_endpoints", lambda *_: {})
 
     async def fake_run_evaluation(config):
+        # capture sampling args and id selection fields passed into EvalConfig
         captured["sampling_args"] = dict(config.sampling_args)
+        captured["include_ids"] = list(config.include_ids or [])
+        captured["exclude_ids"] = list(config.exclude_ids or [])
         metadata = _make_metadata(config)
         return GenerateOutputs(
             prompt=[[{"role": "user", "content": "p"}]],
@@ -126,3 +131,24 @@ def test_cli_sampling_args_fill_from_flags_when_missing(monkeypatch):
     assert sa["max_tokens"] == 55
     assert sa["temperature"] == 0.8
     assert sa["enable_thinking"] is True
+
+
+def test_cli_include_ids_parsing(monkeypatch):
+    # JSON array input
+    captured = _run_cli(monkeypatch, {"include_ids": '["0","1","2"]'})
+    assert captured["include_ids"] == ["0", "1", "2"]
+    assert captured["exclude_ids"] == []
+
+
+def test_cli_exclude_ids_parsing(monkeypatch):
+    # Comma-separated input
+    captured = _run_cli(monkeypatch, {"exclude_ids": "3,4,5"})
+    assert captured["include_ids"] == []
+    assert captured["exclude_ids"] == ["3", "4", "5"]
+
+
+def test_cli_include_ids_take_precedence_over_exclude_ids(monkeypatch):
+    # When both are given, exclude_ids is dropped as the CLI warning states
+    captured = _run_cli(monkeypatch, {"include_ids": "0,1", "exclude_ids": "1,2"})
+    assert captured["include_ids"] == ["0", "1"]
+    assert captured["exclude_ids"] == []
diff --git a/verifiers/types.py b/verifiers/types.py
index 1a3125075..86fade1e6 100644
--- a/verifiers/types.py
+++ b/verifiers/types.py
@@ -234,3 +234,6 @@ class EvalConfig(BaseModel):
     save_every: int = -1
     save_to_hf_hub: bool = False
     hf_hub_dataset_name: str | None = None
+    # example-ID filters forwarded by vf-eval (--include-ids / --exclude-ids)
+    include_ids: list[str] | None = None
+    exclude_ids: list[str] | None = None