From ca472581c51358109baa9132233d0be0becd4eb5 Mon Sep 17 00:00:00 2001
From: paramthakkar123 <paramthakkar864@gmail.com>
Date: Fri, 28 Nov 2025 09:28:09 +0530
Subject: [PATCH 1/2] Added functionality in vf-eval to include or exclude
 specific IDs from dataset

---
 environments/math/math.py |  4 ++--
 verifiers/scripts/eval.py | 41 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/environments/math/math.py b/environments/math/math.py
index f63e6cc02..cff3579d0 100644
--- a/environments/math/math.py
+++ b/environments/math/math.py
@@ -2,7 +2,7 @@
 
 
 def load_environment(**kwargs) -> vf.Environment:
-    '''
+    """
     Loads a custom environment.
-    '''
+    """
     raise NotImplementedError("Implement your custom environment here.")
diff --git a/verifiers/scripts/eval.py b/verifiers/scripts/eval.py
index f4ca29638..30d230343 100644
--- a/verifiers/scripts/eval.py
+++ b/verifiers/scripts/eval.py
@@ -66,6 +66,19 @@ def get_env_eval_defaults(env_id: str) -> Dict[str, Any]:
     return defaults
 
 
+def _parse_ids(s: str) -> list[str]:
+    """Parse comma-separated or JSON-array ID text into a list of strings."""
+    text = s.strip() if s else ""
+    if not text.startswith("["):
+        # Plain comma-separated form; blank entries are dropped.
+        return [tok for tok in map(str.strip, text.split(",")) if tok]
+    try:
+        items = json.loads(text)
+    except json.JSONDecodeError as err:
+        raise ValueError(f"Could not parse IDs JSON array: {err}") from err
+    return [str(item) for item in items]
+
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
@@ -218,6 +231,24 @@ def main():
         default="",
         help="Name of dataset to save to Hugging Face Hub",
     )
+    parser.add_argument(
+        "--include-ids",
+        type=str,
+        default="",
+        help=(
+            "Comma-separated or JSON array of example IDs to include. "
+            "If provided, only these IDs will be evaluated."
+        ),
+    )
+    parser.add_argument(
+        "--exclude-ids",
+        type=str,
+        default="",
+        help=(
+            "Comma-separated or JSON array of example IDs to exclude. "
+            "If provided, these IDs will be skipped during evaluation."
+        ),
+    )
     args = parser.parse_args()
 
     setup_logging("DEBUG" if args.verbose else os.getenv("VF_LOG_LEVEL", "INFO"))
@@ -284,6 +315,13 @@ def main():
             raise ValueError("--header name cannot be empty")
         merged_headers[k] = v
 
+    include_ids = _parse_ids(args.include_ids)
+    exclude_ids = _parse_ids(args.exclude_ids)
+    if include_ids and exclude_ids:
+        logger.warning(
+            "--include-ids and --exclude-ids both provided; exclude_ids will be ignored"
+        )
+        exclude_ids = []  # make the warning true: inclusion wins over exclusion
     client_config = ClientConfig(
         api_key_var=api_key_var,
         api_base_url=api_base_url,
@@ -305,6 +343,9 @@ def main():
         max_concurrent=args.max_concurrent,
         max_concurrent_generation=args.max_concurrent_generation,
         max_concurrent_scoring=args.max_concurrent_scoring,
+        # id selection
+        include_ids=include_ids,
+        exclude_ids=exclude_ids,
         # logging
         print_results=True,
         verbose=args.verbose,

From 634a633c53b4e9ebcc513e9925afa6b3728df241 Mon Sep 17 00:00:00 2001
From: paramthakkar123 <paramthakkar864@gmail.com>
Date: Fri, 28 Nov 2025 09:48:19 +0530
Subject: [PATCH 2/2] Add include_ids/exclude_ids to EvalConfig and CLI parsing tests

---
 tests/test_eval_cli.py | 19 +++++++++++++++++++
 verifiers/types.py     |  2 ++
 2 files changed, 21 insertions(+)

diff --git a/tests/test_eval_cli.py b/tests/test_eval_cli.py
index 53337d077..4359a5777 100644
--- a/tests/test_eval_cli.py
+++ b/tests/test_eval_cli.py
@@ -50,6 +50,8 @@ def _run_cli(monkeypatch, overrides):
         "save_every": -1,
         "save_to_hf_hub": False,
         "hf_hub_dataset_name": "",
+        "include_ids": None,
+        "exclude_ids": None,
     }
     base_args.update(overrides)
     args_namespace = SimpleNamespace(**base_args)
@@ -65,7 +67,10 @@ def _run_cli(monkeypatch, overrides):
     monkeypatch.setattr(vf_eval, "load_endpoints", lambda *_: {})
 
     async def fake_run_evaluation(config):
+        # capture sampling args and id selection fields passed into EvalConfig
         captured["sampling_args"] = dict(config.sampling_args)
+        captured["include_ids"] = list(config.include_ids or [])
+        captured["exclude_ids"] = list(config.exclude_ids or [])
         metadata = _make_metadata(config)
         return GenerateOutputs(
             prompt=[[{"role": "user", "content": "p"}]],
@@ -126,3 +131,17 @@ def test_cli_sampling_args_fill_from_flags_when_missing(monkeypatch):
     assert sa["max_tokens"] == 55
     assert sa["temperature"] == 0.8
     assert sa["enable_thinking"] is True
+
+
+def test_cli_include_ids_parsing(monkeypatch):
+    """A JSON-array include_ids argument is parsed into a list of string IDs."""
+    overrides = {"include_ids": '["0","1","2"]'}
+    result = _run_cli(monkeypatch, overrides)
+    assert (result["include_ids"], result["exclude_ids"]) == (["0", "1", "2"], [])
+
+
+def test_cli_exclude_ids_parsing(monkeypatch):
+    """A comma-separated exclude_ids argument is split into a list of string IDs."""
+    overrides = {"exclude_ids": "3,4,5"}
+    result = _run_cli(monkeypatch, overrides)
+    assert (result["include_ids"], result["exclude_ids"]) == ([], ["3", "4", "5"])
diff --git a/verifiers/types.py b/verifiers/types.py
index 1a3125075..86fade1e6 100644
--- a/verifiers/types.py
+++ b/verifiers/types.py
@@ -234,3 +234,5 @@ class EvalConfig(BaseModel):
     save_every: int = -1
     save_to_hf_hub: bool = False
     hf_hub_dataset_name: str | None = None
+    include_ids: list[str] | None = None
+    exclude_ids: list[str] | None = None