PrimeIntellect-ai · ParamThakkar123 · Nov 28, 2025 · Nov 28, 2025
diff --git a/environments/math/math.py b/environments/math/math.py
@@ -2,7 +2,7 @@
 
 
 def load_environment(**kwargs) -> vf.Environment:
-    '''
+    """
     Loads a custom environment.
-    '''
+    """
     raise NotImplementedError("Implement your custom environment here.")
diff --git a/tests/test_eval_cli.py b/tests/test_eval_cli.py
@@ -50,6 +50,8 @@ def _run_cli(monkeypatch, overrides):
         "save_every": -1,
         "save_to_hf_hub": False,
         "hf_hub_dataset_name": "",
+        "include_ids": None,
+        "exclude_ids": None,
     }
     base_args.update(overrides)
     args_namespace = SimpleNamespace(**base_args)
@@ -65,7 +67,10 @@ def _run_cli(monkeypatch, overrides):
     monkeypatch.setattr(vf_eval, "load_endpoints", lambda *_: {})
 
     async def fake_run_evaluation(config):
+        # capture sampling args and id selection fields passed into EvalConfig
         captured["sampling_args"] = dict(config.sampling_args)
+        captured["include_ids"] = list(config.include_ids or [])
+        captured["exclude_ids"] = list(config.exclude_ids or [])
         metadata = _make_metadata(config)
         return GenerateOutputs(
             prompt=[[{"role": "user", "content": "p"}]],
@@ -126,3 +131,17 @@ def test_cli_sampling_args_fill_from_flags_when_missing(monkeypatch):
     assert sa["max_tokens"] == 55
     assert sa["temperature"] == 0.8
     assert sa["enable_thinking"] is True
+
+
+def test_cli_include_ids_parsing(monkeypatch):
+    # include_ids supplied as a JSON array string; exclude_ids left at its default
+    result = _run_cli(monkeypatch, {"include_ids": '["0","1","2"]'})
+    expected = {"include_ids": ["0", "1", "2"], "exclude_ids": []}
+    assert {key: result[key] for key in expected} == expected
+
+
+def test_cli_exclude_ids_parsing(monkeypatch):
+    # exclude_ids supplied as a comma-separated string; include_ids left at its default
+    result = _run_cli(monkeypatch, {"exclude_ids": "3,4,5"})
+    expected = {"include_ids": [], "exclude_ids": ["3", "4", "5"]}
+    assert {key: result[key] for key in expected} == expected
diff --git a/verifiers/scripts/eval.py b/verifiers/scripts/eval.py
@@ -66,6 +66,19 @@ def get_env_eval_defaults(env_id: str) -> Dict[str, Any]:
     return defaults
 
 
+def _parse_ids(s: str | None) -> list[str]:
+    """Parse a comma-separated or JSON-array ID string into a list of ID strings.
+
+    None/blank input gives []; in both formats entries are stringified, trimmed
+    and blank ones dropped. A malformed JSON array raises ValueError."""
+    s = (s or "").strip()  # None (unset) and "" both mean "no IDs"
+    try:
+        vals = json.loads(s) if s.startswith("[") else s.split(",")
+    except json.JSONDecodeError as e:
+        raise ValueError(f"Could not parse IDs JSON array: {e}") from e
+    return [v for v in (str(x).strip() for x in vals) if v]
+
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
@@ -218,6 +231,24 @@ def main():
         default="",
         help="Name of dataset to save to Hugging Face Hub",
     )
+    parser.add_argument(
+        "--include-ids",
+        type=str,
+        default="",
+        help=(
+            "Comma-separated or JSON array of example IDs to include. "
+            "If provided, only these IDs will be evaluated."
+        ),
+    )
+    parser.add_argument(
+        "--exclude-ids",
+        type=str,
+        default="",
+        help=(
+            "Comma-separated or JSON array of example IDs to exclude. "
+            "If provided, these IDs will be skipped during evaluation."
+        ),
+    )
     args = parser.parse_args()
 
     setup_logging("DEBUG" if args.verbose else os.getenv("VF_LOG_LEVEL", "INFO"))
@@ -284,6 +315,13 @@ def main():
             raise ValueError("--header name cannot be empty")
         merged_headers[k] = v
 
+    include_ids = _parse_ids(args.include_ids)
+    exclude_ids = _parse_ids(args.exclude_ids)
+    if include_ids and exclude_ids:
+        logger.warning(
+            "--include-ids and --exclude-ids both provided; exclude_ids will be ignored"
+        )
+
     client_config = ClientConfig(
         api_key_var=api_key_var,
         api_base_url=api_base_url,
@@ -305,6 +343,9 @@ def main():
         max_concurrent=args.max_concurrent,
         max_concurrent_generation=args.max_concurrent_generation,
         max_concurrent_scoring=args.max_concurrent_scoring,
+        # id selection
+        include_ids=include_ids,
+        exclude_ids=exclude_ids,
         # logging
         print_results=True,
         verbose=args.verbose,

diff --git a/verifiers/types.py b/verifiers/types.py
@@ -234,3 +234,5 @@ class EvalConfig(BaseModel):
     save_every: int = -1
     save_to_hf_hub: bool = False
     hf_hub_dataset_name: str | None = None
+    include_ids: list[str] | None = None
+    exclude_ids: list[str] | None = None