26 commits
2e80392
WIP: Add multi eval support. Add prime-cli integration.
d42me Oct 3, 2025
98e11e5
Merge branch 'main' into feature/multi-evals-and-hub-integration
d42me Oct 3, 2025
0ab3ac7
Add check if env exists. Update samplings args to be per each env.
d42me Oct 6, 2025
00bad26
Add support for multi chained args.
d42me Oct 6, 2025
6129d8d
Update docs. Update eval command and allow custom eval config files.
d42me Oct 8, 2025
fe319b7
Format file.
d42me Oct 8, 2025
0907b6e
Fix ty style.
d42me Oct 8, 2025
b43c21b
Add prime metadata annotation.
d42me Oct 8, 2025
36c39d4
Add prime evals dep. Fix cli flags.
d42me Oct 9, 2025
cdb280c
Update examples.
d42me Oct 9, 2025
fd1f5b1
Improvements.
d42me Oct 9, 2025
eb90110
Update readme. Fix default vals. Update gitignore.
d42me Oct 9, 2025
adbb08e
Update wording.
d42me Oct 9, 2025
8317586
Remove temperature param.
d42me Oct 9, 2025
96f34a6
Revert save to disk feature.
d42me Oct 9, 2025
a6caf26
Revert push to hf hub.
d42me Oct 9, 2025
244a235
Update env hub id check.
d42me Oct 10, 2025
e37031a
Pass environments with version id.
d42me Oct 10, 2025
9530c95
Add sanitization for multimodal prompt. Add missing props.
d42me Oct 10, 2025
8cb3ae5
Add sanitization for multimodal media.
d42me Oct 10, 2025
d1b7a8b
Merge branch 'main' into feature/multi-evals-and-hub-integration
d42me Oct 13, 2025
1eb2c73
Add per-rollout breakdown. Add env id to tqdm bar description.
d42me Oct 13, 2025
4dd7c73
Fix optional package import.
d42me Oct 14, 2025
4fb51fe
Fix multi eval test.
d42me Oct 14, 2025
5a0e347
Fix assert time.
d42me Oct 14, 2025
04a5111
Format message utils.
d42me Oct 14, 2025
1 change: 1 addition & 0 deletions .gitignore
@@ -4,6 +4,7 @@ venv/
env/
.env
.env.local
**/.env-metadata.json
uv.lock
.claude/
.cursorrules
107 changes: 106 additions & 1 deletion README.md
@@ -90,7 +90,7 @@ uv run vf-install environment-name # -p /path/to/environments (defaults to "./en

To install an Environment module from this repo's `environments` folder, do:
```bash
uv run vf-install math-python --from-repo # -b branch_or_commit (defaults to "main")
uv run vf-install wordle --from-repo # -b branch_or_commit (defaults to "main")
```

Once an Environment module is installed, you can create an instance of the Environment using `load_environment`, passing any necessary args:
@@ -105,6 +105,111 @@ uv run vf-eval environment-name -s # run and save eval results locally
# vf-eval -h for config options; defaults to gpt-4.1-mini, 5 prompts, 3 rollouts for each
```

### Multi-Environment Evaluation

`vf-eval` supports evaluating multiple environments in parallel, which is useful for benchmarking models across multiple tasks.

#### Simple CLI Usage

For quick evaluations with global settings applied to all environments:

```bash
# Single environment
uv run vf-eval gsm8k --num-examples 10 --model gpt-4o

# Multiple environments with shared settings
uv run vf-eval gsm8k wordle --num-examples 100 --model gpt-4o-mini --temperature 0.7
```

#### Per-Environment CLI Configuration

For different settings per environment without a config file:

```bash
# Different num_examples and rollouts per environment
uv run vf-eval --env id=gsm8k,num_examples=100 \
--env id=wordle,num_examples=50,rollouts_per_example=5 \
--model gpt-4o

# Different temperatures per environment
uv run vf-eval --env id=gsm8k,temperature=0.9 \
--env id=gpqa,temperature=0.7,rollouts_per_example=10 \
--model gpt-4o --save-to-env-hub
```

The `--env` flag supports: `id`, `num_examples`, `rollouts_per_example`, `max_concurrent`, `temperature`, and other JSON-compatible values.
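
For instance, the listed keys can be combined freely within a single `--env` entry; a sketch with illustrative values (note that `max_concurrent`, listed above, is not shown in the earlier examples):

```bash
uv run vf-eval --env id=gsm8k,num_examples=25,max_concurrent=16 \
               --env id=wordle,rollouts_per_example=5,temperature=0.4 \
               --model gpt-4o-mini
```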

#### Config File Approach (Recommended)

For per-environment customization, use a TOML or JSON config file:

```toml
# eval_config.toml
environment_ids = ["gsm8k", "wordle"]

# Global defaults
num_examples = 10
rollouts_per_example = 3
max_concurrent = 32

[model]
name = "gpt-5"

[sampling]
max_tokens = 2048

# Per-environment overrides
[env.wordle]
num_examples = 50
rollouts_per_example = 3
use_think = true # env-specific init arg
```
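
The same settings can also be written as JSON; a sketch of the equivalent structure, assuming the keys map one-to-one to the TOML layout above:

```json
{
  "environment_ids": ["gsm8k", "wordle"],
  "num_examples": 10,
  "rollouts_per_example": 3,
  "max_concurrent": 32,
  "model": { "name": "gpt-5" },
  "sampling": { "max_tokens": 2048 },
  "env": {
    "wordle": { "num_examples": 50, "rollouts_per_example": 3, "use_think": true }
  }
}
```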

Run with:
```bash
# Use config file
vf-eval --config eval_config.toml

# Config file with CLI overrides (CLI takes precedence)
vf-eval --config eval_config.toml --num-examples 20 --model gpt-4o
```

See [`examples/eval_config_example.toml`](examples/eval_config_example.toml) for a complete example with all available options.

#### Environment Hub Integration

**[CLOSED BETA]** Push evaluation results to Environment Hub for tracking (requires prime eval permissions):

```bash
vf-eval --config eval_config.toml --save-to-env-hub
```

This requires `prime-evals` to be installed and an authenticated Prime Intellect account. Install it with `pip install prime-evals` or `uv pip install prime-evals`.

You can also use multi-environment evaluation programmatically:

```python
import asyncio

from openai import AsyncOpenAI
from verifiers.scripts.eval import eval_environments_parallel


async def main():
    client = AsyncOpenAI(api_key="...", base_url="http://localhost:8000/v1")

    results = await eval_environments_parallel(
        envs=["gsm8k", "wordle"],
        env_args_dict={"gsm8k": {}, "wordle": {}},
        client=client,
        model="gpt-4o-mini",
        num_examples=[100, 50],
        rollouts_per_example=[3, 3],
        max_concurrent=[32, 32],
        sampling_args={"temperature": 0.7, "max_tokens": 2048},
    )

    for env, output in results.items():
        print(f"{env}: avg_reward={sum(output.reward) / len(output.reward):.3f}")


asyncio.run(main())
```
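
For per-environment sampling overrides, `eval_environments_parallel` also accepts a `sampling_args_dict` mapping environment names to their own sampling parameters; see [`examples/multi_env_eval_example.py`](examples/multi_env_eval_example.py) for a worked example.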

If you're using Prime Intellect infrastructure, the [`prime` CLI](https://github.com/PrimeIntellect-ai/prime-cli) provides first-class commands for working with Verifiers environments through the [Environments Hub](https://docs.primeintellect.ai/tutorials-environments/environments). Install it with `uv tool install prime`, authenticate via `prime login`, then use `prime env push` to publish your package and `prime env install owner/name` (optionally pinning a version) to consume it from pods or local machines.
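
For reference, a sketch of that flow as shell commands (`owner/name` is a placeholder, and the exact arguments to `prime env push` depend on your environment package):

```bash
uv tool install prime          # install the prime CLI
prime login                    # authenticate with Prime Intellect
prime env push                 # publish your environment package to the Environments Hub
prime env install owner/name   # install a published environment (optionally pin a version)
```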

The core elements of Environments are:
33 changes: 33 additions & 0 deletions examples/eval_config_example.toml
@@ -0,0 +1,33 @@
# This config demonstrates per-environment parameter customization

# List of environments to evaluate
environment_ids = ["gsm8k"]

# Global defaults (applied to all environments unless overridden)
num_examples = 10
rollouts_per_example = 3
max_concurrent = 32

# Model configuration
[model]
name = "gpt-4o-mini"

# Global sampling parameters
[sampling]
temperature = 0.7
max_tokens = 2048

# API configuration (optional)
api_key_var = "OPENAI_API_KEY"
api_base_url = "https://api.openai.com/v1"

# Per-environment configuration
[env.gsm8k]
num_examples = 20
rollouts_per_example = 2
max_concurrent = 16

max_tokens = 512
use_think = true
num_eval_examples = 20

184 changes: 184 additions & 0 deletions examples/multi_env_eval_example.py
@@ -0,0 +1,184 @@
import asyncio
from openai import AsyncOpenAI
import os

from verifiers.scripts.eval import eval_environments_parallel, push_eval_to_env_hub


async def example_multi_env_eval():
"""Example: Evaluate multiple environments in parallel."""

client = AsyncOpenAI(
api_key=os.getenv("OPENAI_API_KEY", "your-api-key"),
base_url="https://api.openai.com/v1",
)

envs = ["gsm8k", "wordle"]

# Run parallel evaluation
results = await eval_environments_parallel(
envs=envs,
env_args_dict={
"gsm8k": {},
"wordle": {},
},
client=client,
model="gpt-4o-mini",
num_examples=[10, 10],
rollouts_per_example=[3, 3],
max_concurrent=[32, 32],
sampling_args={
"temperature": 0.7,
"max_tokens": 2048,
},
)

# Process results
for env_name, output in results.items():
print(f"\n=== {env_name} ===")
print(f"Number of samples: {len(output.reward)}")
print(f"Average reward: {sum(output.reward) / len(output.reward):.3f}")
print(f"Rewards: {output.reward[:5]}...") # Show first 5

# Show metrics if available
if output.metrics:
for metric_name, metric_values in output.metrics.items():
avg = sum(metric_values) / len(metric_values)
print(f"Average {metric_name}: {avg:.3f}")


async def example_per_env_sampling_args():
"""
Example: Per-environment sampling arguments.

This shows how to configure different sampling parameters for each environment.
Useful when different tasks require different generation strategies.
"""

client = AsyncOpenAI(
api_key=os.getenv("OPENAI_API_KEY", "your-api-key"),
base_url="https://api.openai.com/v1",
)

envs = ["gsm8k", "wordle"]

# Global sampling args (fallback)
global_sampling = {
"temperature": 0.7,
"max_tokens": 2048,
}

# Per-environment sampling args
sampling_args_dict = {
"gsm8k": {
"temperature": 0.9,
"max_tokens": 4096,
"top_p": 0.95,
},
"wordle": {
"temperature": 0.3,
"max_tokens": 512,
},
}

results = await eval_environments_parallel(
envs=envs,
env_args_dict={env: {} for env in envs},
client=client,
model="gpt-4o-mini",
        num_examples=[10, 10],
        rollouts_per_example=[3, 3],
        max_concurrent=[32, 32],
sampling_args=global_sampling, # Fallback
sampling_args_dict=sampling_args_dict, # Per-env overrides
)

# Display results
for env_name, output in results.items():
print(f"\n=== {env_name} ===")
print(f"Average reward: {sum(output.reward) / len(output.reward):.3f}")

# Show which sampling args were used
if env_name in sampling_args_dict:
print(f"Used sampling args: {sampling_args_dict[env_name]}")
else:
print(f"Used global sampling args: {global_sampling}")


async def example_with_env_hub():
"""
Example: Evaluate and save to Environment Hub.

NOTE: Before pushing evaluations, you must first push the environments to Environment Hub:
1. Using verifiers: env.push_to_env_hub(hub_name='owner/gsm8k')
2. Using prime CLI: prime env push gsm8k
3. Via web: https://app.primeintellect.ai/environments

The system will check if the environment exists before pushing eval results.
"""

client = AsyncOpenAI(
api_key=os.getenv("OPENAI_API_KEY", "your-api-key"),
base_url="https://api.openai.com/v1",
)

envs = ["gsm8k", "wordle"]
model = "gpt-4o-mini"

# Run evaluation
results = await eval_environments_parallel(
envs=envs,
env_args_dict={"gsm8k": {}, "wordle": {}},
client=client,
model=model,
num_examples=[10, 10],
rollouts_per_example=[3, 3],
max_concurrent=[32, 32],
sampling_args={"temperature": 0.7, "max_tokens": 2048},
)

for env_name, output in results.items():
# Calculate metrics
avg_reward = sum(output.reward) / len(output.reward)

metrics = {
"avg_reward": float(avg_reward),
"num_samples": len(output.reward),
}

# Add any additional metrics from the output
for metric_name, metric_values in output.metrics.items():
metrics[f"avg_{metric_name}"] = float(
sum(metric_values) / len(metric_values)
)

# Prepare metadata
metadata = {
"environment": env_name,
"model": model,
"num_examples": 10,
"rollouts_per_example": 3,
"sampling_args": {"temperature": 0.7, "max_tokens": 2048},
}

# Save to hub (will check if environment exists first)
push_eval_to_env_hub(
eval_name=f"{model.replace('/', '-')}-{env_name}",
model_name=model,
environment_id=env_name,
metrics=metrics,
metadata=metadata,
)


if __name__ == "__main__":
print("Example 1: Basic multi-environment evaluation")
asyncio.run(example_multi_env_eval())

print("\n" + "=" * 80 + "\n")
print("Example 2: Per-environment sampling arguments")
asyncio.run(example_per_env_sampling_args())

print("\n" + "=" * 80 + "\n")
print("Example 3: With Environment Hub integration")
asyncio.run(example_with_env_hub())
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -72,6 +72,7 @@ envs = [
"brave-search",
"nltk",
"textarena",
"prime-evals>=0.1.0",
]
all = [
"torch>=2.7.0",
@@ -89,6 +90,7 @@ all = [
"brave-search",
"nltk",
"textarena",
"prime-evals>=0.1.0",
]
docs = [
"sphinx",