Skip to content

Commit 42d17b9

Browse files
Merge pull request #369 from GoogleCloudPlatform/codex
Add support for Codex
2 parents c72c758 + 30e4035 commit 42d17b9

10 files changed

Lines changed: 1096 additions & 8 deletions

File tree

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
{
2+
"scenarios": [
3+
{
4+
"id": "fake-csql-create-instance-success",
5+
"starting_prompt": "Create a new Cloud SQL instance named 'my-fake-db' in project 'astana-evaluation'. Use PostgreSQL 17, and set the password to 'password123'. Also use the 'Development' edition preset.",
6+
"conversation_plan": "The user wants to create a database. All required parameters are in the starting prompt. The agent should call create_instance and report the success message back.",
7+
"expected_trajectory": [
8+
"create_instance"
9+
],
10+
"env": {
11+
"GOOGLE_CLOUD_PROJECT": "astana-evaluation"
12+
},
13+
"kind": "tools",
14+
"max_turns": 3
15+
},
16+
{
17+
"id": "fake-csql-get-instance-failure",
18+
"starting_prompt": "Get the details for the Cloud SQL instance named 'missing-db' in project 'astana-evaluation'.",
19+
"conversation_plan": "The user wants to get instance details. The agent should call get_instance, which is hardcoded to fail with an error 'Instance not found or permission denied'. The agent should explain that the instance could not be found based on the error.",
20+
"expected_trajectory": [
21+
"get_instance"
22+
],
23+
"env": {
24+
"GOOGLE_CLOUD_PROJECT": "astana-evaluation"
25+
},
26+
"kind": "tools",
27+
"max_turns": 3
28+
}
29+
]
30+
}
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
{
2+
"scenarios": [
3+
{
4+
"id": "cloud-sql-list-instances-01",
5+
"starting_prompt": "list all Cloud SQL instances in project astana-evaluation",
6+
"conversation_plan": "Ask the agent to list instances in project astana-evaluation. Once all instances are listed, if nl2code exists, get its state and validate that it is RUNNABLE.",
7+
"expected_trajectory": [
8+
"list_instances",
9+
"get_instance"
10+
],
11+
"env": {
12+
"GOOGLE_CLOUD_PROJECT": "astana-evaluation"
13+
},
14+
"kind": "tools",
15+
"max_turns": 3
16+
},
17+
{
18+
"id": "csql-create-ambiguous-multiturn-01",
19+
"starting_prompt": "I need a database.",
20+
"conversation_plan": "The user starts with a vague request. You want to CREATE a NEW Cloud SQL instance named 'my-pg-app'. If the agent offers to create one, say YES. When asked for details, provide 'my-pg-app' as the instance name and 'user_data' as the database name. Never claim to have an existing instance. The goal is for the agent to eventually create the database 'user_data' inside 'my-pg-app' in astana-evaluation project.",
21+
"expected_trajectory": [
22+
"list_instances",
23+
"create_instance",
24+
"create_database"
25+
],
26+
"env": {
27+
"GOOGLE_CLOUD_PROJECT": "astana-evaluation"
28+
},
29+
"kind": "tools",
30+
"max_turns": 6
31+
},
32+
{
33+
"id": "csql-instance-not-found-failure",
34+
"starting_prompt": "Update the instance 'non-existent-db-123' to have 8 cores.",
35+
"conversation_plan": "The user asks to interact with an instance named 'non-existent-db-123', which does not exist in the astana-evaluation project. The agent should try to get the instance details or update it directly, fail to find it, and inform the user. The user will then ask to list instances to find the correct name.",
36+
"expected_trajectory": [
37+
"update_instance",
38+
"list_instances"
39+
],
40+
"env": {
41+
"GOOGLE_CLOUD_PROJECT": "astana-evaluation"
42+
},
43+
"kind": "tools",
44+
"max_turns": 4
45+
}
46+
]
47+
}
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
############################################################
2+
### Dataset / Eval Items
3+
############################################################
4+
dataset_config: datasets/codex-cli-tools/codex-cli.evalset.json
5+
dataset_format: agent-format
6+
7+
orchestrator: agent
8+
model_config: datasets/model_configs/codex_cli_model.yaml
9+
simulated_user_model_config: datasets/model_configs/gemini_2.5_pro_model.yaml
10+
11+
# Concurrency: number of scenarios to run in parallel.
12+
# Set to 1 for sequential runs (easier to follow logs, avoids session conflicts
13+
# on the shared sandboxed ~/.codex store).
14+
runners:
15+
agent_runners: 1
16+
17+
############################################################
18+
### Scorer Related Configs
19+
############################################################
20+
scorers:
21+
trajectory_matcher: {}
22+
goal_completion:
23+
model_config: datasets/model_configs/gemini_2.5_pro_model.yaml
24+
behavioral_metrics:
25+
model_config: datasets/model_configs/gemini_2.5_pro_model.yaml
26+
parameter_analysis:
27+
model_config: datasets/model_configs/gemini_2.5_pro_model.yaml
28+
turn_count: {}
29+
end_to_end_latency: {}
30+
tool_call_latency: {}
31+
token_consumption: {}
32+
33+
############################################################
34+
### Reporting Related Configs
35+
############################################################
36+
reporting:
37+
csv:
38+
output_directory: 'results'
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
############################################################
2+
### Dataset / Eval Items
3+
############################################################
4+
dataset_config: datasets/codex-cli-tools/codex-cli-fake.evalset.json
5+
dataset_format: agent-format
6+
7+
# Orchestrator Configuration
8+
orchestrator: agent
9+
model_config: datasets/model_configs/codex_cli_fake_model.yaml
10+
simulated_user_model_config: datasets/model_configs/gemini_2.5_pro_model.yaml
11+
12+
############################################################
13+
### Scorer Related Configs
14+
############################################################
15+
scorers:
16+
trajectory_matcher: {}
17+
goal_completion:
18+
model_config: datasets/model_configs/gemini_2.5_pro_model.yaml
19+
behavioral_metrics:
20+
model_config: datasets/model_configs/gemini_2.5_pro_model.yaml
21+
parameter_analysis:
22+
model_config: datasets/model_configs/gemini_2.5_pro_model.yaml
23+
turn_count: {}
24+
end_to_end_latency: {}
25+
tool_call_latency: {}
26+
token_consumption: {}
27+
28+
############################################################
29+
### Reporting Related Configs
30+
############################################################
31+
reporting:
32+
csv:
33+
output_directory: 'results'
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
codex_cli_version: "@openai/codex@latest"
2+
generator: codex_cli
3+
model: "gpt-5.5"
4+
5+
# Fetched from Secret Manager (override per-environment as needed).
6+
openai_api_key_secret: "projects/393137573/secrets/OPENAI_API_KEY/versions/1"
7+
8+
env:
9+
GOOGLE_CLOUD_PROJECT: "astana-evaluation"
10+
11+
setup:
12+
mcp_servers:
13+
"cloud-sql":
14+
command: "python"
15+
args:
16+
- "evalbench/util/fake_mcp_server.py"
17+
- "--server-name"
18+
- "cloud-sql"
19+
- "--config"
20+
- "datasets/model_configs/codex_cli_fake_model.yaml"
21+
22+
fake_mcp_tools:
23+
"cloud-sql":
24+
- name: create_instance
25+
description: "Creates a Cloud SQL instance"
26+
parameters:
27+
type: object
28+
properties:
29+
project_id:
30+
type: string
31+
description: "GCP project ID"
32+
instance_name:
33+
type: string
34+
description: "Name for the new instance"
35+
required: ["project_id", "instance_name"]
36+
response:
37+
status: "success"
38+
message: "Instance created successfully"
39+
- name: get_instance
40+
description: "Gets details of a Cloud SQL instance"
41+
parameters:
42+
type: object
43+
properties:
44+
project_id:
45+
type: string
46+
description: "GCP project ID"
47+
instance_name:
48+
type: string
49+
description: "Instance name"
50+
required: ["project_id", "instance_name"]
51+
response:
52+
status: "failure"
53+
error:
54+
code: 404
55+
message: "Instance not found or permission denied"
56+
- name: list_instances
57+
description: "Lists all Cloud SQL instances in a project"
58+
parameters:
59+
type: object
60+
properties:
61+
project_id:
62+
type: string
63+
description: "GCP project ID"
64+
required: ["project_id"]
65+
response:
66+
status: "success"
67+
instances:
68+
- name: "nl2code"
69+
state: "RUNNABLE"
70+
databaseVersion: "POSTGRES_15"
71+
- name: update_instance
72+
description: "Updates a Cloud SQL instance"
73+
parameters:
74+
type: object
75+
properties:
76+
project_id:
77+
type: string
78+
description: "GCP project ID"
79+
instance_name:
80+
type: string
81+
description: "Instance name"
82+
required: ["project_id", "instance_name"]
83+
response:
84+
status: "success"
85+
message: "Instance updated successfully"
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# OpenAI Codex CLI version.
2+
# - Globally installed binary: "codex"
3+
# - npm package spec, run via `npm exec --yes` like Gemini CLI (replace `latest` with an exact version to pin):
4+
# "@openai/codex@latest"
5+
codex_cli_version: "@openai/codex@latest"
6+
7+
generator: codex_cli
8+
9+
# Model to use. Pass any model id supported by your OpenAI account, e.g.
10+
# "gpt-5.5", "o4-mini", "gpt-4.1", etc.
11+
model: "gpt-5.5"
12+
13+
# OPENAI_API_KEY is fetched from Google Secret Manager. Provide the resource
14+
# path; either the bare form or the `secret_manager://` URL form works.
15+
openai_api_key_secret: "projects/393137573/secrets/OPENAI_API_KEY/versions/1"
16+
17+
# Codex's NDJSON has token counts but no cost. Provide rates here and
18+
# evalbench will compute `cost_usd` per turn. Update these to match the
19+
# model you set above; rates below are placeholder OpenAI list prices.
20+
# `cached_input_per_million_usd` is optional — defaults to 10% of input.
21+
pricing:
22+
input_per_million_usd: 1.25
23+
cached_input_per_million_usd: 0.125
24+
output_per_million_usd: 10.0
25+
26+
env:
27+
GOOGLE_CLOUD_PROJECT: "astana-evaluation"
28+
GOOGLE_CLOUD_LOCATION: "us-central1"
29+
30+
setup:
31+
mcp_servers:
32+
"cloud-sql":
33+
httpUrl: "https://sqladmin.googleapis.com/mcp"
34+
authProviderType: google_credentials
35+
headers:
36+
X-Goog-User-Project: astana-evaluation

evalbench/evaluator/agentevaluator.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from generators.models import get_generator
1010
from generators.models.gemini_cli import GeminiCliGenerator
1111
from generators.models.claude_code import ClaudeCodeGenerator
12+
from generators.models.codex_cli import CodexCliGenerator
1213
from mp import mprunner
1314
from work.agentgenwork import AgentGenWork
1415
from evaluator.simulateduser import SimulatedUser
@@ -40,10 +41,12 @@ def __init__(
4041
self.agent_version = self.generator.claude_code_version
4142
elif isinstance(self.generator, GeminiCliGenerator):
4243
self.agent_version = self.generator.gemini_cli_version
44+
elif isinstance(self.generator, CodexCliGenerator):
45+
self.agent_version = self.generator.codex_cli_version
4346
else:
4447
raise ValueError(
45-
f"AgentEvaluator only supports gemini_cli and claude_code generators, "
46-
f"got {type(self.generator).__name__}")
48+
f"AgentEvaluator only supports gemini_cli, claude_code, and "
49+
f"codex_cli generators, got {type(self.generator).__name__}")
4750

4851
runner_config = self.config.get("runners", {})
4952
self.agent_runners = runner_config.get("agent_runners", 10)
@@ -55,11 +58,11 @@ def evaluate(
5558
job_id: str,
5659
run_time: datetime.datetime,
5760
):
58-
if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator)):
61+
if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator, CodexCliGenerator)):
5962
return self._evaluate_agent_cli(dataset, job_id, run_time)
6063
else:
6164
raise NotImplementedError(
62-
"This evaluator currently only supports GeminiCliGenerator and ClaudeCodeGenerator")
65+
"This evaluator currently only supports GeminiCliGenerator, ClaudeCodeGenerator and CodexCliGenerator")
6366

6467
def _evaluate_agent_cli(
6568
self,
@@ -128,8 +131,8 @@ def process_scenario(
128131
for turn in range(max_turns):
129132
logging.info(
130133
f"Turn {turn + 1}/{max_turns} - Prompt: {current_prompt}")
131-
if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator)):
132-
if isinstance(self.generator, ClaudeCodeGenerator):
134+
if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator, CodexCliGenerator)):
135+
if isinstance(self.generator, (ClaudeCodeGenerator, CodexCliGenerator)):
133136
cli_cmd = self.generator.create_command(
134137
cli=self.agent_version,
135138
prompt=current_prompt,
@@ -168,7 +171,7 @@ def process_scenario(
168171
self._log_cli_result(turn, max_turns, result)
169172

170173
tools = []
171-
if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator)):
174+
if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator, CodexCliGenerator)):
172175
tools = self.generator.extract_tools(result.stdout)
173176
accumulated_tools.extend(tools)
174177

evalbench/evaluator/agentorchestrator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def __init__(
2727
self.report_progress = report_progress
2828

2929
def evaluate(self, dataset: list[EvalGeminiCliRequest]):
30-
logging.info("Starting Gemini CLI evaluation")
30+
logging.info("Starting agent CLI evaluation")
3131
evaluator = AgentEvaluator(self.config)
3232
eval_outputs, scoring_results = evaluator.evaluate(
3333
dataset, self.job_id, self.run_time

evalbench/generators/models/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from .query_data_api import QueryDataAPIGenerator
99
from .gemini_cli import GeminiCliGenerator
1010
from .claude_code import ClaudeCodeGenerator
11+
from .codex_cli import CodexCliGenerator
1112
from util.config import load_yaml_config
1213

1314

@@ -36,6 +37,8 @@ def get_generator(global_models, model_config_path: str, db: DB = None):
3637
model = GeminiCliGenerator(config)
3738
if config["generator"] == "claude_code":
3839
model = ClaudeCodeGenerator(config)
40+
if config["generator"] == "codex_cli":
41+
model = CodexCliGenerator(config)
3942
if not model:
4043
raise ValueError(f"Unknown Generator {config['generator']}")
4144

0 commit comments

Comments
 (0)