From 9046e471f5790b5ec365510993ac11ef63b6f57b Mon Sep 17 00:00:00 2001 From: Kent Date: Sun, 17 May 2026 19:36:19 +0800 Subject: [PATCH 01/19] test: add queue-driven fake-codex harness for multi-turn tests Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/fake-codex-fixture.mjs | 111 +++++++++++++++++++++++++++++- tests/fake-codex-fixture.test.mjs | 107 ++++++++++++++++++++++++++++ 2 files changed, 217 insertions(+), 1 deletion(-) create mode 100644 tests/fake-codex-fixture.test.mjs diff --git a/tests/fake-codex-fixture.mjs b/tests/fake-codex-fixture.mjs index debcadce..39a21b4b 100644 --- a/tests/fake-codex-fixture.mjs +++ b/tests/fake-codex-fixture.mjs @@ -2,7 +2,7 @@ import fs from "node:fs"; import path from "node:path"; import process from "node:process"; -import { writeExecutable } from "./helpers.mjs"; +import { makeTempDir, writeExecutable } from "./helpers.mjs"; export function installFakeCodex(binDir, behavior = "review-ok") { const statePath = path.join(binDir, "fake-codex-state.json"); @@ -369,6 +369,51 @@ rl.on("line", (line) => { case "turn/start": { const thread = ensureThread(state, message.params.threadId); + + if (BEHAVIOR === "queue-driven") { + if (!state.requests) { state.requests = []; } + state.requests.push({ method: "turn/start", params: message.params }); + saveState(state); + + const turnId = nextTurnId(state); + thread.updatedAt = now(); + saveState(state); + + const entry = (state.queue && state.queue.length > 0) ? state.queue.shift() : null; + saveState(state); + + if (entry && entry.rpcError) { + send({ id: message.id, error: { code: -32000, message: entry.rpcError.message } }); + break; + } + + send({ id: message.id, result: { turn: buildTurn(turnId) } }); + send({ method: "turn/started", params: { threadId: thread.id, turn: buildTurn(turnId) } }); + + const commands = (entry && entry.commands) || []; + let cmdCounter = 0; + for (const cmd of commands) { + const itemId = "cmd_" + turnId + "_" + (cmdCounter++); + send({ method: "item/started", params: { threadId: thread.id, turnId, item: { type: "commandExecution", id: itemId, command: cmd.command, status: "in_progress" } } }); + send({ method: "item/completed", params: { threadId: thread.id, turnId, item: { type: "commandExecution", id: itemId, command: cmd.command, exitCode: cmd.exitCode ?? 0, status: "completed" } } }); + } + + if (entry && entry.finalAnswer) { + send({ method: "item/completed", params: { threadId: thread.id, turnId, item: { type: "agentMessage", id: "msg_" + turnId, text: entry.finalAnswer.text, phase: "final_answer" } } }); + } + + if (entry && entry.turnError) { + send({ method: "error", params: { threadId: thread.id, turnId, error: { message: entry.turnError.message } } }); + } + + if (!entry) { + send({ method: "item/completed", params: { threadId: thread.id, turnId, item: { type: "agentMessage", id: "msg_" + turnId, text: "", phase: "agent_message" } } }); + } + + send({ method: "turn/completed", params: { threadId: thread.id, turn: buildTurn(turnId, "completed") } }); + break; + } + const prompt = (message.params.input || []) .filter((item) => item.type === "text") .map((item) => item.text) @@ -587,3 +632,67 @@ export function buildEnv(binDir) { PATH: `${binDir}${sep}${process.env.PATH}` }; } + +/** + * Sets up a queue-driven fake Codex harness for multi-turn tests. + * Returns a handle with helpers for scripting turn responses and + * inspecting captured requests. + */ +export function setupFakeCodex({ cwd } = {}) { + const binDir = makeTempDir("codex-queue-driven-"); + installFakeCodex(binDir, "queue-driven"); + + const statePath = path.join(binDir, "fake-codex-state.json"); + const initialState = { + nextThreadId: 1, + nextTurnId: 1, + appServerStarts: 0, + threads: [], + capabilities: null, + lastInterrupt: null, + queue: [], + requests: [] + }; + fs.writeFileSync(statePath, JSON.stringify(initialState, null, 2)); + + const savedPath = process.env.PATH; + const sep = process.platform === "win32" ? ";" : ":"; + process.env.PATH = `${binDir}${sep}${process.env.PATH}`; + + const env = buildEnv(binDir); + const resolvedCwd = cwd || process.cwd(); + + function readState() { + return JSON.parse(fs.readFileSync(statePath, "utf8")); + } + + function writeState(state) { + fs.writeFileSync(statePath, JSON.stringify(state, null, 2)); + } + + return { + cwd: resolvedCwd, + env, + binDir, + queueTurnResponse(entry) { + const state = readState(); + if (!state.queue) { state.queue = []; } + state.queue.push(entry); + writeState(state); + }, + queueTurnRpcError({ message }) { + const state = readState(); + if (!state.queue) { state.queue = []; } + state.queue.push({ rpcError: { message } }); + writeState(state); + }, + get requests() { + const state = readState(); + return state.requests ?? []; + }, + close() { + process.env.PATH = savedPath; + fs.rmSync(binDir, { recursive: true, force: true }); + } + }; +} diff --git a/tests/fake-codex-fixture.test.mjs b/tests/fake-codex-fixture.test.mjs new file mode 100644 index 00000000..5876460d --- /dev/null +++ b/tests/fake-codex-fixture.test.mjs @@ -0,0 +1,107 @@ +import test from "node:test"; +import assert from "node:assert/strict"; + +import { setupFakeCodex } from "./fake-codex-fixture.mjs"; +import { runAppServerTurn } from "../plugins/codex/scripts/lib/codex.mjs"; +import { makeTempDir } from "./helpers.mjs"; + +test("queue-driven fake: final answer is returned via runAppServerTurn", async () => { + const cwd = makeTempDir("codex-queue-test-"); + const handle = setupFakeCodex({ cwd }); + try { + handle.queueTurnResponse({ finalAnswer: { text: "hi" } }); + + const result = await runAppServerTurn(cwd, { + prompt: "say hi" + }); + + assert.equal(result.finalMessage, "hi"); + assert.equal(result.status, 0); + } finally { + handle.close(); + } +}); + +test("queue-driven fake: requests are captured with params", async () => { + const cwd = makeTempDir("codex-queue-test-"); + const handle = setupFakeCodex({ cwd }); + try { + handle.queueTurnResponse({ finalAnswer: { text: "captured" } }); + + await runAppServerTurn(cwd, { + prompt: "check capture" + }); + + const turnStarts = handle.requests.filter((r) => r.method === "turn/start"); + assert.equal(turnStarts.length, 1); + // Verify the captured params include the input text + const inputTexts = turnStarts[0].params.input + .filter((item) => item.type === "text") + .map((item) => item.text); + assert.ok(inputTexts.some((text) => text.includes("check capture"))); + } finally { + handle.close(); + } +}); + +test("queue-driven fake: commandExecution items are emitted and captured", async () => { + const cwd = makeTempDir("codex-queue-test-"); + const handle = setupFakeCodex({ cwd }); + try { + handle.queueTurnResponse({ + commands: [{ command: "git diff", exitCode: 0 }], + finalAnswer: { text: "done" } + }); + + const result = await runAppServerTurn(cwd, { + prompt: "run commands" + }); + + assert.equal(result.finalMessage, "done"); + assert.equal(result.commandExecutions.length, 1); + assert.equal(result.commandExecutions[0].command, "git diff"); + assert.equal(result.commandExecutions[0].exitCode, 0); + } finally { + handle.close(); + } +}); + +test("queue-driven fake: RPC error causes runAppServerTurn to reject", async () => { + const cwd = makeTempDir("codex-queue-test-"); + const handle = setupFakeCodex({ cwd }); + try { + handle.queueTurnRpcError({ message: "boom" }); + + await assert.rejects( + runAppServerTurn(cwd, { prompt: "should fail" }), + (error) => { + assert.ok(error.message.includes("boom")); + return true; + } + ); + } finally { + handle.close(); + } +}); + +test("queue-driven fake: soft error (turnError) is captured", async () => { + const cwd = makeTempDir("codex-queue-test-"); + const handle = setupFakeCodex({ cwd }); + try { + handle.queueTurnResponse({ + finalAnswer: { text: "partial" }, + turnError: { message: "soft failure" } + }); + + const result = await runAppServerTurn(cwd, { + prompt: "trigger soft error" + }); + + // The turn still completes, but state.error is set + assert.equal(result.finalMessage, "partial"); + assert.ok(result.error); + assert.equal(result.error.message, "soft failure"); + } finally { + handle.close(); + } +}); From 0e9b8e5bec8ee81db2a971da4a5235a9211fc214 Mon Sep 17 00:00:00 2001 From: Kent Date: Sun, 17 May 2026 20:18:22 +0800 Subject: [PATCH 02/19] test: harden fake-codex harness close() and dedupe saves Address review findings on the queue-driven harness: - close() now splices its own binDir out of PATH so handles can be closed in any order - Collapse redundant saveState calls in the queue-driven branch - Document the requests getter's per-access I/O cost Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/fake-codex-fixture.mjs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/fake-codex-fixture.mjs b/tests/fake-codex-fixture.mjs index 39a21b4b..bca51796 100644 --- a/tests/fake-codex-fixture.mjs +++ b/tests/fake-codex-fixture.mjs @@ -373,11 +373,9 @@ rl.on("line", (line) => { if (BEHAVIOR === "queue-driven") { if (!state.requests) { state.requests = []; } state.requests.push({ method: "turn/start", params: message.params }); - saveState(state); const turnId = nextTurnId(state); thread.updatedAt = now(); - saveState(state); const entry = (state.queue && state.queue.length > 0) ? state.queue.shift() : null; saveState(state); @@ -655,7 +653,6 @@ export function setupFakeCodex({ cwd } = {}) { }; fs.writeFileSync(statePath, JSON.stringify(initialState, null, 2)); - const savedPath = process.env.PATH; const sep = process.platform === "win32" ? ";" : ":"; process.env.PATH = `${binDir}${sep}${process.env.PATH}`; @@ -686,12 +683,17 @@ export function setupFakeCodex({ cwd } = {}) { state.queue.push({ rpcError: { message } }); writeState(state); }, + // `requests` re-reads the state file each access; assign to a local variable for repeated use. get requests() { const state = readState(); return state.requests ?? []; }, close() { - process.env.PATH = savedPath; + const sep = process.platform === "win32" ? ";" : ":"; + process.env.PATH = (process.env.PATH ?? "") + .split(sep) + .filter((entry) => entry !== binDir) + .join(sep); fs.rmSync(binDir, { recursive: true, force: true }); } }; From 3c47c59981dc6818fb53fc59ad8951a55d8b68f0 Mon Sep 17 00:00:00 2001 From: Kent Date: Sun, 17 May 2026 20:26:22 +0800 Subject: [PATCH 03/19] feat(adversarial-review): add investigate + finalize prompt templates Co-Authored-By: Claude Opus 4.7 (1M context) --- .../prompts/adversarial-review-finalize.md | 53 +++++++++++++++++++ .../prompts/adversarial-review-investigate.md | 48 +++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 plugins/codex/prompts/adversarial-review-finalize.md create mode 100644 plugins/codex/prompts/adversarial-review-investigate.md diff --git a/plugins/codex/prompts/adversarial-review-finalize.md b/plugins/codex/prompts/adversarial-review-finalize.md new file mode 100644 index 00000000..15c6c11e --- /dev/null +++ b/plugins/codex/prompts/adversarial-review-finalize.md @@ -0,0 +1,53 @@ + +You have just completed an investigation of a code change. Now produce the structured adversarial review. + + + +Based on your investigation in the prior turns of this thread, write up your findings as a structured review. +Target: {{TARGET_LABEL}} +User focus: {{USER_FOCUS}} + + + +Report only material findings. +Do not include style feedback, naming feedback, low-value cleanup, or speculative concerns without evidence. +A finding should answer: +1. What can go wrong? +2. Why is this code path vulnerable? +3. What is the likely impact? +4. What concrete change would reduce the risk? + + + +Return only valid JSON matching the provided schema. +Keep the output compact and specific. +Use `needs-attention` if there is any material risk worth blocking on. +Use `approve` only if you cannot support any substantive adversarial finding from your investigation. +Every finding must include: +- the affected file +- `line_start` and `line_end` +- a confidence score from 0 to 1 +- a concrete recommendation +Write the summary like a terse ship/no-ship assessment, not a neutral recap. + + + +Be aggressive, but stay grounded. +Every finding must be defensible from what you read during the investigation. +Do not invent files, lines, code paths, incidents, attack chains, or runtime behavior you cannot support. +If a conclusion depends on an inference, state that explicitly in the finding body and keep the confidence honest. + + + +Prefer one strong finding over several weak ones. +Do not dilute serious issues with filler. +If the change looks safe, say so directly and return no findings. + + + +Before finalizing, check that each finding is: +- adversarial rather than stylistic +- tied to a concrete code location +- plausible under a real failure scenario +- actionable for an engineer fixing the issue + diff --git a/plugins/codex/prompts/adversarial-review-investigate.md b/plugins/codex/prompts/adversarial-review-investigate.md new file mode 100644 index 00000000..54d6dcc9 --- /dev/null +++ b/plugins/codex/prompts/adversarial-review-investigate.md @@ -0,0 +1,48 @@ + +You are Codex performing an adversarial software review. +Your job is to break confidence in the change, not to validate it. +This is the investigation phase: gather evidence with read-only commands before producing any structured output. + + + +Investigate the change so you can later produce a confident adversarial assessment. +Target: {{TARGET_LABEL}} +User focus: {{USER_FOCUS}} + + + +Default to skepticism. +Assume the change can fail in subtle, high-cost, or user-visible ways until the evidence says otherwise. +Do not give credit for good intent, partial fixes, or likely follow-up work. +If something only works on the happy path, treat that as a real weakness. + + + +Prioritize the kinds of failures that are expensive, dangerous, or hard to detect: +- auth, permissions, tenant isolation, and trust boundaries +- data loss, corruption, duplication, and irreversible state changes +- rollback safety, retries, partial failure, and idempotency gaps +- race conditions, ordering assumptions, stale state, and re-entrancy +- empty-state, null, timeout, and degraded dependency behavior +- version skew, schema drift, migration hazards, and compatibility regressions +- observability gaps that would hide failure or make recovery harder + + + +Use read-only shell commands to inspect the diff and the surrounding code. +Useful starting points: `git diff`, `git log`, `git show`, `git blame`, `cat`, `rg`/`grep`. +Read the changed files, follow references, and confirm or refute hypotheses with evidence from the code. +Do not modify any files. Your sandbox is read-only. +{{REVIEW_COLLECTION_GUIDANCE}} + + + +Continue investigating until you can defend a confident adversarial assessment. +When you have seen enough, emit a brief summary message describing what you found and stop running commands. +A summary message with no further command calls signals that you are ready for the finalization phase. +Do not produce a structured review yet — that comes in the next phase. + + + +{{REVIEW_INPUT}} + From 53f9e75128ba3e48daa8594591c02ce332c7bd8c Mon Sep 17 00:00:00 2001 From: Kent Date: Sun, 17 May 2026 20:46:17 +0800 Subject: [PATCH 04/19] feat(codex): add runAppServerInvestigation for two-phase reviews Co-Authored-By: Claude Opus 4.7 (1M context) --- plugins/codex/scripts/lib/codex.mjs | 174 ++++++++++++++ tests/investigation.test.mjs | 361 ++++++++++++++++++++++++++++ 2 files changed, 535 insertions(+) create mode 100644 tests/investigation.test.mjs diff --git a/plugins/codex/scripts/lib/codex.mjs b/plugins/codex/scripts/lib/codex.mjs index f2fe88bd..68773229 100644 --- a/plugins/codex/scripts/lib/codex.mjs +++ b/plugins/codex/scripts/lib/codex.mjs @@ -1028,6 +1028,180 @@ export async function runAppServerTurn(cwd, options = {}) { }); } +const DEFAULT_MAX_INVESTIGATION_TURNS = 10; +const INVESTIGATION_CONTINUATION_CUE = "Continue your investigation."; + +export async function runAppServerInvestigation(cwd, options = {}) { + const availability = getCodexAvailability(cwd); + if (!availability.available) { + throw new Error("Codex CLI is not installed or is missing required runtime support. Install it with `npm install -g @openai/codex`, then rerun `/codex:setup`."); + } + + const investigatePrompt = options.investigatePrompt?.trim(); + const finalizePrompt = options.finalizePrompt?.trim(); + if (!investigatePrompt) { + throw new Error("runAppServerInvestigation requires investigatePrompt."); + } + if (!finalizePrompt) { + throw new Error("runAppServerInvestigation requires finalizePrompt."); + } + const maxInvestigationTurns = Number.isFinite(options.maxInvestigationTurns) && options.maxInvestigationTurns > 0 + ? Math.floor(options.maxInvestigationTurns) + : DEFAULT_MAX_INVESTIGATION_TURNS; + const sandbox = options.sandbox ?? "read-only"; + + return withAppServer(cwd, async (client) => { + emitProgress(options.onProgress, "Starting Codex investigation thread.", "starting"); + const startResponse = await startThread(client, cwd, { + model: options.model, + sandbox, + ephemeral: true, + threadName: null + }); + const threadId = startResponse.thread.id; + emitProgress(options.onProgress, `Thread ready (${threadId}).`, "starting", { threadId }); + + let turnCount = 0; + let truncated = false; + let totalCommandsRun = 0; + const aggregatedCommandExecutions = []; + const aggregatedFileChanges = []; + + for (let i = 1; i <= maxInvestigationTurns; i += 1) { + const promptText = i === 1 ? investigatePrompt : INVESTIGATION_CONTINUATION_CUE; + emitProgress(options.onProgress, `Investigation turn ${i}.`, "investigating"); + + let turnState; + try { + turnState = await captureTurn( + client, + threadId, + () => + client.request("turn/start", { + threadId, + input: buildTurnInput(promptText), + model: options.model ?? null, + effort: options.effort ?? null, + outputSchema: null + }), + { onProgress: options.onProgress } + ); + } catch (transportError) { + return { + status: { code: 1, kind: "error" }, + threadId, + turnId: null, + finalMessage: "", + reasoningSummary: [], + turn: null, + error: { message: transportError?.message ?? String(transportError) }, + stderr: cleanCodexStderr(client.stderr), + fileChanges: aggregatedFileChanges, + touchedFiles: collectTouchedFiles(aggregatedFileChanges), + commandExecutions: aggregatedCommandExecutions, + investigation: { turnCount, truncated: false } + }; + } + + turnCount = i; + const turnCommandCount = turnState.commandExecutions.length; + totalCommandsRun += turnCommandCount; + for (const cmd of turnState.commandExecutions) { + aggregatedCommandExecutions.push(cmd); + } + for (const change of turnState.fileChanges) { + aggregatedFileChanges.push(change); + } + + if (turnState.error) { + return { + status: buildResultStatus(turnState), + threadId, + turnId: turnState.turnId, + finalMessage: turnState.lastAgentMessage, + reasoningSummary: turnState.reasoningSummary, + turn: turnState.finalTurn, + error: turnState.error, + stderr: cleanCodexStderr(client.stderr), + fileChanges: aggregatedFileChanges, + touchedFiles: collectTouchedFiles(aggregatedFileChanges), + commandExecutions: aggregatedCommandExecutions, + investigation: { turnCount, truncated: false } + }; + } + + const converged = turnState.finalAnswerSeen === true && turnCommandCount === 0; + if (converged) { + break; + } + + if (i === maxInvestigationTurns) { + truncated = true; + } + } + + if (totalCommandsRun === 0) { + truncated = true; + } + + emitProgress(options.onProgress, "Investigation complete; finalizing structured output.", "finalizing"); + + let finalizeState; + try { + finalizeState = await captureTurn( + client, + threadId, + () => + client.request("turn/start", { + threadId, + input: buildTurnInput(finalizePrompt), + model: options.model ?? null, + effort: options.effort ?? null, + outputSchema: options.outputSchema ?? null + }), + { onProgress: options.onProgress } + ); + } catch (transportError) { + return { + status: { code: 1, kind: "error" }, + threadId, + turnId: null, + finalMessage: "", + reasoningSummary: [], + turn: null, + error: { message: transportError?.message ?? String(transportError) }, + stderr: cleanCodexStderr(client.stderr), + fileChanges: aggregatedFileChanges, + touchedFiles: collectTouchedFiles(aggregatedFileChanges), + commandExecutions: aggregatedCommandExecutions, + investigation: { turnCount, truncated } + }; + } + + for (const cmd of finalizeState.commandExecutions) { + aggregatedCommandExecutions.push(cmd); + } + for (const change of finalizeState.fileChanges) { + aggregatedFileChanges.push(change); + } + + return { + status: buildResultStatus(finalizeState), + threadId, + turnId: finalizeState.turnId, + finalMessage: finalizeState.lastAgentMessage, + reasoningSummary: finalizeState.reasoningSummary, + turn: finalizeState.finalTurn, + error: finalizeState.error, + stderr: cleanCodexStderr(client.stderr), + fileChanges: aggregatedFileChanges, + touchedFiles: collectTouchedFiles(aggregatedFileChanges), + commandExecutions: aggregatedCommandExecutions, + investigation: { turnCount, truncated } + }; + }); +} + export async function findLatestTaskThread(cwd) { const availability = getCodexAvailability(cwd); if (!availability.available) { diff --git a/tests/investigation.test.mjs b/tests/investigation.test.mjs new file mode 100644 index 00000000..bd5ab636 --- /dev/null +++ b/tests/investigation.test.mjs @@ -0,0 +1,361 @@ +import test from "node:test"; +import assert from "node:assert/strict"; + +import { setupFakeCodex } from "./fake-codex-fixture.mjs"; +import { runAppServerInvestigation } from "../plugins/codex/scripts/lib/codex.mjs"; +import { makeTempDir } from "./helpers.mjs"; + +// Structured JSON payloads used by multiple tests. +const STRUCTURED_REVIEW = JSON.stringify({ + verdict: "needs-attention", + summary: "Concern X.", + findings: [{ + severity: "high", + title: "Race", + file: "a.js", + line_start: 10, + line_end: 12, + confidence: 0.8, + body: "Potential race condition.", + recommendation: "Add a mutex." + }], + next_steps: [] +}); + +const APPROVE_REVIEW = JSON.stringify({ + verdict: "approve", + summary: "No material issues found.", + findings: [], + next_steps: [] +}); + +test("converges when Codex emits a final-answer turn with no commands", async () => { + const cwd = makeTempDir("codex-inv-test-"); + const fake = setupFakeCodex({ cwd }); + try { + // Recon turn 1: commands, no final answer + fake.queueTurnResponse({ + commands: [{ command: "git diff HEAD~1", exitCode: 0 }], + finalAnswer: null + }); + // Recon turn 2: no commands, final answer => converges + fake.queueTurnResponse({ + commands: [], + finalAnswer: { text: "Investigation done." } + }); + // Finalize turn 3 + fake.queueTurnResponse({ + finalAnswer: { text: STRUCTURED_REVIEW } + }); + + const result = await runAppServerInvestigation(fake.cwd, { + investigatePrompt: "Investigate the changes.", + finalizePrompt: "Produce your structured verdict.", + outputSchema: { type: "object", required: ["verdict"] } + }); + + assert.equal(result.investigation.turnCount, 2); + assert.equal(result.investigation.truncated, false); + + const requests = fake.requests; + const starts = requests.filter((r) => r.method === "turn/start"); + assert.equal(starts.length, 3, "should have 3 turn/start requests (2 recon + 1 finalize)"); + } finally { + fake.close(); + } +}); + +test("respects maxInvestigationTurns and marks truncated", async () => { + const cwd = makeTempDir("codex-inv-test-"); + const fake = setupFakeCodex({ cwd }); + try { + // Queue 4 recon turns: all have commands, no final answer + for (let i = 0; i < 4; i++) { + fake.queueTurnResponse({ + commands: [{ command: `check-${i}`, exitCode: 0 }] + }); + } + // Finalize turn + fake.queueTurnResponse({ + finalAnswer: { text: APPROVE_REVIEW } + }); + + const result = await runAppServerInvestigation(fake.cwd, { + investigatePrompt: "Investigate.", + finalizePrompt: "Finalize.", + maxInvestigationTurns: 3 + }); + + assert.equal(result.investigation.turnCount, 3); + assert.equal(result.investigation.truncated, true); + + const requests = fake.requests; + const starts = requests.filter((r) => r.method === "turn/start"); + assert.equal(starts.length, 4, "3 recon + 1 finalize"); + } finally { + fake.close(); + } +}); + +test("turn with both finalAnswer and commands does not converge", async () => { + const cwd = makeTempDir("codex-inv-test-"); + const fake = setupFakeCodex({ cwd }); + try { + // Recon turn 1: commands AND final answer => does NOT converge + fake.queueTurnResponse({ + commands: [{ command: "grep -r TODO", exitCode: 0 }], + finalAnswer: { text: "Partial finding." } + }); + // Recon turn 2: only final answer, no commands => converges + fake.queueTurnResponse({ + commands: [], + finalAnswer: { text: "Investigation done." } + }); + // Finalize turn + fake.queueTurnResponse({ + finalAnswer: { text: APPROVE_REVIEW } + }); + + const result = await runAppServerInvestigation(fake.cwd, { + investigatePrompt: "Investigate.", + finalizePrompt: "Finalize." + }); + + assert.equal(result.investigation.turnCount, 2); + assert.equal(result.investigation.truncated, false); + } finally { + fake.close(); + } +}); + +test("outputSchema is null on recon turns and set on finalize turn", async () => { + const cwd = makeTempDir("codex-inv-test-"); + const fake = setupFakeCodex({ cwd }); + try { + // Recon turn 1: commands + fake.queueTurnResponse({ + commands: [{ command: "cat file.js", exitCode: 0 }] + }); + // Recon turn 2: final answer, no commands => converges + fake.queueTurnResponse({ + commands: [], + finalAnswer: { text: "Done investigating." } + }); + // Finalize turn + fake.queueTurnResponse({ + finalAnswer: { text: STRUCTURED_REVIEW } + }); + + const schema = { type: "object", required: ["verdict"] }; + const result = await runAppServerInvestigation(fake.cwd, { + investigatePrompt: "Investigate.", + finalizePrompt: "Finalize.", + outputSchema: schema + }); + + const requests = fake.requests; + const starts = requests.filter((r) => r.method === "turn/start"); + assert.equal(starts.length, 3); + + // Recon turns must have outputSchema === null + assert.equal(starts[0].params.outputSchema, null, "recon turn 1 outputSchema should be null"); + assert.equal(starts[1].params.outputSchema, null, "recon turn 2 outputSchema should be null"); + + // Finalize turn must have the schema + assert.deepEqual(starts[2].params.outputSchema, schema, "finalize turn should have the outputSchema"); + } finally { + fake.close(); + } +}); + +test("phase-1 soft error (turn/failed) aborts before phase-2 finalize", async () => { + const cwd = makeTempDir("codex-inv-test-"); + const fake = setupFakeCodex({ cwd }); + try { + // Recon turn 1: commands, normal + fake.queueTurnResponse({ + commands: [{ command: "git log --oneline", exitCode: 0 }] + }); + // Recon turn 2: soft error + fake.queueTurnResponse({ + turnError: { message: "model produced unrenderable response" } + }); + + const result = await runAppServerInvestigation(fake.cwd, { + investigatePrompt: "Investigate.", + finalizePrompt: "Finalize." + }); + + assert.ok(result.error, "result should have an error"); + assert.equal(result.investigation.turnCount, 2, "soft-error turn IS counted"); + + const requests = fake.requests; + const starts = requests.filter((r) => r.method === "turn/start"); + assert.equal(starts.length, 2, "NO finalize turn should be attempted"); + } finally { + fake.close(); + } +}); + +test("phase-1 hard error (transport throw) aborts before phase-2 finalize", async () => { + const cwd = makeTempDir("codex-inv-test-"); + const fake = setupFakeCodex({ cwd }); + try { + // Recon turn 1: commands, normal + fake.queueTurnResponse({ + commands: [{ command: "git status", exitCode: 0 }] + }); + // Recon turn 2: RPC error (transport throw) + fake.queueTurnRpcError({ message: "ECONNRESET" }); + + const result = await runAppServerInvestigation(fake.cwd, { + investigatePrompt: "Investigate.", + finalizePrompt: "Finalize." + }); + + assert.ok(result.error, "result should have an error"); + assert.match(result.error.message, /ECONNRESET/); + assert.equal(result.investigation.turnCount, 1, "hard error returns BEFORE incrementing turnCount"); + + const requests = fake.requests; + const starts = requests.filter((r) => r.method === "turn/start"); + assert.equal(starts.length, 2, "the failing turn was still attempted"); + } finally { + fake.close(); + } +}); + +test("converges with zero commands flags truncated=true", async () => { + const cwd = makeTempDir("codex-inv-test-"); + const fake = setupFakeCodex({ cwd }); + try { + // Recon turn 1: only final answer, no commands => converges immediately + fake.queueTurnResponse({ + commands: [], + finalAnswer: { text: "Nothing to investigate." } + }); + // Finalize turn + fake.queueTurnResponse({ + finalAnswer: { text: APPROVE_REVIEW } + }); + + const result = await runAppServerInvestigation(fake.cwd, { + investigatePrompt: "Investigate.", + finalizePrompt: "Finalize." + }); + + assert.equal(result.investigation.turnCount, 1); + assert.equal(result.investigation.truncated, true, "zero commands across investigation => truncated"); + + const requests = fake.requests; + const starts = requests.filter((r) => r.method === "turn/start"); + assert.equal(starts.length, 2, "1 recon + 1 finalize"); + } finally { + fake.close(); + } +}); + +test("recon turn 1 sends the investigate prompt; turn 2+ sends the continuation cue", async () => { + const cwd = makeTempDir("codex-inv-test-"); + const fake = setupFakeCodex({ cwd }); + try { + // Recon turn 1: commands + fake.queueTurnResponse({ + commands: [{ command: "ls", exitCode: 0 }] + }); + // Recon turn 2: commands + fake.queueTurnResponse({ + commands: [{ command: "cat a.js", exitCode: 0 }] + }); + // Recon turn 3: final answer, no commands => converges + fake.queueTurnResponse({ + commands: [], + finalAnswer: { text: "All done." } + }); + // Finalize turn + fake.queueTurnResponse({ + finalAnswer: { text: APPROVE_REVIEW } + }); + + const result = await runAppServerInvestigation(fake.cwd, { + investigatePrompt: "FULL INVESTIGATE PROMPT: look at the code", + finalizePrompt: "FINALIZE PROMPT: produce verdict" + }); + + const requests = fake.requests; + const starts = requests.filter((r) => r.method === "turn/start"); + assert.equal(starts.length, 4, "3 recon + 1 finalize"); + + // Extract input text from each turn/start + const inputTexts = starts.map((s) => + (s.params.input || []) + .filter((p) => p.type === "text") + .map((p) => p.text) + .join("") + ); + + // Turn 1: investigate prompt + assert.match(inputTexts[0], /FULL INVESTIGATE PROMPT/, "turn 1 should use investigate prompt"); + // Turn 2 and 3: continuation cue + assert.equal(inputTexts[1], "Continue your investigation.", "turn 2 should use continuation cue"); + assert.equal(inputTexts[2], "Continue your investigation.", "turn 3 should use continuation cue"); + // Turn 4: finalize prompt + assert.match(inputTexts[3], /FINALIZE PROMPT/, "turn 4 should use finalize prompt"); + } finally { + fake.close(); + } +}); + +test("outputSchema-set finalize turn produces schema-conformant final message", async () => { + const cwd = makeTempDir("codex-inv-test-"); + const fake = setupFakeCodex({ cwd }); + try { + // Recon turn 1: commands, no final answer + fake.queueTurnResponse({ + commands: [{ command: "git diff HEAD~1", exitCode: 0 }], + finalAnswer: null + }); + // Recon turn 2: no commands, final answer => converges + fake.queueTurnResponse({ + commands: [], + finalAnswer: { text: "Investigation done." } + }); + // Finalize turn with structured output + fake.queueTurnResponse({ + finalAnswer: { text: STRUCTURED_REVIEW } + }); + + const schema = { + type: "object", + required: ["verdict"], + properties: { + verdict: { type: "string" }, + summary: { type: "string" }, + findings: { type: "array" }, + next_steps: { type: "array" } + } + }; + + const result = await runAppServerInvestigation(fake.cwd, { + investigatePrompt: "Investigate the changes.", + finalizePrompt: "Produce your structured verdict.", + outputSchema: schema + }); + + // The finalMessage should be parseable JSON with a verdict field + const parsed = JSON.parse(result.finalMessage); + assert.equal(parsed.verdict, "needs-attention"); + assert.equal(parsed.findings.length, 1); + assert.equal(parsed.findings[0].severity, "high"); + + // The finalize turn should have received the schema + const requests = fake.requests; + const starts = requests.filter((r) => r.method === "turn/start"); + assert.deepEqual(starts[starts.length - 1].params.outputSchema, schema); + + // Only the finalize turn's reasoningSummary is returned + assert.ok(Array.isArray(result.reasoningSummary)); + } finally { + fake.close(); + } +}); From 2708672fd5e803605d10b684f82df8fd8cc3e4f4 Mon Sep 17 00:00:00 2001 From: Kent Date: Sun, 17 May 2026 20:53:54 +0800 Subject: [PATCH 05/19] test: strengthen soft-error assertion to check error message Address review finding: the soft-error test only checked truthiness of result.error. The fixture queues a deterministic message, so match against it explicitly to catch regressions where the error gets swallowed or replaced. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/investigation.test.mjs | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/investigation.test.mjs b/tests/investigation.test.mjs index bd5ab636..5b58c1f4 100644 --- a/tests/investigation.test.mjs +++ b/tests/investigation.test.mjs @@ -187,6 +187,7 @@ test("phase-1 soft error (turn/failed) aborts before phase-2 finalize", async () }); assert.ok(result.error, "result should have an error"); + assert.match(result.error.message, /model produced unrenderable response/); assert.equal(result.investigation.turnCount, 2, "soft-error turn IS counted"); const requests = fake.requests; From fbb2bc7d919be2c8f330bed88a9ce9cdb8bfb071 Mon Sep 17 00:00:00 2001 From: Kent Date: Sun, 17 May 2026 21:00:36 +0800 Subject: [PATCH 06/19] feat(adversarial-review): add investigate/finalize prompt builders Co-Authored-By: Claude Opus 4.7 (1M context) --- plugins/codex/scripts/codex-companion.mjs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/plugins/codex/scripts/codex-companion.mjs b/plugins/codex/scripts/codex-companion.mjs index 35222fd5..8949efd6 100644 --- a/plugins/codex/scripts/codex-companion.mjs +++ b/plugins/codex/scripts/codex-companion.mjs @@ -246,6 +246,24 @@ function buildAdversarialReviewPrompt(context, focusText) { }); } +function buildAdversarialInvestigatePrompt(context, focusText) { + const template = loadPromptTemplate(ROOT_DIR, "adversarial-review-investigate"); + return interpolateTemplate(template, { + TARGET_LABEL: context.target.label, + USER_FOCUS: focusText || "No extra focus provided.", + REVIEW_COLLECTION_GUIDANCE: context.collectionGuidance, + REVIEW_INPUT: context.content + }); +} + +function buildAdversarialFinalizePrompt(context, focusText) { + const template = loadPromptTemplate(ROOT_DIR, "adversarial-review-finalize"); + return interpolateTemplate(template, { + TARGET_LABEL: context.target.label, + USER_FOCUS: focusText || "No extra focus provided." + }); +} + function ensureCodexAvailable(cwd) { const availability = getCodexAvailability(cwd); if (!availability.available) { From cfa0423505118bc5f311235f8a85705f9d982f3f Mon Sep 17 00:00:00 2001 From: Kent Date: Sun, 17 May 2026 23:00:26 +0800 Subject: [PATCH 07/19] fix(adversarial-review): route self-collect through two-phase investigation Co-Authored-By: Claude Opus 4.7 (1M context) --- plugins/codex/scripts/codex-companion.mjs | 37 ++++-- tests/investigation.test.mjs | 153 ++++++++++++++++++++++ tests/runtime.test.mjs | 9 +- 3 files changed, 188 insertions(+), 11 deletions(-) diff --git a/plugins/codex/scripts/codex-companion.mjs b/plugins/codex/scripts/codex-companion.mjs index 8949efd6..7b08b5ea 100644 --- a/plugins/codex/scripts/codex-companion.mjs +++ b/plugins/codex/scripts/codex-companion.mjs @@ -17,6 +17,7 @@ import { interruptAppServerTurn, parseStructuredOutput, readOutputSchema, + runAppServerInvestigation, runAppServerReview, runAppServerTurn } from "./lib/codex.mjs"; @@ -422,14 +423,29 @@ async function executeReviewRun(request) { } const context = collectReviewContext(request.cwd, target); - const prompt = buildAdversarialReviewPrompt(context, focusText); - const result = await runAppServerTurn(context.repoRoot, { - prompt, - model: request.model, - sandbox: "read-only", - outputSchema: readOutputSchema(REVIEW_SCHEMA), - onProgress: request.onProgress - }); + let result; + if (context.inputMode === "self-collect") { + const investigatePrompt = buildAdversarialInvestigatePrompt(context, focusText); + const finalizePrompt = buildAdversarialFinalizePrompt(context, focusText); + result = await runAppServerInvestigation(context.repoRoot, { + investigatePrompt, + finalizePrompt, + outputSchema: readOutputSchema(REVIEW_SCHEMA), + model: request.model, + sandbox: "read-only", + maxInvestigationTurns: request.maxInvestigationTurns, + onProgress: request.onProgress + }); + } else { + const prompt = buildAdversarialReviewPrompt(context, focusText); + result = await runAppServerTurn(context.repoRoot, { + prompt, + model: request.model, + sandbox: "read-only", + outputSchema: readOutputSchema(REVIEW_SCHEMA), + onProgress: request.onProgress + }); + } const parsed = parseStructuredOutput(result.finalMessage, { status: result.status, failureMessage: result.error?.message ?? result.stderr @@ -454,9 +470,12 @@ async function executeReviewRun(request) { parseError: parsed.parseError, reasoningSummary: result.reasoningSummary }; + if (result.investigation) { + payload.investigation = result.investigation; + } return { - exitStatus: result.status, + exitStatus: typeof result.status === "number" ? result.status : (result.status?.code ?? 1), threadId: result.threadId, turnId: result.turnId, payload, diff --git a/tests/investigation.test.mjs b/tests/investigation.test.mjs index 5b58c1f4..2e5be652 100644 --- a/tests/investigation.test.mjs +++ b/tests/investigation.test.mjs @@ -307,6 +307,159 @@ test("recon turn 1 sends the investigate prompt; turn 2+ sends the continuation } }); +// ------------------------------------------------------------------- +// Integration tests: subprocess-based end-to-end companion tests +// ------------------------------------------------------------------- + +import { mkdtempSync, writeFileSync, mkdirSync, rmSync } from "node:fs"; +import { tmpdir } from "node:os"; +import path from "node:path"; +import { spawnSync } from "node:child_process"; +import { fileURLToPath } from "node:url"; + +const COMPANION_PATH = fileURLToPath( + new URL("../plugins/codex/scripts/codex-companion.mjs", import.meta.url) +); + +function makeSelfCollectGitFixture() { + // 3+ changed files triggers self-collect (DEFAULT_INLINE_DIFF_MAX_FILES = 2) + const root = mkdtempSync(path.join(tmpdir(), "codex-self-collect-test-")); + spawnSync("git", ["init", "-q", "-b", "main"], { cwd: root }); + spawnSync("git", ["config", "user.email", "test@example.com"], { cwd: root }); + spawnSync("git", ["config", "user.name", "Test"], { cwd: root }); + spawnSync("git", ["config", "commit.gpgsign", "false"], { cwd: root }); + writeFileSync(path.join(root, "README.md"), "# repo\n"); + spawnSync("git", ["add", "."], { cwd: root }); + spawnSync("git", ["commit", "-q", "-m", "init"], { cwd: root }); + spawnSync("git", ["checkout", "-q", "-b", "feature"], { cwd: root }); + mkdirSync(path.join(root, "src"), { recursive: true }); + for (let i = 0; i < 5; i += 1) { + writeFileSync(path.join(root, "src", `f${i}.js`), `export const v${i} = ${i};\n`); + } + spawnSync("git", ["add", "."], { cwd: root }); + spawnSync("git", ["commit", "-q", "-m", "feature"], { cwd: root }); + return root; +} + +function makeInlineGitFixture() { + // 1 changed file stays on inline-diff path + const root = mkdtempSync(path.join(tmpdir(), "codex-inline-test-")); + spawnSync("git", ["init", "-q", "-b", "main"], { cwd: root }); + spawnSync("git", ["config", "user.email", "test@example.com"], { cwd: root }); + spawnSync("git", ["config", "user.name", "Test"], { cwd: root }); + spawnSync("git", ["config", "commit.gpgsign", "false"], { cwd: root }); + writeFileSync(path.join(root, "README.md"), "# repo\n"); + spawnSync("git", ["add", "."], { cwd: root }); + spawnSync("git", ["commit", "-q", "-m", "init"], { cwd: root }); + spawnSync("git", ["checkout", "-q", "-b", "feature"], { cwd: root }); + writeFileSync(path.join(root, "one.js"), "export const v = 1;\n"); + spawnSync("git", ["add", "."], { cwd: root }); + spawnSync("git", ["commit", "-q", "-m", "tiny"], { cwd: root }); + return root; +} + +function runCompanion(args, env) { + return spawnSync("node", [COMPANION_PATH, ...args], { + env: { ...process.env, ...env }, + encoding: "utf8", + timeout: 30000 + }); +} + +test("self-collect path uses runAppServerInvestigation end-to-end", async () => { + const cwd = makeSelfCollectGitFixture(); + const fake = setupFakeCodex({ cwd }); + try { + // Recon turn 1: runs a command, no convergence + fake.queueTurnResponse({ + commands: [{ command: "git diff main...HEAD", exitCode: 0 }], + finalAnswer: null + }); + // Recon turn 2: no commands, final answer => converges + fake.queueTurnResponse({ + commands: [], + finalAnswer: { text: "Investigation done." } + }); + // Finalize turn: structured output + fake.queueTurnResponse({ + finalAnswer: { + text: JSON.stringify({ + verdict: "needs-attention", + summary: "Found risk in src/f1.js.", + findings: [{ + severity: "high", + title: "Unguarded export", + file: "src/f1.js", + line_start: 1, + line_end: 1, + confidence: 0.7, + body: "Module exports v1 with no validation.", + recommendation: "Add validation." + }], + next_steps: [] + }) + } + }); + + const result = runCompanion( + ["adversarial-review", "--base", "main", "--scope", "branch", "--cwd", cwd, "--json"], + fake.env + ); + + assert.equal(result.status, 0, `expected exit 0, stderr: ${result.stderr}`); + + // stdout may contain progress lines followed by JSON; parse the last JSON object + const stdout = result.stdout.trim(); + const payload = JSON.parse(stdout); + assert.ok(payload.investigation, "self-collect payload must have investigation field"); + assert.equal(payload.investigation.turnCount, 2); + assert.equal(payload.investigation.truncated, false); + assert.equal(payload.result?.verdict, "needs-attention"); + } finally { + fake.close(); + rmSync(cwd, { recursive: true, force: true }); + } +}); + +test("inline-diff path does not call runAppServerInvestigation", async () => { + const cwd = makeInlineGitFixture(); + const fake = setupFakeCodex({ cwd }); + try { + // Single turn: inline path uses runAppServerTurn + fake.queueTurnResponse({ + finalAnswer: { + text: JSON.stringify({ + verdict: "approve", + summary: "No material issues found.", + findings: [], + next_steps: [] + }) + } + }); + + const result = runCompanion( + ["adversarial-review", "--base", "main", "--scope", "branch", "--cwd", cwd, "--json"], + fake.env + ); + + assert.equal(result.status, 0, `expected exit 0, stderr: ${result.stderr}`); + const payload = JSON.parse(result.stdout.trim()); + assert.equal(payload.investigation, undefined, + "inline-path payload must not carry the investigation field"); + + const requests = fake.requests; + const starts = requests.filter((r) => r.method === "turn/start"); + assert.equal(starts.length, 1, "inline path is single-turn"); + } finally { + fake.close(); + rmSync(cwd, { recursive: true, force: true }); + } +}); + +// ------------------------------------------------------------------- +// Unit tests for runAppServerInvestigation (continued from above) +// ------------------------------------------------------------------- + test("outputSchema-set finalize turn produces schema-conformant final message", async () => { const cwd = makeTempDir("codex-inv-test-"); const fake = setupFakeCodex({ cwd }); diff --git a/tests/runtime.test.mjs b/tests/runtime.test.mjs index 90408372..1961a592 100644 --- a/tests/runtime.test.mjs +++ b/tests/runtime.test.mjs @@ -295,8 +295,13 @@ test("adversarial review asks Codex to inspect larger diffs itself", () => { assert.equal(result.status, 0, result.stderr); const state = JSON.parse(fs.readFileSync(path.join(binDir, "fake-codex-state.json"), "utf8")); - assert.match(state.lastTurnStart.prompt, /lightweight summary/i); - assert.match(state.lastTurnStart.prompt, /read-only git commands/i); + // With the two-phase investigation flow, the default fake converges on turn 1 + // (no commands, finalAnswer), then the finalize turn fires. lastTurnStart + // captures the finalize turn, which uses buildAdversarialFinalizePrompt. + // The investigate prompt (turn 1) contained the self-collect guidance; the + // finalize prompt (turn 2) references the investigation completed in prior turns. + assert.match(state.lastTurnStart.prompt, /investigation|structured review/i); + // File contents must not be inlined in either prompt (self-collect mode) assert.doesNotMatch(state.lastTurnStart.prompt, /PROMPT_SELF_COLLECT_[ABC]/); }); From 8b25e7b24c010fce2bbae12d5e4354b850ee2b5e Mon Sep 17 00:00:00 2001 From: Kent Date: Sun, 17 May 2026 23:06:23 +0800 Subject: [PATCH 08/19] feat(adversarial-review): show banner when investigation is truncated Co-Authored-By: Claude Opus 4.7 (1M context) --- plugins/codex/scripts/codex-companion.mjs | 3 +- plugins/codex/scripts/lib/render.mjs | 10 +++ tests/investigation.test.mjs | 76 +++++++++++++++++++++++ 3 files changed, 88 insertions(+), 1 deletion(-) diff --git a/plugins/codex/scripts/codex-companion.mjs b/plugins/codex/scripts/codex-companion.mjs index 7b08b5ea..dfac48a8 100644 --- a/plugins/codex/scripts/codex-companion.mjs +++ b/plugins/codex/scripts/codex-companion.mjs @@ -482,7 +482,8 @@ async function executeReviewRun(request) { rendered: renderReviewResult(parsed, { reviewLabel: reviewName, targetLabel: context.target.label, - reasoningSummary: result.reasoningSummary + reasoningSummary: result.reasoningSummary, + investigation: result.investigation ?? null }), summary: parsed.parsed?.summary ?? parsed.parseError ?? firstMeaningfulLine(result.finalMessage, `${reviewName} finished.`), jobTitle: `Codex ${reviewName}`, diff --git a/plugins/codex/scripts/lib/render.mjs b/plugins/codex/scripts/lib/render.mjs index 2ec18523..9b578b60 100644 --- a/plugins/codex/scripts/lib/render.mjs +++ b/plugins/codex/scripts/lib/render.mjs @@ -208,6 +208,13 @@ export function renderSetupReport(report) { return `${lines.join("\n").trimEnd()}\n`; } +function appendInvestigationBanner(lines, meta) { + if (meta.investigation?.truncated === true) { + const turns = meta.investigation.turnCount ?? "?"; + lines.push(`Investigation truncated at ${turns} turns; findings may be shallow.`, ""); + } +} + export function renderReviewResult(parsedResult, meta) { if (!parsedResult.parsed) { const lines = [ @@ -217,6 +224,7 @@ export function renderReviewResult(parsedResult, meta) { "", `- Parse error: ${parsedResult.parseError}` ]; + appendInvestigationBanner(lines, meta); if (parsedResult.rawOutput) { lines.push("", "Raw final message:", "", "```text", parsedResult.rawOutput, "```"); @@ -237,6 +245,7 @@ export function renderReviewResult(parsedResult, meta) { "", `- Validation error: ${validationError}` ]; + appendInvestigationBanner(lines, meta); if (parsedResult.rawOutput) { lines.push("", "Raw final message:", "", "```text", parsedResult.rawOutput, "```"); @@ -258,6 +267,7 @@ export function renderReviewResult(parsedResult, meta) { data.summary, "" ]; + appendInvestigationBanner(lines, meta); if (findings.length === 0) { lines.push("No material findings."); diff --git a/tests/investigation.test.mjs b/tests/investigation.test.mjs index 2e5be652..691a05c7 100644 --- a/tests/investigation.test.mjs +++ b/tests/investigation.test.mjs @@ -513,3 +513,79 @@ test("outputSchema-set finalize turn produces schema-conformant final message", fake.close(); } }); + +// ------------------------------------------------------------------- +// Task 5: renderer truncation banner +// ------------------------------------------------------------------- + +import { renderReviewResult } from "../plugins/codex/scripts/lib/render.mjs"; + +test("renderer prepends truncation banner when investigation.truncated is true", () => { + const parsed = { + parsed: { + verdict: "needs-attention", + summary: "Risk identified.", + findings: [], + next_steps: [] + } + }; + const out = renderReviewResult(parsed, { + reviewLabel: "Adversarial Review", + targetLabel: "branch:feature", + reasoningSummary: [], + investigation: { turnCount: 10, truncated: true } + }); + assert.match(out, /Investigation truncated at 10 turns; findings may be shallow\./); +}); + +test("renderer omits truncation banner when investigation is null or not truncated", () => { + const parsed = { + parsed: { + verdict: "approve", + summary: "Looks fine.", + findings: [], + next_steps: [] + } + }; + const outNull = renderReviewResult(parsed, { + reviewLabel: "Adversarial Review", + targetLabel: "branch:feature", + reasoningSummary: [], + investigation: null + }); + assert.doesNotMatch(outNull, /Investigation truncated/); + + const outOk = renderReviewResult(parsed, { + reviewLabel: "Adversarial Review", + targetLabel: "branch:feature", + reasoningSummary: [], + investigation: { turnCount: 4, truncated: false } + }); + assert.doesNotMatch(outOk, /Investigation truncated/); +}); + +test("renderer shows truncation banner on parse-error and validation-error paths", () => { + const parseErrorOut = renderReviewResult( + { parsed: null, parseError: "Unexpected token", rawOutput: "{not json" }, + { + reviewLabel: "Adversarial Review", + targetLabel: "branch:feature", + reasoningSummary: [], + investigation: { turnCount: 10, truncated: true } + } + ); + assert.match(parseErrorOut, /Investigation truncated at 10 turns/, + "banner must appear when output is unparseable AND investigation was truncated"); + + const validationErrorOut = renderReviewResult( + { parsed: { not: "review-shaped" } }, + { + reviewLabel: "Adversarial Review", + targetLabel: "branch:feature", + reasoningSummary: [], + investigation: { turnCount: 10, truncated: true } + } + ); + assert.match(validationErrorOut, /Investigation truncated at 10 turns/, + "banner must appear when output has wrong shape AND investigation was truncated"); +}); From 93609a5a85aa18da563b21be4d2e7dd435421311 Mon Sep 17 00:00:00 2001 From: Kent Date: Sun, 17 May 2026 23:28:31 +0800 Subject: [PATCH 09/19] feat(adversarial-review): plumb --max-investigation-turns CLI flag Co-Authored-By: Claude Opus 4.7 (1M context) --- plugins/codex/scripts/codex-companion.mjs | 13 ++++- tests/investigation.test.mjs | 62 +++++++++++++++++++++++ 2 files changed, 74 insertions(+), 1 deletion(-) diff --git a/plugins/codex/scripts/codex-companion.mjs b/plugins/codex/scripts/codex-companion.mjs index dfac48a8..8fab4ba6 100644 --- a/plugins/codex/scripts/codex-companion.mjs +++ b/plugins/codex/scripts/codex-companion.mjs @@ -719,13 +719,23 @@ function enqueueBackgroundTask(cwd, job, request) { async function handleReviewCommand(argv, config) { const { options, positionals } = parseCommandInput(argv, { - valueOptions: ["base", "scope", "model", "cwd"], + valueOptions: ["base", "scope", "model", "cwd", "max-investigation-turns"], booleanOptions: ["json", "background", "wait"], aliasMap: { m: "model" } }); + const rawMaxTurns = options["max-investigation-turns"]; + let maxInvestigationTurns; + if (rawMaxTurns !== undefined) { + const parsedTurns = Number.parseInt(rawMaxTurns, 10); + if (!Number.isFinite(parsedTurns) || parsedTurns <= 0) { + throw new Error(`--max-investigation-turns must be a positive integer (got: ${rawMaxTurns})`); + } + maxInvestigationTurns = parsedTurns; + } + const cwd = resolveCommandCwd(options); const workspaceRoot = resolveCommandWorkspace(options); const focusText = positionals.join(" ").trim(); @@ -754,6 +764,7 @@ async function handleReviewCommand(argv, config) { model: options.model, focusText, reviewName: config.reviewName, + maxInvestigationTurns, onProgress: progress }), { json: options.json } diff --git a/tests/investigation.test.mjs b/tests/investigation.test.mjs index 691a05c7..f8c67428 100644 --- a/tests/investigation.test.mjs +++ b/tests/investigation.test.mjs @@ -589,3 +589,65 @@ test("renderer shows truncation banner on parse-error and validation-error paths assert.match(validationErrorOut, /Investigation truncated at 10 turns/, "banner must appear when output has wrong shape AND investigation was truncated"); }); + +// ------------------------------------------------------------------- +// Task 6: --max-investigation-turns CLI flag +// ------------------------------------------------------------------- + +import { parseArgs } from "../plugins/codex/scripts/lib/args.mjs"; + +test("parseArgs accepts --max-investigation-turns as a value option", () => { + const { options } = parseArgs( + ["--base", "main", "--max-investigation-turns", "15", "auth"], + { + valueOptions: ["base", "scope", "model", "cwd", "max-investigation-turns"], + booleanOptions: ["json", "background", "wait"] + } + ); + assert.equal(options["max-investigation-turns"], "15"); +}); + +test("--max-investigation-turns propagates from CLI to runAppServerInvestigation", async () => { + const cwd = makeSelfCollectGitFixture(); + const fake = setupFakeCodex({ cwd }); + try { + for (let i = 0; i < 6; i += 1) { + fake.queueTurnResponse({ + commands: [{ command: "git diff", exitCode: 0 }], + finalAnswer: null + }); + } + fake.queueTurnResponse({ + finalAnswer: { text: JSON.stringify({ verdict: "approve", summary: "ok", findings: [], next_steps: [] }) } + }); + + const result = runCompanion( + ["adversarial-review", + "--base", "main", "--scope", "branch", "--cwd", cwd, + "--max-investigation-turns", "5", + "--json"], + fake.env + ); + + assert.equal(result.status, 0, `expected exit 0, stderr: ${result.stderr}`); + const payload = JSON.parse(result.stdout.trim()); + assert.equal(payload.investigation.turnCount, 5); + assert.equal(payload.investigation.truncated, true); + } finally { + fake.close(); + rmSync(cwd, { recursive: true, force: true }); + } +}); + +test("invalid --max-investigation-turns raises a clear error", () => { + const r = spawnSync("node", + [COMPANION_PATH, "adversarial-review", "--max-investigation-turns", "abc"], + { encoding: "utf8", timeout: 30000 } + ); + assert.notEqual(r.status, 0, "should exit non-zero on invalid flag value"); + assert.match( + r.stderr, + /must be a positive integer/, + "error message should explain the validation failure" + ); +}); From a32d1d11dca26668fd06828de13bd7414a818b93 Mon Sep 17 00:00:00 2001 From: Kent Date: Sun, 17 May 2026 23:28:55 +0800 Subject: [PATCH 10/19] docs(adversarial-review): document --max-investigation-turns Co-Authored-By: Claude Opus 4.7 (1M context) --- plugins/codex/commands/adversarial-review.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/plugins/codex/commands/adversarial-review.md b/plugins/codex/commands/adversarial-review.md index da440ab4..bbe9b945 100644 --- a/plugins/codex/commands/adversarial-review.md +++ b/plugins/codex/commands/adversarial-review.md @@ -1,6 +1,6 @@ --- description: Run a Codex review that challenges the implementation approach and design choices -argument-hint: '[--wait|--background] [--base ] [--scope auto|working-tree|branch] [focus ...]' +argument-hint: '[--wait|--background] [--base ] [--scope auto|working-tree|branch] [--max-investigation-turns N] [focus ...]' disable-model-invocation: true allowed-tools: Read, Glob, Grep, Bash(node:*), Bash(git:*), AskUserQuestion --- @@ -43,6 +43,7 @@ Argument handling: - It supports working-tree review, branch review, and `--base `. - It does not support `--scope staged` or `--scope unstaged`. - Unlike `/codex:review`, it can still take extra focus text after the flags. +- For very large diffs that exceed the inline threshold, Codex investigates the diff with read-only commands across multiple turns. Use `--max-investigation-turns N` (default 10) to raise or lower the cap. Foreground flow: - Run: From 5b767ffbd3bfc6fa6fa001a880f2df65763dfa71 Mon Sep 17 00:00:00 2001 From: Kent Date: Sun, 17 May 2026 23:36:44 +0800 Subject: [PATCH 11/19] test(commands): allow --max-investigation-turns in argument-hint assertion The argument-hint assertion in commands.test.mjs hard-coded a contiguous match of "[--scope ...] [focus ...]". Splitting that into separate matches lets the new flag sit between the two without breaking the test. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/commands.test.mjs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/commands.test.mjs b/tests/commands.test.mjs index 3724ffa4..496a740e 100644 --- a/tests/commands.test.mjs +++ b/tests/commands.test.mjs @@ -49,7 +49,9 @@ test("adversarial review command uses AskUserQuestion and background Bash while assert.match(source, /```bash/); assert.match(source, /```typescript/); assert.match(source, /adversarial-review "\$ARGUMENTS"/); - assert.match(source, /\[--scope auto\|working-tree\|branch\] \[focus \.\.\.\]/); + assert.match(source, /\[--scope auto\|working-tree\|branch\]/); + assert.match(source, /\[--max-investigation-turns N\]/); + assert.match(source, /\[focus \.\.\.\]/); assert.match(source, /run_in_background:\s*true/); assert.match(source, /command:\s*`node "\$\{CLAUDE_PLUGIN_ROOT\}\/scripts\/codex-companion\.mjs" adversarial-review "\$ARGUMENTS"`/); assert.match(source, /description:\s*"Codex adversarial review"/); From 950329c2da1dd7d082d3262e7cd5537dd17e85e5 Mon Sep 17 00:00:00 2001 From: Kent Date: Mon, 18 May 2026 00:03:15 +0800 Subject: [PATCH 12/19] fix(codex): normalize runAppServerInvestigation status to numeric The two transport-error returns in runAppServerInvestigation were emitting { code: 1, kind: "error" } as `status`, while the happy path and runAppServerTurn always return a number. Downstream payload.codex.status inherited this object on error paths but a number elsewhere. Normalizing the error returns to `status: 1` makes the public shape consistent and removes the need for the defensive typeof check at the executeReviewRun seam. Co-Authored-By: Claude Opus 4.7 (1M context) --- plugins/codex/scripts/codex-companion.mjs | 2 +- plugins/codex/scripts/lib/codex.mjs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/plugins/codex/scripts/codex-companion.mjs b/plugins/codex/scripts/codex-companion.mjs index 8fab4ba6..84caf7c9 100644 --- a/plugins/codex/scripts/codex-companion.mjs +++ b/plugins/codex/scripts/codex-companion.mjs @@ -475,7 +475,7 @@ async function executeReviewRun(request) { } return { - exitStatus: typeof result.status === "number" ? result.status : (result.status?.code ?? 1), + exitStatus: result.status, threadId: result.threadId, turnId: result.turnId, payload, diff --git a/plugins/codex/scripts/lib/codex.mjs b/plugins/codex/scripts/lib/codex.mjs index 68773229..9de987ac 100644 --- a/plugins/codex/scripts/lib/codex.mjs +++ b/plugins/codex/scripts/lib/codex.mjs @@ -1088,7 +1088,7 @@ export async function runAppServerInvestigation(cwd, options = {}) { ); } catch (transportError) { return { - status: { code: 1, kind: "error" }, + status: 1, threadId, turnId: null, finalMessage: "", @@ -1163,7 +1163,7 @@ export async function runAppServerInvestigation(cwd, options = {}) { ); } catch (transportError) { return { - status: { code: 1, kind: "error" }, + status: 1, threadId, turnId: null, finalMessage: "", From 0981f6dd3e4246f8727a8c3b6038bcd496a77b9d Mon Sep 17 00:00:00 2001 From: Kent Date: Mon, 18 May 2026 00:03:22 +0800 Subject: [PATCH 13/19] test(investigation): cover phase-2 finalize transport error path Adds a unit test that runs both recon turns to convergence, then queues a transport error on the finalize turn. Verifies that the investigation metadata (turnCount=2, truncated=false) is preserved even when finalize fails, and that the numeric status contract holds. Also tightens the existing phase-1 hard-error assertion with the status check. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/investigation.test.mjs | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tests/investigation.test.mjs b/tests/investigation.test.mjs index f8c67428..0808cd33 100644 --- a/tests/investigation.test.mjs +++ b/tests/investigation.test.mjs @@ -217,6 +217,7 @@ test("phase-1 hard error (transport throw) aborts before phase-2 finalize", asyn assert.ok(result.error, "result should have an error"); assert.match(result.error.message, /ECONNRESET/); assert.equal(result.investigation.turnCount, 1, "hard error returns BEFORE incrementing turnCount"); + assert.equal(result.status, 1, "transport-error path returns numeric status"); const requests = fake.requests; const starts = requests.filter((r) => r.method === "turn/start"); @@ -226,6 +227,41 @@ test("phase-1 hard error (transport throw) aborts before phase-2 finalize", asyn } }); +test("phase-2 finalize transport error preserves investigation metadata", async () => { + const cwd = makeTempDir("codex-inv-test-"); + const fake = setupFakeCodex({ cwd }); + try { + // Recon turn 1: commands + fake.queueTurnResponse({ + commands: [{ command: "git diff", exitCode: 0 }] + }); + // Recon turn 2: converge with final answer + no commands + fake.queueTurnResponse({ + commands: [], + finalAnswer: { text: "Investigation done." } + }); + // Finalize turn: RPC error + fake.queueTurnRpcError({ message: "ETIMEDOUT during finalize" }); + + const result = await runAppServerInvestigation(fake.cwd, { + investigatePrompt: "Investigate.", + finalizePrompt: "Finalize." + }); + + assert.ok(result.error, "result should carry the finalize error"); + assert.match(result.error.message, /ETIMEDOUT during finalize/); + assert.equal(result.investigation.turnCount, 2, "investigation completed both recon turns before finalize failed"); + assert.equal(result.investigation.truncated, false, "investigation converged; not truncated"); + assert.equal(result.status, 1, "finalize-error path returns numeric status"); + + const requests = fake.requests; + const starts = requests.filter((r) => r.method === "turn/start"); + assert.equal(starts.length, 3, "2 recon + 1 finalize attempted"); + } finally { + fake.close(); + } +}); + test("converges with zero commands flags truncated=true", async () => { const cwd = makeTempDir("codex-inv-test-"); const fake = setupFakeCodex({ cwd }); From 6992596928ca92e9c0eac2cee05c633f58d103cf Mon Sep 17 00:00:00 2001 From: Kent Date: Mon, 18 May 2026 00:03:33 +0800 Subject: [PATCH 14/19] feat(adversarial-review): point to --max-investigation-turns in banner Extend the truncation banner to tell the user how to raise the cap. The banner only fires when investigation hit the turn cap, which is exactly when the flag is actionable. Co-Authored-By: Claude Opus 4.7 (1M context) --- plugins/codex/scripts/lib/render.mjs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/plugins/codex/scripts/lib/render.mjs b/plugins/codex/scripts/lib/render.mjs index 9b578b60..f4591c65 100644 --- a/plugins/codex/scripts/lib/render.mjs +++ b/plugins/codex/scripts/lib/render.mjs @@ -211,7 +211,10 @@ export function renderSetupReport(report) { function appendInvestigationBanner(lines, meta) { if (meta.investigation?.truncated === true) { const turns = meta.investigation.turnCount ?? "?"; - lines.push(`Investigation truncated at ${turns} turns; findings may be shallow.`, ""); + lines.push( + `Investigation truncated at ${turns} turns; findings may be shallow. Use --max-investigation-turns to raise the cap.`, + "" + ); } } From 3de18953bf348037cdfa55877a89e4a7913afd8f Mon Sep 17 00:00:00 2001 From: Kent Date: Mon, 18 May 2026 09:58:59 +0800 Subject: [PATCH 15/19] fix(adversarial-review): tighten --max-investigation-turns validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Number.parseInt silently salvages "1.5" as 1, "10abc" as 10, " 5" as 5, and "1e2" via Number() as 100 — all of which violate the flag contract of "positive integer literal" and would silently change the depth of investigation without any error signal. Replace the parseInt-based check with a strict regex /^[1-9][0-9]*$/ so typos throw instead of degrading review depth. Reported by bot review on PR #328. Co-Authored-By: Claude Opus 4.7 (1M context) --- plugins/codex/scripts/codex-companion.mjs | 5 ++--- tests/investigation.test.mjs | 25 +++++++++++++++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/plugins/codex/scripts/codex-companion.mjs b/plugins/codex/scripts/codex-companion.mjs index 84caf7c9..78e65c3d 100644 --- a/plugins/codex/scripts/codex-companion.mjs +++ b/plugins/codex/scripts/codex-companion.mjs @@ -729,11 +729,10 @@ async function handleReviewCommand(argv, config) { const rawMaxTurns = options["max-investigation-turns"]; let maxInvestigationTurns; if (rawMaxTurns !== undefined) { - const parsedTurns = Number.parseInt(rawMaxTurns, 10); - if (!Number.isFinite(parsedTurns) || parsedTurns <= 0) { + if (!/^[1-9][0-9]*$/.test(String(rawMaxTurns))) { throw new Error(`--max-investigation-turns must be a positive integer (got: ${rawMaxTurns})`); } - maxInvestigationTurns = parsedTurns; + maxInvestigationTurns = Number(rawMaxTurns); } const cwd = resolveCommandCwd(options); diff --git a/tests/investigation.test.mjs b/tests/investigation.test.mjs index 0808cd33..f0ed7002 100644 --- a/tests/investigation.test.mjs +++ b/tests/investigation.test.mjs @@ -687,3 +687,28 @@ test("invalid --max-investigation-turns raises a clear error", () => { "error message should explain the validation failure" ); }); + +test("--max-investigation-turns rejects malformed numeric tokens", () => { + // parseInt-style salvage must NOT accept these — they are typos, not valid input. + const cases = [ + "1.5", // parseInt would yield 1 + "10abc", // parseInt would yield 10 + "1e2", // exponential notation: Number(...) yields 100, but the contract is "integer literal" + " 5", // leading whitespace from accidental quoting + "0", // zero is not positive + "-3", // negative integer + "" // empty string + ]; + for (const value of cases) { + const r = spawnSync("node", + [COMPANION_PATH, "adversarial-review", "--max-investigation-turns", value], + { encoding: "utf8", timeout: 30000 } + ); + assert.notEqual(r.status, 0, `value ${JSON.stringify(value)} should exit non-zero`); + assert.match( + r.stderr, + /must be a positive integer/, + `value ${JSON.stringify(value)} should trigger the validation error` + ); + } +}); From daac52f8d6499f48ea2698e229c9e8987c072537 Mon Sep 17 00:00:00 2001 From: Kent Date: Mon, 18 May 2026 11:50:00 +0800 Subject: [PATCH 16/19] fix(adversarial-review): converge on any 0-command turn with an agent message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The legacy convergence check required `phase: "final_answer"` on the agent message, but recon turns run with outputSchema=null and the app-server does not always tag messages with that phase. Production runs hit the cap with the model emitting "My investigation is complete" on every turn — visible evidence the loop was ignoring the contract the investigate prompt teaches the model. Treat any 0-command turn that emits an agent message as convergence; this matches the prompt and unblocks the runaway-loop scenario. Test fixture: make finalAnswer.phase overridable so we can simulate the real-world case where recon agent messages lack the final_answer tag. Co-Authored-By: Claude Opus 4.7 (1M context) --- plugins/codex/scripts/lib/codex.mjs | 10 ++++++- tests/fake-codex-fixture.mjs | 3 ++- tests/investigation.test.mjs | 41 +++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 2 deletions(-) diff --git a/plugins/codex/scripts/lib/codex.mjs b/plugins/codex/scripts/lib/codex.mjs index 9de987ac..1c6a6b95 100644 --- a/plugins/codex/scripts/lib/codex.mjs +++ b/plugins/codex/scripts/lib/codex.mjs @@ -1130,7 +1130,15 @@ export async function runAppServerInvestigation(cwd, options = {}) { }; } - const converged = turnState.finalAnswerSeen === true && turnCommandCount === 0; + // Convergence: a turn that produces no commands and emits an agent + // message is the contract the investigate prompt teaches the model + // ("a summary message with no further command calls signals readiness"). + // The legacy check required `phase: "final_answer"`, but recon turns + // run with outputSchema=null so the app-server does not always tag + // messages with that phase — leading to runaway turns where the model + // keeps insisting it has converged but the loop refuses to listen. + const turnHadAgentMessage = Boolean(turnState.lastAgentMessage); + const converged = turnCommandCount === 0 && turnHadAgentMessage; if (converged) { break; } diff --git a/tests/fake-codex-fixture.mjs b/tests/fake-codex-fixture.mjs index bca51796..e1ac99cd 100644 --- a/tests/fake-codex-fixture.mjs +++ b/tests/fake-codex-fixture.mjs @@ -397,7 +397,8 @@ rl.on("line", (line) => { } if (entry && entry.finalAnswer) { - send({ method: "item/completed", params: { threadId: thread.id, turnId, item: { type: "agentMessage", id: "msg_" + turnId, text: entry.finalAnswer.text, phase: "final_answer" } } }); + const phase = entry.finalAnswer.phase ?? "final_answer"; + send({ method: "item/completed", params: { threadId: thread.id, turnId, item: { type: "agentMessage", id: "msg_" + turnId, text: entry.finalAnswer.text, phase } } }); } if (entry && entry.turnError) { diff --git a/tests/investigation.test.mjs b/tests/investigation.test.mjs index f0ed7002..984bb07d 100644 --- a/tests/investigation.test.mjs +++ b/tests/investigation.test.mjs @@ -65,6 +65,47 @@ test("converges when Codex emits a final-answer turn with no commands", async () } }); +test("converges when agentMessage has no `final_answer` phase tag (real-world case)", async () => { + // In production, recon turns run with outputSchema=null, and the + // app-server does NOT always tag agent messages with phase="final_answer". + // The convergence detector must treat any 0-command turn that emits an + // agent message as convergence — otherwise the model keeps insisting it + // has converged but the loop refuses to stop. + const cwd = makeTempDir("codex-inv-test-"); + const fake = setupFakeCodex({ cwd }); + try { + fake.queueTurnResponse({ + commands: [{ command: "git diff HEAD~1", exitCode: 0 }], + finalAnswer: null + }); + // Recon turn 2: no commands, agent message WITHOUT final_answer phase + // => must still converge (this is what real codex sends in recon mode). + fake.queueTurnResponse({ + commands: [], + finalAnswer: { text: "My investigation is complete.", phase: "agent_message" } + }); + fake.queueTurnResponse({ + finalAnswer: { text: STRUCTURED_REVIEW } + }); + + const result = await runAppServerInvestigation(fake.cwd, { + investigatePrompt: "Investigate.", + finalizePrompt: "Finalize.", + outputSchema: { type: "object", required: ["verdict"] } + }); + + assert.equal(result.investigation.turnCount, 2, + "convergence on the no-command + agentMessage turn must not be missed"); + assert.equal(result.investigation.truncated, false); + + const requests = fake.requests; + const starts = requests.filter((r) => r.method === "turn/start"); + assert.equal(starts.length, 3, "2 recon + 1 finalize"); + } finally { + fake.close(); + } +}); + test("respects maxInvestigationTurns and marks truncated", async () => { const cwd = makeTempDir("codex-inv-test-"); const fake = setupFakeCodex({ cwd }); From a7a90c7d731ea2e5b61899d9e45e75d9abc4d111 Mon Sep 17 00:00:00 2001 From: Kent Date: Mon, 18 May 2026 15:30:18 +0800 Subject: [PATCH 17/19] fix(adversarial-review): retry finalize turn if the model runs commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Production observation: even with a clean recon convergence, the model sometimes emits a tool-call stub (e.g. {"cmd": "wc -l README.md"}) during the finalize turn instead of the structured review JSON. The companion's parser then surfaces "Validation error: Missing string verdict" with the raw stub as the final message — a confusing failure for the user when the investigation produced real findings. Fix in two layers: 1. Tighten the finalize prompt: explicitly forbid running commands or emitting anything other than the schema-conforming JSON. 2. Detect the contract violation in the orchestrator: if the finalize turn's commandExecutions is non-empty, retry once with a STRICT FINALIZE preamble that reiterates the no-tool-call rule. After two attempts, surface whatever the model produced so the parser can still flag malformed output instead of looping forever. Test fixes: - Adapt `respects maxInvestigationTurns` so the 4th queued entry is the pure-JSON finalize turn (previous version queued 4 cmd-only entries before finalize, accidentally relying on the old leniency). - Add coverage for the new retry path: success-on-second-attempt and give-up-after-two-attempts. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../prompts/adversarial-review-finalize.md | 3 +- plugins/codex/scripts/lib/codex.mjs | 94 ++++++++++++------ tests/investigation.test.mjs | 98 ++++++++++++++++++- 3 files changed, 162 insertions(+), 33 deletions(-) diff --git a/plugins/codex/prompts/adversarial-review-finalize.md b/plugins/codex/prompts/adversarial-review-finalize.md index 15c6c11e..c4d0097b 100644 --- a/plugins/codex/prompts/adversarial-review-finalize.md +++ b/plugins/codex/prompts/adversarial-review-finalize.md @@ -19,7 +19,8 @@ A finding should answer: -Return only valid JSON matching the provided schema. +This is the finalization turn. Do NOT run any shell commands or tool calls in this turn — your investigation is already complete and you have all the context you need from the prior turns of this thread. +Return only valid JSON matching the provided schema. Your entire output must be that JSON — no prose before or after, no shell commands, no tool-call payloads. Keep the output compact and specific. Use `needs-attention` if there is any material risk worth blocking on. Use `approve` only if you cannot support any substantive adversarial finding from your investigation. diff --git a/plugins/codex/scripts/lib/codex.mjs b/plugins/codex/scripts/lib/codex.mjs index 1c6a6b95..269f57d7 100644 --- a/plugins/codex/scripts/lib/codex.mjs +++ b/plugins/codex/scripts/lib/codex.mjs @@ -1154,36 +1154,72 @@ export async function runAppServerInvestigation(cwd, options = {}) { emitProgress(options.onProgress, "Investigation complete; finalizing structured output.", "finalizing"); + // The finalize turn is supposed to emit only the structured JSON. In + // practice the model sometimes emits a tool-call stub instead (e.g. + // {"cmd": "wc -l ..."}) — if any commands ran during finalize, treat + // that as a contract violation and retry once with a sharper prompt. + const STRICT_FINALIZE_REMINDER = + "STRICT FINALIZE: do not run any shell commands. Output ONLY the JSON " + + "matching the schema, with no prose, no tool calls, and nothing else.\n\n"; let finalizeState; - try { - finalizeState = await captureTurn( - client, - threadId, - () => - client.request("turn/start", { - threadId, - input: buildTurnInput(finalizePrompt), - model: options.model ?? null, - effort: options.effort ?? null, - outputSchema: options.outputSchema ?? null - }), - { onProgress: options.onProgress } - ); - } catch (transportError) { - return { - status: 1, - threadId, - turnId: null, - finalMessage: "", - reasoningSummary: [], - turn: null, - error: { message: transportError?.message ?? String(transportError) }, - stderr: cleanCodexStderr(client.stderr), - fileChanges: aggregatedFileChanges, - touchedFiles: collectTouchedFiles(aggregatedFileChanges), - commandExecutions: aggregatedCommandExecutions, - investigation: { turnCount, truncated } - }; + let finalizeAttempts = 0; + const MAX_FINALIZE_ATTEMPTS = 2; + while (finalizeAttempts < MAX_FINALIZE_ATTEMPTS) { + finalizeAttempts += 1; + const promptText = finalizeAttempts === 1 + ? finalizePrompt + : STRICT_FINALIZE_REMINDER + finalizePrompt; + try { + finalizeState = await captureTurn( + client, + threadId, + () => + client.request("turn/start", { + threadId, + input: buildTurnInput(promptText), + model: options.model ?? null, + effort: options.effort ?? null, + outputSchema: options.outputSchema ?? null + }), + { onProgress: options.onProgress } + ); + } catch (transportError) { + return { + status: 1, + threadId, + turnId: null, + finalMessage: "", + reasoningSummary: [], + turn: null, + error: { message: transportError?.message ?? String(transportError) }, + stderr: cleanCodexStderr(client.stderr), + fileChanges: aggregatedFileChanges, + touchedFiles: collectTouchedFiles(aggregatedFileChanges), + commandExecutions: aggregatedCommandExecutions, + investigation: { turnCount, truncated } + }; + } + + // If the finalize turn ran commands, the model violated the contract. + // Retry once with a stricter prompt; if it still fails, accept the + // (likely-malformed) output and let the parser surface the error. + if (finalizeState.commandExecutions.length === 0) { + break; + } + if (finalizeAttempts < MAX_FINALIZE_ATTEMPTS) { + emitProgress( + options.onProgress, + "Finalize turn ran commands; retrying with stricter prompt.", + "finalizing" + ); + } + // Aggregate the wasted commands so the caller still sees what happened. + for (const cmd of finalizeState.commandExecutions) { + aggregatedCommandExecutions.push(cmd); + } + for (const change of finalizeState.fileChanges) { + aggregatedFileChanges.push(change); + } } for (const cmd of finalizeState.commandExecutions) { diff --git a/tests/investigation.test.mjs b/tests/investigation.test.mjs index 984bb07d..e8cab087 100644 --- a/tests/investigation.test.mjs +++ b/tests/investigation.test.mjs @@ -110,13 +110,14 @@ test("respects maxInvestigationTurns and marks truncated", async () => { const cwd = makeTempDir("codex-inv-test-"); const fake = setupFakeCodex({ cwd }); try { - // Queue 4 recon turns: all have commands, no final answer - for (let i = 0; i < 4; i++) { + // Queue 3 recon turns: all have commands, no final answer. + // (With maxInvestigationTurns=3 the loop will exhaust here.) + for (let i = 0; i < 3; i++) { fake.queueTurnResponse({ commands: [{ command: `check-${i}`, exitCode: 0 }] }); } - // Finalize turn + // Finalize turn: pure JSON, zero commands. fake.queueTurnResponse({ finalAnswer: { text: APPROVE_REVIEW } }); @@ -268,6 +269,97 @@ test("phase-1 hard error (transport throw) aborts before phase-2 finalize", asyn } }); +test("finalize turn that runs commands triggers one strict-prompt retry", async () => { + // Production observation: the model occasionally emits a tool-call stub + // during finalize (e.g. {"cmd": "wc -l ..."}) instead of the structured + // JSON. When that happens the finalize turn has commandExecutions.length > 0. + // The orchestrator should detect this contract violation and retry once + // with a stricter prompt. + const cwd = makeTempDir("codex-inv-test-"); + const fake = setupFakeCodex({ cwd }); + try { + // Recon: converge. + fake.queueTurnResponse({ + commands: [], + finalAnswer: { text: "Investigation done." } + }); + // First finalize attempt: model misbehaves and runs a command. + fake.queueTurnResponse({ + commands: [{ command: "wc -l README.md", exitCode: 0 }], + finalAnswer: { text: "{\"cmd\":\"wc -l README.md\"}" } + }); + // Second finalize attempt: model behaves and emits proper JSON. + fake.queueTurnResponse({ + commands: [], + finalAnswer: { text: APPROVE_REVIEW } + }); + + const result = await runAppServerInvestigation(fake.cwd, { + investigatePrompt: "Investigate.", + finalizePrompt: "Finalize.", + outputSchema: { type: "object", required: ["verdict"] } + }); + + assert.equal(result.finalMessage, APPROVE_REVIEW, + "the second (well-behaved) finalize attempt should be the final message"); + const requests = fake.requests; + const starts = requests.filter((r) => r.method === "turn/start"); + assert.equal(starts.length, 3, "1 recon + 2 finalize attempts"); + + // The retry must use a stricter prompt distinct from the first try. + const finalizeStarts = starts.slice(1); // first start is recon + assert.match( + finalizeStarts[1].params.input?.[0]?.text ?? "", + /STRICT FINALIZE/, + "retry prompt must include the stricter directive" + ); + assert.doesNotMatch( + finalizeStarts[0].params.input?.[0]?.text ?? "", + /STRICT FINALIZE/, + "first finalize attempt uses the normal prompt" + ); + } finally { + fake.close(); + } +}); + +test("finalize retry gives up after the second attempt and surfaces the output", async () => { + // If the model misbehaves twice in a row, the orchestrator must not loop + // forever — it should accept the second attempt's output and let the + // upstream parser produce a useful validation error. + const cwd = makeTempDir("codex-inv-test-"); + const fake = setupFakeCodex({ cwd }); + try { + fake.queueTurnResponse({ + commands: [], + finalAnswer: { text: "Investigation done." } + }); + // Both finalize attempts misbehave. + fake.queueTurnResponse({ + commands: [{ command: "wc -l a", exitCode: 0 }], + finalAnswer: { text: "{\"cmd\":\"wc -l a\"}" } + }); + fake.queueTurnResponse({ + commands: [{ command: "wc -l b", exitCode: 0 }], + finalAnswer: { text: "{\"cmd\":\"wc -l b\"}" } + }); + + const result = await runAppServerInvestigation(fake.cwd, { + investigatePrompt: "Investigate.", + finalizePrompt: "Finalize.", + outputSchema: { type: "object", required: ["verdict"] } + }); + + const requests = fake.requests; + const starts = requests.filter((r) => r.method === "turn/start"); + assert.equal(starts.length, 3, "1 recon + 2 finalize attempts only — no infinite loop"); + assert.equal(result.finalMessage, "{\"cmd\":\"wc -l b\"}", + "the second-attempt output is surfaced so the parser can flag it"); + } finally { + fake.close(); + } +}); + test("phase-2 finalize transport error preserves investigation metadata", async () => { const cwd = makeTempDir("codex-inv-test-"); const fake = setupFakeCodex({ cwd }); From 324836e24bc2b78757dc56db5b983367032cc2cd Mon Sep 17 00:00:00 2001 From: Kent Date: Mon, 18 May 2026 15:41:14 +0800 Subject: [PATCH 18/19] fix(adversarial-review): narrow inline-diff path to single-file reviews MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Production observation: a 2-file untracked review of two large markdown docs (~100KB total) hit the inline-diff path, embedded both files into a single-turn schema-pinned prompt, and the model responded with a tool-call stub `{"cmd": "wc -l ..."}` instead of the review JSON. The companion's parser then surfaced "Validation error: Missing string verdict" — the bug PR #328 thought it had fixed, except inline-diff is a separate code path from self-collect and was never touched. Two layers: 1. Lower the inline cap from 2 files to 1. The inline path embeds full file contents into a single turn with outputSchema pinned and has no recovery if the model wants to investigate before producing a verdict. Anything larger than 1 file falls through to the two-phase self-collect path which tolerates exploratory recon turns. 2. Tighten the inline prompt: explicitly forbid running shell commands or emitting tool-call stubs, with the `{"cmd": "..."}` anti-pattern named so the model cannot rationalize emitting one. Tests: - New git.test.mjs case locks 2 files → self-collect. - New investigation.test.mjs cases lock the prompt-contract for both the inline and finalize prompts (no shell commands, no tool-call stubs). Co-Authored-By: Claude Opus 4.7 (1M context) --- plugins/codex/prompts/adversarial-review.md | 3 +- plugins/codex/scripts/lib/git.mjs | 7 ++++- tests/git.test.mjs | 22 ++++++++++++++ tests/investigation.test.mjs | 32 +++++++++++++++++++++ 4 files changed, 62 insertions(+), 2 deletions(-) diff --git a/plugins/codex/prompts/adversarial-review.md b/plugins/codex/prompts/adversarial-review.md index 78668af6..036e66f7 100644 --- a/plugins/codex/prompts/adversarial-review.md +++ b/plugins/codex/prompts/adversarial-review.md @@ -46,7 +46,8 @@ A finding should answer: -Return only valid JSON matching the provided schema. +This is a single-turn review. Do NOT run any shell commands or tool calls — the file contents you need are already embedded in the repository_context block below. Do not emit a tool-use stub like `{"cmd": "..."}` instead of the review; that is not the schema. +Return only valid JSON matching the provided schema. Your entire output must be that JSON — no prose before or after, no shell commands, no tool-call payloads. Keep the output compact and specific. Use `needs-attention` if there is any material risk worth blocking on. Use `approve` only if you cannot support any substantive adversarial finding from the provided context. diff --git a/plugins/codex/scripts/lib/git.mjs b/plugins/codex/scripts/lib/git.mjs index 1749cfc8..c06c599b 100644 --- a/plugins/codex/scripts/lib/git.mjs +++ b/plugins/codex/scripts/lib/git.mjs @@ -5,7 +5,12 @@ import { isProbablyText } from "./fs.mjs"; import { formatCommandFailure, runCommand, runCommandChecked } from "./process.mjs"; const MAX_UNTRACKED_BYTES = 24 * 1024; -const DEFAULT_INLINE_DIFF_MAX_FILES = 2; +// Inline-diff embeds full file contents into the prompt and pins outputSchema +// on a single turn — there is no recovery if the model wants to investigate +// before producing the verdict. Keep this path narrow: only single-file +// reviews of small diffs use it. Anything larger falls through to the +// two-phase self-collect path which can tolerate exploratory turns. +const DEFAULT_INLINE_DIFF_MAX_FILES = 1; const DEFAULT_INLINE_DIFF_MAX_BYTES = 256 * 1024; function git(cwd, args, options = {}) { diff --git a/tests/git.test.mjs b/tests/git.test.mjs index 14ff2576..9b567e4d 100644 --- a/tests/git.test.mjs +++ b/tests/git.test.mjs @@ -86,6 +86,28 @@ test("collectReviewContext keeps inline diffs for tiny adversarial reviews", () assert.match(context.content, /INLINE_MARKER/); }); +test("collectReviewContext routes 2-file changes to self-collect (inline cap is 1)", () => { + // Regression guard: a 2-file change used to slip into inline-diff because + // the cap was 2, embedding both files into a single-turn schema-pinned + // prompt — and the model often responded with a tool-call stub instead + // of the review JSON. Two files now go through the two-phase self-collect + // path which tolerates exploratory turns. + const cwd = makeTempDir(); + initGitRepo(cwd); + fs.writeFileSync(path.join(cwd, "seed.js"), "export const value = 'seed';\n"); + run("git", ["add", "seed.js"], { cwd }); + run("git", ["commit", "-m", "init"], { cwd }); + fs.writeFileSync(path.join(cwd, "doc-one.md"), "# planning doc\n".repeat(50)); + fs.writeFileSync(path.join(cwd, "doc-two.md"), "# spec doc\n".repeat(50)); + + const target = resolveReviewTarget(cwd, {}); + const context = collectReviewContext(cwd, target); + + assert.equal(context.fileCount, 2); + assert.equal(context.inputMode, "self-collect", + "2-file changes must NOT be inlined; they hit the schema-pinned single-turn bug otherwise"); +}); + test("collectReviewContext skips untracked directories in working tree review", () => { const cwd = makeTempDir(); initGitRepo(cwd); diff --git a/tests/investigation.test.mjs b/tests/investigation.test.mjs index e8cab087..9f2f45a3 100644 --- a/tests/investigation.test.mjs +++ b/tests/investigation.test.mjs @@ -845,3 +845,35 @@ test("--max-investigation-turns rejects malformed numeric tokens", () => { ); } }); + +// ------------------------------------------------------------------- +// Prompt contract: inline + finalize prompts must forbid tool-call stubs +// ------------------------------------------------------------------- + +import { readFileSync } from "node:fs"; +import { fileURLToPath as fileURLToPathPromptCheck } from "node:url"; + +const PROMPT_DIR = fileURLToPathPromptCheck( + new URL("../plugins/codex/prompts/", import.meta.url) +); + +test("inline adversarial-review prompt forbids tool-call stub output", () => { + const text = readFileSync(`${PROMPT_DIR}adversarial-review.md`, "utf8"); + // The model used to respond with payloads like {"cmd": "wc -l ..."} + // instead of the review JSON — the prompt must explicitly disallow it + // since this path is single-turn and has no recovery. + assert.match(text, /tool[- ]?call|tool[- ]?use/i, + "inline prompt must mention tool-call/tool-use"); + assert.match(text, /\{"cmd"|stub/i, + "inline prompt must show or name the tool-call stub anti-pattern"); + assert.match(text, /Do NOT run any shell commands/, + "inline prompt must explicitly forbid running shell commands"); +}); + +test("finalize prompt forbids tool-call stub output", () => { + const text = readFileSync(`${PROMPT_DIR}adversarial-review-finalize.md`, "utf8"); + assert.match(text, /Do NOT run any shell commands/, + "finalize prompt must explicitly forbid running shell commands"); + assert.match(text, /no tool-call payloads|no shell commands/i, + "finalize prompt must spell out the no-tool-call rule"); +}); From d222542840a38f0325ad7ed631cba433cca68885 Mon Sep 17 00:00:00 2001 From: Kent Date: Mon, 18 May 2026 20:14:15 +0800 Subject: [PATCH 19/19] fix(adversarial-review): force status=1 when investigation turn errors When captureTurn records an error notification but the app-server still emits turn/completed with status="completed", buildResultStatus was returning 0. That made /codex:adversarial-review exit 0 even though no valid review JSON was produced (only parseError), so CI/automation could treat a failed run as success. Honor turnState.error in buildResultStatus so all four call sites (review, turn, investigation soft-error branch, finalize) report a non-zero status whenever an error was observed. Strengthen the existing soft-error test to assert result.status === 1. Co-Authored-By: Claude Opus 4.7 (1M context) --- plugins/codex/scripts/lib/codex.mjs | 3 +++ tests/investigation.test.mjs | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/plugins/codex/scripts/lib/codex.mjs b/plugins/codex/scripts/lib/codex.mjs index 269f57d7..9a64f530 100644 --- a/plugins/codex/scripts/lib/codex.mjs +++ b/plugins/codex/scripts/lib/codex.mjs @@ -658,6 +658,9 @@ async function resumeThread(client, threadId, cwd, options = {}) { } function buildResultStatus(turnState) { + if (turnState.error) { + return 1; + } return turnState.finalTurn?.status === "completed" ? 0 : 1; } diff --git a/tests/investigation.test.mjs b/tests/investigation.test.mjs index 9f2f45a3..4ee571ec 100644 --- a/tests/investigation.test.mjs +++ b/tests/investigation.test.mjs @@ -231,6 +231,10 @@ test("phase-1 soft error (turn/failed) aborts before phase-2 finalize", async () assert.ok(result.error, "result should have an error"); assert.match(result.error.message, /model produced unrenderable response/); assert.equal(result.investigation.turnCount, 2, "soft-error turn IS counted"); + // A turn/completed with status="completed" can still arrive after an + // error notification, so result.status must be derived from the error, + // not from finalTurn.status. CI/automation relies on this. + assert.equal(result.status, 1, "soft-error path returns numeric status 1"); const requests = fake.requests; const starts = requests.filter((r) => r.method === "turn/start");