diff --git a/README.md b/README.md index d4cba76..e1b98bf 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,7 @@ The result is not a magical mind meld. It is a practical workflow: the agents on - Stale Claude frontend attachments are probed and evicted instead of blocking a new Claude session indefinitely. - Codex turns have a watchdog fallback, and the viewer/status model now distinguishes idle, busy, stale, and offline agent states. - Codex resume/fork argument handling, active thread tracking, and app-server port cleanup are more robust, including LISTEN-only and Windows-aware process checks. +- Reliability guards now bound backup-agent timeout cleanup, daemon shutdown steps, and long-running Codex turns with visible ledger/status telemetry. - The browser viewer remains the **Command Deck**: a read-only, color-coded, latest-first dashboard with clear history controls, task lanes, artifacts, policy state, and connection health. ## What It Helps With @@ -724,7 +725,7 @@ Business value: the viewer and `ctxrelay status` no longer make a completed or s Technical shape: -- Codex exposes `idle`, `busy`, `stale`, and `offline` state. A watchdog force-clears a Codex turn only after a configurable silence window, emits a visible forced-completion marker, and lets Claude send again. +- Codex exposes `idle`, `busy`, `stale`, and `offline` state. Watchdogs force-clear a Codex turn after either a configurable silence window or a configurable per-turn wall-clock budget, emit visible markers, and let Claude send again. - Claude exposes `idle`, `expected`, `stale`, and `offline` state. Claude-owned lanes become stale after a configurable response timeout. - A new Claude frontend does not blindly replace a live session. The daemon probes the incumbent frontend first; if it responds, the newcomer is rejected, and if it does not, the stale frontend is evicted with close code `4002`. @@ -911,8 +912,11 @@ The exact runtime state path can be changed with `CONTEXTRELAY_STATE_DIR`. Norma | `CONTEXTRELAY_CLAUDE_PROBE_TIMEOUT_MS` | `3000` | How long the daemon waits for an attached Claude frontend to answer a liveness probe before evicting it as stale. Set to `0` to disable probe eviction and keep the older reject-only behavior. | | `CONTEXTRELAY_CLAUDE_RESPONSE_TIMEOUT_MS` | `300000` | How long a Claude-owned active task lane can remain unanswered before the task board marks it stale. | | `CONTEXTRELAY_CODEX_TURN_IDLE_TIMEOUT_MS` | `300000` | Silence window before a stuck Codex turn is force-cleared and reported as a forced completion. | +| `CONTEXTRELAY_TURN_MAX_MS` | `300000` | Wall-clock budget for a single Codex turn before that turn is cleared from the busy set and reported as a turn watchdog event. Set to `0` to disable this guard. | | `CONTEXTRELAY_MAX_DEPTH` | `3` | Maximum relay recursion depth. | | `CONTEXTRELAY_BACKUP_THROTTLE_MS` | `60000` | Minimum delay between backup starts for the same target. | +| `CONTEXTRELAY_BACKUP_KILL_GRACE_MS` | `2000` | Grace period between backup timeout SIGTERM and SIGKILL escalation. | +| `CONTEXTRELAY_DAEMON_SHUTDOWN_STEP_TIMEOUT_MS` | `1500` | Per-step deadline for daemon shutdown cleanup before recording the step as timed out and continuing shutdown. | | `CONTEXTRELAY_MAX_CONTROL_MESSAGE_BYTES` | `1000000` | Maximum accepted control WebSocket message size. | | `CONTEXTRELAY_MAX_CONTROL_MESSAGES_PER_MINUTE` | `120` | Per-control-connection rate limit. | | `CONTEXTRELAY_DAEMON_ENTRY` | bundled daemon | Plugin daemon entry. Overrides require `CONTEXTRELAY_ALLOW_DAEMON_ENTRY_OVERRIDE=1`. | diff --git a/dist/cli.js b/dist/cli.js index 222e6e3..376087c 100755 --- a/dist/cli.js +++ b/dist/cli.js @@ -2867,6 +2867,9 @@ function formatRuntimeEvent(event) { event.path ? `path: ${event.path}` : undefined, event.method ? `method: ${event.method}` : undefined, typeof event.exitCode === "number" ? `exit_code: ${event.exitCode}` : undefined, + typeof event.durationMs === "number" ? `duration_ms: ${event.durationMs}` : undefined, + event.requestId ? `request_id: ${event.requestId}` : undefined, + event.backupTarget ? `backup_target: ${event.backupTarget}` : undefined, event.detail ].filter(Boolean); return details.join(` diff --git a/plugins/contextrelay/server/daemon.js b/plugins/contextrelay/server/daemon.js index 14f88e7..a023720 100755 --- a/plugins/contextrelay/server/daemon.js +++ b/plugins/contextrelay/server/daemon.js @@ -334,6 +334,7 @@ class CodexAdapter extends EventEmitter { localAuthToken; stateDir; turnIdleTimeoutMs; + turnMaxMs; static RESPONSE_TRACKING_TTL_MS = 30000; proc = null; appServerWs = null; @@ -352,8 +353,11 @@ class CodexAdapter extends EventEmitter { activeTurnIds = new Set; turnInProgress = false; turnWatchdogTimer = null; + turnMaxTimers = new Map; + turnStartedAt = new Map; lastCodexActivityAt = Date.now(); forcedTurnResetAt = 0; + lastTurnWatchdog = null; nextProxyId = 1e5; upstreamToClient = new Map; serverRequestToProxy = new Map; @@ -366,11 +370,12 @@ class CodexAdapter extends EventEmitter { reconnectingForNewSession = false; replayingBufferedMessages = false; appServerGeneration = 0; - constructor(appPort = 4500, proxyPort = 4501, logFile = new StateDirResolver().logFile, localAuthToken = "", stateDir = new StateDirResolver, turnIdleTimeoutMs = envInt("CONTEXTRELAY_CODEX_TURN_IDLE_TIMEOUT_MS", "AGENTBRIDGE_CODEX_TURN_IDLE_TIMEOUT_MS", 300000)) { + constructor(appPort = 4500, proxyPort = 4501, logFile = new StateDirResolver().logFile, localAuthToken = "", stateDir = new StateDirResolver, turnIdleTimeoutMs = envInt("CONTEXTRELAY_CODEX_TURN_IDLE_TIMEOUT_MS", "AGENTBRIDGE_CODEX_TURN_IDLE_TIMEOUT_MS", 300000), turnMaxMs = envInt("CONTEXTRELAY_TURN_MAX_MS", "AGENTBRIDGE_TURN_MAX_MS", 300000)) { super(); this.localAuthToken = localAuthToken; this.stateDir = stateDir; this.turnIdleTimeoutMs = turnIdleTimeoutMs; + this.turnMaxMs = turnMaxMs; this.appPort = appPort; this.proxyPort = proxyPort; this.logFile = logFile; @@ -387,6 +392,9 @@ class CodexAdapter extends EventEmitter { get lastForcedTurnResetAt() { return this.forcedTurnResetAt; } + get lastTurnWatchdogSummary() { + return this.lastTurnWatchdog; + } static buildPortListenLsofArgs(port) { return ["-ti", `tcp:${port}`, "-sTCP:LISTEN"]; } @@ -432,6 +440,7 @@ class CodexAdapter extends EventEmitter { this.proxyServer = null; this.clearResponseTrackingState(); this.clearTurnWatchdog(); + this.clearAllTurnMaxWatchdogs(); } stop() { this.intentionalDisconnect = true; @@ -553,6 +562,7 @@ class CodexAdapter extends EventEmitter { this.activeTurnIds.clear(); this.turnInProgress = false; this.clearTurnWatchdog(); + this.clearAllTurnMaxWatchdogs(); try { await this.connectToAppServer(false); this.log("App-server reconnected for new TUI session \u2014 replaying buffered messages"); @@ -1184,11 +1194,13 @@ class CodexAdapter extends EventEmitter { markTurnStarted(turnId) { const wasInProgress = this.turnInProgress; this.noteCodexActivity(); + const normalizedTurnId = typeof turnId === "string" && turnId.length > 0 ? turnId : `unknown:${Date.now()}`; if (typeof turnId === "string" && turnId.length > 0) { - this.activeTurnIds.add(turnId); + this.activeTurnIds.add(normalizedTurnId); } else { - this.activeTurnIds.add(`unknown:${Date.now()}`); + this.activeTurnIds.add(normalizedTurnId); } + this.scheduleTurnMaxWatchdog(normalizedTurnId); this.turnInProgress = this.activeTurnIds.size > 0; if (!wasInProgress && this.turnInProgress) { this.emit("turnStarted"); @@ -1198,8 +1210,10 @@ class CodexAdapter extends EventEmitter { markTurnCompleted(turnId) { if (typeof turnId === "string" && turnId.length > 0) { this.activeTurnIds.delete(turnId); + this.clearTurnMaxWatchdog(turnId); } else { this.activeTurnIds.clear(); + this.clearAllTurnMaxWatchdogs(); } this.turnInProgress = this.activeTurnIds.size > 0; if (this.turnInProgress) { @@ -1231,6 +1245,7 @@ class CodexAdapter extends EventEmitter { } const activeTurns = [...this.activeTurnIds]; this.activeTurnIds.clear(); + this.clearAllTurnMaxWatchdogs(); this.turnInProgress = false; this.forcedTurnResetAt = Date.now(); this.log(`Codex turn watchdog forced completion after ${idleFor}ms of silence (turns=${activeTurns.join(",") || "unknown"})`); @@ -1250,6 +1265,58 @@ class CodexAdapter extends EventEmitter { clearTimeout(this.turnWatchdogTimer); this.turnWatchdogTimer = null; } + scheduleTurnMaxWatchdog(turnId) { + this.clearTurnMaxWatchdog(turnId); + if (this.turnMaxMs <= 0) + return; + const startedAt = Date.now(); + this.turnStartedAt.set(turnId, startedAt); + const timer = setTimeout(() => this.handleTurnMaxWatchdog(turnId, startedAt), this.turnMaxMs); + timer.unref?.(); + this.turnMaxTimers.set(turnId, timer); + } + handleTurnMaxWatchdog(turnId, startedAt) { + if (!this.activeTurnIds.has(turnId)) + return; + this.clearTurnMaxWatchdog(turnId); + const firedAt = Date.now(); + const elapsedMs = firedAt - startedAt; + this.activeTurnIds.delete(turnId); + this.turnInProgress = this.activeTurnIds.size > 0; + this.forcedTurnResetAt = firedAt; + this.lastTurnWatchdog = { turnId, firedAt, elapsedMs }; + this.log(`Codex turn wall-clock watchdog fired after ${elapsedMs}ms (turn=${turnId})`); + this.emitRuntimeEvent({ + kind: "turn_watchdog", + status: "blocked", + title: "Codex turn exceeded wall-clock budget", + detail: `Turn ${turnId} exceeded ${this.turnMaxMs}ms; cleared that turn from the busy set without killing Codex.`, + turnId, + durationMs: elapsedMs + }); + this.emit("turnWatchdog", this.lastTurnWatchdog); + if (this.turnInProgress) { + this.scheduleTurnWatchdog(); + } else { + this.clearTurnWatchdog(); + this.emit("turnCompletedForced", { idleForMs: elapsedMs, turnIds: [turnId] }); + this.emit("turnCompleted"); + } + } + clearTurnMaxWatchdog(turnId) { + const timer = this.turnMaxTimers.get(turnId); + if (timer) + clearTimeout(timer); + this.turnMaxTimers.delete(turnId); + this.turnStartedAt.delete(turnId); + } + clearAllTurnMaxWatchdogs() { + for (const timer of this.turnMaxTimers.values()) { + clearTimeout(timer); + } + this.turnMaxTimers.clear(); + this.turnStartedAt.clear(); + } requestKey(id) { if (typeof id === "number" || typeof id === "string") return String(id); @@ -2850,6 +2917,9 @@ function formatRuntimeEvent(event) { event.path ? `path: ${event.path}` : undefined, event.method ? `method: ${event.method}` : undefined, typeof event.exitCode === "number" ? `exit_code: ${event.exitCode}` : undefined, + typeof event.durationMs === "number" ? `duration_ms: ${event.durationMs}` : undefined, + event.requestId ? `request_id: ${event.requestId}` : undefined, + event.backupTarget ? `backup_target: ${event.backupTarget}` : undefined, event.detail ].filter(Boolean); return details.join(` @@ -3087,22 +3157,48 @@ function buildBackupEnv(target, sourceEnv = process.env) { } return allowed; } -async function runBackupAgent(request) { +async function runBackupAgent(request, options = {}) { const started = Date.now(); - const { command, args } = buildBackupCommand(request); + const defaultCommand = buildBackupCommand(request); + const command = options.command ?? defaultCommand.command; + const args = options.args ?? defaultCommand.args; + const killGraceMs = Math.max(0, options.killGraceMs ?? envInt("CONTEXTRELAY_BACKUP_KILL_GRACE_MS", "AGENTBRIDGE_BACKUP_KILL_GRACE_MS", 2000)); return new Promise((resolve2) => { const child = spawn3(command, args, { cwd: request.cwd, stdio: ["ignore", "pipe", "pipe"], - env: buildBackupEnv(request.target) + env: options.env ?? buildBackupEnv(request.target) }); let stdout = ""; let stderr = ""; let timedOut = false; + let closed = false; + let killed = false; + let killTimer = null; + const clearTimers = () => { + clearTimeout(timer); + if (killTimer) + clearTimeout(killTimer); + killTimer = null; + }; const timer = setTimeout(() => { timedOut = true; child.kill("SIGTERM"); + options.onTimeout?.({ + target: request.target, + requestId: request.requestId, + timeoutMs: request.timeoutMs, + durationMs: Date.now() - started + }); + killTimer = setTimeout(() => { + if (closed) + return; + killed = true; + child.kill("SIGKILL"); + }, killGraceMs); + killTimer.unref?.(); }, request.timeoutMs); + timer.unref?.(); child.stdout?.on("data", (chunk) => { stdout += chunk.toString(); }); @@ -3110,7 +3206,7 @@ async function runBackupAgent(request) { stderr += chunk.toString(); }); child.on("error", (err) => { - clearTimeout(timer); + clearTimers(); resolve2({ target: request.target, ok: false, @@ -3119,22 +3215,28 @@ async function runBackupAgent(request) { exitCode: null, durationMs: Date.now() - started, cost: null, - tokens: null + tokens: null, + timedOut, + killed }); }); - child.on("close", (code) => { - clearTimeout(timer); + child.on("close", (code, signal) => { + closed = true; + clearTimers(); const extracted = extractStreamJson(stdout); + const timeoutLine = timedOut ? `Timed out after ${request.timeoutMs}ms${killed ? "; escalated to SIGKILL" : ""}${signal ? `; signal=${signal}` : ""}` : ""; resolve2({ target: request.target, ok: code === 0 && !timedOut, output: extracted.output || stdout.trim(), stderr: timedOut ? `${stderr.trim()} -Timed out after ${request.timeoutMs}ms`.trim() : stderr.trim(), +${timeoutLine}`.trim() : stderr.trim(), exitCode: code, durationMs: Date.now() - started, cost: extracted.cost, - tokens: extracted.tokens + tokens: extracted.tokens, + timedOut, + killed }); }); }); @@ -3549,6 +3651,7 @@ function viewerHtml() { card("Codex", status.codexState || (status.codexTurnInProgress ? "busy" : "idle"), status.codexState === "idle"), card("Claude", status.claudeState || (status.claudeConnected ? "connected" : "offline"), status.claudeState === "idle"), card("Codex TUI", status.tuiConnected ? "connected" : "offline", !!status.tuiConnected), + card("Turn Watchdog", status.lastTurnWatchdog ? new Date(status.lastTurnWatchdog.firedAt).toLocaleTimeString() : "none", !status.lastTurnWatchdog), card("Session", status.sessionId || "none"), card("Ledger entries", status.ledgerEntries ?? 0), card("Autonomy", status.autonomyEnabled ? "on" : "off", !!status.autonomyEnabled), @@ -3705,6 +3808,49 @@ async function runLivenessProbe(options) { return acked ? "alive" : "stale"; } +// src/shutdown.ts +async function runShutdownSteps(steps, deadlineMs) { + const results = []; + const boundedDeadline = Math.max(1, deadlineMs); + for (const step of steps) { + const started = Date.now(); + let timer = null; + try { + await Promise.race([ + Promise.resolve().then(step.run), + new Promise((_, reject) => { + timer = setTimeout(() => reject(new ShutdownTimeoutError(step.name)), boundedDeadline); + timer.unref?.(); + }) + ]); + results.push({ + name: step.name, + ok: true, + timedOut: false, + durationMs: Date.now() - started + }); + } catch (err) { + results.push({ + name: step.name, + ok: false, + timedOut: err instanceof ShutdownTimeoutError, + durationMs: Date.now() - started, + error: err?.message ?? String(err) + }); + } finally { + if (timer) + clearTimeout(timer); + } + } + return results; +} + +class ShutdownTimeoutError extends Error { + constructor(stepName) { + super(`shutdown step timed out: ${stepName}`); + } +} + // src/daemon.ts var configService = new ConfigService; var config = configService.loadOrDefault(); @@ -3721,6 +3867,7 @@ var TUI_DISCONNECT_GRACE_MS = parseInt(process.env.TUI_DISCONNECT_GRACE_MS ?? "2 var CLAUDE_DISCONNECT_GRACE_MS = 5000; var CLAUDE_PROBE_TIMEOUT_MS = envInt("CONTEXTRELAY_CLAUDE_PROBE_TIMEOUT_MS", "AGENTBRIDGE_CLAUDE_PROBE_TIMEOUT_MS", 3000); var CLAUDE_RESPONSE_TIMEOUT_MS = envInt("CONTEXTRELAY_CLAUDE_RESPONSE_TIMEOUT_MS", "AGENTBRIDGE_CLAUDE_RESPONSE_TIMEOUT_MS", 300000); +var DAEMON_SHUTDOWN_STEP_TIMEOUT_MS = envInt("CONTEXTRELAY_DAEMON_SHUTDOWN_STEP_TIMEOUT_MS", "AGENTBRIDGE_DAEMON_SHUTDOWN_STEP_TIMEOUT_MS", 1500); var MAX_BUFFERED_MESSAGES = envInt("CONTEXTRELAY_MAX_BUFFERED_MESSAGES", "AGENTBRIDGE_MAX_BUFFERED_MESSAGES", 100); var MAX_CONTROL_MESSAGE_BYTES = envInt("CONTEXTRELAY_MAX_CONTROL_MESSAGE_BYTES", "AGENTBRIDGE_MAX_CONTROL_MESSAGE_BYTES", 1e6); var MAX_CONTROL_MESSAGES_PER_MINUTE = envInt("CONTEXTRELAY_MAX_CONTROL_MESSAGES_PER_MINUTE", "AGENTBRIDGE_MAX_CONTROL_MESSAGES_PER_MINUTE", 120); @@ -3900,6 +4047,10 @@ codex.on("turnCompletedForced", (event) => { log(`Codex turn watchdog forced idle reset after ${event.idleForMs}ms`); emitToClaude(systemMessage("system_turn_completed_forced", `\u26A0\uFE0F Codex stopped sending turn events for ${event.idleForMs}ms, so ${DISPLAY_NAME} cleared the busy state as a fallback. If Codex is still working, wait for a fresh message before sending more work.`)); }); +codex.on("turnWatchdog", (event) => { + log(`Codex turn wall-clock watchdog fired for ${event.turnId} after ${event.elapsedMs}ms`); + emitToClaude(systemMessage("system_turn_watchdog", `\u26A0\uFE0F Codex turn ${event.turnId} exceeded the wall-clock budget after ${event.elapsedMs}ms. ${DISPLAY_NAME} cleared that turn from the busy state without killing Codex; you can retry or inspect status before sending more work.`)); +}); codex.on("ready", (threadId) => { tuiConnectionState.markBridgeReady(); log(`Codex ready \u2014 thread ${threadId}`); @@ -4641,7 +4792,20 @@ ${request.context_refs.map((ref) => `- ${ref}`).join(` target: request.target, prompt, cwd: process.cwd(), - timeoutMs: runtimeConfig.autonomy.backupTimeoutMs + timeoutMs: runtimeConfig.autonomy.backupTimeoutMs, + requestId: requestEntry.id + }, { + onTimeout: async (event) => { + await ledger.appendRuntimeEvent({ + kind: "backup_timeout", + status: "blocked", + title: "Backup agent timed out", + detail: `${event.target} backup exceeded ${event.timeoutMs}ms; waiting for process termination.`, + requestId: event.requestId, + durationMs: event.durationMs, + backupTarget: event.target + }, "system", request.source, sid); + } }); lastBackupResult = { target: request.target, @@ -4673,6 +4837,8 @@ ${result.stderr || "(empty)"}`; durationMs: result.durationMs, cost: result.cost, tokens: result.tokens, + timedOut: result.timedOut, + killed: result.killed, stderr: result.stderr } }); @@ -4872,6 +5038,7 @@ function currentStatus() { return { bridgeReady: tuiConnectionState.canReply(), codexTurnInProgress: codex.turnInProgress, + lastTurnWatchdog: codex.lastTurnWatchdogSummary, codexState: currentCodexState(snapshot.tuiConnected), claudeState: currentClaudeState(claudeSnapshot.connected, taskBoard), deliveryMode: claudeDeliveryMode, @@ -4920,6 +5087,7 @@ function currentViewerContext(url) { connectionHealth: { bridgeReady: status.bridgeReady, codexTurnInProgress: status.codexTurnInProgress, + lastTurnWatchdog: status.lastTurnWatchdog, claudeConnected: status.claudeConnected, tuiConnected: status.tuiConnected, queuedMessageCount: status.queuedMessageCount @@ -5004,22 +5172,63 @@ async function bootCodex() { broadcastStatus(); } } -function shutdown(reason) { +async function shutdown(reason) { if (shuttingDown) return; shuttingDown = true; log(`Shutting down daemon (${reason})...`); - tuiConnectionState.dispose(`daemon shutdown (${reason})`); - claudeAttachmentState.dispose(`daemon shutdown (${reason})`); - controlServer?.stop(); - controlServer = null; - codex.stop(); + const stepResults = await runShutdownSteps([ + { + name: "tui_connection_state", + run: () => tuiConnectionState.dispose(`daemon shutdown (${reason})`) + }, + { + name: "claude_attachment_state", + run: () => claudeAttachmentState.dispose(`daemon shutdown (${reason})`) + }, + { + name: "control_server", + run: () => { + controlServer?.stop(); + controlServer = null; + } + }, + { + name: "codex_adapter", + run: () => codex.stop() + } + ], DAEMON_SHUTDOWN_STEP_TIMEOUT_MS); + for (const result of stepResults) { + if (!result.ok) { + log(`Shutdown step ${result.name} failed${result.timedOut ? " by timeout" : ""}: ${result.error ?? "unknown"}`); + } + } + try { + await ledger.appendRuntimeEvent({ + kind: "daemon_shutdown", + status: stepResults.every((result) => result.ok) ? "completed" : "blocked", + title: "Daemon shutdown", + detail: `reason: ${reason} +steps: +${stepResults.map((result) => `- ${result.name}: ${result.ok ? "ok" : "failed"}${result.timedOut ? " timeout" : ""} ${result.durationMs}ms`).join(` +`)}`, + durationMs: stepResults.reduce((sum, result) => sum + result.durationMs, 0) + }, "system", undefined, sessionId); + } catch (err) { + log(`Failed to record daemon shutdown event: ${err.message}`); + } removePidFile(); removeStatusFile(); process.exit(0); } -process.on("SIGINT", () => shutdown("SIGINT")); -process.on("SIGTERM", () => shutdown("SIGTERM")); +process.on("SIGINT", () => void shutdown("SIGINT").catch((err) => { + log(`Shutdown failed: ${err.stack ?? err.message}`); + process.exit(1); +})); +process.on("SIGTERM", () => void shutdown("SIGTERM").catch((err) => { + log(`Shutdown failed: ${err.stack ?? err.message}`); + process.exit(1); +})); process.on("exit", () => { removePidFile(); removeStatusFile(); diff --git a/src/backup/runner.ts b/src/backup/runner.ts index d062891..cd82b1b 100644 --- a/src/backup/runner.ts +++ b/src/backup/runner.ts @@ -1,11 +1,13 @@ import { spawn } from "node:child_process"; import type { MessageSource } from "../types"; +import { envInt } from "../env"; export interface BackupRunRequest { target: MessageSource; prompt: string; cwd: string; timeoutMs: number; + requestId?: string; } export interface BackupRunResult { @@ -17,6 +19,23 @@ export interface BackupRunResult { durationMs: number; cost: number | null; tokens: number | null; + timedOut: boolean; + killed: boolean; +} + +export interface BackupTimeoutEvent { + target: MessageSource; + requestId?: string; + timeoutMs: number; + durationMs: number; +} + +export interface BackupRunOptions { + command?: string; + args?: string[]; + env?: NodeJS.ProcessEnv; + killGraceMs?: number; + onTimeout?: (event: BackupTimeoutEvent) => void | Promise; } export function buildBackupCommand(request: BackupRunRequest): { command: string; args: string[] } { @@ -69,31 +88,57 @@ export function buildBackupEnv(target: MessageSource, sourceEnv: NodeJS.ProcessE return allowed; } -export async function runBackupAgent(request: BackupRunRequest): Promise { +export async function runBackupAgent(request: BackupRunRequest, options: BackupRunOptions = {}): Promise { const started = Date.now(); - const { command, args } = buildBackupCommand(request); + const defaultCommand = buildBackupCommand(request); + const command = options.command ?? defaultCommand.command; + const args = options.args ?? defaultCommand.args; + const killGraceMs = Math.max(0, options.killGraceMs ?? envInt("CONTEXTRELAY_BACKUP_KILL_GRACE_MS", "AGENTBRIDGE_BACKUP_KILL_GRACE_MS", 2_000)); return new Promise((resolve) => { const child = spawn(command, args, { cwd: request.cwd, stdio: ["ignore", "pipe", "pipe"], - env: buildBackupEnv(request.target), + env: options.env ?? buildBackupEnv(request.target), }); let stdout = ""; let stderr = ""; let timedOut = false; + let closed = false; + let killed = false; + let killTimer: ReturnType | null = null; + + const clearTimers = () => { + clearTimeout(timer); + if (killTimer) clearTimeout(killTimer); + killTimer = null; + }; const timer = setTimeout(() => { timedOut = true; child.kill("SIGTERM"); + void options.onTimeout?.({ + target: request.target, + requestId: request.requestId, + timeoutMs: request.timeoutMs, + durationMs: Date.now() - started, + }); + + killTimer = setTimeout(() => { + if (closed) return; + killed = true; + child.kill("SIGKILL"); + }, killGraceMs); + killTimer.unref?.(); }, request.timeoutMs); + timer.unref?.(); child.stdout?.on("data", (chunk) => { stdout += chunk.toString(); }); child.stderr?.on("data", (chunk) => { stderr += chunk.toString(); }); child.on("error", (err) => { - clearTimeout(timer); + clearTimers(); resolve({ target: request.target, ok: false, @@ -103,21 +148,29 @@ export async function runBackupAgent(request: BackupRunRequest): Promise { - clearTimeout(timer); + child.on("close", (code, signal) => { + closed = true; + clearTimers(); const extracted = extractStreamJson(stdout); + const timeoutLine = timedOut + ? `Timed out after ${request.timeoutMs}ms${killed ? "; escalated to SIGKILL" : ""}${signal ? `; signal=${signal}` : ""}` + : ""; resolve({ target: request.target, ok: code === 0 && !timedOut, output: extracted.output || stdout.trim(), - stderr: timedOut ? `${stderr.trim()}\nTimed out after ${request.timeoutMs}ms`.trim() : stderr.trim(), + stderr: timedOut ? `${stderr.trim()}\n${timeoutLine}`.trim() : stderr.trim(), exitCode: code, durationMs: Date.now() - started, cost: extracted.cost, tokens: extracted.tokens, + timedOut, + killed, }); }); }); diff --git a/src/codex-adapter.ts b/src/codex-adapter.ts index 7e810a3..f288b09 100644 --- a/src/codex-adapter.ts +++ b/src/codex-adapter.ts @@ -94,8 +94,11 @@ export class CodexAdapter extends EventEmitter { private activeTurnIds = new Set(); turnInProgress = false; private turnWatchdogTimer: ReturnType | null = null; + private turnMaxTimers = new Map>(); + private turnStartedAt = new Map(); private lastCodexActivityAt = Date.now(); private forcedTurnResetAt = 0; + private lastTurnWatchdog: { turnId: string; firedAt: number; elapsedMs: number } | null = null; // Proxy-layer id rewriting: upstream uses globally unique ids private nextProxyId = 100000; @@ -120,6 +123,7 @@ export class CodexAdapter extends EventEmitter { private readonly localAuthToken = "", private readonly stateDir = new StateDirResolver(), private readonly turnIdleTimeoutMs = envInt("CONTEXTRELAY_CODEX_TURN_IDLE_TIMEOUT_MS", "AGENTBRIDGE_CODEX_TURN_IDLE_TIMEOUT_MS", 300_000), + private readonly turnMaxMs = envInt("CONTEXTRELAY_TURN_MAX_MS", "AGENTBRIDGE_TURN_MAX_MS", 300_000), ) { super(); this.appPort = appPort; @@ -131,6 +135,7 @@ export class CodexAdapter extends EventEmitter { get proxyUrl() { return `ws://127.0.0.1:${this.proxyPort}`; } get activeThreadId() { return this.threadId; } get lastForcedTurnResetAt() { return this.forcedTurnResetAt; } + get lastTurnWatchdogSummary() { return this.lastTurnWatchdog; } static buildPortListenLsofArgs(port: number): string[] { return ["-ti", `tcp:${port}`, "-sTCP:LISTEN"]; @@ -190,6 +195,7 @@ export class CodexAdapter extends EventEmitter { this.proxyServer = null; this.clearResponseTrackingState(); this.clearTurnWatchdog(); + this.clearAllTurnMaxWatchdogs(); } /** Fully stop: disconnect bridge AND kill the Codex process. */ @@ -327,6 +333,7 @@ export class CodexAdapter extends EventEmitter { this.activeTurnIds.clear(); this.turnInProgress = false; this.clearTurnWatchdog(); + this.clearAllTurnMaxWatchdogs(); try { await this.connectToAppServer(false); @@ -1096,11 +1103,15 @@ export class CodexAdapter extends EventEmitter { private markTurnStarted(turnId?: string) { const wasInProgress = this.turnInProgress; this.noteCodexActivity(); + const normalizedTurnId = typeof turnId === "string" && turnId.length > 0 + ? turnId + : `unknown:${Date.now()}`; if (typeof turnId === "string" && turnId.length > 0) { - this.activeTurnIds.add(turnId); + this.activeTurnIds.add(normalizedTurnId); } else { - this.activeTurnIds.add(`unknown:${Date.now()}`); + this.activeTurnIds.add(normalizedTurnId); } + this.scheduleTurnMaxWatchdog(normalizedTurnId); this.turnInProgress = this.activeTurnIds.size > 0; if (!wasInProgress && this.turnInProgress) { @@ -1112,8 +1123,10 @@ export class CodexAdapter extends EventEmitter { private markTurnCompleted(turnId?: string) { if (typeof turnId === "string" && turnId.length > 0) { this.activeTurnIds.delete(turnId); + this.clearTurnMaxWatchdog(turnId); } else { this.activeTurnIds.clear(); + this.clearAllTurnMaxWatchdogs(); } this.turnInProgress = this.activeTurnIds.size > 0; @@ -1150,6 +1163,7 @@ export class CodexAdapter extends EventEmitter { const activeTurns = [...this.activeTurnIds]; this.activeTurnIds.clear(); + this.clearAllTurnMaxWatchdogs(); this.turnInProgress = false; this.forcedTurnResetAt = Date.now(); this.log(`Codex turn watchdog forced completion after ${idleFor}ms of silence (turns=${activeTurns.join(",") || "unknown"})`); @@ -1170,6 +1184,63 @@ export class CodexAdapter extends EventEmitter { this.turnWatchdogTimer = null; } + private scheduleTurnMaxWatchdog(turnId: string) { + this.clearTurnMaxWatchdog(turnId); + if (this.turnMaxMs <= 0) return; + + const startedAt = Date.now(); + this.turnStartedAt.set(turnId, startedAt); + const timer = setTimeout(() => this.handleTurnMaxWatchdog(turnId, startedAt), this.turnMaxMs); + timer.unref?.(); + this.turnMaxTimers.set(turnId, timer); + } + + private handleTurnMaxWatchdog(turnId: string, startedAt: number) { + if (!this.activeTurnIds.has(turnId)) return; + + this.clearTurnMaxWatchdog(turnId); + const firedAt = Date.now(); + const elapsedMs = firedAt - startedAt; + this.activeTurnIds.delete(turnId); + this.turnInProgress = this.activeTurnIds.size > 0; + this.forcedTurnResetAt = firedAt; + this.lastTurnWatchdog = { turnId, firedAt, elapsedMs }; + + this.log(`Codex turn wall-clock watchdog fired after ${elapsedMs}ms (turn=${turnId})`); + this.emitRuntimeEvent({ + kind: "turn_watchdog", + status: "blocked", + title: "Codex turn exceeded wall-clock budget", + detail: `Turn ${turnId} exceeded ${this.turnMaxMs}ms; cleared that turn from the busy set without killing Codex.`, + turnId, + durationMs: elapsedMs, + }); + this.emit("turnWatchdog", this.lastTurnWatchdog); + + if (this.turnInProgress) { + this.scheduleTurnWatchdog(); + } else { + this.clearTurnWatchdog(); + this.emit("turnCompletedForced", { idleForMs: elapsedMs, turnIds: [turnId] }); + this.emit("turnCompleted"); + } + } + + private clearTurnMaxWatchdog(turnId: string) { + const timer = this.turnMaxTimers.get(turnId); + if (timer) clearTimeout(timer); + this.turnMaxTimers.delete(turnId); + this.turnStartedAt.delete(turnId); + } + + private clearAllTurnMaxWatchdogs() { + for (const timer of this.turnMaxTimers.values()) { + clearTimeout(timer); + } + this.turnMaxTimers.clear(); + this.turnStartedAt.clear(); + } + private requestKey(id: unknown): string | null { if (typeof id === "number" || typeof id === "string") return String(id); return null; diff --git a/src/control-protocol.ts b/src/control-protocol.ts index f9f64ff..5bf50dc 100644 --- a/src/control-protocol.ts +++ b/src/control-protocol.ts @@ -19,6 +19,7 @@ export interface BackupStatus { export interface DaemonStatus { bridgeReady: boolean; codexTurnInProgress?: boolean; + lastTurnWatchdog?: { turnId: string; firedAt: number; elapsedMs: number } | null; codexState?: "idle" | "busy" | "stale" | "offline"; claudeState?: "idle" | "expected" | "stale" | "offline"; deliveryMode: "push" | "pull"; diff --git a/src/daemon.ts b/src/daemon.ts index 6729bcc..c5c7099 100644 --- a/src/daemon.ts +++ b/src/daemon.ts @@ -44,6 +44,7 @@ import { buildViewerModel } from "./viewer-model"; import { buildAgentDescriptors } from "./agent-descriptors"; import { routeMessageToAgent } from "./agent-routing"; import { runLivenessProbe } from "./liveness-probe"; +import { runShutdownSteps } from "./shutdown"; interface ControlSocketData { clientId: number; @@ -69,6 +70,7 @@ const TUI_DISCONNECT_GRACE_MS = parseInt(process.env.TUI_DISCONNECT_GRACE_MS ?? const CLAUDE_DISCONNECT_GRACE_MS = 5_000; const CLAUDE_PROBE_TIMEOUT_MS = envInt("CONTEXTRELAY_CLAUDE_PROBE_TIMEOUT_MS", "AGENTBRIDGE_CLAUDE_PROBE_TIMEOUT_MS", 3_000); const CLAUDE_RESPONSE_TIMEOUT_MS = envInt("CONTEXTRELAY_CLAUDE_RESPONSE_TIMEOUT_MS", "AGENTBRIDGE_CLAUDE_RESPONSE_TIMEOUT_MS", 300_000); +const DAEMON_SHUTDOWN_STEP_TIMEOUT_MS = envInt("CONTEXTRELAY_DAEMON_SHUTDOWN_STEP_TIMEOUT_MS", "AGENTBRIDGE_DAEMON_SHUTDOWN_STEP_TIMEOUT_MS", 1_500); const MAX_BUFFERED_MESSAGES = envInt("CONTEXTRELAY_MAX_BUFFERED_MESSAGES", "AGENTBRIDGE_MAX_BUFFERED_MESSAGES", 100); const MAX_CONTROL_MESSAGE_BYTES = envInt("CONTEXTRELAY_MAX_CONTROL_MESSAGE_BYTES", "AGENTBRIDGE_MAX_CONTROL_MESSAGE_BYTES", 1_000_000); const MAX_CONTROL_MESSAGES_PER_MINUTE = envInt("CONTEXTRELAY_MAX_CONTROL_MESSAGES_PER_MINUTE", "AGENTBRIDGE_MAX_CONTROL_MESSAGES_PER_MINUTE", 120); @@ -312,6 +314,16 @@ codex.on("turnCompletedForced", (event: { idleForMs: number; turnIds: string[] } ); }); +codex.on("turnWatchdog", (event: { turnId: string; firedAt: number; elapsedMs: number }) => { + log(`Codex turn wall-clock watchdog fired for ${event.turnId} after ${event.elapsedMs}ms`); + emitToClaude( + systemMessage( + "system_turn_watchdog", + `⚠️ Codex turn ${event.turnId} exceeded the wall-clock budget after ${event.elapsedMs}ms. ${DISPLAY_NAME} cleared that turn from the busy state without killing Codex; you can retry or inspect status before sending more work.`, + ), + ); +}); + codex.on("ready", (threadId: string) => { tuiConnectionState.markBridgeReady(); log(`Codex ready — thread ${threadId}`); @@ -1127,6 +1139,19 @@ async function requestBackup(request: { prompt, cwd: process.cwd(), timeoutMs: runtimeConfig.autonomy.backupTimeoutMs, + requestId: requestEntry.id, + }, { + onTimeout: async (event) => { + await ledger.appendRuntimeEvent({ + kind: "backup_timeout", + status: "blocked", + title: "Backup agent timed out", + detail: `${event.target} backup exceeded ${event.timeoutMs}ms; waiting for process termination.`, + requestId: event.requestId, + durationMs: event.durationMs, + backupTarget: event.target, + }, "system", request.source, sid); + }, }); lastBackupResult = { @@ -1160,6 +1185,8 @@ async function requestBackup(request: { durationMs: result.durationMs, cost: result.cost, tokens: result.tokens, + timedOut: result.timedOut, + killed: result.killed, stderr: result.stderr, }, }); @@ -1384,6 +1411,7 @@ function currentStatus(): DaemonStatus { return { bridgeReady: tuiConnectionState.canReply(), codexTurnInProgress: codex.turnInProgress, + lastTurnWatchdog: codex.lastTurnWatchdogSummary, codexState: currentCodexState(snapshot.tuiConnected), claudeState: currentClaudeState(claudeSnapshot.connected, taskBoard), deliveryMode: claudeDeliveryMode, @@ -1435,6 +1463,7 @@ function currentViewerContext(url: URL): Record { connectionHealth: { bridgeReady: status.bridgeReady, codexTurnInProgress: status.codexTurnInProgress, + lastTurnWatchdog: status.lastTurnWatchdog, claudeConnected: status.claudeConnected, tuiConnected: status.tuiConnected, queuedMessageCount: status.queuedMessageCount, @@ -1532,22 +1561,64 @@ async function bootCodex() { } } -function shutdown(reason: string) { +async function shutdown(reason: string) { if (shuttingDown) return; shuttingDown = true; log(`Shutting down daemon (${reason})...`); - tuiConnectionState.dispose(`daemon shutdown (${reason})`); - claudeAttachmentState.dispose(`daemon shutdown (${reason})`); - controlServer?.stop(); - controlServer = null; - codex.stop(); + + const stepResults = await runShutdownSteps([ + { + name: "tui_connection_state", + run: () => tuiConnectionState.dispose(`daemon shutdown (${reason})`), + }, + { + name: "claude_attachment_state", + run: () => claudeAttachmentState.dispose(`daemon shutdown (${reason})`), + }, + { + name: "control_server", + run: () => { + controlServer?.stop(); + controlServer = null; + }, + }, + { + name: "codex_adapter", + run: () => codex.stop(), + }, + ], DAEMON_SHUTDOWN_STEP_TIMEOUT_MS); + + for (const result of stepResults) { + if (!result.ok) { + log(`Shutdown step ${result.name} failed${result.timedOut ? " by timeout" : ""}: ${result.error ?? "unknown"}`); + } + } + + try { + await ledger.appendRuntimeEvent({ + kind: "daemon_shutdown", + status: stepResults.every((result) => result.ok) ? "completed" : "blocked", + title: "Daemon shutdown", + detail: `reason: ${reason}\nsteps:\n${stepResults.map((result) => `- ${result.name}: ${result.ok ? "ok" : "failed"}${result.timedOut ? " timeout" : ""} ${result.durationMs}ms`).join("\n")}`, + durationMs: stepResults.reduce((sum, result) => sum + result.durationMs, 0), + }, "system", undefined, sessionId); + } catch (err: any) { + log(`Failed to record daemon shutdown event: ${err.message}`); + } + removePidFile(); removeStatusFile(); process.exit(0); } -process.on("SIGINT", () => shutdown("SIGINT")); -process.on("SIGTERM", () => shutdown("SIGTERM")); +process.on("SIGINT", () => void shutdown("SIGINT").catch((err) => { + log(`Shutdown failed: ${err.stack ?? err.message}`); + process.exit(1); +})); +process.on("SIGTERM", () => void shutdown("SIGTERM").catch((err) => { + log(`Shutdown failed: ${err.stack ?? err.message}`); + process.exit(1); +})); process.on("exit", () => { removePidFile(); removeStatusFile(); }); process.on("uncaughtException", (err) => { log(`UNCAUGHT EXCEPTION: ${err.stack ?? err.message}`); diff --git a/src/session/ledger.ts b/src/session/ledger.ts index d923e06..a9462c7 100644 --- a/src/session/ledger.ts +++ b/src/session/ledger.ts @@ -354,6 +354,9 @@ function formatRuntimeEvent(event: RuntimeEvent): string { event.path ? `path: ${event.path}` : undefined, event.method ? `method: ${event.method}` : undefined, typeof event.exitCode === "number" ? `exit_code: ${event.exitCode}` : undefined, + typeof event.durationMs === "number" ? `duration_ms: ${event.durationMs}` : undefined, + event.requestId ? `request_id: ${event.requestId}` : undefined, + event.backupTarget ? `backup_target: ${event.backupTarget}` : undefined, event.detail, ].filter(Boolean); return details.join("\n"); diff --git a/src/shutdown.ts b/src/shutdown.ts new file mode 100644 index 0000000..51b03bd --- /dev/null +++ b/src/shutdown.ts @@ -0,0 +1,58 @@ +export interface ShutdownStep { + name: string; + run: () => void | Promise; +} + +export interface ShutdownStepResult { + name: string; + ok: boolean; + timedOut: boolean; + durationMs: number; + error?: string; +} + +export async function runShutdownSteps( + steps: ShutdownStep[], + deadlineMs: number, +): Promise { + const results: ShutdownStepResult[] = []; + const boundedDeadline = Math.max(1, deadlineMs); + + for (const step of steps) { + const started = Date.now(); + let timer: ReturnType | null = null; + try { + await Promise.race([ + Promise.resolve().then(step.run), + new Promise((_, reject) => { + timer = setTimeout(() => reject(new ShutdownTimeoutError(step.name)), boundedDeadline); + timer.unref?.(); + }), + ]); + results.push({ + name: step.name, + ok: true, + timedOut: false, + durationMs: Date.now() - started, + }); + } catch (err: any) { + results.push({ + name: step.name, + ok: false, + timedOut: err instanceof ShutdownTimeoutError, + durationMs: Date.now() - started, + error: err?.message ?? String(err), + }); + } finally { + if (timer) clearTimeout(timer); + } + } + + return results; +} + +class ShutdownTimeoutError extends Error { + constructor(stepName: string) { + super(`shutdown step timed out: ${stepName}`); + } +} diff --git a/src/types.ts b/src/types.ts index 44483e3..307c40d 100644 --- a/src/types.ts +++ b/src/types.ts @@ -9,7 +9,10 @@ export type RuntimeEventKind = | "file_change" | "check" | "approval_request" - | "turn_lifecycle"; + | "turn_lifecycle" + | "backup_timeout" + | "daemon_shutdown" + | "turn_watchdog"; export type RuntimeEventStatus = | "started" @@ -29,6 +32,9 @@ export interface RuntimeEvent { method?: string; threadId?: string | null; turnId?: string | null; + requestId?: string; + durationMs?: number; + backupTarget?: MessageSource; } export type LedgerArtifactKind = diff --git a/src/unit-test/backup-runner.test.ts b/src/unit-test/backup-runner.test.ts new file mode 100644 index 0000000..0429321 --- /dev/null +++ b/src/unit-test/backup-runner.test.ts @@ -0,0 +1,33 @@ +import { describe, expect, test } from "bun:test"; +import { runBackupAgent } from "../backup/runner"; + +describe("backup runner timeout handling", () => { + test("escalates timed-out children to SIGKILL and resolves", async () => { + const timeoutEvents: unknown[] = []; + const result = await runBackupAgent({ + target: "codex", + prompt: "ignored", + cwd: process.cwd(), + timeoutMs: 20, + requestId: "backup-1", + }, { + command: process.execPath, + args: ["-e", "process.on('SIGTERM', () => {}); setInterval(() => {}, 1000);"], + killGraceMs: 20, + onTimeout: (event) => { + timeoutEvents.push(event); + }, + }); + + expect(result.ok).toBe(false); + expect(result.timedOut).toBe(true); + expect(result.killed).toBe(true); + expect(result.stderr).toContain("Timed out after 20ms"); + expect(timeoutEvents).toHaveLength(1); + expect(timeoutEvents[0]).toMatchObject({ + target: "codex", + requestId: "backup-1", + timeoutMs: 20, + }); + }); +}); diff --git a/src/unit-test/codex-adapter.test.ts b/src/unit-test/codex-adapter.test.ts index 0e0f4b7..b306f58 100644 --- a/src/unit-test/codex-adapter.test.ts +++ b/src/unit-test/codex-adapter.test.ts @@ -305,6 +305,26 @@ describe("CodexAdapter turn state machine", () => { adapter.clearResponseTrackingState(); }); + test("per-turn wall-clock watchdog clears a turn even when idle timeout has not elapsed", async () => { + const adapter = new CodexAdapter(4510, 4511, undefined, "", undefined, 10_000, 5) as any; + const watchdogEvents: any[] = []; + const runtimeEvents: any[] = []; + adapter.on("turnWatchdog", (event: any) => watchdogEvents.push(event)); + adapter.on("runtimeEvent", (event: any) => runtimeEvents.push(event)); + + adapter.handleServerNotification({ method: "turn/started", params: { turn: { id: "max-turn" } } }); + await new Promise((resolve) => setTimeout(resolve, 25)); + + expect(adapter.turnInProgress).toBe(false); + expect(adapter.activeTurnIds.size).toBe(0); + expect(watchdogEvents).toHaveLength(1); + expect(watchdogEvents[0].turnId).toBe("max-turn"); + expect(adapter.lastTurnWatchdogSummary.turnId).toBe("max-turn"); + expect(runtimeEvents.some((event) => event.kind === "turn_watchdog" && event.turnId === "max-turn")).toBe(true); + + adapter.clearResponseTrackingState(); + }); + test("thread/start tracked request lifecycle emits ready from response thread id", () => { const adapter = createAdapter(); const appSent: string[] = []; diff --git a/src/unit-test/shutdown.test.ts b/src/unit-test/shutdown.test.ts new file mode 100644 index 0000000..24efaec --- /dev/null +++ b/src/unit-test/shutdown.test.ts @@ -0,0 +1,30 @@ +import { describe, expect, test } from "bun:test"; +import { runShutdownSteps } from "../shutdown"; + +describe("runShutdownSteps", () => { + test("runs steps in order and records durations", async () => { + const calls: string[] = []; + const results = await runShutdownSteps([ + { name: "first", run: () => { calls.push("first"); } }, + { name: "second", run: async () => { calls.push("second"); } }, + ], 50); + + expect(calls).toEqual(["first", "second"]); + expect(results.map((result) => result.ok)).toEqual([true, true]); + expect(results.every((result) => result.durationMs >= 0)).toBe(true); + }); + + test("bounds hung async steps by deadline and continues", async () => { + const calls: string[] = []; + const started = Date.now(); + const results = await runShutdownSteps([ + { name: "hung", run: () => new Promise(() => {}) }, + { name: "after", run: () => { calls.push("after"); } }, + ], 10); + + expect(Date.now() - started).toBeLessThan(250); + expect(results[0]).toMatchObject({ name: "hung", ok: false, timedOut: true }); + expect(results[1]).toMatchObject({ name: "after", ok: true, timedOut: false }); + expect(calls).toEqual(["after"]); + }); +}); diff --git a/src/viewer-model.ts b/src/viewer-model.ts index 11e1b3b..c8e2c01 100644 --- a/src/viewer-model.ts +++ b/src/viewer-model.ts @@ -15,6 +15,7 @@ export interface ViewerModelInput { connectionHealth: { bridgeReady: boolean; codexTurnInProgress?: boolean; + lastTurnWatchdog?: { turnId: string; firedAt: number; elapsedMs: number } | null; claudeConnected: boolean; tuiConnected: boolean; queuedMessageCount: number; diff --git a/src/viewer.ts b/src/viewer.ts index bdcb7a3..286f399 100644 --- a/src/viewer.ts +++ b/src/viewer.ts @@ -120,6 +120,7 @@ export function viewerHtml(): string { card("Codex", status.codexState || (status.codexTurnInProgress ? "busy" : "idle"), status.codexState === "idle"), card("Claude", status.claudeState || (status.claudeConnected ? "connected" : "offline"), status.claudeState === "idle"), card("Codex TUI", status.tuiConnected ? "connected" : "offline", !!status.tuiConnected), + card("Turn Watchdog", status.lastTurnWatchdog ? new Date(status.lastTurnWatchdog.firedAt).toLocaleTimeString() : "none", !status.lastTurnWatchdog), card("Session", status.sessionId || "none"), card("Ledger entries", status.ledgerEntries ?? 0), card("Autonomy", status.autonomyEnabled ? "on" : "off", !!status.autonomyEnabled),