diff --git a/.changeset/keep-tmux-session-alive-after-agent-exit.md b/.changeset/keep-tmux-session-alive-after-agent-exit.md new file mode 100644 index 000000000..44b60042b --- /dev/null +++ b/.changeset/keep-tmux-session-alive-after-agent-exit.md @@ -0,0 +1,8 @@ +--- +"@aoagents/ao-plugin-runtime-tmux": patch +"@aoagents/ao-web": patch +--- + +Tmux sessions no longer die when the agent process inside them exits. When you Ctrl-C the agent in a web terminal, the pane now drops to an interactive `$SHELL` in the workspace dir instead of nuking the tmux session and leaving the dashboard in a phantom "runtime lost" state. The lifecycle manager still detects the agent exit (via `agent.isProcessRunning`) and transitions the session to `agent_process_exited`, but the runtime stays usable so you can run shell commands or manually re-launch the agent. + +Also: the mux-websocket re-attach loop now checks `tmux has-session` before retrying after a PTY exit. When the tmux session is genuinely gone (e.g. `ao stop`), it skips the three doomed `attach-session` spawns from #1640 and notifies the dashboard immediately. (#1756) diff --git a/packages/plugins/runtime-tmux/src/__tests__/index.test.ts b/packages/plugins/runtime-tmux/src/__tests__/index.test.ts index cd5f38e07..9932ef9fc 100644 --- a/packages/plugins/runtime-tmux/src/__tests__/index.test.ts +++ b/packages/plugins/runtime-tmux/src/__tests__/index.test.ts @@ -98,10 +98,19 @@ describe("runtime.create()", () => { expect(handle.runtimeName).toBe("tmux"); expect(handle.data.workspacePath).toBe("/tmp/workspace"); - // First call: new-session + // First call: new-session — launch command has the keep-alive shell tail + // appended so the tmux session survives agent exit (issue #1756). expect(mockExecFileCustom).toHaveBeenCalledWith( "tmux", - ["new-session", "-d", "-s", "test-session", "-c", "/tmp/workspace", "echo hello"], + [ + "new-session", + "-d", + "-s", + "test-session", + "-c", + "/tmp/workspace", + 'echo hello\nexec "${SHELL:-/bin/bash}" -i', + ], expectedTmuxOptions, ); }); @@ -148,7 +157,7 @@ describe("runtime.create()", () => { expect(args).toContain("-e"); expect(args).toContain("AO_SESSION=env-session"); expect(args).toContain("FOO=bar"); - expect(args.at(-1)).toBe("bash"); + expect(args.at(-1)).toBe('bash\nexec "${SHELL:-/bin/bash}" -i'); }); it("starts the launch command as the initial tmux pane command", async () => { @@ -164,15 +173,42 @@ describe("runtime.create()", () => { environment: {}, }); - // First call: new-session passes the launch command as the pane's initial command + // First call: new-session passes the launch command as the pane's initial + // command, with the keep-alive shell tail appended. expect(mockExecFileCustom).toHaveBeenCalledWith( "tmux", - ["new-session", "-d", "-s", "launch-test", "-c", "/tmp/ws", "claude --session abc"], + [ + "new-session", + "-d", + "-s", + "launch-test", + "-c", + "/tmp/ws", + 'claude --session abc\nexec "${SHELL:-/bin/bash}" -i', + ], expectedTmuxOptions, ); }); - it("uses a temp launch script for long launch commands", async () => { + it("appends an interactive shell tail so the tmux pane survives agent exit (regression for #1756)", async () => { + const runtime = create(); + + mockTmuxSuccess(); + mockTmuxSuccess(); + + await runtime.create({ + sessionId: "keep-alive", + workspacePath: "/tmp/ws", + launchCommand: "claude --session abc", + environment: {}, + }); + + const finalArg = (mockExecFileCustom.mock.calls[0][1] as string[]).at(-1)!; + expect(finalArg).toContain("claude --session abc"); + expect(finalArg).toMatch(/exec "\$\{SHELL:-\/bin\/bash\}" -i\s*$/); + }); + + it("keeps the keep-alive tail in the temp script for long launch commands", async () => { const runtime = create(); const longCommand = "x".repeat(250); @@ -193,6 +229,12 @@ describe("runtime.create()", () => { { encoding: "utf-8", mode: 0o700 }, ); + // The script body includes the interactive shell tail too — without it + // long-command sessions would still nuke tmux on agent exit (#1756). + const writeCall = (fs.writeFileSync as unknown as { mock: { calls: unknown[][] } }).mock + .calls[0]; + expect(writeCall[1]).toMatch(/exec "\$\{SHELL:-\/bin\/bash\}" -i/); + expect(mockExecFileCustom).toHaveBeenNthCalledWith( 1, "tmux", @@ -317,7 +359,7 @@ describe("runtime.create()", () => { "no-env", "-c", "/tmp/ws", - "echo hi", + 'echo hi\nexec "${SHELL:-/bin/bash}" -i', ]); }); }); diff --git a/packages/plugins/runtime-tmux/src/index.ts b/packages/plugins/runtime-tmux/src/index.ts index 2cb981b92..53919d91e 100644 --- a/packages/plugins/runtime-tmux/src/index.ts +++ b/packages/plugins/runtime-tmux/src/index.ts @@ -34,9 +34,27 @@ function assertValidSessionId(id: string): void { } } +/** + * Shell snippet appended after the agent launch command so the tmux pane + * (and therefore the tmux session) survives agent exit. Without this, the + * pane closes when the agent process exits, the only window goes away, and + * the whole tmux session dies — leaving the dashboard with a phantom + * "runtime lost" state and the user with no way to do anything in that + * workspace (issue #1756). + * + * `exec` replaces the wrapping sh/bash with the user's interactive shell, + * so the lifecycle manager still detects agent termination via + * `agent.isProcessRunning` and transitions the session correctly. + */ +const KEEP_ALIVE_SHELL = `exec "\${SHELL:-/bin/bash}" -i`; + +function withKeepAliveShell(command: string): string { + return `${command.replace(/\n+$/, "")}\n${KEEP_ALIVE_SHELL}`; +} + function writeLaunchScript(command: string): string { const scriptPath = join(tmpdir(), `ao-launch-${randomUUID()}.sh`); - const content = `#!/usr/bin/env bash\nrm -- "$0" 2>/dev/null || true\n${command}\n`; + const content = `#!/usr/bin/env bash\nrm -- "$0" 2>/dev/null || true\n${withKeepAliveShell(command)}\n`; writeFileSync(scriptPath, content, { encoding: "utf-8", mode: 0o700 }); return `bash ${shellEscape(scriptPath)}`; } @@ -78,9 +96,12 @@ export function create(): Runtime { // Start the launch command as the pane's initial command instead of // typing into a live shell. A dashboard attach can trigger terminal // device responses; if those race with tmux send-keys, they become - // literal shell input and corrupt the launch path. + // literal shell input and corrupt the launch path. The keep-alive + // tail is appended in both code paths — see KEEP_ALIVE_SHELL. const shellCommand = - launchCommand.length > 200 ? writeLaunchScript(launchCommand) : launchCommand; + launchCommand.length > 200 + ? writeLaunchScript(launchCommand) + : withKeepAliveShell(launchCommand); await tmux( "new-session", diff --git a/packages/web/server/__tests__/mux-websocket.test.ts b/packages/web/server/__tests__/mux-websocket.test.ts index 6efb2dc98..48f38e96f 100644 --- a/packages/web/server/__tests__/mux-websocket.test.ts +++ b/packages/web/server/__tests__/mux-websocket.test.ts @@ -4,9 +4,10 @@ import type { SessionBroadcaster as SessionBroadcasterType } from "../mux-websoc // vi.mock factories run before module-level statements. Hoist the mock // fns so the factories close over the same instances the tests use. -const { mockSpawn, mockPtySpawn } = vi.hoisted(() => ({ +const { mockSpawn, mockPtySpawn, mockTmuxHasSession } = vi.hoisted(() => ({ mockSpawn: vi.fn(), mockPtySpawn: vi.fn(), + mockTmuxHasSession: vi.fn(), })); vi.mock("node:child_process", async (importOriginal) => { @@ -33,6 +34,7 @@ vi.mock("../tmux-utils.js", () => ({ findTmux: () => "/usr/bin/tmux", validateSessionId: () => true, resolveTmuxSession: () => "ao-177", + tmuxHasSession: (...args: unknown[]) => mockTmuxHasSession(...args), })); const { SessionBroadcaster, TerminalManager } = await import("../mux-websocket"); @@ -311,3 +313,62 @@ describe("TerminalManager.open — tmux target args (regression for #1714)", () expect(args).toEqual(["attach-session", "-t", "=ao-177"]); }); }); + +describe("TerminalManager.open — re-attach skipped when tmux session is gone (regression for #1756)", () => { + // Captures the latest onExit callback registered by ptySpawn so tests can + // synthesise a PTY exit without spawning a real process. The handler is + // async (so it can await the promisified has-session probe), so tests + // await its return value before asserting. + let capturedOnExit: ((evt: { exitCode: number }) => Promise | void) | undefined; + + beforeEach(() => { + mockSpawn.mockReset(); + mockPtySpawn.mockReset(); + mockTmuxHasSession.mockReset(); + capturedOnExit = undefined; + + mockSpawn.mockImplementation(() => new EventEmitter()); + mockPtySpawn.mockImplementation(() => ({ + onData: vi.fn(), + onExit: vi.fn((cb: (evt: { exitCode: number }) => Promise | void) => { + capturedOnExit = cb; + }), + write: vi.fn(), + resize: vi.fn(), + kill: vi.fn(), + })); + }); + + it("skips re-attach and notifies subscribers when has-session reports the tmux session is gone", async () => { + const mgr = new TerminalManager("/usr/bin/tmux"); + const exitCb = vi.fn(); + mgr.subscribe("ao-177", undefined, vi.fn(), exitCb); + + expect(mockPtySpawn).toHaveBeenCalledTimes(1); + expect(capturedOnExit).toBeDefined(); + + mockTmuxHasSession.mockResolvedValueOnce(false); + await capturedOnExit!({ exitCode: 0 }); + + // No second attach-session was spawned — the re-attach loop was skipped. + expect(mockPtySpawn).toHaveBeenCalledTimes(1); + // Subscribers were notified with the original exit code. + expect(exitCb).toHaveBeenCalledTimes(1); + expect(exitCb).toHaveBeenCalledWith(0); + }); + + it("still re-attaches when has-session reports the tmux session is alive", async () => { + const mgr = new TerminalManager("/usr/bin/tmux"); + const exitCb = vi.fn(); + mgr.subscribe("ao-177", undefined, vi.fn(), exitCb); + + expect(mockPtySpawn).toHaveBeenCalledTimes(1); + + mockTmuxHasSession.mockResolvedValueOnce(true); + await capturedOnExit!({ exitCode: 1 }); + + // Re-attach happened: ptySpawn called a second time, exit not yet notified. + expect(mockPtySpawn).toHaveBeenCalledTimes(2); + expect(exitCb).not.toHaveBeenCalled(); + }); +}); diff --git a/packages/web/server/__tests__/tmux-utils.test.ts b/packages/web/server/__tests__/tmux-utils.test.ts index 0cbe56301..7e38ff021 100644 --- a/packages/web/server/__tests__/tmux-utils.test.ts +++ b/packages/web/server/__tests__/tmux-utils.test.ts @@ -6,7 +6,14 @@ */ import { describe, it, expect, vi, afterEach } from "vitest"; -import { findTmux, resolveTmuxSession, resolvePipePath, validateSessionId, SESSION_ID_PATTERN } from "../tmux-utils.js"; +import { + findTmux, + resolveTmuxSession, + resolvePipePath, + tmuxHasSession, + validateSessionId, + SESSION_ID_PATTERN, +} from "../tmux-utils.js"; // Default fs adapter for resolveTmuxSession tests — empty AO base directory // so the on-disk storageKey lookup always misses and we exercise the @@ -357,6 +364,47 @@ describe("findTmux", () => { }); }); +// ============================================================================= +// tmuxHasSession +// ============================================================================= + +describe("tmuxHasSession", () => { + const TMUX = "/opt/homebrew/bin/tmux"; + + it("returns true when has-session resolves", async () => { + const mockExec = vi.fn().mockResolvedValue({ stdout: "", stderr: "" }); + + await expect(tmuxHasSession(TMUX, "ao-104", mockExec)).resolves.toBe(true); + }); + + it("returns false when has-session rejects (session missing)", async () => { + const mockExec = vi + .fn() + .mockRejectedValue(new Error("can't find session: ao-104")); + + await expect(tmuxHasSession(TMUX, "ao-104", mockExec)).resolves.toBe(false); + }); + + it("uses the = exact-match prefix to avoid tmux prefix matching", async () => { + const mockExec = vi.fn().mockResolvedValue({ stdout: "", stderr: "" }); + + await tmuxHasSession(TMUX, "ao-1", mockExec); + + expect(mockExec).toHaveBeenCalledWith( + TMUX, + ["has-session", "-t", "=ao-1"], + { timeout: 5000 }, + ); + }); + + it("returns false when tmuxPath is null without invoking exec", async () => { + const mockExec = vi.fn(); + + await expect(tmuxHasSession(null, "ao-104", mockExec)).resolves.toBe(false); + expect(mockExec).not.toHaveBeenCalled(); + }); +}); + // ============================================================================= // resolveTmuxSession // ============================================================================= diff --git a/packages/web/server/mux-websocket.ts b/packages/web/server/mux-websocket.ts index 5a8c24170..5515574df 100644 --- a/packages/web/server/mux-websocket.ts +++ b/packages/web/server/mux-websocket.ts @@ -9,7 +9,13 @@ import { WebSocketServer, WebSocket } from "ws"; import { spawn } from "node:child_process"; import { type Socket, connect as netConnect } from "node:net"; -import { findTmux, resolveTmuxSession, resolvePipePath, validateSessionId } from "./tmux-utils.js"; +import { + findTmux, + resolveTmuxSession, + resolvePipePath, + tmuxHasSession, + validateSessionId, +} from "./tmux-utils.js"; import { getEnvDefaults, isWindows } from "@aoagents/ao-core"; // These types mirror src/lib/mux-protocol.ts exactly. @@ -368,10 +374,39 @@ export class TerminalManager { }); // Handle PTY exit - pty.onExit(({ exitCode }) => { + // + // Async: the has-session probe shells out via promisified execFile and + // must be awaited. node-pty fires onExit on the main thread; a sync + // probe would freeze the entire web server (every WebSocket, HTTP + // request, in-flight terminal) for up to the subprocess timeout when + // tmux is slow to respond. + pty.onExit(async ({ exitCode }) => { console.log(`[MuxServer] PTY exited for ${id} with code ${exitCode}`); terminal.pty = null; + // Skip the re-attach loop entirely when the underlying tmux session is + // gone (e.g. user pressed Ctrl-C in the pane and the launch command + // exited, taking the only window with it). Without this guard we + // burn three doomed attach-session spawns and emit a noisy + // "Max re-attach attempts reached" log line for what is actually a + // clean user-initiated termination — see issue #1756. The + // MAX_REATTACH_ATTEMPTS bound from #1640 still covers tmux server + // hiccups where the session does still exist. + if ( + terminal.subscribers.size > 0 && + !(await tmuxHasSession(this.TMUX, tmuxSessionId)) + ) { + console.log(`[MuxServer] tmux session ${tmuxSessionId} is gone, not re-attaching`); + if (terminal.resetTimer) { + clearTimeout(terminal.resetTimer); + terminal.resetTimer = undefined; + } + for (const cb of terminal.exitCallbacks) { + cb(exitCode); + } + return; + } + // Re-attach if subscribers are still present, up to MAX_REATTACH_ATTEMPTS. // The cap prevents an unbounded respawn loop when the PTY crashes immediately // after every attach (e.g. resource exhaustion or a broken tmux session). diff --git a/packages/web/server/tmux-utils.ts b/packages/web/server/tmux-utils.ts index 7aa5a5a97..e872157b7 100644 --- a/packages/web/server/tmux-utils.ts +++ b/packages/web/server/tmux-utils.ts @@ -5,12 +5,15 @@ * so the logic can be properly unit tested. */ -import { execFileSync } from "node:child_process"; +import { execFile, execFileSync } from "node:child_process"; import { readdirSync, existsSync, readFileSync } from "node:fs"; import { homedir } from "node:os"; import { join } from "node:path"; +import { promisify } from "node:util"; import { isWindows } from "@aoagents/ao-core"; +const execFileAsync = promisify(execFile); + /** Session ID validation regex — alphanumeric, hyphens, underscores only */ export const SESSION_ID_PATTERN = /^[a-zA-Z0-9_-]+$/; @@ -128,6 +131,45 @@ export function findTmux( return "tmux"; // Fall back to bare name } +/** Async exec signature used by `tmuxHasSession` (and injectable for tests). */ +export type ExecFileAsync = ( + file: string, + args: readonly string[], + options: { timeout: number }, +) => Promise; + +/** + * Check whether a tmux session with the given name exists. + * + * Uses `=` exact-match prefix so the lookup never falls back to tmux's + * default prefix matching (where "ao-1" would match "ao-15"). The caller + * must already have the canonical tmux session name (typically the value + * returned by `resolveTmuxSession`). + * + * Async: this runs from inside node-pty's `onExit` callback on every agent + * exit, and the WebSocket server is single-threaded. A synchronous + * `execFileSync` here would block the event loop — and every WebSocket + * connection, HTTP request, and in-flight terminal — for up to the 5 s + * subprocess timeout when tmux is slow to respond. Use the promisified + * `execFile` form instead. + * + * @returns true if the session exists, false otherwise (including tmux + * not running, no sessions, or any unexpected error) + */ +export async function tmuxHasSession( + tmuxPath: string | null, + tmuxSessionName: string, + execFn: ExecFileAsync = execFileAsync as unknown as ExecFileAsync, +): Promise { + if (!tmuxPath) return false; + try { + await execFn(tmuxPath, ["has-session", "-t", `=${tmuxSessionName}`], { timeout: 5000 }); + return true; + } catch { + return false; + } +} + /** * Resolve a user-facing session ID to its actual tmux session name. *