diff --git a/packages/cli/__tests__/commands/start.test.ts b/packages/cli/__tests__/commands/start.test.ts index 06041c2f3..62c29842e 100644 --- a/packages/cli/__tests__/commands/start.test.ts +++ b/packages/cli/__tests__/commands/start.test.ts @@ -1633,6 +1633,186 @@ describe("start command — orchestrator session strategy display", () => { expect(mockClearLastStop).toHaveBeenCalled(); }); + // Regression for issue #1743: when last-stop.json is missing or empty + // (e.g. because `ao stop` crashed before writing it, or `ao update` + // wiped state), `ao start` must still offer to restore recently + // `manually_killed` sessions by scanning the session manager. + it("falls back to recently manually-killed sessions when last-stop.json is missing (issue #1743)", async () => { + // Force getGlobalConfigPath() to a non-existent path so the fallback's + // global-config load is a no-op and the test does not read the host's + // real ~/.agent-orchestrator/config.yaml. + const origGlobalEnv = process.env["AO_GLOBAL_CONFIG"]; + process.env["AO_GLOBAL_CONFIG"] = join(tmpDir, "no-such-global.yaml"); + + try { + mockReadLastStop.mockResolvedValue(null); + + mockConfigRef.current = makeConfig({ "my-app": makeProject() }); + const { findWebDir } = await import("../../src/lib/web-dir.js"); + vi.mocked(findWebDir).mockReturnValue(tmpDir); + writeFileSync(join(tmpDir, "package.json"), "{}"); + + const fakeDashboard = { on: vi.fn(), kill: vi.fn(), emit: vi.fn() }; + mockSpawn.mockReturnValue(fakeDashboard); + + const recentTerminatedAt = new Date(Date.now() - 60_000).toISOString(); + mockSessionManager.list.mockResolvedValue([ + { + id: "app-1", + projectId: "my-app", + status: "killed", + activity: "exited", + metadata: {}, + lastActivityAt: new Date(), + lifecycle: { + version: 2, + session: { + kind: "worker", + state: "terminated", + reason: "manually_killed", + startedAt: null, + completedAt: null, + terminatedAt: recentTerminatedAt, + lastTransitionAt: recentTerminatedAt, + }, + pr: { state: "none", reason: "not_created", number: null, url: null, lastObservedAt: null }, + runtime: { state: "missing", reason: "manual_kill_requested", lastObservedAt: null, handle: null, tmuxName: null }, + }, + }, + ]); + mockSessionManager.restore.mockResolvedValue(undefined); + + await program.parseAsync(["node", "test", "start", "--no-orchestrator"]); + + expect(mockSessionManager.restore).toHaveBeenCalledWith("app-1"); + } finally { + if (origGlobalEnv === undefined) delete process.env["AO_GLOBAL_CONFIG"]; + else process.env["AO_GLOBAL_CONFIG"] = origGlobalEnv; + } + }); + + it("does not surface fallback candidates older than the recent window (issue #1743)", async () => { + const origGlobalEnv = process.env["AO_GLOBAL_CONFIG"]; + process.env["AO_GLOBAL_CONFIG"] = join(tmpDir, "no-such-global.yaml"); + + try { + mockReadLastStop.mockResolvedValue(null); + + mockConfigRef.current = makeConfig({ "my-app": makeProject() }); + const { findWebDir } = await import("../../src/lib/web-dir.js"); + vi.mocked(findWebDir).mockReturnValue(tmpDir); + writeFileSync(join(tmpDir, "package.json"), "{}"); + + const fakeDashboard = { on: vi.fn(), kill: vi.fn(), emit: vi.fn() }; + mockSpawn.mockReturnValue(fakeDashboard); + + // Terminated 30 minutes ago — beyond the 10-minute fallback window. + const oldTerminatedAt = new Date(Date.now() - 30 * 60 * 1000).toISOString(); + mockSessionManager.list.mockResolvedValue([ + { + id: "app-1", + projectId: "my-app", + status: "killed", + activity: "exited", + metadata: {}, + lastActivityAt: new Date(), + lifecycle: { + version: 2, + session: { + kind: "worker", + state: "terminated", + reason: "manually_killed", + startedAt: null, + completedAt: null, + terminatedAt: oldTerminatedAt, + lastTransitionAt: oldTerminatedAt, + }, + pr: { state: "none", reason: "not_created", number: null, url: null, lastObservedAt: null }, + runtime: { state: "missing", reason: "manual_kill_requested", lastObservedAt: null, handle: null, tmuxName: null }, + }, + }, + ]); + + await program.parseAsync(["node", "test", "start", "--no-orchestrator"]); + + expect(mockSessionManager.restore).not.toHaveBeenCalled(); + expect(mockPromptConfirm).not.toHaveBeenCalled(); + } finally { + if (origGlobalEnv === undefined) delete process.env["AO_GLOBAL_CONFIG"]; + else process.env["AO_GLOBAL_CONFIG"] = origGlobalEnv; + } + }); + + // Regression for Greptile P1 on PR #1780. Before the fix, the fallback's + // session manager was built from the project-scoped config, so `sm.list()` + // only saw the current project's sessions and `otherProjects` in the + // synthesized LastStopState was always empty — defeating the cross-project + // restore that `readLastStop()` already supports. + it("fallback uses the global config so cross-project sessions appear in otherProjects (PR #1780)", async () => { + const origGlobalEnv = process.env["AO_GLOBAL_CONFIG"]; + const globalPath = join(tmpDir, "global-config.yaml"); + // Real, parseable global config so loadConfig(globalPath) succeeds and + // existsSync(globalPath) returns true. Contents don't have to match the + // mocked sessions — getSessionManager is mocked to ignore config. + writeFileSync( + globalPath, + "version: 1\nport: 3000\nprojects:\n my-app:\n name: My App\n path: /tmp/my-app\n sessionPrefix: app\n other-app:\n name: Other App\n path: /tmp/other-app\n sessionPrefix: other\n", + ); + process.env["AO_GLOBAL_CONFIG"] = globalPath; + + try { + mockReadLastStop.mockResolvedValue(null); + mockConfigRef.current = makeConfig({ "my-app": makeProject() }); + const { findWebDir } = await import("../../src/lib/web-dir.js"); + vi.mocked(findWebDir).mockReturnValue(tmpDir); + writeFileSync(join(tmpDir, "package.json"), "{}"); + + const fakeDashboard = { on: vi.fn(), kill: vi.fn(), emit: vi.fn() }; + mockSpawn.mockReturnValue(fakeDashboard); + mockPromptConfirm.mockResolvedValue(true); + + const recent = new Date(Date.now() - 60_000).toISOString(); + const terminated = (id: string, projectId: string) => ({ + id, + projectId, + status: "killed", + activity: "exited", + metadata: {}, + lastActivityAt: new Date(), + lifecycle: { + version: 2, + session: { + kind: "worker", + state: "terminated", + reason: "manually_killed", + startedAt: null, + completedAt: null, + terminatedAt: recent, + lastTransitionAt: recent, + }, + pr: { state: "none", reason: "not_created", number: null, url: null, lastObservedAt: null }, + runtime: { state: "missing", reason: "manual_kill_requested", lastObservedAt: null, handle: null, tmuxName: null }, + }, + }); + + mockSessionManager.list.mockResolvedValue([ + terminated("app-1", "my-app"), + terminated("other-1", "other-app"), + ]); + mockSessionManager.restore.mockResolvedValue(undefined); + + await program.parseAsync(["node", "test", "start", "--no-orchestrator"]); + + // Both the in-project session AND the cross-project session must be + // routed to restore. Pre-fix, only "app-1" would have been seen. + expect(mockSessionManager.restore).toHaveBeenCalledWith("app-1"); + expect(mockSessionManager.restore).toHaveBeenCalledWith("other-1"); + } finally { + if (origGlobalEnv === undefined) delete process.env["AO_GLOBAL_CONFIG"]; + else process.env["AO_GLOBAL_CONFIG"] = origGlobalEnv; + } + }); + it("opens the bare dashboard URL when --no-orchestrator skips the orchestrator block", async () => { mockConfigRef.current = makeConfig({ "my-app": makeProject() }); @@ -2204,6 +2384,7 @@ describe("start command — platform-aware runtime fallback", () => { else process.env["AO_GLOBAL_CONFIG"] = origGlobalEnv; } }); + }); // --------------------------------------------------------------------------- diff --git a/packages/cli/__tests__/commands/update.test.ts b/packages/cli/__tests__/commands/update.test.ts index a0797e226..84ed95f49 100644 --- a/packages/cli/__tests__/commands/update.test.ts +++ b/packages/cli/__tests__/commands/update.test.ts @@ -213,6 +213,48 @@ describe("update command", () => { await program.parseAsync(["node", "test", "update"]); expect(mockInvalidateCache).toHaveBeenCalledTimes(1); }); + + // Regression for issue #1743: `ao update` must NOT touch + // ~/.agent-orchestrator/last-stop.json. Without this guarantee, + // the `ao stop` → `ao update` → `ao start` flow loses the restore + // record. We exercise the full update code path with a real + // last-stop.json on disk in a temp HOME and assert it survives. + it("does not touch ~/.agent-orchestrator/last-stop.json (issue #1743)", async () => { + const { mkdirSync, mkdtempSync, writeFileSync, readFileSync, existsSync, rmSync } = + await import("node:fs"); + const { tmpdir } = await import("node:os"); + const { join } = await import("node:path"); + + const fakeHome = mkdtempSync(join(tmpdir(), "ao-update-home-")); + const stateDir = join(fakeHome, ".agent-orchestrator"); + mkdirSync(stateDir, { recursive: true }); + const lastStopPath = join(stateDir, "last-stop.json"); + const before = JSON.stringify({ + stoppedAt: "2026-05-08T17:53:15.909Z", + projectId: "my-app", + sessionIds: ["app-1", "app-2"], + }); + writeFileSync(lastStopPath, before, "utf-8"); + + const origHome = process.env["HOME"]; + const origUserprofile = process.env["USERPROFILE"]; + process.env["HOME"] = fakeHome; + process.env["USERPROFILE"] = fakeHome; + + try { + await program.parseAsync(["node", "test", "update"]); + + expect(existsSync(lastStopPath)).toBe(true); + expect(readFileSync(lastStopPath, "utf-8")).toBe(before); + expect(mockRunRepoScript).toHaveBeenCalledWith("ao-update.sh", []); + } finally { + if (origHome === undefined) delete process.env["HOME"]; + else process.env["HOME"] = origHome; + if (origUserprofile === undefined) delete process.env["USERPROFILE"]; + else process.env["USERPROFILE"] = origUserprofile; + rmSync(fakeHome, { recursive: true, force: true }); + } + }); }); // ----------------------------------------------------------------------- diff --git a/packages/cli/__tests__/lib/last-stop-fallback.test.ts b/packages/cli/__tests__/lib/last-stop-fallback.test.ts new file mode 100644 index 000000000..8784c7885 --- /dev/null +++ b/packages/cli/__tests__/lib/last-stop-fallback.test.ts @@ -0,0 +1,176 @@ +/** + * Tests for the `last-stop.json` missing/malformed fallback in + * `ao start`. See issue #1743 — without this fallback, a single + * regression in the last-stop write pipeline silently drops the + * restore prompt. + */ + +import { describe, it, expect } from "vitest"; +import type { CanonicalSessionLifecycle, Session } from "@aoagents/ao-core"; +import { + buildLastStopFallback, + FALLBACK_RECENT_WINDOW_MS, +} from "../../src/lib/last-stop-fallback.js"; + +const NOW = Date.parse("2026-05-08T18:00:00.000Z"); + +function makeLifecycle( + state: CanonicalSessionLifecycle["session"]["state"], + reason: CanonicalSessionLifecycle["session"]["reason"], + terminatedAtIso: string | null, +): CanonicalSessionLifecycle { + return { + version: 2, + session: { + kind: "worker", + state, + reason, + startedAt: null, + completedAt: null, + terminatedAt: terminatedAtIso, + lastTransitionAt: terminatedAtIso ?? "2026-05-08T17:00:00.000Z", + }, + pr: { state: "none", reason: "not_created", number: null, url: null, lastObservedAt: null }, + runtime: { state: "missing", reason: "manual_kill_requested", lastObservedAt: null, handle: null, tmuxName: null }, + }; +} + +function makeSession( + id: string, + projectId: string, + lifecycle: CanonicalSessionLifecycle, +): Session { + return { + id, + projectId, + status: lifecycle.session.state === "terminated" ? "killed" : "working", + activity: lifecycle.session.state === "terminated" ? "exited" : null, + activitySignal: { state: "unavailable", activity: null, source: "none" }, + lifecycle, + branch: null, + issueId: null, + pr: null, + workspacePath: null, + runtimeHandle: null, + agentInfo: null, + createdAt: new Date(NOW - 60_000), + lastActivityAt: new Date(NOW - 30_000), + metadata: {}, + }; +} + +describe("last-stop-fallback", () => { + it("returns null when there are no manually-killed sessions in the window", () => { + const sessions: Session[] = [ + // Wrong reason — auto cleanup, not the user's `ao stop` + makeSession( + "s1", + "my-app", + makeLifecycle("terminated", "auto_cleanup", new Date(NOW - 60_000).toISOString()), + ), + // Killed too long ago (> 10 min) + makeSession( + "s2", + "my-app", + makeLifecycle( + "terminated", + "manually_killed", + new Date(NOW - FALLBACK_RECENT_WINDOW_MS - 60_000).toISOString(), + ), + ), + // Still alive + makeSession( + "s3", + "my-app", + makeLifecycle("working", "task_in_progress", null), + ), + ]; + + const result = buildLastStopFallback(sessions, "my-app", { now: NOW }); + expect(result).toBeNull(); + }); + + it("surfaces recently manually-killed sessions in the primary project", () => { + const t1 = new Date(NOW - 30_000).toISOString(); + const t2 = new Date(NOW - 60_000).toISOString(); + const sessions: Session[] = [ + makeSession("s1", "my-app", makeLifecycle("terminated", "manually_killed", t1)), + makeSession("s2", "my-app", makeLifecycle("terminated", "manually_killed", t2)), + // Different project — goes to otherProjects. + makeSession( + "s3", + "other-app", + makeLifecycle("terminated", "manually_killed", new Date(NOW - 90_000).toISOString()), + ), + ]; + + const result = buildLastStopFallback(sessions, "my-app", { now: NOW }); + expect(result).not.toBeNull(); + expect(result?.projectId).toBe("my-app"); + expect(result?.sessionIds).toEqual(expect.arrayContaining(["s1", "s2"])); + expect(result?.sessionIds).toHaveLength(2); + // Most recent terminatedAt becomes stoppedAt so the prompt's "stopped at" + // string is meaningful. + expect(result?.stoppedAt).toBe(t1); + expect(result?.otherProjects).toEqual([ + { projectId: "other-app", sessionIds: ["s3"] }, + ]); + }); + + it("surfaces other-project sessions even when the primary project has none", () => { + const sessions: Session[] = [ + makeSession( + "s1", + "other-app", + makeLifecycle( + "terminated", + "manually_killed", + new Date(NOW - 30_000).toISOString(), + ), + ), + ]; + + const result = buildLastStopFallback(sessions, "my-app", { now: NOW }); + expect(result).not.toBeNull(); + expect(result?.projectId).toBe("my-app"); + expect(result?.sessionIds).toEqual([]); + expect(result?.otherProjects).toEqual([ + { projectId: "other-app", sessionIds: ["s1"] }, + ]); + }); + + it("ignores sessions with missing or unparseable terminatedAt", () => { + const sessions: Session[] = [ + makeSession("s1", "my-app", makeLifecycle("terminated", "manually_killed", null)), + makeSession( + "s2", + "my-app", + makeLifecycle("terminated", "manually_killed", "not-an-iso-date"), + ), + ]; + + const result = buildLastStopFallback(sessions, "my-app", { now: NOW }); + expect(result).toBeNull(); + }); + + it("respects a custom window", () => { + const sessions: Session[] = [ + makeSession( + "s1", + "my-app", + makeLifecycle( + "terminated", + "manually_killed", + new Date(NOW - 5 * 60 * 1000).toISOString(), + ), + ), + ]; + + // Default window includes it. + expect(buildLastStopFallback(sessions, "my-app", { now: NOW })).not.toBeNull(); + // 1-minute window excludes it. + expect( + buildLastStopFallback(sessions, "my-app", { now: NOW, windowMs: 60_000 }), + ).toBeNull(); + }); +}); diff --git a/packages/cli/__tests__/lib/running-state.test.ts b/packages/cli/__tests__/lib/running-state.test.ts index f3b2f6319..5db093fcd 100644 --- a/packages/cli/__tests__/lib/running-state.test.ts +++ b/packages/cli/__tests__/lib/running-state.test.ts @@ -1,18 +1,49 @@ import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; -import { existsSync, readFileSync, rmSync } from "node:fs"; +import type * as fsModule from "node:fs"; +import { existsSync, readdirSync, readFileSync, rmSync } from "node:fs"; import { join } from "node:path"; const testHome = join(process.cwd(), ".tmp-running-state-home"); +const { mockFsyncSync, mockWriteFileSync } = vi.hoisted(() => ({ + mockFsyncSync: vi.fn(), + mockWriteFileSync: vi.fn(), +})); + vi.mock("node:os", () => ({ homedir: () => testHome, })); +// Wrap node:fs so we can observe fsyncSync calls without breaking ESM +// namespace immutability (vi.spyOn on a node:fs export errors out). +vi.mock("node:fs", async (importOriginal) => { + const actual = await importOriginal(); + return { + ...actual, + fsyncSync: (...args: Parameters) => { + mockFsyncSync(...args); + return actual.fsyncSync(...args); + }, + writeFileSync: (...args: Parameters) => { + const override = mockWriteFileSync(...args); + if (override === "throw") { + throw new Error("synthetic disk-full"); + } + return actual.writeFileSync(...args); + }, + }; +}); + describe("running-state", () => { beforeEach(() => { rmSync(testHome, { recursive: true, force: true }); vi.restoreAllMocks(); vi.resetModules(); + // The hoisted node:fs wrappers persist across tests; reset their + // implementations so a previous test's mockImplementation can't + // leak into the next. + mockFsyncSync.mockReset(); + mockWriteFileSync.mockReset(); }); afterEach(() => { @@ -50,6 +81,70 @@ describe("running-state", () => { expect(killSpy).toHaveBeenCalledWith(424242, 0); }); + it("writeLastStop fsyncs the temp file before renaming so the record survives SIGKILL (issue #1743)", async () => { + mockFsyncSync.mockClear(); + const runningState = await import("../../src/lib/running-state.js"); + + await runningState.writeLastStop({ + stoppedAt: "2026-05-08T17:53:15.909Z", + projectId: "my-app", + sessionIds: ["app-1", "app-2"], + }); + + const lastStopFile = join(testHome, ".agent-orchestrator", "last-stop.json"); + expect(existsSync(lastStopFile)).toBe(true); + expect(JSON.parse(readFileSync(lastStopFile, "utf-8"))).toEqual({ + stoppedAt: "2026-05-08T17:53:15.909Z", + projectId: "my-app", + sessionIds: ["app-1", "app-2"], + }); + expect(mockFsyncSync).toHaveBeenCalled(); + }); + + it("writeLastStop leaves no temp file behind when the write itself throws", async () => { + // Match on the payload so the synthetic failure only fires on the + // data write, not the lockfile metadata write that fires first. + const marker = "marker-1743-disk-full"; + mockWriteFileSync.mockImplementation((_fd: number, content: string) => + typeof content === "string" && content.includes(marker) ? "throw" : undefined, + ); + const runningState = await import("../../src/lib/running-state.js"); + + await expect( + runningState.writeLastStop({ + stoppedAt: "2026-05-08T17:53:15.909Z", + projectId: marker, + sessionIds: ["app-1"], + }), + ).rejects.toThrow("synthetic disk-full"); + + const stateDir = join(testHome, ".agent-orchestrator"); + const stale = existsSync(stateDir) + ? readdirSync(stateDir).filter((n) => n.startsWith("last-stop.json.tmp.")) + : []; + expect(stale).toEqual([]); + expect(existsSync(join(stateDir, "last-stop.json"))).toBe(false); + }); + + it("readLastStop round-trips otherProjects through writeLastStop", async () => { + const runningState = await import("../../src/lib/running-state.js"); + + await runningState.writeLastStop({ + stoppedAt: "2026-05-08T17:53:15.909Z", + projectId: "my-app", + sessionIds: ["app-1"], + otherProjects: [{ projectId: "other-app", sessionIds: ["other-1", "other-2"] }], + }); + + const read = await runningState.readLastStop(); + expect(read).toEqual({ + stoppedAt: "2026-05-08T17:53:15.909Z", + projectId: "my-app", + sessionIds: ["app-1"], + otherProjects: [{ projectId: "other-app", sessionIds: ["other-1", "other-2"] }], + }); + }); + it("keeps startup locks alive when the pid probe returns EPERM", async () => { const runningState = await import("../../src/lib/running-state.js"); const lockDir = join(testHome, ".agent-orchestrator"); diff --git a/packages/cli/__tests__/lib/stop-update-start-flow.test.ts b/packages/cli/__tests__/lib/stop-update-start-flow.test.ts new file mode 100644 index 000000000..dca20f705 --- /dev/null +++ b/packages/cli/__tests__/lib/stop-update-start-flow.test.ts @@ -0,0 +1,158 @@ +/** + * Integration test for the stop → simulated-update → start flow. + * + * Reproduces the user-observed bug from issue #1743 in a hermetic + * environment: the test stages a `last-stop.json` exactly the way + * `ao stop` would, simulates the only side effect `ao update` + * legitimately has on this directory (i.e. none), and asserts that + * `ao start`'s read of `last-stop.json` returns the expected record. + * + * The fallback path is also exercised: if the record is somehow + * missing on the next `ao start`, `findRecentlyKilledSessions` must + * synthesize a restore set from recent `manually_killed` sessions + * exposed by the session manager. + */ + +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { existsSync, mkdirSync, readFileSync, rmSync, writeFileSync } from "node:fs"; +import { join } from "node:path"; +import type { CanonicalSessionLifecycle, Session, SessionManager } from "@aoagents/ao-core"; + +const testHome = join(process.cwd(), ".tmp-stop-update-start-home"); + +vi.mock("node:os", () => ({ + homedir: () => testHome, +})); + +const NOW = Date.parse("2026-05-08T17:54:00.000Z"); +const STOPPED_AT = "2026-05-08T17:53:15.909Z"; + +function makeLifecycle( + state: CanonicalSessionLifecycle["session"]["state"], + reason: CanonicalSessionLifecycle["session"]["reason"], + terminatedAt: string | null, +): CanonicalSessionLifecycle { + return { + version: 2, + session: { + kind: "worker", + state, + reason, + startedAt: null, + completedAt: null, + terminatedAt, + lastTransitionAt: terminatedAt ?? "2026-05-08T17:00:00.000Z", + }, + pr: { state: "none", reason: "not_created", number: null, url: null, lastObservedAt: null }, + runtime: { + state: "missing", + reason: "manual_kill_requested", + lastObservedAt: null, + handle: null, + tmuxName: null, + }, + }; +} + +function makeSession(id: string, projectId: string, lifecycle: CanonicalSessionLifecycle): Session { + return { + id, + projectId, + status: lifecycle.session.state === "terminated" ? "killed" : "working", + activity: lifecycle.session.state === "terminated" ? "exited" : null, + activitySignal: { state: "unavailable", activity: null, source: "none" }, + lifecycle, + branch: null, + issueId: null, + pr: null, + workspacePath: null, + runtimeHandle: null, + agentInfo: null, + createdAt: new Date(NOW - 60_000), + lastActivityAt: new Date(NOW - 30_000), + metadata: {}, + }; +} + +describe("stop → update → start flow (issue #1743)", () => { + beforeEach(() => { + rmSync(testHome, { recursive: true, force: true }); + vi.resetModules(); + }); + + afterEach(() => { + rmSync(testHome, { recursive: true, force: true }); + vi.restoreAllMocks(); + }); + + it("ao stop's writeLastStop produces a file ao start can read back across a simulated update", async () => { + const runningState = await import("../../src/lib/running-state.js"); + + // 1. Simulate `ao stop` recording the active sessions. + await runningState.writeLastStop({ + stoppedAt: STOPPED_AT, + projectId: "agent-orchestrator", + sessionIds: ["ao-170", "ao-171"], + }); + + const lastStopPath = join(testHome, ".agent-orchestrator", "last-stop.json"); + expect(existsSync(lastStopPath)).toBe(true); + const beforeUpdate = readFileSync(lastStopPath, "utf-8"); + + // 2. Simulate `ao update`. The current update command never touches + // ~/.agent-orchestrator/, so the only valid simulation is a + // no-op — anything else would be a regression. Verify the + // file's bytes are byte-for-byte identical afterwards. + // (We deliberately add a few unrelated state mutations to make + // sure the simulated update only affects what it should.) + mkdirSync(join(testHome, ".agent-orchestrator", "fake-update-marker"), { recursive: true }); + writeFileSync( + join(testHome, ".agent-orchestrator", "fake-update-marker", "version"), + "0.6.0\n", + ); + + const afterUpdate = readFileSync(lastStopPath, "utf-8"); + expect(afterUpdate).toBe(beforeUpdate); + + // 3. Simulate `ao start` reading the record back. + const restored = await runningState.readLastStop(); + expect(restored).toEqual({ + stoppedAt: STOPPED_AT, + projectId: "agent-orchestrator", + sessionIds: ["ao-170", "ao-171"], + }); + }); + + it("ao start's fallback finds recently manually-killed sessions when last-stop.json is missing", async () => { + // No last-stop.json exists — this is the failure mode reported in + // the bug. The fallback should still surface the killed sessions + // so the user gets a restore prompt. + const stateDir = join(testHome, ".agent-orchestrator"); + mkdirSync(stateDir, { recursive: true }); + expect(existsSync(join(stateDir, "last-stop.json"))).toBe(false); + + const fallback = await import("../../src/lib/last-stop-fallback.js"); + const sessions: Session[] = [ + makeSession( + "ao-170", + "agent-orchestrator", + makeLifecycle("terminated", "manually_killed", "2026-05-08T17:53:15.909Z"), + ), + makeSession( + "ao-171", + "agent-orchestrator", + makeLifecycle("terminated", "manually_killed", "2026-05-08T17:53:16.728Z"), + ), + ]; + const fakeSm = { + list: vi.fn(async () => sessions), + } as unknown as SessionManager; + + const result = await fallback.findRecentlyKilledSessions(fakeSm, "agent-orchestrator", { + now: NOW, + }); + expect(result).not.toBeNull(); + expect(result?.projectId).toBe("agent-orchestrator"); + expect(result?.sessionIds).toEqual(expect.arrayContaining(["ao-170", "ao-171"])); + }); +}); diff --git a/packages/cli/src/commands/start.ts b/packages/cli/src/commands/start.ts index 42a6ab502..74977b28c 100644 --- a/packages/cli/src/commands/start.ts +++ b/packages/cli/src/commands/start.ts @@ -97,6 +97,7 @@ import { } from "../lib/install-helpers.js"; import { ensureGit, runtimePreflight } from "../lib/startup-preflight.js"; import { installShutdownHandlers } from "../lib/shutdown.js"; +import { findRecentlyKilledSessions } from "../lib/last-stop-fallback.js"; import { resolveOrCreateProject } from "../lib/resolve-project.js"; import { pathsEqual } from "../lib/path-equality.js"; @@ -935,11 +936,40 @@ async function runStartup( } } - // Check for sessions from last `ao stop` and offer to restore them + // Check for sessions from last `ao stop` and offer to restore them. + // If `last-stop.json` is missing or empty (which can happen if the + // record never made it to disk — see issue #1743), fall back to a + // scan of recently `manually_killed` sessions so the restore prompt + // still fires. if (isHumanCaller()) { try { - const lastStop = await readLastStop(); - if (lastStop && lastStop.sessionIds.length > 0) { + let lastStop = await readLastStop(); + const lastStopHasContent = + !!lastStop && + (lastStop.sessionIds.length > 0 || (lastStop.otherProjects ?? []).length > 0); + if (!lastStopHasContent) { + // Use the global config so `sm.list()` sees sessions from every + // registered project. The project-scoped `config` only sees the + // current project's sessions, which would silently drop the + // cross-project rows the existing `readLastStop` path preserves + // via `otherProjects`. The restore step on line ~1002 already + // promotes to the global config when otherProjects is non-empty, + // so this just ensures the fallback can populate that array. + let fallbackConfig = config; + const globalPath = getGlobalConfigPath(); + if (existsSync(globalPath)) { + fallbackConfig = loadConfig(globalPath); + } + const fallbackSm = await getSessionManager(fallbackConfig); + const fallback = await findRecentlyKilledSessions(fallbackSm, projectId); + if ( + fallback && + (fallback.sessionIds.length > 0 || (fallback.otherProjects ?? []).length > 0) + ) { + lastStop = fallback; + } + } + if (lastStop && (lastStop.sessionIds.length > 0 || (lastStop.otherProjects ?? []).length > 0)) { const stoppedAgo = `stopped at ${new Date(lastStop.stoppedAt).toLocaleString()}`; const otherProjects = lastStop.otherProjects ?? []; diff --git a/packages/cli/src/lib/last-stop-fallback.ts b/packages/cli/src/lib/last-stop-fallback.ts new file mode 100644 index 000000000..c4649333d --- /dev/null +++ b/packages/cli/src/lib/last-stop-fallback.ts @@ -0,0 +1,122 @@ +/** + * Fallback for `ao start`'s restore prompt when `last-stop.json` is + * missing or malformed (issue #1743). Without this, a single regression + * in the last-stop write pipeline silently swallows the user's + * in-flight work. + * + * The fallback scans recently terminated sessions across every project + * the running session manager can see and synthesizes a `LastStopState` + * from any that look like they were killed by `ao stop` (canonical + * lifecycle reason `manually_killed`) within the recent past. + * + * It deliberately does *not* try to be clever: it does not consult + * `running.json`, does not infer ownership from process state, and does + * not attempt to deduplicate against an already-restored session. + * `sm.restore()` is the source of truth for whether a candidate is + * actually restorable; the fallback's only job is to make sure the + * prompt fires at all. + */ + +import type { Session, SessionManager } from "@aoagents/ao-core"; +import type { LastStopState } from "./running-state.js"; + +/** + * How recent a `manually_killed` session must be (in ms) for the + * fallback to surface it. 10 minutes covers the typical + * `ao stop` → `ao update` → `ao start` flow (a few seconds of stop + + * a minute or two of update) with generous slack for slow updates, + * while still excluding stale sessions from days/weeks ago. + */ +export const FALLBACK_RECENT_WINDOW_MS = 10 * 60 * 1000; + +function getTerminatedAtMs(session: Session): number | null { + const ts = session.lifecycle?.session?.terminatedAt ?? null; + if (!ts) return null; + const ms = Date.parse(ts); + if (!Number.isFinite(ms)) return null; + return ms; +} + +/** + * Build a synthetic `LastStopState` from sessions that: + * - are canonically terminated, + * - have reason "manually_killed", + * - and were terminated within `windowMs` of `now`. + * + * Returns null if nothing qualifies, so callers can do + * `lastStop ?? buildFallback(...)`. + * + * The returned record uses the *most recent* terminatedAt as the + * `stoppedAt` timestamp so the prompt's "stopped at " string is + * meaningful. + */ +export function buildLastStopFallback( + sessions: Session[], + primaryProjectId: string, + options?: { now?: number; windowMs?: number }, +): LastStopState | null { + const now = options?.now ?? Date.now(); + const windowMs = options?.windowMs ?? FALLBACK_RECENT_WINDOW_MS; + const cutoff = now - windowMs; + + const candidates: Array<{ session: Session; terminatedAt: number }> = []; + for (const session of sessions) { + const lifecycle = session.lifecycle?.session; + if (!lifecycle) continue; + if (lifecycle.state !== "terminated") continue; + if (lifecycle.reason !== "manually_killed") continue; + const terminatedAt = getTerminatedAtMs(session); + if (terminatedAt === null) continue; + if (terminatedAt < cutoff) continue; + candidates.push({ session, terminatedAt }); + } + + if (candidates.length === 0) return null; + + // Group by project, with the active CLI's project surfaced via + // `sessionIds` and everything else routed to `otherProjects` — + // matches the shape `ao start`'s prompt already knows how to render. + const byProject = new Map(); + let mostRecent = 0; + for (const { session, terminatedAt } of candidates) { + const pid = session.projectId ?? "unknown"; + const list = byProject.get(pid) ?? []; + list.push(session.id); + byProject.set(pid, list); + if (terminatedAt > mostRecent) mostRecent = terminatedAt; + } + + const primaryIds = byProject.get(primaryProjectId) ?? []; + byProject.delete(primaryProjectId); + const otherProjects: Array<{ projectId: string; sessionIds: string[] }> = []; + for (const [pid, ids] of byProject) { + otherProjects.push({ projectId: pid, sessionIds: ids }); + } + + return { + stoppedAt: new Date(mostRecent).toISOString(), + projectId: primaryProjectId, + sessionIds: primaryIds, + ...(otherProjects.length > 0 ? { otherProjects } : {}), + }; +} + +/** + * Scan every project the session manager can see and build a fallback + * LastStopState from recently `manually_killed` sessions. + * + * Returns null on any error (the fallback is best-effort — failing to + * surface candidates must never block startup). + */ +export async function findRecentlyKilledSessions( + sm: SessionManager, + primaryProjectId: string, + options?: { now?: number; windowMs?: number }, +): Promise { + try { + const sessions = await sm.list(); + return buildLastStopFallback(sessions, primaryProjectId, options); + } catch { + return null; + } +} diff --git a/packages/cli/src/lib/running-state.ts b/packages/cli/src/lib/running-state.ts index e24cd8bb0..3537bc684 100644 --- a/packages/cli/src/lib/running-state.ts +++ b/packages/cli/src/lib/running-state.ts @@ -7,6 +7,8 @@ import { closeSync, constants, statSync, + fsyncSync, + renameSync, } from "node:fs"; import { join } from "node:path"; import { homedir } from "node:os"; @@ -284,16 +286,51 @@ export async function waitForExit(pid: number, timeoutMs = 5000): Promise { const release = await acquireLock(LAST_STOP_LOCK_FILE, 5000, "last-stop.json lock"); try { - // Atomic temp+rename so a crash mid-write cannot leave torn JSON - // that makes `readLastStop()` silently return `null` and lose the - // restore prompt for sessions the user just stopped. - atomicWriteFileSync(LAST_STOP_FILE, JSON.stringify(state, null, 2)); + atomicWriteFileSyncDurable(LAST_STOP_FILE, JSON.stringify(state, null, 2)); } finally { release(); }