diff --git a/index.ts b/index.ts index 07f2e28..78a700c 100644 --- a/index.ts +++ b/index.ts @@ -26,6 +26,49 @@ const plugin = { return await controller.handleInboundClaim(event); }); + const registerInternalHook = ( + api as OpenClawPluginApi & { + registerHook?: ( + events: string | string[], + handler: (event: unknown) => Promise | void, + opts?: { name?: string; description?: string }, + ) => void; + } + ).registerHook; + if (typeof registerInternalHook === "function") { + registerInternalHook( + "message:transcribed", + async (event) => { + await controller.handleMessageTranscribed(event as { + type?: string; + action?: string; + sessionKey?: string; + context?: Record; + }); + }, + { + name: "codex-transcribed-handoff", + description: "Send transcribed inbound audio to the bound Codex thread as text.", + }, + ); + + registerInternalHook( + "message:preprocessed", + async (event) => { + await controller.handleMessagePreprocessed(event as { + type?: string; + action?: string; + sessionKey?: string; + context?: Record; + }); + }, + { + name: "codex-preprocessed-audio-fallback", + description: "Fallback: transcribe bound inbound audio from mediaPath when transcript events do not arrive.", + }, + ); + } + api.registerInteractiveHandler({ channel: "telegram", namespace: INTERACTIVE_NAMESPACE, diff --git a/src/controller.test.ts b/src/controller.test.ts index 6b590b2..95bf6db 100644 --- a/src/controller.test.ts +++ b/src/controller.test.ts @@ -3582,7 +3582,7 @@ describe("Discord controller flows", () => { { type: "text", text: "Read this file" }, { type: "text", - text: "Attached file: note.txt\nContent-Type: text/plain\n\nhello", + text: `Attached file: note.txt\nLocal path: ${filePath}\nContent-Type: text/plain\n\nhello`, }, ], }), @@ -3635,14 +3635,90 @@ describe("Discord controller flows", () => { input: [ { type: "text", - text: "Attached file: README.md\n\n# Heading\n\nBody text.\n", + text: `Attached file: README.md\nLocal path: ${filePath}\n\n# Heading\n\nBody text.\n`, }, ], }), ); }); - it("still ignores unsupported binary document attachments", async () => { + it("starts a turn when a message:transcribed event arrives for a bound session", async () => { + const { controller } = await createControllerHarness(); + await (controller as any).store.upsertBinding({ + conversation: { + channel: "telegram", + accountId: "default", + conversationId: TEST_TELEGRAM_PEER_ID, + }, + sessionKey: "session-1", + threadId: "thread-1", + workspaceDir: "/repo/openclaw", + updatedAt: Date.now(), + }); + const startTurn = vi.fn(async () => undefined); + (controller as any).startTurn = startTurn; + + await controller.handleMessageTranscribed({ + type: "message", + action: "transcribed", + sessionKey: "session-1", + context: { + transcript: "Hello from voice", + mediaType: "audio/ogg", + }, + }); + + expect(startTurn).toHaveBeenCalledWith( + expect.objectContaining({ + conversation: expect.objectContaining({ conversationId: TEST_TELEGRAM_PEER_ID }), + prompt: "Hello from voice", + reason: "inbound", + }), + ); + }); + + it("falls back to runtime transcription on message:preprocessed when transcript is missing", async () => { + const { controller } = await createControllerHarness(); + await (controller as any).store.upsertBinding({ + conversation: { + channel: "telegram", + accountId: "default", + conversationId: TEST_TELEGRAM_PEER_ID, + }, + sessionKey: "session-1", + threadId: "thread-1", + workspaceDir: "/repo/openclaw", + updatedAt: Date.now(), + }); + const startTurn = vi.fn(async () => undefined); + (controller as any).startTurn = startTurn; + (controller as any).api.runtime = { + mediaUnderstanding: { + transcribeAudioFile: vi.fn(async () => ({ text: "Fallback transcript" })), + }, + }; + (controller as any).api.config = {}; + + await controller.handleMessagePreprocessed({ + type: "message", + action: "preprocessed", + sessionKey: "session-1", + context: { + mediaPath: "/tmp/test.ogg", + mediaType: "audio/ogg", + }, + }); + + expect(startTurn).toHaveBeenCalledWith( + expect.objectContaining({ + conversation: expect.objectContaining({ conversationId: TEST_TELEGRAM_PEER_ID }), + prompt: "Fallback transcript", + reason: "inbound", + }), + ); + }); + + it("passes unsupported binary documents through as local file references", async () => { const { controller, stateDir } = await createControllerHarness(); const filePath = path.join(stateDir, "tmp", "manual.pdf"); fs.mkdirSync(path.dirname(filePath), { recursive: true }); @@ -3685,7 +3761,14 @@ describe("Discord controller flows", () => { expect(startTurn).toHaveBeenCalledWith( expect.objectContaining({ prompt: "Read this document", - input: [{ type: "text", text: "Read this document" }], + input: [ + { type: "text", text: "Read this document" }, + { + type: "text", + text: + `Attached file: manual.pdf\nLocal path: ${filePath}\nContent-Type: application/pdf\n\nUse this local file path directly from the server workspace. Do not ask the user to re-upload it unless the path is unreadable.`, + }, + ], }), ); }); diff --git a/src/controller.ts b/src/controller.ts index c9c5249..9d27ca4 100644 --- a/src/controller.ts +++ b/src/controller.ts @@ -607,7 +607,7 @@ async function toCodexTextAttachmentInputItem( const displayName = media.fileName?.trim() || path.basename(normalizedPath) || "attached-file.txt"; const mimeType = normalizeMimeType(media.mimeType); - const lines = [`Attached file: ${displayName}`]; + const lines = [`Attached file: ${displayName}`, `Local path: ${normalizedPath}`]; if (mimeType) { lines.push(`Content-Type: ${mimeType}`); } @@ -618,6 +618,32 @@ async function toCodexTextAttachmentInputItem( return { type: "text", text: lines.join("\n") }; } +function toCodexGenericFileReferenceInputItem( + media: PluginInboundMedia, +): CodexTurnInputItem | null { + if (media.kind === "image") { + return null; + } + const normalizedPath = normalizeInboundMediaPath(media.path ?? media.url); + if (!normalizedPath || !path.isAbsolute(normalizedPath)) { + return null; + } + const displayName = media.fileName?.trim() || path.basename(normalizedPath) || "attached-file"; + const mimeType = normalizeMimeType(media.mimeType); + const lines = [ + `Attached file: ${displayName}`, + `Local path: ${normalizedPath}`, + ]; + if (mimeType) { + lines.push(`Content-Type: ${mimeType}`); + } + lines.push( + "", + "Use this local file path directly from the server workspace. Do not ask the user to re-upload it unless the path is unreadable.", + ); + return { type: "text", text: lines.join("\n") }; +} + async function buildInboundTurnInput(event: { content: string; media?: PluginInboundMedia[]; @@ -629,7 +655,10 @@ async function buildInboundTurnInput(event: { } const seen = new Set(); for (const media of [...(event.media ?? []), ...extractInboundMetadataMedia(event.metadata)]) { - const item = toCodexImageInputItem(media) ?? (await toCodexTextAttachmentInputItem(media)); + const item = + toCodexImageInputItem(media) ?? + (await toCodexTextAttachmentInputItem(media)) ?? + toCodexGenericFileReferenceInputItem(media); if (!item) { continue; } @@ -1409,6 +1438,120 @@ export class CodexPluginController { } } + async handleMessageTranscribed(event: { + type?: string; + action?: string; + sessionKey?: string; + context?: Record; + }): Promise { + try { + if (!this.settings.enabled) { + return; + } + if (event.type !== "message" || event.action !== "transcribed") { + return; + } + await this.start(); + const sessionKey = typeof event.sessionKey === "string" ? event.sessionKey : ""; + const ctx = event.context ?? {}; + const transcript = typeof ctx.transcript === "string" ? ctx.transcript.trim() : ""; + if (!sessionKey || !transcript) { + return; + } + const binding = this.store.listBindings().find((entry) => entry.sessionKey === sessionKey) ?? null; + if (!binding) { + return; + } + const mediaType = typeof ctx.mediaType === "string" ? ctx.mediaType : ""; + const conversation = binding.conversation; + this.api.logger.info( + `codex message:transcribed starting turn ${this.formatConversationForLog(conversation)} mediaType=${mediaType || ""} prompt="${summarizeTextForLog(transcript)}"`, + ); + await this.startTurn({ + conversation, + binding, + workspaceDir: binding.workspaceDir, + prompt: transcript, + reason: "inbound", + }); + } catch (error) { + const detail = error instanceof Error ? `${error.message}\n${error.stack ?? ""}`.trim() : String(error); + this.api.logger.error(`codex message:transcribed failed: ${detail}`); + } + } + + async handleMessagePreprocessed(event: { + type?: string; + action?: string; + sessionKey?: string; + context?: Record; + }): Promise { + try { + if (!this.settings.enabled) { + return; + } + if (event.type !== "message" || event.action !== "preprocessed") { + return; + } + await this.start(); + const sessionKey = typeof event.sessionKey === "string" ? event.sessionKey : ""; + const ctx = event.context ?? {}; + const mediaPath = typeof ctx.mediaPath === "string" ? ctx.mediaPath : ""; + const mediaType = typeof ctx.mediaType === "string" ? ctx.mediaType : ""; + const existingTranscript = typeof ctx.transcript === "string" ? ctx.transcript.trim() : ""; + if ( + !sessionKey || + !mediaPath || + (!mediaType.startsWith("audio/") && !mediaPath.match(/\.(ogg|oga|opus|mp3|wav|m4a)$/i)) + ) { + return; + } + const binding = this.store.listBindings().find((entry) => entry.sessionKey === sessionKey) ?? null; + if (!binding) { + return; + } + const mediaUnderstanding = (this.api as OpenClawPluginApi & { + runtime?: { + mediaUnderstanding?: { + transcribeAudioFile?: (params: { + filePath: string; + cfg?: unknown; + mime?: string; + }) => Promise<{ text?: string } | null | undefined>; + }; + }; + config?: unknown; + }).runtime?.mediaUnderstanding; + const transcript = + existingTranscript || + (await mediaUnderstanding?.transcribeAudioFile?.({ + filePath: mediaPath, + cfg: (this.api as { config?: unknown }).config, + mime: mediaType || undefined, + }))?.text?.trim() || + ""; + if (!transcript) { + this.api.logger.warn( + `codex message:preprocessed audio fallback produced no transcript ${this.formatConversationForLog(binding.conversation)} mediaPath=${mediaPath}`, + ); + return; + } + this.api.logger.info( + `codex message:preprocessed audio fallback starting turn ${this.formatConversationForLog(binding.conversation)} prompt="${summarizeTextForLog(transcript)}"`, + ); + await this.startTurn({ + conversation: binding.conversation, + binding, + workspaceDir: binding.workspaceDir, + prompt: transcript, + reason: "inbound", + }); + } catch (error) { + const detail = error instanceof Error ? `${error.message}\n${error.stack ?? ""}`.trim() : String(error); + this.api.logger.error(`codex message:preprocessed fallback failed: ${detail}`); + } + } + async handleTelegramInteractive(ctx: PluginInteractiveTelegramHandlerContext): Promise { await this.start(); const bindingApi = asScopedBindingApi(ctx);