diff --git a/CHANGELOG.md b/CHANGELOG.md index 2ee2a1c..b414661 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,15 @@ # Changelog +## Unreleased + +### Highlights + +- Added an optional inbound audio transcription preprocessor so bound conversations can convert staged voice/audio attachments into normal text turn input before forwarding the turn into Codex. The plugin stays transport-agnostic by delegating transcription to a configurable local command that prints transcript text to stdout. + +### Docs + +- Documented the new `inboundAudioTranscription` plugin config and clarified the media bridge notes around staged inbound audio handling. + ## v0.6.0 - 2026-04-03 ### Highlights diff --git a/README.md b/README.md index ffc3ae8..750d394 100644 --- a/README.md +++ b/README.md @@ -212,6 +212,39 @@ The plugin schema in [`openclaw.plugin.json`](./openclaw.plugin.json) supports: - `defaultWorkspaceDir`: fallback workspace for unbound actions - `defaultModel`: model used when a new thread starts without an explicit selection - `defaultServiceTier`: default service tier for new turns +- `inboundAudioTranscription`: optional preprocessor for inbound audio/voice attachments before they are forwarded into Codex + +### Optional inbound audio transcription + +If your chat surface provides inbound audio files as local paths or media metadata, this plugin can transcribe them before forwarding the turn to Codex. This keeps the plugin transport-agnostic: Codex still receives normal text input, while transcription is delegated to any local command you choose. 
+ +Example config using an existing local script: + +```json +{ + "inboundAudioTranscription": { + "enabled": true, + "command": "/root/.openclaw/workspace/scripts/local-stt-transcribe.sh", + "args": ["{path}"], + "timeoutMs": 20000 + } +} +``` + +Behavior: + +- audio-only inbound messages become transcript text +- caption + audio keeps the caption and adds a labeled transcript block +- the command should print the transcript to stdout +- if stdout is JSON, `.text` or `.transcript` is used automatically + +Argument placeholders supported in `args`: + +- `{path}` +- `{mimeType}` +- `{fileName}` + +If `{path}` is omitted from `args`, the plugin appends the media path automatically. ## Developer Workflow With A Local OpenClaw Checkout diff --git a/docs/specs/MEDIA.md b/docs/specs/MEDIA.md index 4300594..ed33ad8 100644 --- a/docs/specs/MEDIA.md +++ b/docs/specs/MEDIA.md @@ -5,7 +5,8 @@ This document captures the current state of media handling relevant to this plug - how Codex app-server accepts image input - what this plugin currently sends - what OpenClaw currently exposes to plugins -- the gap for inbound media +- the remaining gap for richer inbound media +- the staged-audio transcription bridge this plugin now supports - a recommended bridge design for future implementation This is a spec/notes document only. It does not imply that inbound media support has already been implemented here. @@ -15,9 +16,11 @@ This is a spec/notes document only. It does not imply that inbound media support - Codex app-server already supports multimodal turn input via `UserInput`. - The supported image-shaped input items are remote/data URL images and local filesystem images. - This plugin now supports mixed text + image turn input and forwards inbound image media into Codex when OpenClaw provides a staged media path or URL. +- This plugin can also transcribe staged inbound audio/voice attachments into plain text turn input when a local transcription command is configured. 
- OpenClaw’s plugin SDK already supports outbound attachments from a plugin via `mediaUrl` and `mediaUrls`. - OpenClaw’s plugin SDK still does not model inbound attachments as a first-class typed field on command or `inbound_claim` events. - In practice, current `inbound_claim` hook metadata already carries `mediaPath` / `mediaType`, which is enough for this plugin to forward a staged inbound image. +- The same staged inbound path is also enough to transcribe audio before Codex sees the turn, as long as the plugin can execute an external transcription command against the staged file. - The cleanest future bridge is: OpenClaw stages inbound files locally, then this plugin maps image paths to Codex `localImage` items. ## Codex App-Server Input Model @@ -177,8 +180,41 @@ That means: - text-only turns still work as before - mixed text + image turns can be forwarded into Codex - image-only inbound turns can be forwarded into Codex +- audio-only inbound turns can be converted into transcript text before the turn starts when `inboundAudioTranscription` is configured +- mixed caption + audio inbound turns can keep the original text and append a labeled transcript block - staged text attachments such as `.txt`, `.md`, `.json`, `.yaml`, and `.yml` can be read and forwarded as additional `text` items -- unsupported binary non-image inbound media is still ignored for now +- unsupported binary non-image inbound media is still ignored for now unless a future bridge teaches the plugin how to reinterpret it + +## Inbound Audio Transcription Bridge + +The plugin does not send raw audio into Codex. Instead, it can optionally reinterpret staged audio files as text by invoking a configurable local command. 
+ +Configuration shape: + +```json +{ + "inboundAudioTranscription": { + "enabled": true, + "command": "/path/to/transcribe", + "args": ["{path}"], + "timeoutMs": 20000 + } +} +``` + +Behavior: + +- The command receives the staged media path either through an explicit `{path}` placeholder or as an appended trailing argument. +- Optional placeholders `{mimeType}` and `{fileName}` are available for wrappers that need them. +- The command should print the transcript to stdout. +- If stdout is JSON, the plugin uses `.text` first and then `.transcript`. +- On transcription failure or timeout, the plugin logs the failure and falls back to the previous behavior instead of crashing the inbound turn. + +This keeps the bridge generic: + +- no hard dependency on a specific speech-to-text engine +- no plugin-side audio decoding logic +- no transport-specific behavior baked into the Codex turn layer ## OpenClaw Plugin SDK: Outbound Media diff --git a/openclaw.plugin.json b/openclaw.plugin.json index 2c928a0..265a26a 100644 --- a/openclaw.plugin.json +++ b/openclaw.plugin.json @@ -53,6 +53,28 @@ }, "defaultServiceTier": { "type": "string" + }, + "inboundAudioTranscription": { + "type": "object", + "additionalProperties": false, + "properties": { + "enabled": { + "type": "boolean" + }, + "command": { + "type": "string" + }, + "args": { + "type": "array", + "items": { + "type": "string" + } + }, + "timeoutMs": { + "type": "number", + "minimum": 100 + } + } } } }, @@ -100,6 +122,11 @@ "defaultServiceTier": { "label": "Default Service Tier", "advanced": true + }, + "inboundAudioTranscription": { + "label": "Inbound Audio Transcription", + "advanced": true, + "help": "Optional preprocessor for inbound audio/voice attachments. The command should print the transcript to stdout. Use {path}, {mimeType}, and {fileName} placeholders in args when needed." 
/**
 * Resolves the optional inbound audio transcription settings from raw plugin config.
 *
 * The `inboundAudioTranscription` object is preferred; when it is missing or has no
 * keys, the `audioTranscription` object is consulted instead. When neither carries
 * any keys the feature is treated as unconfigured.
 *
 * @param record - Plugin config, already coerced to a plain record by the caller.
 * @returns Normalized settings, or `undefined` when nothing is configured.
 */
function resolveInboundAudioTranscription(
  record: Record<string, unknown>,
): InboundAudioTranscriptionSettings | undefined {
  const preferred = asRecord(record.inboundAudioTranscription);
  const source =
    Object.keys(preferred).length > 0 ? preferred : asRecord(record.audioTranscription);
  if (Object.keys(source).length === 0) {
    return undefined;
  }
  return {
    // A configured block is active unless `enabled` is exactly `false`.
    enabled: source.enabled !== false,
    command: readString(source, "command"),
    // The settings type requires an array and the argv builder maps over it, so
    // never let a missing `args` key surface as undefined.
    // NOTE(review): redundant if readStringArray already returns [] — confirm.
    args: readStringArray(source, "args") ?? [],
    // 20s default; presumably the last argument is the lower bound (the JSON
    // schema declares a minimum of 100) — confirm against readNumber.
    timeoutMs: readNumber(source, "timeoutMs", 20_000, 100),
  };
}
"websocket" : "stdio"; @@ -82,6 +103,7 @@ export function resolvePluginSettings(rawConfig: unknown): PluginSettings { defaultWorkspaceDir: readString(record, "defaultWorkspaceDir"), defaultModel: readString(record, "defaultModel"), defaultServiceTier: readString(record, "defaultServiceTier"), + inboundAudioTranscription: resolveInboundAudioTranscription(record), }; } diff --git a/src/controller.test.ts b/src/controller.test.ts index af9973e..007c3f5 100644 --- a/src/controller.test.ts +++ b/src/controller.test.ts @@ -312,6 +312,99 @@ async function createControllerHarness() { }; } +async function createControllerHarnessWithPluginConfig(pluginConfigOverrides: Record) { + const { + api, + sendComponentMessage, + sendMessageDiscord, + sendMessageTelegram, + discordTypingStart, + renameTopic, + resolveTelegramToken, + editChannel, + discordOutbound, + stateDir, + } = createApiMock(pluginConfigOverrides); + const controller = new CodexPluginController(api); + await controller.start(); + const threadState: any = { + threadId: "thread-1", + threadName: "Discord Thread", + model: "openai/gpt-5.4", + cwd: "/repo/openclaw", + serviceTier: "default", + approvalPolicy: "on-request", + sandbox: "workspace-write", + }; + const clientMock = { + hasProfile: vi.fn((profile: string) => profile === "default" || profile === "full-access"), + listThreads: vi.fn(async () => []), + startThread: vi.fn(async () => ({ + threadId: "thread-new", + threadName: "New Thread", + model: "openai/gpt-5.4", + cwd: "/repo/openclaw", + serviceTier: "default", + })), + listModels: vi.fn(async () => [{ id: "openai/gpt-5.4", current: true }]), + listSkills: vi.fn(async () => []), + listMcpServers: vi.fn(async () => []), + readThreadState: vi.fn(async () => ({ ...threadState })), + readThreadContext: vi.fn(async () => ({ + lastUserMessage: undefined, + lastAssistantMessage: undefined, + })), + setThreadName: vi.fn(async () => ({ + threadId: "thread-1", + threadName: "Discord Thread", + })), + 
setThreadModel: vi.fn(async (params: { model: string }) => { + threadState.model = params.model; + return { ...threadState }; + }), + setThreadServiceTier: vi.fn(async (params: { serviceTier: string | null }) => { + threadState.serviceTier = params.serviceTier ?? "default"; + return { ...threadState }; + }), + setThreadPermissions: vi.fn(async (params: { approvalPolicy: string; sandbox: string }) => { + threadState.approvalPolicy = params.approvalPolicy; + threadState.sandbox = params.sandbox; + return { ...threadState }; + }), + startReview: vi.fn(() => ({ + result: new Promise(() => {}), + getThreadId: () => "thread-1", + queueMessage: vi.fn(async () => false), + interrupt: vi.fn(async () => {}), + isAwaitingInput: () => false, + submitPendingInput: vi.fn(async () => false), + submitPendingInputPayload: vi.fn(async () => false), + })), + readAccount: vi.fn(async () => ({ + email: "test@example.com", + planType: "pro", + type: "chatgpt", + })), + readRateLimits: vi.fn(async () => []), + }; + (controller as any).client = clientMock; + (controller as any).readThreadHasChanges = vi.fn(async () => false); + return { + controller, + api, + clientMock, + sendComponentMessage, + sendMessageDiscord, + sendMessageTelegram, + discordTypingStart, + renameTopic, + resolveTelegramToken, + editChannel, + discordOutbound, + stateDir, + }; +} + async function createControllerHarnessWithoutLegacyBindings() { const harness = createApiMock(); delete (harness.api as any).runtime.channel.bindings; @@ -4225,6 +4318,127 @@ describe("Discord controller flows", () => { ); }); + it("transcribes inbound audio with a configured command before starting the turn", async () => { + const { controller, stateDir } = await createControllerHarnessWithPluginConfig({ + inboundAudioTranscription: { + enabled: true, + command: process.execPath, + args: [ + "-e", + 'process.stdout.write(JSON.stringify({text:`Transcript for ${process.argv[1]}`}))', + "{path}", + ], + }, + }); + const audioPath = 
path.join(stateDir, "tmp", "voice.ogg"); + fs.mkdirSync(path.dirname(audioPath), { recursive: true }); + fs.writeFileSync(audioPath, "ogg"); + await (controller as any).store.upsertBinding({ + conversation: { + channel: "telegram", + accountId: "default", + conversationId: TEST_TELEGRAM_PEER_ID, + }, + sessionKey: "session-1", + threadId: "thread-1", + workspaceDir: "/repo/openclaw", + updatedAt: Date.now(), + }); + const startTurn = vi.fn(() => ({ + result: Promise.resolve({ + threadId: "thread-1", + text: "handled", + }), + getThreadId: () => "thread-1", + queueMessage: vi.fn(async () => true), + interrupt: vi.fn(async () => {}), + isAwaitingInput: () => false, + submitPendingInput: vi.fn(async () => false), + submitPendingInputPayload: vi.fn(async () => false), + })); + (controller as any).client.startTurn = startTurn; + + const result = await controller.handleInboundClaim({ + content: "", + channel: "telegram", + accountId: "default", + conversationId: TEST_TELEGRAM_PEER_ID, + isGroup: false, + metadata: { mediaPath: audioPath, mediaType: "audio/ogg" }, + }); + + expect(result).toEqual({ handled: true }); + expect(startTurn).toHaveBeenCalledWith( + expect.objectContaining({ + prompt: "", + input: [{ type: "text", text: `Transcript for ${audioPath}` }], + }), + ); + }); + + it("keeps labeled transcript text when audio arrives with a caption", async () => { + const { controller, stateDir } = await createControllerHarnessWithPluginConfig({ + inboundAudioTranscription: { + enabled: true, + command: process.execPath, + args: [ + "-e", + 'process.stdout.write("hello from audio")', + ], + }, + }); + const audioPath = path.join(stateDir, "tmp", "voice-note.ogg"); + fs.mkdirSync(path.dirname(audioPath), { recursive: true }); + fs.writeFileSync(audioPath, "ogg"); + await (controller as any).store.upsertBinding({ + conversation: { + channel: "telegram", + accountId: "default", + conversationId: TEST_TELEGRAM_PEER_ID, + }, + sessionKey: "session-1", + threadId: "thread-1", 
/**
 * Lower-case file extensions (with leading dot) that mark a path as audio.
 *
 * Looked up with the lower-cased `path.extname` of an inbound media path, so
 * every entry must be lower-case.
 */
const AUDIO_FILE_EXTENSIONS = new Set<string>([
  ".aac",
  ".flac",
  ".m4a",
  ".mp3",
  // Registered extension for Ogg audio; presumably what some chat transports
  // use for staged voice notes — verify against the transports in use.
  ".oga",
  ".ogg",
  ".opus",
  ".wav",
  ".webm",
]);
/**
 * Extracts transcript text from a transcription command's stdout.
 *
 * Plain text is returned trimmed. When stdout parses as a JSON object, the first
 * non-empty string among `.text` and `.transcript` (in that order) is returned;
 * if those fields are strings but all blank, the result is an empty string.
 * Any other JSON value, or JSON without those string fields, is returned as the
 * trimmed raw stdout.
 *
 * @param stdout - Raw stdout captured from the transcription command.
 * @returns The trimmed transcript, or `""` when there is none.
 */
function extractTranscriptText(stdout: string): string {
  const trimmed = stdout.trim();
  if (!trimmed) {
    return "";
  }
  let parsed: unknown;
  try {
    parsed = JSON.parse(trimmed);
  } catch {
    // Not JSON: the command printed the transcript directly.
    return trimmed;
  }
  if (typeof parsed !== "object" || parsed === null) {
    return trimmed;
  }
  const { text, transcript } = parsed as { text?: unknown; transcript?: unknown };
  const candidates = [text, transcript].filter(
    (value): value is string => typeof value === "string",
  );
  if (candidates.length === 0) {
    return trimmed;
  }
  // Prefer `.text`, but do not let an empty `.text` hide a populated `.transcript`.
  return candidates.map((value) => value.trim()).find((value) => value.length > 0) ?? "";
}

/**
 * Renders the argv for the transcription command.
 *
 * Replaces `{path}`, `{mimeType}` and `{fileName}` in every configured argument.
 * When no argument uses the `{path}` placeholder, the media path is appended as
 * a trailing argument so the command always receives the file.
 *
 * @param params.args - Configured argument templates.
 * @param params.mediaPath - Path of the staged media file.
 * @param params.mimeType - MIME type substituted for `{mimeType}` (empty when absent).
 * @param params.fileName - Name substituted for `{fileName}`; defaults to the path's basename.
 * @returns A new argv array; `params.args` is not mutated.
 */
function buildAudioTranscriptArgv(params: {
  args: readonly string[];
  mediaPath: string;
  mimeType?: string;
  fileName?: string;
}): string[] {
  const replacements = {
    path: params.mediaPath,
    mimeType: params.mimeType ?? "",
    // A blank file name is as useless as a missing one; fall back to the basename.
    fileName: params.fileName?.trim() || path.basename(params.mediaPath),
  };
  // Decide on the templates, not the rendered output: an unrelated argument that
  // merely contains the media path as a substring must not suppress the append.
  const usesPathPlaceholder = params.args.some((entry) => entry.includes("{path}"));
  const rendered = params.args.map((entry) =>
    entry.replace(
      /\{(path|mimeType|fileName)\}/g,
      (_match, key: keyof typeof replacements) => replacements[key],
    ),
  );
  if (!usesPathPlaceholder) {
    rendered.push(params.mediaPath);
  }
  return rendered;
}
/**
 * Transcribes one staged inbound audio attachment with the configured command.
 *
 * Returns `null` — leaving the caller to handle the media as before — when
 * transcription is disabled or has no command, when the media does not resolve
 * to an absolute path of an existing regular file, when the command fails or
 * times out, or when it yields no transcript text. Failures of the command are
 * logged as warnings and never thrown.
 *
 * @param media - Inbound media descriptor; `path` is preferred over `url`.
 * @returns The trimmed transcript, or `null` when none could be produced.
 */
private async transcribeInboundAudio(media: PluginInboundMedia): Promise<string | null> {
  const settings = this.settings.inboundAudioTranscription;
  const command = settings?.command?.trim();
  if (!settings?.enabled || !command) {
    return null;
  }
  const mediaPath = normalizeInboundMediaPath(media.path ?? media.url);
  if (!mediaPath || !path.isAbsolute(mediaPath)) {
    return null;
  }
  // Any stat failure (missing file, permissions) is treated as "not a file".
  const stats = await fs.stat(mediaPath).catch(() => undefined);
  if (!stats?.isFile()) {
    return null;
  }
  try {
    // Built inside the try so malformed settings degrade to the logged fallback
    // instead of rejecting the whole inbound turn.
    const argv = buildAudioTranscriptArgv({
      args: settings.args ?? [],
      mediaPath,
      mimeType: normalizeMimeType(media.mimeType),
      fileName: media.fileName,
    });
    const result = await execFileAsync(command, argv, {
      timeout: settings.timeoutMs,
      maxBuffer: 1024 * 1024,
    });
    const transcript = extractTranscriptText(result.stdout);
    return transcript.trim() || null;
  } catch (error) {
    this.api.logger.warn(
      `codex inbound audio transcription failed file=${mediaPath}: ${String(error)}`,
    );
    return null;
  }
}