diff --git a/sdk/go/ai/media_integration_test.go b/sdk/go/ai/media_integration_test.go index b5c89bcde..53184095f 100644 --- a/sdk/go/ai/media_integration_test.go +++ b/sdk/go/ai/media_integration_test.go @@ -1,6 +1,7 @@ package ai import ( + "bytes" "context" "encoding/base64" "encoding/json" @@ -250,7 +251,7 @@ func TestIntegrationAudioSSEStream(t *testing.T) { var payload map[string]any require.NoError(t, json.NewDecoder(r.Body).Decode(&payload)) assert.Equal(t, true, payload["stream"]) - assert.Equal(t, "openai/gpt-4o-mini-tts", payload["model"]) + assert.Equal(t, "openai/gpt-audio-mini", payload["model"]) w.Header().Set("Content-Type", "text/event-stream") flusher, _ := w.(http.Flusher) @@ -274,18 +275,21 @@ func TestIntegrationAudioSSEStream(t *testing.T) { BaseURL: srv.URL, Client: srv.Client(), } + // Pre-seed metadata so routing picks the chat-completions path + // without hitting the real OpenRouter `/models/.../endpoints` endpoint. + p.SeedModelMeta("openai/gpt-audio-mini", []string{"text", "audio"}, []string{"text"}) resp, err := p.GenerateAudio(context.Background(), AudioRequest{ Text: "Say hello", - Model: "openrouter/openai/gpt-4o-mini-tts", + Model: "openrouter/openai/gpt-audio-mini", Voice: "nova", - Format: "wav", + Format: "mp3", // avoid pcm→wav rewrap so we can compare raw bytes }) require.NoError(t, err) assert.Equal(t, "Hello world", resp.Text) require.NotNil(t, resp.Audio) - assert.Equal(t, "wav", resp.Audio.Format) + assert.Equal(t, "mp3", resp.Audio.Format) assert.NotEmpty(t, resp.Audio.Data) // Decode and verify audio bytes @@ -315,9 +319,11 @@ func TestIntegrationAudioWithCustomFormat(t *testing.T) { BaseURL: srv.URL, Client: srv.Client(), } + p.SeedModelMeta("openai/gpt-audio-mini", []string{"text", "audio"}, []string{"text"}) resp, err := p.GenerateAudio(context.Background(), AudioRequest{ Text: "test", + Model: "openai/gpt-audio-mini", Voice: "echo", Format: "mp3", }) @@ -325,6 +331,43 @@ func TestIntegrationAudioWithCustomFormat(t 
*testing.T) { assert.Equal(t, "mp3", resp.Audio.Format) } +func TestIntegrationAudioSpeechEndpoint(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + assert.Equal(t, "/audio/speech", r.URL.Path) + assert.Equal(t, "Bearer test-key", r.Header.Get("Authorization")) + + var payload map[string]any + require.NoError(t, json.NewDecoder(r.Body).Decode(&payload)) + assert.Equal(t, "hexgrad/kokoro-82m", payload["model"]) + assert.Equal(t, "af_bella", payload["voice"]) + assert.Equal(t, "pcm", payload["response_format"]) // wav → pcm on wire + + // Return 1KB of fake PCM16 + pcm := bytes.Repeat([]byte{0x00, 0x01}, 500) + w.Header().Set("Content-Type", "audio/pcm") + w.WriteHeader(http.StatusOK) + _, _ = w.Write(pcm) + })) + defer srv.Close() + + p := &OpenRouterMediaProvider{APIKey: "test-key", BaseURL: srv.URL, Client: srv.Client()} + p.SeedModelMeta("hexgrad/kokoro-82m", []string{"speech"}, []string{"text"}) + + resp, err := p.GenerateAudio(context.Background(), AudioRequest{ + Text: "hello", + Model: "openrouter/hexgrad/kokoro-82m", + Voice: "af_bella", + Format: "wav", + }) + require.NoError(t, err) + require.NotNil(t, resp.Audio) + assert.Equal(t, "wav", resp.Audio.Format) + decoded, err := base64.StdEncoding.DecodeString(resp.Audio.Data) + require.NoError(t, err) + assert.Equal(t, []byte("RIFF"), decoded[:4]) + assert.Equal(t, []byte("WAVE"), decoded[8:12]) +} + func TestIntegrationImageGeneration(t *testing.T) { srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { var payload map[string]any diff --git a/sdk/go/ai/media_provider.go b/sdk/go/ai/media_provider.go index 6782d0d36..e4e0ef416 100644 --- a/sdk/go/ai/media_provider.go +++ b/sdk/go/ai/media_provider.go @@ -17,35 +17,69 @@ type VideoRequest struct { AspectRatio string `json:"aspect_ratio,omitempty"` GenerateAudio *bool `json:"generate_audio,omitempty"` Seed *int `json:"seed,omitempty"` + // ImageURL is a single input 
image for image-to-video models (convenience + // alternative to FrameImages with frame_type=first_frame). + ImageURL string `json:"image_url,omitempty"` + // FrameImages is per-frame guidance — first_frame / last_frame. Items + // follow OpenRouter's shape: {type, image_url:{url}, frame_type}. FrameImages []map[string]any `json:"frame_images,omitempty"` + // InputReferences supplies style/subject reference images (Veo + // "reference-to-video"). InputReferences []map[string]any `json:"input_references,omitempty"` PollInterval time.Duration `json:"-"` Timeout time.Duration `json:"-"` + // Extra passes through additional model-specific parameters (e.g. Veo's + // `personGeneration`). Extra map[string]any `json:"-"` } // ImageRequest holds parameters for image generation. type ImageRequest struct { - Prompt string `json:"prompt"` - Model string `json:"model,omitempty"` - Size string `json:"size,omitempty"` - Quality string `json:"quality,omitempty"` - ImageConfig *ImageConfig `json:"image_config,omitempty"` + Prompt string `json:"prompt"` + Model string `json:"model,omitempty"` + Size string `json:"size,omitempty"` + Quality string `json:"quality,omitempty"` + // ImageURLs are reference / source images for image+text→image models + // (e.g. x-ai/grok-imagine-image-quality). Each may be an http(s) or + // data: URL. + ImageURLs []string `json:"-"` + ImageConfig *ImageConfig `json:"image_config,omitempty"` + // Extra passes through additional model-specific parameters. + Extra map[string]any `json:"-"` } // ImageConfig holds OpenRouter-specific image configuration. type ImageConfig struct { - AspectRatio string `json:"aspect_ratio,omitempty"` - ImageSize string `json:"image_size,omitempty"` - SuperResolutionReferences []string `json:"super_resolution_references,omitempty"` + AspectRatio string `json:"aspect_ratio,omitempty"` + ImageSize string `json:"image_size,omitempty"` + // Strength is the image-to-image blend (0–1, model-dependent). 
+ Strength *float64 `json:"strength,omitempty"` + // Style hint (e.g. Recraft V3 styles). + Style string `json:"style,omitempty"` + // RgbColors is a color palette — array of [r,g,b]. + RgbColors [][3]int `json:"rgb_colors,omitempty"` + // BackgroundRgbColor is [r,g,b]. + BackgroundRgbColor *[3]int `json:"background_rgb_color,omitempty"` + SuperResolutionReferences []string `json:"super_resolution_references,omitempty"` + FontInputs []FontInput `json:"font_inputs,omitempty"` +} + +// FontInput configures custom text rendering for compatible image models. +type FontInput struct { + FontURL string `json:"font_url"` + Text string `json:"text"` } // AudioRequest holds parameters for audio generation. type AudioRequest struct { - Text string `json:"text"` - Model string `json:"model,omitempty"` - Voice string `json:"voice,omitempty"` - Format string `json:"format,omitempty"` + Text string `json:"text"` + Model string `json:"model,omitempty"` + Voice string `json:"voice,omitempty"` + Format string `json:"format,omitempty"` + // Speed multiplier (OpenAI TTS respects; other models ignore). + Speed *float64 `json:"speed,omitempty"` + // Extra passes through additional model-specific parameters. + Extra map[string]any `json:"-"` } // MediaResponse holds the result of a media generation call. 
diff --git a/sdk/go/ai/media_provider_test.go b/sdk/go/ai/media_provider_test.go index 0f6d4c668..715c49178 100644 --- a/sdk/go/ai/media_provider_test.go +++ b/sdk/go/ai/media_provider_test.go @@ -219,15 +219,18 @@ func TestOpenRouterGenerateAudio(t *testing.T) { BaseURL: srv.URL, Client: srv.Client(), } + p.SeedModelMeta("openai/gpt-audio-mini", []string{"text", "audio"}, []string{"text"}) resp, err := p.GenerateAudio(context.Background(), AudioRequest{ - Text: "Say hello", - Voice: "nova", + Text: "Say hello", + Model: "openai/gpt-audio-mini", + Voice: "nova", + Format: "mp3", }) require.NoError(t, err) assert.Equal(t, "Hello", resp.Text) require.NotNil(t, resp.Audio) - assert.Equal(t, "pcm16", resp.Audio.Format) + assert.Equal(t, "mp3", resp.Audio.Format) assert.NotEmpty(t, resp.Audio.Data) } diff --git a/sdk/go/ai/openrouter_media.go b/sdk/go/ai/openrouter_media.go index 4d126b3eb..3abe5d8c7 100644 --- a/sdk/go/ai/openrouter_media.go +++ b/sdk/go/ai/openrouter_media.go @@ -5,13 +5,16 @@ import ( "bytes" "context" "encoding/base64" + "encoding/binary" "encoding/json" "fmt" "io" "net/http" + "net/url" "os" "regexp" "strings" + "sync" "time" ) @@ -22,13 +25,24 @@ const ( defaultOpenRouterBaseURL = "https://openrouter.ai/api/v1" defaultVideoPollInterval = 30 * time.Second defaultVideoTimeout = 10 * time.Minute + defaultTTSSampleRate = 24000 + openRouterModelMetaTTL = 30 * time.Minute ) +// modelMeta holds the architecture metadata for an OpenRouter model. +type modelMeta struct { + OutputModalities []string + InputModalities []string +} + // OpenRouterMediaProvider implements MediaProvider for OpenRouter's media APIs. type OpenRouterMediaProvider struct { APIKey string BaseURL string Client *http.Client + + metaMu sync.Mutex + metaCache map[string]modelMeta } // NewOpenRouterMediaProvider creates a provider. If apiKey is empty, reads OPENROUTER_API_KEY. 
@@ -67,6 +81,114 @@ func stripPrefix(model string) string { return strings.TrimPrefix(model, "openrouter/") } +// SeedModelMeta lets callers (or tests) pre-populate the metadata cache for a +// model. Useful when running against test servers that don't expose +// `GET /models/{id}/endpoints`. Output modalities follow OpenRouter's +// convention — e.g. `[]string{"speech"}` for TTS-only or +// `[]string{"text","audio"}` for chat-audio models. +func (p *OpenRouterMediaProvider) SeedModelMeta(model string, outputModalities, inputModalities []string) { + stripped := stripPrefix(model) + p.metaMu.Lock() + defer p.metaMu.Unlock() + if p.metaCache == nil { + p.metaCache = make(map[string]modelMeta) + } + p.metaCache[stripped] = modelMeta{ + OutputModalities: append([]string(nil), outputModalities...), + InputModalities: append([]string(nil), inputModalities...), + } +} + +// fetchModelMeta retrieves and caches a model's output_modalities so we can +// route audio/image requests to the right OpenRouter endpoint. Returns a +// zero-value meta on any failure so callers can fall back to defaults. 
+func (p *OpenRouterMediaProvider) fetchModelMeta(ctx context.Context, model string) modelMeta { + stripped := stripPrefix(model) + p.metaMu.Lock() + if p.metaCache == nil { + p.metaCache = make(map[string]modelMeta) + } + if cached, ok := p.metaCache[stripped]; ok { + p.metaMu.Unlock() + return cached + } + p.metaMu.Unlock() + + reqURL := p.baseURL() + "/models/" + stripped + "/endpoints" + httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, reqURL, nil) + if err != nil { + return modelMeta{} + } + p.setHeaders(httpReq) + resp, err := p.Client.Do(httpReq) + if err != nil { + return modelMeta{} + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + return modelMeta{} + } + body, err := io.ReadAll(io.LimitReader(resp.Body, 1*1024*1024)) + if err != nil { + return modelMeta{} + } + var payload struct { + Data struct { + Architecture struct { + OutputModalities []string `json:"output_modalities"` + InputModalities []string `json:"input_modalities"` + } `json:"architecture"` + } `json:"data"` + } + if err := json.Unmarshal(body, &payload); err != nil { + return modelMeta{} + } + meta := modelMeta{ + OutputModalities: payload.Data.Architecture.OutputModalities, + InputModalities: payload.Data.Architecture.InputModalities, + } + p.metaMu.Lock() + p.metaCache[stripped] = meta + p.metaMu.Unlock() + return meta +} + +// wrapPCM16AsWAV wraps raw little-endian PCM16 mono bytes in a WAV (RIFF) container. 
//
// The output is the canonical 44-byte header (RIFF descriptor, 16-byte PCM
// "fmt " chunk, "data" chunk header) followed by the samples. sampleRate is
// written into the header as given and must be positive; the byte rate is
// derived from it for one 16-bit channel.
//
// A PCM16 sample is two bytes, so an odd-length input carries a dangling
// half-sample (e.g. a stream cut short). That byte is dropped so the data
// chunk stays block-aligned and the RIFF word-alignment rule holds. The input
// slice itself is never modified.
func wrapPCM16AsWAV(pcm []byte, sampleRate int) []byte {
	const (
		headerLen     = 44
		channels      = 1
		bitsPerSample = 16
		blockAlign    = channels * bitsPerSample / 8 // bytes per sample frame
	)

	// Keep only whole sample frames.
	pcm = pcm[:len(pcm)-len(pcm)%blockAlign]

	dataSize := uint32(len(pcm))
	rate := uint32(sampleRate)

	// Fill the fixed-size header in place; this avoids the reflection-based
	// binary.Write path and its ignored error returns.
	le := binary.LittleEndian
	out := make([]byte, headerLen, headerLen+len(pcm))
	copy(out[0:4], "RIFF")
	le.PutUint32(out[4:8], 36+dataSize) // size of everything after this field
	copy(out[8:12], "WAVE")
	copy(out[12:16], "fmt ")
	le.PutUint32(out[16:20], 16) // PCM fmt chunk size
	le.PutUint16(out[20:22], 1)  // PCM format
	le.PutUint16(out[22:24], channels)
	le.PutUint32(out[24:28], rate)
	le.PutUint32(out[28:32], rate*blockAlign) // byte rate
	le.PutUint16(out[32:34], blockAlign)
	le.PutUint16(out[34:36], bitsPerSample)
	copy(out[36:40], "data")
	le.PutUint32(out[40:44], dataSize)
	return append(out, pcm...)
}

// containsString reports whether haystack contains s (case-sensitive).
func containsString(haystack []string, s string) bool {
	for _, h := range haystack {
		if h == s {
			return true
		}
	}
	return false
}

// GenerateVideo submits a video job, polls until complete, downloads result.
func (p *OpenRouterMediaProvider) GenerateVideo(ctx context.Context, req VideoRequest) (*MediaResponse, error) { if strings.TrimSpace(req.Prompt) == "" { @@ -102,6 +224,9 @@ func (p *OpenRouterMediaProvider) GenerateVideo(ctx context.Context, req VideoRe if req.Seed != nil { payload["seed"] = *req.Seed } + if req.ImageURL != "" { + payload["image_url"] = req.ImageURL + } if len(req.FrameImages) > 0 { payload["frame_images"] = req.FrameImages } @@ -195,12 +320,16 @@ func (p *OpenRouterMediaProvider) GenerateVideo(ctx context.Context, req VideoRe } type videoJobStatus struct { - ID string `json:"id"` - Status string `json:"status"` - Error string `json:"error,omitempty"` - UnsignedURL string `json:"unsigned_url,omitempty"` - Duration float64 `json:"duration,omitempty"` - CostUSD float64 `json:"cost_usd,omitempty"` + ID string `json:"id"` + Status string `json:"status"` + Error string `json:"error,omitempty"` + UnsignedURL string `json:"unsigned_url,omitempty"` // legacy single-URL form + UnsignedURLs []string `json:"unsigned_urls,omitempty"` // current API + Duration float64 `json:"duration,omitempty"` + CostUSD float64 `json:"cost_usd,omitempty"` + Usage struct { + Cost float64 `json:"cost,omitempty"` + } `json:"usage,omitempty"` } func (p *OpenRouterMediaProvider) pollVideoJob(ctx context.Context, url string) (*videoJobStatus, error) { @@ -231,13 +360,51 @@ func (p *OpenRouterMediaProvider) pollVideoJob(ctx context.Context, url string) return &status, nil } -func (p *OpenRouterMediaProvider) buildVideoResponse(_ context.Context, status *videoJobStatus) (*MediaResponse, error) { +func (p *OpenRouterMediaProvider) buildVideoResponse(ctx context.Context, status *videoJobStatus) (*MediaResponse, error) { + videoURL := "" + if len(status.UnsignedURLs) > 0 { + videoURL = status.UnsignedURLs[0] + } else if status.UnsignedURL != "" { + videoURL = status.UnsignedURL + } + + cost := status.CostUSD + if cost == 0 { + cost = status.Usage.Cost + } + video := VideoData{ - URL: 
status.UnsignedURL, + URL: videoURL, MimeType: "video/mp4", Filename: "generated_video.mp4", Duration: status.Duration, - CostUSD: status.CostUSD, + CostUSD: cost, + } + + // Download bytes when we have a URL. OpenRouter's "unsigned" URLs are + // actually served from openrouter.ai itself and require the same Bearer + // auth as the API; other hosts (CDNs) take the URL bare. + if videoURL != "" { + dlReq, err := http.NewRequestWithContext(ctx, http.MethodGet, videoURL, nil) + if err == nil { + if u, perr := url.Parse(videoURL); perr == nil { + host := strings.ToLower(u.Hostname()) + if host == "openrouter.ai" || strings.HasSuffix(host, ".openrouter.ai") { + dlReq.Header.Set("Authorization", "Bearer "+p.APIKey) + } + } + dlResp, derr := p.Client.Do(dlReq) + if derr == nil { + defer dlResp.Body.Close() + if dlResp.StatusCode == http.StatusOK { + const maxVideoBytes = 500 * 1024 * 1024 // 500 MB + raw, rerr := io.ReadAll(io.LimitReader(dlResp.Body, maxVideoBytes)) + if rerr == nil { + video.Data = base64.StdEncoding.EncodeToString(raw) + } + } + } + } } return &MediaResponse{ @@ -254,12 +421,26 @@ func (p *OpenRouterMediaProvider) GenerateImage(ctx context.Context, req ImageRe } model = stripPrefix(model) + // Request only image output — works for both image-only models (e.g. + // x-ai/grok-imagine-image-quality) and dual-output models. Image-only + // models return 404 when "text" is also requested. 
+ var userContent any = req.Prompt + if len(req.ImageURLs) > 0 { + parts := []map[string]any{{"type": "text", "text": req.Prompt}} + for _, u := range req.ImageURLs { + parts = append(parts, map[string]any{ + "type": "image_url", + "image_url": map[string]string{"url": u}, + }) + } + userContent = parts + } payload := map[string]any{ "model": model, "messages": []map[string]any{ - {"role": "user", "content": req.Prompt}, + {"role": "user", "content": userContent}, }, - "modalities": []string{"image", "text"}, + "modalities": []string{"image"}, } if req.Size != "" { payload["size"] = req.Size @@ -270,6 +451,9 @@ func (p *OpenRouterMediaProvider) GenerateImage(ctx context.Context, req ImageRe if req.ImageConfig != nil { payload["image_config"] = req.ImageConfig } + for k, v := range req.Extra { + payload[k] = v + } body, err := json.Marshal(payload) if err != nil { @@ -379,7 +563,11 @@ func (p *OpenRouterMediaProvider) GenerateImage(ctx context.Context, req ImageRe return result, nil } -// GenerateAudio uses streaming chat completions with audio modality. +// GenerateAudio auto-routes to the right OpenRouter endpoint based on the +// model's output_modalities: +// - ["speech"] (e.g. hexgrad/kokoro-82m) → POST /audio/speech +// - contains "audio" (e.g. 
openai/gpt-audio*) → chat-completions SSE +// - unknown → POST /audio/speech (broader compat) func (p *OpenRouterMediaProvider) GenerateAudio(ctx context.Context, req AudioRequest) (*MediaResponse, error) { if strings.TrimSpace(req.Text) == "" { return nil, fmt.Errorf("audio text input must not be empty") @@ -387,31 +575,45 @@ func (p *OpenRouterMediaProvider) GenerateAudio(ctx context.Context, req AudioRe model := req.Model if model == "" { - model = "openai/gpt-4o-audio-preview" + model = "openai/gpt-4o-mini-tts" } model = stripPrefix(model) - payload := map[string]any{ - "model": model, - "messages": []map[string]any{ - {"role": "user", "content": req.Text}, - }, - "modalities": []string{"text", "audio"}, - "stream": true, + requestedFormat := req.Format + if requestedFormat == "" { + requestedFormat = "wav" } - // When streaming, OpenAI only supports pcm16 format; use pcm16 as default. - audioFormat := "pcm16" - if req.Format != "" { - audioFormat = req.Format + meta := p.fetchModelMeta(ctx, model) + useSpeech := len(meta.OutputModalities) == 0 || + containsString(meta.OutputModalities, "speech") || + !containsString(meta.OutputModalities, "audio") + + if useSpeech { + return p.generateAudioViaSpeechEndpoint(ctx, model, req.Text, req.Voice, requestedFormat, &req) } - audioConfig := map[string]string{"format": audioFormat} + + // Chat-completions audio modality (gpt-audio family). Streaming on OpenAI + // is locked to pcm16 — wire that, then re-wrap to caller's format below. 
+ wireFormat := requestedFormat + if requestedFormat == "wav" { + wireFormat = "pcm16" + } + audioConfig := map[string]string{"format": wireFormat} if req.Voice != "" { audioConfig["voice"] = req.Voice } else { audioConfig["voice"] = "alloy" } - payload["audio"] = audioConfig + payload := map[string]any{ + "model": model, + "messages": []map[string]any{ + {"role": "user", "content": req.Text}, + }, + "modalities": []string{"text", "audio"}, + "stream": true, + "audio": audioConfig, + } body, err := json.Marshal(payload) if err != nil { @@ -482,20 +684,16 @@ func (p *OpenRouterMediaProvider) GenerateAudio(ctx context.Context, req AudioRe return nil, fmt.Errorf("read audio stream: %w", err) } - // Concatenate base64 audio chunks - outputFormat := "pcm16" - if req.Format != "" { - outputFormat = req.Format - } + outputFormat := requestedFormat var audioData string if len(audioChunks) > 0 { - // Decode all chunks, concatenate raw bytes, re-encode + // Decode all chunks, concatenate raw bytes, re-encode (with WAV + // header when caller asked for wav). var raw []byte for _, chunk := range audioChunks { decoded, err := base64.StdEncoding.DecodeString(chunk) if err != nil { - // Try without padding decoded, err = base64.RawStdEncoding.DecodeString(chunk) if err != nil { return nil, fmt.Errorf("decode audio chunk: %w (chunk length: %d)", err, len(chunk)) @@ -503,6 +701,9 @@ func (p *OpenRouterMediaProvider) GenerateAudio(ctx context.Context, req AudioRe } raw = append(raw, decoded...) } + if outputFormat == "wav" { + raw = wrapPCM16AsWAV(raw, defaultTTSSampleRate) + } audioData = base64.StdEncoding.EncodeToString(raw) } @@ -519,3 +720,80 @@ func (p *OpenRouterMediaProvider) setHeaders(req *http.Request) { req.Header.Set("Content-Type", "application/json") req.Header.Set("Authorization", "Bearer "+p.APIKey) } + +// generateAudioViaSpeechEndpoint calls POST /api/v1/audio/speech (OpenAI-compat +// TTS). 
Returns raw bytes for the caller's requested format; wraps PCM → WAV +// client-side when requestedFormat == "wav". +func (p *OpenRouterMediaProvider) generateAudioViaSpeechEndpoint( + ctx context.Context, model, text, voice, requestedFormat string, req *AudioRequest, +) (*MediaResponse, error) { + wireFormat := requestedFormat + switch requestedFormat { + case "wav", "pcm", "pcm16": + wireFormat = "pcm" + } + + if voice == "" { + voice = "alloy" + } + + payload := map[string]any{ + "model": model, + "input": text, + "voice": voice, + "response_format": wireFormat, + } + if req != nil && req.Speed != nil { + payload["speed"] = *req.Speed + } + if req != nil { + for k, v := range req.Extra { + payload[k] = v + } + } + body, err := json.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("marshal speech request: %w", err) + } + + endpoint := p.baseURL() + "/audio/speech" + httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewReader(body)) + if err != nil { + return nil, fmt.Errorf("create speech request: %w", err) + } + p.setHeaders(httpReq) + + resp, err := p.Client.Do(httpReq) + if err != nil { + return nil, fmt.Errorf("execute speech request: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode >= 400 { + errBody, _ := io.ReadAll(io.LimitReader(resp.Body, 10*1024*1024)) + return nil, fmt.Errorf("audio/speech error (%d): %s", resp.StatusCode, string(errBody)) + } + + const maxAudioBytes = 100 * 1024 * 1024 + audioBytes, err := io.ReadAll(io.LimitReader(resp.Body, maxAudioBytes)) + if err != nil { + return nil, fmt.Errorf("read speech body: %w", err) + } + + if requestedFormat == "wav" { + audioBytes = wrapPCM16AsWAV(audioBytes, defaultTTSSampleRate) + } + + return &MediaResponse{ + Text: text, + Audio: &AudioData{ + Data: base64.StdEncoding.EncodeToString(audioBytes), + Format: requestedFormat, + }, + RawResponse: map[string]string{ + "endpoint": "audio/speech", + "model": model, + "mime_type": 
resp.Header.Get("Content-Type"), + }, + }, nil +} diff --git a/sdk/go/ai/openrouter_media_coverage_test.go b/sdk/go/ai/openrouter_media_coverage_test.go index 2a4698bab..b1c249319 100644 --- a/sdk/go/ai/openrouter_media_coverage_test.go +++ b/sdk/go/ai/openrouter_media_coverage_test.go @@ -236,7 +236,10 @@ func TestGenerateAudioDefaultVoice(t *testing.T) { defer srv.Close() p := &OpenRouterMediaProvider{APIKey: "k", BaseURL: srv.URL, Client: srv.Client()} - _, err := p.GenerateAudio(context.Background(), AudioRequest{Text: "hi"}) + p.SeedModelMeta("openai/gpt-audio-mini", []string{"text", "audio"}, []string{"text"}) + _, err := p.GenerateAudio(context.Background(), AudioRequest{ + Text: "hi", Model: "openai/gpt-audio-mini", Format: "mp3", + }) require.NoError(t, err) audioConf := got["audio"].(map[string]any) assert.Equal(t, "alloy", audioConf["voice"]) @@ -251,7 +254,10 @@ func TestGenerateAudioHTTPError(t *testing.T) { defer srv.Close() p := &OpenRouterMediaProvider{APIKey: "k", BaseURL: srv.URL, Client: srv.Client()} - _, err := p.GenerateAudio(context.Background(), AudioRequest{Text: "hi"}) + p.SeedModelMeta("openai/gpt-audio-mini", []string{"text", "audio"}, []string{"text"}) + _, err := p.GenerateAudio(context.Background(), AudioRequest{ + Text: "hi", Model: "openai/gpt-audio-mini", Format: "mp3", + }) assert.Error(t, err) assert.Contains(t, err.Error(), "401") } @@ -273,7 +279,10 @@ func TestGenerateAudioSkipsInvalidSSELines(t *testing.T) { defer srv.Close() p := &OpenRouterMediaProvider{APIKey: "k", BaseURL: srv.URL, Client: srv.Client()} - resp, err := p.GenerateAudio(context.Background(), AudioRequest{Text: "hi"}) + p.SeedModelMeta("openai/gpt-audio-mini", []string{"text", "audio"}, []string{"text"}) + resp, err := p.GenerateAudio(context.Background(), AudioRequest{ + Text: "hi", Model: "openai/gpt-audio-mini", Format: "mp3", + }) require.NoError(t, err) assert.Equal(t, "ok", resp.Text) } @@ -291,7 +300,10 @@ func TestGenerateAudioInvalidBase64Chunk(t 
*testing.T) { defer srv.Close() p := &OpenRouterMediaProvider{APIKey: "k", BaseURL: srv.URL, Client: srv.Client()} - _, err := p.GenerateAudio(context.Background(), AudioRequest{Text: "hi"}) + p.SeedModelMeta("openai/gpt-audio-mini", []string{"text", "audio"}, []string{"text"}) + _, err := p.GenerateAudio(context.Background(), AudioRequest{ + Text: "hi", Model: "openai/gpt-audio-mini", Format: "mp3", + }) assert.Error(t, err) assert.Contains(t, err.Error(), "decode audio chunk") } @@ -312,7 +324,10 @@ func TestGenerateAudioRawStdBase64Fallback(t *testing.T) { defer srv.Close() p := &OpenRouterMediaProvider{APIKey: "k", BaseURL: srv.URL, Client: srv.Client()} - resp, err := p.GenerateAudio(context.Background(), AudioRequest{Text: "hi"}) + p.SeedModelMeta("openai/gpt-audio-mini", []string{"text", "audio"}, []string{"text"}) + resp, err := p.GenerateAudio(context.Background(), AudioRequest{ + Text: "hi", Model: "openai/gpt-audio-mini", Format: "mp3", + }) require.NoError(t, err) decoded, err := base64.StdEncoding.DecodeString(resp.Audio.Data) require.NoError(t, err) diff --git a/sdk/go/ai/openrouter_media_routing_test.go b/sdk/go/ai/openrouter_media_routing_test.go new file mode 100644 index 000000000..ef9e70278 --- /dev/null +++ b/sdk/go/ai/openrouter_media_routing_test.go @@ -0,0 +1,347 @@ +package ai + +import ( + "context" + "encoding/base64" + "encoding/json" + "fmt" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// ============================================================================= +// fetchModelMeta — happy path + cache reuse + network failures +// ============================================================================= + +func TestFetchModelMetaCachesResult(t *testing.T) { + calls := 0 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + calls++ + assert.True(t, strings.HasSuffix(r.URL.Path, 
"/models/openai/gpt-audio-mini/endpoints")) + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"data":{"id":"openai/gpt-audio-mini","architecture":{"output_modalities":["text","audio"],"input_modalities":["text"]}}}`)) + })) + defer srv.Close() + + p := &OpenRouterMediaProvider{APIKey: "k", BaseURL: srv.URL, Client: srv.Client()} + + meta1 := p.fetchModelMeta(context.Background(), "openrouter/openai/gpt-audio-mini") + meta2 := p.fetchModelMeta(context.Background(), "openai/gpt-audio-mini") // same model, no prefix + + assert.Equal(t, 1, calls, "cache should prevent second HTTP call") + assert.Equal(t, []string{"text", "audio"}, meta1.OutputModalities) + assert.Equal(t, meta1, meta2) +} + +func TestFetchModelMetaReturnsEmptyOnHTTPError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + defer srv.Close() + + p := &OpenRouterMediaProvider{APIKey: "k", BaseURL: srv.URL, Client: srv.Client()} + meta := p.fetchModelMeta(context.Background(), "unknown/model") + assert.Empty(t, meta.OutputModalities) + assert.Empty(t, meta.InputModalities) +} + +func TestFetchModelMetaReturnsEmptyOnMalformedJSON(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`not-json`)) + })) + defer srv.Close() + + p := &OpenRouterMediaProvider{APIKey: "k", BaseURL: srv.URL, Client: srv.Client()} + meta := p.fetchModelMeta(context.Background(), "x/y") + assert.Empty(t, meta.OutputModalities) +} + +func TestFetchModelMetaTriggersAutoRoutingViaSpeechEndpoint(t *testing.T) { + // Server handles BOTH the metadata GET and the /audio/speech POST so the + // provider can self-discover routing end-to-end. 
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch { + case strings.HasSuffix(r.URL.Path, "/models/hexgrad/kokoro-82m/endpoints"): + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"data":{"id":"hexgrad/kokoro-82m","architecture":{"output_modalities":["speech"],"input_modalities":["text"]}}}`)) + case r.URL.Path == "/audio/speech": + w.Header().Set("Content-Type", "audio/pcm") + _, _ = w.Write(make([]byte, 240)) // raw PCM + default: + t.Fatalf("unexpected path: %s", r.URL.Path) + } + })) + defer srv.Close() + + p := &OpenRouterMediaProvider{APIKey: "k", BaseURL: srv.URL, Client: srv.Client()} + resp, err := p.GenerateAudio(context.Background(), AudioRequest{ + Text: "hi", + Model: "hexgrad/kokoro-82m", + Voice: "af_bella", + Format: "wav", + }) + require.NoError(t, err) + require.NotNil(t, resp.Audio) + decoded, err := base64.StdEncoding.DecodeString(resp.Audio.Data) + require.NoError(t, err) + assert.Equal(t, []byte("RIFF"), decoded[:4]) +} + +// ============================================================================= +// generateAudioViaSpeechEndpoint — non-WAV format, error paths +// ============================================================================= + +func TestGenerateAudioSpeechMP3PassthroughNoWavWrap(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + assert.Equal(t, "/audio/speech", r.URL.Path) + var body map[string]any + require.NoError(t, json.NewDecoder(r.Body).Decode(&body)) + assert.Equal(t, "mp3", body["response_format"]) + assert.Equal(t, 1.5, body["speed"]) + assert.Equal(t, "fr-FR", body["language"]) // from Extra + w.Header().Set("Content-Type", "audio/mpeg") + _, _ = w.Write([]byte("FAKE_MP3_BYTES_ABCDEF")) + })) + defer srv.Close() + + p := &OpenRouterMediaProvider{APIKey: "k", BaseURL: srv.URL, Client: srv.Client()} + p.SeedModelMeta("hexgrad/kokoro-82m", []string{"speech"}, []string{"text"}) + 
+ speed := 1.5 + resp, err := p.GenerateAudio(context.Background(), AudioRequest{ + Text: "bonjour", + Model: "hexgrad/kokoro-82m", + Voice: "af_bella", + Format: "mp3", + Speed: &speed, + Extra: map[string]any{"language": "fr-FR"}, + }) + require.NoError(t, err) + require.NotNil(t, resp.Audio) + assert.Equal(t, "mp3", resp.Audio.Format) + // MP3 is returned as-is (no WAV wrap) + decoded, err := base64.StdEncoding.DecodeString(resp.Audio.Data) + require.NoError(t, err) + assert.Equal(t, "FAKE_MP3_BYTES_ABCDEF", string(decoded)) +} + +func TestGenerateAudioSpeechErrorBubblesUp(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusUnauthorized) + _, _ = w.Write([]byte(`{"error":"bad key"}`)) + })) + defer srv.Close() + + p := &OpenRouterMediaProvider{APIKey: "k", BaseURL: srv.URL, Client: srv.Client()} + p.SeedModelMeta("hexgrad/kokoro-82m", []string{"speech"}, []string{"text"}) + + _, err := p.GenerateAudio(context.Background(), AudioRequest{ + Text: "x", Model: "hexgrad/kokoro-82m", Voice: "af_bella", Format: "wav", + }) + require.Error(t, err) + assert.Contains(t, err.Error(), "audio/speech error") + assert.Contains(t, err.Error(), "401") +} + +func TestGenerateAudioSpeechDefaultsVoiceToAlloy(t *testing.T) { + var seen map[string]any + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _ = json.NewDecoder(r.Body).Decode(&seen) + w.Header().Set("Content-Type", "audio/mpeg") + _, _ = w.Write([]byte("x")) + })) + defer srv.Close() + + p := &OpenRouterMediaProvider{APIKey: "k", BaseURL: srv.URL, Client: srv.Client()} + p.SeedModelMeta("openai/gpt-4o-mini-tts", []string{"speech"}, []string{"text"}) + _, err := p.GenerateAudio(context.Background(), AudioRequest{ + Text: "hi", Model: "openai/gpt-4o-mini-tts", Format: "mp3", + }) + require.NoError(t, err) + assert.Equal(t, "alloy", seen["voice"]) +} + +// 
============================================================================= +// Video — first/last frame + auth-aware download +// ============================================================================= + +func TestGenerateVideoSendsFrameImagesAndImageURL(t *testing.T) { + var submit map[string]any + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/videos": + require.NoError(t, json.NewDecoder(r.Body).Decode(&submit)) + w.WriteHeader(http.StatusAccepted) + _, _ = w.Write([]byte(`{"id":"jobABC"}`)) + case "/videos/jobABC": + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{ + "id":"jobABC","status":"completed", + "unsigned_urls":["` + r.Host + `/blob/jobABC"], + "usage":{"cost":0.42} + }`)) + } + })) + defer srv.Close() + + p := &OpenRouterMediaProvider{APIKey: "k", BaseURL: srv.URL, Client: srv.Client()} + + resp, err := p.GenerateVideo(context.Background(), VideoRequest{ + Prompt: "test", + Model: "openrouter/google/veo-3.1-lite", + Duration: 4, + ImageURL: "https://example.com/seed.jpg", + FrameImages: []map[string]any{ + {"type": "image_url", "image_url": map[string]string{"url": "https://x/first.jpg"}, "frame_type": "first_frame"}, + {"type": "image_url", "image_url": map[string]string{"url": "https://x/last.jpg"}, "frame_type": "last_frame"}, + }, + InputReferences: []map[string]any{ + {"type": "image_url", "image_url": map[string]string{"url": "https://x/ref.jpg"}}, + }, + Extra: map[string]any{"personGeneration": "allow_all"}, + PollInterval: 10 * time.Millisecond, + Timeout: 30 * time.Second, + }) + require.NoError(t, err) + require.NotNil(t, resp) + require.Len(t, resp.Videos, 1) + assert.Equal(t, 0.42, resp.Videos[0].CostUSD) + + // Verify submit payload included our params. 
+ assert.Equal(t, "google/veo-3.1-lite", submit["model"]) + assert.Equal(t, "https://example.com/seed.jpg", submit["image_url"]) + frames := submit["frame_images"].([]any) + require.Len(t, frames, 2) + assert.Equal(t, "first_frame", frames[0].(map[string]any)["frame_type"]) + assert.Equal(t, "last_frame", frames[1].(map[string]any)["frame_type"]) + assert.Equal(t, "allow_all", submit["personGeneration"]) +} + +func TestGenerateVideoDownloadsWithoutAuthFromNonOpenRouterHost(t *testing.T) { + // Use a fixed URL string captured after server start so the poll response + // can embed an absolute http:// URL. + var apiSrv *httptest.Server + apiSrv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/videos": + w.WriteHeader(http.StatusAccepted) + _, _ = w.Write([]byte(`{"id":"job1"}`)) + case "/videos/job1": + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(fmt.Sprintf( + `{"id":"job1","status":"completed","unsigned_urls":["%s/blob/job1"],"usage":{"cost":0.1}}`, + apiSrv.URL))) + case "/blob/job1": + // Non-openrouter host → no Authorization header. 
+ assert.Empty(t, r.Header.Get("Authorization")) + w.Header().Set("Content-Type", "video/mp4") + _, _ = w.Write([]byte("FAKE_MP4")) + } + })) + defer apiSrv.Close() + + p := &OpenRouterMediaProvider{APIKey: "k", BaseURL: apiSrv.URL, Client: apiSrv.Client()} + resp, err := p.GenerateVideo(context.Background(), VideoRequest{ + Prompt: "t", Model: "x/y", + PollInterval: 10 * time.Millisecond, Timeout: 30 * time.Second, + }) + require.NoError(t, err) + require.Len(t, resp.Videos, 1) + decoded, err := base64.StdEncoding.DecodeString(resp.Videos[0].Data) + require.NoError(t, err) + assert.Equal(t, "FAKE_MP4", string(decoded)) +} + +// ============================================================================= +// Image — ImageURLs reference images + image_config conversion +// ============================================================================= + +func TestGenerateImageWithReferenceImagesAndConfig(t *testing.T) { + var captured map[string]any + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + require.NoError(t, json.NewDecoder(r.Body).Decode(&captured)) + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"choices":[{"message":{"content":null,"images":[{"type":"image_url","image_url":{"url":"data:image/png;base64,QUJD"}}]}}]}`)) + })) + defer srv.Close() + + strength := 0.7 + p := &OpenRouterMediaProvider{APIKey: "k", BaseURL: srv.URL, Client: srv.Client()} + resp, err := p.GenerateImage(context.Background(), ImageRequest{ + Prompt: "a fox in watercolor", + Model: "openrouter/x-ai/grok-imagine-image-quality", + ImageURLs: []string{"https://x/ref1.png", "https://x/ref2.png"}, + ImageConfig: &ImageConfig{ + AspectRatio: "16:9", + Strength: &strength, + RgbColors: [][3]int{{255, 100, 50}}, + }, + Extra: map[string]any{"high_quality": true}, + }) + require.NoError(t, err) + require.Len(t, resp.Images, 1) + assert.Equal(t, "QUJD", resp.Images[0].B64JSON) + + // Modalities sent as ["image"] only. 
+ mods := captured["modalities"].([]any) + require.Len(t, mods, 1) + assert.Equal(t, "image", mods[0]) + + // User content is a multi-part array (text + 2 image_url entries). + messages := captured["messages"].([]any) + userMsg := messages[0].(map[string]any) + content := userMsg["content"].([]any) + require.Len(t, content, 3) + assert.Equal(t, "text", content[0].(map[string]any)["type"]) + assert.Equal(t, "image_url", content[1].(map[string]any)["type"]) + assert.Equal(t, "image_url", content[2].(map[string]any)["type"]) + + // Extra passthrough. + assert.Equal(t, true, captured["high_quality"]) +} + +// ============================================================================= +// wrapPCM16AsWAV — header correctness +// ============================================================================= + +func TestWrapPCM16AsWAVHeader(t *testing.T) { + pcm := make([]byte, 240) // 240 bytes = 120 16-bit samples = 5ms @ 24kHz + wav := wrapPCM16AsWAV(pcm, 24000) + + assert.Equal(t, []byte("RIFF"), wav[:4]) + assert.Equal(t, []byte("WAVE"), wav[8:12]) + assert.Equal(t, []byte("fmt "), wav[12:16]) + assert.Equal(t, []byte("data"), wav[36:40]) + // total file size = 44 (header) + 240 (data) + assert.Len(t, wav, 44+240) +} + +// ============================================================================= +// SeedModelMeta — chat-audio path verification on a separate model +// ============================================================================= + +func TestSeedModelMetaRoutesToChatCompletions(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + assert.Equal(t, "/chat/completions", r.URL.Path) + w.Header().Set("Content-Type", "text/event-stream") + flusher, _ := w.(http.Flusher) + fmt.Fprint(w, "data: [DONE]\n\n") + flusher.Flush() + })) + defer srv.Close() + + p := &OpenRouterMediaProvider{APIKey: "k", BaseURL: srv.URL, Client: srv.Client()} + p.SeedModelMeta("openai/gpt-audio-mini", []string{"text", 
"audio"}, []string{"text"}) + + _, err := p.GenerateAudio(context.Background(), AudioRequest{ + Text: "hi", Model: "openai/gpt-audio-mini", Format: "mp3", + }) + require.NoError(t, err) +} diff --git a/sdk/python/agentfield/media_providers.py b/sdk/python/agentfield/media_providers.py index 874b60562..33ecea81a 100644 --- a/sdk/python/agentfield/media_providers.py +++ b/sdk/python/agentfield/media_providers.py @@ -782,6 +782,29 @@ def _assert_safe_download_url(url: str) -> None: raise RuntimeError(f"Refusing to download video from private IP: {url}") +def _wrap_pcm16_bytes_as_wav(pcm: bytes, *, sample_rate: int = 24000) -> bytes: + """Wrap raw little-endian PCM16 mono bytes in a WAV (RIFF) container.""" + import io + import wave + + buf = io.BytesIO() + with wave.open(buf, "wb") as w: + w.setnchannels(1) + w.setsampwidth(2) + w.setframerate(sample_rate) + w.writeframes(pcm) + return buf.getvalue() + + +def _wrap_pcm16_as_wav_b64(pcm_b64: str, *, sample_rate: int = 24000) -> str: + """Decode base64 PCM16 → wrap as WAV → re-encode base64.""" + import base64 + + pcm = base64.b64decode(pcm_b64) + wav = _wrap_pcm16_bytes_as_wav(pcm, sample_rate=sample_rate) + return base64.b64encode(wav).decode("ascii") + + class OpenRouterProvider(MediaProvider): """ OpenRouter provider for image generation via chat completions. @@ -803,6 +826,10 @@ class OpenRouterProvider(MediaProvider): def __init__(self, api_key: Optional[str] = None): self._api_key = api_key + # Per-instance cache of model metadata (output_modalities) so we can + # route requests to the right OpenRouter endpoint without re-fetching + # on every call. Keyed by the stripped model id ("hexgrad/kokoro-82m"). 
+ self._model_meta_cache: Dict[str, Dict[str, Any]] = {} @property def name(self) -> str: @@ -812,28 +839,92 @@ def name(self) -> str: def supported_modalities(self) -> List[str]: return ["image", "video", "audio", "music"] + @staticmethod + def _strip_or_prefix(model: str) -> str: + return model[len("openrouter/") :] if model.startswith("openrouter/") else model + + async def _fetch_model_meta(self, model: str) -> Dict[str, Any]: + """Fetch + cache OpenRouter model metadata (output_modalities etc.). + + On any error, returns an empty dict so callers can fall back to + defaults rather than fail the user's call. + """ + import os + + import aiohttp + + stripped = self._strip_or_prefix(model) + cached = self._model_meta_cache.get(stripped) + if cached is not None: + return cached + + api_key = self._api_key or os.environ.get("OPENROUTER_API_KEY", "") + if not api_key: + return {} + + url = f"https://openrouter.ai/api/v1/models/{stripped}/endpoints" + headers = {"Authorization": f"Bearer {api_key}"} + try: + timeout = aiohttp.ClientTimeout(total=10.0) + async with aiohttp.ClientSession(timeout=timeout) as session: + async with session.get(url, headers=headers) as resp: + if resp.status != 200: + return {} + payload = await resp.json() + except Exception: + return {} + + data = payload.get("data", {}) if isinstance(payload, dict) else {} + arch = data.get("architecture", {}) if isinstance(data, dict) else {} + meta = { + "id": data.get("id", stripped), + "output_modalities": list(arch.get("output_modalities", []) or []), + "input_modalities": list(arch.get("input_modalities", []) or []), + } + self._model_meta_cache[stripped] = meta + return meta + async def generate_image( self, prompt: str, model: Optional[str] = None, size: str = "1024x1024", quality: str = "standard", + image_urls: Optional[List[str]] = None, image_config: Optional[Dict[str, Any]] = None, + extra: Optional[Dict[str, Any]] = None, **kwargs, ) -> MultimodalResponse: """Generate image using 
OpenRouter's chat completions API. - Note: image_config is an OpenRouter-specific extension not present - in the base MediaProvider.generate_image() interface. + Args: + prompt: Text description for image generation. + model: OpenRouter model (defaults to ``google/gemini-2.5-flash-image``). + size: Image dimensions (model-specific). + quality: Quality hint (model-specific). + image_urls: Optional reference / source images for image+text→image + models (e.g. ``x-ai/grok-imagine-image-quality``). Each entry can + be an http(s) URL or a ``data:`` URL. + image_config: OpenRouter-specific extras — ``aspect_ratio``, + ``image_size``, ``strength``, ``style``, ``rgb_colors``, + ``background_rgb_color``, ``super_resolution_references``, + ``font_inputs``. + extra: Arbitrary passthrough fields merged into the completion + request (e.g. model-specific switches). """ from agentfield import vision - model = model or "openrouter/google/gemini-2.5-flash-image-preview" + model = model or "openrouter/google/gemini-2.5-flash-image" # Ensure model has openrouter prefix if not model.startswith("openrouter/"): model = f"openrouter/{model}" + if image_urls: + kwargs["image_urls"] = image_urls + if extra: + kwargs.update(extra) + return await vision.generate_image_openrouter( prompt=prompt, model=model, @@ -857,6 +948,7 @@ async def generate_video( seed: Optional[int] = None, frame_images: Optional[List[Dict]] = None, input_references: Optional[List[Dict]] = None, + extra: Optional[Dict[str, Any]] = None, poll_interval: float = 30.0, timeout: float = 600.0, **kwargs, @@ -930,6 +1022,8 @@ async def generate_video( body["input_references"] = input_references if image_url is not None: body["image_url"] = image_url + if extra: + body.update(extra) _error_messages = self._VIDEO_ERROR_MESSAGES @@ -1025,9 +1119,19 @@ async def generate_video( video_url = unsigned_urls[0] _assert_safe_download_url(video_url) + # OpenRouter's "unsigned_urls" are served from openrouter.ai itself + # and require 
the same Bearer auth as the API. CDN-hosted URLs + # (other hosts) don't need auth — strip in that case. + from urllib.parse import urlparse + + download_headers = ( + headers + if (urlparse(video_url).hostname or "").endswith("openrouter.ai") + else {} + ) + video_data_bytes: Optional[bytes] = None - # Download without auth headers — video_url is a public CDN URL - async with session.get(video_url) as resp: + async with session.get(video_url, headers=download_headers) as resp: if resp.status != 200: raise RuntimeError( f"Failed to download video from {video_url}: HTTP {resp.status}" @@ -1179,19 +1283,34 @@ async def generate_audio( model: Optional[str] = None, voice: str = "alloy", format: str = "wav", + speed: Optional[float] = None, + extra: Optional[Dict[str, Any]] = None, **kwargs, ) -> MultimodalResponse: """ - Generate audio via OpenRouter chat completions with SSE streaming. + Generate audio via OpenRouter, auto-routing to the right endpoint. - Uses the modalities parameter to request audio output from audio-capable - models on OpenRouter. + OpenRouter exposes two API surfaces for audio output: + - ``POST /audio/speech`` (OpenAI-compatible TTS) — used by dedicated + TTS models like ``hexgrad/kokoro-82m`` whose ``output_modalities`` + is ``["speech"]``. + - ``POST /chat/completions`` with ``modalities=["text","audio"]`` + SSE streaming — used by chat-audio models like the ``openai/gpt-audio`` + family whose ``output_modalities`` contains ``"audio"``. + + We fetch the model's metadata once (cached per provider instance) and + pick the right path. On metadata failure we default to ``/audio/speech`` + because it covers the broader population of TTS models. Args: text: Text to convert to speech - model: OpenRouter model ID (e.g., "openai/gpt-4o-mini-tts") - voice: Voice identifier (alloy, echo, fable, onyx, nova, shimmer) - format: Audio format (wav, mp3, flac, opus, pcm16) + model: OpenRouter model ID (e.g., "openai/gpt-audio-mini", + "hexgrad/kokoro-82m"). 
Default: ``openai/gpt-4o-mini-tts``. + voice: Voice identifier (model-specific — e.g. ``alloy`` for + OpenAI, ``af_bella`` for Kokoro) + format: Audio format (wav, mp3, flac, opus, pcm16). ``wav`` is + synthesized client-side when the upstream endpoint only emits + pcm. **kwargs: Additional parameters (timeout overrides default 300s) Returns: @@ -1205,45 +1324,80 @@ async def generate_audio( "OpenRouter API key required. Set OPENROUTER_API_KEY env var or pass api_key." ) - # Strip openrouter/ prefix if present - send_model = model or "openai/gpt-4o-mini-tts" - if send_model.startswith("openrouter/"): - send_model = send_model[len("openrouter/") :] - - supported_voices = {"alloy", "echo", "fable", "onyx", "nova", "shimmer"} - if voice not in supported_voices: - voice = "alloy" + send_model = self._strip_or_prefix(model or "openai/gpt-4o-mini-tts") audio_format = format - supported_formats = {"wav", "mp3", "flac", "opus", "pcm16"} + supported_formats = {"wav", "mp3", "flac", "opus", "pcm16", "pcm"} if audio_format not in supported_formats: audio_format = "wav" timeout = kwargs.pop("timeout", 300.0) + headers = { + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + } + + meta = await self._fetch_model_meta(send_model) + output_mods = meta.get("output_modalities") or [] + # Choose path: TTS-only models advertise "speech"; chat-audio models + # advertise "audio". If metadata is missing, prefer /audio/speech as + # the broader-compat default. 
+ use_speech_endpoint = ("speech" in output_mods) or (not output_mods) + if "audio" in output_mods and "speech" not in output_mods: + use_speech_endpoint = False + + if use_speech_endpoint: + audio_b64, mime = await self._openrouter_audio_speech( + text=text, + model=send_model, + voice=voice, + requested_format=audio_format, + headers=headers, + timeout=timeout, + speed=speed, + extra=extra, + ) + audio_output = AudioOutput( + data=audio_b64 if audio_b64 else None, + format=audio_format, + url=None, + ) + return MultimodalResponse( + text=text, + audio=audio_output if audio_b64 else None, + images=[], + files=[], + raw_response={ + "endpoint": "audio/speech", + "model": send_model, + "mime_type": mime, + }, + ) + # Chat-completions audio modality path (gpt-audio family). + # Streaming on the OpenAI provider only emits pcm16 — fall back to + # pcm16 over the wire and re-wrap to user's requested format below. + wire_format = "pcm16" if audio_format == "wav" else audio_format payload = { "model": send_model, "messages": [{"role": "user", "content": text}], "modalities": ["text", "audio"], - "audio": {"voice": voice, "format": audio_format}, + "audio": {"voice": voice, "format": wire_format}, "stream": True, } - - headers = { - "Authorization": f"Bearer {api_key}", - "Content-Type": "application/json", - } - b64_full, transcript = await self._stream_openrouter_audio( payload, headers, timeout=timeout, label="audio" ) + # Re-wrap pcm16 -> wav if user asked for wav. 
+ if audio_format == "wav" and b64_full: + b64_full = _wrap_pcm16_as_wav_b64(b64_full, sample_rate=24000) + audio_output = AudioOutput( data=b64_full if b64_full else None, format=audio_format, url=None, ) - return MultimodalResponse( text=transcript or text, audio=audio_output if b64_full else None, @@ -1252,6 +1406,67 @@ async def generate_audio( raw_response={"transcript": transcript, "model": send_model}, ) + async def _openrouter_audio_speech( + self, + *, + text: str, + model: str, + voice: str, + requested_format: str, + headers: Dict[str, str], + timeout: float, + speed: Optional[float] = None, + extra: Optional[Dict[str, Any]] = None, + ) -> tuple: + """Call ``POST /api/v1/audio/speech`` and return ``(b64_data, mime)``. + + Handles format translation: when the caller wants ``wav`` we ask the + upstream for ``pcm`` and wrap it in a WAV header ourselves (24 kHz + mono int16 — the rate that current OpenRouter TTS endpoints emit). + """ + import base64 + + import aiohttp + + # Map caller's format → upstream response_format + if requested_format in ("wav", "pcm", "pcm16"): + wire_format = "pcm" + else: + wire_format = requested_format # mp3 / flac / opus / aac + + body: Dict[str, Any] = { + "model": model, + "input": text, + "voice": voice, + "response_format": wire_format, + } + if speed is not None: + body["speed"] = speed + if extra: + body.update(extra) + + client_timeout = aiohttp.ClientTimeout(total=timeout) + async with aiohttp.ClientSession(timeout=client_timeout) as session: + async with session.post( + "https://openrouter.ai/api/v1/audio/speech", + json=body, + headers=headers, + ) as resp: + content_type = resp.headers.get("Content-Type", "") + if resp.status >= 400: + detail = await resp.text() + raise RuntimeError( + f"OpenRouter audio/speech request failed " + f"({resp.status}): {detail[:500]}" + ) + audio_bytes = await resp.read() + + if requested_format == "wav": + wav_bytes = _wrap_pcm16_bytes_as_wav(audio_bytes, sample_rate=24000) + return 
base64.b64encode(wav_bytes).decode("ascii"), "audio/wav" + + return base64.b64encode(audio_bytes).decode("ascii"), content_type + async def generate_music( self, prompt: str, diff --git a/sdk/python/agentfield/multimodal_response.py b/sdk/python/agentfield/multimodal_response.py index cff9d6403..5c91d39ad 100644 --- a/sdk/python/agentfield/multimodal_response.py +++ b/sdk/python/agentfield/multimodal_response.py @@ -88,47 +88,32 @@ class ImageOutput(BaseModel): def save(self, path: Union[str, Path]) -> None: """Save image to file.""" + if not self.b64_json and not self.url: + raise ValueError("No image data or URL available to save") path = Path(path) path.parent.mkdir(parents=True, exist_ok=True) - - if self.b64_json: - # Save from base64 data - image_bytes = base64.b64decode(self.b64_json) - with open(path, "wb") as f: - f.write(image_bytes) - elif self.url: - # Download from URL - try: - import requests - - response = requests.get(self.url) - response.raise_for_status() - with open(path, "wb") as f: - f.write(response.content) - except ImportError: - raise ImportError( - "URL download requires requests: pip install requests" - ) - else: - raise ValueError("No image data or URL available to save") + with open(path, "wb") as f: + f.write(self.get_bytes()) def get_bytes(self) -> bytes: - """Get raw image bytes.""" + """Get raw image bytes from b64_json, a data: URL, or an http(s) URL.""" if self.b64_json: return base64.b64decode(self.b64_json) - elif self.url: + if self.url: + if self.url.startswith("data:"): + # data:image/jpeg;base64, + _, _, payload = self.url.partition(",") + return base64.b64decode(payload) try: import requests - - response = requests.get(self.url) - response.raise_for_status() - return response.content except ImportError: raise ImportError( "URL download requires requests: pip install requests" ) - else: - raise ValueError("No image data or URL available") + response = requests.get(self.url) + response.raise_for_status() + return 
response.content + raise ValueError("No image data or URL available") def show(self) -> None: """Display image if possible (requires PIL/Pillow).""" diff --git a/sdk/python/agentfield/vision.py b/sdk/python/agentfield/vision.py index 571c9bacd..b333d81ed 100644 --- a/sdk/python/agentfield/vision.py +++ b/sdk/python/agentfield/vision.py @@ -134,15 +134,29 @@ async def generate_image_openrouter( from agentfield.multimodal_response import ImageOutput, MultimodalResponse + # Pull image_urls out of kwargs so we can build a multi-part user message + # for image+text→image models (e.g. x-ai/grok-imagine-image-quality). + image_urls = kwargs.pop("image_urls", None) or [] + if image_urls: + user_content: Any = [{"type": "text", "text": prompt}] + [ + {"type": "image_url", "image_url": {"url": u}} for u in image_urls + ] + else: + user_content = prompt + # Build messages for OpenRouter chat completions - messages = [{"role": "user", "content": prompt}] + messages = [{"role": "user", "content": user_content}] # Prepare parameters for OpenRouter # OpenRouter uses chat completions with modalities parameter + # Request only image output — works for both image-only models (e.g. + # x-ai/grok-imagine-image-quality) and dual-output models (e.g. + # google/gemini-2.5-flash-image). Image-only models 404 when "text" is + # also requested. 
completion_params = { "model": model, "messages": messages, - "modalities": ["image", "text"], + "modalities": ["image"], **kwargs, } diff --git a/sdk/python/tests/test_media_integration.py b/sdk/python/tests/test_media_integration.py index 98b02925b..22391fdb7 100644 --- a/sdk/python/tests/test_media_integration.py +++ b/sdk/python/tests/test_media_integration.py @@ -426,16 +426,25 @@ async def fake_iter_any(): session_cm.__aenter__ = AsyncMock(return_value=mock_session) session_cm.__aexit__ = AsyncMock(return_value=False) + # Pre-populate metadata cache so routing picks chat-completions instead + # of /audio/speech (the gpt-4o-mini-tts default is actually TTS-only). + provider._model_meta_cache["openai/gpt-audio-mini"] = { + "id": "openai/gpt-audio-mini", + "output_modalities": ["text", "audio"], + "input_modalities": ["text"], + } + with patch("aiohttp.ClientSession", return_value=session_cm): result = await provider.generate_audio( text="Say hello", - model="openai/gpt-4o-mini-tts", + model="openai/gpt-audio-mini", voice="nova", + format="mp3", # avoid pcm→wav re-wrap so we can compare base64 ) assert result.audio is not None assert result.audio.data == "AAAABBBB" - assert result.audio.format == "wav" + assert result.audio.format == "mp3" @pytest.mark.asyncio async def test_audio_api_key_required(self): diff --git a/sdk/python/tests/test_openrouter_audio.py b/sdk/python/tests/test_openrouter_audio.py index fb12a09c0..ab1d82288 100644 --- a/sdk/python/tests/test_openrouter_audio.py +++ b/sdk/python/tests/test_openrouter_audio.py @@ -154,14 +154,51 @@ def __init__(self): # ============================================================================= +def _prime_chat_audio_cache(provider: OpenRouterProvider, model: str) -> None: + """Pre-populate model metadata so audio requests route via chat-completions. + + Without this, the provider tries to GET /api/v1/models/{id}/endpoints to + discover routing — that's already mocked away in these tests. 
+ """ + stripped = model.removeprefix("openrouter/") + provider._model_meta_cache[stripped] = { + "id": stripped, + "output_modalities": ["text", "audio"], + "input_modalities": ["text"], + } + + +class _BytesResponse: + """Fake aiohttp response for /audio/speech (returns raw bytes).""" + + def __init__(self, body: bytes, status: int = 200, content_type: str = "audio/pcm"): + self.status = status + self._body = body + self.headers = {"Content-Type": content_type} + + async def read(self) -> bytes: + return self._body + + async def text(self) -> str: + return self._body.decode("utf-8", errors="replace") + + async def __aenter__(self): + return self + + async def __aexit__(self, *args): + pass + + class TestOpenRouterGenerateAudio: - """Tests for OpenRouterProvider.generate_audio SSE streaming.""" + """Tests for OpenRouterProvider.generate_audio (chat-completions SSE path).""" @pytest.mark.asyncio async def test_sse_stream_parsing_and_concatenation(self, monkeypatch): - """Audio base64 chunks from SSE should be concatenated correctly.""" + """Audio chunks from SSE should be concatenated correctly for chat-audio models.""" + # Two valid base64 chunks decode to "audio_part_1" + "audio_part_2". 
chunk1 = base64.b64encode(b"audio_part_1").decode() chunk2 = base64.b64encode(b"audio_part_2").decode() + merged_b64 = base64.b64encode(b"audio_part_1" + b"audio_part_2").decode() events = [ _audio_event(b64_chunk=chunk1, transcript="Hello "), @@ -175,25 +212,32 @@ async def test_sse_stream_parsing_and_concatenation(self, monkeypatch): with patch("aiohttp.ClientSession", return_value=fake_session): provider = OpenRouterProvider() + _prime_chat_audio_cache(provider, "openai/gpt-audio-mini") result = await provider.generate_audio( text="Say hello", - model="openai/gpt-4o-mini-tts", + model="openai/gpt-audio-mini", voice="alloy", - format="wav", + format="mp3", # avoid pcm→wav rewrap so we can assert raw concat ) assert result.has_audio - assert result.audio.data == chunk1 + chunk2 - assert result.audio.format == "wav" + assert result.audio.data == merged_b64 + assert result.audio.format == "mp3" assert result.text == "Hello world" @pytest.mark.asyncio async def test_transcript_extraction(self, monkeypatch): """Transcript text should be accumulated from SSE events.""" + # Use valid base64 chunks so the new merged-bytes path doesn't error. 
+ chunks = [ + base64.b64encode(b"AAAA").decode(), + base64.b64encode(b"BBBB").decode(), + base64.b64encode(b"CCCC").decode(), + ] events = [ - _audio_event(b64_chunk="AAAA", transcript="First "), - _audio_event(b64_chunk="BBBB", transcript="second "), - _audio_event(b64_chunk="CCCC", transcript="third."), + _audio_event(b64_chunk=chunks[0], transcript="First "), + _audio_event(b64_chunk=chunks[1], transcript="second "), + _audio_event(b64_chunk=chunks[2], transcript="third."), ] lines = _make_sse_lines(events) fake_resp = _FakeStreamResponse(lines) @@ -203,14 +247,17 @@ async def test_transcript_extraction(self, monkeypatch): with patch("aiohttp.ClientSession", return_value=fake_session): provider = OpenRouterProvider() - result = await provider.generate_audio(text="test") + _prime_chat_audio_cache(provider, "openai/gpt-audio-mini") + result = await provider.generate_audio( + text="test", model="openai/gpt-audio-mini", format="mp3" + ) assert result.text == "First second third." @pytest.mark.asyncio async def test_model_prefix_stripping(self, monkeypatch): - """openrouter/ prefix should be stripped from model before sending.""" - events = [_audio_event(b64_chunk="AAAA")] + """openrouter/ prefix should be stripped from model before sending (chat path).""" + events = [_audio_event(b64_chunk=base64.b64encode(b"AAAA").decode())] lines = _make_sse_lines(events) fake_resp = _FakeStreamResponse(lines) fake_session = _FakeSession(fake_resp) @@ -219,20 +266,20 @@ async def test_model_prefix_stripping(self, monkeypatch): with patch("aiohttp.ClientSession", return_value=fake_session): provider = OpenRouterProvider() + _prime_chat_audio_cache(provider, "openai/gpt-audio-mini") await provider.generate_audio( text="test", - model="openrouter/openai/gpt-4o-mini-tts", + model="openrouter/openai/gpt-audio-mini", + format="mp3", ) - # Check the payload sent - post_kwargs = fake_session._last_post_kwargs - payload = post_kwargs["json"] - assert payload["model"] == 
"openai/gpt-4o-mini-tts" + payload = fake_session._last_post_kwargs["json"] + assert payload["model"] == "openai/gpt-audio-mini" assert not payload["model"].startswith("openrouter/") @pytest.mark.asyncio async def test_empty_stream_returns_no_audio(self, monkeypatch): - """Empty SSE stream should return response with no audio.""" + """Empty SSE stream should return response with no audio (chat-audio path).""" lines = _make_sse_lines([]) fake_resp = _FakeStreamResponse(lines) fake_session = _FakeSession(fake_resp) @@ -241,48 +288,17 @@ async def test_empty_stream_returns_no_audio(self, monkeypatch): with patch("aiohttp.ClientSession", return_value=fake_session): provider = OpenRouterProvider() - result = await provider.generate_audio(text="test") + _prime_chat_audio_cache(provider, "openai/gpt-audio-mini") + result = await provider.generate_audio( + text="test", model="openai/gpt-audio-mini", format="mp3" + ) assert not result.has_audio assert result.text == "test" - @pytest.mark.asyncio - async def test_invalid_voice_defaults_to_alloy(self, monkeypatch): - """Invalid voice should fall back to alloy.""" - events = [_audio_event(b64_chunk="AAAA")] - lines = _make_sse_lines(events) - fake_resp = _FakeStreamResponse(lines) - fake_session = _FakeSession(fake_resp) - - monkeypatch.setenv("OPENROUTER_API_KEY", "test-key") - - with patch("aiohttp.ClientSession", return_value=fake_session): - provider = OpenRouterProvider() - await provider.generate_audio(text="test", voice="invalid_voice") - - payload = fake_session._last_post_kwargs["json"] - assert payload["audio"]["voice"] == "alloy" - - @pytest.mark.asyncio - async def test_invalid_format_defaults_to_wav(self, monkeypatch): - """Invalid format should fall back to wav.""" - events = [_audio_event(b64_chunk="AAAA")] - lines = _make_sse_lines(events) - fake_resp = _FakeStreamResponse(lines) - fake_session = _FakeSession(fake_resp) - - monkeypatch.setenv("OPENROUTER_API_KEY", "test-key") - - with 
patch("aiohttp.ClientSession", return_value=fake_session): - provider = OpenRouterProvider() - await provider.generate_audio(text="test", format="invalid_fmt") - - payload = fake_session._last_post_kwargs["json"] - assert payload["audio"]["format"] == "wav" - @pytest.mark.asyncio async def test_http_error_raises(self, monkeypatch): - """Non-200 response should raise RuntimeError.""" + """Non-200 response should raise RuntimeError (chat-audio path).""" fake_resp = _FakeStreamResponse([], status=400) fake_session = _FakeSession(fake_resp) @@ -290,8 +306,11 @@ async def test_http_error_raises(self, monkeypatch): with patch("aiohttp.ClientSession", return_value=fake_session): provider = OpenRouterProvider() + _prime_chat_audio_cache(provider, "openai/gpt-audio-mini") with pytest.raises(RuntimeError, match="failed.*400"): - await provider.generate_audio(text="test") + await provider.generate_audio( + text="test", model="openai/gpt-audio-mini", format="mp3" + ) @pytest.mark.asyncio async def test_missing_api_key_raises(self, monkeypatch): @@ -304,11 +323,12 @@ async def test_missing_api_key_raises(self, monkeypatch): @pytest.mark.asyncio async def test_malformed_sse_lines_skipped(self, monkeypatch): """Non-JSON SSE lines and non-data lines should be safely skipped.""" + valid_b64 = base64.b64encode(b"AAA").decode() lines = [ b"event: ping\n", b"data: not_json\n", b'data: {"choices": []}\n', - f"data: {json.dumps(_audio_event(b64_chunk='QUFB'))}\n".encode(), + f"data: {json.dumps(_audio_event(b64_chunk=valid_b64))}\n".encode(), b"data: [DONE]\n", ] fake_resp = _FakeStreamResponse(lines) @@ -318,10 +338,83 @@ async def test_malformed_sse_lines_skipped(self, monkeypatch): with patch("aiohttp.ClientSession", return_value=fake_session): provider = OpenRouterProvider() - result = await provider.generate_audio(text="test") + _prime_chat_audio_cache(provider, "openai/gpt-audio-mini") + result = await provider.generate_audio( + text="test", model="openai/gpt-audio-mini", 
format="mp3" + ) + + assert result.has_audio + assert result.audio.data == valid_b64 + + +class TestOpenRouterAudioSpeechEndpoint: + """Tests for OpenRouterProvider.generate_audio routing to /audio/speech.""" + + @pytest.mark.asyncio + async def test_tts_only_model_routes_to_audio_speech(self, monkeypatch): + """A model with output_modalities=['speech'] (e.g. Kokoro) hits /audio/speech.""" + pcm_body = b"\x00\x01" * 1000 # 2KB of fake PCM + speech_resp = _BytesResponse(pcm_body, status=200, content_type="audio/pcm") + fake_session = _FakeSession(speech_resp) + + monkeypatch.setenv("OPENROUTER_API_KEY", "test-key") + + with patch("aiohttp.ClientSession", return_value=fake_session): + provider = OpenRouterProvider() + # Pre-populate cache as a TTS-only model so routing is deterministic. + provider._model_meta_cache["hexgrad/kokoro-82m"] = { + "id": "hexgrad/kokoro-82m", + "output_modalities": ["speech"], + "input_modalities": ["text"], + } + result = await provider.generate_audio( + text="hello", + model="openrouter/hexgrad/kokoro-82m", + voice="af_bella", + format="wav", + ) + + # Verify endpoint and payload. + assert "audio/speech" in fake_session._last_post_url + payload = fake_session._last_post_kwargs["json"] + assert payload["model"] == "hexgrad/kokoro-82m" + assert payload["voice"] == "af_bella" + assert payload["response_format"] == "pcm" # wav→pcm wire, wrapped client-side + assert payload["input"] == "hello" + # WAV wrapping should produce a RIFF/WAVE container. 
assert result.has_audio - assert result.audio.data == "QUFB" + decoded = base64.b64decode(result.audio.data) + assert decoded[:4] == b"RIFF" + assert decoded[8:12] == b"WAVE" + + @pytest.mark.asyncio + async def test_audio_speech_passes_speed_and_extra(self, monkeypatch): + """speed and extra are forwarded to /audio/speech body.""" + speech_resp = _BytesResponse(b"\x00\x01" * 100) + fake_session = _FakeSession(speech_resp) + + monkeypatch.setenv("OPENROUTER_API_KEY", "test-key") + + with patch("aiohttp.ClientSession", return_value=fake_session): + provider = OpenRouterProvider() + provider._model_meta_cache["openai/gpt-4o-mini-tts"] = { + "id": "openai/gpt-4o-mini-tts", + "output_modalities": ["speech"], + "input_modalities": ["text"], + } + await provider.generate_audio( + text="hi", + model="openai/gpt-4o-mini-tts", + speed=1.25, + extra={"language": "en-US"}, + format="mp3", + ) + + payload = fake_session._last_post_kwargs["json"] + assert payload["speed"] == 1.25 + assert payload["language"] == "en-US" + assert payload["response_format"] == "mp3" # ============================================================================= @@ -358,6 +451,7 @@ async def test_openrouter_generate_music_streams(self, monkeypatch): with patch("aiohttp.ClientSession", return_value=fake_session): provider = OpenRouterProvider() + _prime_chat_audio_cache(provider, "google/lyria-3-pro") result = await provider.generate_music( prompt="jazz piano", model="google/lyria-3-pro", @@ -385,6 +479,7 @@ async def test_openrouter_generate_music_default_model(self, monkeypatch): with patch("aiohttp.ClientSession", return_value=fake_session): provider = OpenRouterProvider() + _prime_chat_audio_cache(provider, "google/lyria-3-pro") await provider.generate_music(prompt="test") payload = fake_session._last_post_kwargs["json"] @@ -402,6 +497,7 @@ async def test_openrouter_generate_music_strips_prefix(self, monkeypatch): with patch("aiohttp.ClientSession", return_value=fake_session): provider = 
OpenRouterProvider() + _prime_chat_audio_cache(provider, "google/lyria-3-pro") await provider.generate_music( prompt="test", model="openrouter/google/lyria-3-pro", diff --git a/sdk/typescript/src/ai/MediaProvider.ts b/sdk/typescript/src/ai/MediaProvider.ts index 6c08e07be..4b0477b47 100644 --- a/sdk/typescript/src/ai/MediaProvider.ts +++ b/sdk/typescript/src/ai/MediaProvider.ts @@ -21,16 +21,40 @@ export class MediaProviderError extends Error { } } +/** Frame guidance for image-to-video models (e.g. Veo). */ +export interface VideoFrameImage { + /** Image content type — usually "image_url". */ + type?: string; + /** Image URL or `data:` URL. */ + imageUrl: { url: string }; + /** Which frame this image controls. */ + frameType?: 'first_frame' | 'last_frame'; +} + +/** Reference image for style / subject guidance (Veo "reference-to-video"). */ +export interface VideoInputReference { + type?: string; + imageUrl: { url: string }; +} + export interface VideoRequest { prompt: string; model?: string; + /** Duration in seconds (model-dependent — typically 4, 6, or 8). */ duration?: number; resolution?: '480p' | '720p' | '1080p' | '1K' | '2K' | '4K'; aspectRatio?: '16:9' | '9:16' | '1:1' | '4:3' | '3:4' | '21:9' | '9:21'; + /** Toggle synchronized audio track (when model supports it). */ generateAudio?: boolean; seed?: number; - frameImages?: Array<{ type: string; imageUrl: { url: string }; frameType?: string }>; - inputReferences?: Array<{ type: string; imageUrl: { url: string } }>; + /** Single input image for image-to-video (legacy convenience field). */ + imageUrl?: string; + /** Per-frame guidance — first_frame / last_frame. Takes precedence over `imageUrl`. */ + frameImages?: VideoFrameImage[]; + /** Reference images for style/subject guidance. */ + inputReferences?: VideoInputReference[]; + /** Model-specific passthrough parameters (e.g. Veo's `personGeneration`). 
*/ + extra?: Record; pollInterval?: number; // ms, default 30000 timeout?: number; // ms, default 600000 } @@ -40,12 +64,24 @@ export interface ImageRequest { model?: string; size?: string; quality?: string; + /** Reference / source image(s) for image+text→image models (e.g. grok-imagine). */ + imageUrls?: string[]; imageConfig?: { aspectRatio?: string; imageSize?: string; + /** Image-to-image blend strength (model-dependent, 0–1). */ + strength?: number; + /** Style hint — Recraft V3 etc. */ + style?: string; + /** RGB color palette — array of [r,g,b]. */ + rgbColors?: number[][]; + /** Background color hint as [r,g,b]. */ + backgroundRgbColor?: number[]; superResolutionReferences?: string[]; fontInputs?: Array<{ fontUrl: string; text: string }>; }; + /** Model-specific passthrough parameters. */ + extra?: Record; } export interface AudioRequest { @@ -53,6 +89,10 @@ export interface AudioRequest { model?: string; voice?: string; format?: string; + /** Playback speed multiplier (OpenAI TTS only — other models ignore). */ + speed?: number; + /** Model-specific passthrough parameters. */ + extra?: Record; } export interface MediaResponse { diff --git a/sdk/typescript/src/ai/OpenRouterMediaProvider.ts b/sdk/typescript/src/ai/OpenRouterMediaProvider.ts index 732a62bce..7f645f3b6 100644 --- a/sdk/typescript/src/ai/OpenRouterMediaProvider.ts +++ b/sdk/typescript/src/ai/OpenRouterMediaProvider.ts @@ -25,6 +25,12 @@ const MAX_CONSECUTIVE_PARSE_ERRORS = 50; /** Module-level WeakMap to keep API key off the instance (CR-03). */ const apiKeyStore = new WeakMap(); +/** Per-instance cache of model metadata (output_modalities, input_modalities). */ +const modelMetaStore = new WeakMap< + OpenRouterMediaProvider, + Map +>(); + function emptyMediaResponse(raw: unknown): MediaResponse { return { text: '', images: [], audio: null, files: [], videos: [], rawResponse: raw }; } @@ -33,6 +39,46 @@ function stripPrefix(model: string): string { return model.startsWith('openrouter/') ? 
model.slice('openrouter/'.length) : model; } +/** + * Wrap raw little-endian PCM16 mono bytes in a WAV (RIFF) container. + * OpenRouter's TTS endpoints emit PCM at 24 kHz; default to that. + */ +function wrapPcm16AsWav(pcm: Uint8Array, sampleRate = 24000): Uint8Array { + const channels = 1; + const bitsPerSample = 16; + const byteRate = (sampleRate * channels * bitsPerSample) / 8; + const blockAlign = (channels * bitsPerSample) / 8; + const dataSize = pcm.byteLength; + const buffer = new ArrayBuffer(44 + dataSize); + const view = new DataView(buffer); + // RIFF header + view.setUint8(0, 0x52); view.setUint8(1, 0x49); view.setUint8(2, 0x46); view.setUint8(3, 0x46); // "RIFF" + view.setUint32(4, 36 + dataSize, true); + view.setUint8(8, 0x57); view.setUint8(9, 0x41); view.setUint8(10, 0x56); view.setUint8(11, 0x45); // "WAVE" + // fmt chunk + view.setUint8(12, 0x66); view.setUint8(13, 0x6d); view.setUint8(14, 0x74); view.setUint8(15, 0x20); // "fmt " + view.setUint32(16, 16, true); // PCM chunk size + view.setUint16(20, 1, true); // PCM format + view.setUint16(22, channels, true); + view.setUint32(24, sampleRate, true); + view.setUint32(28, byteRate, true); + view.setUint16(32, blockAlign, true); + view.setUint16(34, bitsPerSample, true); + // data chunk + view.setUint8(36, 0x64); view.setUint8(37, 0x61); view.setUint8(38, 0x74); view.setUint8(39, 0x61); // "data" + view.setUint32(40, dataSize, true); + new Uint8Array(buffer, 44).set(pcm); + return new Uint8Array(buffer); +} + +function bytesToBase64(bytes: Uint8Array): string { + return Buffer.from(bytes).toString('base64'); +} + +function base64ToBytes(b64: string): Uint8Array { + return new Uint8Array(Buffer.from(b64, 'base64')); +} + /** * Validate a URL is safe to download from (CR-02 — SSRF protection). * Rejects non-https, localhost, and private/reserved IP ranges. 
@@ -102,6 +148,62 @@ export class OpenRouterMediaProvider implements MediaProvider { }); } apiKeyStore.set(this, key); + modelMetaStore.set(this, new Map()); + } + + /** + * Seed the metadata cache for a model. Useful when running against test + * servers that don't expose `GET /models/{id}/endpoints`, or when callers + * already know the routing they want. + * + * Output modalities follow OpenRouter's convention — `["speech"]` for + * TTS-only (Kokoro etc.), `["text","audio"]` for chat-audio (gpt-audio + * family), `["video"]`, `["image"]`, etc. + */ + seedModelMeta(model: string, outputModalities: string[], inputModalities: string[] = []): void { + const stripped = stripPrefix(model); + const cache = modelMetaStore.get(this)!; + cache.set(stripped, { + outputModalities: [...outputModalities], + inputModalities: [...inputModalities], + }); + } + + /** + * Fetch + cache OpenRouter model metadata so we can route requests to the + * right endpoint. On any error returns an empty meta object so callers can + * fall back to defaults. + */ + private async fetchModelMeta(model: string): Promise<{ + outputModalities: string[]; + inputModalities: string[]; + }> { + const stripped = stripPrefix(model); + const cache = modelMetaStore.get(this)!; + const cached = cache.get(stripped); + if (cached) return cached; + + const url = `${this.baseUrl}/models/${stripped}/endpoints`; + try { + const res = await this.get(url); + if (!res.ok) { + const meta = { outputModalities: [], inputModalities: [] }; + cache.set(stripped, meta); + return meta; + } + const data = (await res.json()) as { data?: { architecture?: { output_modalities?: string[]; input_modalities?: string[] } } }; + const arch = data?.data?.architecture ?? {}; + const meta = { + outputModalities: arch.output_modalities ?? [], + inputModalities: arch.input_modalities ?? 
[], + }; + cache.set(stripped, meta); + return meta; + } catch { + const meta = { outputModalities: [], inputModalities: [] }; + cache.set(stripped, meta); + return meta; + } } /** Prevent API key from leaking via JSON.stringify (CR-03). */ @@ -130,8 +232,22 @@ export class OpenRouterMediaProvider implements MediaProvider { if (request.aspectRatio) body.aspect_ratio = request.aspectRatio; if (request.generateAudio != null) body.generate_audio = request.generateAudio; if (request.seed != null) body.seed = request.seed; - if (request.frameImages) body.frame_images = request.frameImages; - if (request.inputReferences) body.input_references = request.inputReferences; + if (request.imageUrl) body.image_url = request.imageUrl; + if (request.frameImages) { + // Convert TS camelCase to OpenRouter snake_case. + body.frame_images = request.frameImages.map((fi) => ({ + type: fi.type ?? 'image_url', + image_url: fi.imageUrl, + ...(fi.frameType ? { frame_type: fi.frameType } : {}), + })); + } + if (request.inputReferences) { + body.input_references = request.inputReferences.map((ref) => ({ + type: ref.type ?? 'image_url', + image_url: ref.imageUrl, + })); + } + if (request.extra) Object.assign(body, request.extra); const submitEndpoint = `${this.baseUrl}/videos`; @@ -189,16 +305,33 @@ export class OpenRouterMediaProvider implements MediaProvider { ); } - // Extract video URL + // Extract video URL. OpenRouter returns either an array `unsigned_urls` + // (current API) or a single `unsigned_url` / `url` for legacy responses. + const unsignedUrls = jobData.unsigned_urls as string[] | undefined; const unsignedUrl = jobData.unsigned_url as string | undefined; const signedUrl = jobData.url as string | undefined; - const videoUrl = unsignedUrl ?? signedUrl; + const videoUrl = unsignedUrls?.[0] ?? unsignedUrl ?? 
signedUrl; // Download video bytes if URL available (CR-02: validate URL, redirect: 'error') let videoData: string | undefined; if (videoUrl) { assertSafeUrl(videoUrl); + // OpenRouter's "unsigned" URLs are served from openrouter.ai itself and + // require the same Bearer auth as the API; non-openrouter hosts (CDN) + // accept the URL bare. + const downloadHeaders: Record = {}; + try { + const host = new URL(videoUrl).hostname.toLowerCase(); + if (host === 'openrouter.ai' || host.endsWith('.openrouter.ai')) { + const key = apiKeyStore.get(this); + if (key) downloadHeaders.Authorization = `Bearer ${key}`; + } + } catch { + /* non-URL — leave headers empty */ + } + const dlRes = await fetch(videoUrl, { + headers: downloadHeaders, signal: AbortSignal.timeout(DOWNLOAD_TIMEOUT), redirect: 'error', }); @@ -227,15 +360,48 @@ export class OpenRouterMediaProvider implements MediaProvider { async generateImage(request: ImageRequest): Promise { const model = stripPrefix(request.model ?? 'openai/gpt-image-1'); - const messages: unknown[] = [{ role: 'user', content: request.prompt }]; + // Request only image output — works for both image-only models (e.g. + // x-ai/grok-imagine-image-quality) and dual-output models. Image-only + // models return 404 when "text" is also requested. + let userContent: unknown = request.prompt; + if (request.imageUrls && request.imageUrls.length > 0) { + // Multi-modal content array — text + reference images. 
+ userContent = [ + { type: 'text', text: request.prompt }, + ...request.imageUrls.map((url) => ({ + type: 'image_url', + image_url: { url }, + })), + ]; + } + const messages: unknown[] = [{ role: 'user', content: userContent }]; const body: Record = { model, messages, - modalities: ['image', 'text'], + modalities: ['image'], }; if (request.size) body.size = request.size; if (request.quality) body.quality = request.quality; - if (request.imageConfig) body.image_config = request.imageConfig; + if (request.imageConfig) { + // Convert camelCase keys to OpenRouter snake_case. + const ic = request.imageConfig; + const out: Record = {}; + if (ic.aspectRatio) out.aspect_ratio = ic.aspectRatio; + if (ic.imageSize) out.image_size = ic.imageSize; + if (ic.strength != null) out.strength = ic.strength; + if (ic.style) out.style = ic.style; + if (ic.rgbColors) out.rgb_colors = ic.rgbColors; + if (ic.backgroundRgbColor) out.background_rgb_color = ic.backgroundRgbColor; + if (ic.superResolutionReferences) out.super_resolution_references = ic.superResolutionReferences; + if (ic.fontInputs) { + out.font_inputs = ic.fontInputs.map((fi) => ({ + font_url: fi.fontUrl, + text: fi.text, + })); + } + body.image_config = out; + } + if (request.extra) Object.assign(body, request.extra); const endpoint = `${this.baseUrl}/chat/completions`; const res = await this.post(endpoint, body); @@ -248,7 +414,19 @@ export class OpenRouterMediaProvider implements MediaProvider { const data = (await res.json()) as Record; const resp = emptyMediaResponse(data); - // Extract images from choices + // Extract images from choices. OpenRouter places images either inline in + // `message.content` as multimodal parts (gpt-image-1 style) or in a + // dedicated `message.images` array (gemini-*-image, grok-imagine style + // where `content` is null). 
+ const pushImageFromUrl = (url: string | undefined) => { + if (!url) return; + if (url.startsWith('data:')) { + const b64 = url.split(',', 2)[1]; + resp.images.push({ url, b64Json: b64 }); + } else { + resp.images.push({ url }); + } + }; const choices = data.choices as Array> | undefined; if (choices) { for (const choice of choices) { @@ -258,7 +436,7 @@ export class OpenRouterMediaProvider implements MediaProvider { if (typeof msg.content === 'string') { resp.text += msg.content; } - // Content array (multimodal) + // Content array (gpt-image-1 multimodal style) if (Array.isArray(msg.content)) { for (const part of msg.content) { const p = part as Record; @@ -266,16 +444,18 @@ export class OpenRouterMediaProvider implements MediaProvider { resp.text += p.text as string; } else if (p.type === 'image_url') { const imgUrl = p.image_url as Record | undefined; - const url = imgUrl?.url as string | undefined; - if (url?.startsWith('data:')) { - const b64 = url.split(',', 2)[1]; - resp.images.push({ url, b64Json: b64 }); - } else if (url) { - resp.images.push({ url }); - } + pushImageFromUrl(imgUrl?.url as string | undefined); } } } + // Dedicated images array (gemini-*-image, grok-imagine — content is null) + const images = msg.images as Array> | undefined; + if (Array.isArray(images)) { + for (const img of images) { + const imgUrl = img.image_url as Record | undefined; + pushImageFromUrl(imgUrl?.url as string | undefined); + } + } } } @@ -286,7 +466,34 @@ export class OpenRouterMediaProvider implements MediaProvider { async generateAudio(request: AudioRequest): Promise { const model = stripPrefix(request.model ?? 'openai/gpt-4o-mini-tts'); + const requestedFormat = request.format ?? 'wav'; + + // Route based on model capability. + // output_modalities=["speech"] → POST /audio/speech (OpenAI-compat TTS, + // e.g. hexgrad/kokoro-82m) + // contains "audio" → chat-completions SSE w/ audio modality + // (e.g. 
openai/gpt-audio*) + // unknown → try /audio/speech (broader-compat). + const meta = await this.fetchModelMeta(model); + const outMods = meta.outputModalities; + const useSpeechEndpoint = + outMods.includes('speech') || + (outMods.length === 0) || + !outMods.includes('audio'); + + if (useSpeechEndpoint) { + return this.generateAudioViaSpeechEndpoint( + model, + request.text, + request.voice ?? 'alloy', + requestedFormat, + request + ); + } + // Chat-completions audio modality: openai/gpt-audio family. Streaming on + // OpenAI is locked to pcm16 — wire that and re-wrap to user's format below. + const wireFormat = requestedFormat === 'wav' ? 'pcm16' : requestedFormat; const messages: unknown[] = [{ role: 'user', content: request.text }]; const body: Record = { model, @@ -295,7 +502,7 @@ export class OpenRouterMediaProvider implements MediaProvider { stream: true, audio: { voice: request.voice ?? 'alloy', - format: request.format ?? 'wav', + format: wireFormat, }, }; @@ -399,14 +606,78 @@ export class OpenRouterMediaProvider implements MediaProvider { const resp = emptyMediaResponse(null); resp.text = textContent; if (audioChunks.length > 0) { + let b64 = audioChunks.join(''); + // SSE chunks decode independently — concatenate raw bytes for cleaner output. + try { + const parts = audioChunks.map(base64ToBytes); + const total = parts.reduce((n, p) => n + p.byteLength, 0); + const merged = new Uint8Array(total); + let off = 0; + for (const p of parts) { merged.set(p, off); off += p.byteLength; } + b64 = bytesToBase64(merged); + if (requestedFormat === 'wav') { + b64 = bytesToBase64(wrapPcm16AsWav(merged)); + } + } catch { + /* fall back to concatenated base64 strings */ + } resp.audio = { - data: audioChunks.join(''), - format: request.format ?? 'wav', + data: b64, + format: requestedFormat, }; } return resp; } + /** + * Call OpenRouter's OpenAI-compatible TTS endpoint (`POST /audio/speech`). + * Returns raw bytes for the requested format; wraps PCM → WAV when needed. 
+ */ + private async generateAudioViaSpeechEndpoint( + model: string, + text: string, + voice: string, + requestedFormat: string, + request?: AudioRequest + ): Promise { + // Map requested format → upstream response_format. Kokoro etc. only + // emit pcm/mp3; we wrap pcm into WAV ourselves when caller asked for wav. + const wireFormat = + requestedFormat === 'wav' || requestedFormat === 'pcm' || requestedFormat === 'pcm16' + ? 'pcm' + : requestedFormat; + + const endpoint = `${this.baseUrl}/audio/speech`; + const body: Record = { + model, + input: text, + voice, + response_format: wireFormat, + }; + if (request?.speed != null) body.speed = request.speed; + if (request?.extra) Object.assign(body, request.extra); + const res = await this.post(endpoint, body); + if (!res.ok) { + throw new MediaProviderError( + `Audio generation failed [model=${model}] [endpoint=${endpoint}]: ${res.status} ${await res.text()}`, + { provider: 'openrouter', model, endpoint } + ); + } + const buf = new Uint8Array(await res.arrayBuffer()); + const finalBytes = requestedFormat === 'wav' ? wrapPcm16AsWav(buf) : buf; + const resp = emptyMediaResponse({ + endpoint: 'audio/speech', + model, + mime_type: res.headers.get('content-type') ?? 
'', + }); + resp.text = text; + resp.audio = { + data: bytesToBase64(finalBytes), + format: requestedFormat, + }; + return resp; + } + // ── Helpers ──────────────────────────────────────────────────────── private post(url: string, body: unknown): Promise { diff --git a/sdk/typescript/tests/media_integration.test.ts b/sdk/typescript/tests/media_integration.test.ts index 273447c9a..b5cd758f5 100644 --- a/sdk/typescript/tests/media_integration.test.ts +++ b/sdk/typescript/tests/media_integration.test.ts @@ -289,6 +289,7 @@ describe('Integration: OpenRouterMediaProvider', () => { describe('generateAudio', () => { it('parses SSE stream into audio output', async () => { const provider = new OpenRouterMediaProvider({ apiKey: 'test-key' }); + provider.seedModelMeta('openai/gpt-audio-mini', ['text', 'audio'], ['text']); const sseLines = [ 'data: {"choices":[{"delta":{"content":"Hello"}}]}\n\n', @@ -315,16 +316,22 @@ describe('Integration: OpenRouterMediaProvider', () => { body: { getReader: () => mockReader }, }); - const resp = await provider.generateAudio({ text: 'say hello', voice: 'nova' }); + const resp = await provider.generateAudio({ + text: 'say hello', + model: 'openai/gpt-audio-mini', + voice: 'nova', + format: 'mp3', + }); expect(resp.text).toBe('Hello'); expect(resp.audio).not.toBeNull(); expect(resp.audio!.data).toBe('AAAABBBB'); - expect(resp.audio!.format).toBe('wav'); + expect(resp.audio!.format).toBe('mp3'); }); it('custom format is respected', async () => { const provider = new OpenRouterMediaProvider({ apiKey: 'test-key' }); + provider.seedModelMeta('openai/gpt-audio-mini', ['text', 'audio'], ['text']); const sseLines = [ 'data: {"choices":[{"delta":{"audio":{"data":"X"}}}]}\n\n', @@ -351,6 +358,7 @@ describe('Integration: OpenRouterMediaProvider', () => { const resp = await provider.generateAudio({ text: 'test', + model: 'openai/gpt-audio-mini', format: 'mp3', }); @@ -361,8 +369,43 @@ describe('Integration: OpenRouterMediaProvider', () => { 
expect(body.audio.format).toBe('mp3'); }); + it('routes TTS-only models to /audio/speech and WAV-wraps PCM', async () => { + const provider = new OpenRouterMediaProvider({ apiKey: 'test-key' }); + provider.seedModelMeta('hexgrad/kokoro-82m', ['speech'], ['text']); + + const pcmBytes = new Uint8Array(2000); // raw fake PCM + mockFetch.mockResolvedValueOnce({ + ok: true, + headers: { get: () => 'audio/pcm' }, + arrayBuffer: async () => pcmBytes.buffer, + text: async () => '', + }); + + const resp = await provider.generateAudio({ + text: 'hello', + model: 'openrouter/hexgrad/kokoro-82m', + voice: 'af_bella', + format: 'wav', + }); + expect(resp.audio).not.toBeNull(); + expect(resp.audio!.format).toBe('wav'); + + // /audio/speech endpoint + correct payload + const [url, init] = mockFetch.mock.calls[0]; + expect(String(url)).toContain('/audio/speech'); + const body = JSON.parse(init.body); + expect(body.model).toBe('hexgrad/kokoro-82m'); + expect(body.voice).toBe('af_bella'); + expect(body.response_format).toBe('pcm'); + + const wav = Buffer.from(resp.audio!.data!, 'base64'); + expect(wav.subarray(0, 4).toString()).toBe('RIFF'); + expect(wav.subarray(8, 12).toString()).toBe('WAVE'); + }); + it('throws MediaProviderError on HTTP failure', async () => { const provider = new OpenRouterMediaProvider({ apiKey: 'test-key' }); + provider.seedModelMeta('openai/gpt-audio-mini', ['text', 'audio'], ['text']); mockFetch.mockResolvedValueOnce({ ok: false, @@ -371,7 +414,9 @@ describe('Integration: OpenRouterMediaProvider', () => { }); await expect( - provider.generateAudio({ text: 'test' }) + provider.generateAudio({ + text: 'test', model: 'openai/gpt-audio-mini', format: 'mp3', + }) ).rejects.toThrow(MediaProviderError); }); }); diff --git a/sdk/typescript/tests/media_provider.test.ts b/sdk/typescript/tests/media_provider.test.ts index 3b6ce4132..cc0434820 100644 --- a/sdk/typescript/tests/media_provider.test.ts +++ b/sdk/typescript/tests/media_provider.test.ts @@ -409,6 +409,10 @@ 
describe('OpenRouterMediaProvider', () => { describe('generateAudio', () => { it('parses SSE stream and collects audio chunks', async () => { const provider = new OpenRouterMediaProvider({ apiKey: 'test-key' }); + // Force chat-completions routing — without this, the provider would try + // GET /models/{id}/endpoints to discover routing and fall through to + // /audio/speech because mocks return undefined. + provider.seedModelMeta('openai/gpt-audio-mini', ['text', 'audio'], ['text']); const sseLines = [ 'data: {"choices":[{"delta":{"content":"Hello"}}]}\n\n', @@ -435,15 +439,19 @@ describe('OpenRouterMediaProvider', () => { body: { getReader: () => mockReader }, }); - const resp = await provider.generateAudio({ text: 'say hello' }); + // mp3 keeps chunks as raw base64 (no wav wrap). + const resp = await provider.generateAudio({ + text: 'say hello', model: 'openai/gpt-audio-mini', format: 'mp3', + }); expect(resp.text).toBe('Hello'); expect(resp.audio).not.toBeNull(); expect(resp.audio!.data).toBe('AAAABBBB'); - expect(resp.audio!.format).toBe('wav'); + expect(resp.audio!.format).toBe('mp3'); }); it('processes remaining buffer after stream ends (WR-02)', async () => { const provider = new OpenRouterMediaProvider({ apiKey: 'test-key' }); + provider.seedModelMeta('openai/gpt-audio-mini', ['text', 'audio'], ['text']); // Send data without trailing newline so it stays in buffer const encoder = new TextEncoder(); @@ -469,12 +477,15 @@ describe('OpenRouterMediaProvider', () => { body: { getReader: () => mockReader }, }); - const resp = await provider.generateAudio({ text: 'test' }); + const resp = await provider.generateAudio({ + text: 'test', model: 'openai/gpt-audio-mini', format: 'mp3', + }); expect(resp.text).toBe('AB'); }); it('throws on failure with context (WR-05)', async () => { const provider = new OpenRouterMediaProvider({ apiKey: 'test-key' }); + provider.seedModelMeta('openai/gpt-audio-mini', ['text', 'audio'], ['text']); mockFetch.mockResolvedValueOnce({ ok: 
false, @@ -483,7 +494,9 @@ describe('OpenRouterMediaProvider', () => { }); await expect( - provider.generateAudio({ text: 'test' }) + provider.generateAudio({ + text: 'test', model: 'openai/gpt-audio-mini', format: 'mp3', + }) ).rejects.toThrow(MediaProviderError); // Reset mock for second assertion mockFetch.mockResolvedValueOnce({ @@ -492,12 +505,15 @@ describe('OpenRouterMediaProvider', () => { text: async () => 'Internal error', }); await expect( - provider.generateAudio({ text: 'test' }) + provider.generateAudio({ + text: 'test', model: 'openai/gpt-audio-mini', format: 'mp3', + }) ).rejects.toThrow('Audio generation failed'); }); it('throws after too many consecutive parse errors (WR-04)', async () => { const provider = new OpenRouterMediaProvider({ apiKey: 'test-key' }); + provider.seedModelMeta('openai/gpt-audio-mini', ['text', 'audio'], ['text']); // Create 51+ malformed SSE lines in a single chunk const malformedLines = Array.from( @@ -524,7 +540,9 @@ describe('OpenRouterMediaProvider', () => { }); await expect( - provider.generateAudio({ text: 'test' }) + provider.generateAudio({ + text: 'test', model: 'openai/gpt-audio-mini', format: 'mp3', + }) ).rejects.toThrow('consecutive SSE parse errors'); }); }); diff --git a/sdk/typescript/tests/openrouter_media_routing.test.ts b/sdk/typescript/tests/openrouter_media_routing.test.ts new file mode 100644 index 000000000..e611f28f7 --- /dev/null +++ b/sdk/typescript/tests/openrouter_media_routing.test.ts @@ -0,0 +1,267 @@ +/** + * Coverage for new OpenRouter routing / param-translation logic in + * OpenRouterMediaProvider: + * - fetchModelMeta (HTTP success/cache/error/exception paths) + * - generateImage: imageUrls multi-part content, imageConfig snake_case mapping + * - generateVideo: imageUrl, frame_images snake_case, input_references, extra + * - generateAudio: /audio/speech path with speed + extra + */ +import { + describe, + it, + expect, + beforeEach, + afterEach, + vi, +} from 'vitest'; +import { 
OpenRouterMediaProvider } from '../src/ai/OpenRouterMediaProvider.js';

// Keep a handle on the real fetch so it can be put back after every test.
const originalFetch = globalThis.fetch;
// Typed as vitest's mock-function type so `.mock.calls` and the
// `mockResolvedValueOnce` / `mockRejectedValueOnce` helpers are available.
let mockFetch: ReturnType<typeof vi.fn>;

// Every test gets a fresh mock installed as the global fetch, so queued
// responses and recorded calls never leak between tests.
beforeEach(() => {
  mockFetch = vi.fn();
  globalThis.fetch = mockFetch;
});

afterEach(() => {
  globalThis.fetch = originalFetch;
});

describe('OpenRouterMediaProvider: model metadata fetch', () => {
  it('caches metadata after first GET', async () => {
    // Call 0: the model-metadata lookup.
    mockFetch.mockResolvedValueOnce({
      ok: true,
      json: async () => ({
        data: {
          id: 'openai/gpt-audio-mini',
          architecture: {
            output_modalities: ['text', 'audio'],
            input_modalities: ['text'],
          },
        },
      }),
    });
    // For both audio calls below — they hit chat-completions SSE.
    // Each getReader() call returns a new reader with its own `done` flag, so
    // the same body object can safely back both mocked responses.
    const sseBody = {
      getReader: () => {
        let done = false;
        return {
          read: async () => {
            if (done) return { done: true, value: undefined };
            done = true;
            const enc = new TextEncoder();
            return { done: false, value: enc.encode('data: [DONE]\n\n') };
          },
        };
      },
    };
    mockFetch.mockResolvedValueOnce({ ok: true, body: sseBody });
    mockFetch.mockResolvedValueOnce({ ok: true, body: sseBody });

    const provider = new OpenRouterMediaProvider({ apiKey: 'k' });
    await provider.generateAudio({
      text: 'a', model: 'openrouter/openai/gpt-audio-mini', format: 'mp3',
    });
    await provider.generateAudio({
      text: 'b', model: 'openrouter/openai/gpt-audio-mini', format: 'mp3',
    });
    // 1 metadata GET + 2 audio POSTs = 3 calls
    expect(mockFetch).toHaveBeenCalledTimes(3);
    const firstUrl = String(mockFetch.mock.calls[0][0]);
    expect(firstUrl).toContain('/models/openai/gpt-audio-mini/endpoints');
  });

  it('falls back to /audio/speech when metadata GET returns 500', async () => {
    // Call 0: metadata lookup fails with a non-OK status.
    mockFetch.mockResolvedValueOnce({ ok: false, status: 500 });
    // Call 1: binary audio response.
    mockFetch.mockResolvedValueOnce({
      ok: true,
      headers: { get: () => 'audio/mpeg' },
      arrayBuffer: async () => new Uint8Array([1, 2, 3]).buffer,
      text: async () => '',
    });

    const provider = new OpenRouterMediaProvider({ apiKey: 'k' });
    const resp = await provider.generateAudio({
      text: 'x', model: 'unknown/tts', format: 'mp3',
    });
    expect(resp.audio).not.toBeNull();
    expect(String(mockFetch.mock.calls[1][0])).toContain('/audio/speech');
  });

  it('falls back to /audio/speech when metadata GET throws', async () => {
    // Call 0: metadata lookup rejects outright (e.g. a network failure).
    mockFetch.mockRejectedValueOnce(new Error('network'));
    // Call 1: binary audio response.
    mockFetch.mockResolvedValueOnce({
      ok: true,
      headers: { get: () => 'audio/mpeg' },
      arrayBuffer: async () => new Uint8Array([1]).buffer,
      text: async () => '',
    });
    const provider = new OpenRouterMediaProvider({ apiKey: 'k' });
    await provider.generateAudio({ text: 'x', model: 'unknown/y', format: 'mp3' });
    expect(String(mockFetch.mock.calls[1][0])).toContain('/audio/speech');
  });
});

describe('OpenRouterMediaProvider.generateVideo: param translation', () => {
  it('maps imageUrl, frameImages, inputReferences, extra to OpenRouter schema', async () => {
    // 1) submit, 2) poll → completed, 3) download
    mockFetch.mockResolvedValueOnce({
      ok: true,
      json: async () => ({ id: 'jobZ' }),
    });
    mockFetch.mockResolvedValueOnce({
      ok: true,
      json: async () => ({
        id: 'jobZ',
        status: 'completed',
        unsigned_urls: ['https://cdn.example.com/v.mp4'],
      }),
    });
    mockFetch.mockResolvedValueOnce({
      ok: true,
      arrayBuffer: async () => new Uint8Array([0x66, 0x66]).buffer,
    });

    const provider = new OpenRouterMediaProvider({ apiKey: 'k' });
    await provider.generateVideo({
      prompt: 'x',
      model: 'openrouter/google/veo-3.1-lite',
      duration: 4,
      imageUrl: 'https://example.com/seed.jpg',
      frameImages: [
        { imageUrl: { url: 'https://x/first.jpg' }, frameType: 'first_frame' },
        { imageUrl: { url: 'https://x/last.jpg' }, frameType: 'last_frame' },
      ],
      inputReferences: [{ imageUrl: { url: 'https://x/ref.jpg' } }],
      extra: { personGeneration: 'allow_all' },
      pollInterval: 1,
      timeout: 10_000,
    });

    // Only the submit request (call 0) carries the translated parameters.
    const submitBody = JSON.parse(mockFetch.mock.calls[0][1].body as string);
    expect(submitBody.image_url).toBe('https://example.com/seed.jpg');
    expect(submitBody.frame_images).toEqual([
      {
        type: 'image_url',
        image_url: { url: 'https://x/first.jpg' },
        frame_type: 'first_frame',
      },
      {
        type: 'image_url',
        image_url: { url: 'https://x/last.jpg' },
        frame_type: 'last_frame',
      },
    ]);
    expect(submitBody.input_references).toEqual([
      { type: 'image_url', image_url: { url: 'https://x/ref.jpg' } },
    ]);
    // `extra` keys are expected on the body verbatim (no snake_case rewrite).
    expect(submitBody.personGeneration).toBe('allow_all');
  });
});

describe('OpenRouterMediaProvider.generateImage: imageUrls + imageConfig', () => {
  it('builds multi-part user content and snake_cases imageConfig', async () => {
    // Single mocked response: one generated image delivered as a data: URL.
    mockFetch.mockResolvedValueOnce({
      ok: true,
      json: async () => ({
        choices: [
          {
            message: {
              content: null,
              images: [
                {
                  type: 'image_url',
                  image_url: { url: 'data:image/png;base64,QUJD' },
                },
              ],
            },
          },
        ],
      }),
    });

    const provider = new OpenRouterMediaProvider({ apiKey: 'k' });
    const resp = await provider.generateImage({
      prompt: 'fox in watercolor',
      model: 'openrouter/x-ai/grok-imagine-image-quality',
      imageUrls: ['https://r/1.png', 'https://r/2.png'],
      imageConfig: {
        aspectRatio: '16:9',
        imageSize: '1024x576',
        strength: 0.6,
        style: 'painterly',
        rgbColors: [[255, 0, 0]],
        backgroundRgbColor: [0, 0, 0],
        superResolutionReferences: ['https://r/sr.png'],
        fontInputs: [{ fontUrl: 'https://f.com/x.ttf', text: 'hi' }],
      },
      extra: { quality_boost: true },
    });

    const body = JSON.parse(mockFetch.mock.calls[0][1].body as string);
    // Multi-part content
    expect(body.messages[0].content).toEqual([
      { type: 'text', text: 'fox in watercolor' },
      { type: 'image_url', image_url: { url: 'https://r/1.png' } },
      { type: 'image_url', image_url: { url: 'https://r/2.png' } },
    ]);
    // imageConfig snake_case translation
    expect(body.image_config).toEqual({
      aspect_ratio: '16:9',
      image_size: '1024x576',
      strength: 0.6,
      style: 'painterly',
      rgb_colors: [[255, 0, 0]],
      background_rgb_color: [0, 0, 0],
      super_resolution_references: ['https://r/sr.png'],
      font_inputs: [{ font_url: 'https://f.com/x.ttf', text: 'hi' }],
    });
    expect(body.quality_boost).toBe(true);
    expect(body.modalities).toEqual(['image']);
    // Returned image base64 captured from data: URL
    expect(resp.images[0].b64Json).toBe('QUJD');
  });
});

describe('OpenRouterMediaProvider.generateAudio: /audio/speech extras', () => {
  it('passes speed and extra through to the speech body', async () => {
    mockFetch.mockResolvedValueOnce({
      ok: true,
      headers: { get: () => 'audio/mpeg' },
      arrayBuffer: async () => new Uint8Array([1, 2]).buffer,
      text: async () => '',
    });

    const provider = new OpenRouterMediaProvider({ apiKey: 'k' });
    // Metadata is seeded up front; the test treats the single mocked fetch
    // (call 0) as the speech request itself.
    provider.seedModelMeta('openai/gpt-4o-mini-tts', ['speech'], ['text']);
    await provider.generateAudio({
      text: 'bonjour',
      model: 'openai/gpt-4o-mini-tts',
      voice: 'alloy',
      format: 'mp3',
      speed: 1.5,
      extra: { language: 'fr-FR' },
    });
    const body = JSON.parse(mockFetch.mock.calls[0][1].body as string);
    expect(body.speed).toBe(1.5);
    expect(body.language).toBe('fr-FR');
    expect(body.response_format).toBe('mp3');
    expect(body.voice).toBe('alloy');
  });

  it('throws MediaProviderError when /audio/speech returns 401', async () => {
    mockFetch.mockResolvedValueOnce({
      ok: false,
      status: 401,
      text: async () => 'unauthorized',
    });
    const provider = new OpenRouterMediaProvider({ apiKey: 'k' });
    provider.seedModelMeta('hexgrad/kokoro-82m', ['speech'], ['text']);
    // Only the error message is asserted here, not the error class.
    await expect(
      provider.generateAudio({
        text: 'x', model: 'hexgrad/kokoro-82m', voice: 'af_bella', format: 'wav',
      })
    ).rejects.toThrow('Audio generation failed');
  });
});