Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 47 additions & 4 deletions sdk/go/ai/media_integration_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package ai

import (
"bytes"
"context"
"encoding/base64"
"encoding/json"
Expand Down Expand Up @@ -250,7 +251,7 @@ func TestIntegrationAudioSSEStream(t *testing.T) {
var payload map[string]any
require.NoError(t, json.NewDecoder(r.Body).Decode(&payload))
assert.Equal(t, true, payload["stream"])
assert.Equal(t, "openai/gpt-4o-mini-tts", payload["model"])
assert.Equal(t, "openai/gpt-audio-mini", payload["model"])

w.Header().Set("Content-Type", "text/event-stream")
flusher, _ := w.(http.Flusher)
Expand All @@ -274,18 +275,21 @@ func TestIntegrationAudioSSEStream(t *testing.T) {
BaseURL: srv.URL,
Client: srv.Client(),
}
// Pre-seed metadata so routing picks the chat-completions path
// without hitting the real OpenRouter `/models/.../endpoints` endpoint.
p.SeedModelMeta("openai/gpt-audio-mini", []string{"text", "audio"}, []string{"text"})

resp, err := p.GenerateAudio(context.Background(), AudioRequest{
Text: "Say hello",
Model: "openrouter/openai/gpt-4o-mini-tts",
Model: "openrouter/openai/gpt-audio-mini",
Voice: "nova",
Format: "wav",
Format: "mp3", // avoid pcm→wav rewrap so we can compare raw bytes
})

require.NoError(t, err)
assert.Equal(t, "Hello world", resp.Text)
require.NotNil(t, resp.Audio)
assert.Equal(t, "wav", resp.Audio.Format)
assert.Equal(t, "mp3", resp.Audio.Format)
assert.NotEmpty(t, resp.Audio.Data)

// Decode and verify audio bytes
Expand Down Expand Up @@ -315,16 +319,55 @@ func TestIntegrationAudioWithCustomFormat(t *testing.T) {
BaseURL: srv.URL,
Client: srv.Client(),
}
p.SeedModelMeta("openai/gpt-audio-mini", []string{"text", "audio"}, []string{"text"})

resp, err := p.GenerateAudio(context.Background(), AudioRequest{
Text: "test",
Model: "openai/gpt-audio-mini",
Voice: "echo",
Format: "mp3",
})
require.NoError(t, err)
assert.Equal(t, "mp3", resp.Audio.Format)
}

// TestIntegrationAudioSpeechEndpoint drives GenerateAudio against a stub server
// and checks that the request is sent to /audio/speech, that a requested "wav"
// format goes over the wire as "pcm", and that the audio handed back to the
// caller is wrapped in a RIFF/WAVE container.
func TestIntegrationAudioSpeechEndpoint(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		assert.Equal(t, "/audio/speech", r.URL.Path)
		assert.Equal(t, "Bearer test-key", r.Header.Get("Authorization"))

		// The handler runs on the server's goroutine, so only non-fatal
		// assertions are safe here: require.* would call t.FailNow off the
		// test goroutine. On a decode failure, record it and answer with an
		// error status so the client call fails visibly.
		var payload map[string]any
		if err := json.NewDecoder(r.Body).Decode(&payload); !assert.NoError(t, err) {
			http.Error(w, "invalid JSON body", http.StatusBadRequest)
			return
		}
		assert.Equal(t, "hexgrad/kokoro-82m", payload["model"])
		assert.Equal(t, "af_bella", payload["voice"])
		assert.Equal(t, "pcm", payload["response_format"]) // wav → pcm on wire

		// Return 1000 bytes of fake PCM16.
		pcm := bytes.Repeat([]byte{0x00, 0x01}, 500)
		w.Header().Set("Content-Type", "audio/pcm")
		w.WriteHeader(http.StatusOK)
		_, _ = w.Write(pcm) // best-effort write; a failure surfaces on the client side
	}))
	defer srv.Close()

	p := &OpenRouterMediaProvider{APIKey: "test-key", BaseURL: srv.URL, Client: srv.Client()}
	p.SeedModelMeta("hexgrad/kokoro-82m", []string{"speech"}, []string{"text"})

	resp, err := p.GenerateAudio(context.Background(), AudioRequest{
		Text:   "hello",
		Model:  "openrouter/hexgrad/kokoro-82m",
		Voice:  "af_bella",
		Format: "wav",
	})
	require.NoError(t, err)
	require.NotNil(t, resp.Audio)
	assert.Equal(t, "wav", resp.Audio.Format)

	decoded, err := base64.StdEncoding.DecodeString(resp.Audio.Data)
	require.NoError(t, err)
	// Guard the slicing below: without this, a short payload would panic
	// with an out-of-range slice instead of reporting a test failure.
	require.True(t, len(decoded) >= 12, "decoded audio too short for a RIFF/WAVE header: %d bytes", len(decoded))
	assert.Equal(t, []byte("RIFF"), decoded[:4])
	assert.Equal(t, []byte("WAVE"), decoded[8:12])
}

func TestIntegrationImageGeneration(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
var payload map[string]any
Expand Down
58 changes: 46 additions & 12 deletions sdk/go/ai/media_provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,35 +17,69 @@ type VideoRequest struct {
AspectRatio string `json:"aspect_ratio,omitempty"`
GenerateAudio *bool `json:"generate_audio,omitempty"`
Seed *int `json:"seed,omitempty"`
// ImageURL is a single input image for image-to-video models (convenience
// alternative to FrameImages with frame_type=first_frame).
ImageURL string `json:"image_url,omitempty"`
// FrameImages is per-frame guidance — first_frame / last_frame. Items
// follow OpenRouter's shape: {type, image_url:{url}, frame_type}.
FrameImages []map[string]any `json:"frame_images,omitempty"`
// InputReferences supplies style/subject reference images (Veo
// "reference-to-video").
InputReferences []map[string]any `json:"input_references,omitempty"`
PollInterval time.Duration `json:"-"`
Timeout time.Duration `json:"-"`
// Extra passes through additional model-specific parameters (e.g. Veo's
// `personGeneration`).
Extra map[string]any `json:"-"`
}

// ImageRequest holds parameters for image generation.
type ImageRequest struct {
Prompt string `json:"prompt"`
Model string `json:"model,omitempty"`
Size string `json:"size,omitempty"`
Quality string `json:"quality,omitempty"`
ImageConfig *ImageConfig `json:"image_config,omitempty"`
Prompt string `json:"prompt"`
Model string `json:"model,omitempty"`
Size string `json:"size,omitempty"`
Quality string `json:"quality,omitempty"`
// ImageURLs are reference / source images for image+text→image models
// (e.g. x-ai/grok-imagine-image-quality). Each may be an http(s) or
// data: URL.
ImageURLs []string `json:"-"`
ImageConfig *ImageConfig `json:"image_config,omitempty"`
// Extra passes through additional model-specific parameters.
Extra map[string]any `json:"-"`
}

// ImageConfig holds OpenRouter-specific image configuration.
type ImageConfig struct {
AspectRatio string `json:"aspect_ratio,omitempty"`
ImageSize string `json:"image_size,omitempty"`
SuperResolutionReferences []string `json:"super_resolution_references,omitempty"`
AspectRatio string `json:"aspect_ratio,omitempty"`
ImageSize string `json:"image_size,omitempty"`
// Strength is the image-to-image blend (0–1, model-dependent).
Strength *float64 `json:"strength,omitempty"`
// Style hint (e.g. Recraft V3 styles).
Style string `json:"style,omitempty"`
// RgbColors is a color palette — array of [r,g,b].
RgbColors [][3]int `json:"rgb_colors,omitempty"`
// BackgroundRgbColor is [r,g,b].
BackgroundRgbColor *[3]int `json:"background_rgb_color,omitempty"`
SuperResolutionReferences []string `json:"super_resolution_references,omitempty"`
FontInputs []FontInput `json:"font_inputs,omitempty"`
}

// FontInput configures custom text rendering for compatible image models.
// Both fields are always serialized (no omitempty on either tag).
type FontInput struct {
// FontURL is serialized as "font_url"; presumably the location of the font
// to render with — confirm against the provider's API.
FontURL string `json:"font_url"`
// Text is serialized as "text"; presumably the text to render in that
// font — confirm against the provider's API.
Text string `json:"text"`
}

// AudioRequest holds parameters for audio generation.
type AudioRequest struct {
Text string `json:"text"`
Model string `json:"model,omitempty"`
Voice string `json:"voice,omitempty"`
Format string `json:"format,omitempty"`
Text string `json:"text"`
Model string `json:"model,omitempty"`
Voice string `json:"voice,omitempty"`
Format string `json:"format,omitempty"`
// Speed multiplier (OpenAI TTS respects; other models ignore).
Speed *float64 `json:"speed,omitempty"`
// Extra passes through additional model-specific parameters.
Extra map[string]any `json:"-"`
}

// MediaResponse holds the result of a media generation call.
Expand Down
9 changes: 6 additions & 3 deletions sdk/go/ai/media_provider_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -219,15 +219,18 @@ func TestOpenRouterGenerateAudio(t *testing.T) {
BaseURL: srv.URL,
Client: srv.Client(),
}
p.SeedModelMeta("openai/gpt-audio-mini", []string{"text", "audio"}, []string{"text"})

resp, err := p.GenerateAudio(context.Background(), AudioRequest{
Text: "Say hello",
Voice: "nova",
Text: "Say hello",
Model: "openai/gpt-audio-mini",
Voice: "nova",
Format: "mp3",
})
require.NoError(t, err)
assert.Equal(t, "Hello", resp.Text)
require.NotNil(t, resp.Audio)
assert.Equal(t, "pcm16", resp.Audio.Format)
assert.Equal(t, "mp3", resp.Audio.Format)
assert.NotEmpty(t, resp.Audio.Data)
}

Expand Down
Loading
Loading