Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion sdk/python/agentfield/media_providers.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ async def generate_audio(
model: Optional[str] = None,
voice: str = "alloy",
format: str = "wav",
*,
system: Optional[str] = None,
**kwargs,
) -> MultimodalResponse:
"""
Expand All @@ -97,6 +99,8 @@ async def generate_audio(
model: TTS model to use
voice: Voice identifier
format: Audio format
system: Optional system instructions for providers/models that
support chat-style audio generation
**kwargs: Provider-specific options

Returns:
Expand Down Expand Up @@ -386,6 +390,7 @@ async def generate_audio(
format: str = "wav",
ref_audio_url: Optional[str] = None,
speed: float = 1.0,
system: Optional[str] = None,
**kwargs,
) -> MultimodalResponse:
"""
Expand Down Expand Up @@ -696,6 +701,7 @@ async def generate_audio(
voice: str = "alloy",
format: str = "wav",
speed: float = 1.0,
system: Optional[str] = None,
**kwargs,
) -> MultimodalResponse:
"""Generate audio using LiteLLM TTS."""
Expand Down Expand Up @@ -1285,6 +1291,7 @@ async def generate_audio(
format: str = "wav",
speed: Optional[float] = None,
extra: Optional[Dict[str, Any]] = None,
system: Optional[str] = None,
**kwargs,
) -> MultimodalResponse:
"""
Expand All @@ -1311,6 +1318,10 @@ async def generate_audio(
format: Audio format (wav, mp3, flac, opus, pcm16). ``wav`` is
synthesized client-side when the upstream endpoint only emits
pcm.
speed: Optional speech speed for ``/audio/speech`` models.
extra: Optional extra request fields for ``/audio/speech`` models.
system: Optional system instructions for chat-completions audio
models. Ignored for ``/audio/speech`` models.
**kwargs: Additional parameters (timeout overrides default 300s)

Returns:
Expand Down Expand Up @@ -1378,9 +1389,12 @@ async def generate_audio(
# Streaming on the OpenAI provider only emits pcm16 — fall back to
# pcm16 over the wire and re-wrap to user's requested format below.
wire_format = "pcm16" if audio_format == "wav" else audio_format
messages = [{"role": "user", "content": text}]
if system is not None:
messages.insert(0, {"role": "system", "content": system})
payload = {
"model": send_model,
"messages": [{"role": "user", "content": text}],
"messages": messages,
"modalities": ["text", "audio"],
"audio": {"voice": voice, "format": wire_format},
"stream": True,
Expand Down
50 changes: 50 additions & 0 deletions sdk/python/tests/test_media_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,56 @@ async def fake_iter_any():
assert result.audio.data == "AAAABBBB"
assert result.audio.format == "mp3"

@pytest.mark.asyncio
async def test_audio_sse_includes_optional_system_message(self):
    """The ``system`` prompt is posted ahead of the user text for chat-audio models."""
    model_id = "openai/gpt-audio-mini"
    system_prompt = "You are a narrator. Use a calm documentary style."
    user_text = "Read this dramatically"

    def async_context(entered):
        # Stub for ``async with``: entering yields *entered*, and exiting
        # returns False so exceptions raised inside are never suppressed.
        ctx = AsyncMock()
        ctx.__aenter__ = AsyncMock(return_value=entered)
        ctx.__aexit__ = AsyncMock(return_value=False)
        return ctx

    async def fake_iter_any():
        # A single audio delta followed by the SSE terminator line.
        for chunk in (
            b'data: {"choices":[{"delta":{"audio":{"data":"AAAA"}}}]}\n',
            b"data: [DONE]\n",
        ):
            yield chunk

    provider = OpenRouterProvider(api_key="test-key")
    # Pre-populate the cached model metadata with an entry that advertises
    # audio output for this model id.
    provider._model_meta_cache[model_id] = {
        "id": model_id,
        "output_modalities": ["text", "audio"],
        "input_modalities": ["text"],
    }

    response = AsyncMock()
    response.status = 200
    response.content = MagicMock()
    response.content.iter_any = fake_iter_any

    # ``post`` is a plain MagicMock because it is called synchronously and
    # its return value is what gets used as the async context manager.
    session = AsyncMock()
    session.post = MagicMock(return_value=async_context(response))

    with patch("aiohttp.ClientSession", return_value=async_context(session)):
        result = await provider.generate_audio(
            text=user_text,
            model=model_id,
            voice="nova",
            format="mp3",
            system=system_prompt,
        )

    assert result.audio is not None

    expected_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_text},
    ]
    sent_payload = session.post.call_args.kwargs["json"]
    assert sent_payload["messages"] == expected_messages

@pytest.mark.asyncio
async def test_audio_api_key_required(self):
"""Missing API key raises ValueError."""
Expand Down
Loading