Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
285 changes: 285 additions & 0 deletions tools/audio/gemini_tts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,285 @@
"""Gemini native text-to-speech provider tool.

Uses Gemini's generateContent API with response_modalities=["AUDIO"]
for expressive, context-aware speech synthesis. Supports 30 voices
with automatic language detection.
"""

from __future__ import annotations

import base64
import os
import time
import wave
from pathlib import Path
from typing import Any

from tools.base_tool import (
BaseTool,
Determinism,
ExecutionMode,
ResourceProfile,
RetryPolicy,
ToolResult,
ToolRuntime,
ToolStability,
ToolStatus,
ToolTier,
)

# All 30 prebuilt Gemini TTS voices
VOICES = [
"Zephyr", "Puck", "Charon", "Kore", "Fenrir", "Leda", "Orus", "Aoede",
"Callirrhoe", "Autonoe", "Enceladus", "Iapetus", "Umbriel", "Algieba",
"Despina", "Erinome", "Algenib", "Rasalgethi", "Laomedeia", "Achernar",
"Alnilam", "Schedar", "Gacrux", "Pulcherrima", "Achird",
"Zubenelgenubi", "Vindemiatrix", "Sadachbia", "Sadaltager", "Sulafat",
]

MODELS = [
"gemini-3.1-flash-tts-preview",
"gemini-2.5-flash-preview-tts",
"gemini-2.5-pro-preview-tts",
]

# Gemini TTS returns raw PCM: 24kHz, mono, 16-bit signed little-endian
SAMPLE_RATE = 24000
CHANNELS = 1
SAMPLE_WIDTH = 2


class GeminiTTS(BaseTool):
name = "gemini_tts"
version = "0.1.0"
tier = ToolTier.VOICE
capability = "tts"
provider = "gemini"
stability = ToolStability.BETA
execution_mode = ExecutionMode.SYNC
determinism = Determinism.STOCHASTIC
runtime = ToolRuntime.API

dependencies = []
install_instructions = (
"Set GOOGLE_API_KEY (or GEMINI_API_KEY) to your Google AI Studio key.\n"
" Get one at https://aistudio.google.com/apikey\n"
" Same key works for Gemini TTS, Imagen, and Google Cloud TTS."
)
fallback = "google_tts"
fallback_tools = ["google_tts", "elevenlabs_tts", "piper_tts"]
agent_skills = ["text-to-speech"]

capabilities = [
"text_to_speech",
"voice_selection",
"expressive_delivery",
"multilingual",
]
supports = {
"voice_cloning": False,
"multilingual": True,
"offline": False,
"native_audio": True,
"ssml": False,
}
best_for = [
"expressive narration — understands context and delivers with natural prosody",
"prompt-directed delivery — 'say cheerfully', 'whisper this', etc.",
"multilingual with automatic language detection",
]
not_good_for = [
"SSML markup (not supported)",
"voice cloning",
"fully offline production",
]

input_schema = {
"type": "object",
"required": ["text"],
"properties": {
"text": {
"type": "string",
"description": (
"Text to convert to speech. Supports delivery instructions "
"inline, e.g. 'Say cheerfully: Have a wonderful day!'"
),
},
"voice": {
"type": "string",
"default": "Orus",
"description": (
f"Voice name. Available: {', '.join(VOICES)}. "
"Default: Orus (rich, cinematic male)."
),
},
"model": {
"type": "string",
"default": "gemini-2.5-flash-preview-tts",
"enum": MODELS,
"description": (
"Gemini TTS model. flash = fast/cheap, pro = highest quality."
),
},
"audio_format": {
"type": "string",
"default": "wav",
"enum": ["wav", "pcm"],
"description": "Output format. wav = ready to use, pcm = raw 24kHz s16le.",
},
"output_path": {"type": "string"},
},
}

resource_profile = ResourceProfile(
cpu_cores=1, ram_mb=256, vram_mb=0, disk_mb=50, network_required=True
)
retry_policy = RetryPolicy(max_retries=2, retryable_errors=["rate_limit", "timeout"])
idempotency_key_fields = ["text", "voice", "model"]
side_effects = ["writes audio file to output_path", "calls Gemini API"]
user_visible_verification = ["Listen to generated audio for natural speech quality"]

def _get_api_key(self) -> str | None:
return os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY")

def get_status(self) -> ToolStatus:
if self._get_api_key():
return ToolStatus.AVAILABLE
return ToolStatus.UNAVAILABLE

def estimate_cost(self, inputs: dict[str, Any]) -> float:
text = inputs.get("text", "")
model = inputs.get("model", "gemini-2.5-flash-preview-tts")
# Approximate pricing per character (Gemini TTS pricing is per-token
# but characters are a reasonable proxy for estimation)
if "pro" in model:
rate_per_char = 0.000050 # pro is more expensive
else:
rate_per_char = 0.000010 # flash is cheap
return round(len(text) * rate_per_char, 4)

def execute(self, inputs: dict[str, Any]) -> ToolResult:
api_key = self._get_api_key()
if not api_key:
return ToolResult(
success=False,
error="No Google API key found. " + self.install_instructions,
)

voice = inputs.get("voice", "Orus")
if voice not in VOICES:
return ToolResult(
success=False,
error=f"Unknown voice '{voice}'. Available: {', '.join(VOICES)}",
)

start = time.time()
try:
result = self._generate(inputs, api_key)
except Exception as exc:
return ToolResult(success=False, error=f"Gemini TTS failed: {exc}")

result.duration_seconds = round(time.time() - start, 2)
result.cost_usd = self.estimate_cost(inputs)
return result

def _generate(self, inputs: dict[str, Any], api_key: str) -> ToolResult:
import requests

text = inputs["text"]
voice = inputs.get("voice", "Orus")
model = inputs.get("model", "gemini-2.5-flash-preview-tts")
audio_format = inputs.get("audio_format", "wav")

url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent"

payload = {
"contents": [{"parts": [{"text": text}]}],
"generationConfig": {
"responseModalities": ["AUDIO"],
"speechConfig": {
"voiceConfig": {
"prebuiltVoiceConfig": {
"voiceName": voice,
}
}
},
},
}

response = requests.post(
url,
headers={
"Content-Type": "application/json",
"x-goog-api-key": api_key,
},
json=payload,
timeout=120,
)

# Issue #3: extract API error body before raising
if not response.ok:
try:
err_msg = response.json().get("error", {}).get("message", response.text[:500])
except Exception:
err_msg = response.text[:500]
raise RuntimeError(f"Gemini API error (HTTP {response.status_code}): {err_msg}")

data = response.json()

# Issue #2: guard against missing/blocked candidates
candidates = data.get("candidates")
if not candidates:
error_info = data.get("error", {}).get("message", "No candidates in response")
raise RuntimeError(f"Gemini API returned no audio: {error_info}")

candidate = candidates[0]
finish_reason = candidate.get("finishReason", "")
if finish_reason == "SAFETY":
raise RuntimeError("Gemini blocked this request due to safety filters. Rephrase your text.")

try:
audio_b64 = candidate["content"]["parts"][0]["inlineData"]["data"]
except (KeyError, IndexError, TypeError) as exc:
raise RuntimeError(f"Unexpected Gemini response structure: {exc}") from exc

pcm_bytes = base64.b64decode(audio_b64)

# Issue #4: reject empty audio
if not pcm_bytes:
raise RuntimeError("Gemini returned empty audio data. The text may be too short or unsupported.")

# Issue #5: audio_format is already constrained by enum, use directly
output_path = Path(inputs.get("output_path", f"gemini_tts_output.{audio_format}"))
output_path.parent.mkdir(parents=True, exist_ok=True)

if audio_format == "wav":
_write_wav(output_path, pcm_bytes)
else:
output_path.write_bytes(pcm_bytes)

audio_duration = len(pcm_bytes) / (SAMPLE_RATE * CHANNELS * SAMPLE_WIDTH)

return ToolResult(
success=True,
data={
"provider": self.provider,
"model": model,
"voice": voice,
"text_length": len(text),
"audio_duration_seconds": round(audio_duration, 2),
"sample_rate": SAMPLE_RATE,
"output": str(output_path),
"format": audio_format,
},
artifacts=[str(output_path)],
model=model,
)


def _write_wav(path: Path, pcm_bytes: bytes) -> None:
"""Wrap raw PCM bytes in a WAV container."""
with wave.open(str(path), "wb") as wf:
wf.setnchannels(CHANNELS)
wf.setsampwidth(SAMPLE_WIDTH)
wf.setframerate(SAMPLE_RATE)
wf.writeframes(pcm_bytes)