From f2248d4515f248ca6100d133ea49c3c19320111f Mon Sep 17 00:00:00 2001 From: neilruaro-camb Date: Thu, 2 Apr 2026 16:38:32 +0800 Subject: [PATCH] Add Camb AI as a TTS provider Adds Camb AI as a third TTS provider alongside Kokoro and Supertonic, using the camb-sdk streaming API with MARS models (mars-flash, mars-pro, mars-instruct) and support for 140+ languages. New file: abogen/tts_camb.py (CambPipeline, CambSegment) Backend: provider factory, voice resolution, synthesis dispatch, job config, settings defaults, form handling, voice profiles Frontend: Speaker Studio provider option, settings page defaults --- abogen/tts_camb.py | 128 ++++++++++++++++++++++ abogen/voice_profiles.py | 26 ++++- abogen/webui/conversion_runner.py | 40 ++++++- abogen/webui/routes/api.py | 48 ++++++++- abogen/webui/routes/settings.py | 13 +++ abogen/webui/routes/utils/form.py | 19 +++- abogen/webui/routes/utils/preview.py | 37 ++++++- abogen/webui/routes/utils/service.py | 4 + abogen/webui/routes/utils/settings.py | 4 + abogen/webui/routes/utils/voice.py | 6 ++ abogen/webui/service.py | 20 ++++ abogen/webui/static/voices.js | 150 +++++++++++++++++++++++--- abogen/webui/templates/settings.html | 27 +++++ abogen/webui/templates/voices.html | 43 +++++++- pyproject.toml | 1 + 15 files changed, 542 insertions(+), 24 deletions(-) create mode 100644 abogen/tts_camb.py diff --git a/abogen/tts_camb.py b/abogen/tts_camb.py new file mode 100644 index 0000000..e21664a --- /dev/null +++ b/abogen/tts_camb.py @@ -0,0 +1,128 @@ +from __future__ import annotations + +import logging +import os +from dataclasses import dataclass +from typing import Any, Iterator, Optional + +import numpy as np + +from abogen.tts_supertonic import _ensure_float32_mono, _split_text + + +logger = logging.getLogger(__name__) + + +DEFAULT_CAMB_MODELS = ("mars-flash", "mars-pro", "mars-instruct") +DEFAULT_CAMB_VOICE_ID = 147320 +DEFAULT_CAMB_LANGUAGE = "en-us" + + +@dataclass +class CambSegment: + graphemes: str + audio: 
np.ndarray + + +class CambPipeline: + """Adapter that mimics Kokoro/SuperTonic's pipeline iteration interface for Camb AI.""" + + def __init__( + self, + *, + sample_rate: int, + api_key: Optional[str] = None, + model: str = "mars-flash", + voice_id: int = DEFAULT_CAMB_VOICE_ID, + language: str = DEFAULT_CAMB_LANGUAGE, + max_chunk_length: int = 500, + ) -> None: + self.sample_rate = int(sample_rate) + self.model = model if model in DEFAULT_CAMB_MODELS else "mars-flash" + self.voice_id = int(voice_id) + self.language = language or DEFAULT_CAMB_LANGUAGE + self.max_chunk_length = int(max_chunk_length) + + resolved_key = api_key or os.environ.get("CAMB_API_KEY") or "" + if not resolved_key: + raise RuntimeError( + "Camb AI API key is required. Set the CAMB_API_KEY environment variable " + "or provide an API key in the settings." + ) + + try: + from camb.client import CambAI # type: ignore[import-not-found] + except Exception as exc: + raise RuntimeError( + "camb-sdk is not installed. Install it with `pip install camb-sdk`." 
+ ) from exc + + self._client = CambAI(api_key=resolved_key) + + def __call__( + self, + text: str, + *, + voice: Any, + speed: float, + split_pattern: Optional[str] = None, + model: Optional[str] = None, + language: Optional[str] = None, + ) -> Iterator[CambSegment]: + from camb.types.stream_tts_output_configuration import StreamTtsOutputConfiguration # type: ignore[import-not-found] + from camb.types.stream_tts_voice_settings import StreamTtsVoiceSettings # type: ignore[import-not-found] + + voice_id = self.voice_id + if isinstance(voice, int): + voice_id = voice + elif isinstance(voice, str): + try: + voice_id = int(voice) + except (TypeError, ValueError): + pass + + speech_model = model or self.model + if speech_model not in DEFAULT_CAMB_MODELS: + speech_model = "mars-flash" + + lang = language or self.language + speed_value = float(speed) if speed is not None else 1.0 + speed_value = max(0.5, min(2.0, speed_value)) + + chunks = _split_text( + text, split_pattern=split_pattern, max_chunk_length=self.max_chunk_length + ) + + for chunk in chunks: + try: + stream = self._client.text_to_speech.tts( + text=chunk, + voice_id=voice_id, + language=lang, + speech_model=speech_model, + output_configuration=StreamTtsOutputConfiguration( + format="pcm_f32le", + sample_rate=self.sample_rate, + ), + voice_settings=StreamTtsVoiceSettings( + speaking_rate=speed_value, + ), + ) + + # Collect all streamed bytes for this chunk. 
+ raw_bytes = b"".join(stream) + if not raw_bytes: + logger.warning("Camb AI returned empty audio for chunk: %s", chunk[:60]) + continue + + audio = np.frombuffer(raw_bytes, dtype=" Dict[str, Any]: return {} provider = str(entry.get("provider") or "kokoro").strip().lower() - if provider not in {"kokoro", "supertonic"}: + if provider not in {"kokoro", "supertonic", "camb"}: provider = "kokoro" language = str(entry.get("language") or "a").strip().lower() or "a" @@ -123,6 +123,30 @@ def normalize_profile_entry(entry: Any) -> Dict[str, Any]: ), } + if provider == "camb": + voice_id = entry.get("voice_id") or entry.get("camb_voice_id") or 147320 + try: + voice_id = int(voice_id) + except (TypeError, ValueError): + voice_id = 147320 + model = str(entry.get("model") or entry.get("camb_model") or "mars-flash").strip() + if model not in {"mars-flash", "mars-pro", "mars-instruct"}: + model = "mars-flash" + speed = 1.0 + raw_speed = entry.get("speed") or entry.get("camb_speed") + if raw_speed is not None: + try: + speed = max(0.5, min(2.0, float(raw_speed))) + except (TypeError, ValueError): + pass + return { + "provider": "camb", + "language": language, + "voice_id": voice_id, + "model": model, + "speed": speed, + } + voices = _normalize_voice_entries(entry.get("voices", [])) if not voices: return {} diff --git a/abogen/webui/conversion_runner.py b/abogen/webui/conversion_runner.py index f41e624..3d97993 100644 --- a/abogen/webui/conversion_runner.py +++ b/abogen/webui/conversion_runner.py @@ -47,6 +47,7 @@ from abogen.pronunciation_store import increment_usage from abogen.llm_client import LLMClientError from abogen.tts_supertonic import DEFAULT_SUPERTONIC_VOICES, SupertonicPipeline +from abogen.tts_camb import CambPipeline from .service import Job, JobStatus @@ -126,6 +127,12 @@ def _infer_provider_from_spec(value: Any, fallback: str = "kokoro") -> str: return "supertonic" if "*" in raw or "+" in raw: return "kokoro" + # Pure integer values indicate a Camb AI voice ID. 
+ try: + int(raw) + return "camb" + except (TypeError, ValueError): + pass return fallback @@ -1573,7 +1580,7 @@ def run_conversion_job(job: Job) -> None: def get_pipeline(provider: str) -> Any: nonlocal kokoro_cache_ready provider_norm = str(provider or "kokoro").strip().lower() or "kokoro" - if provider_norm not in {"kokoro", "supertonic"}: + if provider_norm not in {"kokoro", "supertonic", "camb"}: provider_norm = "kokoro" existing = pipelines.get(provider_norm) @@ -1588,6 +1595,16 @@ def get_pipeline(provider: str) -> Any: ) return pipelines[provider_norm] + if provider_norm == "camb": + pipelines[provider_norm] = CambPipeline( + sample_rate=SAMPLE_RATE, + api_key=getattr(job, "camb_api_key", None), + model=getattr(job, "camb_model", "mars-flash") or "mars-flash", + voice_id=int(getattr(job, "camb_voice_id", 147320) or 147320), + language=getattr(job, "camb_language", "en-us") or "en-us", + ) + return pipelines[provider_norm] + # Kokoro cfg = load_config() disable_gpu = not job.use_gpu or not cfg.get("use_gpu", True) @@ -1621,6 +1638,10 @@ def resolve_voice_target(raw_spec: str) -> tuple[str, str, Optional[float], Opti steps = int(entry.get("total_steps") or getattr(job, "supertonic_total_steps", 5) or 5) speed = float(entry.get("speed") or getattr(job, "speed", 1.0) or 1.0) return "supertonic", _supertonic_voice_from_spec(voice, getattr(job, "voice", "M1")), speed, steps + if provider == "camb": + voice_id = str(entry.get("voice_id") or getattr(job, "camb_voice_id", 147320) or 147320) + speed = float(entry.get("speed") or getattr(job, "speed", 1.0) or 1.0) + return "camb", voice_id, speed, None formula = _formula_from_kokoro_entry(entry) return "kokoro", formula or spec, None, None @@ -1628,6 +1649,8 @@ def resolve_voice_target(raw_spec: str) -> tuple[str, str, Optional[float], Opti inferred = _infer_provider_from_spec(spec, fallback=fallback_provider) if inferred == "supertonic": return "supertonic", _supertonic_voice_from_spec(spec, getattr(job, "voice", 
"M1")), None, None + if inferred == "camb": + return "camb", spec, None, None return "kokoro", spec, None, None def resolve_voice_choice(raw_spec: str) -> tuple[str, str, Any, Optional[float], Optional[int]]: @@ -1849,7 +1872,20 @@ def emit_text( local_segments = 0 provider = str(tts_provider or getattr(job, "tts_provider", "kokoro") or "kokoro").strip().lower() or "kokoro" - if provider == "supertonic": + if provider == "camb": + camb_pipeline = get_pipeline("camb") + voice_id = voice_choice + try: + voice_id = int(voice_choice) + except (TypeError, ValueError): + voice_id = int(getattr(job, "camb_voice_id", 147320) or 147320) + segment_iter = camb_pipeline( + normalized, + voice=voice_id, + speed=float(speed_override if speed_override is not None else job.speed), + split_pattern=split_pattern, + ) + elif provider == "supertonic": supertonic_pipeline = get_pipeline("supertonic") voice_name = _supertonic_voice_from_spec(voice_choice, getattr(job, "voice", "M1")) segment_iter = supertonic_pipeline( diff --git a/abogen/webui/routes/api.py b/abogen/webui/routes/api.py index f02ab9d..a6f0d4d 100644 --- a/abogen/webui/routes/api.py +++ b/abogen/webui/routes/api.py @@ -63,7 +63,7 @@ def api_save_voice_profile() -> ResponseReturnValue: if profile is None: # Speaker Studio payload format provider = str(payload.get("provider") or "kokoro").strip().lower() - if provider not in {"kokoro", "supertonic"}: + if provider not in {"kokoro", "supertonic", "camb"}: provider = "kokoro" if provider == "supertonic": profile = { @@ -73,6 +73,14 @@ def api_save_voice_profile() -> ResponseReturnValue: "total_steps": payload.get("total_steps") or payload.get("supertonic_total_steps"), "speed": payload.get("speed") or payload.get("supertonic_speed"), } + elif provider == "camb": + profile = { + "provider": "camb", + "language": str(payload.get("camb_language") or payload.get("language") or "en-us").strip().lower() or "en-us", + "voice_id": payload.get("camb_voice_id") or 
payload.get("voice_id") or 147320, + "model": payload.get("camb_model") or payload.get("model") or "mars-flash", + "speed": payload.get("speed") or 1.0, + } else: profile = { "provider": "kokoro", @@ -168,7 +176,17 @@ def api_voice_profiles_preview() -> ResponseReturnValue: resolved_provider = provider or "kokoro" profiles = load_profiles() - if resolved_provider == "supertonic" and not profile_name: + camb_model = str(payload.get("camb_model") or "").strip() or settings.get("camb_model", "mars-flash") + camb_voice_id = int(payload.get("camb_voice_id") or settings.get("camb_voice_id", 147320)) + camb_api_key = str(payload.get("camb_api_key") or "").strip() or settings.get("camb_api_key") or None + camb_language = str(payload.get("camb_language") or "").strip() or settings.get("camb_language", "en-us") + + if resolved_provider == "camb" and not profile_name: + voice_spec = str(payload.get("camb_voice_id") or payload.get("voice_id") or camb_voice_id) + camb_model = str(payload.get("camb_model") or payload.get("model") or camb_model) + camb_language = str(payload.get("camb_language") or camb_language) + speed = coerce_float(payload.get("speed"), speed) + elif resolved_provider == "supertonic" and not profile_name: voice_spec = str(payload.get("voice") or payload.get("supertonic_voice") or "M1").strip() or "M1" # Allow per-speaker overrides via payload. 
supertonic_total_steps = int(payload.get("supertonic_total_steps") or payload.get("total_steps") or supertonic_total_steps) @@ -183,6 +201,11 @@ def api_voice_profiles_preview() -> ResponseReturnValue: voice_spec = str(normalized_entry.get("voice") or "M1") supertonic_total_steps = int(normalized_entry.get("total_steps") or supertonic_total_steps) speed = float(normalized_entry.get("speed") or speed) + elif resolved_provider == "camb": + camb_voice_id = int(normalized_entry.get("voice_id") or camb_voice_id) + camb_model = str(normalized_entry.get("model") or camb_model) + voice_spec = str(camb_voice_id) + speed = float(normalized_entry.get("speed") or speed) else: voice_spec = formula_from_profile(normalized_entry) or "" language = str(normalized_entry.get("language") or language) @@ -209,6 +232,10 @@ def api_voice_profiles_preview() -> ResponseReturnValue: use_gpu=use_gpu, tts_provider=resolved_provider, supertonic_total_steps=supertonic_total_steps, + camb_model=camb_model, + camb_voice_id=camb_voice_id, + camb_api_key=camb_api_key, + camb_language=camb_language, max_seconds=max_seconds, ) except Exception as exc: @@ -230,7 +257,12 @@ def api_speaker_preview() -> ResponseReturnValue: use_gpu = settings.get("use_gpu", False) base_spec, speaker_name = split_profile_spec(voice) - resolved_provider = tts_provider if tts_provider in {"kokoro", "supertonic"} else "" + camb_model = str(payload.get("camb_model") or "").strip() or settings.get("camb_model", "mars-flash") + camb_voice_id = int(payload.get("camb_voice_id") or settings.get("camb_voice_id", 147320)) + camb_api_key = str(payload.get("camb_api_key") or "").strip() or settings.get("camb_api_key") or None + camb_language = str(payload.get("camb_language") or "").strip() or settings.get("camb_language", "en-us") + + resolved_provider = tts_provider if tts_provider in {"kokoro", "supertonic", "camb"} else "" if speaker_name: entry = normalize_profile_entry(load_profiles().get(speaker_name)) @@ -241,6 +273,12 @@ def 
api_speaker_preview() -> ResponseReturnValue: supertonic_total_steps = int(entry.get("total_steps") or supertonic_total_steps) if speed_value is None: speed = coerce_float(entry.get("speed"), speed) + elif resolved_provider == "camb": + camb_voice_id = int(entry.get("voice_id") or camb_voice_id) + camb_model = str(entry.get("model") or camb_model) + voice = str(camb_voice_id) + if speed_value is None: + speed = coerce_float(entry.get("speed"), speed) elif resolved_provider == "kokoro": voice = formula_from_profile(entry) or (base_spec or voice) @@ -270,6 +308,10 @@ def api_speaker_preview() -> ResponseReturnValue: , tts_provider=resolved_provider, supertonic_total_steps=supertonic_total_steps or int(settings.get("supertonic_total_steps") or 5), + camb_model=camb_model, + camb_voice_id=camb_voice_id, + camb_api_key=camb_api_key, + camb_language=camb_language, pronunciation_overrides=pronunciation_overrides, manual_overrides=manual_overrides, speakers=speakers, diff --git a/abogen/webui/routes/settings.py b/abogen/webui/routes/settings.py index d0bb991..e22425e 100644 --- a/abogen/webui/routes/settings.py +++ b/abogen/webui/routes/settings.py @@ -51,6 +51,19 @@ def update_settings() -> ResponseReturnValue: current["supertonic_speed"] = max(0.7, min(2.0, float(form.get("supertonic_speed", current.get("supertonic_speed", 1.0))))) except (TypeError, ValueError): pass + camb_model = (form.get("camb_model") or "").strip().lower() + if camb_model in {"mars-flash", "mars-pro", "mars-instruct"}: + current["camb_model"] = camb_model + try: + current["camb_voice_id"] = max(1, int(form.get("camb_voice_id", current.get("camb_voice_id", 147320)))) + except (TypeError, ValueError): + pass + camb_api_key = form.get("camb_api_key") + if camb_api_key is not None: + current["camb_api_key"] = camb_api_key.strip() + camb_language = (form.get("camb_language") or "").strip().lower() + if camb_language: + current["camb_language"] = camb_language current["output_format"] = 
(form.get("output_format") or "mp3").strip() current["subtitle_mode"] = (form.get("subtitle_mode") or "Disabled").strip() current["subtitle_format"] = (form.get("subtitle_format") or "srt").strip() diff --git a/abogen/webui/routes/utils/form.py b/abogen/webui/routes/utils/form.py index 41d605e..c1f3451 100644 --- a/abogen/webui/routes/utils/form.py +++ b/abogen/webui/routes/utils/form.py @@ -579,9 +579,26 @@ def _extract_checkbox(name: str, default: bool) -> bool: # spec (e.g. "speaker:Name" for saved speakers, or a Kokoro mix formula). # This enables mixed-provider conversions (e.g. narrator=SuperTonic, characters=Kokoro). provider_value = str(form.get("tts_provider") or "").strip().lower() - if provider_value in {"kokoro", "supertonic"}: + if provider_value in {"kokoro", "supertonic", "camb"}: pending.tts_provider = provider_value + # Camb AI specific fields + camb_model = str(form.get("camb_model") or "").strip().lower() + if camb_model in {"mars-flash", "mars-pro", "mars-instruct"}: + pending.camb_model = camb_model + camb_voice_id = form.get("camb_voice_id") + if camb_voice_id is not None: + try: + pending.camb_voice_id = int(camb_voice_id) + except (TypeError, ValueError): + pass + camb_api_key = form.get("camb_api_key") + if camb_api_key is not None: + pending.camb_api_key = str(camb_api_key).strip() or None + camb_language = str(form.get("camb_language") or "").strip().lower() + if camb_language: + pending.camb_language = camb_language + # Determine the base speaker selection (saved speaker ref or raw voice). 
narrator_voice_raw = ( form.get("voice") diff --git a/abogen/webui/routes/utils/preview.py b/abogen/webui/routes/utils/preview.py index 95c3040..cb68308 100644 --- a/abogen/webui/routes/utils/preview.py +++ b/abogen/webui/routes/utils/preview.py @@ -60,6 +60,10 @@ def generate_preview_audio( use_gpu: bool, tts_provider: str = "kokoro", supertonic_total_steps: int = 5, + camb_model: str = "mars-flash", + camb_voice_id: int = 147320, + camb_api_key: Optional[str] = None, + camb_language: str = "en-us", max_seconds: float = 8.0, pronunciation_overrides: Optional[Iterable[Mapping[str, Any]]] = None, manual_overrides: Optional[Iterable[Mapping[str, Any]]] = None, @@ -94,7 +98,7 @@ def __init__(self): source_text = text normalized_text = source_text - if provider != "supertonic": + if provider not in ("supertonic", "camb"): try: from abogen.kokoro_text_normalization import normalize_for_pipeline @@ -103,7 +107,28 @@ def __init__(self): current_app.logger.exception("Preview normalization failed; using raw text") normalized_text = source_text - if provider == "supertonic": + if provider == "camb": + from abogen.tts_camb import CambPipeline + + camb_pipeline = CambPipeline( + sample_rate=SAMPLE_RATE, + api_key=camb_api_key, + model=camb_model, + voice_id=camb_voice_id, + language=camb_language, + ) + voice_id = camb_voice_id + try: + voice_id = int(voice_spec) + except (TypeError, ValueError): + pass + segments = camb_pipeline( + normalized_text, + voice=voice_id, + speed=speed, + split_pattern=SPLIT_PATTERN, + ) + elif provider == "supertonic": from abogen.tts_supertonic import SupertonicPipeline pipeline = SupertonicPipeline(sample_rate=SAMPLE_RATE, auto_download=True, total_steps=supertonic_total_steps) @@ -177,6 +202,10 @@ def synthesize_preview( use_gpu: bool, tts_provider: str = "kokoro", supertonic_total_steps: int = 5, + camb_model: str = "mars-flash", + camb_voice_id: int = 147320, + camb_api_key: Optional[str] = None, + camb_language: str = "en-us", max_seconds: 
float = 8.0, pronunciation_overrides: Optional[Iterable[Mapping[str, Any]]] = None, manual_overrides: Optional[Iterable[Mapping[str, Any]]] = None, @@ -191,6 +220,10 @@ def synthesize_preview( use_gpu=use_gpu, tts_provider=tts_provider, supertonic_total_steps=supertonic_total_steps, + camb_model=camb_model, + camb_voice_id=camb_voice_id, + camb_api_key=camb_api_key, + camb_language=camb_language, max_seconds=max_seconds, pronunciation_overrides=pronunciation_overrides, manual_overrides=manual_overrides, diff --git a/abogen/webui/routes/utils/service.py b/abogen/webui/routes/utils/service.py index b48f94a..6007566 100644 --- a/abogen/webui/routes/utils/service.py +++ b/abogen/webui/routes/utils/service.py @@ -26,6 +26,10 @@ def submit_job(pending: PendingJob) -> str: voice=pending.voice, speed=pending.speed, supertonic_total_steps=getattr(pending, "supertonic_total_steps", 5), + camb_model=getattr(pending, "camb_model", "mars-flash"), + camb_voice_id=getattr(pending, "camb_voice_id", 147320), + camb_api_key=getattr(pending, "camb_api_key", None), + camb_language=getattr(pending, "camb_language", "en-us"), use_gpu=pending.use_gpu, subtitle_mode=pending.subtitle_mode, output_format=pending.output_format, diff --git a/abogen/webui/routes/utils/settings.py b/abogen/webui/routes/utils/settings.py index c96a66c..980ae56 100644 --- a/abogen/webui/routes/utils/settings.py +++ b/abogen/webui/routes/utils/settings.py @@ -177,6 +177,10 @@ def settings_defaults() -> Dict[str, Any]: "default_voice": VOICES_INTERNAL[0] if VOICES_INTERNAL else "", "supertonic_total_steps": 5, "supertonic_speed": 1.0, + "camb_model": "mars-flash", + "camb_voice_id": 147320, + "camb_api_key": "", + "camb_language": "en-us", "replace_single_newlines": False, "use_gpu": True, "save_chapters_separately": False, diff --git a/abogen/webui/routes/utils/voice.py b/abogen/webui/routes/utils/voice.py index 3d9081a..b5dd586 100644 --- a/abogen/webui/routes/utils/voice.py +++ 
b/abogen/webui/routes/utils/voice.py @@ -675,6 +675,12 @@ def resolve_voice_choice( profile_language = (entry or {}).get("language") if profile_language: resolved_language = str(profile_language) + elif provider == "camb": + resolved_voice = f"speaker:{profile_name}" + selected_profile = profile_name + profile_language = (entry or {}).get("language") + if profile_language: + resolved_language = str(profile_language) else: formula = formula_from_profile(entry or {}) if entry else None if formula: diff --git a/abogen/webui/service.py b/abogen/webui/service.py index a04e9ce..a34070a 100644 --- a/abogen/webui/service.py +++ b/abogen/webui/service.py @@ -112,6 +112,10 @@ class Job: created_at: float tts_provider: str = "kokoro" supertonic_total_steps: int = 5 + camb_model: str = "mars-flash" + camb_voice_id: int = 147320 + camb_api_key: Optional[str] = None + camb_language: str = "en-us" save_chapters_separately: bool = False merge_chapters_at_end: bool = True separate_chapters_format: str = "wav" @@ -553,6 +557,10 @@ class PendingJob: created_at: float tts_provider: str = "kokoro" supertonic_total_steps: int = 5 + camb_model: str = "mars-flash" + camb_voice_id: int = 147320 + camb_api_key: Optional[str] = None + camb_language: str = "en-us" cover_image_path: Optional[Path] = None cover_image_mime: Optional[str] = None chapter_intro_delay: float = 0.5 @@ -622,6 +630,10 @@ def enqueue( speed: float, tts_provider: str = "kokoro", supertonic_total_steps: int = 5, + camb_model: str = "mars-flash", + camb_voice_id: int = 147320, + camb_api_key: Optional[str] = None, + camb_language: str = "en-us", use_gpu: bool, subtitle_mode: str, output_format: str, @@ -675,6 +687,10 @@ def enqueue( speed=speed, tts_provider=tts_provider, supertonic_total_steps=int(supertonic_total_steps or 5), + camb_model=camb_model or "mars-flash", + camb_voice_id=int(camb_voice_id or 147320), + camb_api_key=camb_api_key, + camb_language=camb_language or "en-us", use_gpu=use_gpu, 
subtitle_mode=subtitle_mode, output_format=output_format, @@ -836,6 +852,10 @@ def retry(self, job_id: str) -> Optional[Job]: language=job.language, voice=job.voice, speed=job.speed, + camb_model=job.camb_model, + camb_voice_id=job.camb_voice_id, + camb_api_key=job.camb_api_key, + camb_language=job.camb_language, use_gpu=job.use_gpu, subtitle_mode=job.subtitle_mode, output_format=job.output_format, diff --git a/abogen/webui/static/voices.js b/abogen/webui/static/voices.js index 5341c56..e68a3c1 100644 --- a/abogen/webui/static/voices.js +++ b/abogen/webui/static/voices.js @@ -33,6 +33,12 @@ const setupVoiceMixer = () => { const supertonicSpeedInput = app.querySelector('[data-role="supertonic-speed"]'); const supertonicStepsLabel = app.querySelector('[data-role="supertonic-steps-display"]'); const supertonicSpeedLabel = app.querySelector('[data-role="supertonic-speed-display"]'); + const cambPanelEl = app.querySelector('[data-role="camb-panel"]'); + const cambVoiceIdInput = app.querySelector('[data-role="camb-voice-id"]'); + const cambModelSelect = app.querySelector('[data-role="camb-model"]'); + const cambLanguageSelect = app.querySelector('[data-role="camb-language"]'); + const cambSpeedInput = app.querySelector('[data-role="camb-speed"]'); + const cambSpeedLabel = app.querySelector('[data-role="camb-speed-display"]'); const speedInput = document.getElementById("preview-speed"); const importInput = document.getElementById("voice-import-input"); const headerActions = document.querySelector(".voice-mixer__header-actions"); @@ -83,6 +89,12 @@ const setupVoiceMixer = () => { total_steps: 5, speed: 1.0, }, + camb: { + voice_id: 147320, + model: "mars-flash", + language: "en-us", + speed: 1.0, + }, }, languageFilter: voiceFilterSelect ? voiceFilterSelect.value : "", genderFilter: "", @@ -150,7 +162,9 @@ const setupVoiceMixer = () => { const normalizeProvider = (value) => { const candidate = String(value || "").trim().toLowerCase(); - return candidate === "supertonic" ? 
"supertonic" : "kokoro"; + if (candidate === "supertonic") return "supertonic"; + if (candidate === "camb") return "camb"; + return "kokoro"; }; const getProviderCatalog = () => { @@ -171,6 +185,9 @@ const setupVoiceMixer = () => { if (provider === "supertonic") { return "Voice selection + quality/speed per speaker."; } + if (provider === "camb") { + return "Cloud TTS with MARS models. Requires API key."; + } return "Voice mixing supported via the Kokoro mixer."; }; @@ -268,26 +285,31 @@ const setupVoiceMixer = () => { const applyProviderToUI = () => { const provider = normalizeProvider(state.draft.provider); const isSupertonic = provider === "supertonic"; + const isCamb = provider === "camb"; + const isKokoro = provider === "kokoro"; if (providerSelect) { providerSelect.value = provider; } if (languageField) { - languageField.hidden = isSupertonic; + languageField.hidden = !isKokoro; } if (kokoroMixerEl) { - kokoroMixerEl.hidden = isSupertonic; + kokoroMixerEl.hidden = !isKokoro; } if (supertonicPanelEl) { supertonicPanelEl.hidden = !isSupertonic; } + if (cambPanelEl) { + cambPanelEl.hidden = !isCamb; + } if (mixTotalEl) { - mixTotalEl.hidden = isSupertonic; + mixTotalEl.hidden = !isKokoro; } if (previewBtn) { - previewBtn.dataset.label = isSupertonic ? "Preview speaker" : (previewBtn.dataset.label || "Preview speaker"); + previewBtn.dataset.label = isKokoro ? (previewBtn.dataset.label || "Preview speaker") : "Preview speaker"; } - // Keep preview speed aligned with the Supertonic speaker speed. + // Keep preview speed aligned with provider speaker speed. if (isSupertonic && speedInput) { const desired = Number(state.draft.supertonic?.speed ?? 1.0); if (!Number.isNaN(desired)) { @@ -295,6 +317,13 @@ const setupVoiceMixer = () => { setRangeFill(speedInput); } } + if (isCamb && speedInput) { + const desired = Number(state.draft.camb?.speed ?? 
1.0); + if (!Number.isNaN(desired)) { + speedInput.value = String(desired); + setRangeFill(speedInput); + } + } }; const updateMixSummary = () => { @@ -311,6 +340,8 @@ const setupVoiceMixer = () => { const profileLabel = state.draft.name ? `Editing: ${state.draft.name}` : "Unsaved speaker"; if (isSupertonic) { profileSummaryEl.textContent = `${profileLabel} · Supertonic`; + } else if (provider === "camb") { + profileSummaryEl.textContent = `${profileLabel} · Camb AI`; } else { profileSummaryEl.textContent = `${profileLabel} · ${voiceCount} voice${voiceCount === 1 ? "" : "s"}`; } @@ -607,6 +638,23 @@ const setupVoiceMixer = () => { const speed = Number(state.draft.supertonic?.speed ?? 1.0); supertonicSpeedLabel.textContent = `${(Number.isFinite(speed) ? speed : 1.0).toFixed(2)}×`; } + if (cambVoiceIdInput) { + cambVoiceIdInput.value = String(state.draft.camb?.voice_id ?? 147320); + } + if (cambModelSelect) { + cambModelSelect.value = state.draft.camb?.model || "mars-flash"; + } + if (cambLanguageSelect) { + cambLanguageSelect.value = state.draft.camb?.language || "en-us"; + } + if (cambSpeedInput) { + cambSpeedInput.value = String(state.draft.camb?.speed ?? 1.0); + setRangeFill(cambSpeedInput); + } + if (cambSpeedLabel) { + const speed = Number(state.draft.camb?.speed ?? 1.0); + cambSpeedLabel.textContent = `${(Number.isFinite(speed) ? speed : 1.0).toFixed(2)}×`; + } applyProviderToUI(); renderSelectedVoices(); updateMixSummary(); @@ -650,7 +698,7 @@ const setupVoiceMixer = () => { selectBtn.dataset.name = name; const profile = profiles[name] || {}; const provider = normalizeProvider(profile.provider); - const providerLabel = provider === "supertonic" ? "Supertonic" : "Kokoro"; + const providerLabel = provider === "supertonic" ? "Supertonic" : provider === "camb" ? 
"Camb AI" : "Kokoro"; selectBtn.innerHTML = ` ${name} ${providerLabel} ${voiceLanguageLabel(profile.language || "a")} @@ -704,6 +752,12 @@ const setupVoiceMixer = () => { total_steps: Number(profile?.total_steps ?? 5), speed: Number(profile?.speed ?? 1.0), }, + camb: { + voice_id: Number(profile?.voice_id ?? 147320), + model: profile?.model || "mars-flash", + language: profile?.language || "en-us", + speed: Number(profile?.speed ?? 1.0), + }, }; if (provider === "kokoro" && Array.isArray(profile?.voices)) { profile.voices.forEach((entry) => { @@ -736,6 +790,12 @@ const setupVoiceMixer = () => { total_steps: 5, speed: 1.0, }, + camb: { + voice_id: 147320, + model: "mars-flash", + language: "en-us", + speed: 1.0, + }, }; applyDraftToControls(); renderProfileList(); @@ -796,15 +856,19 @@ const setupVoiceMixer = () => { setStatus("Give your profile a name first.", "warning"); return; } + const provider = normalizeProvider(state.draft.provider); const payload = { name, originalName: state.originalName, - provider: normalizeProvider(state.draft.provider), - language: normalizeProvider(state.draft.provider) === "kokoro" ? (languageSelect ? languageSelect.value : "a") : "a", - voices: normalizeProvider(state.draft.provider) === "kokoro" ? buildProfilePayload() : [], + provider, + language: provider === "kokoro" ? (languageSelect ? languageSelect.value : "a") : "a", + voices: provider === "kokoro" ? buildProfilePayload() : [], voice: state.draft.supertonic?.voice, total_steps: state.draft.supertonic?.total_steps, - speed: state.draft.supertonic?.speed, + speed: provider === "camb" ? 
state.draft.camb?.speed : state.draft.supertonic?.speed, + camb_voice_id: state.draft.camb?.voice_id, + camb_model: state.draft.camb?.model, + camb_language: state.draft.camb?.language, }; try { const response = await fetch("/api/voice-profiles", { @@ -925,6 +989,12 @@ const setupVoiceMixer = () => { setStatus("Enable at least one voice to preview.", "warning"); return; } + } else if (provider === "camb") { + payload.tts_provider = "camb"; + payload.camb_voice_id = state.draft.camb?.voice_id || 147320; + payload.camb_model = state.draft.camb?.model || "mars-flash"; + payload.camb_language = state.draft.camb?.language || "en-us"; + payload.voice_id = payload.camb_voice_id; } else { if (!payload.voice) { setStatus("Select a Supertonic voice to preview.", "warning"); @@ -1004,8 +1074,8 @@ const setupVoiceMixer = () => { if (providerSelect) { providerSelect.addEventListener("change", () => { state.draft.provider = normalizeProvider(providerSelect.value); - // When switching to Supertonic, clear Kokoro mix. - if (state.draft.provider === "supertonic") { + // When switching away from Kokoro, clear Kokoro mix. 
+ if (state.draft.provider !== "kokoro") { state.draft.voices = new Map(); } applyDraftToControls(); @@ -1056,6 +1126,47 @@ const setupVoiceMixer = () => { setRangeFill(supertonicSpeedInput); } + if (cambVoiceIdInput) { + cambVoiceIdInput.addEventListener("change", () => { + state.draft.camb.voice_id = parseInt(cambVoiceIdInput.value, 10) || 147320; + markDirty(); + updateMixSummary(); + }); + } + + if (cambModelSelect) { + cambModelSelect.addEventListener("change", () => { + state.draft.camb.model = cambModelSelect.value || "mars-flash"; + markDirty(); + }); + } + + if (cambLanguageSelect) { + cambLanguageSelect.addEventListener("change", () => { + state.draft.camb.language = cambLanguageSelect.value || "en-us"; + markDirty(); + }); + } + + if (cambSpeedInput) { + cambSpeedInput.addEventListener("input", () => { + const value = parseFloat(cambSpeedInput.value || "1"); + const normalized = clamp(value, 0.5, 2.0); + state.draft.camb.speed = normalized; + cambSpeedInput.value = normalized.toFixed(2); + if (cambSpeedLabel) { + cambSpeedLabel.textContent = `${normalized.toFixed(2)}×`; + } + setRangeFill(cambSpeedInput); + if (speedInput) { + speedInput.value = String(normalized); + setRangeFill(speedInput); + } + markDirty(); + }); + setRangeFill(cambSpeedInput); + } + if (voiceFilterSelect) { voiceFilterSelect.addEventListener("change", () => { state.languageFilter = voiceFilterSelect.value; @@ -1071,7 +1182,8 @@ const setupVoiceMixer = () => { } setRangeFill(speedInput); - if (normalizeProvider(state.draft.provider) === "supertonic") { + const currentProvider = normalizeProvider(state.draft.provider); + if (currentProvider === "supertonic") { state.draft.supertonic.speed = clamp(speed, 0.7, 2.0); if (supertonicSpeedInput) { supertonicSpeedInput.value = state.draft.supertonic.speed.toFixed(2); @@ -1081,6 +1193,16 @@ const setupVoiceMixer = () => { supertonicSpeedLabel.textContent = `${state.draft.supertonic.speed.toFixed(2)}×`; } } + if (currentProvider === "camb") { 
+ state.draft.camb.speed = clamp(speed, 0.5, 2.0); + if (cambSpeedInput) { + cambSpeedInput.value = state.draft.camb.speed.toFixed(2); + setRangeFill(cambSpeedInput); + } + if (cambSpeedLabel) { + cambSpeedLabel.textContent = `${state.draft.camb.speed.toFixed(2)}×`; + } + } }; speedInput.addEventListener("input", updatePreviewSpeedLabel); updatePreviewSpeedLabel(); diff --git a/abogen/webui/templates/settings.html b/abogen/webui/templates/settings.html index 7981834..0b5afc6 100644 --- a/abogen/webui/templates/settings.html +++ b/abogen/webui/templates/settings.html @@ -113,6 +113,33 @@

Application Settings

+ +
+

Camb AI settings

+

These defaults apply when a Camb AI speaker does not override them.

+
+
+ + +

Get your key at studio.camb.ai. If left blank, the CAMB_API_KEY environment variable is used instead.

+
+
+ + +
+
+ + +
+
+ + +
+
+
@@ -84,6 +85,46 @@

Speaker Studio

Supertonic voice mixing is not implemented yet. Stub target: Supertonic-Voice-Mixer.

+
Select or create a profile to begin. diff --git a/pyproject.toml b/pyproject.toml index 3722fb5..753ab8d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,7 @@ dependencies = [ "kokoro>=0.9.4", "misaki[zh]>=0.9.4", "supertonic>=0.1.0", + "camb-sdk", "ebooklib>=0.19", "beautifulsoup4>=4.13.4", "spacy>=3.8.7,<4.0",