From 3e7465fd29b5577c72b0e766104bee3830fda0e4 Mon Sep 17 00:00:00 2001 From: mukunda katta Date: Tue, 14 Apr 2026 23:22:52 -0700 Subject: [PATCH] docs(voice-clone): clarify --prompt-text scope, length tuning, profile reuse (#9) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CLI help on `--prompt-text` (in both `infer.py` and `moss_tts_nano/cli.py`) said it was "used by continuation mode" — but `model.inference` accepts `prompt_text` for voice_clone mode too, and supplying it improves cloning quality. Update the help to reflect that. Also adds a "Voice cloning details" subsection to README.md and README_zh.md that addresses the three questions from #9 directly: 1. Yes, you can pass the source audio's transcript via --prompt-text / --prompt-text-file. It works for both modes. 2. Reference audio length: no enforced limit, but ~3–10 seconds of clean single-speaker speech tends to give the best results. Acknowledges the empirical observation that very short or very long clips degrade output, with a concrete suggestion (clip ~5 s). 3. There's no separate "voice profile" cache yet — keep the model loaded in process (via `python -i infer.py`, `moss-tts-nano serve`, or a reused `MossTtsNanoRuntime`) and call inference repeatedly with the same prompt args. No behavioural change; help text + docs only. Closes #9. --- README.md | 35 +++++++++++++++++++++++++++++++++++ README_zh.md | 23 +++++++++++++++++++++++ infer.py | 14 ++++++++++++-- moss_tts_nano/cli.py | 7 ++++++- 4 files changed, 76 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 54a565d..952a6bc 100644 --- a/README.md +++ b/README.md @@ -147,6 +147,41 @@ python infer.py \ This writes audio to `generated_audio/infer_output.wav` by default. +#### Voice cloning details + +A few questions come up frequently (see [#9](https://github.com/OpenMOSS/MOSS-TTS-Nano/issues/9)): + +1. **Can I pass the transcript of the reference audio?** + Yes — use `--prompt-text "<transcript>"` (or `--prompt-text-file path.txt`). + It is honoured by both `--mode voice_clone` (the default) and + `--mode continuation`. Supplying it generally improves cloning quality + because the model can align text-to-audio for the prompt clip. + + ```bash + python infer.py \ + --prompt-audio-path assets/audio/zh_1.wav \ + --prompt-text "欢迎收听今日新闻播报。" \ + --text "今天的天气非常好。" + ``` + +2. **What length should the reference audio be?** + We don't enforce a hard limit — the audio tokenizer accepts arbitrary + lengths and the prompt is internally clipped by + `--max-new-frames` / `--voice-clone-max-text-tokens`. Empirically, + short clips (≈ 3–10 seconds) of *clean* speech tend to give the best + results: long clips spend more of the model's prompt budget on + acoustic context, and very short ones (< 2 s) often don't carry + enough timbre. If you see degraded output, try clipping a clean, + single-speaker passage at around 5 seconds. + +3. **How do I cache a voice profile across multiple generations?** + There's no separate "voice profile" object yet — the cleanest pattern + is to keep the model loaded in process (e.g. via `python -i infer.py`, + `moss-tts-nano serve`, or by reusing a `MossTtsNanoRuntime` instance + in your own script) and call `model.inference(...)` repeatedly with + the same `prompt_audio_path` and `prompt_text`. The audio tokenizer + will re-encode the prompt each call, but the model weights stay warm. + ### Local Web Demo with `app.py` You can launch the local FastAPI demo for browser-based testing: diff --git a/README_zh.md b/README_zh.md index a4077e8..0196f2d 100644 --- a/README_zh.md +++ b/README_zh.md @@ -142,6 +142,29 @@ python infer.py \ 默认情况下,这会将音频写入 `generated_audio/infer_output.wav`。 +#### 语音克隆细节 + +社区里反复出现的几个问题(参见 [#9](https://github.com/OpenMOSS/MOSS-TTS-Nano/issues/9)): + +1. **可以传入参考音频的转写文本吗?** + 可以 — 使用 `--prompt-text "<转写>"`(或 `--prompt-text-file path.txt`)。 + `--mode voice_clone`(默认)和 `--mode continuation` 都支持。提供该转写 + 通常能提升克隆质量,因为模型可以将提示片段的文本与音频对齐。 + +2. **参考音频应该多长?** + 没有硬性限制 — 音频 tokenizer 接受任意长度,提示部分会受 + `--max-new-frames` / `--voice-clone-max-text-tokens` 内部裁剪。经验上, + 3–10 秒左右的*干净*单人语音效果最好:过长的片段会把模型的提示预算 + 花在声学上下文上,过短(< 2 秒)的片段又难以承载足够音色信息。 + 如果输出质量下降,建议截取一段约 5 秒的清晰单人语音重试。 + +3. **如何在多次生成之间缓存语音 profile?** + 目前没有独立的"语音 profile"对象 — 最干净的做法是让模型驻留在进程中 + (例如 `python -i infer.py`、`moss-tts-nano serve`,或在脚本中复用 + `MossTtsNanoRuntime` 实例),然后用相同的 `prompt_audio_path` 和 + `prompt_text` 反复调用 `model.inference(...)`。音频 tokenizer 每次都会 + 重新编码提示,但模型权重保持加载状态。 + ### 使用 `app.py` 启动本地 Web 演示 您可以启动本地 FastAPI 演示进行基于浏览器的测试: diff --git a/infer.py b/infer.py index 99844b8..0a32e03 100644 --- a/infer.py +++ b/infer.py @@ -53,8 +53,18 @@ def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace: text_group.add_argument("--text-file", help="Path to a UTF-8 text file to synthesize.") prompt_text_group = parser.add_mutually_exclusive_group(required=False) - prompt_text_group.add_argument("--prompt-text", help="Reference transcript used by continuation mode.") - prompt_text_group.add_argument("--prompt-text-file", help="UTF-8 reference transcript file used by continuation mode.") + prompt_text_group.add_argument( + "--prompt-text", + help=( + "Transcript of the reference audio. Used by both continuation mode and " + "voice_clone mode — supplying it generally improves cloning quality " + "because the model can align text-to-audio for the prompt clip." + ), + ) + prompt_text_group.add_argument( + "--prompt-text-file", + help="UTF-8 file alternative to --prompt-text. Same behaviour for both modes.", + ) parser.add_argument("--text-tokenizer-path", default=None, help="Override the checkpoint-bundled text tokenizer.") parser.add_argument( diff --git a/moss_tts_nano/cli.py b/moss_tts_nano/cli.py index b7ca2a4..165b221 100644 --- a/moss_tts_nano/cli.py +++ b/moss_tts_nano/cli.py @@ -71,7 +71,12 @@ def _build_parser() -> argparse.ArgumentParser: generate_parser.add_argument( "--prompt-text", default=None, - help="PyTorch backend only. Reference transcript used by continuation mode.", + help=( + "PyTorch backend only. Transcript of the reference audio. Used by " + "both continuation and voice_clone modes, and supplying it " + "generally improves cloning quality because the model can align " + "text-to-audio for the prompt." + ), ) generate_parser.add_argument( "--voice",