Commit
### Major Changes
- Added support for the OuteTTS-0.2-500M model
- Introduced default speaker presets for each supported language
- **Breaking Changes**:
  - Speaker files from versions <0.2.0 are incompatible
  - Revised interface usage (see README.md)

### New Features
- Added voice cloning guidelines and interface usage to README.md
- Implemented a Gradio example playground for OuteTTS-0.2-500M
- Multi-language alignment support
- Enhanced speaker management:
  - Methods: `print_default_speakers()` and `load_default_speaker(name)`
  - JSON format for speaker saving with language info
- Option to load WavTokenizer from a custom path (fixes #24)
- Support for initializing multiple interface versions

### Improvements
- Restructured library files for better organization
- Added hash verification for WavTokenizer downloads (fixes #3)
- Reworked interface for improved usability
- Made sounddevice optional with better error handling
- Included training data preparation examples

### Error Handling
- Improved validation for audio token detection
- Enhanced error messages for long inputs and EOS cases
- Better library-wide error handling and feedback
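For orientation, here is a minimal usage sketch of the revised interface and the new speaker-management methods, assembled from calls that appear elsewhere in this commit (the sample text and output filename are illustrative):

```python
import outetts

# Initialize the v1 interface for OuteTTS-0.2-500M (mirrors the examples in this commit)
cfg = outetts.HFModelConfig_v1(
    model_path="OuteAI/OuteTTS-0.2-500M",
    language="en",
)
interface = outetts.InterfaceHF(model_version="0.2", cfg=cfg)

# New speaker-management helpers introduced in this release
interface.print_default_speakers()
speaker = interface.load_default_speaker("male_1")

output = interface.generate(
    text="Hello, this is a quick test of OuteTTS-0.2-500M.",  # illustrative text
    speaker=speaker,
    temperature=0.1,
    repetition_penalty=1.1,
    max_length=4096,
)
output.save("output.wav")
```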
Showing 64 changed files with 15,954 additions and 208 deletions.
@@ -0,0 +1,72 @@
import os
import polars as pl
import torch
from tqdm import tqdm
import outetts

df = pl.read_parquet("sample.parquet")

language = "en"
device = "cuda"

interface = outetts.InterfaceHF(
    model_version="0.2",
    cfg=outetts.HFModelConfig_v1(
        model_path="OuteAI/OuteTTS-0.2-500M",
        language=language,
    )
)

# The language model itself is not needed for data preparation; free it to save memory.
del interface.model

ctc = outetts.CTCForcedAlignment([language], device)

def create_speaker(audio_path: str, transcript: str, language: str):
    # Word-level alignment of the transcript against the audio.
    words = ctc.align(audio_path, transcript, language)

    # Encode the concatenated word audio into codec tokens.
    full_codes = interface.audio_codec.encode(
        interface.audio_codec.convert_audio_tensor(
            audio=torch.cat([i["audio"] for i in words], dim=1),
            sr=ctc.sample_rate
        ).to(interface.audio_codec.device)
    ).tolist()

    data = []
    start = 0
    for i in words:
        # Convert the word end position from samples to an audio-token index (75 tokens per second).
        end = int(round((i["x1"] / ctc.sample_rate) * 75))
        word_tokens = full_codes[0][0][start:end]
        start = end
        if not word_tokens:
            word_tokens = [1]

        data.append({
            "word": i["word"],
            "duration": round(len(word_tokens) / 75, 2),
            "codes": word_tokens
        })

    return {
        "text": transcript,
        "words": data,
    }

data = []

for i in tqdm(df.to_dicts()):
    text = i["text"]
    language = i["language"]

    # Write the embedded audio bytes to a temporary file for alignment.
    file = i["audio"]["path"]
    with open(file, 'wb') as f:
        f.write(i["audio"]["bytes"])

    data.append(interface.prompt_processor.get_training_prompt(
        text=text,
        language=language,
        speaker=create_speaker(file, text, language)
    ))

    os.remove(file)

pl.DataFrame({"data": data}).write_parquet("processed_data.parquet")
@@ -0,0 +1,160 @@
import os
import gradio as gr
import outetts
from outetts.version.v1.interface import _DEFAULT_SPEAKERS

model_config = outetts.HFModelConfig_v1(
    model_path="OuteAI/OuteTTS-0.2-500M",
    language="en",
)
interface = outetts.InterfaceHF(model_version="0.2", cfg=model_config)

def get_available_speakers(language):
    """Get available speakers for the selected language."""
    if language not in interface.languages:
        return []
    speakers = list(_DEFAULT_SPEAKERS[language].keys())
    speakers.insert(0, "None")
    return speakers

def change_interface_language(language):
    """Change interface language and update available speakers."""
    try:
        interface.change_language(language)
        speakers = get_available_speakers(language)
        return gr.update(choices=speakers, value="male_1"), gr.update(visible=True)
    except ValueError as e:
        return gr.update(choices=["None"], value="None"), gr.update(visible=False)

def generate_tts(
    text, temperature, repetition_penalty, language,
    speaker_selection, reference_audio, reference_text
):
    """Generate TTS with error handling and new features."""
    try:
        # Validate inputs for custom speaker
        if reference_audio and reference_text:
            if not os.path.exists(reference_audio):
                raise ValueError("Reference audio file not found")
            if not reference_text.strip():
                raise ValueError("Reference transcription text is required")
            speaker = interface.create_speaker(reference_audio, reference_text)

        # Use selected default speaker
        elif speaker_selection and speaker_selection != "None":
            speaker = interface.load_default_speaker(speaker_selection)

        # No speaker - random characteristics
        else:
            speaker = None

        # Generate audio
        output = interface.generate(
            text=text,
            speaker=speaker,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            max_length=4096
        )

        # Verify output
        if output.audio is None:
            raise ValueError("Model failed to generate audio. This may be due to input length constraints or early EOS token.")

        # Save and return output
        output_path = "output.wav"
        output.save(output_path)
        return output_path, None

    except Exception as e:
        return None, str(e)

with gr.Blocks() as demo:
    gr.Markdown("# OuteTTS-0.2-500M Text-to-Speech Demo")

    error_box = gr.Textbox(label="Error Messages", visible=False)

    with gr.Row():
        with gr.Column():
            # Language selection
            language_dropdown = gr.Dropdown(
                choices=list(interface.languages),
                value="en",
                label="Interface Language"
            )

            # Speaker selection
            speaker_dropdown = gr.Dropdown(
                choices=get_available_speakers("en"),
                value="male_1",
                label="Speaker Selection"
            )

            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter text here..."
            )

            temperature = gr.Slider(
                0.1, 1.0,
                value=0.1,
                label="Temperature (lower = more stable tone, higher = more expressive)"
            )

            repetition_penalty = gr.Slider(
                0.5, 2.0,
                value=1.1,
                label="Repetition Penalty"
            )

            gr.Markdown("""
            ### Voice Cloning Guidelines:
            - Use 10-15 seconds of clear, noise-free audio
            - Provide accurate transcription
            - Longer audio clips will reduce maximum output length
            - Custom speaker overrides speaker selection
            """)

            reference_audio = gr.Audio(
                label="Reference Audio (for voice cloning)",
                type="filepath"
            )

            reference_text = gr.Textbox(
                label="Reference Transcription Text",
                placeholder="Enter exact transcription of reference audio"
            )

            submit_button = gr.Button("Generate Speech")

        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Audio",
                type="filepath"
            )

    language_dropdown.change(
        fn=change_interface_language,
        inputs=[language_dropdown],
        outputs=[speaker_dropdown, speaker_dropdown]
    )

    submit_button.click(
        fn=generate_tts,
        inputs=[
            text_input,
            temperature,
            repetition_penalty,
            language_dropdown,
            speaker_dropdown,
            reference_audio,
            reference_text
        ],
        outputs=[audio_output, error_box]
    ).then(
        fn=lambda x: gr.update(visible=bool(x)),
        inputs=[error_box],
        outputs=[error_box]
    )

demo.launch()
Binary file not shown.
@@ -0,0 +1,8 @@
# Training Instructions

The model can be trained similarly to other transformer-based models. An example of preparing a dataset is included in `examples/v1/data_creation.py`. After generating the dataset, you can begin training with your preferred library; some suggested libraries for supervised fine-tuning (SFT) are listed below, and a brief sketch follows the list.

- [Hugging Face's SFT Trainer](https://huggingface.co/docs/trl/sft_trainer)
- [TorchTune](https://github.com/pytorch/torchtune)

Refer to the respective documentation for detailed setup and instructions.
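As an illustration of the SFT route, here is a rough sketch using Hugging Face's SFT Trainer. It assumes the prompts produced by `examples/v1/data_creation.py` are plain text stored in the `data` column of `processed_data.parquet`; the hyperparameters are placeholders, and exact argument names can vary between `trl` versions:

```python
import polars as pl
from datasets import Dataset
from trl import SFTConfig, SFTTrainer

# Load the prompts written by the data preparation script.
df = pl.read_parquet("processed_data.parquet")
dataset = Dataset.from_dict({"text": df["data"].to_list()})

trainer = SFTTrainer(
    model="OuteAI/OuteTTS-0.2-500M",    # base model to fine-tune
    train_dataset=dataset,
    args=SFTConfig(
        output_dir="outetts-sft",       # placeholder output directory
        dataset_text_field="text",
        per_device_train_batch_size=1,  # placeholder hyperparameters
        num_train_epochs=1,
    ),
)
trainer.train()
```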
@@ -1 +1,5 @@
__version__ = "0.2.0"

from .interface import InterfaceHF, InterfaceGGUF, display_available_models
from .interface import HFModelConfig_v1, GGUFModelConfig_v1
from .version.v1.alignment import CTCForcedAlignment
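These exports make both interface backends available at the package top level. A rough sketch of how they might be used follows; the GGUF config fields are assumed to mirror the HF config, the local model path is a placeholder, and the README remains the authoritative reference:

```python
import outetts

# Presumably lists the supported models/interface versions (exported in this commit).
outetts.display_available_models()

# GGUF backend: config fields below are assumptions mirroring HFModelConfig_v1;
# the .gguf path is a hypothetical local file.
gguf_cfg = outetts.GGUFModelConfig_v1(
    model_path="local/OuteTTS-0.2-500M.gguf",
    language="en",
)
gguf_interface = outetts.InterfaceGGUF(model_version="0.2", cfg=gguf_cfg)
```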