Skip to content

Commit fe5c2f4

Browse files
committed
updated TTS support
1 parent 6e66127 commit fe5c2f4

File tree

20 files changed

+609
-282
lines changed

20 files changed

+609
-282
lines changed

data/audio/tts/.gitkeep

Whitespace-only changes.

data/models/tts/.gitkeep

Whitespace-only changes.

packages/audio/xtts/test.py

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,6 @@
1010
#print(TTS().list_models())
1111

1212
model="tts_models/multilingual/multi-dataset/xtts_v2" # "tts_models/multilingual/multi-dataset/xtts_v1.1"
13-
speaker_wav="/data/audio/tts/xtts_v2_claribel_dervla_0.wav"
14-
speaker="Claribel Dervla"
1513
language='en'
1614

1715
print(f"Loading TTS model {model}")
@@ -30,21 +28,20 @@
3028

3129
# Text to speech to a file
3230
prompts = [
33-
"Hello world!",
34-
"How are you today?",
35-
"The weather is 53 degrees out and rainy.",
36-
"What is your favorite food to eat for breakfast?",
37-
"Mine is a French toast with maple syrup and sausages.",
38-
"I'm a big fan of Sunday brunch and taking an early afternoon nap afterwards, of course."
31+
"Hello there, how are you today?",
32+
"The weather is 76 degrees out and sunny.",
33+
"Your first meeting is in an hour downtown, with normal traffic.",
34+
"Can I interest you in anything quick for breakfast?",
3935
]
4036

4137
if tts.is_multi_speaker:
4238
prompts = [' '.join(prompts)] + prompts
4339

44-
for prompt_idx, prompt in enumerate(prompts):
45-
wav = f"/data/audio/tts/{os.path.basename(model)}_offline_{speaker.lower().replace(' ', '_')}_{prompt_idx}.wav"
46-
print(f'\ngenerating "{prompt}" speaker="{speaker}" lang="{language}" wav="{wav}"\n')
47-
if tts.is_multi_speaker:
48-
tts.tts_to_file(text=prompt, speaker=speaker, language=language, file_path=wav)
49-
else:
50-
tts.tts_to_file(text=prompt, speaker_wav=speaker_wav, language=language, file_path=wav)
40+
for speaker in tts.synthesizer.tts_model.speaker_manager.speakers:
41+
for prompt_idx, prompt in enumerate(prompts):
42+
wav = f"/data/audio/tts/{os.path.basename(model)}_offline_{speaker.lower().replace(' ', '_')}_{prompt_idx}.wav"
43+
print(f'\ngenerating "{prompt}" speaker="{speaker}" lang="{language}" wav="{wav}"\n')
44+
if tts.is_multi_speaker:
45+
tts.tts_to_file(text=prompt, speaker=speaker, language=language, file_path=wav)
46+
else:
47+
tts.tts_to_file(text=prompt, speaker_wav=speaker_wav, language=language, file_path=wav)

packages/audio/xtts/test_stream.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -45,12 +45,10 @@
4545
speaker_embedding.to(device)
4646

4747
prompts = [
48-
"Hello world!",
49-
"How are you today?",
50-
"The weather is 53 degrees out and rainy.",
51-
"What is your favorite food to eat for breakfast?",
52-
"Mine is a French toast with maple syrup and sausages.",
53-
"I'm a big fan of Sunday brunch and taking an early afternoon nap afterwards, of course."
48+
"Hello there, how are you today?",
49+
"The weather is 76 degrees out and sunny.",
50+
"Your first meeting is in an hour downtown, with normal traffic.",
51+
"Can I interest you in anything quick for breakfast?",
5452
]
5553

5654
prompts = [' '.join(prompts)] + prompts

packages/audio/xtts/test_voices.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
#!/usr/bin/env python3
2+
import os
3+
import torch
4+
import pprint
5+
6+
from TTS.api import TTS
7+
8+
device = "cuda" if torch.cuda.is_available() else "cpu"
9+
10+
#print(TTS().list_models())
11+
12+
model="tts_models/multilingual/multi-dataset/xtts_v2" # "tts_models/multilingual/multi-dataset/xtts_v1.1"
13+
language='en'
14+
15+
print(f"Loading TTS model {model}")
16+
17+
tts = TTS(model).to(device)
18+
19+
print(dir(tts.synthesizer.tts_model.speaker_manager))
20+
print(tts.synthesizer.tts_model.speaker_manager)
21+
22+
print(f"\nMulti-speaker: {tts.is_multi_speaker}")
23+
24+
if tts.is_multi_speaker:
25+
print(f"\nSpeakers: {tts.synthesizer.tts_model.speaker_manager.name_to_id}")
26+
27+
print(f"\nLanguages: {tts.synthesizer.tts_model.language_manager.name_to_id}")
28+
29+
# Text to speech to a file
30+
prompts = [
31+
"Hello there, how are you today?",
32+
"The weather is 76 degrees out and sunny.",
33+
"Your first meeting is in an hour downtown, with normal traffic.",
34+
"Can I interest you in anything quick for breakfast?",
35+
]
36+
37+
if tts.is_multi_speaker:
38+
prompts = [' '.join(prompts)] #+ prompts
39+
40+
for speaker in tts.synthesizer.tts_model.speaker_manager.speakers:
41+
for prompt_idx, prompt in enumerate(prompts):
42+
wav = f"/data/audio/tts/{os.path.basename(model)}_offline_{speaker.lower().replace(' ', '_')}.wav"
43+
print(f'\ngenerating "{prompt}" speaker="{speaker}" lang="{language}" wav="{wav}"\n')
44+
if tts.is_multi_speaker:
45+
tts.tts_to_file(text=prompt, speaker=speaker, language=language, file_path=wav)
46+
else:
47+
tts.tts_to_file(text=prompt, speaker_wav=speaker_wav, language=language, file_path=wav)

packages/llm/local_llm/Dockerfile

Lines changed: 16 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,32 @@
11
#---
22
# name: local_llm
33
# group: llm
4-
# depends: [nanodb, mlc, riva-client:python, jetson-inference, torch2trt]
4+
# depends: [nanodb, mlc, riva-client:python, jetson-inference, torch2trt, xtts]
55
# requires: '>=34.1.0'
66
# docs: docs.md
77
#---
8-
# depends: [mlc:dev, awq:dev]
98
ARG BASE_IMAGE
109
FROM ${BASE_IMAGE}
1110

1211
WORKDIR /opt/local_llm/local_llm
1312

13+
ENV PYTHONPATH=${PYTHONPATH}:/opt/local_llm \
14+
SSL_KEY=/etc/ssl/private/localhost.key.pem \
15+
SSL_CERT=/etc/ssl/private/localhost.cert.pem
16+
1417
COPY requirements.txt .
1518

16-
RUN pip3 install --ignore-installed --no-cache-dir blinker
17-
RUN pip3 install --no-cache-dir --verbose -r requirements.txt
19+
RUN pip3 install --ignore-installed --no-cache-dir blinker && \
20+
pip3 install --no-cache-dir --verbose -r requirements.txt && \
21+
openssl req \
22+
-new \
23+
-newkey rsa:4096 \
24+
-days 3650 \
25+
-nodes \
26+
-x509 \
27+
-keyout ${SSL_KEY} \
28+
-out ${SSL_CERT} \
29+
-subj '/CN=localhost'
1830

1931
COPY *.py ./
2032

@@ -26,19 +38,4 @@ COPY utils utils
2638
COPY vision vision
2739
COPY web web
2840

29-
ENV PYTHONPATH=${PYTHONPATH}:/opt/local_llm
30-
31-
ENV SSL_KEY=/etc/ssl/private/localhost.key.pem
32-
ENV SSL_CERT=/etc/ssl/private/localhost.cert.pem
33-
34-
RUN openssl req \
35-
-new \
36-
-newkey rsa:4096 \
37-
-days 3650 \
38-
-nodes \
39-
-x509 \
40-
-keyout ${SSL_KEY} \
41-
-out ${SSL_CERT} \
42-
-subj '/CN=localhost'
43-
4441
WORKDIR /

packages/llm/local_llm/plugins/__init__.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,7 @@
88
from .rate_limit import RateLimit
99
from .process_proxy import ProcessProxy
1010

11-
from .audio import AudioOutputDevice, AudioOutputFile
11+
from .audio import *
1212
from .video import VideoSource, VideoOutput
1313

14-
from .asr import RivaASR
15-
from .tts import RivaTTS
16-
1714
from .nanodb import NanoDB
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#!/usr/bin/env python3
2+
from .audio_output import AudioOutputDevice, AudioOutputFile
3+
4+
from .tts import TTS, TTSPlugin
5+
from .xtts import XTTS
6+
7+
from .riva_asr import RivaASR
8+
from .riva_tts import RivaTTS

0 commit comments

Comments
 (0)