rowanborder
diff --git a/data/audio/tts/.gitkeep b/data/audio/tts/.gitkeep
diff --git a/data/models/tts/.gitkeep b/data/models/tts/.gitkeep
diff --git a/packages/audio/xtts/test.py b/packages/audio/xtts/test.py
Lines changed: 12 additions & 15 deletions
diff --git a/packages/audio/xtts/test_stream.py b/packages/audio/xtts/test_stream.py
Lines changed: 4 additions & 6 deletions
diff --git a/packages/audio/xtts/test_voices.py b/packages/audio/xtts/test_voices.py
Lines changed: 47 additions & 0 deletions
diff --git a/packages/llm/local_llm/Dockerfile b/packages/llm/local_llm/Dockerfile
Lines changed: 16 additions & 19 deletions
diff --git a/packages/llm/local_llm/plugins/__init__.py b/packages/llm/local_llm/plugins/__init__.py
Lines changed: 1 addition & 4 deletions
diff --git a/packages/llm/local_llm/plugins/audio/__init__.py b/packages/llm/local_llm/plugins/audio/__init__.py
Lines changed: 8 additions & 0 deletions
diff --git a/packages/llm/local_llm/plugins/audio.py b/packages/llm/local_llm/plugins/audio/audio_output.py
rename from packages/llm/local_llm/plugins/audio.py
rename to packages/llm/local_llm/plugins/audio/audio_output.py
diff --git a/packages/llm/local_llm/plugins/asr.py b/packages/llm/local_llm/plugins/audio/riva_asr.py
rename from packages/llm/local_llm/plugins/asr.py
rename to packages/llm/local_llm/plugins/audio/riva_asr.py
@@ -10,8 +10,6 @@
 #print(TTS().list_models())
 
 model="tts_models/multilingual/multi-dataset/xtts_v2"  # "tts_models/multilingual/multi-dataset/xtts_v1.1"
-speaker_wav="/data/audio/tts/xtts_v2_claribel_dervla_0.wav"
-speaker="Claribel Dervla"
 language='en'
 
 print(f"Loading TTS model {model}")
@@ -30,21 +28,20 @@
 
 # Text to speech to a file
 prompts = [
-    "Hello world!", 
-    "How are you today?", 
-    "The weather is 53 degrees out and rainy.", 
-    "What is your favorite food to eat for breakfast?", 
-    "Mine is a French toast with maple syrup and sausages.", 
-    "I'm a big fan of Sunday brunch and taking an early afternoon nap afterwards, of course."
+    "Hello there, how are you today?", 
+    "The weather is 76 degrees out and sunny.", 
+    "Your first meeting is in an hour downtown, with normal traffic.",
+    "Can I interest you in anything quick for breakfast?",
 ]
 
 if tts.is_multi_speaker:
     prompts = [' '.join(prompts)] + prompts
 
-for prompt_idx, prompt in enumerate(prompts):
-    wav = f"/data/audio/tts/{os.path.basename(model)}_offline_{speaker.lower().replace(' ', '_')}_{prompt_idx}.wav"
-    print(f'\ngenerating "{prompt}"  speaker="{speaker}"  lang="{language}"  wav="{wav}"\n')
-    if tts.is_multi_speaker:
-        tts.tts_to_file(text=prompt, speaker=speaker, language=language, file_path=wav)
-    else:
-        tts.tts_to_file(text=prompt, speaker_wav=speaker_wav, language=language, file_path=wav)
+for speaker in tts.synthesizer.tts_model.speaker_manager.speakers:
+    for prompt_idx, prompt in enumerate(prompts):
+        wav = f"/data/audio/tts/{os.path.basename(model)}_offline_{speaker.lower().replace(' ', '_')}_{prompt_idx}.wav"
+        print(f'\ngenerating "{prompt}"  speaker="{speaker}"  lang="{language}"  wav="{wav}"\n')
+        if tts.is_multi_speaker:
+            tts.tts_to_file(text=prompt, speaker=speaker, language=language, file_path=wav)
+        else:
+            tts.tts_to_file(text=prompt, speaker_wav=speaker_wav, language=language, file_path=wav)
@@ -45,12 +45,10 @@
 speaker_embedding.to(device)
 
 prompts = [
-    "Hello world!", 
-    "How are you today?", 
-    "The weather is 53 degrees out and rainy.", 
-    "What is your favorite food to eat for breakfast?", 
-    "Mine is a French toast with maple syrup and sausages.", 
-    "I'm a big fan of Sunday brunch and taking an early afternoon nap afterwards, of course."
+    "Hello there, how are you today?", 
+    "The weather is 76 degrees out and sunny.", 
+    "Your first meeting is in an hour downtown, with normal traffic.",
+    "Can I interest you in anything quick for breakfast?",
 ]
 
 prompts = [' '.join(prompts)] + prompts
 
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+# Generate one sample WAV per built-in voice of a multi-speaker TTS model.
+import os
+import torch
+import pprint
+
+from TTS.api import TTS
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+#print(TTS().list_models())
+
+model="tts_models/multilingual/multi-dataset/xtts_v2"  # "tts_models/multilingual/multi-dataset/xtts_v1.1"
+language='en'
+
+print(f"Loading TTS model {model}")
+
+tts = TTS(model).to(device)
+speaker_manager = tts.synthesizer.tts_model.speaker_manager
+
+print(dir(speaker_manager))
+print(speaker_manager)
+
+print(f"\nMulti-speaker:  {tts.is_multi_speaker}")
+
+if tts.is_multi_speaker:
+    print(f"\nSpeakers:  {speaker_manager.name_to_id}")
+
+print(f"\nLanguages:  {tts.synthesizer.tts_model.language_manager.name_to_id}")
+
+# This script only enumerates built-in voices, and it defines no reference
+# speaker_wav to clone from, so stop here instead of failing inside the loop.
+if not tts.is_multi_speaker:
+    raise SystemExit(f"{model} is not multi-speaker, there are no built-in voices to synthesize")
+
+# The sentences are joined into one prompt so each speaker gets a single WAV file
+prompt = ' '.join([
+    "Hello there, how are you today?",
+    "The weather is 76 degrees out and sunny.",
+    "Your first meeting is in an hour downtown, with normal traffic.",
+    "Can I interest you in anything quick for breakfast?",
+])
+
+for speaker in speaker_manager.speakers:
+    wav = f"/data/audio/tts/{os.path.basename(model)}_offline_{speaker.lower().replace(' ', '_')}.wav"
+    print(f'\ngenerating "{prompt}"  speaker="{speaker}"  lang="{language}"  wav="{wav}"\n')
+    tts.tts_to_file(text=prompt, speaker=speaker, language=language, file_path=wav)
@@ -1,20 +1,32 @@
 #---
 # name: local_llm
 # group: llm
-# depends: [nanodb, mlc, riva-client:python, jetson-inference, torch2trt]
+# depends: [nanodb, mlc, riva-client:python, jetson-inference, torch2trt, xtts]
 # requires: '>=34.1.0'
 # docs: docs.md
 #---
-# depends: [mlc:dev, awq:dev]
 ARG BASE_IMAGE
 FROM ${BASE_IMAGE}
 
 WORKDIR /opt/local_llm/local_llm
 
+ENV PYTHONPATH=${PYTHONPATH}:/opt/local_llm \
+    SSL_KEY=/etc/ssl/private/localhost.key.pem \
+    SSL_CERT=/etc/ssl/private/localhost.cert.pem
+
 COPY requirements.txt .
 
-RUN pip3 install --ignore-installed --no-cache-dir blinker
-RUN pip3 install --no-cache-dir --verbose -r requirements.txt
+RUN pip3 install --ignore-installed --no-cache-dir blinker && \
+    pip3 install --no-cache-dir --verbose -r requirements.txt && \
+    openssl req \
+	-new \
+	-newkey rsa:4096 \
+	-days 3650 \
+	-nodes \
+	-x509 \
+	-keyout ${SSL_KEY} \
+	-out ${SSL_CERT} \
+	-subj '/CN=localhost'
 
 COPY *.py ./
 
@@ -26,19 +38,4 @@ COPY utils utils
 COPY vision vision
 COPY web web
 
-ENV PYTHONPATH=${PYTHONPATH}:/opt/local_llm
-
-ENV SSL_KEY=/etc/ssl/private/localhost.key.pem
-ENV SSL_CERT=/etc/ssl/private/localhost.cert.pem
-
-RUN openssl req \
-	-new \
-	-newkey rsa:4096 \
-	-days 3650 \
-	-nodes \
-	-x509 \
-	-keyout ${SSL_KEY} \
-	-out ${SSL_CERT} \
-	-subj '/CN=localhost'
-  
 WORKDIR /
@@ -8,10 +8,7 @@
 from .rate_limit import RateLimit
 from .process_proxy import ProcessProxy
 
-from .audio import AudioOutputDevice, AudioOutputFile
+from .audio import *
 from .video import VideoSource, VideoOutput
 
-from .asr import RivaASR
-from .tts import RivaTTS
-
 from .nanodb import NanoDB
@@ -0,0 +1,8 @@
+#!/usr/bin/env python3
+from .audio_output import AudioOutputDevice, AudioOutputFile
+
+from .tts import TTS, TTSPlugin
+from .xtts import XTTS
+
+from .riva_asr import RivaASR
+from .riva_tts import RivaTTS