
Commit

Text is now split by sentence (nltk) or optionally by words
h43lb1t0 committed Jan 17, 2025
1 parent 362006f commit a84e53a
Showing 3 changed files with 42 additions and 22 deletions.
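The sentence splitting relies on NLTK's Punkt tokenizer. As a quick illustration of the difference between the two modes, a minimal sketch (the sample text and prints are illustrative, not from the repository):

import nltk
from nltk.tokenize import sent_tokenize

# The Punkt sentence-boundary model must be downloaded once before use.
nltk.download('punkt')

sample = "Kokoro reads long passages. Each sentence becomes a candidate chunk. Word splitting is the fallback."

sentences = sent_tokenize(sample)   # sentence-based splitting (the new default)
words = sample.split()              # word-based splitting (the optional mode)

print(sentences)   # three sentence strings
print(len(words))  # individual words, useful when a single sentence exceeds the token limit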
Binary file modified requirements.txt
Binary file not shown.
12 changes: 9 additions & 3 deletions script.py
@@ -1,7 +1,7 @@
import pathlib
import html
import time
from extensions.KokoroTtsTexGernerationWebui.src.generate import run, load_voice
from extensions.KokoroTtsTexGernerationWebui.src.generate import run, load_voice, set_plitting_type
from extensions.KokoroTtsTexGernerationWebui.src.voices import VOICES
import gradio as gr
import time
@@ -26,19 +26,25 @@ def voice_preview():


def ui():
info = """Select a Voice. \nThe default voice is a 50-50 mix of Bella & Sarah\nVoices starting with 'a' are American
info_voice = """Select a Voice. \nThe default voice is a 50-50 mix of Bella & Sarah\nVoices starting with 'a' are American
English, voices with 'b' are British English"""
with gr.Accordion("Kokoro"):
voice = gr.Dropdown(choices=VOICES, value=VOICES[0], label="Voice", info=info, interactive=True)
voice = gr.Dropdown(choices=VOICES, value=VOICES[0], label="Voice", info=info_voice, interactive=True)

preview = gr.Button("Voice preview", type="secondary")

preview_output = gr.HTML()

info_splitting ="""Kokoro only supports 510 tokens. One method to split the text is by sentance (default), the otherway
is by word up to 510 tokens. """
spltting_method = gr.Radio(["Split by Sentence", "Split by Word"], info=info_splitting, value="Split by Sentence", label_lines=2, interactive=True)


voice.change(voice_update, voice)
preview.click(fn=voice_preview, outputs=preview_output)

spltting_method.change(set_plitting_type, spltting_method)




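For readers less familiar with Gradio, the event wiring added above follows the standard Radio-to-callback pattern. A standalone sketch of that pattern (component names and the Blocks demo are illustrative, not the extension's actual UI):

import gradio as gr

def on_splitting_change(method):
    # Receives the Radio's current value as a plain string.
    print(f"Splitting method selected: {method}")

with gr.Blocks() as demo:
    splitting_choice = gr.Radio(
        ["Split by Sentence", "Split by Word"],
        value="Split by Sentence",
        label="Text splitting",
        interactive=True,
    )
    # .change(fn, inputs): the component's value is passed to the callback on every change.
    splitting_choice.change(on_splitting_change, splitting_choice)

demo.launch()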
52 changes: 33 additions & 19 deletions src/generate.py
@@ -8,10 +8,12 @@
from huggingface_hub import snapshot_download
from modules import shared
from .voices import VOICES

from nltk.tokenize import sent_tokenize
import nltk


snapshot_download(repo_id="hexgrad/Kokoro-82M", cache_dir =pathlib.Path(__file__).parent, allow_patterns=["*.pth", "*.pt"])
nltk.download('punkt')

if os.name == 'nt':
os.environ["PHONEMIZER_ESPEAK_LIBRARY"] = r"C:\Program Files\eSpeak NG\libespeak-ng.dll"
@@ -47,37 +49,49 @@ def run(text, preview=False):

return msg_id

sentance_based = True

def set_plitting_type(method="Split by Sentence"):
global sentance_based
sentance_based = True if method == "Split by Sentence" else False
print(f'Splitting method: {"Sentence" if sentance_based else "Word"}')

set_plitting_type()

def split_text(text):

max_token = 510
words = text.split()
current_words = []
text_parts = sent_tokenize(text) if sentance_based else text.split()
current_text_parts = []
chunks = []
current_chunk_len = 0

tokenized_text = tokenize(phonemize(text, lang=voice_name[0]))
if len(tokenized_text) > max_token:
for word in words:
tokenized_word = tokenize(phonemize(word, lang=voice_name[0]))
additional_tokens = len(tokenized_word) + (1 if current_words else 0)
# Check if adding this word exceeds the token limit
if current_chunk_len + additional_tokens > max_token and current_words:
current_text = ' '.join(current_words)

tokenized_text_whole = tokenize(phonemize(text, lang=voice_name[0]))
if len(tokenized_text_whole) > max_token:
for text_part in text_parts:
tokenized_textpart = tokenize(phonemize(text_part, lang=voice_name[0]))
additional_tokens = len(tokenized_textpart) + 1

if current_chunk_len + additional_tokens > max_token and current_text_parts:
# Create the chunk from what's accumulated so far
current_text = ' '.join(current_text_parts)
tokenized_chunk = tokenize(phonemize(current_text, lang=voice_name[0]))
print(f'Chunk length: {len(tokenized_chunk)}')
chunks.append(tokenized_chunk)
current_words = []

# Reset trackers
current_text_parts = []
current_chunk_len = 0

else:
current_words.append(word)
current_chunk_len += additional_tokens

current_text_parts.append(text_part)
current_chunk_len += additional_tokens


# Add remaining words as the final chunk if any
if current_words:
current_text = ' '.join(current_words)
if current_text_parts:
current_text = ' '.join(current_text_parts)
tokenized_chunk = tokenize(phonemize(current_text, lang=voice_name[0]))
print(f'Chunk length: {len(tokenized_chunk)}')
chunks.append(tokenized_chunk)

out = {'out': [], 'ps': []}
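To see the new chunking strategy in isolation, here is a hedged sketch of the same accumulate-and-flush loop with tokenize(phonemize(...)) replaced by a simple word count (the stand-in counter, constant name, and keyword argument are assumptions for illustration; real phoneme token counts will differ):

from nltk.tokenize import sent_tokenize  # assumes the 'punkt' data is already downloaded

MAX_TOKENS = 510  # Kokoro's limit, as stated in the UI text above

def count_tokens(text):
    # Stand-in for tokenize(phonemize(text, lang=...)).
    return len(text.split())

def split_text(text, sentence_based=True):
    parts = sent_tokenize(text) if sentence_based else text.split()
    chunks, current_parts, current_len = [], [], 0

    for part in parts:
        extra = count_tokens(part) + 1
        if current_len + extra > MAX_TOKENS and current_parts:
            # Flush what has accumulated so far into one chunk, then reset.
            chunks.append(' '.join(current_parts))
            current_parts, current_len = [], 0
        # Unlike the pre-commit word loop, the part that triggered the flush is always kept.
        current_parts.append(part)
        current_len += extra

    if current_parts:
        chunks.append(' '.join(current_parts))
    return chunks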

