
Commit

Text is now split by sentence (nltk) or optionally by words
h43lb1t0 committed Jan 17, 2025
1 parent 362006f commit a84e53a
Showing 3 changed files with 42 additions and 22 deletions.
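The sentence splitting relies on NLTK's Punkt tokenizer. As a quick illustration of the difference between the two modes, a minimal sketch (the sample text and prints are illustrative, not from the repository):

import nltk
from nltk.tokenize import sent_tokenize

# The Punkt sentence-boundary model must be downloaded once before use.
nltk.download('punkt')

sample = "Kokoro reads long passages. Each sentence becomes a candidate chunk. Word splitting is the fallback."

sentences = sent_tokenize(sample)   # sentence-based splitting (the new default)
words = sample.split()              # word-based splitting (the optional mode)

print(sentences)   # three sentence strings
print(len(words))  # individual words, useful when a single sentence exceeds the token limit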
Binary file modified requirements.txt
Binary file not shown.
12 changes: 9 additions & 3 deletions script.py
@@ -1,7 +1,7 @@
import pathlib
import html
import time
from extensions.KokoroTtsTexGernerationWebui.src.generate import run, load_voice
from extensions.KokoroTtsTexGernerationWebui.src.generate import run, load_voice, set_plitting_type
from extensions.KokoroTtsTexGernerationWebui.src.voices import VOICES
import gradio as gr
import time
@@ -26,19 +26,25 @@ def voice_preview():


def ui():
info = """Select a Voice. \nThe default voice is a 50-50 mix of Bella & Sarah\nVoices starting with 'a' are American
info_voice = """Select a Voice. \nThe default voice is a 50-50 mix of Bella & Sarah\nVoices starting with 'a' are American
English, voices with 'b' are British English"""
with gr.Accordion("Kokoro"):
voice = gr.Dropdown(choices=VOICES, value=VOICES[0], label="Voice", info=info, interactive=True)
voice = gr.Dropdown(choices=VOICES, value=VOICES[0], label="Voice", info=info_voice, interactive=True)

preview = gr.Button("Voice preview", type="secondary")

preview_output = gr.HTML()

info_splitting ="""Kokoro only supports 510 tokens. One method to split the text is by sentance (default), the otherway
is by word up to 510 tokens. """
spltting_method = gr.Radio(["Split by Sentence", "Split by Word"], info=info_splitting, value="Split by Sentence", label_lines=2, interactive=True)


voice.change(voice_update, voice)
preview.click(fn=voice_preview, outputs=preview_output)

spltting_method.change(set_plitting_type, spltting_method)




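For readers less familiar with Gradio, the event wiring added above follows the standard Radio-to-callback pattern. A standalone sketch of that pattern (component names and the Blocks demo are illustrative, not the extension's actual UI):

import gradio as gr

def on_splitting_change(method):
    # Receives the Radio's current value as a plain string.
    print(f"Splitting method selected: {method}")

with gr.Blocks() as demo:
    splitting_choice = gr.Radio(
        ["Split by Sentence", "Split by Word"],
        value="Split by Sentence",
        label="Text splitting",
        interactive=True,
    )
    # .change(fn, inputs): the component's value is passed to the callback on every change.
    splitting_choice.change(on_splitting_change, splitting_choice)

demo.launch()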
52 changes: 33 additions & 19 deletions src/generate.py
@@ -8,10 +8,12 @@
from huggingface_hub import snapshot_download
from modules import shared
from .voices import VOICES

from nltk.tokenize import sent_tokenize
import nltk


snapshot_download(repo_id="hexgrad/Kokoro-82M", cache_dir =pathlib.Path(__file__).parent, allow_patterns=["*.pth", "*.pt"])
nltk.download('punkt')

if os.name == 'nt':
os.environ["PHONEMIZER_ESPEAK_LIBRARY"] = r"C:\Program Files\eSpeak NG\libespeak-ng.dll"
@@ -47,37 +49,49 @@ def run(text, preview=False):

return msg_id

sentance_based = True

def set_plitting_type(method="Split by Sentence"):
global sentance_based
sentance_based = True if method == "Split by Sentence" else False
print(f'Splitting method: {"Sentence" if sentance_based else "Word"}')

set_plitting_type()

def split_text(text):

max_token = 510
words = text.split()
current_words = []
text_parts = sent_tokenize(text) if sentance_based else text.split()
current_text_parts = []
chunks = []
current_chunk_len = 0

tokenized_text = tokenize(phonemize(text, lang=voice_name[0]))
if len(tokenized_text) > max_token:
for word in words:
tokenized_word = tokenize(phonemize(word, lang=voice_name[0]))
additional_tokens = len(tokenized_word) + (1 if current_words else 0)
# Check if adding this word exceeds the token limit
if current_chunk_len + additional_tokens > max_token and current_words:
current_text = ' '.join(current_words)

tokenized_text_whole = tokenize(phonemize(text, lang=voice_name[0]))
if len(tokenized_text_whole) > max_token:
for text_part in text_parts:
tokenized_textpart = tokenize(phonemize(text_part, lang=voice_name[0]))
additional_tokens = len(tokenized_textpart) + 1

if current_chunk_len + additional_tokens > max_token and current_text_parts:
# Create the chunk from what's accumulated so far
current_text = ' '.join(current_text_parts)
tokenized_chunk = tokenize(phonemize(current_text, lang=voice_name[0]))
print(f'Chunk length: {len(tokenized_chunk)}')
chunks.append(tokenized_chunk)
current_words = []

# Reset trackers
current_text_parts = []
current_chunk_len = 0

else:
current_words.append(word)
current_chunk_len += additional_tokens

current_text_parts.append(text_part)
current_chunk_len += additional_tokens


# Add remaining words as the final chunk if any
if current_words:
current_text = ' '.join(current_words)
if current_text_parts:
current_text = ' '.join(current_text_parts)
tokenized_chunk = tokenize(phonemize(current_text, lang=voice_name[0]))
print(f'Chunk length: {len(tokenized_chunk)}')
chunks.append(tokenized_chunk)

out = {'out': [], 'ps': []}
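To see the new chunking strategy in isolation, here is a hedged sketch of the same accumulate-and-flush loop with tokenize(phonemize(...)) replaced by a simple word count (the stand-in counter, constant name, and keyword argument are assumptions for illustration; real phoneme token counts will differ):

from nltk.tokenize import sent_tokenize  # assumes the 'punkt' data is already downloaded

MAX_TOKENS = 510  # Kokoro's limit, as stated in the UI text above

def count_tokens(text):
    # Stand-in for tokenize(phonemize(text, lang=...)).
    return len(text.split())

def split_text(text, sentence_based=True):
    parts = sent_tokenize(text) if sentence_based else text.split()
    chunks, current_parts, current_len = [], [], 0

    for part in parts:
        extra = count_tokens(part) + 1
        if current_len + extra > MAX_TOKENS and current_parts:
            # Flush what has accumulated so far into one chunk, then reset.
            chunks.append(' '.join(current_parts))
            current_parts, current_len = [], 0
        # Unlike the pre-commit word loop, the part that triggered the flush is always kept.
        current_parts.append(part)
        current_len += extra

    if current_parts:
        chunks.append(' '.join(current_parts))
    return chunks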

