diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9c61ebe --- /dev/null +++ b/.gitignore @@ -0,0 +1,322 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. 
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be added to the global gitignore or merged into this project gitignore. 
For a PyCharm +# project, it is generally recommended to ignore these files: +# https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 +.idea/ + +# VS Code +.vscode/ +*.code-workspace + +# Sublime Text +*.sublime-project +*.sublime-workspace + +# Vim +*.swp +*.swo +*~ + +# Emacs +*~ +\#*\# +/.emacs.desktop +/.emacs.desktop.lock +*.elc +auto-save-list +tramp +.\#* + +# macOS +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +# Windows +Thumbs.db +Thumbs.db:encryptable +ehthumbs.db +ehthumbs_vista.db + +# Dump file +*.stackdump + +# Folder config file +[Dd]esktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msi +*.msix +*.msm +*.msp + +# Windows shortcuts +*.lnk + +# Linux +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +# Project-specific ignores +# ======================= + +# HuggingFace cache +.cache/ +~/.cache/huggingface/ + +# Model downloads and cache +models/ +*.onnx +*.pt +*.pth +*.bin +*.safetensors + +# Audio files generated by the app +*.wav +*.mp3 +*.flac +*.ogg +temp_audio/ +output_audio/ + +# Gradio temporary files +gradio_cached_examples/ +flagged/ + +# Logs +*.log +logs/ + +# Large datasets +data/ +datasets/ + +# Experimental notebooks +experiments/ +*.ipynb + +# Configuration files with secrets +config.json +secrets.json +.env.local +.env.production + 
+# Temporary test files +test_output/ +temp/ +tmp/ + +# Backup files +*.bak +*.backup +*.old + +# Performance profiling +*.prof +*.profile + +# PyTorch Lightning logs +lightning_logs/ + +# Weights & Biases +wandb/ + +# MLflow +mlruns/ + +# TensorBoard logs +runs/ +tb_logs/ \ No newline at end of file diff --git a/README.md b/README.md index 81536da..9257aab 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,22 @@ Kitten TTS is an open-source realistic text-to-speech model with just 15 million [Join our discord](https://discord.gg/upcyF5s6) +> **Note**: This is a personal fork of [KittenML/KittenTTS](https://github.com/KittenML/KittenTTS) with additional features including a Gradio web interface for easy testing. + +## 🎵 Sample Audio + +Here's a sample of what KittenTTS can generate: + +**Text**: "Welcome to the future of AI-powered speech synthesis!" +**Voice**: expr-voice-5-f (Female voice) +**Speed**: 1.0x + +[generated_speech_tmpa1hq_pip.webm](https://github.com/user-attachments/assets/24bfa252-a0e3-47b2-b814-d6565d1f9142) + + +> 💡 **Tip**: Click the link above to play the audio in your browser, or right-click and "Save As" to download the audio file (WebM). + +*Note: The audio file demonstrates the natural-sounding speech quality achievable with just 15 million parameters.* ## ✨ Features @@ -15,7 +31,6 @@ Kitten TTS is an open-source realistic text-to-speech model with just 15 million - **Fast inference**: Optimized for real-time speech synthesis - ## 🚀 Quick Start ### Installation @@ -26,7 +41,7 @@ pip install https://github.com/KittenML/KittenTTS/releases/download/0.1/kittentt - ### Basic Usage + ### Basic Usage ``` from kittentts import KittenTTS @@ -42,9 +57,101 @@ sf.write('output.wav', audio, 24000) ``` +## 🔌 API Usage + +You can also use KittenTTS via the Gradio API when the web interface is running. This is perfect for integrating TTS into your applications programmatically. 
+### API Example +First, make sure the Gradio web interface is running: +```bash +python gradio_app.py +``` + +Then use the API client: + +```python +from gradio_client import Client +import shutil +import os + +# Connect to the running Gradio app +client = Client("http://localhost:7860/") + +# Generate speech via API +result = client.predict( + text="Welcome to the future of AI-powered speech synthesis!", + voice="expr-voice-5-f", # Choose from available voices + speed=1, # Speed from 0.5 to 2.0 + api_name="/generate_speech" +) + +# Save the generated audio to current directory +if result: + print(f"Raw result: {result}") + + # Handle tuple result - extract file path + if isinstance(result, tuple): + audio_file_path = result[0] + else: + audio_file_path = result + + # Copy to current directory with descriptive name + original_filename = os.path.basename(audio_file_path) + current_dir_filename = f"generated_speech_{original_filename}" + shutil.copy2(audio_file_path, current_dir_filename) + + print(f"Audio saved to: {current_dir_filename}") +else: + print("No audio file generated") +``` +### Available API Parameters +- **text**: The text to synthesize (string) +- **voice**: Voice selection from: `expr-voice-2-m`, `expr-voice-2-f`, `expr-voice-3-m`, `expr-voice-3-f`, `expr-voice-4-m`, `expr-voice-4-f`, `expr-voice-5-m`, `expr-voice-5-f` +- **speed**: Speech speed (float, 0.5 to 2.0) + +## 🌐 Web Interface + +We've added a simple Gradio webapp for easy testing and experimentation with KittenTTS! + +### Features +- 🎭 **Voice Selection**: Choose from 8 available voices (male/female variants) +- ⚡ **Speed Control**: Adjust speech speed from 0.5x to 2.0x +- 📝 **Easy Text Input**: Multi-line text input with example texts +- 🔊 **Audio Output**: High-quality 24kHz audio generation +- 💡 **Example Texts**: Pre-loaded examples to get started quickly + +### Running the Web Interface + +1. 
**Clone and setup**: + ```bash + git clone https://github.com/akashjss/KittenTTS.git + cd KittenTTS + python -m venv venv + source venv/bin/activate # On Windows: venv\Scripts\activate + ``` + +2. **Install dependencies**: + ```bash + pip install -e . + pip install gradio + pip install gradio_client # To use the TTS via Gradio API + ``` + +3. **Launch the webapp**: + ```bash + python gradio_app.py + ``` + +4. **Open in browser**: Navigate to http://localhost:7860 + +The webapp provides an intuitive interface where you can: +- Type or paste text to synthesize +- Select from 8 different voice options +- Adjust speech speed with a slider +- Generate and download audio files +- Try example texts to get started ## 💻 System Requirements @@ -52,10 +159,10 @@ Works literally everywhere -## Checklist +## Checklist - [x] Release a preview model - [ ] Release the fully trained model weights -- [ ] Release mobile SDK -- [ ] Release web version +- [ ] Release mobile SDK +- [x] Release web version (Gradio webapp added) diff --git a/generated_speech_tmpa1hq_pip.wav b/generated_speech_tmpa1hq_pip.wav new file mode 100644 index 0000000..37f6726 Binary files /dev/null and b/generated_speech_tmpa1hq_pip.wav differ
diff --git a/gradio_app.py b/gradio_app.py
new file mode 100644
index 0000000..506e320
--- /dev/null
+++ b/gradio_app.py
@@ -0,0 +1,203 @@
+#!/usr/bin/env python3
+"""
+Simple Gradio webapp to test KittenTTS
+"""
+
+import gradio as gr
+import tempfile
+import os
+from kittentts import KittenTTS
+
+
+def initialize_model():
+    """Load the KittenTTS model.
+
+    Returns:
+        The loaded KittenTTS instance, or None if loading raised (the error
+        is printed so the caller can decide how to react).
+    """
+    print("Loading KittenTTS model...")
+    try:
+        model = KittenTTS("KittenML/kitten-tts-nano-0.1")
+        print("Model loaded successfully!")
+        return model
+    except Exception as e:
+        print(f"Error loading model: {e}")
+        return None
+
+
+def generate_speech(text, voice, speed):
+    """Generate speech from text using KittenTTS.
+
+    Args:
+        text: Text to synthesize. None, empty or whitespace-only input is rejected.
+        voice: Voice identifier passed through to the model (see AVAILABLE_VOICES).
+        speed: Speech speed multiplier passed through to the model.
+
+    Returns:
+        An (audio_path, status_message) tuple. audio_path is the path of a
+        temporary WAV file, or None when nothing was generated.
+    """
+    # API clients may send null for the text field, so guard before .strip().
+    if not text or not text.strip():
+        return None, "Please enter some text to synthesize."
+
+    try:
+        # Imported lazily (as before) so a missing soundfile install is reported
+        # through the status message instead of crashing the app at startup.
+        import soundfile as sf
+
+        # Generate audio
+        audio_data = model.generate(text, voice=voice, speed=speed)
+
+        # Reserve a temp path and close the handle before writing: a file that
+        # is still open via NamedTemporaryFile cannot be reopened by name on
+        # Windows.
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+            wav_path = tmp_file.name
+
+        try:
+            sf.write(wav_path, audio_data, 24000)
+        except Exception:
+            os.unlink(wav_path)  # don't leave an empty temp file behind
+            raise
+
+        return wav_path, f"✅ Generated speech for: '{text[:50]}{'...' if len(text) > 50 else ''}'"
+
+    except Exception as e:
+        return None, f"❌ Error generating speech: {str(e)}"
+
+
+# Available voices from KittenTTS
+AVAILABLE_VOICES = [
+    'expr-voice-2-m', 'expr-voice-2-f',
+    'expr-voice-3-m', 'expr-voice-3-f',
+    'expr-voice-4-m', 'expr-voice-4-f',
+    'expr-voice-5-m', 'expr-voice-5-f'
+]
+
+# Initialize model
+print("🐱 Initializing KittenTTS...")
+model = initialize_model()
+
+if model is None:
+    print("❌ Failed to initialize model. Please check your installation.")
+    # exit() is injected by the site module for interactive use and may be
+    # missing (e.g. python -S); SystemExit always works.
+    raise SystemExit(1)
+
+# Create Gradio interface
+with gr.Blocks(title="🐱 KittenTTS Demo", theme=gr.themes.Ocean()) as demo:
+    gr.Markdown(
+        """
+        # 🐱 KittenTTS Demo
+
+        Generate high-quality speech from text using KittenTTS - an ultra-lightweight TTS model!
+
+        **Features:**
+        - 🚀 Fast inference (CPU-optimized)
+        - 🎙️ Multiple voice options
+        - ⚡ Adjustable speech speed
+        - 🔊 High-quality 24kHz audio output
+        """
+    )
+
+    with gr.Row():
+        with gr.Column(scale=2):
+            # Input controls
+            text_input = gr.Textbox(
+                label="📝 Text to Synthesize",
+                placeholder="Enter the text you want to convert to speech...",
+                lines=3,
+                max_lines=5
+            )
+
+            with gr.Row():
+                voice_select = gr.Dropdown(
+                    choices=AVAILABLE_VOICES,
+                    value="expr-voice-5-f",  # Default to female voice
+                    label="🎭 Voice Selection",
+                    info="Choose from available voice models"
+                )
+
+                speed_slider = gr.Slider(
+                    minimum=0.5,
+                    maximum=2.0,
+                    value=1.0,
+                    step=0.1,
+                    label="⚡ Speed",
+                    info="Adjust speech speed (1.0 = normal)"
+                )
+
+            generate_btn = gr.Button(
+                "🎵 Generate Speech",
+                variant="primary",
+                size="lg"
+            )
+
+        with gr.Column(scale=1):
+            # Output
+            audio_output = gr.Audio(
+                label="🔊 Generated Audio",
+                type="filepath"
+            )
+
+            status_text = gr.Textbox(
+                label="📊 Status",
+                interactive=False,
+                lines=2
+            )
+
+    # Example texts
+    gr.Markdown("### 💡 Try these examples:")
+    example_texts = [
+        "Hello! This is KittenTTS, a lightweight text-to-speech model.",
+        "The quick brown fox jumps over the lazy dog.",
+        "Welcome to the future of AI-powered speech synthesis!",
+        "KittenTTS works entirely on CPU without requiring a GPU."
+    ]
+
+    examples = gr.Examples(
+        examples=[[text, "expr-voice-5-f", 1.0] for text in example_texts],
+        inputs=[text_input, voice_select, speed_slider],
+        cache_examples=False
+    )
+
+    # Event handlers
+    generate_btn.click(
+        fn=generate_speech,
+        inputs=[text_input, voice_select, speed_slider],
+        outputs=[audio_output, status_text]
+    )
+
+    # Footer
+    gr.Markdown(
+        """
+        ---
+
+        **About KittenTTS:**
+        - 🏋️ Ultra-lightweight: <25MB model size
+        - 💻 CPU-optimized: No GPU required
+        - ⚡ Fast inference: Real-time speech synthesis
+        - 🎯 High quality: 24kHz audio output
+
+        *Powered by [KittenML](https://github.com/KittenML/KittenTTS)*
+        """
+    )
+
+
+if __name__ == "__main__":
+    print("🚀 Starting Gradio webapp...")
+    print("📱 The webapp will be available at: http://localhost:7860")
+    print("🌐 Browser will open automatically...")
+    print("🔄 Use Ctrl+C to stop the server")
+
+    # NOTE: 0.0.0.0 binds every network interface, so the unauthenticated UI is
+    # reachable from other machines; use "127.0.0.1" for local-only access.
+    demo.launch(
+        server_name="0.0.0.0",  # Allow external access
+        server_port=7860,
+        share=False,  # Set to True if you want a public link
+        show_error=True,
+        inbrowser=True  # Automatically open in browser
+    )
diff --git a/manifest.json b/manifest.json new file mode 100644 index 0000000..1944be5 --- /dev/null +++ b/manifest.json @@ -0,0 +1,19 @@ +{ + "name": "KittenTTS", + "description": "Ultra-lightweight TTS engine with a Gradio web UI", + "icon": "https://raw.githubusercontent.com/akashjss/KittenTTS/main/icon.png", + "homepage": "https://github.com/akashjss/KittenTTS", + "categories": ["audio", "ml", "tts"], + "authors": [ + { + "name": "KittenML", + "url": "https://github.com/KittenML" + }, + { + "name": "akashjss", + "url": "https://github.com/akashjss" + } + ] +} + + diff --git a/start.json b/start.json new file mode 100644 index 0000000..67a9771 --- /dev/null +++ b/start.json @@ -0,0 +1,17 @@ +{ + "build": [ + "python3 -m venv .venv", + "./.venv/bin/python -m pip install --upgrade pip setuptools wheel", + "./.venv/bin/pip install -e .", + "./.venv/bin/pip install 
gradio gradio_client" + ], + "script": "./.venv/bin/python gradio_app.py", + "shell": false, + "env": { + "PYTHONPATH": ".", + "PYTHONUNBUFFERED": "1" + }, + "ports": [7860] +} + + 
diff --git a/tts-api.py b/tts-api.py
new file mode 100644
index 0000000..f18f2d9
--- /dev/null
+++ b/tts-api.py
@@ -0,0 +1,36 @@
+"""Example client: request speech from a running KittenTTS Gradio app and
+copy the generated audio file into the current directory."""
+
+from gradio_client import Client
+import shutil
+import os
+
+client = Client("http://localhost:7860/")
+result = client.predict(
+    text="Welcome to the future of AI-powered speech synthesis!",
+    voice="expr-voice-5-f",
+    speed=1,
+    api_name="/generate_speech"
+)
+print(f"Raw result: {result}")
+
+# The app returns (audio_path, status); on failure audio_path is None while the
+# tuple itself is still truthy, so test the path rather than the whole result.
+if isinstance(result, (tuple, list)):
+    audio_file_path = result[0] if result else None
+else:
+    audio_file_path = result
+
+if audio_file_path:
+    # Extract filename from the result path
+    original_filename = os.path.basename(audio_file_path)
+
+    # Create a new filename in current directory
+    current_dir_filename = f"generated_speech_{original_filename}"
+
+    # Copy the file to current directory
+    shutil.copy2(audio_file_path, current_dir_filename)
+    print(f"Audio saved to: {current_dir_filename}")
+    print(f"Original file path: {audio_file_path}")
+else:
+    print("No audio file generated")