KittenML · saen-ai · Aug 7, 2025 · Aug 7, 2025 · Aug 7, 2025 · Aug 7, 2025
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,20 @@
+venv
+__pycache__
+.pytest_cache
+.mypy_cache
+.git
+.gitignore
+*.pyc
+*.pyo
+*.pyd
+*.swp
+*.swo
+dist
+build
+*.egg-info
+*.onnx
+*.npz
+*.wav
+*.mp3
+*.flac
+*.log
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,11 @@
+venv 
+__pycache__
+*.pyc
+*.pyo
+*.pyd
+*.pyw
+*.pyz
+*.pywz
+*.pyzw
+*.pyzwz
+*.pyzwzw
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,27 @@
+FROM python:3.13-slim
+
+# Do not write .pyc files into the image, and keep stdout/stderr unbuffered
+# so application logs appear immediately in the container log stream.
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1
+
+# System dependencies for phonemizer/soundfile/onnxruntime
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    espeak-ng \
+    libsndfile1 \
+    libgomp1 \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Install Python dependencies BEFORE copying the source tree so this layer
+# is rebuilt only when requirements.txt changes, not on every code edit.
+# fastapi and phonemizer are already pinned in requirements.txt; uvicorn's
+# "standard" extras are requested at the same version that file pins, and
+# the specifier is quoted so the shell never treats the brackets as a glob.
+COPY requirements.txt .
+RUN pip install --no-cache-dir --upgrade pip \
+    && pip install --no-cache-dir -r requirements.txt "uvicorn[standard]==0.35.0"
+
+# Copy the rest of the project (.dockerignore controls exclusions)
+COPY . .
+
+EXPOSE 8000
+
+# Serve the FastAPI app object `app` defined in run.py on all interfaces.
+CMD ["uvicorn", "run:app", "--host", "0.0.0.0", "--port", "8000"]
+
+
diff --git a/LICENSE b/LICENSE
diff --git a/MANIFEST.in b/MANIFEST.in
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -0,0 +1,15 @@
+version: "3.9"
+
+services:
+  # Single service that builds and runs the TTS HTTP API image.
+  kittentts:
+    build:
+      # Build context is the directory that contains this compose file.
+      context: .
+    image: kittentts:latest
+    container_name: kittentts
+    ports:
+      # host:container; 8000 is the port the Dockerfile's uvicorn command listens on.
+      - "8000:8000"
+    # Persist HF cache to avoid re-downloading models
+    volumes:
+      - ${HOME}/.cache/huggingface:/root/.cache/huggingface
+    restart: unless-stopped
+
diff --git a/output.wav b/output.wav
diff --git a/pyproject.toml b/pyproject.toml
diff --git a/requirements.txt b/requirements.txt
@@ -1,8 +1,88 @@
-num2words
-spacy
-espeakng_loader
-misaki[en]>=0.9.4
-onnxruntime
-soundfile
-numpy
-huggingface_hub
+annotated-types==0.7.0
+anyio==4.10.0
+attrs==25.3.0
+babel==2.17.0
+blis==1.3.0
+catalogue==2.0.10
+certifi==2025.8.3
+cffi==1.17.1
+charset-normalizer==3.4.2
+click==8.2.1
+cloudpathlib==0.21.1
+colorama==0.4.6
+coloredlogs==15.0.1
+confection==0.1.5
+csvw==3.5.1
+cymem==2.0.11
+dlinfo==2.0.0
+docopt==0.6.2
+espeakng-loader==0.2.4
+fastapi==0.116.1
+filelock==3.18.0
+flatbuffers==25.2.10
+fsspec==2025.7.0
+h11==0.16.0
+hf-xet==1.1.7
+huggingface-hub==0.34.3
+humanfriendly==10.0
+idna==3.10
+isodate==0.7.2
+Jinja2==3.1.6
+joblib==1.5.1
+jsonschema==4.25.0
+jsonschema-specifications==2025.4.1
+langcodes==3.5.0
+language-tags==1.2.0
+language_data==1.3.0
+marisa-trie==1.2.1
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+mdurl==0.1.2
+misaki==0.7.4
+mpmath==1.3.0
+murmurhash==1.0.13
+num2words==0.5.14
+numpy==2.3.2
+onnxruntime==1.22.1
+packaging==25.0
+phonemizer==3.3.0
+preshed==3.0.10
+protobuf==6.31.1
+pycparser==2.22
+pydantic==2.11.7
+pydantic_core==2.33.2
+Pygments==2.19.2
+pyparsing==3.2.3
+python-dateutil==2.9.0.post0
+PyYAML==6.0.2
+rdflib==7.1.4
+referencing==0.36.2
+regex==2025.7.34
+requests==2.32.4
+rfc3986==1.5.0
+rich==14.1.0
+rpds-py==0.27.0
+segments==2.3.0
+setuptools==80.9.0
+shellingham==1.5.4
+six==1.17.0
+smart_open==7.3.0.post1
+sniffio==1.3.1
+soundfile==0.13.1
+spacy==3.8.7
+spacy-legacy==3.0.12
+spacy-loggers==1.0.5
+srsly==2.5.1
+starlette==0.47.2
+sympy==1.14.0
+thinc==8.3.6
+tqdm==4.67.1
+typer==0.16.0
+typing-inspection==0.4.1
+typing_extensions==4.14.1
+uritemplate==4.2.0
+urllib3==2.5.0
+uvicorn==0.35.0
+wasabi==1.1.3
+weasel==0.4.1
+wrapt==1.17.2
diff --git a/run.py b/run.py
@@ -0,0 +1,50 @@
+from fastapi import FastAPI
+from fastapi.responses import StreamingResponse
+from kittentts import KittenTTS
+import soundfile as sf
+import io
+
+app = FastAPI()
+
+
+
+
+@app.get("/tts")
+@app.post("/tts")
+def tts(text: str, voice: str = "expr-voice-4-f"):
+    """Synthesize ``text`` to speech and return it as a WAV response.
+
+    Registered for both GET and POST on ``/tts``.
+
+    Args:
+        text: Text to speak. Surrounding whitespace is stripped.
+        voice: Voice identifier passed to ``KittenTTS.generate``.
+            Surrounding whitespace is stripped.
+
+    Returns:
+        A ``StreamingResponse`` with media type ``audio/wav`` carrying the
+        generated audio, served inline under the filename ``tts.wav``.
+
+    Raises:
+        HTTPException: 400 if ``text`` or ``voice`` is empty after stripping.
+    """
+    # Imported locally so this handler is self-contained and does not rely
+    # on a change to the module-level import block.
+    from fastapi import HTTPException
+
+    global _model
+
+    # Normalize and validate inputs first, so a bad request is rejected
+    # with a clear 400 before the model is loaded or invoked.
+    text = text.strip()
+    voice = voice.strip()
+    if not text:
+        raise HTTPException(status_code=400, detail="'text' must not be empty")
+    if not voice:
+        raise HTTPException(status_code=400, detail="'voice' must not be empty")
+
+    # Reuse a single model instance across requests; it is created lazily
+    # the first time a valid request arrives.
+    if "_model" not in globals():
+        _model = KittenTTS("KittenML/kitten-tts-nano-0.1")
+
+    audio = _model.generate(text, voice=voice)
+
+    # Encode the samples as WAV into an in-memory buffer, then rewind it so
+    # the response streams from the first byte.
+    sample_rate = 24000
+    buffer = io.BytesIO()
+    sf.write(buffer, audio, sample_rate, format="WAV")
+    buffer.seek(0)
+
+    return StreamingResponse(
+        buffer,
+        media_type="audio/wav",
+        headers={"Content-Disposition": 'inline; filename="tts.wav"'},
+    )
+
+# m = KittenTTS("KittenML/kitten-tts-nano-0.1")
+
+
+# audio = m.generate("This high quality TTS model works without a GPU", voice='expr-voice-2-f' )
+
+# available_voices : [  'expr-voice-2-m', 'expr-voice-2-f', 'expr-voice-3-m', 'expr-voice-3-f',  'expr-voice-4-m', 'expr-voice-4-f', 'expr-voice-5-m', 'expr-voice-5-f' ]
+
+
+
+# Save the audio
+# import soundfile as sf
+# sf.write('output.wav', audio, 24000)
+
+
+# Run locally (from the project root, with the virtualenv activated):
+#   python -m uvicorn run:app --host 127.0.0.1 --port 8000 --reload
diff --git a/setup.py b/setup.py