39 changes: 39 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04

# Install Python 3.10 and pip, as well as other dependencies
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y \
        python3.10 \
        python3.10-distutils \
        python3-pip \
        sudo \
        ffmpeg \
        git \
        aria2 \
        unzip && \
    rm -rf /var/lib/apt/lists/*

# Optional: ensure python3 points to python3.10
RUN ln -sf /usr/bin/python3.10 /usr/bin/python3

WORKDIR /app

# Clone OpenVoice (or use COPY for local code)
RUN git clone https://github.com/namanthapliyal/OpenVoice.git openvoice

WORKDIR /app/openvoice

# Install Python dependencies
RUN python3 -m pip install --upgrade pip && \
    python3 -m pip install --no-cache-dir -r requirements.txt && \
    python3 -m pip install --no-cache-dir -e .

# Download and place checkpoints/resources
RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/camenduru/OpenVoice/resolve/main/checkpoints_1226.zip -d /app/openvoice -o checkpoints_1226.zip && \
    unzip /app/openvoice/checkpoints_1226.zip && \
    rm checkpoints_1226.zip

EXPOSE 7860

CMD ["uvicorn", "fastapi_app:app", "--host", "0.0.0.0", "--port", "7860"]
7 changes: 4 additions & 3 deletions docs/USAGE.md
@@ -4,8 +4,8 @@

- [Quick Use](#quick-use): directly use OpenVoice without installation.
- [Linux Install](#linux-install): for researchers and developers only.
- [V1](#openvoice-v1)
- [V2](#openvoice-v2)
- [V1](#openvoice-v1)
- [V2](#openvoice-v2)
- [Install on Other Platforms](#install-on-other-platforms): unofficial installation guide contributed by the community

## Quick Use
@@ -63,14 +63,14 @@ Please see [`demo_part2.ipynb`](../demo_part2.ipynb) for an example for language
Download the checkpoint from [here](https://myshell-public-repo-host.s3.amazonaws.com/openvoice/checkpoints_v2_0417.zip) and extract it to the `checkpoints_v2` folder.

Install [MeloTTS](https://github.com/myshell-ai/MeloTTS):

```
pip install git+https://github.com/myshell-ai/MeloTTS.git
python -m unidic download
```

**Demo Usage.** Please see [`demo_part3.ipynb`](../demo_part3.ipynb) for example usage of OpenVoice V2. Now it natively supports English, Spanish, French, Chinese, Japanese and Korean.


## Install on Other Platforms

This section provides unofficial installation guides contributed by the open-source community:
@@ -79,5 +79,6 @@ This section provides the unofficial installation guide contribu
- [Guide](https://github.com/Alienpups/OpenVoice/blob/main/docs/USAGE_WINDOWS.md) by [@Alienpups](https://github.com/Alienpups)
- You are welcome to contribute if you have a better installation guide. We will list you here.
- Docker
- [Guide](https://github.com/namanthapliyal/OpenVoice/blob/main/docs/docker_usage.md) by [@namanthapliyal](https://github.com/namanthapliyal/)
- [Guide](https://github.com/StevenJSCF/OpenVoice/blob/update-docs/docs/DF_USAGE.md) by [@StevenJSCF](https://github.com/StevenJSCF)
- You are welcome to contribute if you have a better installation guide. We will list you here.
78 changes: 78 additions & 0 deletions docs/docker_usage.md
@@ -0,0 +1,78 @@
## Local Development Setup

Follow these steps to set up and run the application locally for development and debugging.

### 1. Clone the Repository

First, clone this repository to your local machine:

```bash
git clone https://github.com/namanthapliyal/OpenVoice.git
cd ./OpenVoice
```

### 2. Build the Docker Image

From the project root, build the Docker image with the following command:

```bash
docker build -t openvoice-fastapi .
```

This command will:

- Pull the `nvidia/cuda` base image.
- Install the necessary system dependencies and Python packages.
- Clone the OpenVoice repository.
- Download the pre-trained checkpoints required for voice synthesis.
- Set up the working directory and expose the application port.

This process may take some time, especially during the initial download of the base image and checkpoints.

### 3. Run the Docker Container

Once the image is built, you can run a container from it. To enable GPU acceleration and map the application's port to your host machine, use the following command:

```bash
docker run --gpus all -p 7860:7860 openvoice-fastapi
```

- `--gpus all`: exposes all available NVIDIA GPUs on your host to the container. Ensure the NVIDIA Container Toolkit is correctly installed.
- `-p 7860:7860`: maps port 7860 inside the container (where FastAPI listens) to port 7860 on your host machine.

The FastAPI application will now be accessible at http://localhost:7860.
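As a quick smoke test once the container is running, you can query the root endpoint defined in `fastapi_app.py`:

```shell
# GET / returns a small JSON welcome message (defined in fastapi_app.py)
curl http://localhost:7860/
# → {"message":"Welcome to the OpenVoice API! Server is up and running!"}
```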

### 4. Interact with the API

You can test the API using `curl` or any API client (such as Postman, Insomnia, or your browser for GET requests). The primary endpoint is `/synthesize/`, which accepts `POST` requests with `multipart/form-data`.

Example `curl` request:

```bash
curl -X POST "http://localhost:7860/synthesize/" \
-H "accept: application/json" \
-H "Content-Type: multipart/form-data" \
-F "prompt=This is a test sentence for voice synthesis." \
-F "style=default" \
-F "audio_file=@/path/to/your/reference_audio.mp3" \
-F "agree=true" \
--output synthesized_audio.wav
```

Parameters:

- `prompt` (string, required): the text to be synthesized.
- `style` (string, required): the speaking style. Supported values: `default`, `whispering`, `shouting`, `excited`, `cheerful`, `terrified`, `angry`, `sad`, `friendly`. (Note: Chinese supports only `default`.)
- `audio_file` (file, required): an audio file (`.mp3` or `.wav`) of the reference speaker whose voice you want to clone.
- `agree` (boolean, required): must be `true` to accept the terms and conditions.

The API will return the synthesized audio as a .wav file.
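For scripted use, the same request can be sketched as a small Python client. The endpoint path and form-field names come from `fastapi_app.py` in this change; `API_URL` and the audio path are placeholders to adapt to your setup, and the `requests` library is assumed to be installed.

```python
# Minimal client sketch for the /synthesize/ endpoint.
import requests

API_URL = "http://localhost:7860/synthesize/"  # adjust host/port as needed

def build_form(prompt: str, style: str = "default") -> dict:
    """Assemble the non-file form fields the endpoint expects."""
    return {"prompt": prompt, "style": style, "agree": "true"}

def synthesize(prompt: str, audio_path: str, style: str = "default",
               out_path: str = "synthesized_audio.wav") -> str:
    """POST the prompt and reference audio, saving the returned WAV."""
    with open(audio_path, "rb") as f:
        resp = requests.post(
            API_URL,
            data=build_form(prompt, style),
            files={"audio_file": (audio_path, f, "audio/mpeg")},
        )
    resp.raise_for_status()
    with open(out_path, "wb") as out:
        out.write(resp.content)
    return out_path
```

For example, `synthesize("This is a test sentence.", "reference_audio.mp3")` would save `synthesized_audio.wav` in the current directory.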

**Output directory.** Synthesized audio files and temporary processing files are stored in the `outputs/` directory inside the container. For local debugging, you may want to mount a volume to persist these outputs on your host machine.
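For example, you could bind-mount a host directory over the container's output location (the container path here assumes the Dockerfile's `WORKDIR /app/openvoice` and the relative `outputs` directory used by `fastapi_app.py`):

```shell
# Persist synthesized audio on the host across container restarts
mkdir -p "$PWD/outputs"
docker run --gpus all -p 7860:7860 \
  -v "$PWD/outputs:/app/openvoice/outputs" \
  openvoice-fastapi
```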

### 5. Access the Swagger Docs

You can access the Swagger UI documentation by navigating to http://localhost:7860/docs in your web browser. This provides an interactive API reference and allows you to test the API endpoints directly through the UI.
107 changes: 107 additions & 0 deletions fastapi_app.py
@@ -0,0 +1,107 @@
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from fastapi.responses import FileResponse
import os
import torch
import langid
from openvoice import se_extractor
from openvoice.api import BaseSpeakerTTS, ToneColorConverter
import shutil

app = FastAPI()

# Configuration from openvoice_app.py
en_ckpt_base = 'checkpoints/base_speakers/EN'
zh_ckpt_base = 'checkpoints/base_speakers/ZH'
ckpt_converter = 'checkpoints/converter'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
output_dir = 'outputs'
os.makedirs(output_dir, exist_ok=True)

# Load models
en_base_speaker_tts = BaseSpeakerTTS(f'{en_ckpt_base}/config.json', device=device)
en_base_speaker_tts.load_ckpt(f'{en_ckpt_base}/checkpoint.pth')
zh_base_speaker_tts = BaseSpeakerTTS(f'{zh_ckpt_base}/config.json', device=device)
zh_base_speaker_tts.load_ckpt(f'{zh_ckpt_base}/checkpoint.pth')
tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

# Load speaker embeddings
en_source_default_se = torch.load(f'{en_ckpt_base}/en_default_se.pth').to(device)
en_source_style_se = torch.load(f'{en_ckpt_base}/en_style_se.pth').to(device)
zh_source_se = torch.load(f'{zh_ckpt_base}/zh_default_se.pth').to(device)

supported_languages = ['zh', 'en']

@app.get("/")
async def root():
    return {"message": "Welcome to the OpenVoice API! Server is up and running!"}

@app.post("/synthesize/")
async def synthesize_speech(
    prompt: str = Form(...),
    style: str = Form(...),
    audio_file: UploadFile = File(...),
    agree: bool = Form(...),
):
    # The docs require explicit acceptance of the terms and conditions
    if not agree:
        raise HTTPException(status_code=400, detail="You must set agree=true to accept the terms and conditions")

    # Save the uploaded audio file temporarily
    temp_audio_path = os.path.join(output_dir, audio_file.filename)
    with open(temp_audio_path, "wb") as buffer:
        shutil.copyfileobj(audio_file.file, buffer)

    language_predicted = langid.classify(prompt)[0].strip()
    print(f"Detected language: {language_predicted}")

    if language_predicted not in supported_languages:
        os.remove(temp_audio_path)
        raise HTTPException(status_code=400, detail=f"The detected language {language_predicted} for your input text is not in our supported languages: {supported_languages}")

    if language_predicted == "zh":
        tts_model = zh_base_speaker_tts
        source_se = zh_source_se
        language = 'Chinese'
        if style not in ['default']:
            os.remove(temp_audio_path)
            raise HTTPException(status_code=400, detail=f"The style {style} is not supported for Chinese, which should be in ['default']")
    else:
        tts_model = en_base_speaker_tts
        if style == 'default':
            source_se = en_source_default_se
        else:
            source_se = en_source_style_se
        language = 'English'
        if style not in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']:
            os.remove(temp_audio_path)
            raise HTTPException(status_code=400, detail=f"The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']")

    if len(prompt) < 2:
        os.remove(temp_audio_path)
        raise HTTPException(status_code=400, detail="Please give a longer prompt text")
    if len(prompt) > 200:
        os.remove(temp_audio_path)
        raise HTTPException(status_code=400, detail="Text length is limited to 200 characters for this demo, please try shorter text.")

    # Extract the tone-color embedding of the reference speaker
    try:
        target_se, audio_name = se_extractor.get_se(temp_audio_path, tone_color_converter, target_dir='processed', vad=True)
    except Exception as e:
        os.remove(temp_audio_path)
        raise HTTPException(status_code=500, detail=f"Get target tone color error: {str(e)}")

    # Generate base speech in the requested style, then convert its tone color
    src_path = os.path.join(output_dir, 'tmp.wav')
    tts_model.tts(prompt, src_path, speaker=style, language=language)

    save_path = os.path.join(output_dir, 'output.wav')
    encode_message = "@MyShell"
    tone_color_converter.convert(
        audio_src_path=src_path,
        src_se=source_se,
        tgt_se=target_se,
        output_path=save_path,
        message=encode_message,
    )

    # Clean up temporary files
    os.remove(temp_audio_path)
    os.remove(src_path)

    return FileResponse(save_path, media_type="audio/wav", filename="synthesized_audio.wav")

7 changes: 7 additions & 0 deletions requirements.txt
@@ -1,3 +1,4 @@
# OpenVoice core requirements, with pinned versions for compatibility
librosa==0.9.1
faster-whisper==0.9.0
pydub==0.25.1
@@ -14,3 +15,9 @@ cn2an==0.5.22
jieba==0.42.1
gradio==3.48.0
langid==1.1.6

# Extra requirements for the FastAPI wrapper
fastapi
uvicorn
python-multipart