diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..98585b80 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,39 @@ +FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 + +# Install Python 3.10 and pip, as well as other dependencies +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y \ + python3.10 \ + python3.10-distutils \ + python3-pip \ + sudo \ + ffmpeg \ + git \ + aria2 \ + unzip && \ + rm -rf /var/lib/apt/lists/* + +# Optional: ensure python3 points to python3.10 +RUN ln -sf /usr/bin/python3.10 /usr/bin/python3 + +WORKDIR /app + +# Clone OpenVoice (or use COPY for local code) +RUN git clone https://github.com/namanthapliyal/OpenVoice.git openvoice + +WORKDIR /app/openvoice + +# Install Python dependencies +RUN python3 -m pip install --upgrade pip && \ + python3 -m pip install --no-cache-dir -r requirements.txt && \ + python3 -m pip install --no-cache-dir -e . + +# Download and place checkpoints/resources +RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/camenduru/OpenVoice/resolve/main/checkpoints_1226.zip -d /app/openvoice -o checkpoints_1226.zip && \ + unzip /app/openvoice/checkpoints_1226.zip && \ + rm checkpoints_1226.zip + +EXPOSE 7860 + + +CMD ["uvicorn", "fastapi_app:app", "--host", "0.0.0.0", "--port", "7860"] diff --git a/docs/USAGE.md b/docs/USAGE.md index ff051a83..33debbe5 100644 --- a/docs/USAGE.md +++ b/docs/USAGE.md @@ -4,8 +4,8 @@ - [Quick Use](#quick-use): directly use OpenVoice without installation. - [Linux Install](#linux-install): for researchers and developers only. 
- - [V1](#openvoice-v1) - - [V2](#openvoice-v2) + - [V1](#openvoice-v1) + - [V2](#openvoice-v2) - [Install on Other Platforms](#install-on-other-platforms): unofficial installation guide contributed by the community ## Quick Use @@ -63,6 +63,7 @@ Please see [`demo_part2.ipynb`](../demo_part2.ipynb) for an example for language Download the checkpoint from [here](https://myshell-public-repo-host.s3.amazonaws.com/openvoice/checkpoints_v2_0417.zip) and extract it to the `checkpoints_v2` folder. Install [MeloTTS](https://github.com/myshell-ai/MeloTTS): + ``` pip install git+https://github.com/myshell-ai/MeloTTS.git python -m unidic download @@ -70,7 +71,6 @@ python -m unidic download **Demo Usage.** Please see [`demo_part3.ipynb`](../demo_part3.ipynb) for example usage of OpenVoice V2. Now it natively supports English, Spanish, French, Chinese, Japanese and Korean. - ## Install on Other Platforms This section provides the unofficial installation guides by open-source contributors in the community: @@ -79,5 +79,6 @@ This section provides the unofficial installation guides by open-source contribu - [Guide](https://github.com/Alienpups/OpenVoice/blob/main/docs/USAGE_WINDOWS.md) by [@Alienpups](https://github.com/Alienpups) - You are welcome to contribute if you have a better installation guide. We will list you here. - Docker + - [Guide](https://github.com/namanthapliyal/OpenVoice/blob/main/docs/docker_usage.md) by [@namanthapliyal](https://github.com/namanthapliyal/) - [Guide](https://github.com/StevenJSCF/OpenVoice/blob/update-docs/docs/DF_USAGE.md) by [@StevenJSCF](https://github.com/StevenJSCF) - You are welcome to contribute if you have a better installation guide. We will list you here. diff --git a/docs/docker_usage.md b/docs/docker_usage.md new file mode 100644 index 00000000..c43962bf --- /dev/null +++ b/docs/docker_usage.md @@ -0,0 +1,78 @@ +## Local Development Setup + +Follow these steps to set up and run the application locally for development and debugging. 
+
+### 1. Clone the Repository
+
+First, clone this repository to your local machine:
+
+```bash
+git clone https://github.com/namanthapliyal/OpenVoice.git
+cd ./OpenVoice
+```
+
+### 2. Build the Docker Image
+
+In the root location of the project, build the Docker image using the following command:
+
+```bash
+docker build -t openvoice-fastapi .
+```
+
+This command will:
+
+- Pull the nvidia/cuda base image.
+- Install necessary system dependencies and Python packages.
+- Clone the OpenVoice library.
+- Download pre-trained checkpoints required for voice synthesis.
+- Set up the working directory and expose the application port.
+- Build the Docker image.
+
+This process may take some time, especially during the initial download of the base image and checkpoints.
+
+### 3. Run the Docker Container
+
+Once the image is built, you can run a container from it. To enable GPU acceleration and map the application's port to your host machine, use the following command:
+
+```bash
+docker run --gpus all -p 7860:7860 openvoice-fastapi
+
+```
+
+- `--gpus all`: Exposes all available NVIDIA GPUs on your host to the container. Ensure the NVIDIA Container Toolkit is correctly installed.
+- `-p 7860:7860`: Maps port 7860 inside the container (where FastAPI runs) to port 7860 on your host machine.
+
+The FastAPI application will now be accessible at http://localhost:7860.
+
+### 4. Interact with the API
+
+You can test the API using curl or any API client (like Postman, Insomnia, or your browser for GET requests). The primary endpoint is `/synthesize/`, which accepts POST requests with `multipart/form-data`.
+
+Example curl Request:
+
+```bash
+curl -X POST "http://localhost:7860/synthesize/" \
+  -H "accept: application/json" \
+  -H "Content-Type: multipart/form-data" \
+  -F "prompt=This is a test sentence for voice synthesis." \
+  -F "style=default" \
+  -F "audio_file=@/path/to/your/reference_audio.mp3" \
+  -F "agree=true" \
+  --output synthesized_audio.wav
+```
+
+Parameters:
+
+- prompt (string, required): The text to be synthesized.
+- style (string, required): The speaking style. Supported values: default, whispering, shouting, excited, cheerful, terrified, angry, sad, friendly. (Note: Chinese only supports default).
+- audio_file (file, required): An audio file (.mp3 or .wav) of the reference speaker whose voice you want to clone.
+- agree (boolean, optional): Accepted for compatibility with the original demo, but currently ignored by this API endpoint.
+
+The API will return the synthesized audio as a .wav file.
+
+### Output Directory
+Synthesized audio files and temporary processing files will be stored in the outputs/ directory within the container. For local debugging, you might want to mount a volume to persist these outputs on your host machine.
+
+### 5. Access Swagger Docs
+
+You can access the Swagger UI documentation by navigating to http://localhost:7860/docs in your web browser. This provides an interactive API reference and allows you to test the API endpoints directly through the UI.
diff --git a/fastapi_app.py b/fastapi_app.py
new file mode 100644
index 00000000..73c9ffc3
--- /dev/null
+++ b/fastapi_app.py
@@ -0,0 +1,107 @@
+from fastapi import FastAPI, File, UploadFile, Form, HTTPException
+from fastapi.responses import FileResponse
+import os
+import torch
+import langid
+from openvoice import se_extractor
+from openvoice.api import BaseSpeakerTTS, ToneColorConverter
+import shutil
+
+app = FastAPI()
+
+# Configuration from openvoice_app.py
+en_ckpt_base = 'checkpoints/base_speakers/EN'
+zh_ckpt_base = 'checkpoints/base_speakers/ZH'
+ckpt_converter = 'checkpoints/converter'
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+output_dir = 'outputs'
+os.makedirs(output_dir, exist_ok=True)
+
+# Load models
+en_base_speaker_tts = BaseSpeakerTTS(f'{en_ckpt_base}/config.json', device=device)
+en_base_speaker_tts.load_ckpt(f'{en_ckpt_base}/checkpoint.pth')
+zh_base_speaker_tts = BaseSpeakerTTS(f'{zh_ckpt_base}/config.json', device=device)
+zh_base_speaker_tts.load_ckpt(f'{zh_ckpt_base}/checkpoint.pth')
+tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
+tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
+
+# Load speaker embeddings
+en_source_default_se = torch.load(f'{en_ckpt_base}/en_default_se.pth').to(device)
+en_source_style_se = torch.load(f'{en_ckpt_base}/en_style_se.pth').to(device)
+zh_source_se = torch.load(f'{zh_ckpt_base}/zh_default_se.pth').to(device)
+
+supported_languages = ['zh', 'en']
+
+@app.get("/")
+async def root():
+    return {"message": "Welcome to the OpenVoice API! Server is up and running!"}
+
+@app.post("/synthesize/")
+async def synthesize_speech(
+    prompt: str = Form(...),
+    style: str = Form(...),
+    audio_file: UploadFile = File(...),
+):
+    """Synthesize `prompt` in the given `style`, cloning the voice of `audio_file`; returns a WAV file."""
+    # Save the upload under outputs/; basename() strips any client-supplied directory parts (path traversal)
+    temp_audio_path = os.path.join(output_dir, os.path.basename(audio_file.filename))
+    with open(temp_audio_path, "wb") as buffer:
+        shutil.copyfileobj(audio_file.file, buffer)
+
+    language_predicted = langid.classify(prompt)[0].strip()
+    print(f"Detected language: {language_predicted}")
+
+    if language_predicted not in supported_languages:
+        os.remove(temp_audio_path)
+        raise HTTPException(status_code=400, detail=f"The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}")
+
+    if language_predicted == "zh":
+        tts_model = zh_base_speaker_tts
+        source_se = zh_source_se
+        language = 'Chinese'
+        if style not in ['default']:
+            os.remove(temp_audio_path)
+            raise HTTPException(status_code=400, detail=f"The style {style} is not supported for Chinese, which should be in ['default']")
+    else:
+        tts_model = en_base_speaker_tts
+        if style == 'default':
+            source_se = en_source_default_se
+        else:
+            source_se = en_source_style_se
+        language = 'English'
+        if style not in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']:
+            os.remove(temp_audio_path)
+            raise HTTPException(status_code=400, detail=f"The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']")
+
+    if len(prompt) < 2:
+        os.remove(temp_audio_path)
+        raise HTTPException(status_code=400, detail="Please give a longer prompt text")
+    if len(prompt) > 200:
+        os.remove(temp_audio_path)
+        raise HTTPException(status_code=400, detail="Text length limited to 200 characters for this demo, please try shorter text.")
+
+    try:
+        target_se, audio_name = se_extractor.get_se(temp_audio_path, tone_color_converter, target_dir='processed', vad=True)
+    except Exception as e:
+        os.remove(temp_audio_path)
+        raise HTTPException(status_code=500, detail=f"Get target tone color error: {str(e)}")
+
+    src_path = os.path.join(output_dir, 'tmp.wav')  # NOTE(review): fixed name — concurrent requests will collide
+    tts_model.tts(prompt, src_path, speaker=style, language=language)
+
+    save_path = os.path.join(output_dir, 'output.wav')  # NOTE(review): shared name — concurrent requests overwrite each other
+    encode_message = "@MyShell"
+    tone_color_converter.convert(
+        audio_src_path=src_path,
+        src_se=source_se,
+        tgt_se=target_se,
+        output_path=save_path,
+        message=encode_message
+    )
+
+    # Clean up temporary files (NOTE(review): these leak if tts/convert raise above — consider try/finally)
+    os.remove(temp_audio_path)
+    os.remove(src_path)
+
+    return FileResponse(save_path, media_type="audio/wav", filename="synthesized_audio.wav")
+
diff --git a/requirements.txt b/requirements.txt
index 8ddba70d..b3cccb77 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
+# OpenVoice core requirements, with pinned versions for compatibility
 librosa==0.9.1
 faster-whisper==0.9.0
 pydub==0.25.1
@@ -14,3 +15,9 @@ cn2an==0.5.22
 jieba==0.42.1
 gradio==3.48.0
 langid==1.1.6
+
+# Add extra requirements for your FastAPI wrapper
+fastapi
+uvicorn
+python-multipart
+