From 780bf06fa066e41f1c8c53a43c43ba71689d3149 Mon Sep 17 00:00:00 2001
From: Roberts Slisans <rsxdalv@gmail.com>
Date: Fri, 29 Mar 2024 08:02:54 +0200
Subject: [PATCH] Add Tab Voice Clone and Utils (#296)

* add information about voice cloning to tab voice clone

* readme

* add GPU Info tab

* readme
---
 README.md                         |  6 ++++
 server.py                         |  3 ++
 src/bark/clone/tab_voice_clone.py | 25 +++++++++++++++--
 src/utils/gpu_info_tab.py         | 46 +++++++++++++++++++++++++++++++
 4 files changed, 77 insertions(+), 3 deletions(-)
 create mode 100644 src/utils/gpu_info_tab.py

diff --git a/README.md b/README.md
index a8d21a7d..6e3e281c 100644
--- a/README.md
+++ b/README.md
@@ -44,6 +44,12 @@ https://rsxdalv.github.io/bark-speaker-directory/
 https://github.com/rsxdalv/tts-generation-webui/discussions/186#discussioncomment-7291274
 
 ## Changelog
+Mar 28:
+* Add GPU Info tab
+
+Mar 27:
+* Add information about voice cloning to tab voice clone
+
 Mar 26:
 * Add Maha TTS demo notebook
 
diff --git a/server.py b/server.py
index 950c3b82..3c452aa3 100644
--- a/server.py
+++ b/server.py
@@ -31,6 +31,7 @@
 from src.css.css import full_css
 from src.Joutai import Joutai
 from src.history_tab.collections_directories_atom import collections_directories_atom
+from src.utils.gpu_info_tab import gpu_info_tab
 
 
 setup_or_recover.dummy()
@@ -53,6 +54,7 @@ def reload_config_and_restart_ui():
     else default_config
 )
 
+
 with gr.Blocks(
     css=full_css,
     title="TTS Generation WebUI",
@@ -167,6 +169,7 @@ def reload_config_and_restart_ui():
             model_location_settings_tab()
 
         remixer_input = simple_remixer_tab()
+        gpu_info_tab()
     Joutai.singleton.tabs.render()
 
 
diff --git a/src/bark/clone/tab_voice_clone.py b/src/bark/clone/tab_voice_clone.py
index d6706004..3daa1d9c 100644
--- a/src/bark/clone/tab_voice_clone.py
+++ b/src/bark/clone/tab_voice_clone.py
@@ -140,8 +140,29 @@ def tab_voice_clone(register_use_as_history_button):
                 """
             Unethical use of this technology is prohibited.
             This demo is based on https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer repository.
+
+            Information from the original repository (https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer?tab=readme-ov-file#voices-cloned-arent-very-convincing-why-are-other-peoples-cloned-voices-better-than-mine)
+
+            ## Voices cloned aren't very convincing, why are other people's cloned voices better than mine?
+            Make sure these things are **NOT** in your voice input: (in no particular order)
+            * Noise (You can use a noise remover before)
+            * Music (There are also music remover tools) (Unless you want music in the background)
+            * A cut-off at the end (This will cause it to try and continue on the generation)
+            * Under 1 second of training data (i personally suggest around 10 seconds for good potential, but i've had great results with 5 seconds as well.)
+
+            What makes for good prompt audio? (in no particular order)
+            * Clearly spoken
+            * No weird background noises
+            * Only one speaker
+            * Audio which ends after a sentence ends
+            * Regular/common voice (They usually have more success, it's still capable of cloning complex voices, but not as good at it)
+            * Around 10 seconds of data
+
             """
             )
+
+
+        with gr.Column():
             tokenizer_dropdown = gr.Dropdown(
                 label="Tokenizer",
                 choices=[
@@ -165,7 +186,6 @@ def tab_voice_clone(register_use_as_history_button):
                 source="upload",
                 interactive=True,
             )
-
             with gr.Row():
                 use_gpu_checkbox = gr.Checkbox(label="Use GPU", value=True)
                 clear_models_button = gr.Button(
@@ -207,8 +227,7 @@ def load_tokenizer(tokenizer_and_repo: str, use_gpu: bool):
                 outputs=[tokenizer_dropdown],
                 api_name="bark_voice_tokenizer_load",
             )
-
-        with gr.Column():
+            
             gr.Markdown("Generated voice:")
             voice_file_name = gr.Textbox(
                 label="Voice file name", value="", interactive=False
diff --git a/src/utils/gpu_info_tab.py b/src/utils/gpu_info_tab.py
new file mode 100644
index 00000000..c445b67d
--- /dev/null
+++ b/src/utils/gpu_info_tab.py
@@ -0,0 +1,46 @@
+import gradio as gr
+import torch
+
+
+def gpu_info_tab():
+    """Build the "GPU Info" tab: Markdown summary, Refresh button, hidden API button."""
+    with gr.Tab("GPU Info"):
+        gpu_info = gr.Markdown(refresh_gpu_info())
+        refresh_button = gr.Button("Refresh")
+        refresh_button.click(
+            fn=refresh_gpu_info, outputs=gpu_info, api_name="refresh_gpu_info"
+        )
+        # Invisible button; its click event registers get_gpu_info under an API name.
+        api_button = gr.Button("API_GET_GPU_INFO", visible=False)
+        api_button.click(fn=get_gpu_info, api_name="get_gpu_info")
+
+
+def get_gpu_info():
+    """Return a dict of stats for CUDA device 0, or a message string without CUDA."""
+    if not torch.cuda.is_available():
+        return "No GPU with CUDA support detected by PyTorch"
+
+    mib = 1024**2
+    # Query each API once instead of repeating the same driver call per field.
+    properties = torch.cuda.get_device_properties(0)
+    free_bytes, total_bytes = torch.cuda.mem_get_info(0)
+    return {
+        "vram": properties.total_memory / mib,
+        "name": properties.name,
+        "cuda_capabilities": torch.cuda.get_device_capability(0),
+        # Memory currently occupied by tensors allocated through PyTorch.
+        "used_vram": torch.cuda.memory_allocated(0) / mib,
+        # Device-wide usage: total minus free as reported by mem_get_info.
+        "used_vram_total": (total_bytes - free_bytes) / mib,
+    }
+
+
+def render_gpu_info(gpu_info):
+    """Format get_gpu_info() output as Markdown; non-dict input is returned as is."""
+    if not isinstance(gpu_info, dict):
+        return gpu_info
+    return f"VRAM: {gpu_info['vram']} MB\n\nUsed VRAM: {gpu_info['used_vram']} MB\n\nTotal Used VRAM: {gpu_info['used_vram_total']} MB\n\nName: {gpu_info['name']}\n\nCUDA Capabilities: {gpu_info['cuda_capabilities']}"
+
+
+def refresh_gpu_info():
+    return render_gpu_info(get_gpu_info())  # re-query the GPU, then re-render