From 63aada50b631ce758920959ef88fd48c77362b38 Mon Sep 17 00:00:00 2001
From: Lucain <lucain@huggingface.co>
Date: Fri, 31 Jan 2025 14:02:15 +0100
Subject: [PATCH] Add YuE (music gen) from fal.ai (#2801)

* YuE first test (fal.ai provider)

* pass genres as extra parameter

* docstring

* things
---
 src/huggingface_hub/inference/_client.py      | 31 +++++++++++++++++++
 .../inference/_generated/_async_client.py     | 31 +++++++++++++++++++
 .../inference/_providers/__init__.py          | 10 ++++--
 .../inference/_providers/fal_ai.py            | 18 +++++++++++
 4 files changed, 88 insertions(+), 2 deletions(-)

diff --git a/src/huggingface_hub/inference/_client.py b/src/huggingface_hub/inference/_client.py
index 72125f8748..9f2f96d963 100644
--- a/src/huggingface_hub/inference/_client.py
+++ b/src/huggingface_hub/inference/_client.py
@@ -2750,6 +2750,37 @@ def text_to_speech(
         ... )
         >>> Path("hello.flac").write_bytes(audio)
         ```
+
+        Example music-gen using "YuE-s1-7B-anneal-en-cot" on fal.ai
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> lyrics = '''
+        ... [verse]
+        ... In the town where I was born
+        ... Lived a man who sailed to sea
+        ... And he told us of his life
+        ... In the land of submarines
+        ... So we sailed on to the sun
+        ... 'Til we found a sea of green
+        ... And we lived beneath the waves
+        ... In our yellow submarine
+
+        ... [chorus]
+        ... We all live in a yellow submarine
+        ... Yellow submarine, yellow submarine
+        ... We all live in a yellow submarine
+        ... Yellow submarine, yellow submarine
+        ... '''
+        >>> genres = "pavarotti-style tenor voice"
+        >>> client = InferenceClient(
+        ...     provider="fal-ai",
+        ...     model="m-a-p/YuE-s1-7B-anneal-en-cot",
+        ...     api_key=...,
+        ... )
+        >>> audio = client.text_to_speech(lyrics, extra_parameters={"genres": genres})
+        >>> with open("output.mp3", "wb") as f:
+        ...     f.write(audio)
+        ```
         """
         provider_helper = get_provider_helper(self.provider, task="text-to-speech")
         request_parameters = provider_helper.prepare_request(
diff --git a/src/huggingface_hub/inference/_generated/_async_client.py b/src/huggingface_hub/inference/_generated/_async_client.py
index 5de9094271..5b686edac8 100644
--- a/src/huggingface_hub/inference/_generated/_async_client.py
+++ b/src/huggingface_hub/inference/_generated/_async_client.py
@@ -2808,6 +2808,37 @@ async def text_to_speech(
         ... )
         >>> Path("hello.flac").write_bytes(audio)
         ```
+
+        Example music-gen using "YuE-s1-7B-anneal-en-cot" on fal.ai
+        ```py
+        >>> from huggingface_hub import InferenceClient
+        >>> lyrics = '''
+        ... [verse]
+        ... In the town where I was born
+        ... Lived a man who sailed to sea
+        ... And he told us of his life
+        ... In the land of submarines
+        ... So we sailed on to the sun
+        ... 'Til we found a sea of green
+        ... And we lived beneath the waves
+        ... In our yellow submarine
+
+        ... [chorus]
+        ... We all live in a yellow submarine
+        ... Yellow submarine, yellow submarine
+        ... We all live in a yellow submarine
+        ... Yellow submarine, yellow submarine
+        ... '''
+        >>> genres = "pavarotti-style tenor voice"
+        >>> client = InferenceClient(
+        ...     provider="fal-ai",
+        ...     model="m-a-p/YuE-s1-7B-anneal-en-cot",
+        ...     api_key=...,
+        ... )
+        >>> audio = client.text_to_speech(lyrics, extra_parameters={"genres": genres})
+        >>> with open("output.mp3", "wb") as f:
+        ...     f.write(audio)
+        ```
         """
         provider_helper = get_provider_helper(self.provider, task="text-to-speech")
         request_parameters = provider_helper.prepare_request(
diff --git a/src/huggingface_hub/inference/_providers/__init__.py b/src/huggingface_hub/inference/_providers/__init__.py
index 95c64d7da6..48fc556246 100644
--- a/src/huggingface_hub/inference/_providers/__init__.py
+++ b/src/huggingface_hub/inference/_providers/__init__.py
@@ -1,7 +1,12 @@
 from typing import Dict, Literal
 
 from .._common import TaskProviderHelper
-from .fal_ai import FalAIAutomaticSpeechRecognitionTask, FalAITextToImageTask, FalAITextToVideoTask
+from .fal_ai import (
+    FalAIAutomaticSpeechRecognitionTask,
+    FalAITextToImageTask,
+    FalAITextToSpeechTask,
+    FalAITextToVideoTask,
+)
 from .hf_inference import HFInferenceBinaryInputTask, HFInferenceConversational, HFInferenceTask
 from .replicate import ReplicateTask, ReplicateTextToSpeechTask
 from .sambanova import SambanovaConversationalTask
@@ -18,8 +23,9 @@
 
 PROVIDERS: Dict[PROVIDER_T, Dict[str, TaskProviderHelper]] = {
     "fal-ai": {
-        "text-to-image": FalAITextToImageTask(),
         "automatic-speech-recognition": FalAIAutomaticSpeechRecognitionTask(),
+        "text-to-image": FalAITextToImageTask(),
+        "text-to-speech": FalAITextToSpeechTask(),
         "text-to-video": FalAITextToVideoTask(),
     },
     "hf-inference": {
diff --git a/src/huggingface_hub/inference/_providers/fal_ai.py b/src/huggingface_hub/inference/_providers/fal_ai.py
index 00cecd7e9f..ec8e7277fc 100644
--- a/src/huggingface_hub/inference/_providers/fal_ai.py
+++ b/src/huggingface_hub/inference/_providers/fal_ai.py
@@ -28,6 +28,9 @@
         "stabilityai/stable-diffusion-3.5-large": "fal-ai/stable-diffusion-v35-large",
         "Warlord-K/Sana-1024": "fal-ai/sana",
     },
+    "text-to-speech": {
+        "m-a-p/YuE-s1-7B-anneal-en-cot": "fal-ai/yue",
+    },
     "text-to-video": {
         "genmo/mochi-1-preview": "fal-ai/mochi-v1",
         "tencent/HunyuanVideo": "fal-ai/hunyuan-video",
@@ -146,6 +149,21 @@ def get_response(self, response: Union[bytes, Dict]) -> Any:
         return get_session().get(url).content
 
 
+class FalAITextToSpeechTask(FalAITask):
+    def __init__(self):
+        super().__init__("text-to-speech")
+
+    def _prepare_payload(self, inputs: Any, parameters: Dict[str, Any]) -> Dict[str, Any]:
+        return {
+            "lyrics": inputs,
+            **{k: v for k, v in parameters.items() if v is not None},
+        }
+
+    def get_response(self, response: Union[bytes, Dict]) -> Any:
+        url = _as_dict(response)["audio"]["url"]
+        return get_session().get(url).content
+
+
 class FalAITextToVideoTask(FalAITask):
     def __init__(self):
         super().__init__("text-to-video")