From 63aada50b631ce758920959ef88fd48c77362b38 Mon Sep 17 00:00:00 2001 From: Lucain Date: Fri, 31 Jan 2025 14:02:15 +0100 Subject: [PATCH] Add YuE (music gen) from fal.ai (#2801) * YuE first test (fal.ai provider) * pass genres as extra parameter * docstring * things --- src/huggingface_hub/inference/_client.py | 31 +++++++++++++++++++ .../inference/_generated/_async_client.py | 31 +++++++++++++++++++ .../inference/_providers/__init__.py | 10 ++++-- .../inference/_providers/fal_ai.py | 18 +++++++++++ 4 files changed, 88 insertions(+), 2 deletions(-) diff --git a/src/huggingface_hub/inference/_client.py b/src/huggingface_hub/inference/_client.py index 72125f8748..9f2f96d963 100644 --- a/src/huggingface_hub/inference/_client.py +++ b/src/huggingface_hub/inference/_client.py @@ -2750,6 +2750,37 @@ def text_to_speech( ... ) >>> Path("hello.flac").write_bytes(audio) ``` + + Example music-gen using "YuE-s1-7B-anneal-en-cot" on fal.ai + ```py + >>> from huggingface_hub import InferenceClient + >>> lyrics = ''' + ... [verse] + ... In the town where I was born + ... Lived a man who sailed to sea + ... And he told us of his life + ... In the land of submarines + ... So we sailed on to the sun + ... 'Til we found a sea of green + ... And we lived beneath the waves + ... In our yellow submarine + + ... [chorus] + ... We all live in a yellow submarine + ... Yellow submarine, yellow submarine + ... We all live in a yellow submarine + ... Yellow submarine, yellow submarine + ... ''' + >>> genres = "pavarotti-style tenor voice" + >>> client = InferenceClient( + ... provider="fal-ai", + ... model="m-a-p/YuE-s1-7B-anneal-en-cot", + ... api_key=..., + ... ) + >>> audio = client.text_to_speech(lyrics, extra_parameters={"genres": genres}) + >>> with open("output.mp3", "wb") as f: + ... f.write(audio) + ``` """ provider_helper = get_provider_helper(self.provider, task="text-to-speech") request_parameters = provider_helper.prepare_request( diff --git a/src/huggingface_hub/inference/_generated/_async_client.py b/src/huggingface_hub/inference/_generated/_async_client.py index 5de9094271..5b686edac8 100644 --- a/src/huggingface_hub/inference/_generated/_async_client.py +++ b/src/huggingface_hub/inference/_generated/_async_client.py @@ -2808,6 +2808,37 @@ async def text_to_speech( ... ) >>> Path("hello.flac").write_bytes(audio) ``` + + Example music-gen using "YuE-s1-7B-anneal-en-cot" on fal.ai + ```py + >>> from huggingface_hub import InferenceClient + >>> lyrics = ''' + ... [verse] + ... In the town where I was born + ... Lived a man who sailed to sea + ... And he told us of his life + ... In the land of submarines + ... So we sailed on to the sun + ... 'Til we found a sea of green + ... And we lived beneath the waves + ... In our yellow submarine + + ... [chorus] + ... We all live in a yellow submarine + ... Yellow submarine, yellow submarine + ... We all live in a yellow submarine + ... Yellow submarine, yellow submarine + ... ''' + >>> genres = "pavarotti-style tenor voice" + >>> client = InferenceClient( + ... provider="fal-ai", + ... model="m-a-p/YuE-s1-7B-anneal-en-cot", + ... api_key=..., + ... ) + >>> audio = client.text_to_speech(lyrics, extra_parameters={"genres": genres}) + >>> with open("output.mp3", "wb") as f: + ... f.write(audio) + ``` """ provider_helper = get_provider_helper(self.provider, task="text-to-speech") request_parameters = provider_helper.prepare_request( diff --git a/src/huggingface_hub/inference/_providers/__init__.py b/src/huggingface_hub/inference/_providers/__init__.py index 95c64d7da6..48fc556246 100644 --- a/src/huggingface_hub/inference/_providers/__init__.py +++ b/src/huggingface_hub/inference/_providers/__init__.py @@ -1,7 +1,12 @@ from typing import Dict, Literal from .._common import TaskProviderHelper -from .fal_ai import FalAIAutomaticSpeechRecognitionTask, FalAITextToImageTask, FalAITextToVideoTask +from .fal_ai import ( + FalAIAutomaticSpeechRecognitionTask, + FalAITextToImageTask, + FalAITextToSpeechTask, + FalAITextToVideoTask, +) from .hf_inference import HFInferenceBinaryInputTask, HFInferenceConversational, HFInferenceTask from .replicate import ReplicateTask, ReplicateTextToSpeechTask from .sambanova import SambanovaConversationalTask @@ -18,8 +23,9 @@ PROVIDERS: Dict[PROVIDER_T, Dict[str, TaskProviderHelper]] = { "fal-ai": { - "text-to-image": FalAITextToImageTask(), "automatic-speech-recognition": FalAIAutomaticSpeechRecognitionTask(), + "text-to-image": FalAITextToImageTask(), + "text-to-speech": FalAITextToSpeechTask(), "text-to-video": FalAITextToVideoTask(), }, "hf-inference": { diff --git a/src/huggingface_hub/inference/_providers/fal_ai.py b/src/huggingface_hub/inference/_providers/fal_ai.py index 00cecd7e9f..ec8e7277fc 100644 --- a/src/huggingface_hub/inference/_providers/fal_ai.py +++ b/src/huggingface_hub/inference/_providers/fal_ai.py @@ -28,6 +28,9 @@ "stabilityai/stable-diffusion-3.5-large": "fal-ai/stable-diffusion-v35-large", "Warlord-K/Sana-1024": "fal-ai/sana", }, + "text-to-speech": { + "m-a-p/YuE-s1-7B-anneal-en-cot": "fal-ai/yue", + }, "text-to-video": { "genmo/mochi-1-preview": "fal-ai/mochi-v1", "tencent/HunyuanVideo": "fal-ai/hunyuan-video", @@ -146,6 +149,21 @@ def get_response(self, response: Union[bytes, Dict]) -> Any: return get_session().get(url).content +class FalAITextToSpeechTask(FalAITask): + def __init__(self): + super().__init__("text-to-speech") + + def _prepare_payload(self, inputs: Any, parameters: Dict[str, Any]) -> Dict[str, Any]: + return { + "lyrics": inputs, + **{k: v for k, v in parameters.items() if v is not None}, + } + + def get_response(self, response: Union[bytes, Dict]) -> Any: + url = _as_dict(response)["audio"]["url"] + return get_session().get(url).content + + class FalAITextToVideoTask(FalAITask): def __init__(self): super().__init__("text-to-video")