[Inference Providers] add support for ASR with replicate (#3538)

hanouticelina · web-flow · commit 5c3a252eb502 · 2025-11-12T17:19:33.000+01:00
diff --git a/docs/source/en/guides/inference.md b/docs/source/en/guides/inference.md
@@ -196,7 +196,7 @@ For more details, refer to the [Inference Providers pricing documentation](https
 | --------------------------------------------------- | ----------------- | -------- | -------- | ------ | ------ | -------------- | ------------ | ---- | ------------ | ---------- | ---------------- | --------- | ------ | ---------- | --------- | --------- | --------- | -------- | --------- | ---- |
 | [`~InferenceClient.audio_classification`]           | ❌                 | ❌        | ❌        | ❌      | ❌      | ❌              | ❌            | ❌    | ✅            | ❌          | ❌                | ❌         | ❌         | ❌         | ❌        | ❌      | ❌          | ❌         | ❌         | ❌   |
 | [`~InferenceClient.audio_to_audio`]                 | ❌                 | ❌        | ❌        | ❌      | ❌      | ❌              | ❌            | ❌    | ✅            | ❌          | ❌                | ❌         | ❌         | ❌         | ❌        | ❌      | ❌          | ❌         | ❌         | ❌   |
-| [`~InferenceClient.automatic_speech_recognition`]   | ❌                 | ❌        | ❌        | ❌      | ✅      | ❌              | ❌            | ❌    | ✅            | ❌          | ❌                | ❌         | ❌         | ❌         | ❌        | ❌      | ❌          | ❌         | ❌         | ❌   |
+| [`~InferenceClient.automatic_speech_recognition`]   | ❌                 | ❌        | ❌        | ❌      | ✅      | ❌              | ❌            | ❌    | ✅            | ❌          | ❌                | ❌         | ❌         | ❌         | ✅        | ❌      | ❌          | ❌         | ❌         | ❌   |
 | [`~InferenceClient.chat_completion`]                | ❌                 | ✅        | ✅        | ✅      | ❌      | ✅              | ✅            | ✅    | ✅            | ✅          | ✅                | ✅         | ✅      | ✅          | ❌         | ✅         | ✅         | ✅        | ❌         | ✅   |
 | [`~InferenceClient.document_question_answering`]    | ❌                 | ❌        | ❌        | ❌      | ❌      | ❌              | ❌            | ❌    | ✅            | ❌          | ❌                | ❌         | ❌         | ❌         | ❌        | ❌      | ❌          | ❌         | ❌         | ❌   |
 | [`~InferenceClient.feature_extraction`]             | ❌                 | ❌        | ❌        | ❌      | ❌      | ❌              | ❌            | ❌    | ✅            | ❌          | ✅                | ❌         | ❌      | ❌          | ❌         | ✅         | ✅         | ❌        | ❌         | ❌   |
diff --git a/src/huggingface_hub/inference/_client.py b/src/huggingface_hub/inference/_client.py
@@ -452,6 +452,7 @@ def automatic_speech_recognition(
             api_key=self.token,
         )
         response = self._inner_post(request_parameters)
+        response = provider_helper.get_response(response, request_params=request_parameters)
         return AutomaticSpeechRecognitionOutput.parse_obj_as_instance(response)
 
     @overload
diff --git a/src/huggingface_hub/inference/_generated/_async_client.py b/src/huggingface_hub/inference/_generated/_async_client.py
@@ -472,6 +472,7 @@ async def automatic_speech_recognition(
             api_key=self.token,
         )
         response = await self._inner_post(request_parameters)
+        response = provider_helper.get_response(response, request_params=request_parameters)
         return AutomaticSpeechRecognitionOutput.parse_obj_as_instance(response)
 
     @overload
diff --git a/src/huggingface_hub/inference/_providers/__init__.py b/src/huggingface_hub/inference/_providers/__init__.py
@@ -39,7 +39,13 @@
 from .nscale import NscaleConversationalTask, NscaleTextToImageTask
 from .openai import OpenAIConversationalTask
 from .publicai import PublicAIConversationalTask
-from .replicate import ReplicateImageToImageTask, ReplicateTask, ReplicateTextToImageTask, ReplicateTextToSpeechTask
+from .replicate import (
+    ReplicateAutomaticSpeechRecognitionTask,
+    ReplicateImageToImageTask,
+    ReplicateTask,
+    ReplicateTextToImageTask,
+    ReplicateTextToSpeechTask,
+)
 from .sambanova import SambanovaConversationalTask, SambanovaFeatureExtractionTask
 from .scaleway import ScalewayConversationalTask, ScalewayFeatureExtractionTask
 from .together import TogetherConversationalTask, TogetherTextGenerationTask, TogetherTextToImageTask
@@ -170,6 +176,7 @@
         "conversational": PublicAIConversationalTask(),
     },
     "replicate": {
+        "automatic-speech-recognition": ReplicateAutomaticSpeechRecognitionTask(),
         "image-to-image": ReplicateImageToImageTask(),
         "text-to-image": ReplicateTextToImageTask(),
         "text-to-speech": ReplicateTextToSpeechTask(),
diff --git a/src/huggingface_hub/inference/_providers/fal_ai.py b/src/huggingface_hub/inference/_providers/fal_ai.py
@@ -112,7 +112,7 @@ def get_response(self, response: Union[bytes, dict], request_params: Optional[Re
         text = _as_dict(response)["text"]
         if not isinstance(text, str):
             raise ValueError(f"Unexpected output format from FalAI API. Expected string, got {type(text)}.")
-        return text
+        return {"text": text}
 
 
 class FalAITextToImageTask(FalAITask):
diff --git a/src/huggingface_hub/inference/_providers/replicate.py b/src/huggingface_hub/inference/_providers/replicate.py
@@ -72,6 +72,67 @@ def _prepare_payload_as_dict(
         return payload
 
 
+class ReplicateAutomaticSpeechRecognitionTask(ReplicateTask):
+    """Replicate helper for the `automatic-speech-recognition` task."""
+
+    def __init__(self) -> None:
+        super().__init__("automatic-speech-recognition")
+
+    def _prepare_payload_as_dict(
+        self, inputs: Any, parameters: dict, provider_mapping_info: InferenceProviderMapping
+    ) -> Optional[dict]:
+        """Build the request body: the audio as a URL plus every parameter kept by `filter_none`."""
+        provider_id = provider_mapping_info.provider_id
+        # Later keys win, so a parameter named "audio" would replace the converted input.
+        model_input = {"audio": _as_url(inputs, default_mime_type="audio/wav"), **filter_none(parameters)}
+        payload: dict[str, Any] = {"input": model_input}
+
+        # Everything after the first ":" of the provider id is sent as the model version.
+        if ":" in provider_id:
+            payload["version"] = provider_id.partition(":")[2]
+
+        return payload
+
+    def get_response(self, response: Union[bytes, dict], request_params: Optional[RequestParameters] = None) -> Any:
+        """Normalize the `output` field of a Replicate response into `{"text": ...}`.
+
+        Accepted shapes of `output`: a string, a non-empty list whose first item is a string or
+        a dict, or a dict with a string under `transcription`, `translation` or `txt_file`.
+
+        Raises:
+            ValueError: If no text can be extracted from `output`.
+        """
+        output = _as_dict(response).get("output")
+
+        # Only the first element of a non-empty list is considered.
+        if isinstance(output, list) and output:
+            head = output[0]
+            if isinstance(head, (str, dict)):
+                output = head
+
+        if isinstance(output, str):
+            return {"text": output}
+
+        text: Optional[str] = None
+        if isinstance(output, dict):
+            # Ordered from lowest to highest priority: a later string value replaces an earlier one.
+            for key in ("transcription", "translation"):
+                inline = output.get(key)
+                if isinstance(inline, str):
+                    text = inline
+
+            file_url = output.get("txt_file")
+            if isinstance(file_url, str):
+                # The linked file is downloaded and its body overrides any inline text found above.
+                fetched = get_session().get(file_url)
+                fetched.raise_for_status()
+                text = fetched.text
+
+        if text is None:
+            raise ValueError("Received malformed response from Replicate automatic-speech-recognition API")
+        return {"text": text}
+
+
 class ReplicateImageToImageTask(ReplicateTask):
     def __init__(self):
         super().__init__("image-to-image")
diff --git a/tests/test_inference_providers.py b/tests/test_inference_providers.py
@@ -48,6 +48,7 @@
 from huggingface_hub.inference._providers.openai import OpenAIConversationalTask
 from huggingface_hub.inference._providers.publicai import PublicAIConversationalTask
 from huggingface_hub.inference._providers.replicate import (
+    ReplicateAutomaticSpeechRecognitionTask,
     ReplicateImageToImageTask,
     ReplicateTask,
     ReplicateTextToSpeechTask,
@@ -396,7 +397,7 @@ def test_automatic_speech_recognition_payload(self):
     def test_automatic_speech_recognition_response(self):
         helper = FalAIAutomaticSpeechRecognitionTask()
         response = helper.get_response({"text": "Hello world"})
-        assert response == "Hello world"
+        assert response == {"text": "Hello world"}
 
         with pytest.raises(ValueError):
             helper.get_response({"text": 123})
@@ -1423,6 +1424,74 @@ def test_prepare_url(self):
 
 
 class TestReplicateProvider:
+    def test_automatic_speech_recognition_payload(self):
+        """Check the payload built for automatic speech recognition.
+
+        Covers a URL input with a plain provider id, then raw bytes with a `model:version` id.
+        """
+        task = ReplicateAutomaticSpeechRecognitionTask()
+        model_id = "openai/whisper-large-v3"
+        audio_link = "https://example.com/audio.mp3"
+        raw_audio = b"dummy-audio"
+
+        # Provider id without ":" -> no "version" key is expected in the payload.
+        unversioned = InferenceProviderMapping(
+            provider="replicate",
+            hf_model_id=model_id,
+            providerId=model_id,
+            task="automatic-speech-recognition",
+            status="live",
+        )
+        # Provider id with ":" -> the part after it is expected back under "version".
+        versioned = InferenceProviderMapping(
+            provider="replicate",
+            hf_model_id=model_id,
+            providerId=f"{model_id}:123",
+            task="automatic-speech-recognition",
+            status="live",
+        )
+
+        # A URL string is forwarded untouched, next to the extra parameters.
+        from_url = task._prepare_payload_as_dict(audio_link, {"language": "en"}, unversioned)
+        assert from_url == {
+            "input": {"audio": audio_link, "language": "en"},
+        }
+
+        # Raw bytes become a base64 data URL with the "audio/wav" mime type.
+        from_bytes = task._prepare_payload_as_dict(raw_audio, {}, versioned)
+        expected_audio = "data:audio/wav;base64," + base64.b64encode(raw_audio).decode()
+        assert from_bytes == {
+            "input": {"audio": expected_audio},
+            "version": "123",
+        }
+
+    def test_automatic_speech_recognition_get_response_variants(self, mocker):
+        """Every supported `output` shape maps to `{"text": ...}`; anything else raises `ValueError`."""
+        task = ReplicateAutomaticSpeechRecognitionTask()
+
+        # Shapes that are resolved from the response itself.
+        inline_cases = [
+            ("hello", "hello"),
+            (["hello-world"], "hello-world"),
+            ({"transcription": "bonjour"}, "bonjour"),
+            ({"translation": "hola"}, "hola"),
+        ]
+        for output, expected in inline_cases:
+            assert task.get_response({"output": output}) == {"text": expected}
+
+        # `txt_file` is downloaded through `get_session()`, patched here to stay offline.
+        file_url = "https://example.com/output.txt"
+        session_factory = mocker.patch("huggingface_hub.inference._providers.replicate.get_session")
+        fake_get = session_factory.return_value.get
+        fake_get.return_value = mocker.Mock(text="file text", raise_for_status=lambda: None)
+
+        assert task.get_response({"output": {"txt_file": file_url}}) == {"text": "file text"}
+        fake_get.assert_called_once_with(file_url)
+
+        # Unsupported `output` types are rejected.
+        with pytest.raises(ValueError):
+            task.get_response({"output": 123})
+
     def test_prepare_headers(self):
         helper = ReplicateTask("text-to-image")
         headers = helper._prepare_headers({}, "my_replicate_key")

Original file line number	Diff line number	Diff line change
`@@ -452,6 +452,7 @@ def automatic_speech_recognition(`
`452`	`452`	`api_key=self.token,`
`453`	`453`	`)`
`454`	`454`	`response = self._inner_post(request_parameters)`
	`455`	`+ response = provider_helper.get_response(response, request_params=request_parameters)`
`455`	`456`	`return AutomaticSpeechRecognitionOutput.parse_obj_as_instance(response)`
`456`	`457`
`457`	`458`	`@overload`
Original file line number	Diff line number	Diff line change
`@@ -472,6 +472,7 @@ async def automatic_speech_recognition(`
`472`	`472`	`api_key=self.token,`
`473`	`473`	`)`
`474`	`474`	`response = await self._inner_post(request_parameters)`
	`475`	`+ response = provider_helper.get_response(response, request_params=request_parameters)`
`475`	`476`	`return AutomaticSpeechRecognitionOutput.parse_obj_as_instance(response)`
`476`	`477`
`477`	`478`	`@overload`