From 798d082448fe35df63668b83f45cbd19e2e27616 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Wed, 8 Jan 2025 17:02:03 +0100 Subject: [PATCH 01/50] draft initial implementation of Realtime API --- python/pyproject.toml | 6 + .../audio/04-chat_with_realtime_api.py | 126 ++++++++ python/samples/concepts/audio/audio_player.py | 2 +- .../concepts/audio/audio_player_async.py | 75 +++++ .../concepts/audio/audio_recorder_stream.py | 59 ++++ .../ai/chat_completion_client_base.py | 28 +- .../open_ai_realtime_execution_settings.py | 48 +++ .../ai/open_ai/services/open_ai_realtime.py | 66 ++++ .../open_ai/services/open_ai_realtime_base.py | 294 ++++++++++++++++++ .../services/open_ai_realtime_utils.py | 47 +++ .../connectors/ai/realtime_client_base.py | 51 +++ .../contents/chat_message_content.py | 2 + .../contents/function_call_content.py | 1 + .../streaming_chat_message_content.py | 2 + .../tests/unit/contents/test_audio_content.py | 60 ++++ python/uv.lock | 111 +++++-- 16 files changed, 940 insertions(+), 38 deletions(-) create mode 100644 python/samples/concepts/audio/04-chat_with_realtime_api.py create mode 100644 python/samples/concepts/audio/audio_player_async.py create mode 100644 python/samples/concepts/audio/audio_recorder_stream.py create mode 100644 python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py create mode 100644 python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py create mode 100644 python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py create mode 100644 python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_utils.py create mode 100644 python/semantic_kernel/connectors/ai/realtime_client_base.py create mode 100644 python/tests/unit/contents/test_audio_content.py diff --git a/python/pyproject.toml b/python/pyproject.toml index a7d7277eef61..c33bf21ede6c 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -128,6 +128,12 @@ 
dapr = [ "dapr-ext-fastapi>=1.14.0", "flask-dapr>=1.14.0" ] +openai_realtime = [ + "openai[realtime] ~= 1.0", + "pyaudio", + "pydub", + "sounddevice" +] [tool.uv] prerelease = "if-necessary-or-explicit" diff --git a/python/samples/concepts/audio/04-chat_with_realtime_api.py b/python/samples/concepts/audio/04-chat_with_realtime_api.py new file mode 100644 index 000000000000..4440d13b8eec --- /dev/null +++ b/python/samples/concepts/audio/04-chat_with_realtime_api.py @@ -0,0 +1,126 @@ +# Copyright (c) Microsoft. All rights reserved. +import asyncio +import contextlib +import logging +import signal + +from samples.concepts.audio.audio_player_async import AudioPlayerAsync + +# This simple sample demonstrates how to use the OpenAI Realtime API to create +# a chat bot that can listen and respond directly through audio. +# It requires installing semantic-kernel[openai_realtime] which includes the +# OpenAI Realtime API client and some packages for handling audio locally. +# It has hardcoded device id's set in the AudioRecorderStream and AudioPlayerAsync classes, +# so you may need to adjust these for your system. +from samples.concepts.audio.audio_recorder_stream import AudioRecorderStream +from semantic_kernel import Kernel +from semantic_kernel.connectors.ai import FunctionChoiceBehavior +from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( + OpenAIRealtimeExecutionSettings, + TurnDetection, +) +from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime import OpenAIRealtime +from semantic_kernel.contents import AudioContent, ChatHistory, StreamingTextContent +from semantic_kernel.functions import kernel_function + +logging.basicConfig(level=logging.WARNING) +logger = logging.getLogger(__name__) + + +def signal_handler(): + for task in asyncio.all_tasks(): + task.cancel() + + +system_message = """ +You are a chat bot. Your name is Mosscap and +you have one goal: figure out what people need. 
+Your full name, should you need to know it, is +Splendid Speckled Mosscap. You communicate +effectively, but you tend to answer with long +flowery prose. +""" + +history = ChatHistory() +history.add_user_message("Hi there, who are you?") +history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.") + + +class Speaker: + def __init__(self, audio_player: AudioPlayerAsync, realtime_client: OpenAIRealtime, kernel: Kernel): + self.audio_player = audio_player + self.realtime_client = realtime_client + self.kernel = kernel + + async def play( + self, + chat_history: ChatHistory, + settings: OpenAIRealtimeExecutionSettings, + ) -> None: + self.audio_player.reset_frame_count() + print("Mosscap (transcript): ", end="") + try: + async for content in self.realtime_client.get_streaming_chat_message_content( + chat_history=chat_history, settings=settings, kernel=self.kernel + ): + if not content: + continue + for item in content.items: + match item: + case StreamingTextContent(): + print(item.text, end="") + await asyncio.sleep(0.01) + continue + case AudioContent(): + self.audio_player.add_data(item.data) + await asyncio.sleep(0.01) + continue + except asyncio.CancelledError: + print("\nThanks for talking to Mosscap!") + + +class Microphone: + def __init__(self, audio_recorder: AudioRecorderStream, realtime_client: OpenAIRealtime): + self.audio_recorder = audio_recorder + self.realtime_client = realtime_client + + async def record_audio(self): + with contextlib.suppress(asyncio.CancelledError): + async for audio in self.audio_recorder.stream_audio_content(): + if audio.data: + await self.realtime_client.send_content(content=audio) + await asyncio.sleep(0.01) + + +@kernel_function +def get_weather(location: str) -> str: + """Get the weather for a location.""" + logger.debug(f"Getting weather for {location}") + return f"The weather in {location} is sunny." 
+ + +async def main() -> None: + loop = asyncio.get_running_loop() + loop.add_signal_handler(signal.SIGINT, signal_handler) # Ctrl+C cancels every running task + settings = OpenAIRealtimeExecutionSettings( + instructions=system_message, + voice="sage", + turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8), + function_choice_behavior=FunctionChoiceBehavior.Auto(), + ) + realtime_client = OpenAIRealtime(ai_model_id="gpt-4o-realtime-preview-2024-12-17") + kernel = Kernel() + kernel.add_function(plugin_name="weather", function_name="get_weather", function=get_weather) + + speaker = Speaker(AudioPlayerAsync(), realtime_client, kernel) + microphone = Microphone(AudioRecorderStream(), realtime_client) + with contextlib.suppress(asyncio.CancelledError): + await asyncio.gather(*[speaker.play(history, settings), microphone.record_audio()]) + + +if __name__ == "__main__": + print( + "Instruction: start speaking, when you stop the API should detect you finished and start responding. " + "Press ctrl + c to stop the program." + ) + asyncio.run(main()) diff --git a/python/samples/concepts/audio/audio_player.py b/python/samples/concepts/audio/audio_player.py index b10c15184821..036b978dcff1 100644 --- a/python/samples/concepts/audio/audio_player.py +++ b/python/samples/concepts/audio/audio_player.py @@ -20,7 +20,7 @@ class AudioPlayer(BaseModel): # Audio replay parameters CHUNK: ClassVar[int] = 1024 - audio_content: AudioContent + audio_content: AudioContent | None = None def play(self, text: str | None = None) -> None: """Play the audio content to the default audio output device. diff --git a/python/samples/concepts/audio/audio_player_async.py b/python/samples/concepts/audio/audio_player_async.py new file mode 100644 index 000000000000..9ae424b01c66 --- /dev/null +++ b/python/samples/concepts/audio/audio_player_async.py @@ -0,0 +1,75 @@ +# Copyright (c) Microsoft. All rights reserved.
+ +import threading + +import numpy as np +import pyaudio +import sounddevice as sd + +CHUNK_LENGTH_S = 0.05 # 50ms +SAMPLE_RATE = 24000 +FORMAT = pyaudio.paInt16 +CHANNELS = 1 + + +class AudioPlayerAsync: + def __init__(self): + self.queue = [] + self.lock = threading.Lock() + self.stream = sd.OutputStream( + callback=self.callback, + samplerate=SAMPLE_RATE, + channels=CHANNELS, + dtype=np.int16, + blocksize=int(CHUNK_LENGTH_S * SAMPLE_RATE), + device=3, + ) + self.playing = False + self._frame_count = 0 + + def callback(self, outdata, frames, time, status): # noqa + with self.lock: + data = np.empty(0, dtype=np.int16) + + # get next item from queue if there is still space in the buffer + while len(data) < frames and len(self.queue) > 0: + item = self.queue.pop(0) + frames_needed = frames - len(data) + data = np.concatenate((data, item[:frames_needed])) + if len(item) > frames_needed: + self.queue.insert(0, item[frames_needed:]) + + self._frame_count += len(data) + + # fill the rest of the frames with zeros if there is no more data + if len(data) < frames: + data = np.concatenate((data, np.zeros(frames - len(data), dtype=np.int16))) + + outdata[:] = data.reshape(-1, 1) + + def reset_frame_count(self): + self._frame_count = 0 + + def get_frame_count(self): + return self._frame_count + + def add_data(self, data: bytes): + with self.lock: + # bytes is pcm16 single channel audio data, convert to numpy array + np_data = np.frombuffer(data, dtype=np.int16) + self.queue.append(np_data) + if not self.playing: + self.start() + + def start(self): + self.playing = True + self.stream.start() + + def stop(self): + self.playing = False + self.stream.stop() + with self.lock: + self.queue = [] + + def terminate(self): + self.stream.close() diff --git a/python/samples/concepts/audio/audio_recorder_stream.py b/python/samples/concepts/audio/audio_recorder_stream.py new file mode 100644 index 000000000000..99ac1a9f8141 --- /dev/null +++
b/python/samples/concepts/audio/audio_recorder_stream.py @@ -0,0 +1,59 @@ +# Copyright (c) Microsoft. All rights reserved. + +import asyncio +import base64 +from collections.abc import AsyncGenerator +from typing import Any, ClassVar, cast + +from pydantic import BaseModel + +from semantic_kernel.contents.audio_content import AudioContent + + +class AudioRecorderStream(BaseModel): + """A class to record audio from the microphone and stream it as AudioContent chunks. + + Recording starts when `stream_audio_content` is first iterated and runs until the generator is closed or cancelled. + + Each chunk is yielded as base64-encoded 16-bit PCM; consume it with an async for loop: + ``` + recorder = AudioRecorderStream() + async for audio in recorder.stream_audio_content(): + # Do something with the recorded audio + ... + ``` + """ + + # Audio recording parameters + CHANNELS: ClassVar[int] = 1 + SAMPLE_RATE: ClassVar[int] = 24000 + CHUNK_LENGTH_S: ClassVar[float] = 0.05 + + async def stream_audio_content(self) -> AsyncGenerator[AudioContent, None]: + import sounddevice as sd # type: ignore + + # device_info = sd.query_devices() + # print(device_info) + + read_size = int(self.SAMPLE_RATE * 0.02) + + stream = sd.InputStream( + channels=self.CHANNELS, + samplerate=self.SAMPLE_RATE, + dtype="int16", + device=4, + ) + stream.start() + try: + while True: + if stream.read_available < read_size: + await asyncio.sleep(0) + continue + + data, _ = stream.read(read_size) + yield AudioContent(data=base64.b64encode(cast(Any, data)), data_format="base64", mime_type="audio/wav") + except KeyboardInterrupt: + pass + finally: + stream.stop() + stream.close() diff --git a/python/semantic_kernel/connectors/ai/chat_completion_client_base.py b/python/semantic_kernel/connectors/ai/chat_completion_client_base.py index b7be735e95d8..a402fad10b53 100644 --- a/python/semantic_kernel/connectors/ai/chat_completion_client_base.py +++ b/python/semantic_kernel/connectors/ai/chat_completion_client_base.py @@ -222,7 +222,7 @@ async def
get_streaming_chat_message_contents( if not self.SUPPORTS_FUNCTION_CALLING: async for streaming_chat_message_contents in self._inner_get_streaming_chat_message_contents( - chat_history, settings + chat_history, settings, **kwargs ): yield streaming_chat_message_contents return @@ -247,7 +247,7 @@ async def get_streaming_chat_message_contents( or not settings.function_choice_behavior.auto_invoke_kernel_functions ): async for streaming_chat_message_contents in self._inner_get_streaming_chat_message_contents( - chat_history, settings + chat_history, settings, **kwargs ): yield streaming_chat_message_contents return @@ -259,12 +259,14 @@ async def get_streaming_chat_message_contents( all_messages: list["StreamingChatMessageContent"] = [] function_call_returned = False async for messages in self._inner_get_streaming_chat_message_contents( - chat_history, settings, request_index + chat_history, settings, request_index, **kwargs ): for msg in messages: if msg is not None: all_messages.append(msg) - if any(isinstance(item, FunctionCallContent) for item in msg.items): + if not function_call_returned and any( + isinstance(item, FunctionCallContent) for item in msg.items + ): function_call_returned = True yield messages @@ -310,6 +312,7 @@ async def get_streaming_chat_message_contents( function_invoke_attempt=request_index, ) if self._yield_function_result_messages(function_result_messages): + await self._streaming_function_call_result_callback(function_result_messages) yield function_result_messages if any(result.terminate for result in results if result is not None): @@ -432,7 +435,22 @@ def _get_ai_model_id(self, settings: "PromptExecutionSettings") -> str: return getattr(settings, "ai_model_id", self.ai_model_id) or self.ai_model_id def _yield_function_result_messages(self, function_result_messages: list) -> bool: - """Determine if the function result messages should be yielded.""" + """Determine if the function result messages should be yielded. 
+ + If there are messages and if the first message has items, then yield the messages. + """ return len(function_result_messages) > 0 and len(function_result_messages[0].items) > 0 + async def _streaming_function_call_result_callback( + self, function_result_messages: list["ChatMessageContent"] + ) -> None: + """Callback to handle the streaming function call result messages. + + Override this method to handle the streaming function call result messages. + + Args: + function_result_messages (list): The streaming function call result messages. + """ + return + # endregion diff --git a/python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py b/python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py new file mode 100644 index 000000000000..480e2ed1373f --- /dev/null +++ b/python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py @@ -0,0 +1,48 @@ +# Copyright (c) Microsoft. All rights reserved. 
+ +from collections.abc import Sequence +from typing import Annotated, Any, Literal + +from pydantic import Field + +from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings +from semantic_kernel.kernel_pydantic import KernelBaseModel + + +class TurnDetection(KernelBaseModel): + """Turn detection settings.""" + + type: Literal["server_vad"] | None = None + threshold: Annotated[float | None, Field(ge=0, le=1)] = None + prefix_padding_ms: Annotated[int | None, Field(ge=0)] = None + silence_duration_ms: Annotated[int | None, Field(ge=0)] = None + create_response: bool | None = None + + +class OpenAIRealtimeExecutionSettings(PromptExecutionSettings): + """Request settings for OpenAI realtime services.""" + + modalities: Sequence[Literal["audio", "text"]] | None = None + ai_model_id: Annotated[str | None, Field(None, serialization_alias="model")] = None + instructions: str | None = None + voice: str | None = None + input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | None = None + output_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | None = None + input_audio_transcription: dict[str, Any] | None = None + turn_detection: TurnDetection | None = None + tools: Annotated[ + list[dict[str, Any]] | None, + Field( + description="Do not set this manually. It is set by the service based " + "on the function choice configuration.", + ), + ] = None + tool_choice: Annotated[ + str | None, + Field( + description="Do not set this manually. 
It is set by the service based " + "on the function choice configuration.", + ), + ] = None + temperature: Annotated[float | None, Field(ge=0.0, le=2.0)] = None + max_response_output_tokens: Annotated[int | Literal["inf"] | None, Field(gt=0)] = None diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py new file mode 100644 index 000000000000..23351d7b6176 --- /dev/null +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py @@ -0,0 +1,66 @@ +# Copyright (c) Microsoft. All rights reserved. + +from collections.abc import Mapping + +from openai import AsyncOpenAI +from pydantic import ValidationError + +from semantic_kernel.connectors.ai.open_ai.services.open_ai_config_base import OpenAIConfigBase +from semantic_kernel.connectors.ai.open_ai.services.open_ai_handler import OpenAIModelTypes +from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime_base import OpenAIRealtimeBase +from semantic_kernel.connectors.ai.open_ai.settings.open_ai_settings import OpenAISettings +from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError + + +class OpenAIRealtime(OpenAIRealtimeBase, OpenAIConfigBase): + """OpenAI Realtime service.""" + + def __init__( + self, + ai_model_id: str | None = None, + api_key: str | None = None, + org_id: str | None = None, + service_id: str | None = None, + default_headers: Mapping[str, str] | None = None, + async_client: AsyncOpenAI | None = None, + env_file_path: str | None = None, + env_file_encoding: str | None = None, + ) -> None: + """Initialize an OpenAIRealtime service. + + Args: + ai_model_id (str | None): OpenAI model name, see + https://platform.openai.com/docs/models + service_id (str | None): Service ID tied to the execution settings. + api_key (str | None): The optional API key to use. If provided will override, + the env vars or .env file value.
+ org_id (str | None): The optional org ID to use. If provided will override, + the env vars or .env file value. + default_headers: The default headers mapping of string keys to + string values for HTTP requests. (Optional) + async_client (Optional[AsyncOpenAI]): An existing client to use. (Optional) + env_file_path (str | None): Use the environment settings file as a fallback to + environment variables. (Optional) + env_file_encoding (str | None): The encoding of the environment settings file. (Optional) + """ + try: + openai_settings = OpenAISettings.create( + api_key=api_key, + org_id=org_id, + text_model_id=ai_model_id, + env_file_path=env_file_path, + env_file_encoding=env_file_encoding, + ) + except ValidationError as ex: + raise ServiceInitializationError("Failed to create OpenAI settings.", ex) from ex + if not openai_settings.text_model_id: + raise ServiceInitializationError("The OpenAI text model ID is required.") + super().__init__( + ai_model_id=openai_settings.text_model_id, + service_id=service_id, + api_key=openai_settings.api_key.get_secret_value() if openai_settings.api_key else None, + org_id=openai_settings.org_id, + ai_model_type=OpenAIModelTypes.TEXT, + default_headers=default_headers, + client=async_client, + ) diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py new file mode 100644 index 000000000000..c73f12d7f343 --- /dev/null +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py @@ -0,0 +1,294 @@ +# Copyright (c) Microsoft. All rights reserved. 
+ +import asyncio +import base64 +import logging +import sys +from collections.abc import AsyncGenerator, Callable +from typing import TYPE_CHECKING, Any, ClassVar + +if sys.version_info >= (3, 12): + from typing import override # pragma: no cover +else: + from typing_extensions import override # pragma: no cover + +from openai.resources.beta.realtime.realtime import AsyncRealtimeConnection +from openai.types.beta.realtime.conversation_item_create_event_param import ConversationItemParam +from openai.types.beta.realtime.realtime_server_event import RealtimeServerEvent +from openai.types.beta.realtime.session import Session +from pydantic import Field + +from semantic_kernel.connectors.ai.chat_completion_client_base import ChatCompletionClientBase +from semantic_kernel.connectors.ai.function_call_choice_configuration import FunctionCallChoiceConfiguration +from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceType +from semantic_kernel.connectors.ai.open_ai.services.open_ai_handler import OpenAIHandler +from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime_utils import ( + update_settings_from_function_call_configuration, +) +from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings +from semantic_kernel.contents.audio_content import AudioContent +from semantic_kernel.contents.chat_message_content import ChatMessageContent +from semantic_kernel.contents.function_call_content import FunctionCallContent +from semantic_kernel.contents.function_result_content import FunctionResultContent +from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent +from semantic_kernel.contents.streaming_text_content import StreamingTextContent +from semantic_kernel.contents.text_content import TextContent +from semantic_kernel.contents.utils.author_role import AuthorRole +from semantic_kernel.kernel import Kernel + +if TYPE_CHECKING: + from 
semantic_kernel.contents.chat_history import ChatHistory + +logger: logging.Logger = logging.getLogger(__name__) + + +class OpenAIRealtimeBase(OpenAIHandler, ChatCompletionClientBase): + """OpenAI Realtime service.""" + + SUPPORTS_FUNCTION_CALLING: ClassVar[bool] = True + connection: AsyncRealtimeConnection | None = None + connected: asyncio.Event = Field(default_factory=asyncio.Event) + session: Session | None = None + + def get_prompt_execution_settings_class(self) -> type["PromptExecutionSettings"]: + """Get the request settings class.""" + from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( # noqa + OpenAIRealtimeExecutionSettings, + ) + + return OpenAIRealtimeExecutionSettings + + async def _get_connection(self) -> AsyncRealtimeConnection: + await self.connected.wait() + if not self.connection: + raise ValueError("Connection not established") + return self.connection + + @override + async def _inner_get_streaming_chat_message_contents( + self, + chat_history: "ChatHistory", + settings: "PromptExecutionSettings", + function_invoke_attempt: int = 0, + **kwargs: Any, + ) -> AsyncGenerator[list[StreamingChatMessageContent], Any]: + if not isinstance(settings, self.get_prompt_execution_settings_class()): + settings = self.get_prompt_execution_settings_from_settings(settings) + + events: list[RealtimeServerEvent] = [] + detailed_events: dict[str, list[RealtimeServerEvent]] = {} + function_calls: list[StreamingChatMessageContent] = [] + + async with self.client.beta.realtime.connect(model=self.ai_model_id) as conn: + self.connection = conn + self.connected.set() + + await conn.session.update(session=settings.prepare_settings_dict()) + if len(chat_history) > 0: + await asyncio.gather(*(self._add_content_to_conversation(msg) for msg in chat_history.messages)) + + async for event in conn: + events.append(event) + detailed_events.setdefault(event.type, []).append(event) + match event.type: + case 
"session.created" | "session.updated": + self.session = event.session + continue + case "error": + logger.error("Error received: %s", event.error) + continue + case "response.audio.delta": + yield [ + StreamingChatMessageContent( + role=AuthorRole.ASSISTANT, + items=[AudioContent(data=base64.b64decode(event.delta), data_format="base64")], + choice_index=event.content_index, + inner_content=event, + ) + ] + continue + case "response.audio_transcript.delta": + yield [ + StreamingChatMessageContent( + role=AuthorRole.ASSISTANT, + items=[StreamingTextContent(text=event.delta, choice_index=event.content_index)], + choice_index=event.content_index, + inner_content=event, + ) + ] + continue + case "response.audio_transcript.done": + chat_history.add_message( + StreamingChatMessageContent( + role=AuthorRole.ASSISTANT, + items=[StreamingTextContent(text=event.transcript, choice_index=event.content_index)], + choice_index=event.content_index, + inner_content=event, + ) + ) + case "response.function_call_arguments.delta": + msg = StreamingChatMessageContent( + role=AuthorRole.ASSISTANT, + items=[ + FunctionCallContent( + id=event.item_id, + name=event.call_id, + arguments=event.delta, + index=event.output_index, + metadata={"call_id": event.call_id}, + ) + ], + choice_index=0, + inner_content=event, + ) + function_calls.append(msg) + yield [msg] + continue + case "response.function_call_arguments.done": + # execute function, add result to conversation + if len(function_calls) > 0: + function_call = sum(function_calls[1:], function_calls[0]) + # execute function + results = [] + for item in function_call.items: + if isinstance(item, FunctionCallContent): + kernel: Kernel | None = kwargs.get("kernel") + call_id = item.name + function_name = next( + output_item_event.item.name + for output_item_event in detailed_events["response.output_item.added"] + if output_item_event.item.call_id == call_id + ) + item.plugin_name, item.function_name = function_name.split("-", 1) + if kernel: 
+ await kernel.invoke_function_call(item, chat_history) + # add result to conversation + results.append(chat_history.messages[-1]) + for message in results: + await self._add_content_to_conversation(content=message) + case _: + logger.debug("Unhandled event type: %s", event.type) + logger.debug(f"Finished streaming chat message contents, {len(events)} events received.") + for event_type in detailed_events: + logger.debug(f"Event type: {event_type}, count: {len(detailed_events[event_type])}") + + async def send_content( + self, + content: ChatMessageContent | AudioContent | AsyncGenerator[AudioContent, Any], + **kwargs: Any, + ) -> None: + """Send a chat message content to the service. + + This content should contain audio content, either as a ChatMessageContent with an + AudioContent item, as AudioContent directly, or as a generator of AudioContent. + + """ + if isinstance(content, AudioContent | ChatMessageContent): + if isinstance(content, ChatMessageContent): + content = next(item for item in content.items if isinstance(item, AudioContent)) + connection = await self._get_connection() + await connection.input_audio_buffer.append(audio=content.data.decode("utf-8")) + await asyncio.sleep(0) + return + + async for audio_content in content: + if isinstance(audio_content, ChatMessageContent): + audio_content = next(item for item in audio_content.items if isinstance(item, AudioContent)) + connection = await self._get_connection() + await connection.input_audio_buffer.append(audio=audio_content.data.decode("utf-8")) + await asyncio.sleep(0) + + async def commit_content(self, settings: "PromptExecutionSettings") -> None: + """Commit the chat message content to the service. + + This is only needed when turn detection is not handled by the service. + + This behavior is determined by the turn_detection parameter in the settings. + If turn_detection is None, then it will commit the audio buffer and + ask the service to process the audio and create the response.
+ """ + if not isinstance(settings, self.get_prompt_execution_settings_class()): + settings = self.get_prompt_execution_settings_from_settings(settings) + if not settings.turn_detection: + connection = await self._get_connection() + await connection.input_audio_buffer.commit() + await connection.response.create() + + @override + def _update_function_choice_settings_callback( + self, + ) -> Callable[[FunctionCallChoiceConfiguration, "PromptExecutionSettings", FunctionChoiceType], None]: + return update_settings_from_function_call_configuration + + async def _streaming_function_call_result_callback( + self, function_result_messages: list[StreamingChatMessageContent] + ) -> None: + """Callback to handle the streaming function call result messages. + + Override this method to handle the streaming function call result messages. + + Args: + function_result_messages (list): The streaming function call result messages. + """ + for msg in function_result_messages: + await self._add_content_to_conversation(msg) + + async def _add_content_to_conversation(self, content: ChatMessageContent) -> None: + """Add an item to the conversation.""" + connection = await self._get_connection() + for item in content.items: + match item: + case AudioContent(): + await connection.conversation.item.create( + item=ConversationItemParam( + type="message", + content=[ + { + "type": "input_audio", + "audio": item.data.decode("utf-8"), + } + ], + role="user", + ) + ) + case TextContent(): + await connection.conversation.item.create( + item=ConversationItemParam( + type="message", + content=[ + { + "type": "input_text", + "text": item.text, + } + ], + role="user", + ) + ) + case FunctionCallContent(): + call_id = item.metadata.get("call_id") + if not call_id: + logger.error("Function call needs to have a call_id") + continue + await connection.conversation.item.create( + item=ConversationItemParam( + type="function_call", + name=item.name, + arguments=item.arguments, + call_id=call_id, + ) + ) + 
case FunctionResultContent(): + call_id = item.metadata.get("call_id") + if not call_id: + logger.error("Function result needs to have a call_id") + continue + await connection.conversation.item.create( + item=ConversationItemParam( + type="function_call_output", + output=item.result, + call_id=call_id, + ) + ) + case _: + logger.debug("Unhandled item type: %s", item.__class__.__name__) + continue diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_utils.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_utils.py new file mode 100644 index 000000000000..ada8d42924c0 --- /dev/null +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_utils.py @@ -0,0 +1,47 @@ +# Copyright (c) Microsoft. All rights reserved. + +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from semantic_kernel.connectors.ai.function_choice_behavior import ( + FunctionCallChoiceConfiguration, + FunctionChoiceType, + ) + from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings + from semantic_kernel.functions.kernel_function_metadata import KernelFunctionMetadata + + +def update_settings_from_function_call_configuration( + function_choice_configuration: "FunctionCallChoiceConfiguration", + settings: "PromptExecutionSettings", + type: "FunctionChoiceType", +) -> None: + """Update the settings from a FunctionChoiceConfiguration.""" + if ( + function_choice_configuration.available_functions + and hasattr(settings, "tool_choice") + and hasattr(settings, "tools") + ): + settings.tool_choice = type + settings.tools = [ + kernel_function_metadata_to_function_call_format(f) + for f in function_choice_configuration.available_functions + ] + + +def kernel_function_metadata_to_function_call_format( + metadata: "KernelFunctionMetadata", +) -> dict[str, Any]: + """Convert the kernel function metadata to function calling format.""" + return { + "type": "function", + "name": 
metadata.fully_qualified_name, + "description": metadata.description or "", + "parameters": { + "type": "object", + "properties": { + param.name: param.schema_data for param in metadata.parameters if param.include_in_function_choices + }, + "required": [p.name for p in metadata.parameters if p.is_required and p.include_in_function_choices], + }, + } diff --git a/python/semantic_kernel/connectors/ai/realtime_client_base.py b/python/semantic_kernel/connectors/ai/realtime_client_base.py new file mode 100644 index 000000000000..734e7e7caed4 --- /dev/null +++ b/python/semantic_kernel/connectors/ai/realtime_client_base.py @@ -0,0 +1,51 @@ +# Copyright (c) Microsoft. All rights reserved. + + +from abc import ABC, abstractmethod +from collections.abc import AsyncGenerator +from typing import Any + +from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings +from semantic_kernel.contents.audio_content import AudioContent +from semantic_kernel.contents.text_content import TextContent +from semantic_kernel.services.ai_service_client_base import AIServiceClientBase + + +class RealtimeClientBase(AIServiceClientBase, ABC): + """Base class for realtime clients that send audio and receive text or audio.""" + + @abstractmethod + async def receive( + self, + settings: PromptExecutionSettings | None = None, + **kwargs: Any, + ) -> AsyncGenerator[TextContent | AudioContent, Any]: + """Receive text or audio contents from the service. + + Args: + settings: Prompt execution settings. + kwargs: Additional arguments. + + Returns: + AsyncGenerator[TextContent | AudioContent, Any]: response contents. + """ + raise NotImplementedError + + @abstractmethod + async def send( + self, + audio_content: AudioContent, + settings: PromptExecutionSettings | None = None, + **kwargs: Any, + ) -> None: + """Send audio content to the service. + + Args: + audio_content: Audio content. + settings: Prompt execution settings. + kwargs: Additional arguments. + + Returns: + None.
+ """ + raise NotImplementedError diff --git a/python/semantic_kernel/contents/chat_message_content.py b/python/semantic_kernel/contents/chat_message_content.py index 4a35e03457a7..861a168d142e 100644 --- a/python/semantic_kernel/contents/chat_message_content.py +++ b/python/semantic_kernel/contents/chat_message_content.py @@ -10,6 +10,7 @@ from pydantic import Field from semantic_kernel.contents.annotation_content import AnnotationContent +from semantic_kernel.contents.audio_content import AudioContent from semantic_kernel.contents.binary_content import BinaryContent from semantic_kernel.contents.const import ( ANNOTATION_CONTENT_TAG, @@ -58,6 +59,7 @@ | FileReferenceContent | StreamingAnnotationContent | StreamingFileReferenceContent + | AudioContent ) logger = logging.getLogger(__name__) diff --git a/python/semantic_kernel/contents/function_call_content.py b/python/semantic_kernel/contents/function_call_content.py index 7067311f4c8a..a8b2509336e1 100644 --- a/python/semantic_kernel/contents/function_call_content.py +++ b/python/semantic_kernel/contents/function_call_content.py @@ -124,6 +124,7 @@ def __add__(self, other: "FunctionCallContent | None") -> "FunctionCallContent": index=self.index or other.index, name=self.name or other.name, arguments=self.combine_arguments(self.arguments, other.arguments), + metadata=self.metadata | other.metadata, ) def combine_arguments( diff --git a/python/semantic_kernel/contents/streaming_chat_message_content.py b/python/semantic_kernel/contents/streaming_chat_message_content.py index 32b8bd55a3f6..2f7e27d32aaa 100644 --- a/python/semantic_kernel/contents/streaming_chat_message_content.py +++ b/python/semantic_kernel/contents/streaming_chat_message_content.py @@ -6,6 +6,7 @@ from pydantic import Field +from semantic_kernel.contents.audio_content import AudioContent from semantic_kernel.contents.binary_content import BinaryContent from semantic_kernel.contents.chat_message_content import ChatMessageContent from 
semantic_kernel.contents.function_call_content import FunctionCallContent @@ -22,6 +23,7 @@ ITEM_TYPES = Union[ BinaryContent, + AudioContent, ImageContent, StreamingTextContent, FunctionCallContent, diff --git a/python/tests/unit/contents/test_audio_content.py b/python/tests/unit/contents/test_audio_content.py new file mode 100644 index 000000000000..2af5a99b9e29 --- /dev/null +++ b/python/tests/unit/contents/test_audio_content.py @@ -0,0 +1,60 @@ +# Copyright (c) Microsoft. All rights reserved. + +import os + +import pytest + +from semantic_kernel.contents.audio_content import AudioContent + +test_cases = [ + pytest.param(AudioContent(uri="http://test_uri"), id="uri"), + pytest.param(AudioContent(data=b"test_data", mime_type="image/jpeg", data_format="base64"), id="data"), + pytest.param(AudioContent(uri="http://test_uri", data=b"test_data", mime_type="image/jpeg"), id="both"), + pytest.param( + AudioContent.from_image_path( + image_path=os.path.join(os.path.dirname(__file__), "../../", "assets/sample_image.jpg") + ), + id="image_file", + ), +] + + +def test_create_uri(): + image = AudioContent(uri="http://test_uri") + assert str(image.uri) == "http://test_uri/" + + +def test_create_file_from_path(): + image_path = os.path.join(os.path.dirname(__file__), "../../", "assets/sample_image.jpg") + image = AudioContent.from_image_path(image_path=image_path) + assert image.mime_type == "image/jpeg" + assert image.data_uri.startswith("data:image/jpeg;") + assert image.data is not None + + +def test_create_data(): + image = AudioContent(data=b"test_data", mime_type="image/jpeg") + assert image.mime_type == "image/jpeg" + assert image.data == b"test_data" + + +def test_to_str_uri(): + image = AudioContent(uri="http://test_uri") + assert str(image) == "http://test_uri/" + + +def test_to_str_data(): + image = AudioContent(data=b"test_data", mime_type="image/jpeg", data_format="base64") + assert str(image) == "data:image/jpeg;base64,dGVzdF9kYXRh" + + 
+@pytest.mark.parametrize("image", test_cases) +def test_element_roundtrip(image): + element = image.to_element() + new_image = AudioContent.from_element(element) + assert new_image == image + + +@pytest.mark.parametrize("image", test_cases) +def test_to_dict(image): + assert image.to_dict() == {"type": "image_url", "image_url": {"url": str(image)}} diff --git a/python/uv.lock b/python/uv.lock index 4dafbf89c550..5b7d0b30d6d4 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -920,27 +920,27 @@ wheels = [ [[package]] name = "debugpy" -version = "1.8.12" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/68/25/c74e337134edf55c4dfc9af579eccb45af2393c40960e2795a94351e8140/debugpy-1.8.12.tar.gz", hash = "sha256:646530b04f45c830ceae8e491ca1c9320a2d2f0efea3141487c82130aba70dce", size = 1641122 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/56/19/dd58334c0a1ec07babf80bf29fb8daf1a7ca4c1a3bbe61548e40616ac087/debugpy-1.8.12-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:a2ba7ffe58efeae5b8fad1165357edfe01464f9aef25e814e891ec690e7dd82a", size = 2076091 }, - { url = "https://files.pythonhosted.org/packages/4c/37/bde1737da15f9617d11ab7b8d5267165f1b7dae116b2585a6643e89e1fa2/debugpy-1.8.12-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cbbd4149c4fc5e7d508ece083e78c17442ee13b0e69bfa6bd63003e486770f45", size = 3560717 }, - { url = "https://files.pythonhosted.org/packages/d9/ca/bc67f5a36a7de072908bc9e1156c0f0b272a9a2224cf21540ab1ffd71a1f/debugpy-1.8.12-cp310-cp310-win32.whl", hash = "sha256:b202f591204023b3ce62ff9a47baa555dc00bb092219abf5caf0e3718ac20e7c", size = 5180672 }, - { url = "https://files.pythonhosted.org/packages/c1/b9/e899c0a80dfa674dbc992f36f2b1453cd1ee879143cdb455bc04fce999da/debugpy-1.8.12-cp310-cp310-win_amd64.whl", hash = "sha256:9649eced17a98ce816756ce50433b2dd85dfa7bc92ceb60579d68c053f98dff9", size = 5212702 }, - { 
url = "https://files.pythonhosted.org/packages/af/9f/5b8af282253615296264d4ef62d14a8686f0dcdebb31a669374e22fff0a4/debugpy-1.8.12-cp311-cp311-macosx_14_0_universal2.whl", hash = "sha256:36f4829839ef0afdfdd208bb54f4c3d0eea86106d719811681a8627ae2e53dd5", size = 2174643 }, - { url = "https://files.pythonhosted.org/packages/ef/31/f9274dcd3b0f9f7d1e60373c3fa4696a585c55acb30729d313bb9d3bcbd1/debugpy-1.8.12-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a28ed481d530e3138553be60991d2d61103ce6da254e51547b79549675f539b7", size = 3133457 }, - { url = "https://files.pythonhosted.org/packages/ab/ca/6ee59e9892e424477e0c76e3798046f1fd1288040b927319c7a7b0baa484/debugpy-1.8.12-cp311-cp311-win32.whl", hash = "sha256:4ad9a94d8f5c9b954e0e3b137cc64ef3f579d0df3c3698fe9c3734ee397e4abb", size = 5106220 }, - { url = "https://files.pythonhosted.org/packages/d5/1a/8ab508ab05ede8a4eae3b139bbc06ea3ca6234f9e8c02713a044f253be5e/debugpy-1.8.12-cp311-cp311-win_amd64.whl", hash = "sha256:4703575b78dd697b294f8c65588dc86874ed787b7348c65da70cfc885efdf1e1", size = 5130481 }, - { url = "https://files.pythonhosted.org/packages/ba/e6/0f876ecfe5831ebe4762b19214364753c8bc2b357d28c5d739a1e88325c7/debugpy-1.8.12-cp312-cp312-macosx_14_0_universal2.whl", hash = "sha256:7e94b643b19e8feb5215fa508aee531387494bf668b2eca27fa769ea11d9f498", size = 2500846 }, - { url = "https://files.pythonhosted.org/packages/19/64/33f41653a701f3cd2cbff8b41ebaad59885b3428b5afd0d93d16012ecf17/debugpy-1.8.12-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:086b32e233e89a2740c1615c2f775c34ae951508b28b308681dbbb87bba97d06", size = 4222181 }, - { url = "https://files.pythonhosted.org/packages/32/a6/02646cfe50bfacc9b71321c47dc19a46e35f4e0aceea227b6d205e900e34/debugpy-1.8.12-cp312-cp312-win32.whl", hash = "sha256:2ae5df899732a6051b49ea2632a9ea67f929604fd2b036613a9f12bc3163b92d", size = 5227017 }, - { url = 
"https://files.pythonhosted.org/packages/da/a6/10056431b5c47103474312cf4a2ec1001f73e0b63b1216706d5fef2531eb/debugpy-1.8.12-cp312-cp312-win_amd64.whl", hash = "sha256:39dfbb6fa09f12fae32639e3286112fc35ae976114f1f3d37375f3130a820969", size = 5267555 }, - { url = "https://files.pythonhosted.org/packages/cf/4d/7c3896619a8791effd5d8c31f0834471fc8f8fb3047ec4f5fc69dd1393dd/debugpy-1.8.12-cp313-cp313-macosx_14_0_universal2.whl", hash = "sha256:696d8ae4dff4cbd06bf6b10d671e088b66669f110c7c4e18a44c43cf75ce966f", size = 2485246 }, - { url = "https://files.pythonhosted.org/packages/99/46/bc6dcfd7eb8cc969a5716d858e32485eb40c72c6a8dc88d1e3a4d5e95813/debugpy-1.8.12-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:898fba72b81a654e74412a67c7e0a81e89723cfe2a3ea6fcd3feaa3395138ca9", size = 4218616 }, - { url = "https://files.pythonhosted.org/packages/03/dd/d7fcdf0381a9b8094da1f6a1c9f19fed493a4f8576a2682349b3a8b20ec7/debugpy-1.8.12-cp313-cp313-win32.whl", hash = "sha256:22a11c493c70413a01ed03f01c3c3a2fc4478fc6ee186e340487b2edcd6f4180", size = 5226540 }, - { url = "https://files.pythonhosted.org/packages/25/bd/ecb98f5b5fc7ea0bfbb3c355bc1dd57c198a28780beadd1e19915bf7b4d9/debugpy-1.8.12-cp313-cp313-win_amd64.whl", hash = "sha256:fdb3c6d342825ea10b90e43d7f20f01535a72b3a1997850c0c3cefa5c27a4a2c", size = 5267134 }, - { url = "https://files.pythonhosted.org/packages/38/c4/5120ad36405c3008f451f94b8f92ef1805b1e516f6ff870f331ccb3c4cc0/debugpy-1.8.12-py2.py3-none-any.whl", hash = "sha256:274b6a2040349b5c9864e475284bce5bb062e63dce368a394b8cc865ae3b00c6", size = 5229490 }, +version = "1.8.11" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/bc/e7/666f4c9b0e24796af50aadc28d36d21c2e01e831a934535f956e09b3650c/debugpy-1.8.11.tar.gz", hash = "sha256:6ad2688b69235c43b020e04fecccdf6a96c8943ca9c2fb340b8adc103c655e57", size = 1640124 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/26/e6/4cf7422eaa591b4c7d6a9fde224095dac25283fdd99d90164f28714242b0/debugpy-1.8.11-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:2b26fefc4e31ff85593d68b9022e35e8925714a10ab4858fb1b577a8a48cb8cd", size = 2075100 }, + { url = "https://files.pythonhosted.org/packages/83/3a/e163de1df5995d95760a4d748b02fbefb1c1bf19e915b664017c40435dbf/debugpy-1.8.11-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:61bc8b3b265e6949855300e84dc93d02d7a3a637f2aec6d382afd4ceb9120c9f", size = 3559724 }, + { url = "https://files.pythonhosted.org/packages/27/6c/327e19fd1bf428a1efe1a6f97b306689c54c2cebcf871b66674ead718756/debugpy-1.8.11-cp310-cp310-win32.whl", hash = "sha256:c928bbf47f65288574b78518449edaa46c82572d340e2750889bbf8cd92f3737", size = 5178068 }, + { url = "https://files.pythonhosted.org/packages/49/80/359ff8aa388f0bd4a48f0fa9ce3606396d576657ac149c6fba3cc7de8adb/debugpy-1.8.11-cp310-cp310-win_amd64.whl", hash = "sha256:8da1db4ca4f22583e834dcabdc7832e56fe16275253ee53ba66627b86e304da1", size = 5210109 }, + { url = "https://files.pythonhosted.org/packages/7c/58/8e3f7ec86c1b7985a232667b5df8f3b1b1c8401028d8f4d75e025c9556cd/debugpy-1.8.11-cp311-cp311-macosx_14_0_universal2.whl", hash = "sha256:85de8474ad53ad546ff1c7c7c89230db215b9b8a02754d41cb5a76f70d0be296", size = 2173656 }, + { url = "https://files.pythonhosted.org/packages/d2/03/95738a68ade2358e5a4d63a2fd8e7ed9ad911001cfabbbb33a7f81343945/debugpy-1.8.11-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ffc382e4afa4aee367bf413f55ed17bd91b191dcaf979890af239dda435f2a1", size = 3132464 }, + { url = "https://files.pythonhosted.org/packages/ca/f4/18204891ab67300950615a6ad09b9de236203a9138f52b3b596fa17628ca/debugpy-1.8.11-cp311-cp311-win32.whl", hash = "sha256:40499a9979c55f72f4eb2fc38695419546b62594f8af194b879d2a18439c97a9", size = 5103637 }, + { url = 
"https://files.pythonhosted.org/packages/3b/90/3775e301cfa573b51eb8a108285681f43f5441dc4c3916feed9f386ef861/debugpy-1.8.11-cp311-cp311-win_amd64.whl", hash = "sha256:987bce16e86efa86f747d5151c54e91b3c1e36acc03ce1ddb50f9d09d16ded0e", size = 5127862 }, + { url = "https://files.pythonhosted.org/packages/c6/ae/2cf26f3111e9d94384d9c01e9d6170188b0aeda15b60a4ac6457f7c8a26f/debugpy-1.8.11-cp312-cp312-macosx_14_0_universal2.whl", hash = "sha256:84e511a7545d11683d32cdb8f809ef63fc17ea2a00455cc62d0a4dbb4ed1c308", size = 2498756 }, + { url = "https://files.pythonhosted.org/packages/b0/16/ec551789d547541a46831a19aa15c147741133da188e7e6acf77510545a7/debugpy-1.8.11-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce291a5aca4985d82875d6779f61375e959208cdf09fcec40001e65fb0a54768", size = 4219136 }, + { url = "https://files.pythonhosted.org/packages/72/6f/b2b3ce673c55f882d27a6eb04a5f0c68bcad6b742ac08a86d8392ae58030/debugpy-1.8.11-cp312-cp312-win32.whl", hash = "sha256:28e45b3f827d3bf2592f3cf7ae63282e859f3259db44ed2b129093ca0ac7940b", size = 5224440 }, + { url = "https://files.pythonhosted.org/packages/77/09/b1f05be802c1caef5b3efc042fc6a7cadd13d8118b072afd04a9b9e91e06/debugpy-1.8.11-cp312-cp312-win_amd64.whl", hash = "sha256:44b1b8e6253bceada11f714acf4309ffb98bfa9ac55e4fce14f9e5d4484287a1", size = 5264578 }, + { url = "https://files.pythonhosted.org/packages/2e/66/931dc2479aa8fbf362dc6dcee707d895a84b0b2d7b64020135f20b8db1ed/debugpy-1.8.11-cp313-cp313-macosx_14_0_universal2.whl", hash = "sha256:8988f7163e4381b0da7696f37eec7aca19deb02e500245df68a7159739bbd0d3", size = 2483651 }, + { url = "https://files.pythonhosted.org/packages/10/07/6c171d0fe6b8d237e35598b742f20ba062511b3a4631938cc78eefbbf847/debugpy-1.8.11-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c1f6a173d1140e557347419767d2b14ac1c9cd847e0b4c5444c7f3144697e4e", size = 4213770 }, + { url = 
"https://files.pythonhosted.org/packages/89/f1/0711da6ac250d4fe3bf7b3e9b14b4a86e82a98b7825075c07e19bab8da3d/debugpy-1.8.11-cp313-cp313-win32.whl", hash = "sha256:bb3b15e25891f38da3ca0740271e63ab9db61f41d4d8541745cfc1824252cb28", size = 5223911 }, + { url = "https://files.pythonhosted.org/packages/56/98/5e27fa39050749ed460025bcd0034a0a5e78a580a14079b164cc3abdeb98/debugpy-1.8.11-cp313-cp313-win_amd64.whl", hash = "sha256:d8768edcbeb34da9e11bcb8b5c2e0958d25218df7a6e56adf415ef262cd7b6d1", size = 5264166 }, + { url = "https://files.pythonhosted.org/packages/77/0a/d29a5aacf47b4383ed569b8478c02d59ee3a01ad91224d2cff8562410e43/debugpy-1.8.11-py2.py3-none-any.whl", hash = "sha256:0e22f846f4211383e6a416d04b4c13ed174d24cc5d43f5fd52e7821d0ebc8920", size = 5226874 }, ] [[package]] @@ -3624,16 +3624,16 @@ wheels = [ [[package]] name = "protobuf" -version = "5.29.3" +version = "5.29.2" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f7/d1/e0a911544ca9993e0f17ce6d3cc0932752356c1b0a834397f28e63479344/protobuf-5.29.3.tar.gz", hash = "sha256:5da0f41edaf117bde316404bad1a486cb4ededf8e4a54891296f648e8e076620", size = 424945 } +sdist = { url = "https://files.pythonhosted.org/packages/a5/73/4e6295c1420a9d20c9c351db3a36109b4c9aa601916cb7c6871e3196a1ca/protobuf-5.29.2.tar.gz", hash = "sha256:b2cc8e8bb7c9326996f0e160137b0861f1a82162502658df2951209d0cb0309e", size = 424901 } wheels = [ - { url = "https://files.pythonhosted.org/packages/dc/7a/1e38f3cafa022f477ca0f57a1f49962f21ad25850c3ca0acd3b9d0091518/protobuf-5.29.3-cp310-abi3-win32.whl", hash = "sha256:3ea51771449e1035f26069c4c7fd51fba990d07bc55ba80701c78f886bf9c888", size = 422708 }, - { url = "https://files.pythonhosted.org/packages/61/fa/aae8e10512b83de633f2646506a6d835b151edf4b30d18d73afd01447253/protobuf-5.29.3-cp310-abi3-win_amd64.whl", hash = "sha256:a4fa6f80816a9a0678429e84973f2f98cbc218cca434abe8db2ad0bffc98503a", size = 434508 }, - { url = 
"https://files.pythonhosted.org/packages/dd/04/3eaedc2ba17a088961d0e3bd396eac764450f431621b58a04ce898acd126/protobuf-5.29.3-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:a8434404bbf139aa9e1300dbf989667a83d42ddda9153d8ab76e0d5dcaca484e", size = 417825 }, - { url = "https://files.pythonhosted.org/packages/4f/06/7c467744d23c3979ce250397e26d8ad8eeb2bea7b18ca12ad58313c1b8d5/protobuf-5.29.3-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:daaf63f70f25e8689c072cfad4334ca0ac1d1e05a92fc15c54eb9cf23c3efd84", size = 319573 }, - { url = "https://files.pythonhosted.org/packages/a8/45/2ebbde52ad2be18d3675b6bee50e68cd73c9e0654de77d595540b5129df8/protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl", hash = "sha256:c027e08a08be10b67c06bf2370b99c811c466398c357e615ca88c91c07f0910f", size = 319672 }, - { url = "https://files.pythonhosted.org/packages/fd/b2/ab07b09e0f6d143dfb839693aa05765257bceaa13d03bf1a696b78323e7a/protobuf-5.29.3-py3-none-any.whl", hash = "sha256:0a18ed4a24198528f2333802eb075e59dea9d679ab7a6c5efb017a59004d849f", size = 172550 }, + { url = "https://files.pythonhosted.org/packages/f3/42/6db5387124708d619ffb990a846fb123bee546f52868039f8fa964c5bc54/protobuf-5.29.2-cp310-abi3-win32.whl", hash = "sha256:c12ba8249f5624300cf51c3d0bfe5be71a60c63e4dcf51ffe9a68771d958c851", size = 422697 }, + { url = "https://files.pythonhosted.org/packages/6c/38/2fcc968b377b531882d6ab2ac99b10ca6d00108394f6ff57c2395fb7baff/protobuf-5.29.2-cp310-abi3-win_amd64.whl", hash = "sha256:842de6d9241134a973aab719ab42b008a18a90f9f07f06ba480df268f86432f9", size = 434495 }, + { url = "https://files.pythonhosted.org/packages/cb/26/41debe0f6615fcb7e97672057524687ed86fcd85e3da3f031c30af8f0c51/protobuf-5.29.2-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:a0c53d78383c851bfa97eb42e3703aefdc96d2036a41482ffd55dc5f529466eb", size = 417812 }, + { url = 
"https://files.pythonhosted.org/packages/e4/20/38fc33b60dcfb380507b99494aebe8c34b68b8ac7d32808c4cebda3f6f6b/protobuf-5.29.2-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:494229ecd8c9009dd71eda5fd57528395d1eacdf307dbece6c12ad0dd09e912e", size = 319562 }, + { url = "https://files.pythonhosted.org/packages/90/4d/c3d61e698e0e41d926dbff6aa4e57428ab1a6fc3b5e1deaa6c9ec0fd45cf/protobuf-5.29.2-cp38-abi3-manylinux2014_x86_64.whl", hash = "sha256:b6b0d416bbbb9d4fbf9d0561dbfc4e324fd522f61f7af0fe0f282ab67b22477e", size = 319662 }, + { url = "https://files.pythonhosted.org/packages/f3/fd/c7924b4c2a1c61b8f4b64edd7a31ffacf63432135a2606f03a2f0d75a750/protobuf-5.29.2-py3-none-any.whl", hash = "sha256:fde4554c0e578a5a0bcc9a276339594848d1e89f9ea47b4427c80e5d72f90181", size = 172539 }, ] [[package]] @@ -3816,6 +3816,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/77/89/bc88a6711935ba795a679ea6ebee07e128050d6382eaa35a0a47c8032bdc/pyasn1_modules-0.4.1-py3-none-any.whl", hash = "sha256:49bfa96b45a292b711e986f222502c1c9a5e1f4e568fc30e2574a6c7d07838fd", size = 181537 }, ] +[[package]] +name = "pyaudio" +version = "0.2.14" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/26/1d/8878c7752febb0f6716a7e1a52cb92ac98871c5aa522cba181878091607c/PyAudio-0.2.14.tar.gz", hash = "sha256:78dfff3879b4994d1f4fc6485646a57755c6ee3c19647a491f790a0895bd2f87", size = 47066 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/90/90/1553487277e6aa25c0b7c2c38709cdd2b49e11c66c0b25c6e8b7b6638c72/PyAudio-0.2.14-cp310-cp310-win32.whl", hash = "sha256:126065b5e82a1c03ba16e7c0404d8f54e17368836e7d2d92427358ad44fefe61", size = 144624 }, + { url = "https://files.pythonhosted.org/packages/27/bc/719d140ee63cf4b0725016531d36743a797ffdbab85e8536922902c9349a/PyAudio-0.2.14-cp310-cp310-win_amd64.whl", hash = "sha256:2a166fc88d435a2779810dd2678354adc33499e9d4d7f937f28b20cc55893e83", size = 164069 }, + { url = 
"https://files.pythonhosted.org/packages/7b/f0/b0eab89eafa70a86b7b566a4df2f94c7880a2d483aa8de1c77d335335b5b/PyAudio-0.2.14-cp311-cp311-win32.whl", hash = "sha256:506b32a595f8693811682ab4b127602d404df7dfc453b499c91a80d0f7bad289", size = 144624 }, + { url = "https://files.pythonhosted.org/packages/82/d8/f043c854aad450a76e476b0cf9cda1956419e1dacf1062eb9df3c0055abe/PyAudio-0.2.14-cp311-cp311-win_amd64.whl", hash = "sha256:bbeb01d36a2f472ae5ee5e1451cacc42112986abe622f735bb870a5db77cf903", size = 164070 }, + { url = "https://files.pythonhosted.org/packages/8d/45/8d2b76e8f6db783f9326c1305f3f816d4a12c8eda5edc6a2e1d03c097c3b/PyAudio-0.2.14-cp312-cp312-win32.whl", hash = "sha256:5fce4bcdd2e0e8c063d835dbe2860dac46437506af509353c7f8114d4bacbd5b", size = 144750 }, + { url = "https://files.pythonhosted.org/packages/b0/6a/d25812e5f79f06285767ec607b39149d02aa3b31d50c2269768f48768930/PyAudio-0.2.14-cp312-cp312-win_amd64.whl", hash = "sha256:12f2f1ba04e06ff95d80700a78967897a489c05e093e3bffa05a84ed9c0a7fa3", size = 164126 }, + { url = "https://files.pythonhosted.org/packages/3a/77/66cd37111a87c1589b63524f3d3c848011d21ca97828422c7fde7665ff0d/PyAudio-0.2.14-cp313-cp313-win32.whl", hash = "sha256:95328285b4dab57ea8c52a4a996cb52be6d629353315be5bfda403d15932a497", size = 150982 }, + { url = "https://files.pythonhosted.org/packages/a5/8b/7f9a061c1cc2b230f9ac02a6003fcd14c85ce1828013aecbaf45aa988d20/PyAudio-0.2.14-cp313-cp313-win_amd64.whl", hash = "sha256:692d8c1446f52ed2662120bcd9ddcb5aa2b71f38bda31e58b19fb4672fffba69", size = 173655 }, +] + [[package]] name = "pybars4" version = "0.9.13" @@ -3936,13 +3952,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c1/a9/3b9642025174bbe67e900785fb99c9bfe91ea584b0b7126ff99945c24a0e/pydantic_settings-2.8.0-py3-none-any.whl", hash = "sha256:c782c7dc3fb40e97b238e713c25d26f64314aece2e91abcff592fcac15f71820", size = 30746 }, ] +[[package]] +name = "pydub" +version = "0.25.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url 
= "https://files.pythonhosted.org/packages/fe/9a/e6bca0eed82db26562c73b5076539a4a08d3cffd19c3cc5913a3e61145fd/pydub-0.25.1.tar.gz", hash = "sha256:980a33ce9949cab2a569606b65674d748ecbca4f0796887fd6f46173a7b0d30f", size = 38326 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a6/53/d78dc063216e62fc55f6b2eebb447f6a4b0a59f55c8406376f76bf959b08/pydub-0.25.1-py2.py3-none-any.whl", hash = "sha256:65617e33033874b59d87db603aa1ed450633288aefead953b30bded59cb599a6", size = 32327 }, +] + [[package]] name = "pygments" -version = "2.19.1" +version = "2.19.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/7c/2d/c3338d48ea6cc0feb8446d8e6937e1408088a72a39937982cc6111d17f84/pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f", size = 4968581 } +sdist = { url = "https://files.pythonhosted.org/packages/d3/c0/9c9832e5be227c40e1ce774d493065f83a91d6430baa7e372094e9683a45/pygments-2.19.0.tar.gz", hash = "sha256:afc4146269910d4bdfabcd27c24923137a74d562a23a320a41a55ad303e19783", size = 4967733 } wheels = [ - { url = "https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c", size = 1225293 }, + { url = "https://files.pythonhosted.org/packages/20/dc/fde3e7ac4d279a331676829af4afafd113b34272393d73f610e8f0329221/pygments-2.19.0-py3-none-any.whl", hash = "sha256:4755e6e64d22161d5b61432c0600c923c5927214e7c956e31c23923c89251a9b", size = 1225305 }, ] [[package]] @@ -4928,6 +4953,12 @@ ollama = [ onnx = [ { name = "onnxruntime-genai", marker = "(python_full_version < '3.13' and sys_platform == 'darwin') or (python_full_version < '3.13' and sys_platform == 'linux') or (python_full_version < '3.13' and sys_platform == 'win32')" }, ] +openai-realtime = [ + { name = "openai", extra = ["realtime"], marker = 
"sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "pyaudio", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "pydub", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "sounddevice", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, +] pandas = [ { name = "pandas", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] @@ -5013,6 +5044,7 @@ requires-dist = [ { name = "pybars4", specifier = "~=0.9" }, { name = "pydantic", specifier = ">=2.0,!=2.10.0,!=2.10.1,!=2.10.2,!=2.10.3,<2.11" }, { name = "pydantic-settings", specifier = "~=2.0" }, + { name = "pydub", marker = "extra == 'openai-realtime'" }, { name = "pymilvus", marker = "extra == 'milvus'", specifier = ">=2.3,<2.6" }, { name = "pymongo", marker = "extra == 'mongo'", specifier = ">=4.8.0,<4.12" }, { name = "qdrant-client", marker = "extra == 'qdrant'", specifier = "~=1.9" }, @@ -5224,6 +5256,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a5/93/84a16940c44f6ec62cf334f25aed3128a514dffc361397eee09421a1c7f2/snoop-0.6.0-py3-none-any.whl", hash = "sha256:f5ea9060e65594bf404e6841086b4a964cc27bc30569109c91a470f948b0f729", size = 27461 }, ] +[[package]] +name = "sounddevice" +version = "0.5.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cffi", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/80/2d/b04ae180312b81dbb694504bee170eada5372242e186f6298139fd3a0513/sounddevice-0.5.1.tar.gz", hash = "sha256:09ca991daeda8ce4be9ac91e15a9a81c8f81efa6b695a348c9171ea0c16cb041", size = 52896 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/06/d1/464b5fca3decdd0cfec8c47f7b4161a0b12972453201c1bf03811f367c5e/sounddevice-0.5.1-py3-none-any.whl", hash = "sha256:e2017f182888c3f3c280d9fbac92e5dbddac024a7e3442f6e6116bd79dab8a9c", size = 32276 }, + { url = "https://files.pythonhosted.org/packages/6f/f6/6703fe7cf3d7b7279040c792aeec6334e7305956aba4a80f23e62c8fdc44/sounddevice-0.5.1-py3-none-macosx_10_6_x86_64.macosx_10_6_universal2.whl", hash = "sha256:d16cb23d92322526a86a9490c427bf8d49e273d9ccc0bd096feecd229cde6031", size = 107916 }, + { url = "https://files.pythonhosted.org/packages/57/a5/78a5e71f5ec0faedc54f4053775d61407bfbd7d0c18228c7f3d4252fd276/sounddevice-0.5.1-py3-none-win32.whl", hash = "sha256:d84cc6231526e7a08e89beff229c37f762baefe5e0cc2747cbe8e3a565470055", size = 312494 }, + { url = "https://files.pythonhosted.org/packages/af/9b/15217b04f3b36d30de55fef542389d722de63f1ad81f9c72d8afc98cb6ab/sounddevice-0.5.1-py3-none-win_amd64.whl", hash = "sha256:4313b63f2076552b23ac3e0abd3bcfc0c1c6a696fc356759a13bd113c9df90f1", size = 363634 }, +] + [[package]] name = "soupsieve" version = "2.6" From 7308bcbd14fde0d3edf5dab8ad1fa1c024f76141 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Thu, 9 Jan 2025 16:47:12 +0100 Subject: [PATCH 02/50] major update --- python/pyproject.toml | 5 +- .../audio/04-chat_with_realtime_api.py | 176 +++-- python/samples/concepts/audio/audio_player.py | 2 +- .../concepts/audio/audio_player_async.py | 4 +- .../concepts/audio/audio_recorder_stream.py | 3 +- .../ai/chat_completion_client_base.py | 12 - .../connectors/ai/function_calling_utils.py | 50 ++ .../connectors/ai/open_ai/__init__.py | 8 + .../open_ai/services/open_ai_realtime_base.py | 614 +++++++++++------- .../connectors/ai/realtime_client_base.py | 131 +++- .../tests/unit/contents/test_audio_content.py | 60 -- python/uv.lock | 269 ++++---- 12 files changed, 796 insertions(+), 538 deletions(-) delete mode 100644 python/tests/unit/contents/test_audio_content.py diff --git 
a/python/pyproject.toml b/python/pyproject.toml index c33bf21ede6c..6972267008aa 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -129,10 +129,7 @@ dapr = [ "flask-dapr>=1.14.0" ] openai_realtime = [ - "openai[realtime] ~= 1.0", - "pyaudio", - "pydub", - "sounddevice" + "openai[realtime] ~= 1.0" ] [tool.uv] diff --git a/python/samples/concepts/audio/04-chat_with_realtime_api.py b/python/samples/concepts/audio/04-chat_with_realtime_api.py index 4440d13b8eec..bffbad691716 100644 --- a/python/samples/concepts/audio/04-chat_with_realtime_api.py +++ b/python/samples/concepts/audio/04-chat_with_realtime_api.py @@ -3,51 +3,60 @@ import contextlib import logging import signal +from typing import Any -from samples.concepts.audio.audio_player_async import AudioPlayerAsync +from openai.types.beta.realtime.realtime_server_event import RealtimeServerEvent -# This simple sample demonstrates how to use the OpenAI Realtime API to create -# a chat bot that can listen and respond directly through audio. -# It requires installing semantic-kernel[openai_realtime] which includes the -# OpenAI Realtime API client and some packages for handling audio locally. -# It has hardcoded device id's set in the AudioRecorderStream and AudioPlayerAsync classes, -# so you may need to adjust these for your system. 
+from samples.concepts.audio.audio_player_async import AudioPlayerAsync from samples.concepts.audio.audio_recorder_stream import AudioRecorderStream from semantic_kernel import Kernel from semantic_kernel.connectors.ai import FunctionChoiceBehavior -from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( +from semantic_kernel.connectors.ai.open_ai import ( + OpenAIRealtime, OpenAIRealtimeExecutionSettings, TurnDetection, ) -from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime import OpenAIRealtime +from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings +from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase from semantic_kernel.contents import AudioContent, ChatHistory, StreamingTextContent from semantic_kernel.functions import kernel_function logging.basicConfig(level=logging.WARNING) logger = logging.getLogger(__name__) +# This simple sample demonstrates how to use the OpenAI Realtime API to create +# a chat bot that can listen and respond directly through audio. +# It requires installing: +# - semantic-kernel[openai_realtime] +# - pyaudio +# - sounddevice +# - pydub +# e.g. pip install semantic-kernel[openai_realtime] pyaudio sounddevice pydub + +# The characteristics of your speaker and microphone are a big factor in a smooth conversation +# so you may need to try out different devices for each. +# you can also play around with the turn_detection settings to get the best results. +# It has device id's set in the AudioRecorderStream and AudioPlayerAsync classes, +# so you may need to adjust these for your system. +# you can check the available devices by uncommenting the line below the function + -def signal_handler(): - for task in asyncio.all_tasks(): - task.cancel() +def check_audio_devices(): + import sounddevice as sd # type: ignore + print(sd.query_devices()) -system_message = """ -You are a chat bot.
Your name is Mosscap and -you have one goal: figure out what people need. -Your full name, should you need to know it, is -Splendid Speckled Mosscap. You communicate -effectively, but you tend to answer with long -flowery prose. -""" -history = ChatHistory() -history.add_user_message("Hi there, who are you?") -history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.") +# check_audio_devices() class Speaker: - def __init__(self, audio_player: AudioPlayerAsync, realtime_client: OpenAIRealtime, kernel: Kernel): + """This is a simple class that opens the session with the realtime api and plays the audio response. + + At the same time it prints the transcript of the conversation to the console. + """ + + def __init__(self, audio_player: AudioPlayerAsync, realtime_client: RealtimeClientBase, kernel: Kernel): self.audio_player = audio_player self.realtime_client = realtime_client self.kernel = kernel @@ -56,42 +65,63 @@ async def play( self, chat_history: ChatHistory, settings: OpenAIRealtimeExecutionSettings, + print_transcript: bool = True, ) -> None: + # reset the frame count for the audio player self.audio_player.reset_frame_count() - print("Mosscap (transcript): ", end="") - try: - async for content in self.realtime_client.get_streaming_chat_message_content( - chat_history=chat_history, settings=settings, kernel=self.kernel - ): - if not content: - continue - for item in content.items: - match item: - case StreamingTextContent(): - print(item.text, end="") - await asyncio.sleep(0.01) - continue - case AudioContent(): - self.audio_player.add_data(item.data) - await asyncio.sleep(0.01) - continue - except asyncio.CancelledError: - print("\nThanks for talking to Mosscap!") + # open the connection to the realtime api + async with self.realtime_client as client: + # update the session with the chat_history and settings + await client.update_session(settings=settings, chat_history=chat_history) + # print the start message of the 
transcript + if print_transcript: + print("Mosscap (transcript): ", end="") + try: + # start listening for events + async for content in self.realtime_client.event_listener(settings=settings, kernel=self.kernel): + if not content: + continue + # the contents returned should be StreamingChatMessageContent + # so we will loop through the items within it. + for item in content.items: + match item: + case StreamingTextContent(): + if print_transcript: + print(item.text, end="") + await asyncio.sleep(0.01) + continue + case AudioContent(): + self.audio_player.add_data(item.data) + await asyncio.sleep(0.01) + continue + except asyncio.CancelledError: + print("\nThanks for talking to Mosscap!") class Microphone: - def __init__(self, audio_recorder: AudioRecorderStream, realtime_client: OpenAIRealtime): + """This is a simple class that opens the microphone and sends the audio to the realtime api.""" + + def __init__(self, audio_recorder: AudioRecorderStream, realtime_client: RealtimeClientBase): self.audio_recorder = audio_recorder self.realtime_client = realtime_client async def record_audio(self): with contextlib.suppress(asyncio.CancelledError): - async for audio in self.audio_recorder.stream_audio_content(): - if audio.data: - await self.realtime_client.send_content(content=audio) + async for content in self.audio_recorder.stream_audio_content(): + if content.data: + await self.realtime_client.send_event( + "input_audio_buffer.append", + content=content, + ) await asyncio.sleep(0.01) +# this function is used to stop the processes when ctrl + c is pressed +def signal_handler(): + for task in asyncio.all_tasks(): + task.cancel() + + @kernel_function def get_weather(location: str) -> str: """Get the weather for a location.""" @@ -99,23 +129,59 @@ def get_weather(location: str) -> str: return f"The weather in {location} is sunny." 
+def response_created_callback( + event: RealtimeServerEvent, settings: PromptExecutionSettings | None = None, **kwargs: Any +) -> None: + """Add an empty print to start a new line for a new response.""" + print("") + + async def main() -> None: + # setup the asyncio loop with the signal event handler loop = asyncio.get_event_loop() loop.add_signal_handler(signal.SIGINT, signal_handler) + + # create the Kernel and add a simple function for function calling. + kernel = Kernel() + kernel.add_function(plugin_name="weather", function_name="get_weather", function=get_weather) + + # create the realtime client and register the response created callback + realtime_client = OpenAIRealtime(ai_model_id="gpt-4o-realtime-preview-2024-12-17") + realtime_client.register_event_handler("response.created", response_created_callback) + + # create the speaker and microphone + speaker = Speaker(AudioPlayerAsync(device_id=7), realtime_client, kernel) + microphone = Microphone(AudioRecorderStream(device_id=2), realtime_client) + + # Create the settings for the session + # the key thing to decide on is to enable the server_vad turn detection + # if turn detection is turned off (by setting turn_detection=None), you will have to send + # the "input_audio_buffer.commit" and "response.create" events to the realtime api + # to signal the end of the user's turn and start the response. + + # The realtime api does not use a system message, but takes instructions as a parameter for a session + instructions = """ + You are a chat bot. Your name is Mosscap and + you have one goal: figure out what people need. + Your full name, should you need to know it, is + Splendid Speckled Mosscap. You communicate + effectively, but you tend to answer with long + flowery prose. + """ + # but we can add a chat history to the conversation after starting it + chat_history = ChatHistory() + chat_history.add_user_message("Hi there, who are you?") + chat_history.add_assistant_message("I am Mosscap, a chat bot.
I'm trying to figure out what people need.") + settings = OpenAIRealtimeExecutionSettings( - instructions=system_message, + instructions=instructions, voice="sage", turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8), function_choice_behavior=FunctionChoiceBehavior.Auto(), ) - realtime_client = OpenAIRealtime(ai_model_id="gpt-4o-realtime-preview-2024-12-17") - kernel = Kernel() - kernel.add_function(plugin_name="weather", function_name="get_weather", function=get_weather) - - speaker = Speaker(AudioPlayerAsync(), realtime_client, kernel) - microphone = Microphone(AudioRecorderStream(), realtime_client) + # start the speaker and the microphone with contextlib.suppress(asyncio.CancelledError): - await asyncio.gather(*[speaker.play(history, settings), microphone.record_audio()]) + await asyncio.gather(*[speaker.play(chat_history, settings), microphone.record_audio()]) if __name__ == "__main__": diff --git a/python/samples/concepts/audio/audio_player.py b/python/samples/concepts/audio/audio_player.py index 036b978dcff1..b10c15184821 100644 --- a/python/samples/concepts/audio/audio_player.py +++ b/python/samples/concepts/audio/audio_player.py @@ -20,7 +20,7 @@ class AudioPlayer(BaseModel): # Audio replay parameters CHUNK: ClassVar[int] = 1024 - audio_content: AudioContent | None = None + audio_content: AudioContent def play(self, text: str | None = None) -> None: """Play the audio content to the default audio output device.
diff --git a/python/samples/concepts/audio/audio_player_async.py b/python/samples/concepts/audio/audio_player_async.py index 9ae424b01c66..a77b8df6e32c 100644 --- a/python/samples/concepts/audio/audio_player_async.py +++ b/python/samples/concepts/audio/audio_player_async.py @@ -13,7 +13,7 @@ class AudioPlayerAsync: - def __init__(self): + def __init__(self, device_id: int | None = None): self.queue = [] self.lock = threading.Lock() self.stream = sd.OutputStream( @@ -22,7 +22,7 @@ def __init__(self): channels=CHANNELS, dtype=np.int16, blocksize=int(CHUNK_LENGTH_S * SAMPLE_RATE), - device=3, + device=device_id, ) self.playing = False self._frame_count = 0 diff --git a/python/samples/concepts/audio/audio_recorder_stream.py b/python/samples/concepts/audio/audio_recorder_stream.py index 99ac1a9f8141..55684e9c469b 100644 --- a/python/samples/concepts/audio/audio_recorder_stream.py +++ b/python/samples/concepts/audio/audio_recorder_stream.py @@ -28,6 +28,7 @@ class AudioRecorderStream(BaseModel): CHANNELS: ClassVar[int] = 1 SAMPLE_RATE: ClassVar[int] = 24000 CHUNK_LENGTH_S: ClassVar[float] = 0.05 + device_id: int | None = None async def stream_audio_content(self) -> AsyncGenerator[AudioContent, None]: import sounddevice as sd # type: ignore @@ -41,7 +42,7 @@ async def stream_audio_content(self) -> AsyncGenerator[AudioContent, None]: channels=self.CHANNELS, samplerate=self.SAMPLE_RATE, dtype="int16", - device=4, + device=self.device_id, ) stream.start() try: diff --git a/python/semantic_kernel/connectors/ai/chat_completion_client_base.py b/python/semantic_kernel/connectors/ai/chat_completion_client_base.py index a402fad10b53..42797489e26f 100644 --- a/python/semantic_kernel/connectors/ai/chat_completion_client_base.py +++ b/python/semantic_kernel/connectors/ai/chat_completion_client_base.py @@ -441,16 +441,4 @@ def _yield_function_result_messages(self, function_result_messages: list) -> boo """ return len(function_result_messages) > 0 and 
len(function_result_messages[0].items) > 0 - async def _streaming_function_call_result_callback( - self, function_result_messages: list["ChatMessageContent"] - ) -> None: - """Callback to handle the streaming function call result messages. - - Override this method to handle the streaming function call result messages. - - Args: - function_result_messages (list): The streaming function call result messages. - """ - return - # endregion diff --git a/python/semantic_kernel/connectors/ai/function_calling_utils.py b/python/semantic_kernel/connectors/ai/function_calling_utils.py index 48415aff9725..ec09b4d2850f 100644 --- a/python/semantic_kernel/connectors/ai/function_calling_utils.py +++ b/python/semantic_kernel/connectors/ai/function_calling_utils.py @@ -1,10 +1,13 @@ # Copyright (c) Microsoft. All rights reserved. from collections import OrderedDict +from collections.abc import Callable +from copy import deepcopy from typing import TYPE_CHECKING, Any from semantic_kernel.contents.utils.author_role import AuthorRole from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError +from semantic_kernel.utils.experimental_decorator import experimental_function if TYPE_CHECKING: from semantic_kernel.connectors.ai.function_choice_behavior import ( @@ -15,6 +18,7 @@ from semantic_kernel.contents.chat_message_content import ChatMessageContent from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent from semantic_kernel.functions.kernel_function_metadata import KernelFunctionMetadata + from semantic_kernel.kernel import Kernel def update_settings_from_function_call_configuration( @@ -134,3 +138,49 @@ def merge_streaming_function_results( function_invoke_attempt=function_invoke_attempt, ) ] + + +@experimental_function +def prepare_settings_for_function_calling( + settings: "PromptExecutionSettings", + settings_class: type["PromptExecutionSettings"], + update_settings_callback: Callable[..., None], + kernel: "Kernel", 
+) -> "PromptExecutionSettings": + """Prepare settings for the service. + + Args: + settings: Prompt execution settings. + settings_class: The settings class. + update_settings_callback: The callback to update the settings. + kernel: Kernel instance. + + Returns: + PromptExecutionSettings of type settings_class. + """ + settings = deepcopy(settings) + if not isinstance(settings, settings_class): + settings = settings_class.from_prompt_execution_settings(settings) + + # For backwards compatibility we need to convert the `FunctionCallBehavior` to `FunctionChoiceBehavior` + # if this method is called with a `FunctionCallBehavior` object as part of the settings + + from semantic_kernel.connectors.ai.function_call_behavior import FunctionCallBehavior + from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceBehavior + + if hasattr(settings, "function_call_behavior") and isinstance( + settings.function_call_behavior, FunctionCallBehavior + ): + settings.function_choice_behavior = FunctionChoiceBehavior.from_function_call_behavior( + settings.function_call_behavior + ) + + if settings.function_choice_behavior: + # Configure the function choice behavior into the settings object + # that will become part of the request to the AI service + settings.function_choice_behavior.configure( + kernel=kernel, + update_settings_callback=update_settings_callback, + settings=settings, + ) + return settings diff --git a/python/semantic_kernel/connectors/ai/open_ai/__init__.py b/python/semantic_kernel/connectors/ai/open_ai/__init__.py index a3103ae86446..27d36ea30d34 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/__init__.py +++ b/python/semantic_kernel/connectors/ai/open_ai/__init__.py @@ -22,6 +22,10 @@ OpenAIPromptExecutionSettings, OpenAITextPromptExecutionSettings, ) +from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( + OpenAIRealtimeExecutionSettings, + TurnDetection, +) from 
semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_text_to_audio_execution_settings import ( OpenAITextToAudioExecutionSettings, ) @@ -36,6 +40,7 @@ from semantic_kernel.connectors.ai.open_ai.services.azure_text_to_image import AzureTextToImage from semantic_kernel.connectors.ai.open_ai.services.open_ai_audio_to_text import OpenAIAudioToText from semantic_kernel.connectors.ai.open_ai.services.open_ai_chat_completion import OpenAIChatCompletion +from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime import OpenAIRealtime from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_completion import OpenAITextCompletion from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_embedding import OpenAITextEmbedding from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_to_audio import OpenAITextToAudio @@ -69,6 +74,8 @@ "OpenAIChatPromptExecutionSettings", "OpenAIEmbeddingPromptExecutionSettings", "OpenAIPromptExecutionSettings", + "OpenAIRealtime", + "OpenAIRealtimeExecutionSettings", "OpenAISettings", "OpenAITextCompletion", "OpenAITextEmbedding", @@ -77,4 +84,5 @@ "OpenAITextToAudioExecutionSettings", "OpenAITextToImage", "OpenAITextToImageExecutionSettings", + "TurnDetection", ] diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py index c73f12d7f343..4175d9449b2e 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py @@ -4,8 +4,10 @@ import base64 import logging import sys -from collections.abc import AsyncGenerator, Callable -from typing import TYPE_CHECKING, Any, ClassVar +from collections.abc import AsyncGenerator +from enum import Enum +from inspect import isawaitable +from typing import Any, ClassVar, Protocol, runtime_checkable if sys.version_info >= (3, 12): from 
typing import override # pragma: no cover @@ -15,19 +17,14 @@ from openai.resources.beta.realtime.realtime import AsyncRealtimeConnection from openai.types.beta.realtime.conversation_item_create_event_param import ConversationItemParam from openai.types.beta.realtime.realtime_server_event import RealtimeServerEvent -from openai.types.beta.realtime.session import Session from pydantic import Field -from semantic_kernel.connectors.ai.chat_completion_client_base import ChatCompletionClientBase -from semantic_kernel.connectors.ai.function_call_choice_configuration import FunctionCallChoiceConfiguration -from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceType +from semantic_kernel.connectors.ai.function_calling_utils import prepare_settings_for_function_calling from semantic_kernel.connectors.ai.open_ai.services.open_ai_handler import OpenAIHandler -from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime_utils import ( - update_settings_from_function_call_configuration, -) from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings +from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase from semantic_kernel.contents.audio_content import AudioContent -from semantic_kernel.contents.chat_message_content import ChatMessageContent +from semantic_kernel.contents.chat_history import ChatHistory from semantic_kernel.contents.function_call_content import FunctionCallContent from semantic_kernel.contents.function_result_content import FunctionResultContent from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent @@ -35,260 +32,401 @@ from semantic_kernel.contents.text_content import TextContent from semantic_kernel.contents.utils.author_role import AuthorRole from semantic_kernel.kernel import Kernel - -if TYPE_CHECKING: - from semantic_kernel.contents.chat_history import ChatHistory +from semantic_kernel.utils.experimental_decorator import 
experimental_class logger: logging.Logger = logging.getLogger(__name__) -class OpenAIRealtimeBase(OpenAIHandler, ChatCompletionClientBase): +@runtime_checkable +@experimental_class +class EventCallBackProtocolAsync(Protocol): + """Event callback protocol.""" + + async def __call__( + self, + event: RealtimeServerEvent, + settings: PromptExecutionSettings | None = None, + **kwargs: Any, + ) -> tuple[Any, bool] | None: + """Call the event callback.""" + ... + + +@runtime_checkable +@experimental_class +class EventCallBackProtocol(Protocol): + """Event callback protocol.""" + + def __call__( + self, + event: RealtimeServerEvent, + settings: PromptExecutionSettings | None = None, + **kwargs: Any, + ) -> tuple[Any, bool] | None: + """Call the event callback.""" + ... + + +@experimental_class +class SendEvents(str, Enum): + """Events that can be sent.""" + + SESSION_UPDATE = "session.update" + INPUT_AUDIO_BUFFER_APPEND = "input_audio_buffer.append" + INPUT_AUDIO_BUFFER_COMMIT = "input_audio_buffer.commit" + INPUT_AUDIO_BUFFER_CLEAR = "input_audio_buffer.clear" + CONVERSATION_ITEM_CREATE = "conversation.item.create" + CONVERSATION_ITEM_TRUNCATE = "conversation.item.truncate" + CONVERSATION_ITEM_DELETE = "conversation.item.delete" + RESPONSE_CREATE = "response.create" + RESPONSE_CANCEL = "response.cancel" + + +@experimental_class +class ListenEvents(str, Enum): + """Events that can be listened to.""" + + ERROR = "error" + SESSION_CREATED = "session.created" + SESSION_UPDATED = "session.updated" + CONVERSATION_CREATED = "conversation.created" + INPUT_AUDIO_BUFFER_COMMITTED = "input_audio_buffer.committed" + INPUT_AUDIO_BUFFER_CLEARED = "input_audio_buffer.cleared" + INPUT_AUDIO_BUFFER_SPEECH_STARTED = "input_audio_buffer.speech_started" + INPUT_AUDIO_BUFFER_SPEECH_STOPPED = "input_audio_buffer.speech_stopped" + CONVERSATION_ITEM_CREATED = "conversation.item.created" + CONVERSATION_ITEM_INPUT_AUDIO_TRANSCRIPTION_COMPLETED = 
"conversation.item.input_audio_transcription.completed" + CONVERSATION_ITEM_INPUT_AUDIO_TRANSCRIPTION_FAILED = "conversation.item.input_audio_transcription.failed" + CONVERSATION_ITEM_TRUNCATED = "conversation.item.truncated" + CONVERSATION_ITEM_DELETED = "conversation.item.deleted" + RESPONSE_CREATED = "response.created" + RESPONSE_DONE = "response.done" + RESPONSE_OUTPUT_ITEM_ADDED = "response.output_item.added" + RESPONSE_OUTPUT_ITEM_DONE = "response.output_item.done" + RESPONSE_CONTENT_PART_ADDED = "response.content_part.added" + RESPONSE_CONTENT_PART_DONE = "response.content_part.done" + RESPONSE_TEXT_DELTA = "response.text.delta" + RESPONSE_TEXT_DONE = "response.text.done" + RESPONSE_AUDIO_TRANSCRIPT_DELTA = "response.audio_transcript.delta" + RESPONSE_AUDIO_TRANSCRIPT_DONE = "response.audio_transcript.done" + RESPONSE_AUDIO_DELTA = "response.audio.delta" + RESPONSE_AUDIO_DONE = "response.audio.done" + RESPONSE_FUNCTION_CALL_ARGUMENTS_DELTA = "response.function_call_arguments.delta" + RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE = "response.function_call_arguments.done" + RATE_LIMITS_UPDATED = "rate_limits.updated" + + +@experimental_class +class OpenAIRealtimeBase(OpenAIHandler, RealtimeClientBase): """OpenAI Realtime service.""" SUPPORTS_FUNCTION_CALLING: ClassVar[bool] = True connection: AsyncRealtimeConnection | None = None connected: asyncio.Event = Field(default_factory=asyncio.Event) - session: Session | None = None + event_log: dict[str, list[RealtimeServerEvent]] = Field(default_factory=dict) + event_handlers: dict[str, list[EventCallBackProtocol | EventCallBackProtocolAsync]] = Field(default_factory=dict) - def get_prompt_execution_settings_class(self) -> type["PromptExecutionSettings"]: - """Get the request settings class.""" - from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( # noqa - OpenAIRealtimeExecutionSettings, + def model_post_init(self, *args, **kwargs) -> None: + """Post init method 
for the model.""" + # Register the default event handlers + self.register_event_handler(ListenEvents.RESPONSE_AUDIO_DELTA, self.response_audio_delta_callback) + self.register_event_handler( + ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA, self.response_audio_transcript_delta_callback ) + self.register_event_handler( + ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DONE, self.response_audio_transcript_done_callback + ) + self.register_event_handler( + ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE, self.response_function_call_arguments_delta_callback + ) + self.register_event_handler(ListenEvents.ERROR, self.error_callback) + self.register_event_handler(ListenEvents.SESSION_CREATED, self.session_callback) + self.register_event_handler(ListenEvents.SESSION_UPDATED, self.session_callback) - return OpenAIRealtimeExecutionSettings - - async def _get_connection(self) -> AsyncRealtimeConnection: - await self.connected.wait() - if not self.connection: - raise ValueError("Connection not established") - return self.connection + def register_event_handler( + self, event_type: str | ListenEvents, handler: EventCallBackProtocol | EventCallBackProtocolAsync + ) -> None: + """Register a event handler.""" + if not isinstance(event_type, ListenEvents): + event_type = ListenEvents(event_type) + self.event_handlers.setdefault(event_type, []).append(handler) @override - async def _inner_get_streaming_chat_message_contents( + async def event_listener( self, - chat_history: "ChatHistory", settings: "PromptExecutionSettings", - function_invoke_attempt: int = 0, + chat_history: "ChatHistory | None" = None, **kwargs: Any, - ) -> AsyncGenerator[list[StreamingChatMessageContent], Any]: - if not isinstance(settings, self.get_prompt_execution_settings_class()): - settings = self.get_prompt_execution_settings_from_settings(settings) - - events: list[RealtimeServerEvent] = [] - detailed_events: dict[str, list[RealtimeServerEvent]] = {} - function_calls: list[StreamingChatMessageContent] = [] - - 
async with self.client.beta.realtime.connect(model=self.ai_model_id) as conn: - self.connection = conn - self.connected.set() - - await conn.session.update(session=settings.prepare_settings_dict()) - if len(chat_history) > 0: - await asyncio.gather(*(self._add_content_to_conversation(msg) for msg in chat_history.messages)) - - async for event in conn: - events.append(event) - detailed_events.setdefault(event.type, []).append(event) - match event.type: - case "session.created" | "session.updated": - self.session = event.session - continue - case "error": - logger.error("Error received: %s", event.error) - continue - case "response.audio.delta": - yield [ - StreamingChatMessageContent( - role=AuthorRole.ASSISTANT, - items=[AudioContent(data=base64.b64decode(event.delta), data_format="base64")], - choice_index=event.content_index, - inner_content=event, - ) - ] + ) -> AsyncGenerator[StreamingChatMessageContent, Any]: + await self.connected.wait() + if not self.connection: + raise ValueError("Connection is not established.") + if not chat_history: + chat_history = ChatHistory() + async for event in self.connection: + event_type = ListenEvents(event.type) + self.event_log.setdefault(event_type, []).append(event) + for handler in self.event_handlers.get(event_type, []): + task = handler(event=event, settings=settings) + if not task: + continue + if isawaitable(task): + async_result = await task + if not async_result: continue - case "response.audio_transcript.delta": - yield [ - StreamingChatMessageContent( - role=AuthorRole.ASSISTANT, - items=[StreamingTextContent(text=event.delta, choice_index=event.content_index)], - choice_index=event.content_index, - inner_content=event, + result, should_return = async_result + else: + result, should_return = task + if should_return: + yield result + else: + chat_history.add_message(result) + + for event_type in self.event_log: + logger.debug(f"Event type: {event_type}, count: {len(self.event_log[event_type])}") + + @override + 
async def send_event(self, event: str | SendEvents, **kwargs: Any) -> None: + await self.connected.wait() + if not self.connection: + raise ValueError("Connection is not established.") + if not isinstance(event, SendEvents): + event = SendEvents(event) + match event: + case SendEvents.SESSION_UPDATE: + if "settings" not in kwargs: + logger.error("Event data does not contain 'settings'") + await self.connection.session.update(session=kwargs["settings"].prepare_settings_dict()) + case SendEvents.INPUT_AUDIO_BUFFER_APPEND: + if "content" not in kwargs: + logger.error("Event data does not contain 'content'") + return + await self.connection.input_audio_buffer.append(audio=kwargs["content"].data.decode("utf-8")) + case SendEvents.INPUT_AUDIO_BUFFER_COMMIT: + await self.connection.input_audio_buffer.commit() + case SendEvents.INPUT_AUDIO_BUFFER_CLEAR: + await self.connection.input_audio_buffer.clear() + case SendEvents.CONVERSATION_ITEM_CREATE: + if "item" not in kwargs: + logger.error("Event data does not contain 'item'") + return + content = kwargs["item"] + for item in content.items: + match item: + case TextContent(): + await self.connection.conversation.item.create( + item=ConversationItemParam( + type="message", + content=[ + { + "type": "input_text", + "text": item.text, + } + ], + role="user", + ) ) - ] - continue - case "response.audio_transcript.done": - chat_history.add_message( - StreamingChatMessageContent( - role=AuthorRole.ASSISTANT, - items=[StreamingTextContent(text=event.transcript, choice_index=event.content_index)], - choice_index=event.content_index, - inner_content=event, + case FunctionCallContent(): + call_id = item.metadata.get("call_id") + if not call_id: + logger.error("Function call needs to have a call_id") + continue + await self.connection.conversation.item.create( + item=ConversationItemParam( + type="function_call", + name=item.name, + arguments=item.arguments, + call_id=call_id, + ) ) - ) - case "response.function_call_arguments.delta": 
- msg = StreamingChatMessageContent( - role=AuthorRole.ASSISTANT, - items=[ - FunctionCallContent( - id=event.item_id, - name=event.call_id, - arguments=event.delta, - index=event.output_index, - metadata={"call_id": event.call_id}, + case FunctionResultContent(): + call_id = item.metadata.get("call_id") + if not call_id: + logger.error("Function result needs to have a call_id") + continue + await self.connection.conversation.item.create( + item=ConversationItemParam( + type="function_call_output", + output=item.result, + call_id=call_id, ) - ], - choice_index=0, - inner_content=event, - ) - function_calls.append(msg) - yield [msg] - continue - case "response.function_call_arguments.done": - # execute function, add result to conversation - if len(function_calls) > 0: - function_call = sum(function_calls[1:], function_calls[0]) - # execute function - results = [] - for item in function_call.items: - if isinstance(item, FunctionCallContent): - kernel: Kernel | None = kwargs.get("kernel") - call_id = item.name - function_name = next( - output_item_event.item.name - for output_item_event in detailed_events["response.output_item.added"] - if output_item_event.item.call_id == call_id - ) - item.plugin_name, item.function_name = function_name.split("-", 1) - if kernel: - await kernel.invoke_function_call(item, chat_history) - # add result to conversation - results.append(chat_history.messages[-1]) - for message in results: - await self._add_content_to_conversation(content=message) - case _: - logger.debug("Unhandled event type: %s", event.type) - logger.debug(f"Finished streaming chat message contents, {len(events)} events received.") - for event_type in detailed_events: - logger.debug(f"Event type: {event_type}, count: {len(detailed_events[event_type])}") - - async def send_content( + ) + case SendEvents.CONVERSATION_ITEM_TRUNCATE: + if "item_id" not in kwargs: + logger.error("Event data does not contain 'item_id'") + return + await 
self.connection.conversation.item.truncate( + item_id=kwargs["item_id"], content_index=0, audio_end_ms=kwargs.get("audio_end_ms", 0) + ) + case SendEvents.CONVERSATION_ITEM_DELETE: + if "item_id" not in kwargs: + logger.error("Event data does not contain 'item_id'") + return + await self.connection.conversation.item.delete(item_id=kwargs["item_id"]) + case SendEvents.RESPONSE_CREATE: + if "response" in kwargs: + await self.connection.response.create(response=kwargs["response"]) + else: + await self.connection.response.create() + case SendEvents.RESPONSE_CANCEL: + if "response_id" in kwargs: + await self.connection.response.cancel(response_id=kwargs["response_id"]) + else: + await self.connection.response.cancel() + + @override + async def create_session( self, - content: ChatMessageContent | AudioContent | AsyncGenerator[AudioContent, Any], + settings: PromptExecutionSettings | None = None, + chat_history: ChatHistory | None = None, **kwargs: Any, ) -> None: - """Send a chat message content to the service. - - This content should contain audio content, either as a ChatMessageContent with a - AudioContent item, as AudioContent directly, as or as a generator of AudioContent. 
- - """ - if isinstance(content, AudioContent | ChatMessageContent): - if isinstance(content, ChatMessageContent): - content = next(item for item in content.items if isinstance(item, AudioContent)) - connection = await self._get_connection() - await connection.input_audio_buffer.append(audio=content.data.decode("utf-8")) - await asyncio.sleep(0) - return - - async for audio_content in content: - if isinstance(audio_content, ChatMessageContent): - audio_content = next(item for item in audio_content.items if isinstance(item, AudioContent)) - connection = await self._get_connection() - await connection.input_audio_buffer.append(audio=audio_content.data.decode("utf-8")) - await asyncio.sleep(0) - - async def commit_content(self, settings: "PromptExecutionSettings") -> None: - """Commit the chat message content to the service. - - This is only needed when turn detection is not handled by the service. - - This behavior is determined by the turn_detection parameter in the settings. - If turn_detection is None, then it will commit the audio buffer and - ask the service to process the audio and create the response. 
- """ - if not isinstance(settings, self.get_prompt_execution_settings_class()): - settings = self.get_prompt_execution_settings_from_settings(settings) - if not settings.turn_detection: - connection = await self._get_connection() - await connection.input_audio_buffer.commit() - await connection.response.create() + """Create a session in the service.""" + self.connection = await self.client.beta.realtime.connect(model=self.ai_model_id).enter() + self.connected.set() + if settings or chat_history or kwargs: + await self.update_session(settings=settings, chat_history=chat_history, **kwargs) @override - def _update_function_choice_settings_callback( + async def update_session( + self, settings: PromptExecutionSettings | None = None, chat_history: ChatHistory | None = None, **kwargs: Any + ) -> None: + if settings: + if "kernel" in kwargs: + settings = prepare_settings_for_function_calling( + settings, + self.get_prompt_execution_settings_class(), + self._update_function_choice_settings_callback(), + kernel=kwargs.get("kernel"), # type: ignore + ) + await self.send_event(SendEvents.SESSION_UPDATE, settings=settings) + if chat_history and len(chat_history) > 0: + await asyncio.gather( + *(self.send_event(SendEvents.CONVERSATION_ITEM_CREATE, item=msg) for msg in chat_history.messages) + ) + + @override + async def close_session(self) -> None: + """Close the session in the service.""" + if self.connected.is_set(): + await self.connection.close() + self.connection = None + self.connected.clear() + + def response_audio_delta_callback( self, - ) -> Callable[[FunctionCallChoiceConfiguration, "PromptExecutionSettings", FunctionChoiceType], None]: - return update_settings_from_function_call_configuration + event: RealtimeServerEvent, + settings: PromptExecutionSettings | None = None, + **kwargs: Any, + ) -> tuple[Any, bool]: + """Handle response audio delta.""" + return StreamingChatMessageContent( + role=AuthorRole.ASSISTANT, + 
items=[AudioContent(data=base64.b64decode(event.delta), data_format="base64")], + choice_index=event.content_index, + inner_content=event, + ), True - async def _streaming_function_call_result_callback( - self, function_result_messages: list[StreamingChatMessageContent] + def response_audio_transcript_delta_callback( + self, + event: RealtimeServerEvent, + settings: PromptExecutionSettings | None = None, + **kwargs: Any, + ) -> tuple[Any, bool]: + """Handle response audio transcript delta.""" + return StreamingChatMessageContent( + role=AuthorRole.ASSISTANT, + items=[StreamingTextContent(text=event.delta, choice_index=event.content_index)], + choice_index=event.content_index, + inner_content=event, + ), True + + def response_audio_transcript_done_callback( + self, + event: RealtimeServerEvent, + settings: PromptExecutionSettings | None = None, + **kwargs: Any, + ) -> tuple[Any, bool]: + """Handle response audio transcript done.""" + return StreamingChatMessageContent( + role=AuthorRole.ASSISTANT, + items=[StreamingTextContent(text=event.transcript, choice_index=event.content_index)], + choice_index=event.content_index, + inner_content=event, + ), False + + def response_function_call_arguments_delta_callback( + self, + event: RealtimeServerEvent, + settings: PromptExecutionSettings | None = None, + **kwargs: Any, + ) -> tuple[Any, bool]: + """Handle response function call arguments delta.""" + return StreamingChatMessageContent( + role=AuthorRole.ASSISTANT, + items=[ + FunctionCallContent( + id=event.item_id, + name=event.call_id, + arguments=event.delta, + index=event.output_index, + metadata={"call_id": event.call_id}, + ) + ], + choice_index=0, + inner_content=event, + ), True + + def error_callback( + self, + event: RealtimeServerEvent, + settings: PromptExecutionSettings | None = None, + **kwargs: Any, ) -> None: - """Callback to handle the streaming function call result messages. - - Override this method to handle the streaming function call result messages. 
- - Args: - function_result_messages (list): The streaming function call result messages. - """ - for msg in function_result_messages: - await self._add_content_to_conversation(msg) - - async def _add_content_to_conversation(self, content: ChatMessageContent) -> None: - """Add an item to the conversation.""" - connection = await self._get_connection() - for item in content.items: - match item: - case AudioContent(): - await connection.conversation.item.create( - item=ConversationItemParam( - type="message", - content=[ - { - "type": "input_audio", - "audio": item.data.decode("utf-8"), - } - ], - role="user", - ) - ) - case TextContent(): - await connection.conversation.item.create( - item=ConversationItemParam( - type="message", - content=[ - { - "type": "input_text", - "text": item.text, - } - ], - role="user", - ) - ) - case FunctionCallContent(): - call_id = item.metadata.get("call_id") - if not call_id: - logger.error("Function call needs to have a call_id") - continue - await connection.conversation.item.create( - item=ConversationItemParam( - type="function_call", - name=item.name, - arguments=item.arguments, - call_id=call_id, - ) - ) - case FunctionResultContent(): - call_id = item.metadata.get("call_id") - if not call_id: - logger.error("Function result needs to have a call_id") - continue - await connection.conversation.item.create( - item=ConversationItemParam( - type="function_call_output", - output=item.result, - call_id=call_id, - ) - ) - case _: - logger.debug("Unhandled item type: %s", item.__class__.__name__) - continue + """Handle error.""" + logger.error("Error received: %s", event.error) + + def session_callback( + self, + event: RealtimeServerEvent, + settings: PromptExecutionSettings | None = None, + **kwargs: Any, + ) -> None: + """Handle session.""" + logger.debug("Session created or updated, session: %s", event.session) + + async def response_function_call_arguments_done_callback( + self, + event: RealtimeServerEvent, + settings: 
PromptExecutionSettings | None = None, + **kwargs: Any, + ) -> None: + """Handle response function call done.""" + item = FunctionCallContent( + id=event.item_id, + name=event.call_id, + arguments=event.delta, + index=event.output_index, + metadata={"call_id": event.call_id}, + ) + kernel: Kernel | None = kwargs.get("kernel") + call_id = item.name + function_name = next( + output_item_event.item.name + for output_item_event in self.event_log[ListenEvents.RESPONSE_OUTPUT_ITEM_ADDED] + if output_item_event.item.call_id == call_id + ) + item.plugin_name, item.function_name = function_name.split("-", 1) + if kernel: + chat_history = ChatHistory() + await kernel.invoke_function_call(item, chat_history) + await self.send_event(SendEvents.CONVERSATION_ITEM_CREATE, item=chat_history.messages[-1]) + return chat_history.messages[-1], False + + def get_prompt_execution_settings_class(self) -> type["PromptExecutionSettings"]: + """Get the request settings class.""" + from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( # noqa + OpenAIRealtimeExecutionSettings, + ) + + return OpenAIRealtimeExecutionSettings diff --git a/python/semantic_kernel/connectors/ai/realtime_client_base.py b/python/semantic_kernel/connectors/ai/realtime_client_base.py index 734e7e7caed4..c5d092d50870 100644 --- a/python/semantic_kernel/connectors/ai/realtime_client_base.py +++ b/python/semantic_kernel/connectors/ai/realtime_client_base.py @@ -1,51 +1,140 @@ # Copyright (c) Microsoft. All rights reserved. 
- from abc import ABC, abstractmethod -from collections.abc import AsyncGenerator -from typing import Any +from collections.abc import AsyncGenerator, Callable +from typing import TYPE_CHECKING, Any, ClassVar -from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings -from semantic_kernel.contents.audio_content import AudioContent -from semantic_kernel.contents.text_content import TextContent +from semantic_kernel.connectors.ai.function_call_choice_configuration import FunctionCallChoiceConfiguration +from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceType from semantic_kernel.services.ai_service_client_base import AIServiceClientBase +from semantic_kernel.utils.experimental_decorator import experimental_class + +if TYPE_CHECKING: + from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings + from semantic_kernel.contents.chat_history import ChatHistory + from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent + +#### +# TODO (eavanvalkenburg): Move to ADR +# Receiving: +# Option 1: Events and Contents split (current) +# - content received through main receive_content method +# - events received through event callback handlers +# Option 2: Everything is Content +# - content (events as new Content Type) received through main receive_content method +# Option 3: Everything is Event +# - receive_content method is removed +# - events received through main listen method +# - default event handlers added for things like errors and function calling +# - built-in vs custom event handling - separate or not? 
+# Sending: +# Option 1: Events and Contents split (current) +# - send_content and send_event +# Option 2: Everything is Content +# - single method needed, with EventContent type support +# Option 3: Everything is Event +# - send_event method only, Content is part of event data +#### +@experimental_class class RealtimeClientBase(AIServiceClientBase, ABC): - """Base class for audio to text client.""" + """Base class for a realtime client.""" + + SUPPORTS_FUNCTION_CALLING: ClassVar[bool] = False + + async def __aenter__(self) -> "RealtimeClientBase": + """Enter the context manager. + + Default implementation calls the create session method. + """ + await self.create_session() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb) -> None: + """Exit the context manager.""" + await self.close_session() + + @abstractmethod + async def close_session(self) -> None: + """Close the session in the service.""" + pass @abstractmethod - async def receive( + async def create_session( self, - settings: PromptExecutionSettings | None = None, + settings: "PromptExecutionSettings | None" = None, + chat_history: "ChatHistory | None" = None, **kwargs: Any, - ) -> AsyncGenerator[TextContent | AudioContent, Any]: - """Get text contents from audio. + ) -> None: + """Create a session in the service. Args: settings: Prompt execution settings. + chat_history: Chat history. kwargs: Additional arguments. - - Returns: - list[TextContent | AudioContent]: response contents. """ raise NotImplementedError @abstractmethod - async def send( + async def update_session( self, - audio_content: AudioContent, - settings: PromptExecutionSettings | None = None, + settings: "PromptExecutionSettings | None" = None, + chat_history: "ChatHistory | None" = None, **kwargs: Any, ) -> None: - """Get text content from audio. + """Update a session in the service. + + Can be used when using the context manager instead of calling create_session with these same arguments. 
Args: - audio_content: Audio content. settings: Prompt execution settings. + chat_history: Chat history. kwargs: Additional arguments. + """ + raise NotImplementedError - Returns: - TextContent: Text content. + @abstractmethod + async def event_listener( + self, + settings: "PromptExecutionSettings | None" = None, + chat_history: "ChatHistory | None" = None, + **kwargs: Any, + ) -> AsyncGenerator["StreamingChatMessageContent", Any]: + """Get text contents from audio. + + Args: + settings: Prompt execution settings. + chat_history: Chat history. + kwargs: Additional arguments. + + Yields: + StreamingChatMessageContent messages """ raise NotImplementedError + + @abstractmethod + async def send_event( + self, + event: str, + event_data: dict[str, Any] | None = None, + **kwargs: Any, + ) -> None: + """Send an event to the session. + + Args: + event: Event name, can be a string or a Enum value. + event_data: Event data. + kwargs: Additional arguments. + """ + raise NotImplementedError + + def _update_function_choice_settings_callback( + self, + ) -> Callable[[FunctionCallChoiceConfiguration, "PromptExecutionSettings", FunctionChoiceType], None]: + """Return the callback function to update the settings from a function call configuration. + + Override this method to provide a custom callback function to + update the settings from a function call configuration. + """ + return lambda configuration, settings, choice_type: None diff --git a/python/tests/unit/contents/test_audio_content.py b/python/tests/unit/contents/test_audio_content.py deleted file mode 100644 index 2af5a99b9e29..000000000000 --- a/python/tests/unit/contents/test_audio_content.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) Microsoft. All rights reserved. 
- -import os - -import pytest - -from semantic_kernel.contents.audio_content import AudioContent - -test_cases = [ - pytest.param(AudioContent(uri="http://test_uri"), id="uri"), - pytest.param(AudioContent(data=b"test_data", mime_type="image/jpeg", data_format="base64"), id="data"), - pytest.param(AudioContent(uri="http://test_uri", data=b"test_data", mime_type="image/jpeg"), id="both"), - pytest.param( - AudioContent.from_image_path( - image_path=os.path.join(os.path.dirname(__file__), "../../", "assets/sample_image.jpg") - ), - id="image_file", - ), -] - - -def test_create_uri(): - image = AudioContent(uri="http://test_uri") - assert str(image.uri) == "http://test_uri/" - - -def test_create_file_from_path(): - image_path = os.path.join(os.path.dirname(__file__), "../../", "assets/sample_image.jpg") - image = AudioContent.from_image_path(image_path=image_path) - assert image.mime_type == "image/jpeg" - assert image.data_uri.startswith("data:image/jpeg;") - assert image.data is not None - - -def test_create_data(): - image = AudioContent(data=b"test_data", mime_type="image/jpeg") - assert image.mime_type == "image/jpeg" - assert image.data == b"test_data" - - -def test_to_str_uri(): - image = AudioContent(uri="http://test_uri") - assert str(image) == "http://test_uri/" - - -def test_to_str_data(): - image = AudioContent(data=b"test_data", mime_type="image/jpeg", data_format="base64") - assert str(image) == "data:image/jpeg;base64,dGVzdF9kYXRh" - - -@pytest.mark.parametrize("image", test_cases) -def test_element_roundtrip(image): - element = image.to_element() - new_image = AudioContent.from_element(element) - assert new_image == image - - -@pytest.mark.parametrize("image", test_cases) -def test_to_dict(image): - assert image.to_dict() == {"type": "image_url", "image_url": {"url": str(image)}} diff --git a/python/uv.lock b/python/uv.lock index 5b7d0b30d6d4..710dd9901b09 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -687,7 +687,7 @@ wheels = [ [[package]] name 
= "chromadb" -version = "0.6.3" +version = "0.5.20" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "bcrypt", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, @@ -719,9 +719,9 @@ dependencies = [ { name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "uvicorn", extra = ["standard"], marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/39/cd/f0f2de3f466ff514fb6b58271c14f6d22198402bb5b71b8d890231265946/chromadb-0.6.3.tar.gz", hash = "sha256:c8f34c0b704b9108b04491480a36d42e894a960429f87c6516027b5481d59ed3", size = 29297929 } +sdist = { url = "https://files.pythonhosted.org/packages/03/31/6c8e05405bb02b4a1f71f9aa3eef242415565dabf6afc1bde7f64f726963/chromadb-0.5.20.tar.gz", hash = "sha256:19513a23b2d20059866216bfd80195d1d4a160ffba234b8899f5e80978160ca7", size = 33664540 } wheels = [ - { url = "https://files.pythonhosted.org/packages/28/8e/5c186c77bf749b6fe0528385e507e463f1667543328d76fd00a49e1a4e6a/chromadb-0.6.3-py3-none-any.whl", hash = "sha256:4851258489a3612b558488d98d09ae0fe0a28d5cad6bd1ba64b96fdc419dc0e5", size = 611129 }, + { url = "https://files.pythonhosted.org/packages/5f/7a/10bf5dc92d13cc03230190fcc5016a0b138d99e5b36b8b89ee0fe1680e10/chromadb-0.5.20-py3-none-any.whl", hash = "sha256:9550ba1b6dce911e35cac2568b301badf4b42f457b99a432bdeec2b6b9dd3680", size = 617884 }, ] [[package]] @@ -1053,6 +1053,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4c/a3/ac312faeceffd2d8f86bc6dcb5c401188ba5a01bc88e69bed97578a0dfcd/durationpy-0.9-py3-none-any.whl", hash = "sha256:e65359a7af5cedad07fb77a2dd3f390f8eb0b74cb845589fa6c057086834dd38", size = 3461 }, ] +[[package]] +name = "environs" +version = "9.5.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "marshmallow", marker = 
"sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "python-dotenv", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d4/e3/c3c6c76f3dbe3e019e9a451b35bf9f44690026a5bb1232f7b77097b72ff5/environs-9.5.0.tar.gz", hash = "sha256:a76307b36fbe856bdca7ee9161e6c466fd7fcffc297109a118c59b54e27e30c9", size = 20795 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ca/5e/f0f217dc393372681bfe05c50f06a212e78d0a3fee907a74ab451ec1dcdb/environs-9.5.0-py2.py3-none-any.whl", hash = "sha256:1e549569a3de49c05f856f40bce86979e7d5ffbbc4398e7f338574c220189124", size = 12548 }, +] + [[package]] name = "eval-type-backport" version = "0.2.2" @@ -1505,122 +1518,122 @@ wheels = [ [[package]] name = "grpcio" -version = "1.67.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/20/53/d9282a66a5db45981499190b77790570617a604a38f3d103d0400974aeb5/grpcio-1.67.1.tar.gz", hash = "sha256:3dc2ed4cabea4dc14d5e708c2b426205956077cc5de419b4d4079315017e9732", size = 12580022 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4e/cd/f6ca5c49aa0ae7bc6d0757f7dae6f789569e9490a635eaabe02bc02de7dc/grpcio-1.67.1-cp310-cp310-linux_armv7l.whl", hash = "sha256:8b0341d66a57f8a3119b77ab32207072be60c9bf79760fa609c5609f2deb1f3f", size = 5112450 }, - { url = "https://files.pythonhosted.org/packages/d4/f0/d9bbb4a83cbee22f738ee7a74aa41e09ccfb2dcea2cc30ebe8dab5b21771/grpcio-1.67.1-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:f5a27dddefe0e2357d3e617b9079b4bfdc91341a91565111a21ed6ebbc51b22d", size = 10937518 }, - { url = "https://files.pythonhosted.org/packages/5b/17/0c5dbae3af548eb76669887642b5f24b232b021afe77eb42e22bc8951d9c/grpcio-1.67.1-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:43112046864317498a33bdc4797ae6a268c36345a910de9b9c17159d8346602f", size = 5633610 }, - { url = 
"https://files.pythonhosted.org/packages/17/48/e000614e00153d7b2760dcd9526b95d72f5cfe473b988e78f0ff3b472f6c/grpcio-1.67.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c9b929f13677b10f63124c1a410994a401cdd85214ad83ab67cc077fc7e480f0", size = 6240678 }, - { url = "https://files.pythonhosted.org/packages/64/19/a16762a70eeb8ddfe43283ce434d1499c1c409ceec0c646f783883084478/grpcio-1.67.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e7d1797a8a3845437d327145959a2c0c47c05947c9eef5ff1a4c80e499dcc6fa", size = 5884528 }, - { url = "https://files.pythonhosted.org/packages/6b/dc/bd016aa3684914acd2c0c7fa4953b2a11583c2b844f3d7bae91fa9b98fbb/grpcio-1.67.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:0489063974d1452436139501bf6b180f63d4977223ee87488fe36858c5725292", size = 6583680 }, - { url = "https://files.pythonhosted.org/packages/1a/93/1441cb14c874f11aa798a816d582f9da82194b6677f0f134ea53d2d5dbeb/grpcio-1.67.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9fd042de4a82e3e7aca44008ee2fb5da01b3e5adb316348c21980f7f58adc311", size = 6162967 }, - { url = "https://files.pythonhosted.org/packages/29/e9/9295090380fb4339b7e935b9d005fa9936dd573a22d147c9e5bb2df1b8d4/grpcio-1.67.1-cp310-cp310-win32.whl", hash = "sha256:638354e698fd0c6c76b04540a850bf1db27b4d2515a19fcd5cf645c48d3eb1ed", size = 3616336 }, - { url = "https://files.pythonhosted.org/packages/ce/de/7c783b8cb8f02c667ca075c49680c4aeb8b054bc69784bcb3e7c1bbf4985/grpcio-1.67.1-cp310-cp310-win_amd64.whl", hash = "sha256:608d87d1bdabf9e2868b12338cd38a79969eaf920c89d698ead08f48de9c0f9e", size = 4352071 }, - { url = "https://files.pythonhosted.org/packages/59/2c/b60d6ea1f63a20a8d09c6db95c4f9a16497913fb3048ce0990ed81aeeca0/grpcio-1.67.1-cp311-cp311-linux_armv7l.whl", hash = "sha256:7818c0454027ae3384235a65210bbf5464bd715450e30a3d40385453a85a70cb", size = 5119075 }, - { url = 
"https://files.pythonhosted.org/packages/b3/9a/e1956f7ca582a22dd1f17b9e26fcb8229051b0ce6d33b47227824772feec/grpcio-1.67.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ea33986b70f83844cd00814cee4451055cd8cab36f00ac64a31f5bb09b31919e", size = 11009159 }, - { url = "https://files.pythonhosted.org/packages/43/a8/35fbbba580c4adb1d40d12e244cf9f7c74a379073c0a0ca9d1b5338675a1/grpcio-1.67.1-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:c7a01337407dd89005527623a4a72c5c8e2894d22bead0895306b23c6695698f", size = 5629476 }, - { url = "https://files.pythonhosted.org/packages/77/c9/864d336e167263d14dfccb4dbfa7fce634d45775609895287189a03f1fc3/grpcio-1.67.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:80b866f73224b0634f4312a4674c1be21b2b4afa73cb20953cbbb73a6b36c3cc", size = 6239901 }, - { url = "https://files.pythonhosted.org/packages/f7/1e/0011408ebabf9bd69f4f87cc1515cbfe2094e5a32316f8714a75fd8ddfcb/grpcio-1.67.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9fff78ba10d4250bfc07a01bd6254a6d87dc67f9627adece85c0b2ed754fa96", size = 5881010 }, - { url = "https://files.pythonhosted.org/packages/b4/7d/fbca85ee9123fb296d4eff8df566f458d738186d0067dec6f0aa2fd79d71/grpcio-1.67.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:8a23cbcc5bb11ea7dc6163078be36c065db68d915c24f5faa4f872c573bb400f", size = 6580706 }, - { url = "https://files.pythonhosted.org/packages/75/7a/766149dcfa2dfa81835bf7df623944c1f636a15fcb9b6138ebe29baf0bc6/grpcio-1.67.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:1a65b503d008f066e994f34f456e0647e5ceb34cfcec5ad180b1b44020ad4970", size = 6161799 }, - { url = "https://files.pythonhosted.org/packages/09/13/5b75ae88810aaea19e846f5380611837de411181df51fd7a7d10cb178dcb/grpcio-1.67.1-cp311-cp311-win32.whl", hash = "sha256:e29ca27bec8e163dca0c98084040edec3bc49afd10f18b412f483cc68c712744", size = 3616330 }, - { url = 
"https://files.pythonhosted.org/packages/aa/39/38117259613f68f072778c9638a61579c0cfa5678c2558706b10dd1d11d3/grpcio-1.67.1-cp311-cp311-win_amd64.whl", hash = "sha256:786a5b18544622bfb1e25cc08402bd44ea83edfb04b93798d85dca4d1a0b5be5", size = 4354535 }, - { url = "https://files.pythonhosted.org/packages/6e/25/6f95bd18d5f506364379eabc0d5874873cc7dbdaf0757df8d1e82bc07a88/grpcio-1.67.1-cp312-cp312-linux_armv7l.whl", hash = "sha256:267d1745894200e4c604958da5f856da6293f063327cb049a51fe67348e4f953", size = 5089809 }, - { url = "https://files.pythonhosted.org/packages/10/3f/d79e32e5d0354be33a12db2267c66d3cfeff700dd5ccdd09fd44a3ff4fb6/grpcio-1.67.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:85f69fdc1d28ce7cff8de3f9c67db2b0ca9ba4449644488c1e0303c146135ddb", size = 10981985 }, - { url = "https://files.pythonhosted.org/packages/21/f2/36fbc14b3542e3a1c20fb98bd60c4732c55a44e374a4eb68f91f28f14aab/grpcio-1.67.1-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:f26b0b547eb8d00e195274cdfc63ce64c8fc2d3e2d00b12bf468ece41a0423a0", size = 5588770 }, - { url = "https://files.pythonhosted.org/packages/0d/af/bbc1305df60c4e65de8c12820a942b5e37f9cf684ef5e49a63fbb1476a73/grpcio-1.67.1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4422581cdc628f77302270ff839a44f4c24fdc57887dc2a45b7e53d8fc2376af", size = 6214476 }, - { url = "https://files.pythonhosted.org/packages/92/cf/1d4c3e93efa93223e06a5c83ac27e32935f998bc368e276ef858b8883154/grpcio-1.67.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d7616d2ded471231c701489190379e0c311ee0a6c756f3c03e6a62b95a7146e", size = 5850129 }, - { url = "https://files.pythonhosted.org/packages/ae/ca/26195b66cb253ac4d5ef59846e354d335c9581dba891624011da0e95d67b/grpcio-1.67.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8a00efecde9d6fcc3ab00c13f816313c040a28450e5e25739c24f432fc6d3c75", size = 6568489 }, - { url = 
"https://files.pythonhosted.org/packages/d1/94/16550ad6b3f13b96f0856ee5dfc2554efac28539ee84a51d7b14526da985/grpcio-1.67.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:699e964923b70f3101393710793289e42845791ea07565654ada0969522d0a38", size = 6149369 }, - { url = "https://files.pythonhosted.org/packages/33/0d/4c3b2587e8ad7f121b597329e6c2620374fccbc2e4e1aa3c73ccc670fde4/grpcio-1.67.1-cp312-cp312-win32.whl", hash = "sha256:4e7b904484a634a0fff132958dabdb10d63e0927398273917da3ee103e8d1f78", size = 3599176 }, - { url = "https://files.pythonhosted.org/packages/7d/36/0c03e2d80db69e2472cf81c6123aa7d14741de7cf790117291a703ae6ae1/grpcio-1.67.1-cp312-cp312-win_amd64.whl", hash = "sha256:5721e66a594a6c4204458004852719b38f3d5522082be9061d6510b455c90afc", size = 4346574 }, - { url = "https://files.pythonhosted.org/packages/12/d2/2f032b7a153c7723ea3dea08bffa4bcaca9e0e5bdf643ce565b76da87461/grpcio-1.67.1-cp313-cp313-linux_armv7l.whl", hash = "sha256:aa0162e56fd10a5547fac8774c4899fc3e18c1aa4a4759d0ce2cd00d3696ea6b", size = 5091487 }, - { url = "https://files.pythonhosted.org/packages/d0/ae/ea2ff6bd2475a082eb97db1104a903cf5fc57c88c87c10b3c3f41a184fc0/grpcio-1.67.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:beee96c8c0b1a75d556fe57b92b58b4347c77a65781ee2ac749d550f2a365dc1", size = 10943530 }, - { url = "https://files.pythonhosted.org/packages/07/62/646be83d1a78edf8d69b56647327c9afc223e3140a744c59b25fbb279c3b/grpcio-1.67.1-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:a93deda571a1bf94ec1f6fcda2872dad3ae538700d94dc283c672a3b508ba3af", size = 5589079 }, - { url = "https://files.pythonhosted.org/packages/d0/25/71513d0a1b2072ce80d7f5909a93596b7ed10348b2ea4fdcbad23f6017bf/grpcio-1.67.1-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0e6f255980afef598a9e64a24efce87b625e3e3c80a45162d111a461a9f92955", size = 6213542 }, - { url = 
"https://files.pythonhosted.org/packages/76/9a/d21236297111052dcb5dc85cd77dc7bf25ba67a0f55ae028b2af19a704bc/grpcio-1.67.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e838cad2176ebd5d4a8bb03955138d6589ce9e2ce5d51c3ada34396dbd2dba8", size = 5850211 }, - { url = "https://files.pythonhosted.org/packages/2d/fe/70b1da9037f5055be14f359026c238821b9bcf6ca38a8d760f59a589aacd/grpcio-1.67.1-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:a6703916c43b1d468d0756c8077b12017a9fcb6a1ef13faf49e67d20d7ebda62", size = 6572129 }, - { url = "https://files.pythonhosted.org/packages/74/0d/7df509a2cd2a54814598caf2fb759f3e0b93764431ff410f2175a6efb9e4/grpcio-1.67.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:917e8d8994eed1d86b907ba2a61b9f0aef27a2155bca6cbb322430fc7135b7bb", size = 6149819 }, - { url = "https://files.pythonhosted.org/packages/0a/08/bc3b0155600898fd10f16b79054e1cca6cb644fa3c250c0fe59385df5e6f/grpcio-1.67.1-cp313-cp313-win32.whl", hash = "sha256:e279330bef1744040db8fc432becc8a727b84f456ab62b744d3fdb83f327e121", size = 3596561 }, - { url = "https://files.pythonhosted.org/packages/5a/96/44759eca966720d0f3e1b105c43f8ad4590c97bf8eb3cd489656e9590baa/grpcio-1.67.1-cp313-cp313-win_amd64.whl", hash = "sha256:fa0c739ad8b1996bd24823950e3cb5152ae91fca1c09cc791190bf1627ffefba", size = 4346042 }, +version = "1.69.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e4/87/06a145284cbe86c91ca517fe6b57be5efbb733c0d6374b407f0992054d18/grpcio-1.69.0.tar.gz", hash = "sha256:936fa44241b5379c5afc344e1260d467bee495747eaf478de825bab2791da6f5", size = 12738244 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b0/6e/2f8ee5fb65aef962d0bd7e46b815e7b52820687e29c138eaee207a688abc/grpcio-1.69.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:2060ca95a8db295ae828d0fc1c7f38fb26ccd5edf9aa51a0f44251f5da332e97", size = 5190753 }, + { url = 
"https://files.pythonhosted.org/packages/89/07/028dcda44d40f9488f0a0de79c5ffc80e2c1bc5ed89da9483932e3ea67cf/grpcio-1.69.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:2e52e107261fd8fa8fa457fe44bfadb904ae869d87c1280bf60f93ecd3e79278", size = 11096752 }, + { url = "https://files.pythonhosted.org/packages/99/a0/c727041b1410605ba38b585b6b52c1a289d7fcd70a41bccbc2c58fc643b2/grpcio-1.69.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:316463c0832d5fcdb5e35ff2826d9aa3f26758d29cdfb59a368c1d6c39615a11", size = 5705442 }, + { url = "https://files.pythonhosted.org/packages/7a/2f/1c53f5d127ff882443b19c757d087da1908f41c58c4b098e8eaf6b2bb70a/grpcio-1.69.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:26c9a9c4ac917efab4704b18eed9082ed3b6ad19595f047e8173b5182fec0d5e", size = 6333796 }, + { url = "https://files.pythonhosted.org/packages/cc/f6/2017da2a1b64e896af710253e5bfbb4188605cdc18bce3930dae5cdbf502/grpcio-1.69.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90b3646ced2eae3a0599658eeccc5ba7f303bf51b82514c50715bdd2b109e5ec", size = 5954245 }, + { url = "https://files.pythonhosted.org/packages/c1/65/1395bec928e99ba600464fb01b541e7e4cdd462e6db25259d755ef9f8d02/grpcio-1.69.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:3b75aea7c6cb91b341c85e7c1d9db1e09e1dd630b0717f836be94971e015031e", size = 6664854 }, + { url = "https://files.pythonhosted.org/packages/40/57/8b3389cfeb92056c8b44288c9c4ed1d331bcad0215c4eea9ae4629e156d9/grpcio-1.69.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:5cfd14175f9db33d4b74d63de87c64bb0ee29ce475ce3c00c01ad2a3dc2a9e51", size = 6226854 }, + { url = "https://files.pythonhosted.org/packages/cc/61/1f2bbeb7c15544dffc98b3f65c093e746019995e6f1e21dc3655eec3dc23/grpcio-1.69.0-cp310-cp310-win32.whl", hash = "sha256:9031069d36cb949205293cf0e243abd5e64d6c93e01b078c37921493a41b72dc", size = 3662734 }, + { url = 
"https://files.pythonhosted.org/packages/ef/ba/bf1a6d9f5c17d2da849793d72039776c56c98c889c9527f6721b6ee57e6e/grpcio-1.69.0-cp310-cp310-win_amd64.whl", hash = "sha256:cc89b6c29f3dccbe12d7a3b3f1b3999db4882ae076c1c1f6df231d55dbd767a5", size = 4410306 }, + { url = "https://files.pythonhosted.org/packages/8d/cd/ca256aeef64047881586331347cd5a68a4574ba1a236e293cd8eba34e355/grpcio-1.69.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:8de1b192c29b8ce45ee26a700044717bcbbd21c697fa1124d440548964328561", size = 5198734 }, + { url = "https://files.pythonhosted.org/packages/37/3f/10c1e5e0150bf59aa08ea6aebf38f87622f95f7f33f98954b43d1b2a3200/grpcio-1.69.0-cp311-cp311-macosx_10_14_universal2.whl", hash = "sha256:7e76accf38808f5c5c752b0ab3fd919eb14ff8fafb8db520ad1cc12afff74de6", size = 11135285 }, + { url = "https://files.pythonhosted.org/packages/08/61/61cd116a572203a740684fcba3fef37a3524f1cf032b6568e1e639e59db0/grpcio-1.69.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:d5658c3c2660417d82db51e168b277e0ff036d0b0f859fa7576c0ffd2aec1442", size = 5699468 }, + { url = "https://files.pythonhosted.org/packages/01/f1/a841662e8e2465ba171c973b77d18fa7438ced535519b3c53617b7e6e25c/grpcio-1.69.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5494d0e52bf77a2f7eb17c6da662886ca0a731e56c1c85b93505bece8dc6cf4c", size = 6332337 }, + { url = "https://files.pythonhosted.org/packages/62/b1/c30e932e02c2e0bfdb8df46fe3b0c47f518fb04158ebdc0eb96cc97d642f/grpcio-1.69.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4ed866f9edb574fd9be71bf64c954ce1b88fc93b2a4cbf94af221e9426eb14d6", size = 5949844 }, + { url = "https://files.pythonhosted.org/packages/5e/cb/55327d43b6286100ffae7d1791be6178d13c917382f3e9f43f82e8b393cf/grpcio-1.69.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c5ba38aeac7a2fe353615c6b4213d1fbb3a3c34f86b4aaa8be08baaaee8cc56d", size = 6661828 }, + { url = 
"https://files.pythonhosted.org/packages/6f/e4/120d72ae982d51cb9cabcd9672f8a1c6d62011b493a4d049d2abdf564db0/grpcio-1.69.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f79e05f5bbf551c4057c227d1b041ace0e78462ac8128e2ad39ec58a382536d2", size = 6226026 }, + { url = "https://files.pythonhosted.org/packages/96/e8/2cc15f11db506d7b1778f0587fa7bdd781602b05b3c4d75b7ca13de33d62/grpcio-1.69.0-cp311-cp311-win32.whl", hash = "sha256:bf1f8be0da3fcdb2c1e9f374f3c2d043d606d69f425cd685110dd6d0d2d61258", size = 3662653 }, + { url = "https://files.pythonhosted.org/packages/42/78/3c5216829a48237fcb71a077f891328a435e980d9757a9ebc49114d88768/grpcio-1.69.0-cp311-cp311-win_amd64.whl", hash = "sha256:fb9302afc3a0e4ba0b225cd651ef8e478bf0070cf11a529175caecd5ea2474e7", size = 4412824 }, + { url = "https://files.pythonhosted.org/packages/61/1d/8f28f147d7f3f5d6b6082f14e1e0f40d58e50bc2bd30d2377c730c57a286/grpcio-1.69.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:fc18a4de8c33491ad6f70022af5c460b39611e39578a4d84de0fe92f12d5d47b", size = 5161414 }, + { url = "https://files.pythonhosted.org/packages/35/4b/9ab8ea65e515e1844feced1ef9e7a5d8359c48d986c93f3d2a2006fbdb63/grpcio-1.69.0-cp312-cp312-macosx_10_14_universal2.whl", hash = "sha256:0f0270bd9ffbff6961fe1da487bdcd594407ad390cc7960e738725d4807b18c4", size = 11108909 }, + { url = "https://files.pythonhosted.org/packages/99/68/1856fde2b3c3162bdfb9845978608deef3606e6907fdc2c87443fce6ecd0/grpcio-1.69.0-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:dc48f99cc05e0698e689b51a05933253c69a8c8559a47f605cff83801b03af0e", size = 5658302 }, + { url = "https://files.pythonhosted.org/packages/3e/21/3fa78d38dc5080d0d677103fad3a8cd55091635cc2069a7c06c7a54e6c4d/grpcio-1.69.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e925954b18d41aeb5ae250262116d0970893b38232689c4240024e4333ac084", size = 6306201 }, + { url = 
"https://files.pythonhosted.org/packages/f3/cb/5c47b82fd1baf43dba973ae399095d51aaf0085ab0439838b4cbb1e87e3c/grpcio-1.69.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87d222569273720366f68a99cb62e6194681eb763ee1d3b1005840678d4884f9", size = 5919649 }, + { url = "https://files.pythonhosted.org/packages/c6/67/59d1a56a0f9508a29ea03e1ce800bdfacc1f32b4f6b15274b2e057bf8758/grpcio-1.69.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:b62b0f41e6e01a3e5082000b612064c87c93a49b05f7602fe1b7aa9fd5171a1d", size = 6648974 }, + { url = "https://files.pythonhosted.org/packages/f8/fe/ca70c14d98c6400095f19a0f4df8273d09c2106189751b564b26019f1dbe/grpcio-1.69.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:db6f9fd2578dbe37db4b2994c94a1d9c93552ed77dca80e1657bb8a05b898b55", size = 6215144 }, + { url = "https://files.pythonhosted.org/packages/b3/94/b2b0a9fd487fc8262e20e6dd0ec90d9fa462c82a43b4855285620f6e9d01/grpcio-1.69.0-cp312-cp312-win32.whl", hash = "sha256:b192b81076073ed46f4b4dd612b8897d9a1e39d4eabd822e5da7b38497ed77e1", size = 3644552 }, + { url = "https://files.pythonhosted.org/packages/93/99/81aec9f85412e3255a591ae2ccb799238e074be774e5f741abae08a23418/grpcio-1.69.0-cp312-cp312-win_amd64.whl", hash = "sha256:1227ff7836f7b3a4ab04e5754f1d001fa52a730685d3dc894ed8bc262cc96c01", size = 4399532 }, + { url = "https://files.pythonhosted.org/packages/54/47/3ff4501365f56b7cc16617695dbd4fd838c5e362bc7fa9fee09d592f7d78/grpcio-1.69.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:a78a06911d4081a24a1761d16215a08e9b6d4d29cdbb7e427e6c7e17b06bcc5d", size = 5162928 }, + { url = "https://files.pythonhosted.org/packages/c0/63/437174c5fa951052c9ecc5f373f62af6f3baf25f3f5ef35cbf561806b371/grpcio-1.69.0-cp313-cp313-macosx_10_14_universal2.whl", hash = "sha256:dc5a351927d605b2721cbb46158e431dd49ce66ffbacb03e709dc07a491dde35", size = 11103027 }, + { url = 
"https://files.pythonhosted.org/packages/53/df/53566a6fdc26b6d1f0585896e1cc4825961039bca5a6a314ff29d79b5d5b/grpcio-1.69.0-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:3629d8a8185f5139869a6a17865d03113a260e311e78fbe313f1a71603617589", size = 5659277 }, + { url = "https://files.pythonhosted.org/packages/e6/4c/b8a0c4f71498b6f9be5ca6d290d576cf2af9d95fd9827c47364f023969ad/grpcio-1.69.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c9a281878feeb9ae26db0622a19add03922a028d4db684658f16d546601a4870", size = 6305255 }, + { url = "https://files.pythonhosted.org/packages/ef/55/d9aa05eb3dfcf6aa946aaf986740ec07fc5189f20e2cbeb8c5d278ffd00f/grpcio-1.69.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cc614e895177ab7e4b70f154d1a7c97e152577ea101d76026d132b7aaba003b", size = 5920240 }, + { url = "https://files.pythonhosted.org/packages/ea/eb/774b27c51e3e386dfe6c491a710f6f87ffdb20d88ec6c3581e047d9354a2/grpcio-1.69.0-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:1ee76cd7e2e49cf9264f6812d8c9ac1b85dda0eaea063af07292400f9191750e", size = 6652974 }, + { url = "https://files.pythonhosted.org/packages/59/98/96de14e6e7d89123813d58c246d9b0f1fbd24f9277f5295264e60861d9d6/grpcio-1.69.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:0470fa911c503af59ec8bc4c82b371ee4303ececbbdc055f55ce48e38b20fd67", size = 6215757 }, + { url = "https://files.pythonhosted.org/packages/7d/5b/ce922e0785910b10756fabc51fd294260384a44bea41651dadc4e47ddc82/grpcio-1.69.0-cp313-cp313-win32.whl", hash = "sha256:b650f34aceac8b2d08a4c8d7dc3e8a593f4d9e26d86751ebf74ebf5107d927de", size = 3642488 }, + { url = "https://files.pythonhosted.org/packages/5d/04/11329e6ca1ceeb276df2d9c316b5e170835a687a4d0f778dba8294657e36/grpcio-1.69.0-cp313-cp313-win_amd64.whl", hash = "sha256:028337786f11fecb5d7b7fa660475a06aabf7e5e52b5ac2df47414878c0ce7ea", size = 4399968 }, ] [[package]] name = "grpcio-health-checking" -version = "1.67.1" +version = "1.69.0" source = { 
registry = "https://pypi.org/simple" } dependencies = [ { name = "grpcio", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "protobuf", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/64/dd/e3b339fa44dc75b501a1a22cb88f1af5b1f8c964488f19c4de4cfbbf05ba/grpcio_health_checking-1.67.1.tar.gz", hash = "sha256:ca90fa76a6afbb4fda71d734cb9767819bba14928b91e308cffbb0c311eb941e", size = 16775 } +sdist = { url = "https://files.pythonhosted.org/packages/ef/b8/d6d485e27d60174ba22c25587c1a97512c6a800633cfd6a8cd7943ad66e0/grpcio_health_checking-1.69.0.tar.gz", hash = "sha256:ff6e1d38c2a300b1bbd296916fbd9165667bc4b5a8557f99dd4226d4f9e8f4c1", size = 16809 } wheels = [ - { url = "https://files.pythonhosted.org/packages/5c/8d/7a9878dca6616b48093d71c52d0bc79cb2dd1a2698ff6f5ce7406306de12/grpcio_health_checking-1.67.1-py3-none-any.whl", hash = "sha256:93753da5062152660aef2286c9b261e07dd87124a65e4dc9fbd47d1ce966b39d", size = 18924 }, + { url = "https://files.pythonhosted.org/packages/a4/07/8d68bb1821dc46dfb5b702374c5d06e9c0013afb08fa92516ebd8f963ef3/grpcio_health_checking-1.69.0-py3-none-any.whl", hash = "sha256:d2d0eec7e3af245863fd4997e2942d27c0868fbd61ffa4d14bc492c3e2c67127", size = 18923 }, ] [[package]] name = "grpcio-status" -version = "1.67.1" +version = "1.69.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "googleapis-common-protos", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "grpcio", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "protobuf", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] -sdist = { url = 
"https://files.pythonhosted.org/packages/be/c7/fe0e79a80ac6346e0c6c0a24e9e3cbc3ae1c2a009acffb59eab484a6f69b/grpcio_status-1.67.1.tar.gz", hash = "sha256:2bf38395e028ceeecfd8866b081f61628114b384da7d51ae064ddc8d766a5d11", size = 13673 } +sdist = { url = "https://files.pythonhosted.org/packages/02/35/52dc0d8300f879dbf9cdc95764cee9f56d5a212998cfa1a8871b262df2a4/grpcio_status-1.69.0.tar.gz", hash = "sha256:595ef84e5178d6281caa732ccf68ff83259241608d26b0e9c40a5e66eee2a2d2", size = 13662 } wheels = [ - { url = "https://files.pythonhosted.org/packages/05/18/56999a1da3577d8ccc8698a575d6638e15fe25650cc88b2ce0a087f180b9/grpcio_status-1.67.1-py3-none-any.whl", hash = "sha256:16e6c085950bdacac97c779e6a502ea671232385e6e37f258884d6883392c2bd", size = 14427 }, + { url = "https://files.pythonhosted.org/packages/f6/e2/346a766a4232f74f45f8bc70e636fc3a6677e6bc3893382187829085f12e/grpcio_status-1.69.0-py3-none-any.whl", hash = "sha256:d6b2a3c9562c03a817c628d7ba9a925e209c228762d6d7677ae5c9401a542853", size = 14428 }, ] [[package]] name = "grpcio-tools" -version = "1.67.1" +version = "1.69.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "grpcio", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "protobuf", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "setuptools", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/ae/f9/6facde12a5a8da4398a3a8947f8ba6ef33b408dfc9767c8cefc0074ddd68/grpcio_tools-1.67.1.tar.gz", hash = "sha256:d9657f5ddc62b52f58904e6054b7d8a8909ed08a1e28b734be3a707087bcf004", size = 5159073 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e5/46/668e681e2e4ca7dc80cb5ad22bc794958c8b604b5b3143f16b94be3c0118/grpcio_tools-1.67.1-cp310-cp310-linux_armv7l.whl", hash = 
"sha256:c701aaa51fde1f2644bd94941aa94c337adb86f25cd03cf05e37387aaea25800", size = 2308117 }, - { url = "https://files.pythonhosted.org/packages/d6/56/1c65fb7c836cd40470f1f1a88185973466241fdb42b42b7a83367c268622/grpcio_tools-1.67.1-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:6a722bba714392de2386569c40942566b83725fa5c5450b8910e3832a5379469", size = 5500152 }, - { url = "https://files.pythonhosted.org/packages/01/ab/caf9c330241d843a83043b023e2996e959cdc2c3ab404b1a9938eb734143/grpcio_tools-1.67.1-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:0c7415235cb154e40b5ae90e2a172a0eb8c774b6876f53947cf0af05c983d549", size = 2282055 }, - { url = "https://files.pythonhosted.org/packages/75/e6/0cd849d140b58fedb7d3b15d907fe2eefd4dadff09b570dd687d841c5d00/grpcio_tools-1.67.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6a4c459098c4934f9470280baf9ff8b38c365e147f33c8abc26039a948a664a5", size = 2617360 }, - { url = "https://files.pythonhosted.org/packages/b9/51/bd73cd6515c2e81ba0a29b3cf6f2f62ad94737326f70b32511d1972a383e/grpcio_tools-1.67.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e89bf53a268f55c16989dab1cf0b32a5bff910762f138136ffad4146129b7a10", size = 2416028 }, - { url = "https://files.pythonhosted.org/packages/47/e5/6a16e23036f625b6d60b579996bb9bb7165485903f934d9d9d73b3f03ef5/grpcio_tools-1.67.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:f09cb3e6bcb140f57b878580cf3b848976f67faaf53d850a7da9bfac12437068", size = 3224906 }, - { url = "https://files.pythonhosted.org/packages/14/cb/230c17d4372fa46fc799a822f25fa00c8eb3f85cc86e192b9606a17f732f/grpcio_tools-1.67.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:616dd0c6686212ca90ff899bb37eb774798677e43dc6f78c6954470782d37399", size = 2870384 }, - { url = "https://files.pythonhosted.org/packages/66/fd/6d9dd3bf5982ab7d7e773f055360185e96a96cf95f2cbc7f53ded5912ef5/grpcio_tools-1.67.1-cp310-cp310-win32.whl", hash = 
"sha256:58a66dbb3f0fef0396737ac09d6571a7f8d96a544ce3ed04c161f3d4fa8d51cc", size = 941138 }, - { url = "https://files.pythonhosted.org/packages/6a/97/2fd5ebd996c12b2cb1e1202ee4a03cac0a65ba17d29dd34253bfe2079839/grpcio_tools-1.67.1-cp310-cp310-win_amd64.whl", hash = "sha256:89ee7c505bdf152e67c2cced6055aed4c2d4170f53a2b46a7e543d3b90e7b977", size = 1091151 }, - { url = "https://files.pythonhosted.org/packages/b5/9a/ec06547673c5001c2604637069ff8f287df1aef3f0f8809b09a1c936b049/grpcio_tools-1.67.1-cp311-cp311-linux_armv7l.whl", hash = "sha256:6d80ddd87a2fb7131d242f7d720222ef4f0f86f53ec87b0a6198c343d8e4a86e", size = 2307990 }, - { url = "https://files.pythonhosted.org/packages/ca/84/4b7c3c27a2972c00b3b6ccaadd349e0f86b7039565d3a4932e219a4d76e0/grpcio_tools-1.67.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:b655425b82df51f3bd9fd3ba1a6282d5c9ce1937709f059cb3d419b224532d89", size = 5526552 }, - { url = "https://files.pythonhosted.org/packages/a7/2d/a620e4c53a3b808ebecaa5033c2176925ee1c6cbb45c29af8bec9a249822/grpcio_tools-1.67.1-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:250241e6f9d20d0910a46887dfcbf2ec9108efd3b48f3fb95bb42d50d09d03f8", size = 2282137 }, - { url = "https://files.pythonhosted.org/packages/ec/29/e188b2e438781b37532abb8f10caf5b09c611a0bf9a09940b4cf303afd5b/grpcio_tools-1.67.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6008f5a5add0b6f03082edb597acf20d5a9e4e7c55ea1edac8296c19e6a0ec8d", size = 2617333 }, - { url = "https://files.pythonhosted.org/packages/86/aa/2bbccd3c34b1fa48b892fbad91525c33a8aa85cbedd50e8b0d17dc260dc3/grpcio_tools-1.67.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5eff9818c3831fa23735db1fa39aeff65e790044d0a312260a0c41ae29cc2d9e", size = 2415806 }, - { url = "https://files.pythonhosted.org/packages/db/34/99853a8ced1119937d02511476018dc1d6b295a4803d4ead5dbf9c55e9bc/grpcio_tools-1.67.1-cp311-cp311-musllinux_1_1_i686.whl", hash = 
"sha256:262ab7c40113f8c3c246e28e369661ddf616a351cb34169b8ba470c9a9c3b56f", size = 3224765 }, - { url = "https://files.pythonhosted.org/packages/66/39/8537a8ace8f6242f2058677e56a429587ec731c332985af34f35d496ca58/grpcio_tools-1.67.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:1eebd8c746adf5786fa4c3056258c21cc470e1eca51d3ed23a7fb6a697fe4e81", size = 2870446 }, - { url = "https://files.pythonhosted.org/packages/28/2a/5c04375adccff58647d48675e055895c31811a0ad896e4ba310833e2154d/grpcio_tools-1.67.1-cp311-cp311-win32.whl", hash = "sha256:3eff92fb8ca1dd55e3af0ef02236c648921fb7d0e8ca206b889585804b3659ae", size = 940890 }, - { url = "https://files.pythonhosted.org/packages/e6/ee/7861339c2cec8d55a5e859cf3682bda34eab5a040f95d0c80f775d6a3279/grpcio_tools-1.67.1-cp311-cp311-win_amd64.whl", hash = "sha256:1ed18281ee17e5e0f9f6ce0c6eb3825ca9b5a0866fc1db2e17fab8aca28b8d9f", size = 1091094 }, - { url = "https://files.pythonhosted.org/packages/d9/cf/7b1908ca72e484bac555431036292c48d2d6504a45e2789848cb5ff313a8/grpcio_tools-1.67.1-cp312-cp312-linux_armv7l.whl", hash = "sha256:bd5caef3a484e226d05a3f72b2d69af500dca972cf434bf6b08b150880166f0b", size = 2307645 }, - { url = "https://files.pythonhosted.org/packages/bb/15/0d1efb38af8af7e56b2342322634a3caf5f1337a6c3857a6d14aa590dfdf/grpcio_tools-1.67.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:48a2d63d1010e5b218e8e758ecb2a8d63c0c6016434e9f973df1c3558917020a", size = 5525468 }, - { url = "https://files.pythonhosted.org/packages/52/42/a810709099f09ade7f32990c0712c555b3d7eab6a05fb62618c17f8fe9da/grpcio_tools-1.67.1-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:baa64a6aa009bffe86309e236c81b02cd4a88c1ebd66f2d92e84e9b97a9ae857", size = 2281768 }, - { url = "https://files.pythonhosted.org/packages/4c/2a/64ee6cfdf1c32ef8bdd67bf04ae2f745f517f4a546281453ca1f68fa79ca/grpcio_tools-1.67.1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:4ab318c40b5e3c097a159035fc3e4ecfbe9b3d2c9de189e55468b2c27639a6ab", size = 2617359 }, - { url = "https://files.pythonhosted.org/packages/79/7f/1ed8cd1529253fef9cf0ef3cd8382641125a5ca2eaa08eaffbb549f84e0b/grpcio_tools-1.67.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50eba3e31f9ac1149463ad9182a37349850904f142cffbd957cd7f54ec320b8e", size = 2415323 }, - { url = "https://files.pythonhosted.org/packages/8e/08/59f0073c58703c176c15fb1a838763b77c1c06994adba16654b92a666e1b/grpcio_tools-1.67.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:de6fbc071ecc4fe6e354a7939202191c1f1abffe37fbce9b08e7e9a5b93eba3d", size = 3225051 }, - { url = "https://files.pythonhosted.org/packages/b7/0d/a5d703214fe49d261b4b8f0a64140a4dc1f88560724a38ad937120b899ad/grpcio_tools-1.67.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:db9e87f6ea4b0ce99b2651203480585fd9e8dd0dd122a19e46836e93e3a1b749", size = 2870421 }, - { url = "https://files.pythonhosted.org/packages/ac/af/41d79cb87eae99c0348e8f1fb3dbed9e40a6f63548b216e99f4d1165fa5c/grpcio_tools-1.67.1-cp312-cp312-win32.whl", hash = "sha256:6a595a872fb720dde924c4e8200f41d5418dd6baab8cc1a3c1e540f8f4596351", size = 940542 }, - { url = "https://files.pythonhosted.org/packages/66/e5/096e12f5319835aa2bcb746d49ae62220bb48313ca649e89bdbef605c11d/grpcio_tools-1.67.1-cp312-cp312-win_amd64.whl", hash = "sha256:92eebb9b31031604ae97ea7657ae2e43149b0394af7117ad7e15894b6cc136dc", size = 1090425 }, - { url = "https://files.pythonhosted.org/packages/62/b3/91c88440c978740752d39f1abae83f21408048b98b93652ebd84f974ad3d/grpcio_tools-1.67.1-cp313-cp313-linux_armv7l.whl", hash = "sha256:9a3b9510cc87b6458b05ad49a6dee38df6af37f9ee6aa027aa086537798c3d4a", size = 2307453 }, - { url = "https://files.pythonhosted.org/packages/05/33/faf3330825463c0409fa3891bc1459bf86a00055b19790211365279538d7/grpcio_tools-1.67.1-cp313-cp313-macosx_10_13_universal2.whl", hash = 
"sha256:9e4c9b9fa9b905f15d414cb7bd007ba7499f8907bdd21231ab287a86b27da81a", size = 5517975 }, - { url = "https://files.pythonhosted.org/packages/bd/78/461ab34cadbd0b5b9a0b6efedda96b58e0de471e3fa91d8e4a4e31924e1b/grpcio_tools-1.67.1-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:e11a98b41af4bc88b7a738232b8fa0306ad82c79fa5d7090bb607f183a57856f", size = 2281081 }, - { url = "https://files.pythonhosted.org/packages/5f/0c/b30bdbcab1795b12e05adf30c20981c14f66198e22044edb15b3c1d9f0bc/grpcio_tools-1.67.1-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:de0fcfe61c26679d64b1710746f2891f359593f76894fcf492c37148d5694f00", size = 2616929 }, - { url = "https://files.pythonhosted.org/packages/d3/c2/a77ca68ae768f8d5f1d070ea4afc42fda40401083e7c4f5c08211e84de38/grpcio_tools-1.67.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ae3b3e2ee5aad59dece65a613624c46a84c9582fc3642686537c6dfae8e47dc", size = 2414633 }, - { url = "https://files.pythonhosted.org/packages/39/70/8d7131dccfe4d7b739c96ada7ea9acde631f58f013eae773791fb490a3eb/grpcio_tools-1.67.1-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:9a630f83505b6471a3094a7a372a1240de18d0cd3e64f4fbf46b361bac2be65b", size = 3224328 }, - { url = "https://files.pythonhosted.org/packages/2a/28/2d24b933ccf0d6877035aa3d5f8b64aad18c953657dd43c682b5701dc127/grpcio_tools-1.67.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d85a1fcbacd3e08dc2b3d1d46b749351a9a50899fa35cf2ff040e1faf7d405ad", size = 2869640 }, - { url = "https://files.pythonhosted.org/packages/37/77/ddd2b4cc896639fb0f85fc21d5684f25080ee28845c5a4031e3dd65fdc92/grpcio_tools-1.67.1-cp313-cp313-win32.whl", hash = "sha256:778470f025f25a1fca5a48c93c0a18af395b46b12dd8df7fca63736b85181f41", size = 939997 }, - { url = "https://files.pythonhosted.org/packages/96/d0/f0855a0ccb26ffeb41e6db68b5cbb25d7e9ba1f8f19151eef36210e64efc/grpcio_tools-1.67.1-cp313-cp313-win_amd64.whl", hash = 
"sha256:6961da86e9856b4ddee0bf51ef6636b4bf9c29c0715aa71f3c8f027c45d42654", size = 1089819 }, +sdist = { url = "https://files.pythonhosted.org/packages/64/ec/1c25136ca1697eaa09a02effe3e74959fd9fb6aba9960d7340dd6341c5ce/grpcio_tools-1.69.0.tar.gz", hash = "sha256:3e1a98f4d9decb84979e1ddd3deb09c0a33a84b6e3c0776d5bde4097e3ab66dd", size = 5323319 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/00/90/7df7326552fec627adcf3880cf13e9a5b23c090bbcedba367f64fa2bb54b/grpcio_tools-1.69.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:8c210630faa581c3bd08953dac4ad21a7f49862f3b92d69686e9b436d2f1265d", size = 2388795 }, + { url = "https://files.pythonhosted.org/packages/e2/03/6ccaa58b3ca1734d0868a389148e22ac15248a9be4c223805339f7904e31/grpcio_tools-1.69.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:09b66ea279fcdaebae4ec34b1baf7577af3b14322738aa980c1c33cfea71f7d7", size = 5703156 }, + { url = "https://files.pythonhosted.org/packages/c9/f6/162b456684d2444b43e45ace4e889087301e5890bbfd16ee6b2aedf36219/grpcio_tools-1.69.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:be94a4bfa56d356aae242cc54072c9ccc2704b659eaae2fd599a94afebf791ce", size = 2350725 }, + { url = "https://files.pythonhosted.org/packages/db/3a/2e83fea8c90b9902d68964491d014d688177a6ad0303dbbe6c2c16f25da6/grpcio_tools-1.69.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:28778debad73a8c8e0a0e07e6a2f76eecce43adbc205d17dd244d2d58bb0f0aa", size = 2727230 }, + { url = "https://files.pythonhosted.org/packages/63/06/be27b8f1811ff4cc556bdec64a9004755a929df035dc606466a75c9ac0fa/grpcio_tools-1.69.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:449308d93e4c97ae3a4503510c6d64978748ff5e21429c85da14fdc783c0f498", size = 2472752 }, + { url = "https://files.pythonhosted.org/packages/a3/43/f94578afa1535287b7b0ba39eeb23b2b8304a2a5b8e325ed7079d2ad9cba/grpcio_tools-1.69.0-cp310-cp310-musllinux_1_1_i686.whl", hash = 
"sha256:b9343651e73bc6e0df6bb518c2638bf9cc2194b50d060cdbcf1b2121cd4e4ae3", size = 3344074 }, + { url = "https://files.pythonhosted.org/packages/13/d1/5f9030cbb6195f3bb182e740f349cdaa71d9c38c1b2572f401270709d7d2/grpcio_tools-1.69.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2f08b063612553e726e328aef3a27adfaea8d92712b229012afc54d59da88a02", size = 2953778 }, + { url = "https://files.pythonhosted.org/packages/0c/cb/4812660e150d197de81296fa04ed6ad012d1aeac23bbe21be5f51493f455/grpcio_tools-1.69.0-cp310-cp310-win32.whl", hash = "sha256:599ffd39525e7bbb6412a63e56a2e6c1af8f3493fe4305260efd4a11d064cce0", size = 957556 }, + { url = "https://files.pythonhosted.org/packages/4e/c7/c7d5f5418909764e63208b9f76812db3287ece4f79500e815178194e1db9/grpcio_tools-1.69.0-cp310-cp310-win_amd64.whl", hash = "sha256:02f92e3c2bae67ece818787f8d3d89df0fa1e5e6bbb7c1493824fd5dfad886dd", size = 1114783 }, + { url = "https://files.pythonhosted.org/packages/7e/f4/575f536bada8d8f5f8943c317ae28faafe7b4aaf95ef84a599f4f3e67db3/grpcio_tools-1.69.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:c18df5d1c8e163a29863583ec51237d08d7059ef8d4f7661ee6d6363d3e38fe3", size = 2388772 }, + { url = "https://files.pythonhosted.org/packages/87/94/1157342b046f51c4d076f21ef76da6d89323929b7e870389204fd49e3f09/grpcio_tools-1.69.0-cp311-cp311-macosx_10_14_universal2.whl", hash = "sha256:37876ae49235ef2e61e5059faf45dc5e7142ca54ae61aec378bb9483e0cd7e95", size = 5726348 }, + { url = "https://files.pythonhosted.org/packages/36/5c/cfd9160ef1867e025844b2695d436bb953c2d5f9c20eaaa7da6fd739ab0c/grpcio_tools-1.69.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:33120920e29959eaa37a1268c6a22af243d086b1a5e5222b4203e29560ece9ce", size = 2350857 }, + { url = "https://files.pythonhosted.org/packages/61/70/10614b8bc39f06548a0586fdd5d97843da4789965e758fba87726bde8c2f/grpcio_tools-1.69.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:788bb3ecd1b44664d829d319b3c1ebc15c7d7b5e7d1f22706ab57d6acd2c6301", size = 2727157 }, + { url = "https://files.pythonhosted.org/packages/37/fb/33faedb3e991dceb7a2bf802d3875bff7d6a6b6a80d314197adc73739cae/grpcio_tools-1.69.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f453b11a112e3774c8957ec2570669f3da1f7fbc8ee242482c38981496e88da2", size = 2472882 }, + { url = "https://files.pythonhosted.org/packages/41/f7/abddc158919a982f6b8e61d4a5c72569b2963304c162c3ca53c6c14d23ee/grpcio_tools-1.69.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:7e5c5dc2b656755cb58b11a7e87b65258a4a8eaff01b6c30ffcb230dd447c03d", size = 3343987 }, + { url = "https://files.pythonhosted.org/packages/ba/46/e7219456aefe29137728246a67199fcbfdaa99ede93d2045a6406f0e4c0b/grpcio_tools-1.69.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:8eabf0a7a98c14322bc74f9910c96f98feebe311e085624b2d022924d4f652ca", size = 2953659 }, + { url = "https://files.pythonhosted.org/packages/74/be/262c5d2b681930f8c58012500741fe06cb40a770c9d395650efe9042467f/grpcio_tools-1.69.0-cp311-cp311-win32.whl", hash = "sha256:ad567bea43d018c2215e1db10316eda94ca19229a834a3221c15d132d24c1b8a", size = 957447 }, + { url = "https://files.pythonhosted.org/packages/8e/55/68153acca126dced35f888e708a65169df8fa8a4d5f0e78166a395e3fa9c/grpcio_tools-1.69.0-cp311-cp311-win_amd64.whl", hash = "sha256:3d64e801586dbea3530f245d48b9ed031738cc3eb099d5ce2fdb1b3dc2e1fb20", size = 1114753 }, + { url = "https://files.pythonhosted.org/packages/5b/f6/9cd1aa47556664564b873cd187d8dec978ff2f4a539d8c6d5d2f418d3d36/grpcio_tools-1.69.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:8ef8efe8beac4cc1e30d41893e4096ca2601da61001897bd17441645de2d4d3c", size = 2388440 }, + { url = "https://files.pythonhosted.org/packages/62/37/0bcd8431e44b38f648f70368dd60542d10ffaffa109563349ee635013e10/grpcio_tools-1.69.0-cp312-cp312-macosx_10_14_universal2.whl", hash = 
"sha256:a00e87a0c5a294028115a098819899b08dd18449df5b2aac4a2b87ba865e8681", size = 5726135 }, + { url = "https://files.pythonhosted.org/packages/8b/f5/2ec994bbf522a231ce54c41a2d3621e77bece1240aafe31f12804052af0f/grpcio_tools-1.69.0-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:7722700346d5b223159532e046e51f2ff743ed4342e5fe3e0457120a4199015e", size = 2350247 }, + { url = "https://files.pythonhosted.org/packages/a9/29/9ebf54315a499a766e4c3bd53124267491162e9049c2d9ed45f43222b98f/grpcio_tools-1.69.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a934116fdf202cb675246056ee54645c743e2240632f86a37e52f91a405c7143", size = 2727994 }, + { url = "https://files.pythonhosted.org/packages/f0/2a/1a031018660b5d95c1a4c587a0babd0d28f0aa0c9a40dbca330567049a3f/grpcio_tools-1.69.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e6a6d44359ca836acfbc58103daf94b3bb8ac919d659bb348dcd7fbecedc293", size = 2472625 }, + { url = "https://files.pythonhosted.org/packages/74/bf/76d24078e1c76976a10760c3193b6c62685a7aed64b1cb0d8242afa16f1d/grpcio_tools-1.69.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e27662c0597fd1ab5399a583d358b5203edcb6fc2b29d6245099dfacd51a6ddc", size = 3344290 }, + { url = "https://files.pythonhosted.org/packages/f1/f7/4ab645e4955ca1e5240b0bbd557662cec4838f0e21e072ff40f4e191b48d/grpcio_tools-1.69.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:7bbb2b2fb81d95bcdd1d8331defb5f5dc256dbe423bb98b682cf129cdd432366", size = 2953592 }, + { url = "https://files.pythonhosted.org/packages/8f/32/57e67b126f209f289fc32009309d155b8dbe9ac760c32733746e4dda7b51/grpcio_tools-1.69.0-cp312-cp312-win32.whl", hash = "sha256:e11accd10cf4af5031ac86c45f1a13fb08f55e005cea070917c12e78fe6d2aa2", size = 957042 }, + { url = "https://files.pythonhosted.org/packages/19/64/7bfcb4e50a0ce87690c24696cd666f528e672119966abead09ae65a2e1da/grpcio_tools-1.69.0-cp312-cp312-win_amd64.whl", hash = 
"sha256:6df4c6ac109af338a8ccde29d184e0b0bdab13d78490cb360ff9b192a1aec7e2", size = 1114248 }, + { url = "https://files.pythonhosted.org/packages/0c/ef/a9867f612e3aa5e69d299e47a72ea8dafa476b1f099462c9a1223cd6a83c/grpcio_tools-1.69.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:8c320c4faa1431f2e1252ef2325a970ac23b2fd04ffef6c12f96dd4552c3445c", size = 2388281 }, + { url = "https://files.pythonhosted.org/packages/4b/53/b2752d8ec338778e48d76845d605a0f8bca9e43a5f09428e5ed1a76e4e1d/grpcio_tools-1.69.0-cp313-cp313-macosx_10_14_universal2.whl", hash = "sha256:5f1224596ad74dd14444b20c37122b361c5d203b67e14e018b995f3c5d76eede", size = 5725856 }, + { url = "https://files.pythonhosted.org/packages/83/dd/195d3639634c0c1d1e48b6693c074d66a64f16c748df2f40bcee74aa04e2/grpcio_tools-1.69.0-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:965a0cf656a113bc32d15ac92ca51ed702a75d5370ae0afbdd36f818533a708a", size = 2350180 }, + { url = "https://files.pythonhosted.org/packages/8c/18/c412884fa0e888d8a271f3e31d23e3765cde0efe2404653ab67971c411c2/grpcio_tools-1.69.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:978835768c11a7f28778b3b7c40f839d8a57f765c315e80c4246c23900d56149", size = 2726724 }, + { url = "https://files.pythonhosted.org/packages/be/c7/dfb59b7e25d760bfdd93f0aef7dd0e2a37f8437ac3017b8b526c68764e2f/grpcio_tools-1.69.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:094c7cec9bd271a32dfb7c620d4a558c63fcb0122fd1651b9ed73d6afd4ae6fe", size = 2472127 }, + { url = "https://files.pythonhosted.org/packages/f2/b6/af4edf0a181fd7b148a83d491f5677d7d1c9f86f03282f8f0209d9dfb793/grpcio_tools-1.69.0-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:b51bf4981b3d7e47c2569efadff08284787124eb3dea0f63f491d39703231d3c", size = 3344015 }, + { url = "https://files.pythonhosted.org/packages/0a/9f/4c2b5ae642f7d3df73c16df6c7d53e9443cb0e49e1dcf2c8d1a49058e0b5/grpcio_tools-1.69.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = 
"sha256:ea7aaf0dc1a828e2133357a9e9553fd1bb4e766890d52a506cc132e40632acdc", size = 2952942 }, + { url = "https://files.pythonhosted.org/packages/97/8e/6b707871db5927a17ad7475c070916bff4f32463a51552b424779236ab65/grpcio_tools-1.69.0-cp313-cp313-win32.whl", hash = "sha256:4320f11b79d3a148cc23bad1b81719ce1197808dc2406caa8a8ba0a5cfb0260d", size = 956242 }, + { url = "https://files.pythonhosted.org/packages/27/e2/b419a02b50240143605f77cd50cb07f724caf0fd35a01540a4f044ae9f21/grpcio_tools-1.69.0-cp313-cp313-win_amd64.whl", hash = "sha256:e9bae733654e0eb8ca83aa1d0d6b6c2f4a3525ce70d5ffc07df68d28f6520137", size = 1113616 }, ] [[package]] @@ -2273,6 +2286,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4f/65/6079a46068dfceaeabb5dcad6d674f5f5c61a6fa5673746f42a9f4c233b3/MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f", size = 15739 }, ] +[[package]] +name = "marshmallow" +version = "3.24.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "packaging", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3b/1f/52fa79445669322ee42fdd11b591c2e9c8dbab33eaf7059ca881b349ae09/marshmallow-3.24.2.tar.gz", hash = "sha256:0822c3701de396b51d3f8ac97319aea5493998ba4e7d0e4c05f6fce7777bf3a2", size = 176520 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/67/40/7802bb90b1ecbb284ae613da2cfde9ce0177b77d76cbb276acf976296aa8/marshmallow-3.24.2-py3-none-any.whl", hash = "sha256:bf3c56db473bb160e5191f1c5e32e3fc8bfb58998eb2b35d6747de023e31f9e7", size = 49333 }, +] + [[package]] name = "matplotlib-inline" version = "0.1.7" @@ -3624,16 +3649,16 @@ wheels = [ [[package]] name = "protobuf" -version = "5.29.2" +version = "5.29.3" source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/a5/73/4e6295c1420a9d20c9c351db3a36109b4c9aa601916cb7c6871e3196a1ca/protobuf-5.29.2.tar.gz", hash = "sha256:b2cc8e8bb7c9326996f0e160137b0861f1a82162502658df2951209d0cb0309e", size = 424901 } +sdist = { url = "https://files.pythonhosted.org/packages/f7/d1/e0a911544ca9993e0f17ce6d3cc0932752356c1b0a834397f28e63479344/protobuf-5.29.3.tar.gz", hash = "sha256:5da0f41edaf117bde316404bad1a486cb4ededf8e4a54891296f648e8e076620", size = 424945 } wheels = [ - { url = "https://files.pythonhosted.org/packages/f3/42/6db5387124708d619ffb990a846fb123bee546f52868039f8fa964c5bc54/protobuf-5.29.2-cp310-abi3-win32.whl", hash = "sha256:c12ba8249f5624300cf51c3d0bfe5be71a60c63e4dcf51ffe9a68771d958c851", size = 422697 }, - { url = "https://files.pythonhosted.org/packages/6c/38/2fcc968b377b531882d6ab2ac99b10ca6d00108394f6ff57c2395fb7baff/protobuf-5.29.2-cp310-abi3-win_amd64.whl", hash = "sha256:842de6d9241134a973aab719ab42b008a18a90f9f07f06ba480df268f86432f9", size = 434495 }, - { url = "https://files.pythonhosted.org/packages/cb/26/41debe0f6615fcb7e97672057524687ed86fcd85e3da3f031c30af8f0c51/protobuf-5.29.2-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:a0c53d78383c851bfa97eb42e3703aefdc96d2036a41482ffd55dc5f529466eb", size = 417812 }, - { url = "https://files.pythonhosted.org/packages/e4/20/38fc33b60dcfb380507b99494aebe8c34b68b8ac7d32808c4cebda3f6f6b/protobuf-5.29.2-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:494229ecd8c9009dd71eda5fd57528395d1eacdf307dbece6c12ad0dd09e912e", size = 319562 }, - { url = "https://files.pythonhosted.org/packages/90/4d/c3d61e698e0e41d926dbff6aa4e57428ab1a6fc3b5e1deaa6c9ec0fd45cf/protobuf-5.29.2-cp38-abi3-manylinux2014_x86_64.whl", hash = "sha256:b6b0d416bbbb9d4fbf9d0561dbfc4e324fd522f61f7af0fe0f282ab67b22477e", size = 319662 }, - { url = "https://files.pythonhosted.org/packages/f3/fd/c7924b4c2a1c61b8f4b64edd7a31ffacf63432135a2606f03a2f0d75a750/protobuf-5.29.2-py3-none-any.whl", hash = 
"sha256:fde4554c0e578a5a0bcc9a276339594848d1e89f9ea47b4427c80e5d72f90181", size = 172539 }, + { url = "https://files.pythonhosted.org/packages/dc/7a/1e38f3cafa022f477ca0f57a1f49962f21ad25850c3ca0acd3b9d0091518/protobuf-5.29.3-cp310-abi3-win32.whl", hash = "sha256:3ea51771449e1035f26069c4c7fd51fba990d07bc55ba80701c78f886bf9c888", size = 422708 }, + { url = "https://files.pythonhosted.org/packages/61/fa/aae8e10512b83de633f2646506a6d835b151edf4b30d18d73afd01447253/protobuf-5.29.3-cp310-abi3-win_amd64.whl", hash = "sha256:a4fa6f80816a9a0678429e84973f2f98cbc218cca434abe8db2ad0bffc98503a", size = 434508 }, + { url = "https://files.pythonhosted.org/packages/dd/04/3eaedc2ba17a088961d0e3bd396eac764450f431621b58a04ce898acd126/protobuf-5.29.3-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:a8434404bbf139aa9e1300dbf989667a83d42ddda9153d8ab76e0d5dcaca484e", size = 417825 }, + { url = "https://files.pythonhosted.org/packages/4f/06/7c467744d23c3979ce250397e26d8ad8eeb2bea7b18ca12ad58313c1b8d5/protobuf-5.29.3-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:daaf63f70f25e8689c072cfad4334ca0ac1d1e05a92fc15c54eb9cf23c3efd84", size = 319573 }, + { url = "https://files.pythonhosted.org/packages/a8/45/2ebbde52ad2be18d3675b6bee50e68cd73c9e0654de77d595540b5129df8/protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl", hash = "sha256:c027e08a08be10b67c06bf2370b99c811c466398c357e615ca88c91c07f0910f", size = 319672 }, + { url = "https://files.pythonhosted.org/packages/fd/b2/ab07b09e0f6d143dfb839693aa05765257bceaa13d03bf1a696b78323e7a/protobuf-5.29.3-py3-none-any.whl", hash = "sha256:0a18ed4a24198528f2333802eb075e59dea9d679ab7a6c5efb017a59004d849f", size = 172550 }, ] [[package]] @@ -3816,22 +3841,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/77/89/bc88a6711935ba795a679ea6ebee07e128050d6382eaa35a0a47c8032bdc/pyasn1_modules-0.4.1-py3-none-any.whl", hash = "sha256:49bfa96b45a292b711e986f222502c1c9a5e1f4e568fc30e2574a6c7d07838fd", size = 181537 }, ] -[[package]] -name = 
"pyaudio" -version = "0.2.14" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/26/1d/8878c7752febb0f6716a7e1a52cb92ac98871c5aa522cba181878091607c/PyAudio-0.2.14.tar.gz", hash = "sha256:78dfff3879b4994d1f4fc6485646a57755c6ee3c19647a491f790a0895bd2f87", size = 47066 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/90/90/1553487277e6aa25c0b7c2c38709cdd2b49e11c66c0b25c6e8b7b6638c72/PyAudio-0.2.14-cp310-cp310-win32.whl", hash = "sha256:126065b5e82a1c03ba16e7c0404d8f54e17368836e7d2d92427358ad44fefe61", size = 144624 }, - { url = "https://files.pythonhosted.org/packages/27/bc/719d140ee63cf4b0725016531d36743a797ffdbab85e8536922902c9349a/PyAudio-0.2.14-cp310-cp310-win_amd64.whl", hash = "sha256:2a166fc88d435a2779810dd2678354adc33499e9d4d7f937f28b20cc55893e83", size = 164069 }, - { url = "https://files.pythonhosted.org/packages/7b/f0/b0eab89eafa70a86b7b566a4df2f94c7880a2d483aa8de1c77d335335b5b/PyAudio-0.2.14-cp311-cp311-win32.whl", hash = "sha256:506b32a595f8693811682ab4b127602d404df7dfc453b499c91a80d0f7bad289", size = 144624 }, - { url = "https://files.pythonhosted.org/packages/82/d8/f043c854aad450a76e476b0cf9cda1956419e1dacf1062eb9df3c0055abe/PyAudio-0.2.14-cp311-cp311-win_amd64.whl", hash = "sha256:bbeb01d36a2f472ae5ee5e1451cacc42112986abe622f735bb870a5db77cf903", size = 164070 }, - { url = "https://files.pythonhosted.org/packages/8d/45/8d2b76e8f6db783f9326c1305f3f816d4a12c8eda5edc6a2e1d03c097c3b/PyAudio-0.2.14-cp312-cp312-win32.whl", hash = "sha256:5fce4bcdd2e0e8c063d835dbe2860dac46437506af509353c7f8114d4bacbd5b", size = 144750 }, - { url = "https://files.pythonhosted.org/packages/b0/6a/d25812e5f79f06285767ec607b39149d02aa3b31d50c2269768f48768930/PyAudio-0.2.14-cp312-cp312-win_amd64.whl", hash = "sha256:12f2f1ba04e06ff95d80700a78967897a489c05e093e3bffa05a84ed9c0a7fa3", size = 164126 }, - { url = 
"https://files.pythonhosted.org/packages/3a/77/66cd37111a87c1589b63524f3d3c848011d21ca97828422c7fde7665ff0d/PyAudio-0.2.14-cp313-cp313-win32.whl", hash = "sha256:95328285b4dab57ea8c52a4a996cb52be6d629353315be5bfda403d15932a497", size = 150982 }, - { url = "https://files.pythonhosted.org/packages/a5/8b/7f9a061c1cc2b230f9ac02a6003fcd14c85ce1828013aecbaf45aa988d20/PyAudio-0.2.14-cp313-cp313-win_amd64.whl", hash = "sha256:692d8c1446f52ed2662120bcd9ddcb5aa2b71f38bda31e58b19fb4672fffba69", size = 173655 }, -] - [[package]] name = "pybars4" version = "0.9.13" @@ -3952,22 +3961,13 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c1/a9/3b9642025174bbe67e900785fb99c9bfe91ea584b0b7126ff99945c24a0e/pydantic_settings-2.8.0-py3-none-any.whl", hash = "sha256:c782c7dc3fb40e97b238e713c25d26f64314aece2e91abcff592fcac15f71820", size = 30746 }, ] -[[package]] -name = "pydub" -version = "0.25.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/fe/9a/e6bca0eed82db26562c73b5076539a4a08d3cffd19c3cc5913a3e61145fd/pydub-0.25.1.tar.gz", hash = "sha256:980a33ce9949cab2a569606b65674d748ecbca4f0796887fd6f46173a7b0d30f", size = 38326 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a6/53/d78dc063216e62fc55f6b2eebb447f6a4b0a59f55c8406376f76bf959b08/pydub-0.25.1-py2.py3-none-any.whl", hash = "sha256:65617e33033874b59d87db603aa1ed450633288aefead953b30bded59cb599a6", size = 32327 }, -] - [[package]] name = "pygments" -version = "2.19.0" +version = "2.19.1" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d3/c0/9c9832e5be227c40e1ce774d493065f83a91d6430baa7e372094e9683a45/pygments-2.19.0.tar.gz", hash = "sha256:afc4146269910d4bdfabcd27c24923137a74d562a23a320a41a55ad303e19783", size = 4967733 } +sdist = { url = "https://files.pythonhosted.org/packages/7c/2d/c3338d48ea6cc0feb8446d8e6937e1408088a72a39937982cc6111d17f84/pygments-2.19.1.tar.gz", hash = 
"sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f", size = 4968581 } wheels = [ - { url = "https://files.pythonhosted.org/packages/20/dc/fde3e7ac4d279a331676829af4afafd113b34272393d73f610e8f0329221/pygments-2.19.0-py3-none-any.whl", hash = "sha256:4755e6e64d22161d5b61432c0600c923c5927214e7c956e31c23923c89251a9b", size = 1225305 }, + { url = "https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c", size = 1225293 }, ] [[package]] @@ -3995,11 +3995,11 @@ name = "pymilvus" version = "2.5.4" source = { registry = "https://pypi.org/simple" } dependencies = [ + { name = "environs", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "grpcio", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "milvus-lite", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "pandas", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "protobuf", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "python-dotenv", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "setuptools", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "ujson", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] @@ -4955,9 +4955,6 @@ onnx = [ ] openai-realtime = [ { name = "openai", extra = ["realtime"], marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "pyaudio", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "pydub", marker = "sys_platform == 
'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "sounddevice", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] pandas = [ { name = "pandas", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, @@ -5044,7 +5041,6 @@ requires-dist = [ { name = "pybars4", specifier = "~=0.9" }, { name = "pydantic", specifier = ">=2.0,!=2.10.0,!=2.10.1,!=2.10.2,!=2.10.3,<2.11" }, { name = "pydantic-settings", specifier = "~=2.0" }, - { name = "pydub", marker = "extra == 'openai-realtime'" }, { name = "pymilvus", marker = "extra == 'milvus'", specifier = ">=2.3,<2.6" }, { name = "pymongo", marker = "extra == 'mongo'", specifier = ">=4.8.0,<4.12" }, { name = "qdrant-client", marker = "extra == 'qdrant'", specifier = "~=1.9" }, @@ -5256,21 +5252,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a5/93/84a16940c44f6ec62cf334f25aed3128a514dffc361397eee09421a1c7f2/snoop-0.6.0-py3-none-any.whl", hash = "sha256:f5ea9060e65594bf404e6841086b4a964cc27bc30569109c91a470f948b0f729", size = 27461 }, ] -[[package]] -name = "sounddevice" -version = "0.5.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "cffi", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/80/2d/b04ae180312b81dbb694504bee170eada5372242e186f6298139fd3a0513/sounddevice-0.5.1.tar.gz", hash = "sha256:09ca991daeda8ce4be9ac91e15a9a81c8f81efa6b695a348c9171ea0c16cb041", size = 52896 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/06/d1/464b5fca3decdd0cfec8c47f7b4161a0b12972453201c1bf03811f367c5e/sounddevice-0.5.1-py3-none-any.whl", hash = "sha256:e2017f182888c3f3c280d9fbac92e5dbddac024a7e3442f6e6116bd79dab8a9c", size = 32276 }, - { url = 
"https://files.pythonhosted.org/packages/6f/f6/6703fe7cf3d7b7279040c792aeec6334e7305956aba4a80f23e62c8fdc44/sounddevice-0.5.1-py3-none-macosx_10_6_x86_64.macosx_10_6_universal2.whl", hash = "sha256:d16cb23d92322526a86a9490c427bf8d49e273d9ccc0bd096feecd229cde6031", size = 107916 }, - { url = "https://files.pythonhosted.org/packages/57/a5/78a5e71f5ec0faedc54f4053775d61407bfbd7d0c18228c7f3d4252fd276/sounddevice-0.5.1-py3-none-win32.whl", hash = "sha256:d84cc6231526e7a08e89beff229c37f762baefe5e0cc2747cbe8e3a565470055", size = 312494 }, - { url = "https://files.pythonhosted.org/packages/af/9b/15217b04f3b36d30de55fef542389d722de63f1ad81f9c72d8afc98cb6ab/sounddevice-0.5.1-py3-none-win_amd64.whl", hash = "sha256:4313b63f2076552b23ac3e0abd3bcfc0c1c6a696fc356759a13bd113c9df90f1", size = 363634 }, -] - [[package]] name = "soupsieve" version = "2.6" From d9ce9372afe9021294fe90841670852f5c68a1d7 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Thu, 9 Jan 2025 16:47:55 +0100 Subject: [PATCH 03/50] updated note --- .../semantic_kernel/connectors/ai/realtime_client_base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/semantic_kernel/connectors/ai/realtime_client_base.py b/python/semantic_kernel/connectors/ai/realtime_client_base.py index c5d092d50870..ebdd4eed3739 100644 --- a/python/semantic_kernel/connectors/ai/realtime_client_base.py +++ b/python/semantic_kernel/connectors/ai/realtime_client_base.py @@ -17,22 +17,22 @@ #### # TODO (eavanvalkenburg): Move to ADR # Receiving: -# Option 1: Events and Contents split (current) +# Option 1: Events and Contents split # - content received through main receive_content method # - events received through event callback handlers # Option 2: Everything is Content # - content (events as new Content Type) received through main receive_content method -# Option 3: Everything is Event +# Option 3: Everything is Event (current) # - receive_content method is removed # - events received through main listen 
method # - default event handlers added for things like errors and function calling # - built-in vs custom event handling - separate or not? # Sending: -# Option 1: Events and Contents split (current) +# Option 1: Events and Contents split # - send_content and send_event # Option 2: Everything is Content # - single method needed, with EventContent type support -# Option 3: Everything is Event +# Option 3: Everything is Event (current) # - send_event method only, Content is part of event data #### From 20ea3dced2f042d0e19b748df794c38d156786e5 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Thu, 9 Jan 2025 17:01:06 +0100 Subject: [PATCH 04/50] reverted some changes --- .../connectors/ai/chat_completion_client_base.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/python/semantic_kernel/connectors/ai/chat_completion_client_base.py b/python/semantic_kernel/connectors/ai/chat_completion_client_base.py index 42797489e26f..974d59af92be 100644 --- a/python/semantic_kernel/connectors/ai/chat_completion_client_base.py +++ b/python/semantic_kernel/connectors/ai/chat_completion_client_base.py @@ -222,7 +222,7 @@ async def get_streaming_chat_message_contents( if not self.SUPPORTS_FUNCTION_CALLING: async for streaming_chat_message_contents in self._inner_get_streaming_chat_message_contents( - chat_history, settings, **kwargs + chat_history, settings ): yield streaming_chat_message_contents return @@ -247,7 +247,7 @@ async def get_streaming_chat_message_contents( or not settings.function_choice_behavior.auto_invoke_kernel_functions ): async for streaming_chat_message_contents in self._inner_get_streaming_chat_message_contents( - chat_history, settings, **kwargs + chat_history, settings ): yield streaming_chat_message_contents return @@ -259,7 +259,7 @@ async def get_streaming_chat_message_contents( all_messages: list["StreamingChatMessageContent"] = [] function_call_returned = False async for messages in 
self._inner_get_streaming_chat_message_contents( - chat_history, settings, request_index, **kwargs + chat_history, settings, request_index ): for msg in messages: if msg is not None: @@ -312,7 +312,6 @@ async def get_streaming_chat_message_contents( function_invoke_attempt=request_index, ) if self._yield_function_result_messages(function_result_messages): - await self._streaming_function_call_result_callback(function_result_messages) yield function_result_messages if any(result.terminate for result in results if result is not None): From eff47659bc8245d1757d1030f7f43ec397ce00ae Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Fri, 10 Jan 2025 13:52:55 +0100 Subject: [PATCH 05/50] WIP ADR --- docs/decisions/00XX-realtime-api-clients.md | 158 +++++++++++++ .../open_ai_realtime_execution_settings.py | 8 +- .../open_ai/services/open_ai_realtime_base.py | 4 +- python/uv.lock | 220 +++++++++--------- 4 files changed, 274 insertions(+), 116 deletions(-) create mode 100644 docs/decisions/00XX-realtime-api-clients.md diff --git a/docs/decisions/00XX-realtime-api-clients.md b/docs/decisions/00XX-realtime-api-clients.md new file mode 100644 index 000000000000..81d9d6fdf4e7 --- /dev/null +++ b/docs/decisions/00XX-realtime-api-clients.md @@ -0,0 +1,158 @@ +--- +# These are optional elements. Feel free to remove any of them. +status: {proposed } +contact: {Eduard van Valkenburg} +date: {2025-01-10} +deciders: { Eduard van Valkenburg, Mark Wallace, Ben Thomas, Roger Barreto} +consulted: +informed: +--- + +# Realtime API Clients + +## Context and Problem Statement + +Multiple model providers are starting to enable realtime voice-to-voice communication with their models, this includes OpenAI with their [Realtime API](https://openai.com/index/introducing-the-realtime-api/) and [Google Gemini](https://ai.google.dev/api/multimodal-live). These API's promise some very interesting new ways of using LLM's in different settings, which we want to enable with Semantic Kernel. 
The key addition that Semantic Kernel brings into this system is the ability to (re)use Semantic Kernel functions as tools with these APIs. + +The way these APIs work at this time is through either websockets or WebRTC. In both cases events are sent to and from the service: some events contain content, such as text, audio, or video (so far only sending, not receiving), while other events are "control" events, like content created, function call requested, etc. Events sent by the client include content (voice, text, or function call output) and control events, like committing the input audio and requesting a response. + +Both the OpenAI and Google realtime APIs are in preview/beta. This means there might be breaking changes in the way they work in the future, so the clients built to support these APIs are going to be experimental until the APIs stabilize. + +One feature that we need to consider if and how to deal with is whether or not a service uses Voice Activity Detection (VAD): OpenAI supports turning it off and allows parameters for how it behaves, while Google has it on by default and it cannot be configured. 
+ +### Event types + +Client side events: +| **Content/Control event** | **Event Description** | **OpenAI Event** | **Google Event** | +|-------------------| ------------------------------------|-------------------------|------------------------| + | Control | Configure session | `session.update` | `BidiGenerateContentSetup` | + | Content | Send voice input | `input_audio_buffer.append` | `BidiGenerateContentRealtimeInput` | + | Control | Commit input and request response | `input_audio_buffer.commit` | `-` | + | Control | Clean audio input buffer | `input_audio_buffer.clear` | `-` | + | Content | Send text input | `conversation.item.create` | `BidiGenerateContentClientContent` | + | Control | Interrupt audio | `conversation.item.truncate` | `-`| + | Control | Delete content | `conversation.item.delete` | `-`| +| Control | Respond to function call request | `conversation.item.create` | `BidiGenerateContentToolResponse`| +| Control | Ask for response | `response.create` | `-`| +| Control | Cancel response | `response.cancel` | `-`| + +Server side events: +| **Content/Control event** | **Event Description** | **OpenAI Event** | **Google Event** | +|----------------------------|-------------------------------------|-------------------------|------------------------| +| Control | Error | `error` | `-` | +| Control | Session created | `session.created` | `BidiGenerateContentSetupComplete` | +| Control | Session updated | `session.updated` | `BidiGenerateContentSetupComplete` | +| Control | Conversation created | `conversation.created` | `-` | +| Control | Input audio buffer committed | `input_audio_buffer.committed` | `-` | +| Control | Input audio buffer cleared | `input_audio_buffer.cleared` | `-` | +| Control | Input audio buffer speech started | `input_audio_buffer.speech_started` | `-` | +| Control | Input audio buffer speech stopped | `input_audio_buffer.speech_stopped` | `-` | +| Content | Conversation item created | `conversation.item.created` | `-` | +| Content 
| Input audio transcription completed | `conversation.item.input_audio_transcription.completed` | `-` | +| Content | Input audio transcription failed | `conversation.item.input_audio_transcription.failed` | `-` | +| Control | Conversation item truncated | `conversation.item.truncated` | `-` | +| Control | Conversation item deleted | `conversation.item.deleted` | `-` | +| Control | Response created | `response.created` | `-` | +| Control | Response done | `response.done` | `-` | +| Content | Response output item added | `response.output_item.added` | `-` | +| Content | Response output item done | `response.output_item.done` | `-` | +| Content | Response content part added | `response.content_part.added` | `-` | +| Content | Response content part done | `response.content_part.done` | `-` | +| Content | Response text delta | `response.text.delta` | `BidiGenerateContentServerContent` | +| Content | Response text done | `response.text.done` | `-` | +| Content | Response audio transcript delta | `response.audio_transcript.delta` | `BidiGenerateContentServerContent` | +| Content | Response audio transcript done | `response.audio_transcript.done` | `-` | +| Content | Response audio delta | `response.audio.delta` | `BidiGenerateContentServerContent` | +| Content | Response audio done | `response.audio.done` | `-` | +| Content | Response function call arguments delta | `response.function_call_arguments.delta` | `BidiGenerateContentToolCall` | +| Content | Response function call arguments done | `response.function_call_arguments.done` | `-` | +| Control | Function call cancelled | `-` | `BidiGenerateContentToolCallCancellation` | +| Control | Rate limits updated | `rate_limits.updated` | `-` | + + + + +## Decision Drivers + +- Simple programming model that is likely able to handle future realtime APIs and evolution of the existing ones. +- Support for the most common scenarios and content, extensible for the rest. 
+- Natively integrated with Semantic Kernel, especially for content types and function calling. + +- … + +## Considered Options + +Both the sending and receiving side of these integrations need to decide how to deal with the APIs. + +- Treat content events separate from control events +- Treat everything as content items +- Treat everything as events + +### Treat content events separate from control events +This would mean there are two mechanisms in the clients, one deals with content, and one with control events. + +- Pro: + - strongly typed responses for known content + - easy to use as the main interactions are clear with familiar SK content types, the rest goes through a separate mechanism +- Con: + - new content support requires updates in the codebase and can be considered breaking (potentially sending additional types back) + - additional complexity in dealing with two streams of data + +### Treat everything as content items + + +## Decision Outcome + +Chosen option: "{title of option 1}", because +{justification. e.g., only option, which meets k.o. criterion decision driver | which resolves force {force} | … | comes out best (see below)}. + + + +### Consequences + +- Good, because {positive consequence, e.g., improvement of one or more desired qualities, …} +- Bad, because {negative consequence, e.g., compromising one or more desired qualities, …} +- … + + + +## Validation + +{describe how the implementation of/compliance with the ADR is validated. 
E.g., by a review or an ArchUnit test} + + + +## Pros and Cons of the Options + +### {title of option 1} + + + +{example | description | pointer to more information | …} + +- Good, because {argument a} +- Good, because {argument b} + +- Neutral, because {argument c} +- Bad, because {argument d} +- … + +### {title of other option} + +{example | description | pointer to more information | …} + +- Good, because {argument a} +- Good, because {argument b} +- Neutral, because {argument c} +- Bad, because {argument d} +- … + + + +## More Information + +{You might want to provide additional evidence/confidence for the decision outcome here and/or +document the team agreement on the decision and/or +define when this decision when and how the decision should be realized and if/when it should be re-visited and/or +how the decision is validated. +Links to other decisions and resources might appear here as well.} diff --git a/python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py b/python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py index 480e2ed1373f..a26237b78b84 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py +++ b/python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py @@ -9,6 +9,12 @@ from semantic_kernel.kernel_pydantic import KernelBaseModel +class InputAudioTranscription(KernelBaseModel): + """Input audio transcription settings.""" + + model: Literal["whisper-1"] | None = None + + class TurnDetection(KernelBaseModel): """Turn detection settings.""" @@ -28,7 +34,7 @@ class OpenAIRealtimeExecutionSettings(PromptExecutionSettings): voice: str | None = None input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | None = None output_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | None = None - 
input_audio_transcription: dict[str, Any] | None = None + input_audio_transcription: InputAudioTranscription | None = None turn_detection: TurnDetection | None = None tools: Annotated[ list[dict[str, Any]] | None, diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py index 4175d9449b2e..64b647f44ee8 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py @@ -100,7 +100,7 @@ class ListenEvents(str, Enum): CONVERSATION_ITEM_TRUNCATED = "conversation.item.truncated" CONVERSATION_ITEM_DELETED = "conversation.item.deleted" RESPONSE_CREATED = "response.created" - RESPONSE_DONE = "response.done" + RESPONSE_DONE = "response.done" # contains usage info -> log RESPONSE_OUTPUT_ITEM_ADDED = "response.output_item.added" RESPONSE_OUTPUT_ITEM_DONE = "response.output_item.done" RESPONSE_CONTENT_PART_ADDED = "response.content_part.added" @@ -421,6 +421,8 @@ async def response_function_call_arguments_done_callback( chat_history = ChatHistory() await kernel.invoke_function_call(item, chat_history) await self.send_event(SendEvents.CONVERSATION_ITEM_CREATE, item=chat_history.messages[-1]) + # The model doesn't start responding to the tool call automatically, so triggering it here. 
+ await self.send_event(SendEvents.RESPONSE_CREATE) return chat_history.messages[-1], False def get_prompt_execution_settings_class(self) -> type["PromptExecutionSettings"]: diff --git a/python/uv.lock b/python/uv.lock index 710dd9901b09..7b452ae40cb3 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -687,7 +687,7 @@ wheels = [ [[package]] name = "chromadb" -version = "0.5.20" +version = "0.6.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "bcrypt", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, @@ -719,9 +719,9 @@ dependencies = [ { name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "uvicorn", extra = ["standard"], marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/03/31/6c8e05405bb02b4a1f71f9aa3eef242415565dabf6afc1bde7f64f726963/chromadb-0.5.20.tar.gz", hash = "sha256:19513a23b2d20059866216bfd80195d1d4a160ffba234b8899f5e80978160ca7", size = 33664540 } +sdist = { url = "https://files.pythonhosted.org/packages/d1/c5/d2b4219fdee424e881608da681c3c63b73d68dc6667bd2df14a4d9bb308d/chromadb-0.6.2.tar.gz", hash = "sha256:e9e11f04d3850796711ee05dad4e918c75ec7b62ab9cbe7b4588b68a26aaea06", size = 19979649 } wheels = [ - { url = "https://files.pythonhosted.org/packages/5f/7a/10bf5dc92d13cc03230190fcc5016a0b138d99e5b36b8b89ee0fe1680e10/chromadb-0.5.20-py3-none-any.whl", hash = "sha256:9550ba1b6dce911e35cac2568b301badf4b42f457b99a432bdeec2b6b9dd3680", size = 617884 }, + { url = "https://files.pythonhosted.org/packages/bb/1c/2b77093f4191ad2d1ab70b9215cb6bc9f43350aa3e9e54a44304c8379335/chromadb-0.6.2-py3-none-any.whl", hash = "sha256:77a5e07097e36cdd49d8d2925d0c4d28291cabc9677787423d2cc7c426e8895b", size = 606162 }, ] [[package]] @@ -1053,19 +1053,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/4c/a3/ac312faeceffd2d8f86bc6dcb5c401188ba5a01bc88e69bed97578a0dfcd/durationpy-0.9-py3-none-any.whl", hash = "sha256:e65359a7af5cedad07fb77a2dd3f390f8eb0b74cb845589fa6c057086834dd38", size = 3461 }, ] -[[package]] -name = "environs" -version = "9.5.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "marshmallow", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "python-dotenv", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/d4/e3/c3c6c76f3dbe3e019e9a451b35bf9f44690026a5bb1232f7b77097b72ff5/environs-9.5.0.tar.gz", hash = "sha256:a76307b36fbe856bdca7ee9161e6c466fd7fcffc297109a118c59b54e27e30c9", size = 20795 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ca/5e/f0f217dc393372681bfe05c50f06a212e78d0a3fee907a74ab451ec1dcdb/environs-9.5.0-py2.py3-none-any.whl", hash = "sha256:1e549569a3de49c05f856f40bce86979e7d5ffbbc4398e7f338574c220189124", size = 12548 }, -] - [[package]] name = "eval-type-backport" version = "0.2.2" @@ -1455,6 +1442,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/14/fb/54deefe679b7d1c1cc81d83396fcf28ad1a66d213bddeb275a8d28665918/google_crc32c-1.6.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18e311c64008f1f1379158158bb3f0c8d72635b9eb4f9545f8cf990c5668e59d", size = 27866 }, ] +[[package]] +name = "google-genai" +version = "0.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "google-auth", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "pillow", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "pydantic", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name 
= "requests", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "websockets", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8f/fa/e8c81d37ffe7d8aa05573494735cdc432a97b77f641a08caa959de19523d/google_genai-0.4.0.tar.gz", hash = "sha256:d14ce2e941063092cfc98726aeabcae44f179456e3a4906ee5f28dc91b0663fb", size = 107625 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9d/ac/cf91960fc842f8c3387be8abeaa01deb0e6b20a72a028b70107f58e13150/google_genai-0.4.0-py3-none-any.whl", hash = "sha256:2cbfea3cb47d4ac54ee3d3f9ecd79ff72298cac13e150828afdc5ed62768ed00", size = 113562 }, +] + [[package]] name = "google-generativeai" version = "0.8.4" @@ -1518,122 +1521,122 @@ wheels = [ [[package]] name = "grpcio" -version = "1.69.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e4/87/06a145284cbe86c91ca517fe6b57be5efbb733c0d6374b407f0992054d18/grpcio-1.69.0.tar.gz", hash = "sha256:936fa44241b5379c5afc344e1260d467bee495747eaf478de825bab2791da6f5", size = 12738244 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b0/6e/2f8ee5fb65aef962d0bd7e46b815e7b52820687e29c138eaee207a688abc/grpcio-1.69.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:2060ca95a8db295ae828d0fc1c7f38fb26ccd5edf9aa51a0f44251f5da332e97", size = 5190753 }, - { url = "https://files.pythonhosted.org/packages/89/07/028dcda44d40f9488f0a0de79c5ffc80e2c1bc5ed89da9483932e3ea67cf/grpcio-1.69.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:2e52e107261fd8fa8fa457fe44bfadb904ae869d87c1280bf60f93ecd3e79278", size = 11096752 }, - { url = "https://files.pythonhosted.org/packages/99/a0/c727041b1410605ba38b585b6b52c1a289d7fcd70a41bccbc2c58fc643b2/grpcio-1.69.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:316463c0832d5fcdb5e35ff2826d9aa3f26758d29cdfb59a368c1d6c39615a11", 
size = 5705442 }, - { url = "https://files.pythonhosted.org/packages/7a/2f/1c53f5d127ff882443b19c757d087da1908f41c58c4b098e8eaf6b2bb70a/grpcio-1.69.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:26c9a9c4ac917efab4704b18eed9082ed3b6ad19595f047e8173b5182fec0d5e", size = 6333796 }, - { url = "https://files.pythonhosted.org/packages/cc/f6/2017da2a1b64e896af710253e5bfbb4188605cdc18bce3930dae5cdbf502/grpcio-1.69.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90b3646ced2eae3a0599658eeccc5ba7f303bf51b82514c50715bdd2b109e5ec", size = 5954245 }, - { url = "https://files.pythonhosted.org/packages/c1/65/1395bec928e99ba600464fb01b541e7e4cdd462e6db25259d755ef9f8d02/grpcio-1.69.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:3b75aea7c6cb91b341c85e7c1d9db1e09e1dd630b0717f836be94971e015031e", size = 6664854 }, - { url = "https://files.pythonhosted.org/packages/40/57/8b3389cfeb92056c8b44288c9c4ed1d331bcad0215c4eea9ae4629e156d9/grpcio-1.69.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:5cfd14175f9db33d4b74d63de87c64bb0ee29ce475ce3c00c01ad2a3dc2a9e51", size = 6226854 }, - { url = "https://files.pythonhosted.org/packages/cc/61/1f2bbeb7c15544dffc98b3f65c093e746019995e6f1e21dc3655eec3dc23/grpcio-1.69.0-cp310-cp310-win32.whl", hash = "sha256:9031069d36cb949205293cf0e243abd5e64d6c93e01b078c37921493a41b72dc", size = 3662734 }, - { url = "https://files.pythonhosted.org/packages/ef/ba/bf1a6d9f5c17d2da849793d72039776c56c98c889c9527f6721b6ee57e6e/grpcio-1.69.0-cp310-cp310-win_amd64.whl", hash = "sha256:cc89b6c29f3dccbe12d7a3b3f1b3999db4882ae076c1c1f6df231d55dbd767a5", size = 4410306 }, - { url = "https://files.pythonhosted.org/packages/8d/cd/ca256aeef64047881586331347cd5a68a4574ba1a236e293cd8eba34e355/grpcio-1.69.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:8de1b192c29b8ce45ee26a700044717bcbbd21c697fa1124d440548964328561", size = 5198734 }, - { url = 
"https://files.pythonhosted.org/packages/37/3f/10c1e5e0150bf59aa08ea6aebf38f87622f95f7f33f98954b43d1b2a3200/grpcio-1.69.0-cp311-cp311-macosx_10_14_universal2.whl", hash = "sha256:7e76accf38808f5c5c752b0ab3fd919eb14ff8fafb8db520ad1cc12afff74de6", size = 11135285 }, - { url = "https://files.pythonhosted.org/packages/08/61/61cd116a572203a740684fcba3fef37a3524f1cf032b6568e1e639e59db0/grpcio-1.69.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:d5658c3c2660417d82db51e168b277e0ff036d0b0f859fa7576c0ffd2aec1442", size = 5699468 }, - { url = "https://files.pythonhosted.org/packages/01/f1/a841662e8e2465ba171c973b77d18fa7438ced535519b3c53617b7e6e25c/grpcio-1.69.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5494d0e52bf77a2f7eb17c6da662886ca0a731e56c1c85b93505bece8dc6cf4c", size = 6332337 }, - { url = "https://files.pythonhosted.org/packages/62/b1/c30e932e02c2e0bfdb8df46fe3b0c47f518fb04158ebdc0eb96cc97d642f/grpcio-1.69.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4ed866f9edb574fd9be71bf64c954ce1b88fc93b2a4cbf94af221e9426eb14d6", size = 5949844 }, - { url = "https://files.pythonhosted.org/packages/5e/cb/55327d43b6286100ffae7d1791be6178d13c917382f3e9f43f82e8b393cf/grpcio-1.69.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c5ba38aeac7a2fe353615c6b4213d1fbb3a3c34f86b4aaa8be08baaaee8cc56d", size = 6661828 }, - { url = "https://files.pythonhosted.org/packages/6f/e4/120d72ae982d51cb9cabcd9672f8a1c6d62011b493a4d049d2abdf564db0/grpcio-1.69.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f79e05f5bbf551c4057c227d1b041ace0e78462ac8128e2ad39ec58a382536d2", size = 6226026 }, - { url = "https://files.pythonhosted.org/packages/96/e8/2cc15f11db506d7b1778f0587fa7bdd781602b05b3c4d75b7ca13de33d62/grpcio-1.69.0-cp311-cp311-win32.whl", hash = "sha256:bf1f8be0da3fcdb2c1e9f374f3c2d043d606d69f425cd685110dd6d0d2d61258", size = 3662653 }, - { url = 
"https://files.pythonhosted.org/packages/42/78/3c5216829a48237fcb71a077f891328a435e980d9757a9ebc49114d88768/grpcio-1.69.0-cp311-cp311-win_amd64.whl", hash = "sha256:fb9302afc3a0e4ba0b225cd651ef8e478bf0070cf11a529175caecd5ea2474e7", size = 4412824 }, - { url = "https://files.pythonhosted.org/packages/61/1d/8f28f147d7f3f5d6b6082f14e1e0f40d58e50bc2bd30d2377c730c57a286/grpcio-1.69.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:fc18a4de8c33491ad6f70022af5c460b39611e39578a4d84de0fe92f12d5d47b", size = 5161414 }, - { url = "https://files.pythonhosted.org/packages/35/4b/9ab8ea65e515e1844feced1ef9e7a5d8359c48d986c93f3d2a2006fbdb63/grpcio-1.69.0-cp312-cp312-macosx_10_14_universal2.whl", hash = "sha256:0f0270bd9ffbff6961fe1da487bdcd594407ad390cc7960e738725d4807b18c4", size = 11108909 }, - { url = "https://files.pythonhosted.org/packages/99/68/1856fde2b3c3162bdfb9845978608deef3606e6907fdc2c87443fce6ecd0/grpcio-1.69.0-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:dc48f99cc05e0698e689b51a05933253c69a8c8559a47f605cff83801b03af0e", size = 5658302 }, - { url = "https://files.pythonhosted.org/packages/3e/21/3fa78d38dc5080d0d677103fad3a8cd55091635cc2069a7c06c7a54e6c4d/grpcio-1.69.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e925954b18d41aeb5ae250262116d0970893b38232689c4240024e4333ac084", size = 6306201 }, - { url = "https://files.pythonhosted.org/packages/f3/cb/5c47b82fd1baf43dba973ae399095d51aaf0085ab0439838b4cbb1e87e3c/grpcio-1.69.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87d222569273720366f68a99cb62e6194681eb763ee1d3b1005840678d4884f9", size = 5919649 }, - { url = "https://files.pythonhosted.org/packages/c6/67/59d1a56a0f9508a29ea03e1ce800bdfacc1f32b4f6b15274b2e057bf8758/grpcio-1.69.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:b62b0f41e6e01a3e5082000b612064c87c93a49b05f7602fe1b7aa9fd5171a1d", size = 6648974 }, - { url = 
"https://files.pythonhosted.org/packages/f8/fe/ca70c14d98c6400095f19a0f4df8273d09c2106189751b564b26019f1dbe/grpcio-1.69.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:db6f9fd2578dbe37db4b2994c94a1d9c93552ed77dca80e1657bb8a05b898b55", size = 6215144 }, - { url = "https://files.pythonhosted.org/packages/b3/94/b2b0a9fd487fc8262e20e6dd0ec90d9fa462c82a43b4855285620f6e9d01/grpcio-1.69.0-cp312-cp312-win32.whl", hash = "sha256:b192b81076073ed46f4b4dd612b8897d9a1e39d4eabd822e5da7b38497ed77e1", size = 3644552 }, - { url = "https://files.pythonhosted.org/packages/93/99/81aec9f85412e3255a591ae2ccb799238e074be774e5f741abae08a23418/grpcio-1.69.0-cp312-cp312-win_amd64.whl", hash = "sha256:1227ff7836f7b3a4ab04e5754f1d001fa52a730685d3dc894ed8bc262cc96c01", size = 4399532 }, - { url = "https://files.pythonhosted.org/packages/54/47/3ff4501365f56b7cc16617695dbd4fd838c5e362bc7fa9fee09d592f7d78/grpcio-1.69.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:a78a06911d4081a24a1761d16215a08e9b6d4d29cdbb7e427e6c7e17b06bcc5d", size = 5162928 }, - { url = "https://files.pythonhosted.org/packages/c0/63/437174c5fa951052c9ecc5f373f62af6f3baf25f3f5ef35cbf561806b371/grpcio-1.69.0-cp313-cp313-macosx_10_14_universal2.whl", hash = "sha256:dc5a351927d605b2721cbb46158e431dd49ce66ffbacb03e709dc07a491dde35", size = 11103027 }, - { url = "https://files.pythonhosted.org/packages/53/df/53566a6fdc26b6d1f0585896e1cc4825961039bca5a6a314ff29d79b5d5b/grpcio-1.69.0-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:3629d8a8185f5139869a6a17865d03113a260e311e78fbe313f1a71603617589", size = 5659277 }, - { url = "https://files.pythonhosted.org/packages/e6/4c/b8a0c4f71498b6f9be5ca6d290d576cf2af9d95fd9827c47364f023969ad/grpcio-1.69.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c9a281878feeb9ae26db0622a19add03922a028d4db684658f16d546601a4870", size = 6305255 }, - { url = 
"https://files.pythonhosted.org/packages/ef/55/d9aa05eb3dfcf6aa946aaf986740ec07fc5189f20e2cbeb8c5d278ffd00f/grpcio-1.69.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cc614e895177ab7e4b70f154d1a7c97e152577ea101d76026d132b7aaba003b", size = 5920240 }, - { url = "https://files.pythonhosted.org/packages/ea/eb/774b27c51e3e386dfe6c491a710f6f87ffdb20d88ec6c3581e047d9354a2/grpcio-1.69.0-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:1ee76cd7e2e49cf9264f6812d8c9ac1b85dda0eaea063af07292400f9191750e", size = 6652974 }, - { url = "https://files.pythonhosted.org/packages/59/98/96de14e6e7d89123813d58c246d9b0f1fbd24f9277f5295264e60861d9d6/grpcio-1.69.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:0470fa911c503af59ec8bc4c82b371ee4303ececbbdc055f55ce48e38b20fd67", size = 6215757 }, - { url = "https://files.pythonhosted.org/packages/7d/5b/ce922e0785910b10756fabc51fd294260384a44bea41651dadc4e47ddc82/grpcio-1.69.0-cp313-cp313-win32.whl", hash = "sha256:b650f34aceac8b2d08a4c8d7dc3e8a593f4d9e26d86751ebf74ebf5107d927de", size = 3642488 }, - { url = "https://files.pythonhosted.org/packages/5d/04/11329e6ca1ceeb276df2d9c316b5e170835a687a4d0f778dba8294657e36/grpcio-1.69.0-cp313-cp313-win_amd64.whl", hash = "sha256:028337786f11fecb5d7b7fa660475a06aabf7e5e52b5ac2df47414878c0ce7ea", size = 4399968 }, +version = "1.67.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/20/53/d9282a66a5db45981499190b77790570617a604a38f3d103d0400974aeb5/grpcio-1.67.1.tar.gz", hash = "sha256:3dc2ed4cabea4dc14d5e708c2b426205956077cc5de419b4d4079315017e9732", size = 12580022 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4e/cd/f6ca5c49aa0ae7bc6d0757f7dae6f789569e9490a635eaabe02bc02de7dc/grpcio-1.67.1-cp310-cp310-linux_armv7l.whl", hash = "sha256:8b0341d66a57f8a3119b77ab32207072be60c9bf79760fa609c5609f2deb1f3f", size = 5112450 }, + { url = 
"https://files.pythonhosted.org/packages/d4/f0/d9bbb4a83cbee22f738ee7a74aa41e09ccfb2dcea2cc30ebe8dab5b21771/grpcio-1.67.1-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:f5a27dddefe0e2357d3e617b9079b4bfdc91341a91565111a21ed6ebbc51b22d", size = 10937518 }, + { url = "https://files.pythonhosted.org/packages/5b/17/0c5dbae3af548eb76669887642b5f24b232b021afe77eb42e22bc8951d9c/grpcio-1.67.1-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:43112046864317498a33bdc4797ae6a268c36345a910de9b9c17159d8346602f", size = 5633610 }, + { url = "https://files.pythonhosted.org/packages/17/48/e000614e00153d7b2760dcd9526b95d72f5cfe473b988e78f0ff3b472f6c/grpcio-1.67.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c9b929f13677b10f63124c1a410994a401cdd85214ad83ab67cc077fc7e480f0", size = 6240678 }, + { url = "https://files.pythonhosted.org/packages/64/19/a16762a70eeb8ddfe43283ce434d1499c1c409ceec0c646f783883084478/grpcio-1.67.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e7d1797a8a3845437d327145959a2c0c47c05947c9eef5ff1a4c80e499dcc6fa", size = 5884528 }, + { url = "https://files.pythonhosted.org/packages/6b/dc/bd016aa3684914acd2c0c7fa4953b2a11583c2b844f3d7bae91fa9b98fbb/grpcio-1.67.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:0489063974d1452436139501bf6b180f63d4977223ee87488fe36858c5725292", size = 6583680 }, + { url = "https://files.pythonhosted.org/packages/1a/93/1441cb14c874f11aa798a816d582f9da82194b6677f0f134ea53d2d5dbeb/grpcio-1.67.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9fd042de4a82e3e7aca44008ee2fb5da01b3e5adb316348c21980f7f58adc311", size = 6162967 }, + { url = "https://files.pythonhosted.org/packages/29/e9/9295090380fb4339b7e935b9d005fa9936dd573a22d147c9e5bb2df1b8d4/grpcio-1.67.1-cp310-cp310-win32.whl", hash = "sha256:638354e698fd0c6c76b04540a850bf1db27b4d2515a19fcd5cf645c48d3eb1ed", size = 3616336 }, + { url = 
"https://files.pythonhosted.org/packages/ce/de/7c783b8cb8f02c667ca075c49680c4aeb8b054bc69784bcb3e7c1bbf4985/grpcio-1.67.1-cp310-cp310-win_amd64.whl", hash = "sha256:608d87d1bdabf9e2868b12338cd38a79969eaf920c89d698ead08f48de9c0f9e", size = 4352071 }, + { url = "https://files.pythonhosted.org/packages/59/2c/b60d6ea1f63a20a8d09c6db95c4f9a16497913fb3048ce0990ed81aeeca0/grpcio-1.67.1-cp311-cp311-linux_armv7l.whl", hash = "sha256:7818c0454027ae3384235a65210bbf5464bd715450e30a3d40385453a85a70cb", size = 5119075 }, + { url = "https://files.pythonhosted.org/packages/b3/9a/e1956f7ca582a22dd1f17b9e26fcb8229051b0ce6d33b47227824772feec/grpcio-1.67.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ea33986b70f83844cd00814cee4451055cd8cab36f00ac64a31f5bb09b31919e", size = 11009159 }, + { url = "https://files.pythonhosted.org/packages/43/a8/35fbbba580c4adb1d40d12e244cf9f7c74a379073c0a0ca9d1b5338675a1/grpcio-1.67.1-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:c7a01337407dd89005527623a4a72c5c8e2894d22bead0895306b23c6695698f", size = 5629476 }, + { url = "https://files.pythonhosted.org/packages/77/c9/864d336e167263d14dfccb4dbfa7fce634d45775609895287189a03f1fc3/grpcio-1.67.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:80b866f73224b0634f4312a4674c1be21b2b4afa73cb20953cbbb73a6b36c3cc", size = 6239901 }, + { url = "https://files.pythonhosted.org/packages/f7/1e/0011408ebabf9bd69f4f87cc1515cbfe2094e5a32316f8714a75fd8ddfcb/grpcio-1.67.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9fff78ba10d4250bfc07a01bd6254a6d87dc67f9627adece85c0b2ed754fa96", size = 5881010 }, + { url = "https://files.pythonhosted.org/packages/b4/7d/fbca85ee9123fb296d4eff8df566f458d738186d0067dec6f0aa2fd79d71/grpcio-1.67.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:8a23cbcc5bb11ea7dc6163078be36c065db68d915c24f5faa4f872c573bb400f", size = 6580706 }, + { url = 
"https://files.pythonhosted.org/packages/75/7a/766149dcfa2dfa81835bf7df623944c1f636a15fcb9b6138ebe29baf0bc6/grpcio-1.67.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:1a65b503d008f066e994f34f456e0647e5ceb34cfcec5ad180b1b44020ad4970", size = 6161799 }, + { url = "https://files.pythonhosted.org/packages/09/13/5b75ae88810aaea19e846f5380611837de411181df51fd7a7d10cb178dcb/grpcio-1.67.1-cp311-cp311-win32.whl", hash = "sha256:e29ca27bec8e163dca0c98084040edec3bc49afd10f18b412f483cc68c712744", size = 3616330 }, + { url = "https://files.pythonhosted.org/packages/aa/39/38117259613f68f072778c9638a61579c0cfa5678c2558706b10dd1d11d3/grpcio-1.67.1-cp311-cp311-win_amd64.whl", hash = "sha256:786a5b18544622bfb1e25cc08402bd44ea83edfb04b93798d85dca4d1a0b5be5", size = 4354535 }, + { url = "https://files.pythonhosted.org/packages/6e/25/6f95bd18d5f506364379eabc0d5874873cc7dbdaf0757df8d1e82bc07a88/grpcio-1.67.1-cp312-cp312-linux_armv7l.whl", hash = "sha256:267d1745894200e4c604958da5f856da6293f063327cb049a51fe67348e4f953", size = 5089809 }, + { url = "https://files.pythonhosted.org/packages/10/3f/d79e32e5d0354be33a12db2267c66d3cfeff700dd5ccdd09fd44a3ff4fb6/grpcio-1.67.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:85f69fdc1d28ce7cff8de3f9c67db2b0ca9ba4449644488c1e0303c146135ddb", size = 10981985 }, + { url = "https://files.pythonhosted.org/packages/21/f2/36fbc14b3542e3a1c20fb98bd60c4732c55a44e374a4eb68f91f28f14aab/grpcio-1.67.1-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:f26b0b547eb8d00e195274cdfc63ce64c8fc2d3e2d00b12bf468ece41a0423a0", size = 5588770 }, + { url = "https://files.pythonhosted.org/packages/0d/af/bbc1305df60c4e65de8c12820a942b5e37f9cf684ef5e49a63fbb1476a73/grpcio-1.67.1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4422581cdc628f77302270ff839a44f4c24fdc57887dc2a45b7e53d8fc2376af", size = 6214476 }, + { url = 
"https://files.pythonhosted.org/packages/92/cf/1d4c3e93efa93223e06a5c83ac27e32935f998bc368e276ef858b8883154/grpcio-1.67.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d7616d2ded471231c701489190379e0c311ee0a6c756f3c03e6a62b95a7146e", size = 5850129 }, + { url = "https://files.pythonhosted.org/packages/ae/ca/26195b66cb253ac4d5ef59846e354d335c9581dba891624011da0e95d67b/grpcio-1.67.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8a00efecde9d6fcc3ab00c13f816313c040a28450e5e25739c24f432fc6d3c75", size = 6568489 }, + { url = "https://files.pythonhosted.org/packages/d1/94/16550ad6b3f13b96f0856ee5dfc2554efac28539ee84a51d7b14526da985/grpcio-1.67.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:699e964923b70f3101393710793289e42845791ea07565654ada0969522d0a38", size = 6149369 }, + { url = "https://files.pythonhosted.org/packages/33/0d/4c3b2587e8ad7f121b597329e6c2620374fccbc2e4e1aa3c73ccc670fde4/grpcio-1.67.1-cp312-cp312-win32.whl", hash = "sha256:4e7b904484a634a0fff132958dabdb10d63e0927398273917da3ee103e8d1f78", size = 3599176 }, + { url = "https://files.pythonhosted.org/packages/7d/36/0c03e2d80db69e2472cf81c6123aa7d14741de7cf790117291a703ae6ae1/grpcio-1.67.1-cp312-cp312-win_amd64.whl", hash = "sha256:5721e66a594a6c4204458004852719b38f3d5522082be9061d6510b455c90afc", size = 4346574 }, + { url = "https://files.pythonhosted.org/packages/12/d2/2f032b7a153c7723ea3dea08bffa4bcaca9e0e5bdf643ce565b76da87461/grpcio-1.67.1-cp313-cp313-linux_armv7l.whl", hash = "sha256:aa0162e56fd10a5547fac8774c4899fc3e18c1aa4a4759d0ce2cd00d3696ea6b", size = 5091487 }, + { url = "https://files.pythonhosted.org/packages/d0/ae/ea2ff6bd2475a082eb97db1104a903cf5fc57c88c87c10b3c3f41a184fc0/grpcio-1.67.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:beee96c8c0b1a75d556fe57b92b58b4347c77a65781ee2ac749d550f2a365dc1", size = 10943530 }, + { url = 
"https://files.pythonhosted.org/packages/07/62/646be83d1a78edf8d69b56647327c9afc223e3140a744c59b25fbb279c3b/grpcio-1.67.1-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:a93deda571a1bf94ec1f6fcda2872dad3ae538700d94dc283c672a3b508ba3af", size = 5589079 }, + { url = "https://files.pythonhosted.org/packages/d0/25/71513d0a1b2072ce80d7f5909a93596b7ed10348b2ea4fdcbad23f6017bf/grpcio-1.67.1-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0e6f255980afef598a9e64a24efce87b625e3e3c80a45162d111a461a9f92955", size = 6213542 }, + { url = "https://files.pythonhosted.org/packages/76/9a/d21236297111052dcb5dc85cd77dc7bf25ba67a0f55ae028b2af19a704bc/grpcio-1.67.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e838cad2176ebd5d4a8bb03955138d6589ce9e2ce5d51c3ada34396dbd2dba8", size = 5850211 }, + { url = "https://files.pythonhosted.org/packages/2d/fe/70b1da9037f5055be14f359026c238821b9bcf6ca38a8d760f59a589aacd/grpcio-1.67.1-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:a6703916c43b1d468d0756c8077b12017a9fcb6a1ef13faf49e67d20d7ebda62", size = 6572129 }, + { url = "https://files.pythonhosted.org/packages/74/0d/7df509a2cd2a54814598caf2fb759f3e0b93764431ff410f2175a6efb9e4/grpcio-1.67.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:917e8d8994eed1d86b907ba2a61b9f0aef27a2155bca6cbb322430fc7135b7bb", size = 6149819 }, + { url = "https://files.pythonhosted.org/packages/0a/08/bc3b0155600898fd10f16b79054e1cca6cb644fa3c250c0fe59385df5e6f/grpcio-1.67.1-cp313-cp313-win32.whl", hash = "sha256:e279330bef1744040db8fc432becc8a727b84f456ab62b744d3fdb83f327e121", size = 3596561 }, + { url = "https://files.pythonhosted.org/packages/5a/96/44759eca966720d0f3e1b105c43f8ad4590c97bf8eb3cd489656e9590baa/grpcio-1.67.1-cp313-cp313-win_amd64.whl", hash = "sha256:fa0c739ad8b1996bd24823950e3cb5152ae91fca1c09cc791190bf1627ffefba", size = 4346042 }, ] [[package]] name = "grpcio-health-checking" -version = "1.69.0" +version = "1.67.1" source = { 
registry = "https://pypi.org/simple" } dependencies = [ { name = "grpcio", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "protobuf", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/ef/b8/d6d485e27d60174ba22c25587c1a97512c6a800633cfd6a8cd7943ad66e0/grpcio_health_checking-1.69.0.tar.gz", hash = "sha256:ff6e1d38c2a300b1bbd296916fbd9165667bc4b5a8557f99dd4226d4f9e8f4c1", size = 16809 } +sdist = { url = "https://files.pythonhosted.org/packages/64/dd/e3b339fa44dc75b501a1a22cb88f1af5b1f8c964488f19c4de4cfbbf05ba/grpcio_health_checking-1.67.1.tar.gz", hash = "sha256:ca90fa76a6afbb4fda71d734cb9767819bba14928b91e308cffbb0c311eb941e", size = 16775 } wheels = [ - { url = "https://files.pythonhosted.org/packages/a4/07/8d68bb1821dc46dfb5b702374c5d06e9c0013afb08fa92516ebd8f963ef3/grpcio_health_checking-1.69.0-py3-none-any.whl", hash = "sha256:d2d0eec7e3af245863fd4997e2942d27c0868fbd61ffa4d14bc492c3e2c67127", size = 18923 }, + { url = "https://files.pythonhosted.org/packages/5c/8d/7a9878dca6616b48093d71c52d0bc79cb2dd1a2698ff6f5ce7406306de12/grpcio_health_checking-1.67.1-py3-none-any.whl", hash = "sha256:93753da5062152660aef2286c9b261e07dd87124a65e4dc9fbd47d1ce966b39d", size = 18924 }, ] [[package]] name = "grpcio-status" -version = "1.69.0" +version = "1.67.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "googleapis-common-protos", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "grpcio", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "protobuf", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] -sdist = { url = 
"https://files.pythonhosted.org/packages/02/35/52dc0d8300f879dbf9cdc95764cee9f56d5a212998cfa1a8871b262df2a4/grpcio_status-1.69.0.tar.gz", hash = "sha256:595ef84e5178d6281caa732ccf68ff83259241608d26b0e9c40a5e66eee2a2d2", size = 13662 } +sdist = { url = "https://files.pythonhosted.org/packages/be/c7/fe0e79a80ac6346e0c6c0a24e9e3cbc3ae1c2a009acffb59eab484a6f69b/grpcio_status-1.67.1.tar.gz", hash = "sha256:2bf38395e028ceeecfd8866b081f61628114b384da7d51ae064ddc8d766a5d11", size = 13673 } wheels = [ - { url = "https://files.pythonhosted.org/packages/f6/e2/346a766a4232f74f45f8bc70e636fc3a6677e6bc3893382187829085f12e/grpcio_status-1.69.0-py3-none-any.whl", hash = "sha256:d6b2a3c9562c03a817c628d7ba9a925e209c228762d6d7677ae5c9401a542853", size = 14428 }, + { url = "https://files.pythonhosted.org/packages/05/18/56999a1da3577d8ccc8698a575d6638e15fe25650cc88b2ce0a087f180b9/grpcio_status-1.67.1-py3-none-any.whl", hash = "sha256:16e6c085950bdacac97c779e6a502ea671232385e6e37f258884d6883392c2bd", size = 14427 }, ] [[package]] name = "grpcio-tools" -version = "1.69.0" +version = "1.67.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "grpcio", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "protobuf", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "setuptools", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/64/ec/1c25136ca1697eaa09a02effe3e74959fd9fb6aba9960d7340dd6341c5ce/grpcio_tools-1.69.0.tar.gz", hash = "sha256:3e1a98f4d9decb84979e1ddd3deb09c0a33a84b6e3c0776d5bde4097e3ab66dd", size = 5323319 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/00/90/7df7326552fec627adcf3880cf13e9a5b23c090bbcedba367f64fa2bb54b/grpcio_tools-1.69.0-cp310-cp310-linux_armv7l.whl", hash = 
"sha256:8c210630faa581c3bd08953dac4ad21a7f49862f3b92d69686e9b436d2f1265d", size = 2388795 }, - { url = "https://files.pythonhosted.org/packages/e2/03/6ccaa58b3ca1734d0868a389148e22ac15248a9be4c223805339f7904e31/grpcio_tools-1.69.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:09b66ea279fcdaebae4ec34b1baf7577af3b14322738aa980c1c33cfea71f7d7", size = 5703156 }, - { url = "https://files.pythonhosted.org/packages/c9/f6/162b456684d2444b43e45ace4e889087301e5890bbfd16ee6b2aedf36219/grpcio_tools-1.69.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:be94a4bfa56d356aae242cc54072c9ccc2704b659eaae2fd599a94afebf791ce", size = 2350725 }, - { url = "https://files.pythonhosted.org/packages/db/3a/2e83fea8c90b9902d68964491d014d688177a6ad0303dbbe6c2c16f25da6/grpcio_tools-1.69.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:28778debad73a8c8e0a0e07e6a2f76eecce43adbc205d17dd244d2d58bb0f0aa", size = 2727230 }, - { url = "https://files.pythonhosted.org/packages/63/06/be27b8f1811ff4cc556bdec64a9004755a929df035dc606466a75c9ac0fa/grpcio_tools-1.69.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:449308d93e4c97ae3a4503510c6d64978748ff5e21429c85da14fdc783c0f498", size = 2472752 }, - { url = "https://files.pythonhosted.org/packages/a3/43/f94578afa1535287b7b0ba39eeb23b2b8304a2a5b8e325ed7079d2ad9cba/grpcio_tools-1.69.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:b9343651e73bc6e0df6bb518c2638bf9cc2194b50d060cdbcf1b2121cd4e4ae3", size = 3344074 }, - { url = "https://files.pythonhosted.org/packages/13/d1/5f9030cbb6195f3bb182e740f349cdaa71d9c38c1b2572f401270709d7d2/grpcio_tools-1.69.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2f08b063612553e726e328aef3a27adfaea8d92712b229012afc54d59da88a02", size = 2953778 }, - { url = "https://files.pythonhosted.org/packages/0c/cb/4812660e150d197de81296fa04ed6ad012d1aeac23bbe21be5f51493f455/grpcio_tools-1.69.0-cp310-cp310-win32.whl", hash = 
"sha256:599ffd39525e7bbb6412a63e56a2e6c1af8f3493fe4305260efd4a11d064cce0", size = 957556 }, - { url = "https://files.pythonhosted.org/packages/4e/c7/c7d5f5418909764e63208b9f76812db3287ece4f79500e815178194e1db9/grpcio_tools-1.69.0-cp310-cp310-win_amd64.whl", hash = "sha256:02f92e3c2bae67ece818787f8d3d89df0fa1e5e6bbb7c1493824fd5dfad886dd", size = 1114783 }, - { url = "https://files.pythonhosted.org/packages/7e/f4/575f536bada8d8f5f8943c317ae28faafe7b4aaf95ef84a599f4f3e67db3/grpcio_tools-1.69.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:c18df5d1c8e163a29863583ec51237d08d7059ef8d4f7661ee6d6363d3e38fe3", size = 2388772 }, - { url = "https://files.pythonhosted.org/packages/87/94/1157342b046f51c4d076f21ef76da6d89323929b7e870389204fd49e3f09/grpcio_tools-1.69.0-cp311-cp311-macosx_10_14_universal2.whl", hash = "sha256:37876ae49235ef2e61e5059faf45dc5e7142ca54ae61aec378bb9483e0cd7e95", size = 5726348 }, - { url = "https://files.pythonhosted.org/packages/36/5c/cfd9160ef1867e025844b2695d436bb953c2d5f9c20eaaa7da6fd739ab0c/grpcio_tools-1.69.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:33120920e29959eaa37a1268c6a22af243d086b1a5e5222b4203e29560ece9ce", size = 2350857 }, - { url = "https://files.pythonhosted.org/packages/61/70/10614b8bc39f06548a0586fdd5d97843da4789965e758fba87726bde8c2f/grpcio_tools-1.69.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:788bb3ecd1b44664d829d319b3c1ebc15c7d7b5e7d1f22706ab57d6acd2c6301", size = 2727157 }, - { url = "https://files.pythonhosted.org/packages/37/fb/33faedb3e991dceb7a2bf802d3875bff7d6a6b6a80d314197adc73739cae/grpcio_tools-1.69.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f453b11a112e3774c8957ec2570669f3da1f7fbc8ee242482c38981496e88da2", size = 2472882 }, - { url = "https://files.pythonhosted.org/packages/41/f7/abddc158919a982f6b8e61d4a5c72569b2963304c162c3ca53c6c14d23ee/grpcio_tools-1.69.0-cp311-cp311-musllinux_1_1_i686.whl", hash = 
"sha256:7e5c5dc2b656755cb58b11a7e87b65258a4a8eaff01b6c30ffcb230dd447c03d", size = 3343987 }, - { url = "https://files.pythonhosted.org/packages/ba/46/e7219456aefe29137728246a67199fcbfdaa99ede93d2045a6406f0e4c0b/grpcio_tools-1.69.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:8eabf0a7a98c14322bc74f9910c96f98feebe311e085624b2d022924d4f652ca", size = 2953659 }, - { url = "https://files.pythonhosted.org/packages/74/be/262c5d2b681930f8c58012500741fe06cb40a770c9d395650efe9042467f/grpcio_tools-1.69.0-cp311-cp311-win32.whl", hash = "sha256:ad567bea43d018c2215e1db10316eda94ca19229a834a3221c15d132d24c1b8a", size = 957447 }, - { url = "https://files.pythonhosted.org/packages/8e/55/68153acca126dced35f888e708a65169df8fa8a4d5f0e78166a395e3fa9c/grpcio_tools-1.69.0-cp311-cp311-win_amd64.whl", hash = "sha256:3d64e801586dbea3530f245d48b9ed031738cc3eb099d5ce2fdb1b3dc2e1fb20", size = 1114753 }, - { url = "https://files.pythonhosted.org/packages/5b/f6/9cd1aa47556664564b873cd187d8dec978ff2f4a539d8c6d5d2f418d3d36/grpcio_tools-1.69.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:8ef8efe8beac4cc1e30d41893e4096ca2601da61001897bd17441645de2d4d3c", size = 2388440 }, - { url = "https://files.pythonhosted.org/packages/62/37/0bcd8431e44b38f648f70368dd60542d10ffaffa109563349ee635013e10/grpcio_tools-1.69.0-cp312-cp312-macosx_10_14_universal2.whl", hash = "sha256:a00e87a0c5a294028115a098819899b08dd18449df5b2aac4a2b87ba865e8681", size = 5726135 }, - { url = "https://files.pythonhosted.org/packages/8b/f5/2ec994bbf522a231ce54c41a2d3621e77bece1240aafe31f12804052af0f/grpcio_tools-1.69.0-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:7722700346d5b223159532e046e51f2ff743ed4342e5fe3e0457120a4199015e", size = 2350247 }, - { url = "https://files.pythonhosted.org/packages/a9/29/9ebf54315a499a766e4c3bd53124267491162e9049c2d9ed45f43222b98f/grpcio_tools-1.69.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:a934116fdf202cb675246056ee54645c743e2240632f86a37e52f91a405c7143", size = 2727994 }, - { url = "https://files.pythonhosted.org/packages/f0/2a/1a031018660b5d95c1a4c587a0babd0d28f0aa0c9a40dbca330567049a3f/grpcio_tools-1.69.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e6a6d44359ca836acfbc58103daf94b3bb8ac919d659bb348dcd7fbecedc293", size = 2472625 }, - { url = "https://files.pythonhosted.org/packages/74/bf/76d24078e1c76976a10760c3193b6c62685a7aed64b1cb0d8242afa16f1d/grpcio_tools-1.69.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e27662c0597fd1ab5399a583d358b5203edcb6fc2b29d6245099dfacd51a6ddc", size = 3344290 }, - { url = "https://files.pythonhosted.org/packages/f1/f7/4ab645e4955ca1e5240b0bbd557662cec4838f0e21e072ff40f4e191b48d/grpcio_tools-1.69.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:7bbb2b2fb81d95bcdd1d8331defb5f5dc256dbe423bb98b682cf129cdd432366", size = 2953592 }, - { url = "https://files.pythonhosted.org/packages/8f/32/57e67b126f209f289fc32009309d155b8dbe9ac760c32733746e4dda7b51/grpcio_tools-1.69.0-cp312-cp312-win32.whl", hash = "sha256:e11accd10cf4af5031ac86c45f1a13fb08f55e005cea070917c12e78fe6d2aa2", size = 957042 }, - { url = "https://files.pythonhosted.org/packages/19/64/7bfcb4e50a0ce87690c24696cd666f528e672119966abead09ae65a2e1da/grpcio_tools-1.69.0-cp312-cp312-win_amd64.whl", hash = "sha256:6df4c6ac109af338a8ccde29d184e0b0bdab13d78490cb360ff9b192a1aec7e2", size = 1114248 }, - { url = "https://files.pythonhosted.org/packages/0c/ef/a9867f612e3aa5e69d299e47a72ea8dafa476b1f099462c9a1223cd6a83c/grpcio_tools-1.69.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:8c320c4faa1431f2e1252ef2325a970ac23b2fd04ffef6c12f96dd4552c3445c", size = 2388281 }, - { url = "https://files.pythonhosted.org/packages/4b/53/b2752d8ec338778e48d76845d605a0f8bca9e43a5f09428e5ed1a76e4e1d/grpcio_tools-1.69.0-cp313-cp313-macosx_10_14_universal2.whl", hash = 
"sha256:5f1224596ad74dd14444b20c37122b361c5d203b67e14e018b995f3c5d76eede", size = 5725856 }, - { url = "https://files.pythonhosted.org/packages/83/dd/195d3639634c0c1d1e48b6693c074d66a64f16c748df2f40bcee74aa04e2/grpcio_tools-1.69.0-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:965a0cf656a113bc32d15ac92ca51ed702a75d5370ae0afbdd36f818533a708a", size = 2350180 }, - { url = "https://files.pythonhosted.org/packages/8c/18/c412884fa0e888d8a271f3e31d23e3765cde0efe2404653ab67971c411c2/grpcio_tools-1.69.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:978835768c11a7f28778b3b7c40f839d8a57f765c315e80c4246c23900d56149", size = 2726724 }, - { url = "https://files.pythonhosted.org/packages/be/c7/dfb59b7e25d760bfdd93f0aef7dd0e2a37f8437ac3017b8b526c68764e2f/grpcio_tools-1.69.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:094c7cec9bd271a32dfb7c620d4a558c63fcb0122fd1651b9ed73d6afd4ae6fe", size = 2472127 }, - { url = "https://files.pythonhosted.org/packages/f2/b6/af4edf0a181fd7b148a83d491f5677d7d1c9f86f03282f8f0209d9dfb793/grpcio_tools-1.69.0-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:b51bf4981b3d7e47c2569efadff08284787124eb3dea0f63f491d39703231d3c", size = 3344015 }, - { url = "https://files.pythonhosted.org/packages/0a/9f/4c2b5ae642f7d3df73c16df6c7d53e9443cb0e49e1dcf2c8d1a49058e0b5/grpcio_tools-1.69.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ea7aaf0dc1a828e2133357a9e9553fd1bb4e766890d52a506cc132e40632acdc", size = 2952942 }, - { url = "https://files.pythonhosted.org/packages/97/8e/6b707871db5927a17ad7475c070916bff4f32463a51552b424779236ab65/grpcio_tools-1.69.0-cp313-cp313-win32.whl", hash = "sha256:4320f11b79d3a148cc23bad1b81719ce1197808dc2406caa8a8ba0a5cfb0260d", size = 956242 }, - { url = "https://files.pythonhosted.org/packages/27/e2/b419a02b50240143605f77cd50cb07f724caf0fd35a01540a4f044ae9f21/grpcio_tools-1.69.0-cp313-cp313-win_amd64.whl", hash = 
"sha256:e9bae733654e0eb8ca83aa1d0d6b6c2f4a3525ce70d5ffc07df68d28f6520137", size = 1113616 }, +sdist = { url = "https://files.pythonhosted.org/packages/ae/f9/6facde12a5a8da4398a3a8947f8ba6ef33b408dfc9767c8cefc0074ddd68/grpcio_tools-1.67.1.tar.gz", hash = "sha256:d9657f5ddc62b52f58904e6054b7d8a8909ed08a1e28b734be3a707087bcf004", size = 5159073 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e5/46/668e681e2e4ca7dc80cb5ad22bc794958c8b604b5b3143f16b94be3c0118/grpcio_tools-1.67.1-cp310-cp310-linux_armv7l.whl", hash = "sha256:c701aaa51fde1f2644bd94941aa94c337adb86f25cd03cf05e37387aaea25800", size = 2308117 }, + { url = "https://files.pythonhosted.org/packages/d6/56/1c65fb7c836cd40470f1f1a88185973466241fdb42b42b7a83367c268622/grpcio_tools-1.67.1-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:6a722bba714392de2386569c40942566b83725fa5c5450b8910e3832a5379469", size = 5500152 }, + { url = "https://files.pythonhosted.org/packages/01/ab/caf9c330241d843a83043b023e2996e959cdc2c3ab404b1a9938eb734143/grpcio_tools-1.67.1-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:0c7415235cb154e40b5ae90e2a172a0eb8c774b6876f53947cf0af05c983d549", size = 2282055 }, + { url = "https://files.pythonhosted.org/packages/75/e6/0cd849d140b58fedb7d3b15d907fe2eefd4dadff09b570dd687d841c5d00/grpcio_tools-1.67.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6a4c459098c4934f9470280baf9ff8b38c365e147f33c8abc26039a948a664a5", size = 2617360 }, + { url = "https://files.pythonhosted.org/packages/b9/51/bd73cd6515c2e81ba0a29b3cf6f2f62ad94737326f70b32511d1972a383e/grpcio_tools-1.67.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e89bf53a268f55c16989dab1cf0b32a5bff910762f138136ffad4146129b7a10", size = 2416028 }, + { url = "https://files.pythonhosted.org/packages/47/e5/6a16e23036f625b6d60b579996bb9bb7165485903f934d9d9d73b3f03ef5/grpcio_tools-1.67.1-cp310-cp310-musllinux_1_1_i686.whl", hash = 
"sha256:f09cb3e6bcb140f57b878580cf3b848976f67faaf53d850a7da9bfac12437068", size = 3224906 }, + { url = "https://files.pythonhosted.org/packages/14/cb/230c17d4372fa46fc799a822f25fa00c8eb3f85cc86e192b9606a17f732f/grpcio_tools-1.67.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:616dd0c6686212ca90ff899bb37eb774798677e43dc6f78c6954470782d37399", size = 2870384 }, + { url = "https://files.pythonhosted.org/packages/66/fd/6d9dd3bf5982ab7d7e773f055360185e96a96cf95f2cbc7f53ded5912ef5/grpcio_tools-1.67.1-cp310-cp310-win32.whl", hash = "sha256:58a66dbb3f0fef0396737ac09d6571a7f8d96a544ce3ed04c161f3d4fa8d51cc", size = 941138 }, + { url = "https://files.pythonhosted.org/packages/6a/97/2fd5ebd996c12b2cb1e1202ee4a03cac0a65ba17d29dd34253bfe2079839/grpcio_tools-1.67.1-cp310-cp310-win_amd64.whl", hash = "sha256:89ee7c505bdf152e67c2cced6055aed4c2d4170f53a2b46a7e543d3b90e7b977", size = 1091151 }, + { url = "https://files.pythonhosted.org/packages/b5/9a/ec06547673c5001c2604637069ff8f287df1aef3f0f8809b09a1c936b049/grpcio_tools-1.67.1-cp311-cp311-linux_armv7l.whl", hash = "sha256:6d80ddd87a2fb7131d242f7d720222ef4f0f86f53ec87b0a6198c343d8e4a86e", size = 2307990 }, + { url = "https://files.pythonhosted.org/packages/ca/84/4b7c3c27a2972c00b3b6ccaadd349e0f86b7039565d3a4932e219a4d76e0/grpcio_tools-1.67.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:b655425b82df51f3bd9fd3ba1a6282d5c9ce1937709f059cb3d419b224532d89", size = 5526552 }, + { url = "https://files.pythonhosted.org/packages/a7/2d/a620e4c53a3b808ebecaa5033c2176925ee1c6cbb45c29af8bec9a249822/grpcio_tools-1.67.1-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:250241e6f9d20d0910a46887dfcbf2ec9108efd3b48f3fb95bb42d50d09d03f8", size = 2282137 }, + { url = "https://files.pythonhosted.org/packages/ec/29/e188b2e438781b37532abb8f10caf5b09c611a0bf9a09940b4cf303afd5b/grpcio_tools-1.67.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:6008f5a5add0b6f03082edb597acf20d5a9e4e7c55ea1edac8296c19e6a0ec8d", size = 2617333 }, + { url = "https://files.pythonhosted.org/packages/86/aa/2bbccd3c34b1fa48b892fbad91525c33a8aa85cbedd50e8b0d17dc260dc3/grpcio_tools-1.67.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5eff9818c3831fa23735db1fa39aeff65e790044d0a312260a0c41ae29cc2d9e", size = 2415806 }, + { url = "https://files.pythonhosted.org/packages/db/34/99853a8ced1119937d02511476018dc1d6b295a4803d4ead5dbf9c55e9bc/grpcio_tools-1.67.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:262ab7c40113f8c3c246e28e369661ddf616a351cb34169b8ba470c9a9c3b56f", size = 3224765 }, + { url = "https://files.pythonhosted.org/packages/66/39/8537a8ace8f6242f2058677e56a429587ec731c332985af34f35d496ca58/grpcio_tools-1.67.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:1eebd8c746adf5786fa4c3056258c21cc470e1eca51d3ed23a7fb6a697fe4e81", size = 2870446 }, + { url = "https://files.pythonhosted.org/packages/28/2a/5c04375adccff58647d48675e055895c31811a0ad896e4ba310833e2154d/grpcio_tools-1.67.1-cp311-cp311-win32.whl", hash = "sha256:3eff92fb8ca1dd55e3af0ef02236c648921fb7d0e8ca206b889585804b3659ae", size = 940890 }, + { url = "https://files.pythonhosted.org/packages/e6/ee/7861339c2cec8d55a5e859cf3682bda34eab5a040f95d0c80f775d6a3279/grpcio_tools-1.67.1-cp311-cp311-win_amd64.whl", hash = "sha256:1ed18281ee17e5e0f9f6ce0c6eb3825ca9b5a0866fc1db2e17fab8aca28b8d9f", size = 1091094 }, + { url = "https://files.pythonhosted.org/packages/d9/cf/7b1908ca72e484bac555431036292c48d2d6504a45e2789848cb5ff313a8/grpcio_tools-1.67.1-cp312-cp312-linux_armv7l.whl", hash = "sha256:bd5caef3a484e226d05a3f72b2d69af500dca972cf434bf6b08b150880166f0b", size = 2307645 }, + { url = "https://files.pythonhosted.org/packages/bb/15/0d1efb38af8af7e56b2342322634a3caf5f1337a6c3857a6d14aa590dfdf/grpcio_tools-1.67.1-cp312-cp312-macosx_10_9_universal2.whl", hash = 
"sha256:48a2d63d1010e5b218e8e758ecb2a8d63c0c6016434e9f973df1c3558917020a", size = 5525468 }, + { url = "https://files.pythonhosted.org/packages/52/42/a810709099f09ade7f32990c0712c555b3d7eab6a05fb62618c17f8fe9da/grpcio_tools-1.67.1-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:baa64a6aa009bffe86309e236c81b02cd4a88c1ebd66f2d92e84e9b97a9ae857", size = 2281768 }, + { url = "https://files.pythonhosted.org/packages/4c/2a/64ee6cfdf1c32ef8bdd67bf04ae2f745f517f4a546281453ca1f68fa79ca/grpcio_tools-1.67.1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4ab318c40b5e3c097a159035fc3e4ecfbe9b3d2c9de189e55468b2c27639a6ab", size = 2617359 }, + { url = "https://files.pythonhosted.org/packages/79/7f/1ed8cd1529253fef9cf0ef3cd8382641125a5ca2eaa08eaffbb549f84e0b/grpcio_tools-1.67.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50eba3e31f9ac1149463ad9182a37349850904f142cffbd957cd7f54ec320b8e", size = 2415323 }, + { url = "https://files.pythonhosted.org/packages/8e/08/59f0073c58703c176c15fb1a838763b77c1c06994adba16654b92a666e1b/grpcio_tools-1.67.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:de6fbc071ecc4fe6e354a7939202191c1f1abffe37fbce9b08e7e9a5b93eba3d", size = 3225051 }, + { url = "https://files.pythonhosted.org/packages/b7/0d/a5d703214fe49d261b4b8f0a64140a4dc1f88560724a38ad937120b899ad/grpcio_tools-1.67.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:db9e87f6ea4b0ce99b2651203480585fd9e8dd0dd122a19e46836e93e3a1b749", size = 2870421 }, + { url = "https://files.pythonhosted.org/packages/ac/af/41d79cb87eae99c0348e8f1fb3dbed9e40a6f63548b216e99f4d1165fa5c/grpcio_tools-1.67.1-cp312-cp312-win32.whl", hash = "sha256:6a595a872fb720dde924c4e8200f41d5418dd6baab8cc1a3c1e540f8f4596351", size = 940542 }, + { url = "https://files.pythonhosted.org/packages/66/e5/096e12f5319835aa2bcb746d49ae62220bb48313ca649e89bdbef605c11d/grpcio_tools-1.67.1-cp312-cp312-win_amd64.whl", hash = 
"sha256:92eebb9b31031604ae97ea7657ae2e43149b0394af7117ad7e15894b6cc136dc", size = 1090425 }, + { url = "https://files.pythonhosted.org/packages/62/b3/91c88440c978740752d39f1abae83f21408048b98b93652ebd84f974ad3d/grpcio_tools-1.67.1-cp313-cp313-linux_armv7l.whl", hash = "sha256:9a3b9510cc87b6458b05ad49a6dee38df6af37f9ee6aa027aa086537798c3d4a", size = 2307453 }, + { url = "https://files.pythonhosted.org/packages/05/33/faf3330825463c0409fa3891bc1459bf86a00055b19790211365279538d7/grpcio_tools-1.67.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:9e4c9b9fa9b905f15d414cb7bd007ba7499f8907bdd21231ab287a86b27da81a", size = 5517975 }, + { url = "https://files.pythonhosted.org/packages/bd/78/461ab34cadbd0b5b9a0b6efedda96b58e0de471e3fa91d8e4a4e31924e1b/grpcio_tools-1.67.1-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:e11a98b41af4bc88b7a738232b8fa0306ad82c79fa5d7090bb607f183a57856f", size = 2281081 }, + { url = "https://files.pythonhosted.org/packages/5f/0c/b30bdbcab1795b12e05adf30c20981c14f66198e22044edb15b3c1d9f0bc/grpcio_tools-1.67.1-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:de0fcfe61c26679d64b1710746f2891f359593f76894fcf492c37148d5694f00", size = 2616929 }, + { url = "https://files.pythonhosted.org/packages/d3/c2/a77ca68ae768f8d5f1d070ea4afc42fda40401083e7c4f5c08211e84de38/grpcio_tools-1.67.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ae3b3e2ee5aad59dece65a613624c46a84c9582fc3642686537c6dfae8e47dc", size = 2414633 }, + { url = "https://files.pythonhosted.org/packages/39/70/8d7131dccfe4d7b739c96ada7ea9acde631f58f013eae773791fb490a3eb/grpcio_tools-1.67.1-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:9a630f83505b6471a3094a7a372a1240de18d0cd3e64f4fbf46b361bac2be65b", size = 3224328 }, + { url = "https://files.pythonhosted.org/packages/2a/28/2d24b933ccf0d6877035aa3d5f8b64aad18c953657dd43c682b5701dc127/grpcio_tools-1.67.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = 
"sha256:d85a1fcbacd3e08dc2b3d1d46b749351a9a50899fa35cf2ff040e1faf7d405ad", size = 2869640 }, + { url = "https://files.pythonhosted.org/packages/37/77/ddd2b4cc896639fb0f85fc21d5684f25080ee28845c5a4031e3dd65fdc92/grpcio_tools-1.67.1-cp313-cp313-win32.whl", hash = "sha256:778470f025f25a1fca5a48c93c0a18af395b46b12dd8df7fca63736b85181f41", size = 939997 }, + { url = "https://files.pythonhosted.org/packages/96/d0/f0855a0ccb26ffeb41e6db68b5cbb25d7e9ba1f8f19151eef36210e64efc/grpcio_tools-1.67.1-cp313-cp313-win_amd64.whl", hash = "sha256:6961da86e9856b4ddee0bf51ef6636b4bf9c29c0715aa71f3c8f027c45d42654", size = 1089819 }, ] [[package]] @@ -2286,18 +2289,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4f/65/6079a46068dfceaeabb5dcad6d674f5f5c61a6fa5673746f42a9f4c233b3/MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f", size = 15739 }, ] -[[package]] -name = "marshmallow" -version = "3.24.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "packaging", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/3b/1f/52fa79445669322ee42fdd11b591c2e9c8dbab33eaf7059ca881b349ae09/marshmallow-3.24.2.tar.gz", hash = "sha256:0822c3701de396b51d3f8ac97319aea5493998ba4e7d0e4c05f6fce7777bf3a2", size = 176520 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/67/40/7802bb90b1ecbb284ae613da2cfde9ce0177b77d76cbb276acf976296aa8/marshmallow-3.24.2-py3-none-any.whl", hash = "sha256:bf3c56db473bb160e5191f1c5e32e3fc8bfb58998eb2b35d6747de023e31f9e7", size = 49333 }, -] - [[package]] name = "matplotlib-inline" version = "0.1.7" @@ -3995,11 +3986,11 @@ name = "pymilvus" version = "2.5.4" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "environs", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { 
name = "grpcio", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "milvus-lite", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "pandas", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "protobuf", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "python-dotenv", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "setuptools", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "ujson", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] @@ -4926,6 +4917,7 @@ dapr = [ ] google = [ { name = "google-cloud-aiplatform", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "google-genai", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "google-generativeai", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] hugging-face = [ From 7cde9da27a1323b605b760baccc1ef6fc6113fdf Mon Sep 17 00:00:00 2001 From: Eduard van Valkenburg Date: Fri, 10 Jan 2025 16:59:35 +0100 Subject: [PATCH 06/50] small updates --- docs/decisions/00XX-realtime-api-clients.md | 90 ++++++++++--------- .../audio/04-chat_with_realtime_api.py | 9 +- 2 files changed, 51 insertions(+), 48 deletions(-) diff --git a/docs/decisions/00XX-realtime-api-clients.md b/docs/decisions/00XX-realtime-api-clients.md index 81d9d6fdf4e7..8dff5f882cf6 100644 --- a/docs/decisions/00XX-realtime-api-clients.md +++ b/docs/decisions/00XX-realtime-api-clients.md @@ -23,51 +23,51 @@ One feature that we need to consider if and how to deal with is whether or not a ### Event types Client side events: -| **Content/Control event** | **Event Description** | 
**OpenAI Event** | **Google Event** | -|-------------------| ------------------------------------|-------------------------|------------------------| - | Control | Configure session | `session.update` | `BidiGenerateContentSetup` | - | Content | Send voice input | `input_audio_buffer.append` | `BidiGenerateContentRealtimeInput` | - | Control | Commit input and request response | `input_audio_buffer.commit` | `-` | - | Control | Clean audio input buffer | `input_audio_buffer.clear` | `-` | - | Content | Send text input | `conversation.item.create` | `BidiGenerateContentClientContent` | - | Control | Interrupt audio | `conversation.item.truncate` | `-`| - | Control | Delete content | `conversation.item.delete` | `-`| -| Control | Respond to function call request | `conversation.item.create` | `BidiGenerateContentToolResponse`| -| Control | Ask for response | `response.create` | `-`| -| Control | Cancel response | `response.cancel` | `-`| +| **Content/Control event** | **Event Description** | **OpenAI Event** | **Google Event** | +| ------------------------- | --------------------------------- | ---------------------------- | ---------------------------------- | +| Control | Configure session | `session.update` | `BidiGenerateContentSetup` | +| Content | Send voice input | `input_audio_buffer.append` | `BidiGenerateContentRealtimeInput` | +| Control | Commit input and request response | `input_audio_buffer.commit` | `-` | +| Control | Clean audio input buffer | `input_audio_buffer.clear` | `-` | +| Content | Send text input | `conversation.item.create` | `BidiGenerateContentClientContent` | +| Control | Interrupt audio | `conversation.item.truncate` | `-` | +| Control | Delete content | `conversation.item.delete` | `-` | +| Control | Respond to function call request | `conversation.item.create` | `BidiGenerateContentToolResponse` | +| Control | Ask for response | `response.create` | `-` | +| Control | Cancel response | `response.cancel` | `-` | Server side events: -| 
**Content/Control event** | **Event Description** | **OpenAI Event** | **Google Event** | -|----------------------------|-------------------------------------|-------------------------|------------------------| -| Control | Error | `error` | `-` | -| Control | Session created | `session.created` | `BidiGenerateContentSetupComplete` | -| Control | Session updated | `session.updated` | `BidiGenerateContentSetupComplete` | -| Control | Conversation created | `conversation.created` | `-` | -| Control | Input audio buffer committed | `input_audio_buffer.committed` | `-` | -| Control | Input audio buffer cleared | `input_audio_buffer.cleared` | `-` | -| Control | Input audio buffer speech started | `input_audio_buffer.speech_started` | `-` | -| Control | Input audio buffer speech stopped | `input_audio_buffer.speech_stopped` | `-` | -| Content | Conversation item created | `conversation.item.created` | `-` | -| Content | Input audio transcription completed | `conversation.item.input_audio_transcription.completed` | -| Content | Input audio transcription failed | `conversation.item.input_audio_transcription.failed` | -| Control | Conversation item truncated | `conversation.item.truncated` | `-` | -| Control | Conversation item deleted | `conversation.item.deleted` | `-` | -| Control | Response created | `response.created` | `-` | -| Control | Response done | `response.done` | `-` | -| Content | Response output item added | `response.output_item.added` | `-` | -| Content | Response output item done | `response.output_item.done` | `-` | -| Content | Response content part added | `response.content_part.added` | `-` | -| Content | Response content part done | `response.content_part.done` | `-` | -| Content | Response text delta | `response.text.delta` | `BidiGenerateContentServerContent` | -| Content | Response text done | `response.text.done` | `-` | -| Content | Response audio transcript delta | `response.audio_transcript.delta` | `BidiGenerateContentServerContent` | -| 
Content | Response audio transcript done | `response.audio_transcript.done` | `-` | -| Content | Response audio delta | `response.audio.delta` | `BidiGenerateContentServerContent` | -| Content | Response audio done | `response.audio.done` | `-` | -| Content | Response function call arguments delta | `response.function_call_arguments.delta` | `BidiGenerateContentToolCall` | -| Content | Response function call arguments done | `response.function_call_arguments.done` | `-` | -| Control | Function call cancelled | `-` | `BidiGenerateContentToolCallCancellation` | -| Control | Rate limits updated | `rate_limits.updated` | `-` | +| **Content/Control event** | **Event Description** | **OpenAI Event** | **Google Event** | +| ------------------------- | -------------------------------------- | ------------------------------------------------------- | ----------------------------------------- | +| Control | Error | `error` | `-` | +| Control | Session created | `session.created` | `BidiGenerateContentSetupComplete` | +| Control | Session updated | `session.updated` | `BidiGenerateContentSetupComplete` | +| Control | Conversation created | `conversation.created` | `-` | +| Control | Input audio buffer committed | `input_audio_buffer.committed` | `-` | +| Control | Input audio buffer cleared | `input_audio_buffer.cleared` | `-` | +| Control | Input audio buffer speech started | `input_audio_buffer.speech_started` | `-` | +| Control | Input audio buffer speech stopped | `input_audio_buffer.speech_stopped` | `-` | +| Content | Conversation item created | `conversation.item.created` | `-` | +| Content | Input audio transcription completed | `conversation.item.input_audio_transcription.completed` | +| Content | Input audio transcription failed | `conversation.item.input_audio_transcription.failed` | +| Control | Conversation item truncated | `conversation.item.truncated` | `-` | +| Control | Conversation item deleted | `conversation.item.deleted` | `-` | +| Control | Response 
created | `response.created` | `-` | +| Control | Response done | `response.done` | `-` | +| Content | Response output item added | `response.output_item.added` | `-` | +| Content | Response output item done | `response.output_item.done` | `-` | +| Content | Response content part added | `response.content_part.added` | `-` | +| Content | Response content part done | `response.content_part.done` | `-` | +| Content | Response text delta | `response.text.delta` | `BidiGenerateContentServerContent` | +| Content | Response text done | `response.text.done` | `-` | +| Content | Response audio transcript delta | `response.audio_transcript.delta` | `BidiGenerateContentServerContent` | +| Content | Response audio transcript done | `response.audio_transcript.done` | `-` | +| Content | Response audio delta | `response.audio.delta` | `BidiGenerateContentServerContent` | +| Content | Response audio done | `response.audio.done` | `-` | +| Content | Response function call arguments delta | `response.function_call_arguments.delta` | `BidiGenerateContentToolCall` | +| Content | Response function call arguments done | `response.function_call_arguments.done` | `-` | +| Control | Function call cancelled | `-` | `BidiGenerateContentToolCallCancellation` | +| Control | Rate limits updated | `rate_limits.updated` | `-` | @@ -77,6 +77,7 @@ Server side events: - Simple programming model that is likely able to handle future realtime api's and evolution of the existing ones. - Support for the most common scenario's and content, extensible for the rest. - Natively integrated with Semantic Kernel especially for content types and function calling. 
+- Support multiple types of connections, like websocket and WebRTC - … @@ -94,8 +95,9 @@ This would mean there are two mechanisms in the clients, one deals with content, - Pro: - strongly typed responses for known content - easy to use as the main interactions are clear with familiar SK content types, the rest goes through a separate mechanism + - this might fit better with something like WebRTC that has distinct channels for audio and video vs a data stream for all other events - Con: - - new content support requires updates in the codebase and can be considered breaking (potentitally sending additional types back) + - new content support requires updates in the codebase and can be considered breaking (potentially sending additional types back) - additional complexity in dealing with two streams of data ### Treat everything as content items diff --git a/python/samples/concepts/audio/04-chat_with_realtime_api.py b/python/samples/concepts/audio/04-chat_with_realtime_api.py index bffbad691716..3aa2a8f52b10 100644 --- a/python/samples/concepts/audio/04-chat_with_realtime_api.py +++ b/python/samples/concepts/audio/04-chat_with_realtime_api.py @@ -47,7 +47,7 @@ def check_audio_devices(): print(sd.query_devices()) -# check_audio_devices() +check_audio_devices() class Speaker: @@ -106,6 +106,7 @@ def __init__(self, audio_recorder: AudioRecorderStream, realtime_client: Realtim self.realtime_client = realtime_client async def record_audio(self): + await self.realtime_client.send_event("response.create") with contextlib.suppress(asyncio.CancelledError): async for content in self.audio_recorder.stream_audio_content(): if content.data: @@ -150,8 +151,8 @@ async def main() -> None: realtime_client.register_event_handler("response.created", response_created_callback) # create the speaker and microphone - speaker = Speaker(AudioPlayerAsync(device_id=7), realtime_client, kernel) - microphone = Microphone(AudioRecorderStream(device_id=2), realtime_client) + speaker = 
Speaker(AudioPlayerAsync(device_id=None), realtime_client, kernel) + microphone = Microphone(AudioRecorderStream(device_id=None), realtime_client) # Create the settings for the session # the key thing to decide on is to enable the server_vad turn detection @@ -186,7 +187,7 @@ async def main() -> None: if __name__ == "__main__": print( - "Instruction: start speaking, when you stop the API should detect you finished and start responding." + "Instruction: start speaking, when you stop the API should detect you finished and start responding. " "Press ctrl + c to stop the program." ) asyncio.run(main()) From fe1be54fd7c7351192b55399495248067e7f81fb Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Tue, 14 Jan 2025 15:45:04 +0100 Subject: [PATCH 07/50] webrtc WIP --- docs/decisions/00XX-realtime-api-clients.md | 23 +- .../audio/04-chat_with_realtime_api.py | 121 ++++-- .../concepts/audio/audio_player_async.py | 4 +- .../concepts/audio/audio_recorder_stream.py | 23 +- .../connectors/ai/open_ai/__init__.py | 3 +- .../ai/open_ai/services/open_ai_realtime.py | 59 ++- .../open_ai/services/open_ai_realtime_base.py | 405 +++++++++++++++++- .../connectors/ai/realtime_client_base.py | 74 ++-- .../semantic_kernel/contents/audio_content.py | 5 + .../contents/binary_content.py | 2 +- .../contents/utils/data_uri.py | 2 + python/uv.lock | 39 +- 12 files changed, 642 insertions(+), 118 deletions(-) diff --git a/docs/decisions/00XX-realtime-api-clients.md b/docs/decisions/00XX-realtime-api-clients.md index 8dff5f882cf6..1b0bbd2d6c52 100644 --- a/docs/decisions/00XX-realtime-api-clients.md +++ b/docs/decisions/00XX-realtime-api-clients.md @@ -14,13 +14,21 @@ informed: Multiple model providers are starting to enable realtime voice-to-voice communication with their models, this includes OpenAI with their [Realtime API](https://openai.com/index/introducing-the-realtime-api/) and [Google Gemini](https://ai.google.dev/api/multimodal-live). 
These API's promise some very interesting new ways of using LLM's in different settings, which we want to enable with Semantic Kernel. The key addition that Semantic Kernel brings into this system is the ability to (re)use Semantic Kernel function as tools with these API's. -The way these API's work at this time is through either websockets or WebRTC. In both cases there are events being sent to and from the service, some events contain content, text, audio, or video (so far only sending, not receiving), while some events are "control" events, like content created, function call requested, etc. Sending events include, sending content, either voice, text or function call output, or events, like committing the input audio and requesting a response. +The way these API's work at this time is through either Websockets or WebRTC. + +In both cases there are events being sent to and from the service, some events contain content, text, audio, or video (so far only sending, not receiving), while some events are "control" events, like content created, function call requested, etc. Events sent by the client include content (voice, text, or function call output) and control events, such as committing the input audio and requesting a response. + +### Websocket +Websocket has been around for a while and is a well-known technology; it is a full-duplex communication protocol over a single, long-lived connection. It is used for sending and receiving messages between client and server in real-time. Each event can contain a message, which might contain a content item, or a control event. + +### WebRTC +WebRTC is an open standard (originally open-sourced by Google and standardized by the W3C and IETF) that provides web browsers and mobile applications with real-time communication via simple application programming interfaces (APIs). It allows audio and video communication to work inside web pages by allowing direct peer-to-peer communication, eliminating the need to install plugins or download native apps.
It is used for sending and receiving audio and video streams, and can be used for sending messages as well. The big difference compared to websockets is that it does explicitly create a channel for audio and video, and a separate channel for "data", which are events but also things like Function calls. Both the OpenAI and Google realtime api's are in preview/beta, this means there might be breaking changes in the way they work coming in the future, therefore the clients built to support these API's are going to be experimental until the API's stabilize. One feature that we need to consider if and how to deal with is whether or not a service uses Voice Activated Detection, OpenAI supports turning that off and allows parameters for how it behaves, while Google has it on by default and it cannot be configured. -### Event types +### Event types (websocket and partially webrtc) Client side events: | **Content/Control event** | **Event Description** | **OpenAI Event** | **Google Event** | @@ -48,8 +56,8 @@ Server side events: | Control | Input audio buffer speech started | `input_audio_buffer.speech_started` | `-` | | Control | Input audio buffer speech stopped | `input_audio_buffer.speech_stopped` | `-` | | Content | Conversation item created | `conversation.item.created` | `-` | -| Content | Input audio transcription completed | `conversation.item.input_audio_transcription.completed` | -| Content | Input audio transcription failed | `conversation.item.input_audio_transcription.failed` | +| Content | Input audio transcription completed | `conversation.item.input_audio_transcription.completed` | | +| Content | Input audio transcription failed | `conversation.item.input_audio_transcription.failed` | | | Control | Conversation item truncated | `conversation.item.truncated` | `-` | | Control | Conversation item deleted | `conversation.item.deleted` | `-` | | Control | Response created | `response.created` | `-` | @@ -70,16 +78,15 @@ Server side events: | Control | Rate 
limits updated | `rate_limits.updated` | `-` | - - ## Decision Drivers - Simple programming model that is likely able to handle future realtime api's and evolution of the existing ones. - Support for the most common scenario's and content, extensible for the rest. - Natively integrated with Semantic Kernel especially for content types and function calling. - Support multiple types of connections, like websocket and WebRTC - -- … + +## Decision driver questions +- For WebRTC, an audio device can be passed; should this be a requirement for the client also for websockets? ## Considered Options diff --git a/python/samples/concepts/audio/04-chat_with_realtime_api.py b/python/samples/concepts/audio/04-chat_with_realtime_api.py index 3aa2a8f52b10..40f16a2c0a24 100644 --- a/python/samples/concepts/audio/04-chat_with_realtime_api.py +++ b/python/samples/concepts/audio/04-chat_with_realtime_api.py @@ -5,6 +5,9 @@ import signal from typing import Any +import numpy as np +from aiortc.mediastreams import MediaStreamError, MediaStreamTrack +from av import AudioFrame from openai.types.beta.realtime.realtime_server_event import RealtimeServerEvent from samples.concepts.audio.audio_player_async import AudioPlayerAsync @@ -12,8 +15,8 @@ from semantic_kernel import Kernel from semantic_kernel.connectors.ai import FunctionChoiceBehavior from semantic_kernel.connectors.ai.open_ai import ( - OpenAIRealtime, OpenAIRealtimeExecutionSettings, + OpenAIRealtimeWebRTC, TurnDetection, ) from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings @@ -69,52 +72,77 @@ async def play( ) -> None: # reset the frame count for the audio player self.audio_player.reset_frame_count() - # open the connection to the realtime api - async with self.realtime_client as client: - # update the session with the chat_history and settings - await client.update_session(settings=settings, chat_history=chat_history) - # print the start message of the transcript - if print_transcript: -
print("Mosscap (transcript): ", end="") - try: - # start listening for events - async for content in self.realtime_client.event_listener(settings=settings, kernel=self.kernel): - if not content: - continue - # the contents returned should be StreamingChatMessageContent - # so we will loop through the items within it. - for item in content.items: - match item: - case StreamingTextContent(): - if print_transcript: - print(item.text, end="") - await asyncio.sleep(0.01) - continue - case AudioContent(): - self.audio_player.add_data(item.data) - await asyncio.sleep(0.01) - continue - except asyncio.CancelledError: - print("\nThanks for talking to Mosscap!") - - -class Microphone: + # print the start message of the transcript + if print_transcript: + print("Mosscap (transcript): ", end="") + try: + # start listening for events + while True: + _, content = await self.realtime_client.output_buffer.get() + if not content: + continue + # the contents returned should be StreamingChatMessageContent + # so we will loop through the items within it. 
+ for item in content.items: + match item: + case StreamingTextContent(): + if print_transcript: + print(item.text, end="") + await asyncio.sleep(0.01) + continue + case AudioContent(): + self.audio_player.add_data(item.data) + await asyncio.sleep(0.01) + continue + except asyncio.CancelledError: + print("\nThanks for talking to Mosscap!") + + +class Microphone(MediaStreamTrack): """This is a simple class that opens the microphone and sends the audio to the realtime api.""" + kind = "audio" + def __init__(self, audio_recorder: AudioRecorderStream, realtime_client: RealtimeClientBase): self.audio_recorder = audio_recorder self.realtime_client = realtime_client + self.queue = asyncio.Queue() + self.loop = asyncio.get_running_loop() + self._pts = 0 + + async def recv(self) -> Any: + # start the audio recording + try: + return await self.queue.get() + except Exception as e: + logger.error(f"Error receiving audio frame: {str(e)}") + raise MediaStreamError("Failed to receive audio frame") async def record_audio(self): - await self.realtime_client.send_event("response.create") - with contextlib.suppress(asyncio.CancelledError): - async for content in self.audio_recorder.stream_audio_content(): - if content.data: - await self.realtime_client.send_event( - "input_audio_buffer.append", - content=content, - ) - await asyncio.sleep(0.01) + def callback(indata, frames, time, status): + if status: + logger.warning(f"Audio input status: {status}") + audio_data = indata.copy() + + if audio_data.dtype != np.int16: + audio_data = (audio_data * 32767).astype(np.int16) + + # Create AudioFrame with incrementing pts + frame = AudioFrame( + samples=len(audio_data), + layout="mono", + format="s16", # 16-bit signed integer + ) + frame.rate = 48000 + frame.pts = self._pts + self._pts += len(audio_data) # Increment pts by frame size + + frame.planes[0].update(audio_data.tobytes()) + + asyncio.run_coroutine_threadsafe(self.queue.put(frame), self.loop) + + await 
self.realtime_client.input_buffer.put("response.create") + await self.audio_recorder.stream_audio_content_with_callback(callback=callback) # this function is used to stop the processes when ctrl + c is pressed @@ -147,7 +175,7 @@ async def main() -> None: kernel.add_function(plugin_name="weather", function_name="get_weather", function=get_weather) # create the realtime client and register the response created callback - realtime_client = OpenAIRealtime(ai_model_id="gpt-4o-realtime-preview-2024-12-17") + realtime_client = OpenAIRealtimeWebRTC(ai_model_id="gpt-4o-realtime-preview-2024-12-17") realtime_client.register_event_handler("response.created", response_created_callback) # create the speaker and microphone @@ -180,9 +208,14 @@ async def main() -> None: turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8), function_choice_behavior=FunctionChoiceBehavior.Auto(), ) - # start the the speaker and the microphone - with contextlib.suppress(asyncio.CancelledError): - await asyncio.gather(*[speaker.play(chat_history, settings), microphone.record_audio()]) + async with realtime_client: + await realtime_client.update_session(settings=settings, chat_history=chat_history) + await realtime_client.start_listening(settings, chat_history) + await realtime_client.start_sending(input_audio_track=microphone) + # await realtime_client.start_streaming(settings, chat_history, input_audio_track=microphone) + # start the the speaker and the microphone + with contextlib.suppress(asyncio.CancelledError): + await speaker.play(chat_history, settings) if __name__ == "__main__": diff --git a/python/samples/concepts/audio/audio_player_async.py b/python/samples/concepts/audio/audio_player_async.py index a77b8df6e32c..36c1492094a6 100644 --- a/python/samples/concepts/audio/audio_player_async.py +++ b/python/samples/concepts/audio/audio_player_async.py @@ -53,10 +53,10 @@ def reset_frame_count(self): def get_frame_count(self): return 
self._frame_count - def add_data(self, data: bytes): + def add_data(self, data: bytes | np.ndarray): with self.lock: # bytes is pcm16 single channel audio data, convert to numpy array - np_data = np.frombuffer(data, dtype=np.int16) + np_data = np.frombuffer(data, dtype=np.int16) if isinstance(data, bytes) else data self.queue.append(np_data) if not self.playing: self.start() diff --git a/python/samples/concepts/audio/audio_recorder_stream.py b/python/samples/concepts/audio/audio_recorder_stream.py index 55684e9c469b..20c758af3e39 100644 --- a/python/samples/concepts/audio/audio_recorder_stream.py +++ b/python/samples/concepts/audio/audio_recorder_stream.py @@ -2,9 +2,10 @@ import asyncio import base64 -from collections.abc import AsyncGenerator +from collections.abc import AsyncGenerator, Callable from typing import Any, ClassVar, cast +import sounddevice as sd from pydantic import BaseModel from semantic_kernel.contents.audio_content import AudioContent @@ -30,9 +31,25 @@ class AudioRecorderStream(BaseModel): CHUNK_LENGTH_S: ClassVar[float] = 0.05 device_id: int | None = None - async def stream_audio_content(self) -> AsyncGenerator[AudioContent, None]: - import sounddevice as sd # type: ignore + async def stream_audio_content_with_callback(self, callback: Callable[..., Any]) -> None: + stream = sd.InputStream( + channels=self.CHANNELS, + samplerate=self.SAMPLE_RATE, + dtype="int16", + device=self.device_id, + callback=callback, + ) + stream.start() + try: + while True: + await asyncio.sleep(0) + except KeyboardInterrupt: + pass + finally: + stream.stop() + stream.close() + async def stream_audio_content(self) -> AsyncGenerator[AudioContent, None]: # device_info = sd.query_devices() # print(device_info) diff --git a/python/semantic_kernel/connectors/ai/open_ai/__init__.py b/python/semantic_kernel/connectors/ai/open_ai/__init__.py index 27d36ea30d34..2c2a87a64a7b 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/__init__.py +++ 
b/python/semantic_kernel/connectors/ai/open_ai/__init__.py @@ -40,7 +40,7 @@ from semantic_kernel.connectors.ai.open_ai.services.azure_text_to_image import AzureTextToImage from semantic_kernel.connectors.ai.open_ai.services.open_ai_audio_to_text import OpenAIAudioToText from semantic_kernel.connectors.ai.open_ai.services.open_ai_chat_completion import OpenAIChatCompletion -from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime import OpenAIRealtime +from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime import OpenAIRealtime, OpenAIRealtimeWebRTC from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_completion import OpenAITextCompletion from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_embedding import OpenAITextEmbedding from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_to_audio import OpenAITextToAudio @@ -76,6 +76,7 @@ "OpenAIPromptExecutionSettings", "OpenAIRealtime", "OpenAIRealtimeExecutionSettings", + "OpenAIRealtimeWebRTC", "OpenAISettings", "OpenAITextCompletion", "OpenAITextEmbedding", diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py index 23351d7b6176..39c85816ced3 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py @@ -7,7 +7,10 @@ from semantic_kernel.connectors.ai.open_ai.services.open_ai_config_base import OpenAIConfigBase from semantic_kernel.connectors.ai.open_ai.services.open_ai_handler import OpenAIModelTypes -from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime_base import OpenAIRealtimeBase +from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime_base import ( + OpenAIRealtimeBase, + OpenAIRealtimeWebRTCBase, +) from semantic_kernel.connectors.ai.open_ai.settings.open_ai_settings import OpenAISettings from 
semantic_kernel.exceptions.service_exceptions import ServiceInitializationError @@ -64,3 +67,57 @@ def __init__( default_headers=default_headers, client=async_client, ) + + +class OpenAIRealtimeWebRTC(OpenAIRealtimeWebRTCBase, OpenAIConfigBase): + """OpenAI Realtime service.""" + + def __init__( + self, + ai_model_id: str | None = None, + api_key: str | None = None, + org_id: str | None = None, + service_id: str | None = None, + default_headers: Mapping[str, str] | None = None, + async_client: AsyncOpenAI | None = None, + env_file_path: str | None = None, + env_file_encoding: str | None = None, + ) -> None: + """Initialize an OpenAITextCompletion service. + + Args: + ai_model_id (str | None): OpenAI model name, see + https://platform.openai.com/docs/models + service_id (str | None): Service ID tied to the execution settings. + api_key (str | None): The optional API key to use. If provided will override, + the env vars or .env file value. + org_id (str | None): The optional org ID to use. If provided will override, + the env vars or .env file value. + default_headers: The default headers mapping of string keys to + string values for HTTP requests. (Optional) + async_client (Optional[AsyncOpenAI]): An existing client to use. (Optional) + env_file_path (str | None): Use the environment settings file as a fallback to + environment variables. (Optional) + env_file_encoding (str | None): The encoding of the environment settings file. 
(Optional) + """ + try: + openai_settings = OpenAISettings.create( + api_key=api_key, + org_id=org_id, + text_model_id=ai_model_id, + env_file_path=env_file_path, + env_file_encoding=env_file_encoding, + ) + except ValidationError as ex: + raise ServiceInitializationError("Failed to create OpenAI settings.", ex) from ex + if not openai_settings.text_model_id: + raise ServiceInitializationError("The OpenAI text model ID is required.") + super().__init__( + ai_model_id=openai_settings.text_model_id, + service_id=service_id, + api_key=openai_settings.api_key.get_secret_value() if openai_settings.api_key else None, + org_id=openai_settings.org_id, + ai_model_type=OpenAIModelTypes.TEXT, + default_headers=default_headers, + client=async_client, + ) diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py index 64b647f44ee8..f82bce19164f 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py @@ -2,6 +2,7 @@ import asyncio import base64 +import json import logging import sys from collections.abc import AsyncGenerator @@ -14,6 +15,15 @@ else: from typing_extensions import override # pragma: no cover +from aiohttp import ClientSession +from aiortc import ( + MediaStreamTrack, + RTCConfiguration, + RTCDataChannel, + RTCIceServer, + RTCPeerConnection, + RTCSessionDescription, +) from openai.resources.beta.realtime.realtime import AsyncRealtimeConnection from openai.types.beta.realtime.conversation_item_create_event_param import ConversationItemParam from openai.types.beta.realtime.realtime_server_event import RealtimeServerEvent @@ -152,7 +162,7 @@ def register_event_handler( self.event_handlers.setdefault(event_type, []).append(handler) @override - async def event_listener( + async def start_listening( self, settings: 
"PromptExecutionSettings", chat_history: "ChatHistory | None" = None, @@ -186,7 +196,7 @@ async def event_listener( logger.debug(f"Event type: {event_type}, count: {len(self.event_log[event_type])}") @override - async def send_event(self, event: str | SendEvents, **kwargs: Any) -> None: + async def start_sending(self, event: str | SendEvents, **kwargs: Any) -> None: await self.connected.wait() if not self.connection: raise ValueError("Connection is not established.") @@ -299,10 +309,10 @@ async def update_session( self._update_function_choice_settings_callback(), kernel=kwargs.get("kernel"), # type: ignore ) - await self.send_event(SendEvents.SESSION_UPDATE, settings=settings) + await self.start_sending(SendEvents.SESSION_UPDATE, settings=settings) if chat_history and len(chat_history) > 0: await asyncio.gather( - *(self.send_event(SendEvents.CONVERSATION_ITEM_CREATE, item=msg) for msg in chat_history.messages) + *(self.start_sending(SendEvents.CONVERSATION_ITEM_CREATE, item=msg) for msg in chat_history.messages) ) @override @@ -313,6 +323,8 @@ async def close_session(self) -> None: self.connection = None self.connected.clear() + # region Event callbacks + def response_audio_delta_callback( self, event: RealtimeServerEvent, @@ -420,13 +432,392 @@ async def response_function_call_arguments_done_callback( if kernel: chat_history = ChatHistory() await kernel.invoke_function_call(item, chat_history) - await self.send_event(SendEvents.CONVERSATION_ITEM_CREATE, item=chat_history.messages[-1]) + await self.start_sending(SendEvents.CONVERSATION_ITEM_CREATE, item=chat_history.messages[-1]) + # The model doesn't start responding to the tool call automatically, so triggering it here. 
+ await self.start_sending(SendEvents.RESPONSE_CREATE) + return chat_history.messages[-1], False + + # region settings + + @override + def get_prompt_execution_settings_class(self) -> type["PromptExecutionSettings"]: + from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( # noqa + OpenAIRealtimeExecutionSettings, + ) + + return OpenAIRealtimeExecutionSettings + + +@experimental_class +class OpenAIRealtimeWebRTCBase(OpenAIHandler, RealtimeClientBase): + """OpenAI WebRTC Realtime service.""" + + SUPPORTS_FUNCTION_CALLING: ClassVar[bool] = True + peer_connection: RTCPeerConnection | None = None + data_channel: RTCDataChannel | None = None + connection: AsyncRealtimeConnection | None = None + connected: asyncio.Event = Field(default_factory=asyncio.Event) + event_log: dict[str, list[RealtimeServerEvent]] = Field(default_factory=dict) + event_handlers: dict[str, list[EventCallBackProtocol | EventCallBackProtocolAsync]] = Field(default_factory=dict) + + def model_post_init(self, *args, **kwargs) -> None: + """Post init method for the model.""" + # Register the default event handlers + self.register_event_handler( + ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA, self.response_audio_transcript_delta_callback + ) + self.register_event_handler( + ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DONE, self.response_audio_transcript_done_callback + ) + self.register_event_handler( + ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE, self.response_function_call_arguments_delta_callback + ) + self.register_event_handler(ListenEvents.ERROR, self.error_callback) + self.register_event_handler(ListenEvents.SESSION_CREATED, self.session_callback) + self.register_event_handler(ListenEvents.SESSION_UPDATED, self.session_callback) + + def register_event_handler( + self, event_type: str | ListenEvents, handler: EventCallBackProtocol | EventCallBackProtocolAsync + ) -> None: + """Register a event handler.""" + if not isinstance(event_type, 
ListenEvents): + event_type = ListenEvents(event_type) + self.event_handlers.setdefault(event_type, []).append(handler) + + @override + async def start_listening( + self, + settings: "PromptExecutionSettings", + chat_history: "ChatHistory | None" = None, + **kwargs: Any, + ) -> AsyncGenerator[StreamingChatMessageContent, Any]: + ice_servers = [RTCIceServer(urls=["stun:stun.l.google.com:19302"])] + self.peer_connection = RTCPeerConnection(configuration=RTCConfiguration(iceServers=ice_servers)) + + @self.peer_connection.on("track") + async def on_track(track: MediaStreamTrack) -> None: + if track.kind == "audio": + while True: + frame = await track.recv() + await self.output_buffer.put( + ( + ListenEvents.RESPONSE_AUDIO_DELTA, + StreamingChatMessageContent( + role=AuthorRole.ASSISTANT, + items=[AudioContent(data=frame.to_ndarray(), data_format="base64")], + choice_index=0, + inner_content=frame, + ), + ), + ) + + data_channel = self.peer_connection.createDataChannel("oai-events") + + @data_channel.on("message") + async def on_data(data: bytes) -> None: + event = RealtimeServerEvent.model_validate_strings(data) + event_type = ListenEvents(event.type) + self.event_log.setdefault(event_type, []).append(event) + for handler in self.event_handlers.get(event_type, []): + task = handler(event=event, settings=settings) + if not task: + continue + if isawaitable(task): + async_result = await task + if not async_result: + continue + result, should_return = async_result + else: + result, should_return = task + if should_return: + yield result + else: + chat_history.add_message(result) + + offer = await self.peer_connection.createOffer() + await self.peer_connection.setLocalDescription(offer) + + try: + ephemeral_token = await self.get_ephemeral_token() + headers = {"Authorization": f"Bearer {ephemeral_token}", "Content-Type": "application/sdp"} + + async with ( + ClientSession() as session, + session.post( + 
f"{self.client.beta.realtime._client.base_url}/realtime/sessions?model={self.ai_model_id}", + headers=headers, + data=offer.sdp, + ) as response, + ): + if response.status not in [200, 201]: + error_text = await response.text() + raise Exception(f"OpenAI WebRTC error: {error_text}") + + sdp_answer = await response.text() + answer = RTCSessionDescription(sdp=sdp_answer, type="answer") + await self.peer_connection.setRemoteDescription(answer) + + except Exception as e: + logger.error(f"Failed to connect to OpenAI: {e!s}") + raise + + @override + async def start_sending(self, input_audio_track: MediaStreamTrack | None = None, **kwargs: Any) -> None: + if input_audio_track: + if not self.peer_connection: + raise ValueError("Peer connection is not established.") + self.peer_connection.addTransceiver(input_audio_track) + + if not self.data_channel: + raise ValueError("Data channel is not established.") + while True: + item = await self.input_buffer.get() + if not item: + continue + if isinstance(item, tuple): + event, data = item + else: + event = item + data = None + if not isinstance(event, SendEvents): + event = SendEvents(event) + response: dict[str, Any] = { + "type": event, + } + match event: + case SendEvents.SESSION_UPDATE: + if "settings" not in data: + logger.error("Event data does not contain 'settings'") + response["session"] = data["settings"].prepare_settings_dict() + case SendEvents.CONVERSATION_ITEM_CREATE: + if "item" not in data: + logger.error("Event data does not contain 'item'") + return + content = data["item"] + for item in content.items: + match item: + case TextContent(): + response["item"] = ConversationItemParam( + type="message", + content=[ + { + "type": "input_text", + "text": item.text, + } + ], + role="user", + ) + + case FunctionCallContent(): + call_id = item.metadata.get("call_id") + if not call_id: + logger.error("Function call needs to have a call_id") + continue + response["item"] = ConversationItemParam( + type="function_call", + 
name=item.name, + arguments=item.arguments, + call_id=call_id, + ) + + case FunctionResultContent(): + call_id = item.metadata.get("call_id") + if not call_id: + logger.error("Function result needs to have a call_id") + continue + response["item"] = ConversationItemParam( + type="function_call_output", + output=item.result, + call_id=call_id, + ) + + case SendEvents.CONVERSATION_ITEM_TRUNCATE: + if "item_id" not in data: + logger.error("Event data does not contain 'item_id'") + return + response["item_id"] = data["item_id"] + response["content_index"] = 0 + response["audio_end_ms"] = data.get("audio_end_ms", 0) + + case SendEvents.CONVERSATION_ITEM_DELETE: + if "item_id" not in data: + logger.error("Event data does not contain 'item_id'") + return + response["item_id"] = data["item_id"] + case SendEvents.RESPONSE_CREATE: + if "response" in data: + response["response"] = data["response"] + case SendEvents.RESPONSE_CANCEL: + if "response_id" in data: + response["response_id"] = data["response_id"] + + self.data_channel.send(json.dumps(response)) + + @override + async def create_session( + self, + settings: PromptExecutionSettings | None = None, + chat_history: ChatHistory | None = None, + **kwargs: Any, + ) -> None: + """Create a session in the service.""" + if settings or chat_history or kwargs: + await self.update_session(settings=settings, chat_history=chat_history, **kwargs) + + async def get_ephemeral_token(self) -> str: + """Get an ephemeral token from OpenAI.""" + headers = {"Authorization": f"Bearer {self.client.api_key}", "Content-Type": "application/json"} + data = {"model": self.ai_model_id, "voice": "echo"} + + try: + async with ( + ClientSession() as session, + session.post( + f"{self.client.beta.realtime._client.base_url}/realtime/sessions", headers=headers, json=data + ) as response, + ): + if response.status not in [200, 201]: + error_text = await response.text() + raise Exception(f"Failed to get ephemeral token: {error_text}") + + result = await 
response.json() + return result["client_secret"]["value"] + + except Exception as e: + logger.error(f"Failed to get ephemeral token: {e!s}") + raise + + @override + async def update_session( + self, settings: PromptExecutionSettings | None = None, chat_history: ChatHistory | None = None, **kwargs: Any + ) -> None: + if settings: + if "kernel" in kwargs: + settings = prepare_settings_for_function_calling( + settings, + self.get_prompt_execution_settings_class(), + self._update_function_choice_settings_callback(), + kernel=kwargs.get("kernel"), # type: ignore + ) + await self.input_buffer.put((SendEvents.SESSION_UPDATE, {"settings": settings})) + if chat_history and len(chat_history) > 0: + for msg in chat_history.messages: + await self.input_buffer.put((SendEvents.CONVERSATION_ITEM_CREATE, {"item": msg})) + + @override + async def close_session(self) -> None: + """Close the session in the service.""" + if self.peer_connection: + await self.peer_connection.close() + if self.data_channel: + await self.data_channel.close() + self.peer_connection = None + self.data_channel = None + + # region Event callbacks + + def response_audio_transcript_delta_callback( + self, + event: RealtimeServerEvent, + settings: PromptExecutionSettings | None = None, + **kwargs: Any, + ) -> tuple[Any, bool]: + """Handle response audio transcript delta.""" + return StreamingChatMessageContent( + role=AuthorRole.ASSISTANT, + items=[StreamingTextContent(text=event.delta, choice_index=event.content_index)], + choice_index=event.content_index, + inner_content=event, + ), True + + def response_audio_transcript_done_callback( + self, + event: RealtimeServerEvent, + settings: PromptExecutionSettings | None = None, + **kwargs: Any, + ) -> tuple[Any, bool]: + """Handle response audio transcript done.""" + return StreamingChatMessageContent( + role=AuthorRole.ASSISTANT, + items=[StreamingTextContent(text=event.transcript, choice_index=event.content_index)], + choice_index=event.content_index, + 
inner_content=event, + ), False + + def response_function_call_arguments_delta_callback( + self, + event: RealtimeServerEvent, + settings: PromptExecutionSettings | None = None, + **kwargs: Any, + ) -> tuple[Any, bool]: + """Handle response function call arguments delta.""" + return StreamingChatMessageContent( + role=AuthorRole.ASSISTANT, + items=[ + FunctionCallContent( + id=event.item_id, + name=event.call_id, + arguments=event.delta, + index=event.output_index, + metadata={"call_id": event.call_id}, + ) + ], + choice_index=0, + inner_content=event, + ), True + + def error_callback( + self, + event: RealtimeServerEvent, + settings: PromptExecutionSettings | None = None, + **kwargs: Any, + ) -> None: + """Handle error.""" + logger.error("Error received: %s", event.error) + + def session_callback( + self, + event: RealtimeServerEvent, + settings: PromptExecutionSettings | None = None, + **kwargs: Any, + ) -> None: + """Handle session.""" + logger.debug("Session created or updated, session: %s", event.session) + + async def response_function_call_arguments_done_callback( + self, + event: RealtimeServerEvent, + settings: PromptExecutionSettings | None = None, + **kwargs: Any, + ) -> None: + """Handle response function call done.""" + item = FunctionCallContent( + id=event.item_id, + name=event.call_id, + arguments=event.delta, + index=event.output_index, + metadata={"call_id": event.call_id}, + ) + kernel: Kernel | None = kwargs.get("kernel") + call_id = item.name + function_name = next( + output_item_event.item.name + for output_item_event in self.event_log[ListenEvents.RESPONSE_OUTPUT_ITEM_ADDED] + if output_item_event.item.call_id == call_id + ) + item.plugin_name, item.function_name = function_name.split("-", 1) + if kernel: + chat_history = ChatHistory() + await kernel.invoke_function_call(item, chat_history) + await self.input_buffer.put((SendEvents.CONVERSATION_ITEM_CREATE, {"item": chat_history.messages[-1]})) # The model doesn't start responding to the tool 
call automatically, so triggering it here. - await self.send_event(SendEvents.RESPONSE_CREATE) + await self.input_buffer.put(SendEvents.RESPONSE_CREATE) return chat_history.messages[-1], False + # region settings + + @override def get_prompt_execution_settings_class(self) -> type["PromptExecutionSettings"]: - """Get the request settings class.""" from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( # noqa OpenAIRealtimeExecutionSettings, ) diff --git a/python/semantic_kernel/connectors/ai/realtime_client_base.py b/python/semantic_kernel/connectors/ai/realtime_client_base.py index ebdd4eed3739..c9a48f9d45b0 100644 --- a/python/semantic_kernel/connectors/ai/realtime_client_base.py +++ b/python/semantic_kernel/connectors/ai/realtime_client_base.py @@ -1,40 +1,27 @@ # Copyright (c) Microsoft. All rights reserved. +import sys from abc import ABC, abstractmethod -from collections.abc import AsyncGenerator, Callable +from asyncio import Queue +from collections.abc import Callable from typing import TYPE_CHECKING, Any, ClassVar +from pydantic import Field + +if sys.version_info >= (3, 11): + from asyncio import TaskGroup +else: + from taskgroup import TaskGroup + from semantic_kernel.connectors.ai.function_call_choice_configuration import FunctionCallChoiceConfiguration from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceType +from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent from semantic_kernel.services.ai_service_client_base import AIServiceClientBase from semantic_kernel.utils.experimental_decorator import experimental_class if TYPE_CHECKING: from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings from semantic_kernel.contents.chat_history import ChatHistory - from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent - -#### -# TODO (eavanvalkenburg): Move to ADR 
-# Receiving: -# Option 1: Events and Contents split -# - content received through main receive_content method -# - events received through event callback handlers -# Option 2: Everything is Content -# - content (events as new Content Type) received through main receive_content method -# Option 3: Everything is Event (current) -# - receive_content method is removed -# - events received through main listen method -# - default event handlers added for things like errors and function calling -# - built-in vs custom event handling - separate or not? -# Sending: -# Option 1: Events and Contents split -# - send_content and send_event -# Option 2: Everything is Content -# - single method needed, with EventContent type support -# Option 3: Everything is Event (current) -# - send_event method only, Content is part of event data -#### @experimental_class @@ -42,6 +29,8 @@ class RealtimeClientBase(AIServiceClientBase, ABC): """Base class for a realtime client.""" SUPPORTS_FUNCTION_CALLING: ClassVar[bool] = False + input_buffer: Queue[tuple[str, dict[str, Any]] | str] = Field(default_factory=Queue) + output_buffer: Queue[tuple[str, StreamingChatMessageContent]] = Field(default_factory=Queue) async def __aenter__(self) -> "RealtimeClientBase": """Enter the context manager. @@ -94,41 +83,50 @@ async def update_session( """ raise NotImplementedError - @abstractmethod - async def event_listener( + async def start_streaming( self, settings: "PromptExecutionSettings | None" = None, chat_history: "ChatHistory | None" = None, **kwargs: Any, - ) -> AsyncGenerator["StreamingChatMessageContent", Any]: - """Get text contents from audio. + ) -> None: + """Start streaming, will start both listening and sending. + + This method, start tasks for both listening and sending. + + The arguments are passed to the start_listening method. Args: settings: Prompt execution settings. chat_history: Chat history. kwargs: Additional arguments. 
- - Yields: - StreamingChatMessageContent messages """ - raise NotImplementedError + async with TaskGroup() as tg: + tg.create_task(self.start_listening(settings=settings, chat_history=chat_history, **kwargs)) + tg.create_task(self.start_sending(**kwargs)) @abstractmethod - async def send_event( + async def start_listening( self, - event: str, - event_data: dict[str, Any] | None = None, + settings: "PromptExecutionSettings | None" = None, + chat_history: "ChatHistory | None" = None, **kwargs: Any, ) -> None: - """Send an event to the session. + """Starts listening for messages from the service, adds them to the output_buffer. Args: - event: Event name, can be a string or a Enum value. - event_data: Event data. + settings: Prompt execution settings. + chat_history: Chat history. kwargs: Additional arguments. """ raise NotImplementedError + @abstractmethod + async def start_sending( + self, + ) -> None: + """Start sending items from the input_buffer to the service.""" + raise NotImplementedError + def _update_function_choice_settings_callback( self, ) -> Callable[[FunctionCallChoiceConfiguration, "PromptExecutionSettings", FunctionChoiceType], None]: diff --git a/python/semantic_kernel/contents/audio_content.py b/python/semantic_kernel/contents/audio_content.py index ed32a7d0f595..45d37c622e1a 100644 --- a/python/semantic_kernel/contents/audio_content.py +++ b/python/semantic_kernel/contents/audio_content.py @@ -86,3 +86,8 @@ def from_audio_file(cls: type[_T], path: str) -> "AudioContent": def to_dict(self) -> dict[str, Any]: """Convert the instance to a dictionary.""" return {"type": "audio_url", "audio_url": {"uri": str(self)}} + + @classmethod + def from_nd_array(cls: type[_T], data: ndarray, mime_type: str) -> "AudioContent": + """Create an instance from an nd array.""" + return cls(data=data, mime_type=mime_type) diff --git a/python/semantic_kernel/contents/binary_content.py b/python/semantic_kernel/contents/binary_content.py index 1a0a4850569f..85fbf4e38cb5 
100644 --- a/python/semantic_kernel/contents/binary_content.py +++ b/python/semantic_kernel/contents/binary_content.py @@ -121,7 +121,7 @@ def data_uri(self, value: str): self.metadata.update(self._data_uri.parameters) @property - def data(self) -> bytes: + def data(self) -> bytes | ndarray: """Get the data.""" if self._data_uri and self._data_uri.data_array: return self._data_uri.data_array.tobytes() diff --git a/python/semantic_kernel/contents/utils/data_uri.py b/python/semantic_kernel/contents/utils/data_uri.py index 1695491e9110..03e75410d5e3 100644 --- a/python/semantic_kernel/contents/utils/data_uri.py +++ b/python/semantic_kernel/contents/utils/data_uri.py @@ -152,6 +152,8 @@ def from_data_uri(cls: type[_T], data_uri: str | Url, default_mime_type: str = " def to_string(self, metadata: dict[str, str] = {}) -> str: """Return the data uri as a string.""" + if self.data_array: + data_str = self.data_array.tobytes().decode("utf-8") parameters = ";".join([f"{key}={val}" for key, val in metadata.items()]) parameters = f";{parameters}" if parameters else "" data_format = f"{self.data_format}" if self.data_format else "" diff --git a/python/uv.lock b/python/uv.lock index 7b452ae40cb3..d48f48d9d964 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -486,7 +486,7 @@ name = "build" version = "1.2.2.post1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "os_name == 'nt' and sys_platform == 'win32'" }, + { name = "colorama", marker = "(os_name == 'nt' and sys_platform == 'darwin') or (os_name == 'nt' and sys_platform == 'linux') or (os_name == 'nt' and sys_platform == 'win32')" }, { name = "importlib-metadata", marker = "(python_full_version < '3.10.2' and sys_platform == 'darwin') or (python_full_version < '3.10.2' and sys_platform == 'linux') or (python_full_version < '3.10.2' and sys_platform == 'win32')" }, { name = "packaging", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" 
}, { name = "pyproject-hooks", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, @@ -729,7 +729,7 @@ name = "click" version = "8.1.8" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "colorama", marker = "(platform_system == 'Windows' and sys_platform == 'darwin') or (platform_system == 'Windows' and sys_platform == 'linux') or (platform_system == 'Windows' and sys_platform == 'win32')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/b9/2e/0090cbf739cee7d23781ad4b89a9894a41538e4fcf4c31dcdd705b78eb8b/click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a", size = 226593 } wheels = [ @@ -1917,7 +1917,7 @@ name = "ipykernel" version = "6.29.5" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "appnope", marker = "sys_platform == 'darwin'" }, + { name = "appnope", marker = "(platform_system == 'Darwin' and sys_platform == 'darwin') or (platform_system == 'Darwin' and sys_platform == 'linux') or (platform_system == 'Darwin' and sys_platform == 'win32')" }, { name = "comm", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "debugpy", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "ipython", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, @@ -2794,7 +2794,7 @@ name = "nvidia-cudnn-cu12" version = "9.1.0.70" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cublas-cu12", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] wheels = [ { url = 
"https://files.pythonhosted.org/packages/9f/fd/713452cd72343f682b1c7b9321e23829f00b842ceaedcda96e742ea0b0b3/nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:165764f44ef8c61fcdfdfdbe769d687e06374059fbb388b6c89ecb0e28793a6f", size = 664752741 }, @@ -2805,7 +2805,7 @@ name = "nvidia-cufft-cu12" version = "11.2.1.3" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/27/94/3266821f65b92b3138631e9c8e7fe1fb513804ac934485a8d05776e1dd43/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9", size = 211459117 }, @@ -2824,9 +2824,9 @@ name = "nvidia-cusolver-cu12" version = "11.6.1.9" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux'" }, - { name = "nvidia-cusparse-cu12", marker = "sys_platform == 'linux'" }, - { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cublas-cu12", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "nvidia-cusparse-cu12", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/3a/e1/5b9089a4b2a4790dfdea8b3a006052cfecff58139d5a4e34cb1a51df8d6f/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260", size = 127936057 }, @@ -2837,7 +2837,7 @@ name = "nvidia-cusparse-cu12" 
version = "12.3.1.170" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/db/f7/97a9ea26ed4bbbfc2d470994b8b4f338ef663be97b8f677519ac195e113d/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1", size = 207454763 }, @@ -3469,7 +3469,7 @@ name = "portalocker" version = "2.10.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pywin32", marker = "sys_platform == 'win32'" }, + { name = "pywin32", marker = "(platform_system == 'Windows' and sys_platform == 'darwin') or (platform_system == 'Windows' and sys_platform == 'linux') or (platform_system == 'Windows' and sys_platform == 'win32')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/ed/d3/c6c64067759e87af98cc668c1cc75171347d0f1577fab7ca3749134e3cd4/portalocker-2.10.1.tar.gz", hash = "sha256:ef1bf844e878ab08aee7e40184156e1151f228f103aa5c6bd0724cc330960f8f", size = 40891 } wheels = [ @@ -4926,7 +4926,7 @@ hugging-face = [ { name = "transformers", extra = ["torch"], marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] milvus = [ - { name = "milvus", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, + { name = "milvus", marker = "(platform_system != 'Windows' and sys_platform == 'darwin') or (platform_system != 'Windows' and sys_platform == 'linux') or (platform_system != 'Windows' and sys_platform == 'win32')" }, { name = "pymilvus", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] mistralai = [ @@ -5013,7 +5013,7 @@ requires-dist = [ { name = "google-generativeai", marker = "extra == 'google'", specifier 
= "~=0.8" }, { name = "ipykernel", marker = "extra == 'notebooks'", specifier = "~=6.29" }, { name = "jinja2", specifier = "~=3.1" }, - { name = "milvus", marker = "sys_platform != 'win32' and extra == 'milvus'", specifier = ">=2.3,<2.3.8" }, + { name = "milvus", marker = "platform_system != 'Windows' and extra == 'milvus'", specifier = ">=2.3,<2.3.8" }, { name = "mistralai", marker = "extra == 'mistralai'", specifier = ">=1.2,<2.0" }, { name = "motor", marker = "extra == 'mongo'", specifier = ">=3.3.2,<3.8.0" }, { name = "nest-asyncio", specifier = "~=1.6" }, @@ -5300,6 +5300,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252 }, ] +[[package]] +name = "taskgroup" +version = "0.2.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "exceptiongroup", marker = "(python_full_version < '3.11' and sys_platform == 'darwin') or (python_full_version < '3.11' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform == 'win32')" }, + { name = "typing-extensions", marker = "(python_full_version < '3.11' and sys_platform == 'darwin') or (python_full_version < '3.11' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform == 'win32')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f0/8d/e218e0160cc1b692e6e0e5ba34e8865dbb171efeb5fc9a704544b3020605/taskgroup-0.2.2.tar.gz", hash = "sha256:078483ac3e78f2e3f973e2edbf6941374fbea81b9c5d0a96f51d297717f4752d", size = 11504 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/b1/74babcc824a57904e919f3af16d86c08b524c0691504baf038ef2d7f655c/taskgroup-0.2.2-py2.py3-none-any.whl", hash = "sha256:e2c53121609f4ae97303e9ea1524304b4de6faf9eb2c9280c7f87976479a52fb", size = 14237 }, +] + [[package]] name = "tenacity" version = "9.0.0" 
@@ -5508,7 +5521,7 @@ name = "tqdm" version = "4.67.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "colorama", marker = "(platform_system == 'Windows' and sys_platform == 'darwin') or (platform_system == 'Windows' and sys_platform == 'linux') or (platform_system == 'Windows' and sys_platform == 'win32')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737 } wheels = [ From 6faf93f4c0e1d5ea6ca009a81710401f5c936b24 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Thu, 16 Jan 2025 10:07:51 +0100 Subject: [PATCH 08/50] updated ADR --- docs/decisions/00XX-realtime-api-clients.md | 122 +++++++++++++------- 1 file changed, 81 insertions(+), 41 deletions(-) diff --git a/docs/decisions/00XX-realtime-api-clients.md b/docs/decisions/00XX-realtime-api-clients.md index 1b0bbd2d6c52..96570b389de1 100644 --- a/docs/decisions/00XX-realtime-api-clients.md +++ b/docs/decisions/00XX-realtime-api-clients.md @@ -79,82 +79,122 @@ Server side events: ## Decision Drivers - - Simple programming model that is likely able to handle future realtime api's and evolution of the existing ones. -- Support for the most common scenario's and content, extensible for the rest. -- Natively integrated with Semantic Kernel especially for content types and function calling. -- Support multiple types of connections, like websocket and WebRTC - +- Whenever possible we transform incoming content into Semantic Kernel content, but surface everything, so it's extensible +- Protocol agnostic, should be able to use different types of protocols under the covers, like websocket and WebRTC, without changing the client code (unless the protocol requires it). 
+ ## Decision driver questions - For WebRTC, a audio device can be passed, should this be a requirement for the client also for websockets? -## Considered Options +There are multiple areas where we need to make decisions, these are: +- Content and Events +- Programming model +- Audio speaker/microphone handling +# Content and Events + +## Considered Options - Content and Events Both the sending and receiving side of these integrations need to decide how to deal with the api's. -- Treat content events separate from control events -- Treat everything as content items -- Treat everything as events +1. Treat content events separate from control events +1. Treat everything as content items +1. Treat everything as events -### Treat content events separate from control events +### 1. Treat content events separate from control events This would mean there are two mechanisms in the clients, one deals with content, and one with control events. - Pro: - strongly typed responses for known content - easy to use as the main interactions are clear with familiar SK content types, the rest goes through a separate mechanism - - this might fit better with something like WebRTC that has distinct channels for audio and video vs a data stream for all other events - Con: - new content support requires updates in the codebase and can be considered breaking (potentially sending additional types back) - additional complexity in dealing with two streams of data -### Treat everything as content items +### 2. Treat everything as content items +This would mean that all events are turned into Semantic Kernel content items, and would also mean that we need to define additional content types for the control events. +- Pro: + - everything is a content item, so it's easy to deal with +- Con: + - overkill for simple control events -## Decision Outcome +### 3. Treat everything as events +This would mean that all events are retained and returned to the developer as is, without any transformation. 
-Chosen option: "{title of option 1}", because -{justification. e.g., only option, which meets k.o. criterion decision driver | which resolves force {force} | … | comes out best (see below)}. +- Pro: + - no transformation needed + - easy to maintain +- Con: + - nothing easing the burden on the developer, they need to deal with the raw events + - no way to easily switch between one provider and another - +## Decision Outcome - Content and Events -### Consequences +Chosen option: ... -- Good, because {positive consequence, e.g., improvement of one or more desired qualities, …} -- Bad, because {negative consequence, e.g., compromising one or more desired qualities, …} -- … +# Programming model - +## Considered Options - Programming model +The programming model for the clients needs to be simple and easy to use, while also being able to handle the complexity of the realtime api's. -## Validation +_In this section we will refer to events for both content and events, regardless of the decision made in the previous section._ -{describe how the implementation of/compliance with the ADR is validated. E.g., by a review or an ArchUnit test} +1. Async generator for receiving events, that yields contents, combined with an event handler/callback mechanism for receiving events and a function for sending events + - 1a: Single event handlers, where each event is passed to the handler + - 1b: Multiple event handlers, where each event type has its own handler +2. Event buffers/queues that are exposed to the developer, start sending and start receiving methods, that just initiate the sending and receiving of events and thereby the filling of the buffers - +### 1. Async generator for receiving events, that yields contents, combined with an event handler/callback mechanism for receiving events and a function for sending events +This would mean that the client would have a mechanism to register event handlers, and the integration would call these handlers when an event is received.
For sending events, a function would be created that sends the event to the service. -## Pros and Cons of the Options +- Pro: + - without any additional setup you get content back, just as with "regular" chat models + - event handlers are mostly for more complex interactions, so ok to be slightly more complex +- Con: + - developer judgement needs to be made (or exposed with parameters) on what is returned through the async generator and what is passed to the event handlers -### {title of option 1} +### 2. Event buffers/queues that are exposed to the developer, start sending and start receiving methods, that just initiate the sending and receiving of events and thereby the filling of the buffers +This would mean that the there are two queues, one for sending and one for receiving, and the developer can listen to the receiving queue and send to the sending queue. Internal things like auto-function calling can listen in on the same queue and act on it, and put a message back on the sending queue with ease. - +- Pro: + - simple to use, just start sending and start receiving + - easy to understand, as queues are a well known concept + - developers can just skip events they are not interested in +- Con: + - potentially causes audio delays because of the queueing mechanism + +## Decision Outcome - Programming model + +Chosen option: ... + +# Audio speaker/microphone handling + +## Considered Options - Audio speaker/microphone handling -{example | description | pointer to more information | …} +1. Create abstraction in SK for audio handlers, that can be passed into the realtime client to record and play audio +2. Send and receive AudioContent (wrapped in StreamingChatMessageContent) to the client, and let the client handle the audio recording and playing -- Good, because {argument a} -- Good, because {argument b} - -- Neutral, because {argument c} -- Bad, because {argument d} -- … +### 1. 
Create abstraction in SK for audio handlers, that can be passed into the realtime client to record and play audio +This would mean that the client would have a mechanism to register audio handlers, and the integration would call these handlers when audio is received or needs to be sent. An additional abstraction for this would have to be created in Semantic Kernel (or potentially taken from a standard). -### {title of other option} +- Pro: + - simple/local audio handlers can be shipped with SK making it easy to use + - extensible by third parties to integrate into other systems (like Azure Communication Services) + - could mitigate buffer issues by prioritizing audio content being sent to the handlers +- Con: + - extra code in SK that needs to be maintained, potentially relying on third-party code + +### 2. Send and receive AudioContent (wrapped in StreamingChatMessageContent) to the client, and let the client handle the audio recording and playing +This would mean that the client would receive AudioContent items, and would have to deal with them itself, including recording and playing the audio. + +- Pro: + - no extra code in SK that needs to be maintained +- Con: + - extra burden on the developer to deal with the audio -{example | description | pointer to more information | …} +## Decision Outcome - Audio speaker/microphone handling -- Good, because {argument a} -- Good, because {argument b} -- Neutral, because {argument c} -- Bad, because {argument d} -- … +Chosen option: ... From 43bc2f3448bbf7a78880d585089c0d1aaaba0d9f Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Fri, 17 Jan 2025 14:41:29 +0100 Subject: [PATCH 09/50] webrtc working!
--- docs/decisions/00XX-realtime-api-clients.md | 4 +- .../audio/04-chat_with_realtime_api.py | 167 +++---- .../concepts/audio/audio_player_async.py | 75 --- .../concepts/audio/audio_recorder_stream.py | 77 --- .../open_ai/services/open_ai_config_base.py | 5 +- .../ai/open_ai/services/open_ai_realtime.py | 10 +- .../open_ai/services/open_ai_realtime_base.py | 455 +++++++++--------- .../ai/open_ai/settings/open_ai_settings.py | 4 + .../connectors/ai/realtime_client_base.py | 5 +- .../connectors/ai/realtime_helpers.py | 190 ++++++++ .../semantic_kernel/contents/audio_content.py | 4 +- python/uv.lock | 26 +- 12 files changed, 507 insertions(+), 515 deletions(-) delete mode 100644 python/samples/concepts/audio/audio_player_async.py delete mode 100644 python/samples/concepts/audio/audio_recorder_stream.py create mode 100644 python/semantic_kernel/connectors/ai/realtime_helpers.py diff --git a/docs/decisions/00XX-realtime-api-clients.md b/docs/decisions/00XX-realtime-api-clients.md index 96570b389de1..6fcf0972aea2 100644 --- a/docs/decisions/00XX-realtime-api-clients.md +++ b/docs/decisions/00XX-realtime-api-clients.md @@ -12,7 +12,7 @@ informed: ## Context and Problem Statement -Multiple model providers are starting to enable realtime voice-to-voice communication with their models, this includes OpenAI with their [Realtime API](https://openai.com/index/introducing-the-realtime-api/) and [Google Gemini](https://ai.google.dev/api/multimodal-live). These API's promise some very interesting new ways of using LLM's in different settings, which we want to enable with Semantic Kernel. The key addition that Semantic Kernel brings into this system is the ability to (re)use Semantic Kernel function as tools with these API's. 
+Multiple model providers are starting to enable realtime voice-to-voice communication with their models, this includes OpenAI with their [Realtime API](https://openai.com/index/introducing-the-realtime-api/) and [Google Gemini](https://ai.google.dev/api/multimodal-live). These API's promise some very interesting new ways of using LLM's in different settings, which we want to enable with Semantic Kernel. The key addition that Semantic Kernel brings into this system is the ability to (re)use Semantic Kernel functions as tools with these API's. There are also options for Google to use video and images as input, so really it is multimodal, but for now we are focusing on the voice-to-voice part, while keeping in mind that video is coming. The way these API's work at this time is through either Websockets or WebRTC. @@ -154,7 +154,7 @@ This would mean that the client would have a mechanism to register event handler - developer judgement needs to be made (or exposed with parameters) on what is returned through the async generator and what is passed to the event handlers ### 2. Event buffers/queues that are exposed to the developer, start sending and start receiving methods, that just initiate the sending and receiving of events and thereby the filling of the buffers -This would mean that the there are two queues, one for sending and one for receiving, and the developer can listen to the receiving queue and send to the sending queue. Internal things like auto-function calling can listen in on the same queue and act on it, and put a message back on the sending queue with ease. +This would mean that there are two queues, one for sending and one for receiving, and the developer can listen to the receiving queue and send to the sending queue.
Internal things like parsing events to content types and auto-function calling are processed first, and the result is put in the queue, the content type should use inner_content to capture the full event and these might add a message to the send queue as well. - Pro: - simple to use, just start sending and start receiving diff --git a/python/samples/concepts/audio/04-chat_with_realtime_api.py b/python/samples/concepts/audio/04-chat_with_realtime_api.py index 40f16a2c0a24..0f895f7dc9dc 100644 --- a/python/samples/concepts/audio/04-chat_with_realtime_api.py +++ b/python/samples/concepts/audio/04-chat_with_realtime_api.py @@ -1,17 +1,11 @@ # Copyright (c) Microsoft. All rights reserved. import asyncio -import contextlib import logging import signal -from typing import Any +from random import randint -import numpy as np -from aiortc.mediastreams import MediaStreamError, MediaStreamTrack -from av import AudioFrame -from openai.types.beta.realtime.realtime_server_event import RealtimeServerEvent +import sounddevice as sd -from samples.concepts.audio.audio_player_async import AudioPlayerAsync -from samples.concepts.audio.audio_recorder_stream import AudioRecorderStream from semantic_kernel import Kernel from semantic_kernel.connectors.ai import FunctionChoiceBehavior from semantic_kernel.connectors.ai.open_ai import ( @@ -19,12 +13,18 @@ OpenAIRealtimeWebRTC, TurnDetection, ) -from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings +from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime_base import ListenEvents from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase -from semantic_kernel.contents import AudioContent, ChatHistory, StreamingTextContent +from semantic_kernel.connectors.ai.realtime_helpers import SKSimplePlayer +from semantic_kernel.contents import ChatHistory +from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent from 
semantic_kernel.functions import kernel_function logging.basicConfig(level=logging.WARNING) +aiortc_log = logging.getLogger("aiortc") +aiortc_log.setLevel(logging.WARNING) +aioice_log = logging.getLogger("aioice") +aioice_log.setLevel(logging.WARNING) logger = logging.getLogger(__name__) # This simple sample demonstrates how to use the OpenAI Realtime API to create @@ -34,7 +34,8 @@ # - pyaudio # - sounddevice # - pydub -# e.g. pip install semantic-kernel[openai_realtime] pyaudio sounddevice pydub +# - aiortc +# e.g. pip install pyaudio sounddevice pydub # The characterics of your speaker and microphone are a big factor in a smooth conversation # so you may need to try out different devices for each. @@ -45,124 +46,66 @@ def check_audio_devices(): - import sounddevice as sd # type: ignore - - print(sd.query_devices()) + logger.info(sd.query_devices()) check_audio_devices() -class Speaker: - """This is a simple class that opens the session with the realtime api and plays the audio response. +class ReceivingStreamHandler: + """This is a simple class that listens to the received buffer of the RealtimeClientBase. + + It can be used to play audio and print the transcript of the conversation. - At the same time it prints the transcript of the conversation to the console. + It can also be used to act on other events from the service. 
""" - def __init__(self, audio_player: AudioPlayerAsync, realtime_client: RealtimeClientBase, kernel: Kernel): + def __init__(self, realtime_client: RealtimeClientBase, audio_player: SKSimplePlayer | None = None): self.audio_player = audio_player self.realtime_client = realtime_client - self.kernel = kernel - async def play( + async def listen( self, - chat_history: ChatHistory, - settings: OpenAIRealtimeExecutionSettings, + play_audio: bool = True, print_transcript: bool = True, ) -> None: - # reset the frame count for the audio player - self.audio_player.reset_frame_count() # print the start message of the transcript if print_transcript: print("Mosscap (transcript): ", end="") try: # start listening for events while True: - _, content = await self.realtime_client.output_buffer.get() - if not content: - continue - # the contents returned should be StreamingChatMessageContent - # so we will loop through the items within it. - for item in content.items: - match item: - case StreamingTextContent(): - if print_transcript: - print(item.text, end="") - await asyncio.sleep(0.01) - continue - case AudioContent(): - self.audio_player.add_data(item.data) - await asyncio.sleep(0.01) - continue + event_type, event = await self.realtime_client.receive_buffer.get() + match event_type: + case ListenEvents.RESPONSE_AUDIO_DELTA: + if play_audio and self.audio_player and isinstance(event, StreamingChatMessageContent): + await self.audio_player.add_audio(event.items[0]) + case ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA: + if print_transcript and isinstance(event, StreamingChatMessageContent): + print(event.content, end="") + case ListenEvents.RESPONSE_CREATED: + if print_transcript: + print("") + await asyncio.sleep(0.01) except asyncio.CancelledError: print("\nThanks for talking to Mosscap!") -class Microphone(MediaStreamTrack): - """This is a simple class that opens the microphone and sends the audio to the realtime api.""" - - kind = "audio" - - def __init__(self, 
audio_recorder: AudioRecorderStream, realtime_client: RealtimeClientBase): - self.audio_recorder = audio_recorder - self.realtime_client = realtime_client - self.queue = asyncio.Queue() - self.loop = asyncio.get_running_loop() - self._pts = 0 - - async def recv(self) -> Any: - # start the audio recording - try: - return await self.queue.get() - except Exception as e: - logger.error(f"Error receiving audio frame: {str(e)}") - raise MediaStreamError("Failed to receive audio frame") - - async def record_audio(self): - def callback(indata, frames, time, status): - if status: - logger.warning(f"Audio input status: {status}") - audio_data = indata.copy() - - if audio_data.dtype != np.int16: - audio_data = (audio_data * 32767).astype(np.int16) - - # Create AudioFrame with incrementing pts - frame = AudioFrame( - samples=len(audio_data), - layout="mono", - format="s16", # 16-bit signed integer - ) - frame.rate = 48000 - frame.pts = self._pts - self._pts += len(audio_data) # Increment pts by frame size - - frame.planes[0].update(audio_data.tobytes()) - - asyncio.run_coroutine_threadsafe(self.queue.put(frame), self.loop) - - await self.realtime_client.input_buffer.put("response.create") - await self.audio_recorder.stream_audio_content_with_callback(callback=callback) - - # this function is used to stop the processes when ctrl + c is pressed def signal_handler(): for task in asyncio.all_tasks(): task.cancel() +weather_conditions = ["sunny", "hot", "cloudy", "raining", "freezing", "snowing"] + + @kernel_function def get_weather(location: str) -> str: """Get the weather for a location.""" - logger.debug(f"Getting weather for {location}") - return f"The weather in {location} is sunny." 
- - -def response_created_callback( - event: RealtimeServerEvent, settings: PromptExecutionSettings | None = None, **kwargs: Any -) -> None: - """Add a empty print to start a new line for a new response.""" - print("") + weather = weather_conditions[randint(0, len(weather_conditions))] # nosec + logger.warning(f"Getting weather for {location}: {weather}") + return f"The weather in {location} is {weather}." async def main() -> None: @@ -174,20 +117,20 @@ async def main() -> None: kernel = Kernel() kernel.add_function(plugin_name="weather", function_name="get_weather", function=get_weather) - # create the realtime client and register the response created callback - realtime_client = OpenAIRealtimeWebRTC(ai_model_id="gpt-4o-realtime-preview-2024-12-17") - realtime_client.register_event_handler("response.created", response_created_callback) + # create the realtime client and optionally add the audio output function, this is optional + audio_player = SKSimplePlayer() + realtime_client = OpenAIRealtimeWebRTC(audio_output=audio_player.realtime_client_callback) - # create the speaker and microphone - speaker = Speaker(AudioPlayerAsync(device_id=None), realtime_client, kernel) - microphone = Microphone(AudioRecorderStream(device_id=None), realtime_client) + # create stream receiver, this can play the audio, if the audio_player is passed + # and allows you to print the transcript of the conversation + # and review or act on other events from the service + stream_handler = ReceivingStreamHandler(realtime_client) # SimplePlayer(device_id=None) # Create the settings for the session # the key thing to decide on is to enable the server_vad turn detection # if turn is turned off (by setting turn_detection=None), you will have to send # the "input_audio_buffer.commit" and "response.create" event to the realtime api # to signal the end of the user's turn and start the response. 
- # The realtime api, does not use a system message, but takes instructions as a parameter for a session instructions = """ You are a chat bot. Your name is Mosscap and @@ -197,7 +140,7 @@ async def main() -> None: effectively, but you tend to answer with long flowery prose. """ - # but we can add a chat history to conversation after starting it + # and we can add a chat history to conversation after starting it chat_history = ChatHistory() chat_history.add_user_message("Hi there, who are you?") chat_history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.") @@ -208,14 +151,14 @@ async def main() -> None: turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8), function_choice_behavior=FunctionChoiceBehavior.Auto(), ) - async with realtime_client: - await realtime_client.update_session(settings=settings, chat_history=chat_history) - await realtime_client.start_listening(settings, chat_history) - await realtime_client.start_sending(input_audio_track=microphone) - # await realtime_client.start_streaming(settings, chat_history, input_audio_track=microphone) - # start the the speaker and the microphone - with contextlib.suppress(asyncio.CancelledError): - await speaker.play(chat_history, settings) + # the context manager calls the create_session method on the client and start listening to the audio stream + async with realtime_client, audio_player: + await realtime_client.update_session( + settings=settings, chat_history=chat_history, kernel=kernel, create_response=True + ) + async with asyncio.TaskGroup() as tg: + tg.create_task(realtime_client.start_streaming()) + tg.create_task(stream_handler.listen()) if __name__ == "__main__": diff --git a/python/samples/concepts/audio/audio_player_async.py b/python/samples/concepts/audio/audio_player_async.py deleted file mode 100644 index 36c1492094a6..000000000000 --- a/python/samples/concepts/audio/audio_player_async.py +++ 
/dev/null @@ -1,75 +0,0 @@ -# Copyright (c) Microsoft. All rights reserved. - -import threading - -import numpy as np -import pyaudio -import sounddevice as sd - -CHUNK_LENGTH_S = 0.05 # 100ms -SAMPLE_RATE = 24000 -FORMAT = pyaudio.paInt16 -CHANNELS = 1 - - -class AudioPlayerAsync: - def __init__(self, device_id: int | None = None): - self.queue = [] - self.lock = threading.Lock() - self.stream = sd.OutputStream( - callback=self.callback, - samplerate=SAMPLE_RATE, - channels=CHANNELS, - dtype=np.int16, - blocksize=int(CHUNK_LENGTH_S * SAMPLE_RATE), - device=device_id, - ) - self.playing = False - self._frame_count = 0 - - def callback(self, outdata, frames, time, status): # noqa - with self.lock: - data = np.empty(0, dtype=np.int16) - - # get next item from queue if there is still space in the buffer - while len(data) < frames and len(self.queue) > 0: - item = self.queue.pop(0) - frames_needed = frames - len(data) - data = np.concatenate((data, item[:frames_needed])) - if len(item) > frames_needed: - self.queue.insert(0, item[frames_needed:]) - - self._frame_count += len(data) - - # fill the rest of the frames with zeros if there is no more data - if len(data) < frames: - data = np.concatenate((data, np.zeros(frames - len(data), dtype=np.int16))) - - outdata[:] = data.reshape(-1, 1) - - def reset_frame_count(self): - self._frame_count = 0 - - def get_frame_count(self): - return self._frame_count - - def add_data(self, data: bytes | np.ndarray): - with self.lock: - # bytes is pcm16 single channel audio data, convert to numpy array - np_data = np.frombuffer(data, dtype=np.int16) if isinstance(data, bytes) else data - self.queue.append(np_data) - if not self.playing: - self.start() - - def start(self): - self.playing = True - self.stream.start() - - def stop(self): - self.playing = False - self.stream.stop() - with self.lock: - self.queue = [] - - def terminate(self): - self.stream.close() diff --git a/python/samples/concepts/audio/audio_recorder_stream.py 
b/python/samples/concepts/audio/audio_recorder_stream.py deleted file mode 100644 index 20c758af3e39..000000000000 --- a/python/samples/concepts/audio/audio_recorder_stream.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) Microsoft. All rights reserved. - -import asyncio -import base64 -from collections.abc import AsyncGenerator, Callable -from typing import Any, ClassVar, cast - -import sounddevice as sd -from pydantic import BaseModel - -from semantic_kernel.contents.audio_content import AudioContent - - -class AudioRecorderStream(BaseModel): - """A class to record audio from the microphone and save it to a WAV file. - - To start recording, press the spacebar. To stop recording, release the spacebar. - - To use as a context manager, that automatically removes the output file after exiting the context: - ``` - with AudioRecorder(output_filepath="output.wav") as recorder: - recorder.start_recording() - # Do something with the recorded audio - ... - ``` - """ - - # Audio recording parameters - CHANNELS: ClassVar[int] = 1 - SAMPLE_RATE: ClassVar[int] = 24000 - CHUNK_LENGTH_S: ClassVar[float] = 0.05 - device_id: int | None = None - - async def stream_audio_content_with_callback(self, callback: Callable[..., Any]) -> None: - stream = sd.InputStream( - channels=self.CHANNELS, - samplerate=self.SAMPLE_RATE, - dtype="int16", - device=self.device_id, - callback=callback, - ) - stream.start() - try: - while True: - await asyncio.sleep(0) - except KeyboardInterrupt: - pass - finally: - stream.stop() - stream.close() - - async def stream_audio_content(self) -> AsyncGenerator[AudioContent, None]: - # device_info = sd.query_devices() - # print(device_info) - - read_size = int(self.SAMPLE_RATE * 0.02) - - stream = sd.InputStream( - channels=self.CHANNELS, - samplerate=self.SAMPLE_RATE, - dtype="int16", - device=self.device_id, - ) - stream.start() - try: - while True: - if stream.read_available < read_size: - await asyncio.sleep(0) - continue - - data, _ = 
stream.read(read_size) - yield AudioContent(data=base64.b64encode(cast(Any, data)), data_format="base64", mime_type="audio/wav") - except KeyboardInterrupt: - pass - finally: - stream.stop() - stream.close() diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_config_base.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_config_base.py index d3d72795665b..7883be04f4ff 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_config_base.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_config_base.py @@ -3,6 +3,7 @@ import logging from collections.abc import Mapping from copy import copy +from typing import Any from openai import AsyncOpenAI from pydantic import ConfigDict, Field, validate_call @@ -30,6 +31,7 @@ def __init__( default_headers: Mapping[str, str] | None = None, client: AsyncOpenAI | None = None, instruction_role: str | None = None, + **kwargs: Any, ) -> None: """Initialize a client for OpenAI services. @@ -51,6 +53,7 @@ def __init__( client (AsyncOpenAI): An existing OpenAI client, optional. instruction_role (str): The role to use for 'instruction' messages, for example, summarization prompts could use `developer` or `system`. (Optional) + kwargs: Additional keyword arguments. 
""" # Merge APP_INFO into the headers if it exists @@ -76,7 +79,7 @@ def __init__( args["service_id"] = service_id if instruction_role: args["instruction_role"] = instruction_role - super().__init__(**args) + super().__init__(**args, **kwargs) def to_dict(self) -> dict[str, str]: """Create a dict of the service settings.""" diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py index 39c85816ced3..412d0814feb8 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py @@ -1,6 +1,7 @@ # Copyright (c) Microsoft. All rights reserved. from collections.abc import Mapping +from typing import Any from openai import AsyncOpenAI from pydantic import ValidationError @@ -82,6 +83,7 @@ def __init__( async_client: AsyncOpenAI | None = None, env_file_path: str | None = None, env_file_encoding: str | None = None, + **kwargs: Any, ) -> None: """Initialize an OpenAITextCompletion service. @@ -99,25 +101,27 @@ def __init__( env_file_path (str | None): Use the environment settings file as a fallback to environment variables. (Optional) env_file_encoding (str | None): The encoding of the environment settings file. (Optional) + kwargs: Additional arguments. 
""" try: openai_settings = OpenAISettings.create( api_key=api_key, org_id=org_id, - text_model_id=ai_model_id, + realtime_model_id=ai_model_id, env_file_path=env_file_path, env_file_encoding=env_file_encoding, ) except ValidationError as ex: raise ServiceInitializationError("Failed to create OpenAI settings.", ex) from ex - if not openai_settings.text_model_id: + if not openai_settings.realtime_model_id: raise ServiceInitializationError("The OpenAI text model ID is required.") super().__init__( - ai_model_id=openai_settings.text_model_id, + ai_model_id=openai_settings.realtime_model_id, service_id=service_id, api_key=openai_settings.api_key.get_secret_value() if openai_settings.api_key else None, org_id=openai_settings.org_id, ai_model_type=OpenAIModelTypes.TEXT, default_headers=default_headers, client=async_client, + **kwargs, ) diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py index f82bce19164f..e387ef4005aa 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py @@ -2,13 +2,14 @@ import asyncio import base64 +import contextlib import json import logging import sys -from collections.abc import AsyncGenerator +from collections.abc import AsyncGenerator, Callable, Coroutine from enum import Enum from inspect import isawaitable -from typing import Any, ClassVar, Protocol, runtime_checkable +from typing import Any, ClassVar, Protocol, cast, runtime_checkable if sys.version_info >= (3, 12): from typing import override # pragma: no cover @@ -24,15 +25,25 @@ RTCPeerConnection, RTCSessionDescription, ) +from av import AudioFrame +from openai._models import construct_type_unchecked from openai.resources.beta.realtime.realtime import AsyncRealtimeConnection from openai.types.beta.realtime.conversation_item_create_event_param import 
ConversationItemParam from openai.types.beta.realtime.realtime_server_event import RealtimeServerEvent -from pydantic import Field +from pydantic import Field, PrivateAttr -from semantic_kernel.connectors.ai.function_calling_utils import prepare_settings_for_function_calling +from semantic_kernel.connectors.ai.function_call_choice_configuration import FunctionCallChoiceConfiguration +from semantic_kernel.connectors.ai.function_calling_utils import ( + prepare_settings_for_function_calling, +) +from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceType from semantic_kernel.connectors.ai.open_ai.services.open_ai_handler import OpenAIHandler +from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime_utils import ( + update_settings_from_function_call_configuration, +) from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase +from semantic_kernel.connectors.ai.realtime_helpers import SKAudioTrack from semantic_kernel.contents.audio_content import AudioContent from semantic_kernel.contents.chat_history import ChatHistory from semantic_kernel.contents.function_call_content import FunctionCallContent @@ -46,6 +57,8 @@ logger: logging.Logger = logging.getLogger(__name__) +# region Protocols + @runtime_checkable @experimental_class @@ -77,6 +90,9 @@ def __call__( ... 
+# region Events + + @experimental_class class SendEvents(str, Enum): """Events that can be sent.""" @@ -126,6 +142,9 @@ class ListenEvents(str, Enum): RATE_LIMITS_UPDATED = "rate_limits.updated" +# region Websocket + + @experimental_class class OpenAIRealtimeBase(OpenAIHandler, RealtimeClientBase): """OpenAI Realtime service.""" @@ -437,8 +456,6 @@ async def response_function_call_arguments_done_callback( await self.start_sending(SendEvents.RESPONSE_CREATE) return chat_history.messages[-1], False - # region settings - @override def get_prompt_execution_settings_class(self) -> type["PromptExecutionSettings"]: from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( # noqa @@ -448,6 +465,9 @@ def get_prompt_execution_settings_class(self) -> type["PromptExecutionSettings"] return OpenAIRealtimeExecutionSettings +# region WebRTC + + @experimental_class class OpenAIRealtimeWebRTCBase(OpenAIHandler, RealtimeClientBase): """OpenAI WebRTC Realtime service.""" @@ -455,135 +475,127 @@ class OpenAIRealtimeWebRTCBase(OpenAIHandler, RealtimeClientBase): SUPPORTS_FUNCTION_CALLING: ClassVar[bool] = True peer_connection: RTCPeerConnection | None = None data_channel: RTCDataChannel | None = None - connection: AsyncRealtimeConnection | None = None - connected: asyncio.Event = Field(default_factory=asyncio.Event) - event_log: dict[str, list[RealtimeServerEvent]] = Field(default_factory=dict) - event_handlers: dict[str, list[EventCallBackProtocol | EventCallBackProtocolAsync]] = Field(default_factory=dict) + audio_output: Callable[[AudioFrame], Coroutine[Any, Any, None] | None] | None = None + kernel: Kernel | None = None - def model_post_init(self, *args, **kwargs) -> None: - """Post init method for the model.""" - # Register the default event handlers - self.register_event_handler( - ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA, self.response_audio_transcript_delta_callback - ) - self.register_event_handler( - 
ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DONE, self.response_audio_transcript_done_callback - ) - self.register_event_handler( - ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE, self.response_function_call_arguments_delta_callback - ) - self.register_event_handler(ListenEvents.ERROR, self.error_callback) - self.register_event_handler(ListenEvents.SESSION_CREATED, self.session_callback) - self.register_event_handler(ListenEvents.SESSION_UPDATED, self.session_callback) - - def register_event_handler( - self, event_type: str | ListenEvents, handler: EventCallBackProtocol | EventCallBackProtocolAsync - ) -> None: - """Register a event handler.""" - if not isinstance(event_type, ListenEvents): - event_type = ListenEvents(event_type) - self.event_handlers.setdefault(event_type, []).append(handler) + _current_settings: PromptExecutionSettings | None = PrivateAttr(None) + _call_id_to_function_map: dict[str, str] = PrivateAttr(default_factory=dict) @override async def start_listening( self, - settings: "PromptExecutionSettings", + settings: "PromptExecutionSettings | None" = None, chat_history: "ChatHistory | None" = None, **kwargs: Any, - ) -> AsyncGenerator[StreamingChatMessageContent, Any]: - ice_servers = [RTCIceServer(urls=["stun:stun.l.google.com:19302"])] - self.peer_connection = RTCPeerConnection(configuration=RTCConfiguration(iceServers=ice_servers)) + ) -> None: + pass - @self.peer_connection.on("track") - async def on_track(track: MediaStreamTrack) -> None: - if track.kind == "audio": - while True: - frame = await track.recv() - await self.output_buffer.put( - ( - ListenEvents.RESPONSE_AUDIO_DELTA, - StreamingChatMessageContent( - role=AuthorRole.ASSISTANT, - items=[AudioContent(data=frame.to_ndarray(), data_format="base64")], - choice_index=0, - inner_content=frame, - ), + async def _on_track(self, track: MediaStreamTrack) -> None: + logger.info(f"Received {track.kind} track from remote") + if track.kind != "audio": + return + while True: + try: + # This is a 
MediaStreamTrack, so the type is AudioFrame + # this might need to be updated if video becomes part of this + frame: AudioFrame = await track.recv() # type: ignore + except Exception as e: + logger.error(f"Error getting audio frame: {e!s}") + break + + try: + if self.audio_output: + out = self.audio_output(frame) + if isawaitable(out): + await out + + except Exception as e: + logger.error(f"Error playing remote audio frame: {e!s}") + try: + await self.receive_buffer.put( + ( + ListenEvents.RESPONSE_AUDIO_DELTA, + StreamingChatMessageContent( + role=AuthorRole.ASSISTANT, + items=[AudioContent(data=frame.to_ndarray(), data_format="np.int16", inner_content=frame)], # type: ignore + choice_index=0, ), - ) - - data_channel = self.peer_connection.createDataChannel("oai-events") - - @data_channel.on("message") - async def on_data(data: bytes) -> None: - event = RealtimeServerEvent.model_validate_strings(data) - event_type = ListenEvents(event.type) - self.event_log.setdefault(event_type, []).append(event) - for handler in self.event_handlers.get(event_type, []): - task = handler(event=event, settings=settings) - if not task: - continue - if isawaitable(task): - async_result = await task - if not async_result: - continue - result, should_return = async_result - else: - result, should_return = task - if should_return: - yield result - else: - chat_history.add_message(result) + ), + ) + except Exception as e: + logger.error(f"Error processing remote audio frame: {e!s}") + await asyncio.sleep(0.01) - offer = await self.peer_connection.createOffer() - await self.peer_connection.setLocalDescription(offer) + async def _on_data(self, data: str) -> None: + """This method is called whenever a data channel message is received. + The data is parsed into a RealtimeServerEvent (by OpenAI) and then processed. 
+ """ try: - ephemeral_token = await self.get_ephemeral_token() - headers = {"Authorization": f"Bearer {ephemeral_token}", "Content-Type": "application/sdp"} - - async with ( - ClientSession() as session, - session.post( - f"{self.client.beta.realtime._client.base_url}/realtime/sessions?model={self.ai_model_id}", - headers=headers, - data=offer.sdp, - ) as response, - ): - if response.status not in [200, 201]: - error_text = await response.text() - raise Exception(f"OpenAI WebRTC error: {error_text}") - - sdp_answer = await response.text() - answer = RTCSessionDescription(sdp=sdp_answer, type="answer") - await self.peer_connection.setRemoteDescription(answer) - + event = cast( + RealtimeServerEvent, + construct_type_unchecked(value=json.loads(data), type_=cast(Any, RealtimeServerEvent)), + ) except Exception as e: - logger.error(f"Failed to connect to OpenAI: {e!s}") - raise + logger.error(f"Failed to parse event {data} with error: {e!s}") + return + match event.type: + case ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA: + await self.receive_buffer.put(( + event.type, + StreamingChatMessageContent( + role=AuthorRole.ASSISTANT, + content=event.delta, + choice_index=event.content_index, + inner_content=event, + ), + )) + case ListenEvents.RESPONSE_OUTPUT_ITEM_ADDED: + if event.item.type == "function_call": + self._call_id_to_function_map[event.item.call_id] = event.item.name + case ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DELTA: + await self.receive_buffer.put(( + event.type, + StreamingChatMessageContent( + role=AuthorRole.ASSISTANT, + items=[ + FunctionCallContent( + id=event.item_id, + # the function name was stored under its call_id by RESPONSE_OUTPUT_ITEM_ADDED; + # fall back to the call_id itself when no mapping is known + name=self._call_id_to_function_map.get(event.call_id, event.call_id), + arguments=event.delta, + index=event.output_index, + metadata={"call_id": event.call_id}, + ) + ], + choice_index=0, + inner_content=event, + ), + )) + case ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE: + await self._handle_function_call_arguments_done(event) + case ListenEvents.ERROR: + logger.error("Error received: %s", event.error) + # an or-pattern is required here: "case A, B:" is a sequence pattern and never matches a string subject + case
ListenEvents.SESSION_CREATED | ListenEvents.SESSION_UPDATED: + logger.info("Session created or updated, session: %s", event.session) + case _: + logger.debug(f"Received event: {event}") + # we put all events in the output buffer, but after the interpreted one. + await self.receive_buffer.put((event.type, event)) @override - async def start_sending(self, input_audio_track: MediaStreamTrack | None = None, **kwargs: Any) -> None: - if input_audio_track: - if not self.peer_connection: - raise ValueError("Peer connection is not established.") - self.peer_connection.addTransceiver(input_audio_track) - - if not self.data_channel: - raise ValueError("Data channel is not established.") + async def start_sending(self, **kwargs: Any) -> None: while True: - item = await self.input_buffer.get() + item = await self.send_buffer.get() if not item: continue if isinstance(item, tuple): event, data = item else: event = item - data = None + data = {} if not isinstance(event, SendEvents): event = SendEvents(event) - response: dict[str, Any] = { - "type": event, - } + response: dict[str, Any] = {"type": event.value} match event: case SendEvents.SESSION_UPDATE: if "settings" not in data: @@ -651,170 +663,153 @@ async def start_sending(self, input_audio_track: MediaStreamTrack | None = None, if "response_id" in data: response["response_id"] = data["response_id"] - self.data_channel.send(json.dumps(response)) + if self.data_channel: + while self.data_channel.readyState != "open": + await asyncio.sleep(0.1) + try: + self.data_channel.send(json.dumps(response)) + except Exception as e: + logger.error(f"Failed to send event {event} with error: {e!s}") @override async def create_session( self, settings: PromptExecutionSettings | None = None, chat_history: ChatHistory | None = None, + audio_track: MediaStreamTrack | None = None, **kwargs: Any, ) -> None: """Create a session in the service.""" - if settings or chat_history or kwargs: - await self.update_session(settings=settings, 
chat_history=chat_history, **kwargs) + ice_servers = [RTCIceServer(urls=["stun:stun.l.google.com:19302"])] + self.peer_connection = RTCPeerConnection(configuration=RTCConfiguration(iceServers=ice_servers)) - async def get_ephemeral_token(self) -> str: - """Get an ephemeral token from OpenAI.""" - headers = {"Authorization": f"Bearer {self.client.api_key}", "Content-Type": "application/json"} - data = {"model": self.ai_model_id, "voice": "echo"} + self.peer_connection.on("track")(self._on_track) + + self.data_channel = self.peer_connection.createDataChannel("oai-events", protocol="json") + self.data_channel.on("message")(self._on_data) + + self.peer_connection.addTransceiver(audio_track or SKAudioTrack(), "sendrecv") + + offer = await self.peer_connection.createOffer() + await self.peer_connection.setLocalDescription(offer) try: + ephemeral_token = await self.get_ephemeral_token() + headers = {"Authorization": f"Bearer {ephemeral_token}", "Content-Type": "application/sdp"} + async with ( ClientSession() as session, session.post( - f"{self.client.beta.realtime._client.base_url}/realtime/sessions", headers=headers, json=data + f"{self.client.beta.realtime._client.base_url}realtime?model={self.ai_model_id}", + headers=headers, + data=offer.sdp, ) as response, ): if response.status not in [200, 201]: error_text = await response.text() - raise Exception(f"Failed to get ephemeral token: {error_text}") + raise Exception(f"OpenAI WebRTC error: {error_text}") - result = await response.json() - return result["client_secret"]["value"] + sdp_answer = await response.text() + answer = RTCSessionDescription(sdp=sdp_answer, type="answer") + await self.peer_connection.setRemoteDescription(answer) + logger.info("Connected to OpenAI WebRTC") except Exception as e: - logger.error(f"Failed to get ephemeral token: {e!s}") + logger.error(f"Failed to connect to OpenAI: {e!s}") raise + if settings or chat_history or kwargs: + await self.update_session(settings=settings, 
chat_history=chat_history, **kwargs) + @override async def update_session( - self, settings: PromptExecutionSettings | None = None, chat_history: ChatHistory | None = None, **kwargs: Any + self, + settings: PromptExecutionSettings | None = None, + chat_history: ChatHistory | None = None, + create_response: bool = True, + **kwargs: Any, ) -> None: + if "kernel" in kwargs: + self.kernel = kwargs["kernel"] if settings: - if "kernel" in kwargs: - settings = prepare_settings_for_function_calling( - settings, - self.get_prompt_execution_settings_class(), - self._update_function_choice_settings_callback(), - kernel=kwargs.get("kernel"), # type: ignore - ) - await self.input_buffer.put((SendEvents.SESSION_UPDATE, {"settings": settings})) + self._current_settings = settings + if self._current_settings and self.kernel: + self._current_settings = prepare_settings_for_function_calling( + self._current_settings, + self.get_prompt_execution_settings_class(), + self._update_function_choice_settings_callback(), + kernel=self.kernel, # type: ignore + ) + await self.send_buffer.put((SendEvents.SESSION_UPDATE, {"settings": self._current_settings})) if chat_history and len(chat_history) > 0: for msg in chat_history.messages: - await self.input_buffer.put((SendEvents.CONVERSATION_ITEM_CREATE, {"item": msg})) + await self.send_buffer.put((SendEvents.CONVERSATION_ITEM_CREATE, {"item": msg})) + if create_response: + await self.send_buffer.put(SendEvents.RESPONSE_CREATE) @override async def close_session(self) -> None: """Close the session in the service.""" if self.peer_connection: - await self.peer_connection.close() - if self.data_channel: - await self.data_channel.close() + with contextlib.suppress(asyncio.CancelledError): + await self.peer_connection.close() self.peer_connection = None + if self.data_channel: + with contextlib.suppress(asyncio.CancelledError): + self.data_channel.close() self.data_channel = None - # region Event callbacks - - def 
response_audio_transcript_delta_callback( - self, - event: RealtimeServerEvent, - settings: PromptExecutionSettings | None = None, - **kwargs: Any, - ) -> tuple[Any, bool]: - """Handle response audio transcript delta.""" - return StreamingChatMessageContent( - role=AuthorRole.ASSISTANT, - items=[StreamingTextContent(text=event.delta, choice_index=event.content_index)], - choice_index=event.content_index, - inner_content=event, - ), True - - def response_audio_transcript_done_callback( - self, - event: RealtimeServerEvent, - settings: PromptExecutionSettings | None = None, - **kwargs: Any, - ) -> tuple[Any, bool]: - """Handle response audio transcript done.""" - return StreamingChatMessageContent( - role=AuthorRole.ASSISTANT, - items=[StreamingTextContent(text=event.transcript, choice_index=event.content_index)], - choice_index=event.content_index, - inner_content=event, - ), False - - def response_function_call_arguments_delta_callback( + async def _handle_function_call_arguments_done( self, event: RealtimeServerEvent, - settings: PromptExecutionSettings | None = None, - **kwargs: Any, - ) -> tuple[Any, bool]: - """Handle response function call arguments delta.""" - return StreamingChatMessageContent( - role=AuthorRole.ASSISTANT, - items=[ - FunctionCallContent( - id=event.item_id, - name=event.call_id, - arguments=event.delta, - index=event.output_index, - metadata={"call_id": event.call_id}, - ) - ], - choice_index=0, - inner_content=event, - ), True - - def error_callback( - self, - event: RealtimeServerEvent, - settings: PromptExecutionSettings | None = None, - **kwargs: Any, - ) -> None: - """Handle error.""" - logger.error("Error received: %s", event.error) - - def session_callback( - self, - event: RealtimeServerEvent, - settings: PromptExecutionSettings | None = None, - **kwargs: Any, - ) -> None: - """Handle session.""" - logger.debug("Session created or updated, session: %s", event.session) - - async def response_function_call_arguments_done_callback( - 
self, - event: RealtimeServerEvent, - settings: PromptExecutionSettings | None = None, - **kwargs: Any, ) -> None: """Handle response function call done.""" + plugin_name, function_name = self._call_id_to_function_map.pop(event.call_id, "-").split("-", 1) + if not plugin_name or not function_name: + logger.error("Function call needs to have a plugin name and function name") + return item = FunctionCallContent( id=event.item_id, - name=event.call_id, - arguments=event.delta, + plugin_name=plugin_name, + function_name=function_name, + arguments=event.arguments, index=event.output_index, metadata={"call_id": event.call_id}, ) - kernel: Kernel | None = kwargs.get("kernel") - call_id = item.name - function_name = next( - output_item_event.item.name - for output_item_event in self.event_log[ListenEvents.RESPONSE_OUTPUT_ITEM_ADDED] - if output_item_event.item.call_id == call_id - ) - item.plugin_name, item.function_name = function_name.split("-", 1) - if kernel: - chat_history = ChatHistory() - await kernel.invoke_function_call(item, chat_history) - await self.input_buffer.put((SendEvents.CONVERSATION_ITEM_CREATE, {"item": chat_history.messages[-1]})) - # The model doesn't start responding to the tool call automatically, so triggering it here. - await self.input_buffer.put(SendEvents.RESPONSE_CREATE) - return chat_history.messages[-1], False + # Without a kernel there is nothing to invoke, and the settings may have switched auto-invocation off; + # either condition alone must skip the call (settings and behavior can both be None). + behavior = self._current_settings.function_choice_behavior if self._current_settings else None + if not self.kernel or (behavior and not behavior.auto_invoke_kernel_functions): + return + chat_history = ChatHistory() + await self.kernel.invoke_function_call(item, chat_history) + created_output = chat_history.messages[-1] + # This returns the output to the service + await self.send_buffer.put((SendEvents.CONVERSATION_ITEM_CREATE, {"item": created_output})) + # The model doesn't start responding to the tool call automatically, so triggering it here. 
+ await self.send_buffer.put(SendEvents.RESPONSE_CREATE) + # This allows a user to have a full conversation in his code + await self.receive_buffer.put((ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE, created_output)) + + async def get_ephemeral_token(self) -> str: + """Get an ephemeral token from OpenAI.""" + headers = {"Authorization": f"Bearer {self.client.api_key}", "Content-Type": "application/json"} + data = {"model": self.ai_model_id, "voice": "echo"} + + try: + async with ( + ClientSession() as session, + session.post( + f"{self.client.beta.realtime._client.base_url}/realtime/sessions", headers=headers, json=data + ) as response, + ): + if response.status not in [200, 201]: + error_text = await response.text() + raise Exception(f"Failed to get ephemeral token: {error_text}") - # region settings + result = await response.json() + return result["client_secret"]["value"] + + except Exception as e: + logger.error(f"Failed to get ephemeral token: {e!s}") + raise @override def get_prompt_execution_settings_class(self) -> type["PromptExecutionSettings"]: @@ -823,3 +818,9 @@ def get_prompt_execution_settings_class(self) -> type["PromptExecutionSettings"] ) return OpenAIRealtimeExecutionSettings + + @override + def _update_function_choice_settings_callback( + self, + ) -> Callable[[FunctionCallChoiceConfiguration, "PromptExecutionSettings", FunctionChoiceType], None]: + return update_settings_from_function_call_configuration diff --git a/python/semantic_kernel/connectors/ai/open_ai/settings/open_ai_settings.py b/python/semantic_kernel/connectors/ai/open_ai/settings/open_ai_settings.py index 6423a5385a33..7276af4b1f3b 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/settings/open_ai_settings.py +++ b/python/semantic_kernel/connectors/ai/open_ai/settings/open_ai_settings.py @@ -32,6 +32,9 @@ class OpenAISettings(KernelBaseSettings): (Env var OPENAI_AUDIO_TO_TEXT_MODEL_ID) - text_to_audio_model_id: str | None - The OpenAI text to audio model ID to use, for 
example, jukebox-1. (Env var OPENAI_TEXT_TO_AUDIO_MODEL_ID) + - realtime_model_id: str | None - The OpenAI realtime model ID to use, + for example, gpt-4o-realtime-preview-2024-12-17. + (Env var OPENAI_REALTIME_MODEL_ID) - env_file_path: str | None - if provided, the .env settings are read from this file path location """ @@ -45,3 +48,4 @@ class OpenAISettings(KernelBaseSettings): text_to_image_model_id: str | None = None audio_to_text_model_id: str | None = None text_to_audio_model_id: str | None = None + realtime_model_id: str | None = None diff --git a/python/semantic_kernel/connectors/ai/realtime_client_base.py b/python/semantic_kernel/connectors/ai/realtime_client_base.py index c9a48f9d45b0..991854987faa 100644 --- a/python/semantic_kernel/connectors/ai/realtime_client_base.py +++ b/python/semantic_kernel/connectors/ai/realtime_client_base.py @@ -15,7 +15,6 @@ from semantic_kernel.connectors.ai.function_call_choice_configuration import FunctionCallChoiceConfiguration from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceType -from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent from semantic_kernel.services.ai_service_client_base import AIServiceClientBase from semantic_kernel.utils.experimental_decorator import experimental_class @@ -29,8 +28,8 @@ class RealtimeClientBase(AIServiceClientBase, ABC): """Base class for a realtime client.""" SUPPORTS_FUNCTION_CALLING: ClassVar[bool] = False - input_buffer: Queue[tuple[str, dict[str, Any]] | str] = Field(default_factory=Queue) - output_buffer: Queue[tuple[str, StreamingChatMessageContent]] = Field(default_factory=Queue) + send_buffer: Queue[str | tuple[str, Any]] = Field(default_factory=Queue) + receive_buffer: Queue[tuple[str, Any]] = Field(default_factory=Queue) async def __aenter__(self) -> "RealtimeClientBase": """Enter the context manager. 
diff --git a/python/semantic_kernel/connectors/ai/realtime_helpers.py b/python/semantic_kernel/connectors/ai/realtime_helpers.py new file mode 100644 index 000000000000..94549c402199 --- /dev/null +++ b/python/semantic_kernel/connectors/ai/realtime_helpers.py @@ -0,0 +1,190 @@ +# Copyright (c) Microsoft. All rights reserved. + +import asyncio +import logging +from typing import Any, Final + +import numpy as np +import sounddevice as sd +from aiortc.mediastreams import MediaStreamError, MediaStreamTrack +from av.audio.frame import AudioFrame +from av.frame import Frame +from pydantic import Field, PrivateAttr + +from semantic_kernel.contents.audio_content import AudioContent +from semantic_kernel.kernel_pydantic import KernelBaseModel + +logger = logging.getLogger(__name__) + +SAMPLE_RATE: Final[int] = 48000 +TRACK_CHANNELS: Final[int] = 1 +PLAYER_CHANNELS: Final[int] = 2 +FRAME_DURATION: Final[int] = 20 +DTYPE: Final[np.dtype] = np.int16 + + +class SKAudioTrack(KernelBaseModel, MediaStreamTrack): + """A simple class using sounddevice to record audio from the default input device. + + And implementing the MediaStreamTrack interface for use with aiortc. + """ + + kind: str = "audio" + sample_rate: int = SAMPLE_RATE + channels: int = TRACK_CHANNELS + frame_duration: int = FRAME_DURATION + dtype: np.dtype = DTYPE + device: str | int | None = None + queue: asyncio.Queue[Frame] = Field(default_factory=asyncio.Queue) + is_recording: bool = False + stream: sd.InputStream | None = None + frame_size: int = 0 + _recording_task: asyncio.Task | None = None + _loop: asyncio.AbstractEventLoop | None = None + _pts: int = 0 # Add this to track the pts + + def __init__(self, **kwargs: Any): + """Initialize the audio track. + + Args: + **kwargs: Additional keyword arguments. 
+ + """ + kwargs["frame_size"] = int( + kwargs.get("sample_rate", SAMPLE_RATE) * kwargs.get("frame_duration", FRAME_DURATION) / 1000 + ) + super().__init__(**kwargs) + MediaStreamTrack.__init__(self) + + async def recv(self) -> Frame: + """Receive the next frame of audio data.""" + if not self._recording_task: + self._recording_task = asyncio.create_task(self.start_recording()) + + try: + return await self.queue.get() + except Exception as e: + logger.error(f"Error receiving audio frame: {e!s}") + raise MediaStreamError("Failed to receive audio frame") + + async def start_recording(self): + """Start recording audio from the input device.""" + if self.is_recording: + return + + self.is_recording = True + self._loop = asyncio.get_running_loop() + self._pts = 0 # Reset pts when starting recording + + try: + + def callback(indata: np.ndarray, frames: int, time: Any, status: Any) -> None: + if status: + logger.warning(f"Audio input status: {status}") + + audio_data = indata.copy() + if audio_data.dtype != self.dtype: + if self.dtype == np.int16: + audio_data = (audio_data * 32767).astype(self.dtype) + else: + audio_data = audio_data.astype(self.dtype) + + frame = AudioFrame( + format="s16", + layout="mono", + samples=len(audio_data), + ) + frame.rate = self.sample_rate + frame.pts = self._pts + frame.planes[0].update(audio_data.tobytes()) + self._pts += len(audio_data) + if self._loop and self._loop.is_running(): + asyncio.run_coroutine_threadsafe(self.queue.put(frame), self._loop) + + self.stream = sd.InputStream( + device=self.device, + channels=self.channels, + samplerate=self.sample_rate, + dtype=self.dtype, + blocksize=self.frame_size, + callback=callback, + ) + self.stream.start() + + while self.is_recording: + await asyncio.sleep(0.1) + + except Exception as e: + logger.error(f"Error in audio recording: {e!s}") + raise + finally: + self.is_recording = False + + +class SKSimplePlayer(KernelBaseModel): + """Simple class that plays audio using sounddevice. 
+ + Make sure the device_id is set to the correct device for your system. + + The sample rate, channels and frame duration should be set to match the audio you + are receiving, the defaults are for WebRTC. + """ + + device_id: int | None = None + sample_rate: int = SAMPLE_RATE + channels: int = PLAYER_CHANNELS + frame_duration_ms: int = FRAME_DURATION + queue: asyncio.Queue[np.ndarray] = Field(default_factory=asyncio.Queue) + _stream: sd.OutputStream | None = PrivateAttr(None) + + def model_post_init(self, __context: Any) -> None: + """Initialize the audio stream.""" + self._stream = sd.OutputStream( + callback=self.callback, + samplerate=self.sample_rate, + channels=self.channels, + dtype=np.int16, + blocksize=int(self.sample_rate * self.frame_duration_ms / 1000), + device=self.device_id, + ) + + async def __aenter__(self): + """Start the audio stream when entering a context.""" + self.start() + return self + + async def __aexit__(self, exc_type, exc, tb): + """Stop the audio stream when exiting a context.""" + self.stop() + + def start(self): + """Start the audio stream.""" + if self._stream: + self._stream.start() + + def stop(self): + """Stop the audio stream.""" + if self._stream: + self._stream.stop() + + def callback(self, outdata, frames, time, status): + """This callback is called by sounddevice when it needs more audio data to play.""" + if status: + logger.info(f"Audio output status: {status}") + if self.queue.empty(): + # sounddevice hands over an uninitialized buffer, so emit silence when no frame is queued + outdata.fill(0) + return + data: np.ndarray = self.queue.get_nowait() + outdata[:] = data.reshape(outdata.shape) + + async def realtime_client_callback(self, frame: AudioFrame): + """This function is used by the RealtimeClientBase to play audio.""" + await self.queue.put(frame.to_ndarray()) + + async def add_audio(self, audio_content: AudioContent): + """This function is used to add audio to the queue for playing. + + It uses a shortcut for this sample, because we know an AudioFrame is in the inner_content field. 
+ """ + if audio_content.inner_content and isinstance(audio_content.inner_content, AudioFrame): + await self.queue.put(audio_content.inner_content.to_ndarray()) + # TODO (eavanvalkenburg): check ndarray diff --git a/python/semantic_kernel/contents/audio_content.py b/python/semantic_kernel/contents/audio_content.py index 45d37c622e1a..b28d2e23a0e0 100644 --- a/python/semantic_kernel/contents/audio_content.py +++ b/python/semantic_kernel/contents/audio_content.py @@ -77,7 +77,7 @@ def __init__( ) @classmethod - def from_audio_file(cls: type[_T], path: str) -> "AudioContent": + def from_audio_file(cls: type[_T], path: str) -> _T: """Create an instance from an audio file.""" mime_type = mimetypes.guess_type(path)[0] with open(path, "rb") as audio_file: @@ -88,6 +88,6 @@ def to_dict(self) -> dict[str, Any]: return {"type": "audio_url", "audio_url": {"uri": str(self)}} @classmethod - def from_nd_array(cls: type[_T], data: ndarray, mime_type: str) -> "AudioContent": + def from_nd_array(cls: type[_T], data: ndarray, mime_type: str) -> _T: """Create an instance from an nd array.""" return cls(data=data, mime_type=mime_type) diff --git a/python/uv.lock b/python/uv.lock index d48f48d9d964..77f5d2cc8773 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -486,7 +486,7 @@ name = "build" version = "1.2.2.post1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "(os_name == 'nt' and sys_platform == 'darwin') or (os_name == 'nt' and sys_platform == 'linux') or (os_name == 'nt' and sys_platform == 'win32')" }, + { name = "colorama", marker = "os_name == 'nt' and sys_platform == 'win32'" }, { name = "importlib-metadata", marker = "(python_full_version < '3.10.2' and sys_platform == 'darwin') or (python_full_version < '3.10.2' and sys_platform == 'linux') or (python_full_version < '3.10.2' and sys_platform == 'win32')" }, { name = "packaging", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, 
{ name = "pyproject-hooks", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, @@ -729,7 +729,7 @@ name = "click" version = "8.1.8" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "(platform_system == 'Windows' and sys_platform == 'darwin') or (platform_system == 'Windows' and sys_platform == 'linux') or (platform_system == 'Windows' and sys_platform == 'win32')" }, + { name = "colorama", marker = "sys_platform == 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/b9/2e/0090cbf739cee7d23781ad4b89a9894a41538e4fcf4c31dcdd705b78eb8b/click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a", size = 226593 } wheels = [ @@ -1917,7 +1917,7 @@ name = "ipykernel" version = "6.29.5" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "appnope", marker = "(platform_system == 'Darwin' and sys_platform == 'darwin') or (platform_system == 'Darwin' and sys_platform == 'linux') or (platform_system == 'Darwin' and sys_platform == 'win32')" }, + { name = "appnope", marker = "sys_platform == 'darwin'" }, { name = "comm", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "debugpy", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "ipython", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, @@ -2794,7 +2794,7 @@ name = "nvidia-cudnn-cu12" version = "9.1.0.70" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux'" }, ] wheels = [ { url = 
"https://files.pythonhosted.org/packages/9f/fd/713452cd72343f682b1c7b9321e23829f00b842ceaedcda96e742ea0b0b3/nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:165764f44ef8c61fcdfdfdbe769d687e06374059fbb388b6c89ecb0e28793a6f", size = 664752741 }, @@ -2805,7 +2805,7 @@ name = "nvidia-cufft-cu12" version = "11.2.1.3" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/27/94/3266821f65b92b3138631e9c8e7fe1fb513804ac934485a8d05776e1dd43/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9", size = 211459117 }, @@ -2824,9 +2824,9 @@ name = "nvidia-cusolver-cu12" version = "11.6.1.9" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "nvidia-cusparse-cu12", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cusparse-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/3a/e1/5b9089a4b2a4790dfdea8b3a006052cfecff58139d5a4e34cb1a51df8d6f/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260", size = 127936057 }, @@ -2837,7 +2837,7 @@ name = "nvidia-cusparse-cu12" 
version = "12.3.1.170" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/db/f7/97a9ea26ed4bbbfc2d470994b8b4f338ef663be97b8f677519ac195e113d/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1", size = 207454763 }, @@ -3469,7 +3469,7 @@ name = "portalocker" version = "2.10.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pywin32", marker = "(platform_system == 'Windows' and sys_platform == 'darwin') or (platform_system == 'Windows' and sys_platform == 'linux') or (platform_system == 'Windows' and sys_platform == 'win32')" }, + { name = "pywin32", marker = "sys_platform == 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/ed/d3/c6c64067759e87af98cc668c1cc75171347d0f1577fab7ca3749134e3cd4/portalocker-2.10.1.tar.gz", hash = "sha256:ef1bf844e878ab08aee7e40184156e1151f228f103aa5c6bd0724cc330960f8f", size = 40891 } wheels = [ @@ -4926,7 +4926,7 @@ hugging-face = [ { name = "transformers", extra = ["torch"], marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] milvus = [ - { name = "milvus", marker = "(platform_system != 'Windows' and sys_platform == 'darwin') or (platform_system != 'Windows' and sys_platform == 'linux') or (platform_system != 'Windows' and sys_platform == 'win32')" }, + { name = "milvus", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "pymilvus", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] mistralai = [ @@ -5013,7 +5013,7 @@ requires-dist = [ { name = "google-generativeai", marker = "extra == 'google'", specifier 
= "~=0.8" }, { name = "ipykernel", marker = "extra == 'notebooks'", specifier = "~=6.29" }, { name = "jinja2", specifier = "~=3.1" }, - { name = "milvus", marker = "platform_system != 'Windows' and extra == 'milvus'", specifier = ">=2.3,<2.3.8" }, + { name = "milvus", marker = "sys_platform != 'win32' and extra == 'milvus'", specifier = ">=2.3,<2.3.8" }, { name = "mistralai", marker = "extra == 'mistralai'", specifier = ">=1.2,<2.0" }, { name = "motor", marker = "extra == 'mongo'", specifier = ">=3.3.2,<3.8.0" }, { name = "nest-asyncio", specifier = "~=1.6" }, @@ -5521,7 +5521,7 @@ name = "tqdm" version = "4.67.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "(platform_system == 'Windows' and sys_platform == 'darwin') or (platform_system == 'Windows' and sys_platform == 'linux') or (platform_system == 'Windows' and sys_platform == 'win32')" }, + { name = "colorama", marker = "sys_platform == 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737 } wheels = [ From 81328821ff31d90707297d95825783981ef29237 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Fri, 17 Jan 2025 15:55:51 +0100 Subject: [PATCH 10/50] added dependency --- python/pyproject.toml | 5 +- .../audio/04-chat_with_realtime_api.py | 5 + .../open_ai/services/open_ai_realtime_base.py | 4 +- python/uv.lock | 103 ++++++++++++++++++ 4 files changed, 113 insertions(+), 4 deletions(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index 6972267008aa..56c7a6932298 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -129,7 +129,8 @@ dapr = [ "flask-dapr>=1.14.0" ] openai_realtime = [ - "openai[realtime] ~= 1.0" + "openai[realtime] ~= 1.0", + "aiortc>=1.9.0" ] [tool.uv] @@ -228,5 +229,3 @@ name = "semantic_kernel" [build-system] 
requires = ["flit-core >= 3.9,<4.0"] build-backend = "flit_core.buildapi" - - diff --git a/python/samples/concepts/audio/04-chat_with_realtime_api.py b/python/samples/concepts/audio/04-chat_with_realtime_api.py index 0f895f7dc9dc..902ad72d48d4 100644 --- a/python/samples/concepts/audio/04-chat_with_realtime_api.py +++ b/python/samples/concepts/audio/04-chat_with_realtime_api.py @@ -156,6 +156,11 @@ async def main() -> None: await realtime_client.update_session( settings=settings, chat_history=chat_history, kernel=kernel, create_response=True ) + # you can also send other events to the service, like this + # await realtime_client.send_buffer.put(( + # SendEvents.CONVERSATION_ITEM_CREATE, + # {"item": ChatMessageContent(role="user", content="Hi there, who are you?")}, + # )) async with asyncio.TaskGroup() as tg: tg.create_task(realtime_client.start_streaming()) tg.create_task(stream_handler.listen()) diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py index e387ef4005aa..7caf5a5671df 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py @@ -529,7 +529,7 @@ async def _on_track(self, track: MediaStreamTrack) -> None: async def _on_data(self, data: str) -> None: """This method is called whenever a data channel message is received. - The data is parsed into a RealtimeServerEvent (by OpenAI) and then processed. + The data is parsed into a RealtimeServerEvent (by OpenAI code) and then processed. """ try: event = cast( @@ -580,6 +580,8 @@ async def _on_data(self, data: str) -> None: case _: logger.debug(f"Received event: {event}") # we put all event in the output buffer, but after the interpreted one. + # so when dealing with them, make sure to check the type of the event, since they + # might be of different types. 
await self.receive_buffer.put((event.type, event)) @override diff --git a/python/uv.lock b/python/uv.lock index 77f5d2cc8773..63cf4711162a 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -130,6 +130,50 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9c/54/ebb815bc0fe057d8e7a11c086c479e972e827082f39aeebc6019dd4f0862/aiohttp-3.11.13-cp313-cp313-win_amd64.whl", hash = "sha256:5ceb81a4db2decdfa087381b5fc5847aa448244f973e5da232610304e199e7b2", size = 436452 }, ] +[[package]] +name = "aioice" +version = "0.9.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "dnspython", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "ifaddr", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/33/b6/e2b0e48ccb5b04fe29265e93f14a0915f416e359c897ae87d570566c430b/aioice-0.9.0.tar.gz", hash = "sha256:fc2401b1c4b6e19372eaaeaa28fd1bd9cbf6b0e412e48625297c53b495eebd1e", size = 40324 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b6/35/d21e48d3ba25d32aba5d142d54c4491376c659dd74d052a30dd25198007b/aioice-0.9.0-py3-none-any.whl", hash = "sha256:b609597a3a5a611e0004ff04772e16aceb881d51c25c0afc4ceac05d5e50024e", size = 24177 }, +] + +[[package]] +name = "aiortc" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aioice", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "av", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "cffi", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "cryptography", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "google-crc32c", marker = "sys_platform == 'darwin' or sys_platform == 
'linux' or sys_platform == 'win32'" }, + { name = "pyee", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "pylibsrtp", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "pyopenssl", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/71/32/e9b01e2271124643e5dc15c273f2bb8155efebf5bc2115407441ac62f4c5/aiortc-1.9.0.tar.gz", hash = "sha256:03faa76d76ef0e5989ac10386898b029369756102217230e2fcd4b029c50b303", size = 1168973 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/93/01/db89910fc4dfb72ca25fd9a41326762a490d93d39d2fc4aac3f86c05857d/aiortc-1.9.0-cp38-abi3-macosx_10_9_x86_64.whl", hash = "sha256:e3e67c1970c2cffacac53c8f161df264efc62b22721c64a621940935028ee087", size = 1216069 }, + { url = "https://files.pythonhosted.org/packages/4c/6d/76ed96521080492c7264eacf73a8cba2202f1ff9f59af1776c5a2532f332/aiortc-1.9.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:d893cb3d4ffa0ff4f9bb03a88f0a700cdbcd4c0dc060a46c59a27ccd1c890663", size = 896012 }, + { url = "https://files.pythonhosted.org/packages/8c/87/1f666108764fa5b557bed4f0fd5e2acccd739bb2cca2b766dcacb53e5669/aiortc-1.9.0-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:176b4eb38d833667f87cf719a7a3e105e25a35b138b30893294418c1c96e38db", size = 1779113 }, + { url = "https://files.pythonhosted.org/packages/32/03/f3233e936f7a81549bd95f33f3d304e2a9211cb35d819d74570c0718b1ac/aiortc-1.9.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c44b610f36b8d17123855dfbe915fa6874201765b8a2c7fd9cf72d14cf417740", size = 1896322 }, + { url = "https://files.pythonhosted.org/packages/96/99/6672cf57777801c6ddacc13e1ee07f8c2151d0847a4f81455eeec998eaed/aiortc-1.9.0-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:55505adb31d56cba19a1ef8ad6aa9b727ccdba2a83bfbfb4aa79ef3c472026a6", size = 1918600 }, + { url = "https://files.pythonhosted.org/packages/76/e3/bdb76e7e51bc4fc7a5869597de2effad073ccf5ef14de3aed742d7384107/aiortc-1.9.0-cp38-abi3-win32.whl", hash = "sha256:680b703e35870e301535c930bfe32e7d012224a91ce51531aba45a3124ef07cc", size = 923055 }, + { url = "https://files.pythonhosted.org/packages/6a/df/de098b31a3fbf1117f6d4cb84c14518636054e3c95a9d9f693a1123c95b3/aiortc-1.9.0-cp38-abi3-win_amd64.whl", hash = "sha256:de5e7020cfc2d2d9fb95690926ff2e3b3c30cd4f5f5bc68d5b6756a8eebb686e", size = 1009610 }, + { url = "https://files.pythonhosted.org/packages/95/26/c382db590897fe638254f948d8514772d13ff59b5ada0a71d87322f48c52/aiortc-1.9.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:34c516ae4e70e8f64494305057af09311444325722fe6938ec38dd1e111adca9", size = 1209093 }, + { url = "https://files.pythonhosted.org/packages/68/48/2fe7de04461fdc4aee8c78c67cfe03579eaa72fb215c4b063acaeb4fd118/aiortc-1.9.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:40e61c1b84914d6f4c2968ff49353a22eed9419de74b151237cdb71af431209c", size = 888818 }, + { url = "https://files.pythonhosted.org/packages/da/d5/94bf7ed6189c316ffef930787cba009387f9bcd2f1c482392b71cca3918d/aiortc-1.9.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1924e130a441507b1315956aff05c504a274f1a09802def225d0f3a3d1870320", size = 1732549 }, + { url = "https://files.pythonhosted.org/packages/e7/0a/6495c696cd7f806bafe511fb27203ce918947c4461398384a4e6bd4b7e57/aiortc-1.9.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cbb62950e396c311e398925149fa76bc90b8d6525b4eccf28cba704e7ded8bf5", size = 1843911 }, + { url = "https://files.pythonhosted.org/packages/82/36/ffd0f74c73fa6abca0b76bd38473ed7d82dfbada7e57c6efe2a37ee40483/aiortc-1.9.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:5234177e8d3126a0190ed9b6f8d0288daedcc0158c45cc279b4e6ac7d97f43f8", size = 1868240 }, + { url = "https://files.pythonhosted.org/packages/fb/46/8cb087a11f2f2d1139bd7e21615cc082097bffc4990d43c9f45f9cf6c8bf/aiortc-1.9.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:0e31575eb050aa68e0ea4c519aef101770b2297954f49e64a5c3d73ef27702ea", size = 1004186 }, +] + [[package]] name = "aiosignal" version = "1.3.2" @@ -1882,6 +1926,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 }, ] +[[package]] +name = "ifaddr" +version = "0.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e8/ac/fb4c578f4a3256561548cd825646680edcadb9440f3f68add95ade1eb791/ifaddr-0.2.0.tar.gz", hash = "sha256:cc0cbfcaabf765d44595825fb96a99bb12c79716b73b44330ea38ee2b0c4aed4", size = 10485 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9c/1f/19ebc343cc71a7ffa78f17018535adc5cbdd87afb31d7c34874680148b32/ifaddr-0.2.0-py3-none-any.whl", hash = "sha256:085e0305cfe6f16ab12d72e2024030f5d52674afad6911bb1eee207177b8a748", size = 12314 }, +] + [[package]] name = "importlib-metadata" version = "8.5.0" @@ -3952,6 +4005,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c1/a9/3b9642025174bbe67e900785fb99c9bfe91ea584b0b7126ff99945c24a0e/pydantic_settings-2.8.0-py3-none-any.whl", hash = "sha256:c782c7dc3fb40e97b238e713c25d26f64314aece2e91abcff592fcac15f71820", size = 30746 }, ] +[[package]] +name = "pyee" +version = "12.1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/0a/37/8fb6e653597b2b67ef552ed49b438d5398ba3b85a9453f8ada0fd77d455c/pyee-12.1.1.tar.gz", hash = "sha256:bbc33c09e2ff827f74191e3e5bbc6be7da02f627b7ec30d86f5ce1a6fb2424a3", size = 30915 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/25/68/7e150cba9eeffdeb3c5cecdb6896d70c8edd46ce41c0491e12fb2b2256ff/pyee-12.1.1-py3-none-any.whl", hash = "sha256:18a19c650556bb6b32b406d7f017c8f513aceed1ef7ca618fb65de7bd2d347ef", size = 15527 }, +] + [[package]] name = "pygments" version = "2.19.1" @@ -3975,6 +4040,29 @@ crypto = [ { name = "cryptography", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] +[[package]] +name = "pylibsrtp" +version = "0.10.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cffi", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6b/ae/c95199144eed954976223bdce3f94564eb6c43567111aff8048a26a429bd/pylibsrtp-0.10.0.tar.gz", hash = "sha256:d8001912d7f51bd05b4ea3551747930631777fd37892cf3bfe0e541a742e699f", size = 10557 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1c/d2/ffc24f80e83a54d9b309cdae6b31cf9294b4f3a85ab107827fd272d1e687/pylibsrtp-0.10.0-cp38-abi3-macosx_10_9_x86_64.whl", hash = "sha256:6a1121ceea3339e0a84842a4a9da0fcf57cc8f99eb60dbf31a46d978b4170e7c", size = 1704188 }, + { url = "https://files.pythonhosted.org/packages/66/3e/db86a09a5cb290a274f76ce25f4fae3a7e3c4a4dbc64baf7e2aaa57a32bb/pylibsrtp-0.10.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:ca1994e73c6857b0a695fdde94cc5ac846c1b0d5d8766255a1dc2db40857f667", size = 2028580 }, + { url = "https://files.pythonhosted.org/packages/21/ab/9b2b5ad2ceaa1660de16e0a2e3c54a2043a9c4a3eef7718930c78dc84e77/pylibsrtp-0.10.0-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:bb7640b524544603d07bd4373b04c9582c8cfe41d9789d3f492081f053bed9c1", size = 2484470 }, + { url = "https://files.pythonhosted.org/packages/ab/e6/b0a30e79aa2312834b33f5e9c0ad459fc94e195c610634ee9665fafb1fc8/pylibsrtp-0.10.0-cp38-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f13aa945e1dcf8c138bf3d4a6e34056c4c2f69bf9934bc53b320ef14c7317ccc", size = 2078367 }, + { url = "https://files.pythonhosted.org/packages/16/78/9ea0c88490ad4fe9683ddf3bbee702c7a2331e83a333bb3aa52e8d7d909b/pylibsrtp-0.10.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1b2ef1c32d1145239dd0fe7b7fbe083334d345df6b4597fc66faf914a32682d9", size = 2134898 }, + { url = "https://files.pythonhosted.org/packages/00/f6/c76fa5401f9d95c14db70de0cf4fad922ad61686843bc3e7411178a64bc8/pylibsrtp-0.10.0-cp38-abi3-win32.whl", hash = "sha256:8c6fe2576b2ab13942b47db6c2ffe71f5eb1edc1dc3bdd7283169fecd5249e74", size = 1130881 }, + { url = "https://files.pythonhosted.org/packages/4c/31/85a58625edc0b6967fe0904c9d89d019bcece3f3e3bf775b9151a8cf9d0d/pylibsrtp-0.10.0-cp38-abi3-win_amd64.whl", hash = "sha256:cd965d4b0e9a77b362526cab119f4d9ce39b83f1f20f46c6af8e694b86fa19a7", size = 1448840 }, + { url = "https://files.pythonhosted.org/packages/66/b5/30b57cac6adf93dfee20cceba6cd91e216c81b723df2bc9dcfe781456263/pylibsrtp-0.10.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:582e9771be7ffd060faea215cb4248afdad1356da473df1b8f35c7e382ca3871", size = 1699981 }, + { url = "https://files.pythonhosted.org/packages/16/e8/3846ac56ae4a2de91e9b3e67dff5363b2b07148616d283416fd8dd8c6ca6/pylibsrtp-0.10.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:70111eeb87e5d3ffb9623e1ea036329dc81fed1282aa93c1f32377862ca0a0d8", size = 2441012 }, + { url = "https://files.pythonhosted.org/packages/b1/9f/c611fc47ef5d84dfffca0292bcfb2d78ee5fc1a98d50cf22dfcda3eee171/pylibsrtp-0.10.0-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:eda06947ab42fd3737f01a7b98537a5d5908434d37c70488d10e7bd2ff0d520c", size = 2019497 }, + { url = "https://files.pythonhosted.org/packages/d8/38/90c897fc2f2929290ada1032fa3e0bd39eca9190503250f6724a7bc22b5b/pylibsrtp-0.10.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:511158499309c3f7e97e1ebeffbf3dd939e641ea553de43cfc02d3576aad5c15", size = 2074919 }, + { url = "https://files.pythonhosted.org/packages/2c/46/e92f8a8d7cb5c1d68ec85254a8535aad922efa15646c7ba0c7746b42c4ea/pylibsrtp-0.10.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:4033481f332331bf14b9705dca69efd09d3809ba4a2ff69914c53dddf39c20c1", size = 1446426 }, +] + [[package]] name = "pymeta3" version = "0.5.1" @@ -4055,6 +4143,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/95/7b/8d0767251e687966cf19a4ad032d597ab135d26af5ecebbdb8895ea92cf0/pymongo-4.11.1-cp313-cp313t-win_amd64.whl", hash = "sha256:3854db4be39cb9e0c34add1fd7e515deab0b4ee30f3cc3978e057746d119ac12", size = 987871 }, ] +[[package]] +name = "pyopenssl" +version = "25.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cryptography", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "typing-extensions", marker = "(python_full_version < '3.13' and sys_platform == 'darwin') or (python_full_version < '3.13' and sys_platform == 'linux') or (python_full_version < '3.13' and sys_platform == 'win32')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9f/26/e25b4a374b4639e0c235527bbe31c0524f26eda701d79456a7e1877f4cc5/pyopenssl-25.0.0.tar.gz", hash = "sha256:cd2cef799efa3936bb08e8ccb9433a575722b9dd986023f1cabc4ae64e9dac16", size = 179573 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ca/d7/eb76863d2060dcbe7c7e6cccfd95ac02ea0b9acc37745a0d99ff6457aefb/pyOpenSSL-25.0.0-py3-none-any.whl", hash = "sha256:424c247065e46e76a37411b9ab1782541c23bb658bf003772c3405fbaa128e90", size = 56453 
}, +] + [[package]] name = "pyparsing" version = "3.2.1" @@ -4946,6 +5047,7 @@ onnx = [ { name = "onnxruntime-genai", marker = "(python_full_version < '3.13' and sys_platform == 'darwin') or (python_full_version < '3.13' and sys_platform == 'linux') or (python_full_version < '3.13' and sys_platform == 'win32')" }, ] openai-realtime = [ + { name = "aiortc", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "openai", extra = ["realtime"], marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] pandas = [ @@ -4993,6 +5095,7 @@ dev = [ [package.metadata] requires-dist = [ { name = "aiohttp", specifier = "~=3.8" }, + { name = "aiortc", marker = "extra == 'openai-realtime'", specifier = ">=1.9.0" }, { name = "anthropic", marker = "extra == 'anthropic'", specifier = "~=0.32" }, { name = "autogen-agentchat", marker = "extra == 'autogen'", specifier = ">=0.2,<0.4" }, { name = "azure-ai-inference", marker = "extra == 'azure'", specifier = ">=1.0.0b6" }, From b5c54434ea37193fb114d93d82dd17f95a409394 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Fri, 17 Jan 2025 15:59:52 +0100 Subject: [PATCH 11/50] added dep --- python/pyproject.toml | 3 ++- python/uv.lock | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index 56c7a6932298..a46cc1f92ec2 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -130,7 +130,8 @@ dapr = [ ] openai_realtime = [ "openai[realtime] ~= 1.0", - "aiortc>=1.9.0" + "aiortc>=1.9.0", + "sounddevice>=0.5.1", ] [tool.uv] diff --git a/python/uv.lock b/python/uv.lock index 63cf4711162a..b4507e84efff 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -5049,6 +5049,7 @@ onnx = [ openai-realtime = [ { name = "aiortc", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "openai", extra = ["realtime"], marker = "sys_platform == 
'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "sounddevice", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] pandas = [ { name = "pandas", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, @@ -5347,6 +5348,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a5/93/84a16940c44f6ec62cf334f25aed3128a514dffc361397eee09421a1c7f2/snoop-0.6.0-py3-none-any.whl", hash = "sha256:f5ea9060e65594bf404e6841086b4a964cc27bc30569109c91a470f948b0f729", size = 27461 }, ] +[[package]] +name = "sounddevice" +version = "0.5.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cffi", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/80/2d/b04ae180312b81dbb694504bee170eada5372242e186f6298139fd3a0513/sounddevice-0.5.1.tar.gz", hash = "sha256:09ca991daeda8ce4be9ac91e15a9a81c8f81efa6b695a348c9171ea0c16cb041", size = 52896 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/06/d1/464b5fca3decdd0cfec8c47f7b4161a0b12972453201c1bf03811f367c5e/sounddevice-0.5.1-py3-none-any.whl", hash = "sha256:e2017f182888c3f3c280d9fbac92e5dbddac024a7e3442f6e6116bd79dab8a9c", size = 32276 }, + { url = "https://files.pythonhosted.org/packages/6f/f6/6703fe7cf3d7b7279040c792aeec6334e7305956aba4a80f23e62c8fdc44/sounddevice-0.5.1-py3-none-macosx_10_6_x86_64.macosx_10_6_universal2.whl", hash = "sha256:d16cb23d92322526a86a9490c427bf8d49e273d9ccc0bd096feecd229cde6031", size = 107916 }, + { url = "https://files.pythonhosted.org/packages/57/a5/78a5e71f5ec0faedc54f4053775d61407bfbd7d0c18228c7f3d4252fd276/sounddevice-0.5.1-py3-none-win32.whl", hash = "sha256:d84cc6231526e7a08e89beff229c37f762baefe5e0cc2747cbe8e3a565470055", size = 312494 }, + { url = 
"https://files.pythonhosted.org/packages/af/9b/15217b04f3b36d30de55fef542389d722de63f1ad81f9c72d8afc98cb6ab/sounddevice-0.5.1-py3-none-win_amd64.whl", hash = "sha256:4313b63f2076552b23ac3e0abd3bcfc0c1c6a696fc356759a13bd113c9df90f1", size = 363634 }, +] + [[package]] name = "soupsieve" version = "2.6" From 6120ba1e3d42910bdd9e44db7c270027cd4d25e1 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Fri, 17 Jan 2025 16:02:11 +0100 Subject: [PATCH 12/50] added nd --- python/.cspell.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/.cspell.json b/python/.cspell.json index d236099b5943..5fa1d6aec631 100644 --- a/python/.cspell.json +++ b/python/.cspell.json @@ -47,6 +47,7 @@ "logprobs", "mistralai", "mongocluster", + "nd", "ndarray", "nopep", "NOSQL", @@ -72,4 +73,4 @@ "vertexai", "Weaviate" ] -} +} \ No newline at end of file From ecdb16a634742e69f1846bcff9cd6fb3d284d22f Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Fri, 17 Jan 2025 16:04:25 +0100 Subject: [PATCH 13/50] renamed --- python/semantic_kernel/contents/audio_content.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/semantic_kernel/contents/audio_content.py b/python/semantic_kernel/contents/audio_content.py index b28d2e23a0e0..5f356218ba2b 100644 --- a/python/semantic_kernel/contents/audio_content.py +++ b/python/semantic_kernel/contents/audio_content.py @@ -88,6 +88,6 @@ def to_dict(self) -> dict[str, Any]: return {"type": "audio_url", "audio_url": {"uri": str(self)}} @classmethod - def from_nd_array(cls: type[_T], data: ndarray, mime_type: str) -> _T: - """Create an instance from an nd array.""" + def from_ndarray(cls: type[_T], data: ndarray, mime_type: str) -> _T: + """Create an instance from an ndarray.""" return cls(data=data, mime_type=mime_type) From 8a2a5259e5422729ed8e5b667bce36e28dc64239 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Fri, 17 Jan 2025 16:13:42 +0100 Subject: [PATCH 14/50] changed import --- 
.../ai/open_ai/services/open_ai_realtime_base.py | 3 ++- .../semantic_kernel/connectors/ai/realtime_helpers.py | 10 +++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py index 7caf5a5671df..a4b86218f525 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py @@ -43,7 +43,6 @@ ) from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase -from semantic_kernel.connectors.ai.realtime_helpers import SKAudioTrack from semantic_kernel.contents.audio_content import AudioContent from semantic_kernel.contents.chat_history import ChatHistory from semantic_kernel.contents.function_call_content import FunctionCallContent @@ -682,6 +681,8 @@ async def create_session( **kwargs: Any, ) -> None: """Create a session in the service.""" + from semantic_kernel.connectors.ai.realtime_helpers import SKAudioTrack + ice_servers = [RTCIceServer(urls=["stun:stun.l.google.com:19302"])] self.peer_connection = RTCPeerConnection(configuration=RTCConfiguration(iceServers=ice_servers)) diff --git a/python/semantic_kernel/connectors/ai/realtime_helpers.py b/python/semantic_kernel/connectors/ai/realtime_helpers.py index 94549c402199..b89988f90ab3 100644 --- a/python/semantic_kernel/connectors/ai/realtime_helpers.py +++ b/python/semantic_kernel/connectors/ai/realtime_helpers.py @@ -5,11 +5,11 @@ from typing import Any, Final import numpy as np -import sounddevice as sd from aiortc.mediastreams import MediaStreamError, MediaStreamTrack from av.audio.frame import AudioFrame from av.frame import Frame from pydantic import Field, PrivateAttr +from sounddevice import InputStream, OutputStream from 
semantic_kernel.contents.audio_content import AudioContent from semantic_kernel.kernel_pydantic import KernelBaseModel @@ -37,7 +37,7 @@ class SKAudioTrack(KernelBaseModel, MediaStreamTrack): device: str | int | None = None queue: asyncio.Queue[Frame] = Field(default_factory=asyncio.Queue) is_recording: bool = False - stream: sd.InputStream | None = None + stream: InputStream | None = None frame_size: int = 0 _recording_task: asyncio.Task | None = None _loop: asyncio.AbstractEventLoop | None = None @@ -101,7 +101,7 @@ def callback(indata: np.ndarray, frames: int, time: Any, status: Any) -> None: if self._loop and self._loop.is_running(): asyncio.run_coroutine_threadsafe(self.queue.put(frame), self._loop) - self.stream = sd.InputStream( + self.stream = InputStream( device=self.device, channels=self.channels, samplerate=self.sample_rate, @@ -135,11 +135,11 @@ class SKSimplePlayer(KernelBaseModel): channels: int = PLAYER_CHANNELS frame_duration_ms: int = FRAME_DURATION queue: asyncio.Queue[np.ndarray] = Field(default_factory=asyncio.Queue) - _stream: sd.OutputStream | None = PrivateAttr(None) + _stream: OutputStream | None = PrivateAttr(None) def model_post_init(self, __context: Any) -> None: """Initialize the audio stream.""" - self._stream = sd.OutputStream( + self._stream = OutputStream( callback=self.callback, samplerate=self.sample_rate, channels=self.channels, From 4bef21a41d1ae2fcbc4b5a3ff683e127f5c35046 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Mon, 20 Jan 2025 16:31:12 +0100 Subject: [PATCH 15/50] restructured --- .../audio/04-chat_with_realtime_api.py | 7 +- .../connectors/ai/open_ai/__init__.py | 3 +- .../open_ai/services/open_ai_model_types.py | 1 + .../ai/open_ai/services/open_ai_realtime.py | 102 +-- .../open_ai/services/open_ai_realtime_base.py | 829 ------------------ .../ai/open_ai/services/realtime/__init__.py | 0 .../ai/open_ai/services/realtime/const.py | 54 ++ .../realtime/open_ai_realtime_base.py | 202 +++++ 
.../realtime/open_ai_realtime_webrtc.py | 307 +++++++ .../realtime/open_ai_realtime_websocket.py | 201 +++++ .../utils.py} | 0 .../connectors/ai/realtime_client_base.py | 87 +- 12 files changed, 858 insertions(+), 935 deletions(-) delete mode 100644 python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py create mode 100644 python/semantic_kernel/connectors/ai/open_ai/services/realtime/__init__.py create mode 100644 python/semantic_kernel/connectors/ai/open_ai/services/realtime/const.py create mode 100644 python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py create mode 100644 python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py create mode 100644 python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py rename python/semantic_kernel/connectors/ai/open_ai/services/{open_ai_realtime_utils.py => realtime/utils.py} (100%) diff --git a/python/samples/concepts/audio/04-chat_with_realtime_api.py b/python/samples/concepts/audio/04-chat_with_realtime_api.py index 902ad72d48d4..af4024e12849 100644 --- a/python/samples/concepts/audio/04-chat_with_realtime_api.py +++ b/python/samples/concepts/audio/04-chat_with_realtime_api.py @@ -9,11 +9,11 @@ from semantic_kernel import Kernel from semantic_kernel.connectors.ai import FunctionChoiceBehavior from semantic_kernel.connectors.ai.open_ai import ( + OpenAIRealtime, OpenAIRealtimeExecutionSettings, - OpenAIRealtimeWebRTC, TurnDetection, ) -from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime_base import ListenEvents +from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_websocket import ListenEvents from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase from semantic_kernel.connectors.ai.realtime_helpers import SKSimplePlayer from semantic_kernel.contents import ChatHistory @@ -26,6 +26,7 @@ aioice_log = logging.getLogger("aioice") 
aioice_log.setLevel(logging.WARNING) logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) # This simple sample demonstrates how to use the OpenAI Realtime API to create # a chat bot that can listen and respond directly through audio. @@ -119,7 +120,7 @@ async def main() -> None: # create the realtime client and optionally add the audio output function, this is optional audio_player = SKSimplePlayer() - realtime_client = OpenAIRealtimeWebRTC(audio_output=audio_player.realtime_client_callback) + realtime_client = OpenAIRealtime(protocol="webrtc", audio_output=audio_player.realtime_client_callback) # create stream receiver, this can play the audio, if the audio_player is passed # and allows you to print the transcript of the conversation diff --git a/python/semantic_kernel/connectors/ai/open_ai/__init__.py b/python/semantic_kernel/connectors/ai/open_ai/__init__.py index 2c2a87a64a7b..27d36ea30d34 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/__init__.py +++ b/python/semantic_kernel/connectors/ai/open_ai/__init__.py @@ -40,7 +40,7 @@ from semantic_kernel.connectors.ai.open_ai.services.azure_text_to_image import AzureTextToImage from semantic_kernel.connectors.ai.open_ai.services.open_ai_audio_to_text import OpenAIAudioToText from semantic_kernel.connectors.ai.open_ai.services.open_ai_chat_completion import OpenAIChatCompletion -from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime import OpenAIRealtime, OpenAIRealtimeWebRTC +from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime import OpenAIRealtime from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_completion import OpenAITextCompletion from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_embedding import OpenAITextEmbedding from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_to_audio import OpenAITextToAudio @@ -76,7 +76,6 @@ "OpenAIPromptExecutionSettings", "OpenAIRealtime", "OpenAIRealtimeExecutionSettings", - 
"OpenAIRealtimeWebRTC", "OpenAISettings", "OpenAITextCompletion", "OpenAITextEmbedding", diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_model_types.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_model_types.py index 7a1f43da234e..ea2e05deead7 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_model_types.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_model_types.py @@ -12,3 +12,4 @@ class OpenAIModelTypes(Enum): TEXT_TO_IMAGE = "text-to-image" AUDIO_TO_TEXT = "audio-to-text" TEXT_TO_AUDIO = "text-to-audio" + REALTIME = "realtime" diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py index 412d0814feb8..9ba373cce6ff 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py @@ -1,80 +1,37 @@ # Copyright (c) Microsoft. All rights reserved. 
+from ast import TypeVar from collections.abc import Mapping -from typing import Any +from typing import Any, ClassVar, Literal from openai import AsyncOpenAI from pydantic import ValidationError from semantic_kernel.connectors.ai.open_ai.services.open_ai_config_base import OpenAIConfigBase -from semantic_kernel.connectors.ai.open_ai.services.open_ai_handler import OpenAIModelTypes -from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime_base import ( - OpenAIRealtimeBase, - OpenAIRealtimeWebRTCBase, +from semantic_kernel.connectors.ai.open_ai.services.open_ai_model_types import OpenAIModelTypes +from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_base import OpenAIRealtimeBase +from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_webrtc import OpenAIRealtimeWebRTCBase +from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_websocket import ( + OpenAIRealtimeWebsocketBase, ) from semantic_kernel.connectors.ai.open_ai.settings.open_ai_settings import OpenAISettings from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError +_T = TypeVar("_T", bound="OpenAIRealtime") -class OpenAIRealtime(OpenAIRealtimeBase, OpenAIConfigBase): - """OpenAI Realtime service.""" - - def __init__( - self, - ai_model_id: str | None = None, - api_key: str | None = None, - org_id: str | None = None, - service_id: str | None = None, - default_headers: Mapping[str, str] | None = None, - async_client: AsyncOpenAI | None = None, - env_file_path: str | None = None, - env_file_encoding: str | None = None, - ) -> None: - """Initialize an OpenAITextCompletion service. - - Args: - ai_model_id (str | None): OpenAI model name, see - https://platform.openai.com/docs/models - service_id (str | None): Service ID tied to the execution settings. - api_key (str | None): The optional API key to use. If provided will override, - the env vars or .env file value. 
- org_id (str | None): The optional org ID to use. If provided will override, - the env vars or .env file value. - default_headers: The default headers mapping of string keys to - string values for HTTP requests. (Optional) - async_client (Optional[AsyncOpenAI]): An existing client to use. (Optional) - env_file_path (str | None): Use the environment settings file as a fallback to - environment variables. (Optional) - env_file_encoding (str | None): The encoding of the environment settings file. (Optional) - """ - try: - openai_settings = OpenAISettings.create( - api_key=api_key, - org_id=org_id, - text_model_id=ai_model_id, - env_file_path=env_file_path, - env_file_encoding=env_file_encoding, - ) - except ValidationError as ex: - raise ServiceInitializationError("Failed to create OpenAI settings.", ex) from ex - if not openai_settings.text_model_id: - raise ServiceInitializationError("The OpenAI text model ID is required.") - super().__init__( - ai_model_id=openai_settings.text_model_id, - service_id=service_id, - api_key=openai_settings.api_key.get_secret_value() if openai_settings.api_key else None, - org_id=openai_settings.org_id, - ai_model_type=OpenAIModelTypes.TEXT, - default_headers=default_headers, - client=async_client, - ) - -class OpenAIRealtimeWebRTC(OpenAIRealtimeWebRTCBase, OpenAIConfigBase): +class OpenAIRealtime(OpenAIConfigBase, OpenAIRealtimeBase): """OpenAI Realtime service.""" + def __new__(cls: type["_T"], *args: Any, **kwargs: Any) -> "_T": + """Pick the right subclass, based on protocol.""" + subclass_map = {subcl.protocol: subcl for subcl in cls.__subclasses__()} + subclass = subclass_map[kwargs.pop("protocol", "websocket")] + return super(OpenAIRealtime, subclass).__new__(subclass) + def __init__( self, + protocol: Literal["websocket", "webrtc"] = "websocket", ai_model_id: str | None = None, api_key: str | None = None, org_id: str | None = None, @@ -85,9 +42,10 @@ def __init__( env_file_encoding: str | None = None, **kwargs: Any, ) -> None: 
- """Initialize an OpenAITextCompletion service. + """Initialize an OpenAIRealtime service. Args: + protocol: The protocol to use, can be either "websocket" or "webrtc". ai_model_id (str | None): OpenAI model name, see https://platform.openai.com/docs/models service_id (str | None): Service ID tied to the execution settings. @@ -116,12 +74,32 @@ def __init__( if not openai_settings.realtime_model_id: raise ServiceInitializationError("The OpenAI text model ID is required.") super().__init__( + protocol=protocol, ai_model_id=openai_settings.realtime_model_id, service_id=service_id, api_key=openai_settings.api_key.get_secret_value() if openai_settings.api_key else None, org_id=openai_settings.org_id, - ai_model_type=OpenAIModelTypes.TEXT, + ai_model_type=OpenAIModelTypes.REALTIME, default_headers=default_headers, client=async_client, - **kwargs, ) + + +class OpenAIRealtimeWebRTC(OpenAIRealtime, OpenAIRealtimeWebRTCBase): + """OpenAI Realtime service using WebRTC protocol. + + This should not be used directly, use OpenAIRealtime instead. + Set protocol="webrtc" to use this class. + """ + + protocol: ClassVar[Literal["webrtc"]] = "webrtc" + + +class OpenAIRealtimeWebSocket(OpenAIRealtime, OpenAIRealtimeWebsocketBase): + """OpenAI Realtime service using WebSocket protocol. + + This should not be used directly, use OpenAIRealtime instead. + Set protocol="websocket" to use this class. + """ + + protocol: ClassVar[Literal["websocket"]] = "websocket" diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py deleted file mode 100644 index a4b86218f525..000000000000 --- a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_base.py +++ /dev/null @@ -1,829 +0,0 @@ -# Copyright (c) Microsoft. All rights reserved. 
- -import asyncio -import base64 -import contextlib -import json -import logging -import sys -from collections.abc import AsyncGenerator, Callable, Coroutine -from enum import Enum -from inspect import isawaitable -from typing import Any, ClassVar, Protocol, cast, runtime_checkable - -if sys.version_info >= (3, 12): - from typing import override # pragma: no cover -else: - from typing_extensions import override # pragma: no cover - -from aiohttp import ClientSession -from aiortc import ( - MediaStreamTrack, - RTCConfiguration, - RTCDataChannel, - RTCIceServer, - RTCPeerConnection, - RTCSessionDescription, -) -from av import AudioFrame -from openai._models import construct_type_unchecked -from openai.resources.beta.realtime.realtime import AsyncRealtimeConnection -from openai.types.beta.realtime.conversation_item_create_event_param import ConversationItemParam -from openai.types.beta.realtime.realtime_server_event import RealtimeServerEvent -from pydantic import Field, PrivateAttr - -from semantic_kernel.connectors.ai.function_call_choice_configuration import FunctionCallChoiceConfiguration -from semantic_kernel.connectors.ai.function_calling_utils import ( - prepare_settings_for_function_calling, -) -from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceType -from semantic_kernel.connectors.ai.open_ai.services.open_ai_handler import OpenAIHandler -from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime_utils import ( - update_settings_from_function_call_configuration, -) -from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings -from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase -from semantic_kernel.contents.audio_content import AudioContent -from semantic_kernel.contents.chat_history import ChatHistory -from semantic_kernel.contents.function_call_content import FunctionCallContent -from semantic_kernel.contents.function_result_content import 
FunctionResultContent -from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent -from semantic_kernel.contents.streaming_text_content import StreamingTextContent -from semantic_kernel.contents.text_content import TextContent -from semantic_kernel.contents.utils.author_role import AuthorRole -from semantic_kernel.kernel import Kernel -from semantic_kernel.utils.experimental_decorator import experimental_class - -logger: logging.Logger = logging.getLogger(__name__) - -# region Protocols - - -@runtime_checkable -@experimental_class -class EventCallBackProtocolAsync(Protocol): - """Event callback protocol.""" - - async def __call__( - self, - event: RealtimeServerEvent, - settings: PromptExecutionSettings | None = None, - **kwargs: Any, - ) -> tuple[Any, bool] | None: - """Call the event callback.""" - ... - - -@runtime_checkable -@experimental_class -class EventCallBackProtocol(Protocol): - """Event callback protocol.""" - - def __call__( - self, - event: RealtimeServerEvent, - settings: PromptExecutionSettings | None = None, - **kwargs: Any, - ) -> tuple[Any, bool] | None: - """Call the event callback.""" - ... 
- - -# region Events - - -@experimental_class -class SendEvents(str, Enum): - """Events that can be sent.""" - - SESSION_UPDATE = "session.update" - INPUT_AUDIO_BUFFER_APPEND = "input_audio_buffer.append" - INPUT_AUDIO_BUFFER_COMMIT = "input_audio_buffer.commit" - INPUT_AUDIO_BUFFER_CLEAR = "input_audio_buffer.clear" - CONVERSATION_ITEM_CREATE = "conversation.item.create" - CONVERSATION_ITEM_TRUNCATE = "conversation.item.truncate" - CONVERSATION_ITEM_DELETE = "conversation.item.delete" - RESPONSE_CREATE = "response.create" - RESPONSE_CANCEL = "response.cancel" - - -@experimental_class -class ListenEvents(str, Enum): - """Events that can be listened to.""" - - ERROR = "error" - SESSION_CREATED = "session.created" - SESSION_UPDATED = "session.updated" - CONVERSATION_CREATED = "conversation.created" - INPUT_AUDIO_BUFFER_COMMITTED = "input_audio_buffer.committed" - INPUT_AUDIO_BUFFER_CLEARED = "input_audio_buffer.cleared" - INPUT_AUDIO_BUFFER_SPEECH_STARTED = "input_audio_buffer.speech_started" - INPUT_AUDIO_BUFFER_SPEECH_STOPPED = "input_audio_buffer.speech_stopped" - CONVERSATION_ITEM_CREATED = "conversation.item.created" - CONVERSATION_ITEM_INPUT_AUDIO_TRANSCRIPTION_COMPLETED = "conversation.item.input_audio_transcription.completed" - CONVERSATION_ITEM_INPUT_AUDIO_TRANSCRIPTION_FAILED = "conversation.item.input_audio_transcription.failed" - CONVERSATION_ITEM_TRUNCATED = "conversation.item.truncated" - CONVERSATION_ITEM_DELETED = "conversation.item.deleted" - RESPONSE_CREATED = "response.created" - RESPONSE_DONE = "response.done" # contains usage info -> log - RESPONSE_OUTPUT_ITEM_ADDED = "response.output_item.added" - RESPONSE_OUTPUT_ITEM_DONE = "response.output_item.done" - RESPONSE_CONTENT_PART_ADDED = "response.content_part.added" - RESPONSE_CONTENT_PART_DONE = "response.content_part.done" - RESPONSE_TEXT_DELTA = "response.text.delta" - RESPONSE_TEXT_DONE = "response.text.done" - RESPONSE_AUDIO_TRANSCRIPT_DELTA = "response.audio_transcript.delta" - 
RESPONSE_AUDIO_TRANSCRIPT_DONE = "response.audio_transcript.done" - RESPONSE_AUDIO_DELTA = "response.audio.delta" - RESPONSE_AUDIO_DONE = "response.audio.done" - RESPONSE_FUNCTION_CALL_ARGUMENTS_DELTA = "response.function_call_arguments.delta" - RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE = "response.function_call_arguments.done" - RATE_LIMITS_UPDATED = "rate_limits.updated" - - -# region Websocket - - -@experimental_class -class OpenAIRealtimeBase(OpenAIHandler, RealtimeClientBase): - """OpenAI Realtime service.""" - - SUPPORTS_FUNCTION_CALLING: ClassVar[bool] = True - connection: AsyncRealtimeConnection | None = None - connected: asyncio.Event = Field(default_factory=asyncio.Event) - event_log: dict[str, list[RealtimeServerEvent]] = Field(default_factory=dict) - event_handlers: dict[str, list[EventCallBackProtocol | EventCallBackProtocolAsync]] = Field(default_factory=dict) - - def model_post_init(self, *args, **kwargs) -> None: - """Post init method for the model.""" - # Register the default event handlers - self.register_event_handler(ListenEvents.RESPONSE_AUDIO_DELTA, self.response_audio_delta_callback) - self.register_event_handler( - ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA, self.response_audio_transcript_delta_callback - ) - self.register_event_handler( - ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DONE, self.response_audio_transcript_done_callback - ) - self.register_event_handler( - ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE, self.response_function_call_arguments_delta_callback - ) - self.register_event_handler(ListenEvents.ERROR, self.error_callback) - self.register_event_handler(ListenEvents.SESSION_CREATED, self.session_callback) - self.register_event_handler(ListenEvents.SESSION_UPDATED, self.session_callback) - - def register_event_handler( - self, event_type: str | ListenEvents, handler: EventCallBackProtocol | EventCallBackProtocolAsync - ) -> None: - """Register a event handler.""" - if not isinstance(event_type, ListenEvents): - event_type = 
ListenEvents(event_type) - self.event_handlers.setdefault(event_type, []).append(handler) - - @override - async def start_listening( - self, - settings: "PromptExecutionSettings", - chat_history: "ChatHistory | None" = None, - **kwargs: Any, - ) -> AsyncGenerator[StreamingChatMessageContent, Any]: - await self.connected.wait() - if not self.connection: - raise ValueError("Connection is not established.") - if not chat_history: - chat_history = ChatHistory() - async for event in self.connection: - event_type = ListenEvents(event.type) - self.event_log.setdefault(event_type, []).append(event) - for handler in self.event_handlers.get(event_type, []): - task = handler(event=event, settings=settings) - if not task: - continue - if isawaitable(task): - async_result = await task - if not async_result: - continue - result, should_return = async_result - else: - result, should_return = task - if should_return: - yield result - else: - chat_history.add_message(result) - - for event_type in self.event_log: - logger.debug(f"Event type: {event_type}, count: {len(self.event_log[event_type])}") - - @override - async def start_sending(self, event: str | SendEvents, **kwargs: Any) -> None: - await self.connected.wait() - if not self.connection: - raise ValueError("Connection is not established.") - if not isinstance(event, SendEvents): - event = SendEvents(event) - match event: - case SendEvents.SESSION_UPDATE: - if "settings" not in kwargs: - logger.error("Event data does not contain 'settings'") - await self.connection.session.update(session=kwargs["settings"].prepare_settings_dict()) - case SendEvents.INPUT_AUDIO_BUFFER_APPEND: - if "content" not in kwargs: - logger.error("Event data does not contain 'content'") - return - await self.connection.input_audio_buffer.append(audio=kwargs["content"].data.decode("utf-8")) - case SendEvents.INPUT_AUDIO_BUFFER_COMMIT: - await self.connection.input_audio_buffer.commit() - case SendEvents.INPUT_AUDIO_BUFFER_CLEAR: - await 
self.connection.input_audio_buffer.clear() - case SendEvents.CONVERSATION_ITEM_CREATE: - if "item" not in kwargs: - logger.error("Event data does not contain 'item'") - return - content = kwargs["item"] - for item in content.items: - match item: - case TextContent(): - await self.connection.conversation.item.create( - item=ConversationItemParam( - type="message", - content=[ - { - "type": "input_text", - "text": item.text, - } - ], - role="user", - ) - ) - case FunctionCallContent(): - call_id = item.metadata.get("call_id") - if not call_id: - logger.error("Function call needs to have a call_id") - continue - await self.connection.conversation.item.create( - item=ConversationItemParam( - type="function_call", - name=item.name, - arguments=item.arguments, - call_id=call_id, - ) - ) - case FunctionResultContent(): - call_id = item.metadata.get("call_id") - if not call_id: - logger.error("Function result needs to have a call_id") - continue - await self.connection.conversation.item.create( - item=ConversationItemParam( - type="function_call_output", - output=item.result, - call_id=call_id, - ) - ) - case SendEvents.CONVERSATION_ITEM_TRUNCATE: - if "item_id" not in kwargs: - logger.error("Event data does not contain 'item_id'") - return - await self.connection.conversation.item.truncate( - item_id=kwargs["item_id"], content_index=0, audio_end_ms=kwargs.get("audio_end_ms", 0) - ) - case SendEvents.CONVERSATION_ITEM_DELETE: - if "item_id" not in kwargs: - logger.error("Event data does not contain 'item_id'") - return - await self.connection.conversation.item.delete(item_id=kwargs["item_id"]) - case SendEvents.RESPONSE_CREATE: - if "response" in kwargs: - await self.connection.response.create(response=kwargs["response"]) - else: - await self.connection.response.create() - case SendEvents.RESPONSE_CANCEL: - if "response_id" in kwargs: - await self.connection.response.cancel(response_id=kwargs["response_id"]) - else: - await self.connection.response.cancel() - - @override - 
async def create_session( - self, - settings: PromptExecutionSettings | None = None, - chat_history: ChatHistory | None = None, - **kwargs: Any, - ) -> None: - """Create a session in the service.""" - self.connection = await self.client.beta.realtime.connect(model=self.ai_model_id).enter() - self.connected.set() - if settings or chat_history or kwargs: - await self.update_session(settings=settings, chat_history=chat_history, **kwargs) - - @override - async def update_session( - self, settings: PromptExecutionSettings | None = None, chat_history: ChatHistory | None = None, **kwargs: Any - ) -> None: - if settings: - if "kernel" in kwargs: - settings = prepare_settings_for_function_calling( - settings, - self.get_prompt_execution_settings_class(), - self._update_function_choice_settings_callback(), - kernel=kwargs.get("kernel"), # type: ignore - ) - await self.start_sending(SendEvents.SESSION_UPDATE, settings=settings) - if chat_history and len(chat_history) > 0: - await asyncio.gather( - *(self.start_sending(SendEvents.CONVERSATION_ITEM_CREATE, item=msg) for msg in chat_history.messages) - ) - - @override - async def close_session(self) -> None: - """Close the session in the service.""" - if self.connected.is_set(): - await self.connection.close() - self.connection = None - self.connected.clear() - - # region Event callbacks - - def response_audio_delta_callback( - self, - event: RealtimeServerEvent, - settings: PromptExecutionSettings | None = None, - **kwargs: Any, - ) -> tuple[Any, bool]: - """Handle response audio delta.""" - return StreamingChatMessageContent( - role=AuthorRole.ASSISTANT, - items=[AudioContent(data=base64.b64decode(event.delta), data_format="base64")], - choice_index=event.content_index, - inner_content=event, - ), True - - def response_audio_transcript_delta_callback( - self, - event: RealtimeServerEvent, - settings: PromptExecutionSettings | None = None, - **kwargs: Any, - ) -> tuple[Any, bool]: - """Handle response audio transcript delta.""" 
- return StreamingChatMessageContent( - role=AuthorRole.ASSISTANT, - items=[StreamingTextContent(text=event.delta, choice_index=event.content_index)], - choice_index=event.content_index, - inner_content=event, - ), True - - def response_audio_transcript_done_callback( - self, - event: RealtimeServerEvent, - settings: PromptExecutionSettings | None = None, - **kwargs: Any, - ) -> tuple[Any, bool]: - """Handle response audio transcript done.""" - return StreamingChatMessageContent( - role=AuthorRole.ASSISTANT, - items=[StreamingTextContent(text=event.transcript, choice_index=event.content_index)], - choice_index=event.content_index, - inner_content=event, - ), False - - def response_function_call_arguments_delta_callback( - self, - event: RealtimeServerEvent, - settings: PromptExecutionSettings | None = None, - **kwargs: Any, - ) -> tuple[Any, bool]: - """Handle response function call arguments delta.""" - return StreamingChatMessageContent( - role=AuthorRole.ASSISTANT, - items=[ - FunctionCallContent( - id=event.item_id, - name=event.call_id, - arguments=event.delta, - index=event.output_index, - metadata={"call_id": event.call_id}, - ) - ], - choice_index=0, - inner_content=event, - ), True - - def error_callback( - self, - event: RealtimeServerEvent, - settings: PromptExecutionSettings | None = None, - **kwargs: Any, - ) -> None: - """Handle error.""" - logger.error("Error received: %s", event.error) - - def session_callback( - self, - event: RealtimeServerEvent, - settings: PromptExecutionSettings | None = None, - **kwargs: Any, - ) -> None: - """Handle session.""" - logger.debug("Session created or updated, session: %s", event.session) - - async def response_function_call_arguments_done_callback( - self, - event: RealtimeServerEvent, - settings: PromptExecutionSettings | None = None, - **kwargs: Any, - ) -> None: - """Handle response function call done.""" - item = FunctionCallContent( - id=event.item_id, - name=event.call_id, - arguments=event.delta, - 
index=event.output_index, - metadata={"call_id": event.call_id}, - ) - kernel: Kernel | None = kwargs.get("kernel") - call_id = item.name - function_name = next( - output_item_event.item.name - for output_item_event in self.event_log[ListenEvents.RESPONSE_OUTPUT_ITEM_ADDED] - if output_item_event.item.call_id == call_id - ) - item.plugin_name, item.function_name = function_name.split("-", 1) - if kernel: - chat_history = ChatHistory() - await kernel.invoke_function_call(item, chat_history) - await self.start_sending(SendEvents.CONVERSATION_ITEM_CREATE, item=chat_history.messages[-1]) - # The model doesn't start responding to the tool call automatically, so triggering it here. - await self.start_sending(SendEvents.RESPONSE_CREATE) - return chat_history.messages[-1], False - - @override - def get_prompt_execution_settings_class(self) -> type["PromptExecutionSettings"]: - from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( # noqa - OpenAIRealtimeExecutionSettings, - ) - - return OpenAIRealtimeExecutionSettings - - -# region WebRTC - - -@experimental_class -class OpenAIRealtimeWebRTCBase(OpenAIHandler, RealtimeClientBase): - """OpenAI WebRTC Realtime service.""" - - SUPPORTS_FUNCTION_CALLING: ClassVar[bool] = True - peer_connection: RTCPeerConnection | None = None - data_channel: RTCDataChannel | None = None - audio_output: Callable[[AudioFrame], Coroutine[Any, Any, None] | None] | None = None - kernel: Kernel | None = None - - _current_settings: PromptExecutionSettings | None = PrivateAttr(None) - _call_id_to_function_map: dict[str, str] = PrivateAttr(default_factory=dict) - - @override - async def start_listening( - self, - settings: "PromptExecutionSettings | None" = None, - chat_history: "ChatHistory | None" = None, - **kwargs: Any, - ) -> None: - pass - - async def _on_track(self, track: MediaStreamTrack) -> None: - logger.info(f"Received {track.kind} track from remote") - if track.kind != "audio": - 
return - while True: - try: - # This is a MediaStreamTrack, so the type is AudioFrame - # this might need to be updated if video becomes part of this - frame: AudioFrame = await track.recv() # type: ignore - except Exception as e: - logger.error(f"Error getting audio frame: {e!s}") - break - - try: - if self.audio_output: - out = self.audio_output(frame) - if isawaitable(out): - await out - - except Exception as e: - logger.error(f"Error playing remote audio frame: {e!s}") - try: - await self.receive_buffer.put( - ( - ListenEvents.RESPONSE_AUDIO_DELTA, - StreamingChatMessageContent( - role=AuthorRole.ASSISTANT, - items=[AudioContent(data=frame.to_ndarray(), data_format="np.int16", inner_content=frame)], # type: ignore - choice_index=0, - ), - ), - ) - except Exception as e: - logger.error(f"Error processing remote audio frame: {e!s}") - await asyncio.sleep(0.01) - - async def _on_data(self, data: str) -> None: - """This method is called whenever a data channel message is received. - - The data is parsed into a RealtimeServerEvent (by OpenAI code) and then processed. 
- """ - try: - event = cast( - RealtimeServerEvent, - construct_type_unchecked(value=json.loads(data), type_=cast(Any, RealtimeServerEvent)), - ) - except Exception as e: - logger.error(f"Failed to parse event {data} with error: {e!s}") - return - match event.type: - case ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA: - await self.receive_buffer.put(( - event.type, - StreamingChatMessageContent( - role=AuthorRole.ASSISTANT, - content=event.delta, - choice_index=event.content_index, - inner_content=event, - ), - )) - case ListenEvents.RESPONSE_OUTPUT_ITEM_ADDED: - if event.item.type == "function_call": - self._call_id_to_function_map[event.item.call_id] = event.item.name - case ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DELTA: - await self.receive_buffer.put(( - event.type, - StreamingChatMessageContent( - role=AuthorRole.ASSISTANT, - items=[ - FunctionCallContent( - id=event.item_id, - name=event.call_id, - arguments=event.delta, - index=event.output_index, - metadata={"call_id": event.call_id}, - ) - ], - choice_index=0, - inner_content=event, - ), - )) - case ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE: - await self._handle_function_call_arguments_done(event) - case ListenEvents.ERROR: - logger.error("Error received: %s", event.error) - case ListenEvents.SESSION_CREATED, ListenEvents.SESSION_UPDATED: - logger.info("Session created or updated, session: %s", event.session) - case _: - logger.debug(f"Received event: {event}") - # we put all event in the output buffer, but after the interpreted one. - # so when dealing with them, make sure to check the type of the event, since they - # might be of different types. 
- await self.receive_buffer.put((event.type, event)) - - @override - async def start_sending(self, **kwargs: Any) -> None: - while True: - item = await self.send_buffer.get() - if not item: - continue - if isinstance(item, tuple): - event, data = item - else: - event = item - data = {} - if not isinstance(event, SendEvents): - event = SendEvents(event) - response: dict[str, Any] = {"type": event.value} - match event: - case SendEvents.SESSION_UPDATE: - if "settings" not in data: - logger.error("Event data does not contain 'settings'") - response["session"] = data["settings"].prepare_settings_dict() - case SendEvents.CONVERSATION_ITEM_CREATE: - if "item" not in data: - logger.error("Event data does not contain 'item'") - return - content = data["item"] - for item in content.items: - match item: - case TextContent(): - response["item"] = ConversationItemParam( - type="message", - content=[ - { - "type": "input_text", - "text": item.text, - } - ], - role="user", - ) - - case FunctionCallContent(): - call_id = item.metadata.get("call_id") - if not call_id: - logger.error("Function call needs to have a call_id") - continue - response["item"] = ConversationItemParam( - type="function_call", - name=item.name, - arguments=item.arguments, - call_id=call_id, - ) - - case FunctionResultContent(): - call_id = item.metadata.get("call_id") - if not call_id: - logger.error("Function result needs to have a call_id") - continue - response["item"] = ConversationItemParam( - type="function_call_output", - output=item.result, - call_id=call_id, - ) - - case SendEvents.CONVERSATION_ITEM_TRUNCATE: - if "item_id" not in data: - logger.error("Event data does not contain 'item_id'") - return - response["item_id"] = data["item_id"] - response["content_index"] = 0 - response["audio_end_ms"] = data.get("audio_end_ms", 0) - - case SendEvents.CONVERSATION_ITEM_DELETE: - if "item_id" not in data: - logger.error("Event data does not contain 'item_id'") - return - response["item_id"] = 
data["item_id"] - case SendEvents.RESPONSE_CREATE: - if "response" in data: - response["response"] = data["response"] - case SendEvents.RESPONSE_CANCEL: - if "response_id" in data: - response["response_id"] = data["response_id"] - - if self.data_channel: - while self.data_channel.readyState != "open": - await asyncio.sleep(0.1) - try: - self.data_channel.send(json.dumps(response)) - except Exception as e: - logger.error(f"Failed to send event {event} with error: {e!s}") - - @override - async def create_session( - self, - settings: PromptExecutionSettings | None = None, - chat_history: ChatHistory | None = None, - audio_track: MediaStreamTrack | None = None, - **kwargs: Any, - ) -> None: - """Create a session in the service.""" - from semantic_kernel.connectors.ai.realtime_helpers import SKAudioTrack - - ice_servers = [RTCIceServer(urls=["stun:stun.l.google.com:19302"])] - self.peer_connection = RTCPeerConnection(configuration=RTCConfiguration(iceServers=ice_servers)) - - self.peer_connection.on("track")(self._on_track) - - self.data_channel = self.peer_connection.createDataChannel("oai-events", protocol="json") - self.data_channel.on("message")(self._on_data) - - self.peer_connection.addTransceiver(audio_track or SKAudioTrack(), "sendrecv") - - offer = await self.peer_connection.createOffer() - await self.peer_connection.setLocalDescription(offer) - - try: - ephemeral_token = await self.get_ephemeral_token() - headers = {"Authorization": f"Bearer {ephemeral_token}", "Content-Type": "application/sdp"} - - async with ( - ClientSession() as session, - session.post( - f"{self.client.beta.realtime._client.base_url}realtime?model={self.ai_model_id}", - headers=headers, - data=offer.sdp, - ) as response, - ): - if response.status not in [200, 201]: - error_text = await response.text() - raise Exception(f"OpenAI WebRTC error: {error_text}") - - sdp_answer = await response.text() - answer = RTCSessionDescription(sdp=sdp_answer, type="answer") - await 
self.peer_connection.setRemoteDescription(answer) - logger.info("Connected to OpenAI WebRTC") - - except Exception as e: - logger.error(f"Failed to connect to OpenAI: {e!s}") - raise - - if settings or chat_history or kwargs: - await self.update_session(settings=settings, chat_history=chat_history, **kwargs) - - @override - async def update_session( - self, - settings: PromptExecutionSettings | None = None, - chat_history: ChatHistory | None = None, - create_response: bool = True, - **kwargs: Any, - ) -> None: - if "kernel" in kwargs: - self.kernel = kwargs["kernel"] - if settings: - self._current_settings = settings - if self._current_settings and self.kernel: - self._current_settings = prepare_settings_for_function_calling( - self._current_settings, - self.get_prompt_execution_settings_class(), - self._update_function_choice_settings_callback(), - kernel=self.kernel, # type: ignore - ) - await self.send_buffer.put((SendEvents.SESSION_UPDATE, {"settings": self._current_settings})) - if chat_history and len(chat_history) > 0: - for msg in chat_history.messages: - await self.send_buffer.put((SendEvents.CONVERSATION_ITEM_CREATE, {"item": msg})) - if create_response: - await self.send_buffer.put(SendEvents.RESPONSE_CREATE) - - @override - async def close_session(self) -> None: - """Close the session in the service.""" - if self.peer_connection: - with contextlib.suppress(asyncio.CancelledError): - await self.peer_connection.close() - self.peer_connection = None - if self.data_channel: - with contextlib.suppress(asyncio.CancelledError): - self.data_channel.close() - self.data_channel = None - - async def _handle_function_call_arguments_done( - self, - event: RealtimeServerEvent, - ) -> None: - """Handle response function call done.""" - plugin_name, function_name = self._call_id_to_function_map.pop(event.call_id, "-").split("-", 1) - if not plugin_name or not function_name: - logger.error("Function call needs to have a plugin name and function name") - return - item = 
FunctionCallContent( - id=event.item_id, - plugin_name=plugin_name, - function_name=function_name, - arguments=event.arguments, - index=event.output_index, - metadata={"call_id": event.call_id}, - ) - if not self.kernel and not self._current_settings.function_choice_behavior.auto_invoke_kernel_functions: - return - chat_history = ChatHistory() - await self.kernel.invoke_function_call(item, chat_history) - created_output = chat_history.messages[-1] - # This returns the output to the service - await self.send_buffer.put((SendEvents.CONVERSATION_ITEM_CREATE, {"item": created_output})) - # The model doesn't start responding to the tool call automatically, so triggering it here. - await self.send_buffer.put(SendEvents.RESPONSE_CREATE) - # This allows a user to have a full conversation in his code - await self.receive_buffer.put((ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE, created_output)) - - async def get_ephemeral_token(self) -> str: - """Get an ephemeral token from OpenAI.""" - headers = {"Authorization": f"Bearer {self.client.api_key}", "Content-Type": "application/json"} - data = {"model": self.ai_model_id, "voice": "echo"} - - try: - async with ( - ClientSession() as session, - session.post( - f"{self.client.beta.realtime._client.base_url}/realtime/sessions", headers=headers, json=data - ) as response, - ): - if response.status not in [200, 201]: - error_text = await response.text() - raise Exception(f"Failed to get ephemeral token: {error_text}") - - result = await response.json() - return result["client_secret"]["value"] - - except Exception as e: - logger.error(f"Failed to get ephemeral token: {e!s}") - raise - - @override - def get_prompt_execution_settings_class(self) -> type["PromptExecutionSettings"]: - from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( # noqa - OpenAIRealtimeExecutionSettings, - ) - - return OpenAIRealtimeExecutionSettings - - @override - def 
_update_function_choice_settings_callback( - self, - ) -> Callable[[FunctionCallChoiceConfiguration, "PromptExecutionSettings", FunctionChoiceType], None]: - return update_settings_from_function_call_configuration diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/__init__.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/const.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/const.py new file mode 100644 index 000000000000..533e00d24d53 --- /dev/null +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/const.py @@ -0,0 +1,54 @@ +# Copyright (c) Microsoft. All rights reserved. + +from enum import Enum + +from semantic_kernel.utils.experimental_decorator import experimental_class + + +@experimental_class +class SendEvents(str, Enum): + """Events that can be sent.""" + + SESSION_UPDATE = "session.update" + INPUT_AUDIO_BUFFER_APPEND = "input_audio_buffer.append" + INPUT_AUDIO_BUFFER_COMMIT = "input_audio_buffer.commit" + INPUT_AUDIO_BUFFER_CLEAR = "input_audio_buffer.clear" + CONVERSATION_ITEM_CREATE = "conversation.item.create" + CONVERSATION_ITEM_TRUNCATE = "conversation.item.truncate" + CONVERSATION_ITEM_DELETE = "conversation.item.delete" + RESPONSE_CREATE = "response.create" + RESPONSE_CANCEL = "response.cancel" + + +@experimental_class +class ListenEvents(str, Enum): + """Events that can be listened to.""" + + ERROR = "error" + SESSION_CREATED = "session.created" + SESSION_UPDATED = "session.updated" + CONVERSATION_CREATED = "conversation.created" + INPUT_AUDIO_BUFFER_COMMITTED = "input_audio_buffer.committed" + INPUT_AUDIO_BUFFER_CLEARED = "input_audio_buffer.cleared" + INPUT_AUDIO_BUFFER_SPEECH_STARTED = "input_audio_buffer.speech_started" + INPUT_AUDIO_BUFFER_SPEECH_STOPPED = "input_audio_buffer.speech_stopped" + CONVERSATION_ITEM_CREATED = 
"conversation.item.created" + CONVERSATION_ITEM_INPUT_AUDIO_TRANSCRIPTION_COMPLETED = "conversation.item.input_audio_transcription.completed" + CONVERSATION_ITEM_INPUT_AUDIO_TRANSCRIPTION_FAILED = "conversation.item.input_audio_transcription.failed" + CONVERSATION_ITEM_TRUNCATED = "conversation.item.truncated" + CONVERSATION_ITEM_DELETED = "conversation.item.deleted" + RESPONSE_CREATED = "response.created" + RESPONSE_DONE = "response.done" # contains usage info -> log + RESPONSE_OUTPUT_ITEM_ADDED = "response.output_item.added" + RESPONSE_OUTPUT_ITEM_DONE = "response.output_item.done" + RESPONSE_CONTENT_PART_ADDED = "response.content_part.added" + RESPONSE_CONTENT_PART_DONE = "response.content_part.done" + RESPONSE_TEXT_DELTA = "response.text.delta" + RESPONSE_TEXT_DONE = "response.text.done" + RESPONSE_AUDIO_TRANSCRIPT_DELTA = "response.audio_transcript.delta" + RESPONSE_AUDIO_TRANSCRIPT_DONE = "response.audio_transcript.done" + RESPONSE_AUDIO_DELTA = "response.audio.delta" + RESPONSE_AUDIO_DONE = "response.audio.done" + RESPONSE_FUNCTION_CALL_ARGUMENTS_DELTA = "response.function_call_arguments.delta" + RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE = "response.function_call_arguments.done" + RATE_LIMITS_UPDATED = "rate_limits.updated" diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py new file mode 100644 index 000000000000..9f72ee1fd5d1 --- /dev/null +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py @@ -0,0 +1,202 @@ +# Copyright (c) Microsoft. All rights reserved. 
+ +import logging +import sys +from collections.abc import Callable, Coroutine +from typing import TYPE_CHECKING, Any, ClassVar, Literal + +if sys.version_info >= (3, 12): + from typing import override # pragma: no cover +else: + from typing_extensions import override # pragma: no cover + +from openai.types.beta.realtime.realtime_server_event import RealtimeServerEvent +from openai.types.beta.realtime.response_function_call_arguments_done_event import ( + ResponseFunctionCallArgumentsDoneEvent, +) +from pydantic import PrivateAttr + +from semantic_kernel.connectors.ai.function_call_choice_configuration import FunctionCallChoiceConfiguration +from semantic_kernel.connectors.ai.function_calling_utils import ( + prepare_settings_for_function_calling, +) +from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceType +from semantic_kernel.connectors.ai.open_ai.services.open_ai_handler import OpenAIHandler +from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents, SendEvents +from semantic_kernel.connectors.ai.open_ai.services.realtime.utils import ( + update_settings_from_function_call_configuration, +) +from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings +from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase +from semantic_kernel.contents.chat_history import ChatHistory +from semantic_kernel.contents.function_call_content import FunctionCallContent +from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent +from semantic_kernel.contents.utils.author_role import AuthorRole +from semantic_kernel.kernel import Kernel +from semantic_kernel.utils.experimental_decorator import experimental_class + +if TYPE_CHECKING: + from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings + from semantic_kernel.contents.chat_history import ChatHistory + + +logger: logging.Logger = 
logging.getLogger(__name__) + + +@experimental_class +class OpenAIRealtimeBase(OpenAIHandler, RealtimeClientBase): + """OpenAI Realtime service.""" + + protocol: ClassVar[Literal["websocket", "webrtc"]] = "websocket" + SUPPORTS_FUNCTION_CALLING: ClassVar[bool] = True + audio_output: Callable[[Any], Coroutine[Any, Any, None] | None] | None = None + kernel: Kernel | None = None + + _current_settings: PromptExecutionSettings | None = PrivateAttr(None) + _call_id_to_function_map: dict[str, str] = PrivateAttr(default_factory=dict) + + async def _handle_event(self, event: RealtimeServerEvent) -> None: + """Handle all events but audio delta. + + Audio delta has to be handled by the implementation of the protocol as some + protocols have different ways of handling audio. + + + """ + match event.type: + case ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA.value: + await self.receive_buffer.put(( + event.type, + StreamingChatMessageContent( + role=AuthorRole.ASSISTANT, + content=event.delta, + choice_index=event.content_index, + inner_content=event, + ), + )) + case ListenEvents.RESPONSE_OUTPUT_ITEM_ADDED.value: + if event.item.type == "function_call" and event.item.call_id and event.item.name: + self._call_id_to_function_map[event.item.call_id] = event.item.name + case ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DELTA.value: + await self.receive_buffer.put(( + event.type, + StreamingChatMessageContent( + role=AuthorRole.ASSISTANT, + items=[ + FunctionCallContent( + id=event.item_id, + name=event.call_id, + arguments=event.delta, + index=event.output_index, + metadata={"call_id": event.call_id}, + ) + ], + choice_index=0, + inner_content=event, + ), + )) + case ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE.value: + await self._handle_function_call_arguments_done(event) + case ListenEvents.ERROR.value: + logger.error("Error received: %s", event.error) + case ListenEvents.SESSION_CREATED.value, ListenEvents.SESSION_UPDATED.value: + logger.info("Session created or updated, 
session: %s", event.session) + case _: + logger.debug(f"Received event: {event}") + # we put all event in the output buffer, but after the interpreted one. + # so when dealing with them, make sure to check the type of the event, since they + # might be of different types. + await self.receive_buffer.put((event.type, event)) + + @override + async def update_session( + self, + settings: PromptExecutionSettings | None = None, + chat_history: ChatHistory | None = None, + create_response: bool = False, + **kwargs: Any, + ) -> None: + if "kernel" in kwargs: + self.kernel = kwargs["kernel"] + if settings: + self._current_settings = settings + if self._current_settings and self.kernel: + self._current_settings = prepare_settings_for_function_calling( + self._current_settings, + self.get_prompt_execution_settings_class(), + self._update_function_choice_settings_callback(), + kernel=self.kernel, # type: ignore + ) + await self.send(SendEvents.SESSION_UPDATE, settings=self._current_settings) + if chat_history and len(chat_history) > 0: + for msg in chat_history.messages: + await self.send(SendEvents.CONVERSATION_ITEM_CREATE, item=msg) + if create_response: + await self.send(SendEvents.RESPONSE_CREATE) + + @override + def get_prompt_execution_settings_class(self) -> type["PromptExecutionSettings"]: + from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( # noqa + OpenAIRealtimeExecutionSettings, + ) + + return OpenAIRealtimeExecutionSettings + + @override + def _update_function_choice_settings_callback( + self, + ) -> Callable[[FunctionCallChoiceConfiguration, "PromptExecutionSettings", FunctionChoiceType], None]: + return update_settings_from_function_call_configuration + + async def _handle_function_call_arguments_done( + self, + event: ResponseFunctionCallArgumentsDoneEvent, + ) -> None: + """Handle response function call done.""" + if not self.kernel or ( + self._current_settings + and 
self._current_settings.function_choice_behavior + and not self._current_settings.function_choice_behavior.auto_invoke_kernel_functions + ): + return + plugin_name, function_name = self._call_id_to_function_map.pop(event.call_id, "-").split("-", 1) + if not plugin_name or not function_name: + logger.error("Function call needs to have a plugin name and function name") + return + item = FunctionCallContent( + id=event.item_id, + plugin_name=plugin_name, + function_name=function_name, + arguments=event.arguments, + index=event.output_index, + metadata={"call_id": event.call_id}, + ) + chat_history = ChatHistory() + await self.kernel.invoke_function_call(item, chat_history) + created_output = chat_history.messages[-1] + # This returns the output to the service + await self.send(SendEvents.CONVERSATION_ITEM_CREATE, item=created_output) + # The model doesn't start responding to the tool call automatically, so triggering it here. + await self.send(SendEvents.RESPONSE_CREATE) + # This allows a user to have a full conversation in his code + await self.receive_buffer.put((ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE, created_output)) + + @override + async def start_listening( + self, settings: PromptExecutionSettings | None = None, chat_history: ChatHistory | None = None, **kwargs: Any + ) -> None: + pass + + @override + async def start_sending(self, **kwargs: Any) -> None: + pass + + @override + async def create_session( + self, settings: PromptExecutionSettings | None = None, chat_history: ChatHistory | None = None, **kwargs: Any + ) -> None: + pass + + @override + async def close_session(self) -> None: + pass diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py new file mode 100644 index 000000000000..583e34bfd997 --- /dev/null +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py @@ 
-0,0 +1,307 @@ +# Copyright (c) Microsoft. All rights reserved. + +import asyncio +import contextlib +import json +import logging +import sys +from collections.abc import Callable, Coroutine +from inspect import isawaitable +from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast + +from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_base import OpenAIRealtimeBase + +if sys.version_info >= (3, 12): + from typing import override # pragma: no cover +else: + from typing_extensions import override # pragma: no cover + +from aiohttp import ClientSession +from aiortc import ( + RTCConfiguration, + RTCDataChannel, + RTCIceServer, + RTCPeerConnection, + RTCSessionDescription, +) +from av.audio.frame import AudioFrame +from openai._models import construct_type_unchecked +from openai.types.beta.realtime.conversation_item_param import ConversationItemParam +from openai.types.beta.realtime.realtime_server_event import RealtimeServerEvent + +from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents, SendEvents +from semantic_kernel.contents.audio_content import AudioContent +from semantic_kernel.contents.function_call_content import FunctionCallContent +from semantic_kernel.contents.function_result_content import FunctionResultContent +from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent +from semantic_kernel.contents.text_content import TextContent +from semantic_kernel.contents.utils.author_role import AuthorRole +from semantic_kernel.utils.experimental_decorator import experimental_class + +if TYPE_CHECKING: + from aiortc import MediaStreamTrack + + from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings + from semantic_kernel.contents.chat_history import ChatHistory + + +logger: logging.Logger = logging.getLogger(__name__) + + +@experimental_class +class OpenAIRealtimeWebRTCBase(OpenAIRealtimeBase): + """OpenAI WebRTC Realtime 
service.""" + + protocol: ClassVar[Literal["webrtc"]] = "webrtc" + peer_connection: RTCPeerConnection | None = None + data_channel: RTCDataChannel | None = None + audio_output: Callable[[AudioFrame], Coroutine[Any, Any, None] | None] | None = None + + # region public methods + + @override + async def start_listening( + self, + settings: "PromptExecutionSettings | None" = None, + chat_history: "ChatHistory | None" = None, + create_response: bool = False, + **kwargs: Any, + ) -> None: + if chat_history or settings or create_response: + await self.update_session(settings=settings, chat_history=chat_history, create_response=create_response) + + @override + async def start_sending(self, **kwargs: Any) -> None: + if not self.data_channel: + logger.error("Data channel not initialized") + return + while self.data_channel.readyState != "open": + await asyncio.sleep(0.1) + while True: + event, data = await self.send_buffer.get() + if not isinstance(event, SendEvents): + event = SendEvents(event) + response: dict[str, Any] = {"type": event.value} + match event: + case SendEvents.SESSION_UPDATE: + if "settings" not in data: + logger.error("Event data does not contain 'settings'") + response["session"] = data["settings"].prepare_settings_dict() + case SendEvents.CONVERSATION_ITEM_CREATE: + if "item" not in data: + logger.error("Event data does not contain 'item'") + return + content = data["item"] + for item in content.items: + match item: + case TextContent(): + response["item"] = ConversationItemParam( + type="message", + content=[ + { + "type": "input_text", + "text": item.text, + } + ], + role="user", + ) + + case FunctionCallContent(): + call_id = item.metadata.get("call_id") + if not call_id: + logger.error("Function call needs to have a call_id") + continue + response["item"] = ConversationItemParam( + type="function_call", + name=item.name or item.function_name, + arguments="" + if not item.arguments + else item.arguments + if isinstance(item.arguments, str) + else 
json.dumps(item.arguments), + call_id=call_id, + ) + + case FunctionResultContent(): + call_id = item.metadata.get("call_id") + if not call_id: + logger.error("Function result needs to have a call_id") + continue + response["item"] = ConversationItemParam( + type="function_call_output", + output=item.result, + call_id=call_id, + ) + + case SendEvents.CONVERSATION_ITEM_TRUNCATE: + if "item_id" not in data: + logger.error("Event data does not contain 'item_id'") + return + response["item_id"] = data["item_id"] + response["content_index"] = 0 + response["audio_end_ms"] = data.get("audio_end_ms", 0) + + case SendEvents.CONVERSATION_ITEM_DELETE: + if "item_id" not in data: + logger.error("Event data does not contain 'item_id'") + return + response["item_id"] = data["item_id"] + case SendEvents.RESPONSE_CREATE: + if "response" in data: + response["response"] = data["response"] + case SendEvents.RESPONSE_CANCEL: + if "response_id" in data: + response["response_id"] = data["response_id"] + + try: + self.data_channel.send(json.dumps(response)) + except Exception as e: + logger.error(f"Failed to send event {event} with error: {e!s}") + + @override + async def create_session( + self, + settings: "PromptExecutionSettings | None" = None, + chat_history: "ChatHistory | None" = None, + audio_track: "MediaStreamTrack | None" = None, + **kwargs: Any, + ) -> None: + """Create a session in the service.""" + if not audio_track: + from semantic_kernel.connectors.ai.realtime_helpers import SKAudioTrack + + audio_track = SKAudioTrack() + + self.peer_connection = RTCPeerConnection( + configuration=RTCConfiguration(iceServers=[RTCIceServer(urls="stun:stun.l.google.com:19302")]) + ) + + # track is the audio track being returned from the service + self.peer_connection.on("track")(self._on_track) + + # data channel is used to send and receive messages + self.data_channel = self.peer_connection.createDataChannel("oai-events", protocol="json") + self.data_channel.on("message")(self._on_data) + 
+ # this is the incoming audio, which sends audio to the service + self.peer_connection.addTransceiver(audio_track) + + offer = await self.peer_connection.createOffer() + await self.peer_connection.setLocalDescription(offer) + + try: + ephemeral_token = await self._get_ephemeral_token() + headers = {"Authorization": f"Bearer {ephemeral_token}", "Content-Type": "application/sdp"} + + async with ( + ClientSession() as session, + session.post( + f"{self.client.beta.realtime._client.base_url}realtime?model={self.ai_model_id}", + headers=headers, + data=offer.sdp, + ) as response, + ): + if response.status not in [200, 201]: + error_text = await response.text() + raise Exception(f"OpenAI WebRTC error: {error_text}") + + sdp_answer = await response.text() + answer = RTCSessionDescription(sdp=sdp_answer, type="answer") + await self.peer_connection.setRemoteDescription(answer) + logger.info("Connected to OpenAI WebRTC") + + except Exception as e: + logger.error(f"Failed to connect to OpenAI: {e!s}") + raise + + if settings or chat_history or kwargs: + await self.update_session(settings=settings, chat_history=chat_history, **kwargs) + + @override + async def close_session(self) -> None: + """Close the session in the service.""" + if self.peer_connection: + with contextlib.suppress(asyncio.CancelledError): + await self.peer_connection.close() + self.peer_connection = None + if self.data_channel: + with contextlib.suppress(asyncio.CancelledError): + self.data_channel.close() + self.data_channel = None + + # region implementation specifics + + async def _on_track(self, track: "MediaStreamTrack") -> None: + logger.info(f"Received {track.kind} track from remote") + if track.kind != "audio": + return + while True: + try: + # This is a MediaStreamTrack, so the type is AudioFrame + # this might need to be updated if video becomes part of this + frame: AudioFrame = await track.recv() # type: ignore + except Exception as e: + logger.error(f"Error getting audio frame: {e!s}") + break 
+ + try: + if self.audio_output: + out = self.audio_output(frame) + if isawaitable(out): + await out + + except Exception as e: + logger.error(f"Error playing remote audio frame: {e!s}") + try: + await self.receive_buffer.put( + ( + ListenEvents.RESPONSE_AUDIO_DELTA, + StreamingChatMessageContent( + role=AuthorRole.ASSISTANT, + items=[AudioContent(data=frame.to_ndarray(), data_format="np.int16", inner_content=frame)], # type: ignore + choice_index=0, + ), + ), + ) + except Exception as e: + logger.error(f"Error processing remote audio frame: {e!s}") + await asyncio.sleep(0.01) + + async def _on_data(self, data: str) -> None: + """This method is called whenever a data channel message is received. + + The data is parsed into a RealtimeServerEvent (by OpenAI code) and then processed. + Audio data is not send through this channel, use _on_track for that. + """ + try: + event = cast( + RealtimeServerEvent, + construct_type_unchecked(value=json.loads(data), type_=cast(Any, RealtimeServerEvent)), + ) + except Exception as e: + logger.error(f"Failed to parse event {data} with error: {e!s}") + return + await self._handle_event(event) + + async def _get_ephemeral_token(self) -> str: + """Get an ephemeral token from OpenAI.""" + headers = {"Authorization": f"Bearer {self.client.api_key}", "Content-Type": "application/json"} + data = {"model": self.ai_model_id, "voice": "echo"} + + try: + async with ( + ClientSession() as session, + session.post( + f"{self.client.beta.realtime._client.base_url}/realtime/sessions", headers=headers, json=data + ) as response, + ): + if response.status not in [200, 201]: + error_text = await response.text() + raise Exception(f"Failed to get ephemeral token: {error_text}") + + result = await response.json() + return result["client_secret"]["value"] + + except Exception as e: + logger.error(f"Failed to get ephemeral token: {e!s}") + raise diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py 
b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py new file mode 100644 index 000000000000..95ff1ab3a6b8 --- /dev/null +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py @@ -0,0 +1,201 @@ +# Copyright (c) Microsoft. All rights reserved. + +import asyncio +import base64 +import json +import logging +import sys +from inspect import isawaitable +from typing import TYPE_CHECKING, Any, ClassVar, Literal + +from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_base import OpenAIRealtimeBase + +if sys.version_info >= (3, 12): + from typing import override # pragma: no cover +else: + from typing_extensions import override # pragma: no cover + +from openai.resources.beta.realtime.realtime import AsyncRealtimeConnection +from openai.types.beta.realtime.conversation_item_param import ConversationItemParam +from pydantic import Field + +from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents, SendEvents +from semantic_kernel.contents.audio_content import AudioContent +from semantic_kernel.contents.function_call_content import FunctionCallContent +from semantic_kernel.contents.function_result_content import FunctionResultContent +from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent +from semantic_kernel.contents.text_content import TextContent +from semantic_kernel.contents.utils.author_role import AuthorRole +from semantic_kernel.utils.experimental_decorator import experimental_class + +if TYPE_CHECKING: + from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings + from semantic_kernel.contents.chat_history import ChatHistory + +logger: logging.Logger = logging.getLogger(__name__) + +# region Websocket + + +@experimental_class +class OpenAIRealtimeWebsocketBase(OpenAIRealtimeBase): + """OpenAI Realtime service.""" + + protocol: 
ClassVar[Literal["websocket"]] = "websocket" + connection: AsyncRealtimeConnection | None = None + connected: asyncio.Event = Field(default_factory=asyncio.Event) + + @override + async def start_listening( + self, + settings: "PromptExecutionSettings | None" = None, + chat_history: "ChatHistory | None" = None, + create_response: bool = False, + **kwargs: Any, + ) -> None: + await self.connected.wait() + if not self.connection: + raise ValueError("Connection is not established.") + + if chat_history or settings or create_response: + await self.update_session(settings=settings, chat_history=chat_history, create_response=create_response) + + async for event in self.connection: + if event.type == ListenEvents.RESPONSE_AUDIO_DELTA.value: + if self.audio_output: + out = self.audio_output(event) + if isawaitable(out): + await out + try: + await self.receive_buffer.put(( + event.type, + StreamingChatMessageContent( + role=AuthorRole.ASSISTANT, + items=[ + AudioContent( + data=base64.b64decode(event.delta), + data_format="base64", + inner_content=event, + ) + ], # type: ignore + choice_index=event.content_index, + ), + )) + except Exception as e: + logger.error(f"Error processing remote audio frame: {e!s}") + else: + await self._handle_event(event) + + @override + async def start_sending(self, **kwargs: Any) -> None: + await self.connected.wait() + if not self.connection: + raise ValueError("Connection is not established.") + while True: + event, data = await self.send_buffer.get() + match event: + case SendEvents.SESSION_UPDATE: + if "settings" not in data: + logger.error("Event data does not contain 'settings'") + await self.connection.session.update(session=data["settings"].prepare_settings_dict()) + case SendEvents.INPUT_AUDIO_BUFFER_APPEND: + if "content" not in data: + logger.error("Event data does not contain 'content'") + return + await self.connection.input_audio_buffer.append(audio=data["content"].data.decode("utf-8")) + case SendEvents.INPUT_AUDIO_BUFFER_COMMIT: 
+ await self.connection.input_audio_buffer.commit() + case SendEvents.INPUT_AUDIO_BUFFER_CLEAR: + await self.connection.input_audio_buffer.clear() + case SendEvents.CONVERSATION_ITEM_CREATE: + if "item" not in data: + logger.error("Event data does not contain 'item'") + return + content = data["item"] + for item in content.items: + match item: + case TextContent(): + await self.connection.conversation.item.create( + item=ConversationItemParam( + type="message", + content=[ + { + "type": "input_text", + "text": item.text, + } + ], + role="user", + ) + ) + case FunctionCallContent(): + call_id = item.metadata.get("call_id") + if not call_id: + logger.error("Function call needs to have a call_id") + continue + await self.connection.conversation.item.create( + item=ConversationItemParam( + type="function_call", + name=item.name or item.function_name, + arguments="" + if not item.arguments + else item.arguments + if isinstance(item.arguments, str) + else json.dumps(item.arguments), + call_id=call_id, + ) + ) + case FunctionResultContent(): + call_id = item.metadata.get("call_id") + if not call_id: + logger.error("Function result needs to have a call_id") + continue + await self.connection.conversation.item.create( + item=ConversationItemParam( + type="function_call_output", + output=item.result, + call_id=call_id, + ) + ) + case SendEvents.CONVERSATION_ITEM_TRUNCATE: + if "item_id" not in data: + logger.error("Event data does not contain 'item_id'") + return + await self.connection.conversation.item.truncate( + item_id=data["item_id"], content_index=0, audio_end_ms=data.get("audio_end_ms", 0) + ) + case SendEvents.CONVERSATION_ITEM_DELETE: + if "item_id" not in data: + logger.error("Event data does not contain 'item_id'") + return + await self.connection.conversation.item.delete(item_id=data["item_id"]) + case SendEvents.RESPONSE_CREATE: + if "response" in data: + await self.connection.response.create(response=data["response"]) + else: + await 
self.connection.response.create() + case SendEvents.RESPONSE_CANCEL: + if "response_id" in data: + await self.connection.response.cancel(response_id=data["response_id"]) + else: + await self.connection.response.cancel() + + @override + async def create_session( + self, + settings: "PromptExecutionSettings | None" = None, + chat_history: "ChatHistory | None" = None, + **kwargs: Any, + ) -> None: + """Create a session in the service.""" + self.connection = await self.client.beta.realtime.connect(model=self.ai_model_id).enter() + self.connected.set() + if settings or chat_history or kwargs: + await self.update_session(settings=settings, chat_history=chat_history, **kwargs) + + @override + async def close_session(self) -> None: + """Close the session in the service.""" + if self.connected.is_set() and self.connection: + await self.connection.close() + self.connection = None + self.connected.clear() diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_utils.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/utils.py similarity index 100% rename from python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime_utils.py rename to python/semantic_kernel/connectors/ai/open_ai/services/realtime/utils.py diff --git a/python/semantic_kernel/connectors/ai/realtime_client_base.py b/python/semantic_kernel/connectors/ai/realtime_client_base.py index 991854987faa..5f6fa302d545 100644 --- a/python/semantic_kernel/connectors/ai/realtime_client_base.py +++ b/python/semantic_kernel/connectors/ai/realtime_client_base.py @@ -28,52 +28,47 @@ class RealtimeClientBase(AIServiceClientBase, ABC): """Base class for a realtime client.""" SUPPORTS_FUNCTION_CALLING: ClassVar[bool] = False - send_buffer: Queue[str | tuple[str, Any]] = Field(default_factory=Queue) + send_buffer: Queue[tuple[str, Any]] = Field(default_factory=Queue) receive_buffer: Queue[tuple[str, Any]] = Field(default_factory=Queue) - async def __aenter__(self) -> 
"RealtimeClientBase": - """Enter the context manager. + async def send(self, event: str, **kwargs: Any) -> None: + """Send an event to the service. - Default implementation calls the create session method. + Args: + event: The event to send. + kwargs: Additional arguments. """ - await self.create_session() - return self - - async def __aexit__(self, exc_type, exc_val, exc_tb) -> None: - """Exit the context manager.""" - await self.close_session() + await self.send_buffer.put((event, kwargs)) - @abstractmethod - async def close_session(self) -> None: - """Close the session in the service.""" - pass - - @abstractmethod - async def create_session( + async def start_streaming( self, settings: "PromptExecutionSettings | None" = None, chat_history: "ChatHistory | None" = None, **kwargs: Any, ) -> None: - """Create a session in the service. + """Start streaming, will start both listening and sending. + + This method, start tasks for both listening and sending. + + The arguments are passed to the start_listening method. Args: settings: Prompt execution settings. chat_history: Chat history. kwargs: Additional arguments. """ - raise NotImplementedError + async with TaskGroup() as tg: + tg.create_task(self.start_listening(settings=settings, chat_history=chat_history, **kwargs)) + tg.create_task(self.start_sending(**kwargs)) @abstractmethod - async def update_session( + async def start_listening( self, settings: "PromptExecutionSettings | None" = None, chat_history: "ChatHistory | None" = None, **kwargs: Any, ) -> None: - """Update a session in the service. - - Can be used when using the context manager instead of calling create_session with these same arguments. + """Starts listening for messages from the service, adds them to the output_buffer. Args: settings: Prompt execution settings. 
@@ -82,35 +77,39 @@ async def update_session( """ raise NotImplementedError - async def start_streaming( + @abstractmethod + async def start_sending( + self, + ) -> None: + """Start sending items from the input_buffer to the service.""" + raise NotImplementedError + + @abstractmethod + async def create_session( self, settings: "PromptExecutionSettings | None" = None, chat_history: "ChatHistory | None" = None, **kwargs: Any, ) -> None: - """Start streaming, will start both listening and sending. - - This method, start tasks for both listening and sending. - - The arguments are passed to the start_listening method. + """Create a session in the service. Args: settings: Prompt execution settings. chat_history: Chat history. kwargs: Additional arguments. """ - async with TaskGroup() as tg: - tg.create_task(self.start_listening(settings=settings, chat_history=chat_history, **kwargs)) - tg.create_task(self.start_sending(**kwargs)) + raise NotImplementedError @abstractmethod - async def start_listening( + async def update_session( self, settings: "PromptExecutionSettings | None" = None, chat_history: "ChatHistory | None" = None, **kwargs: Any, ) -> None: - """Starts listening for messages from the service, adds them to the output_buffer. + """Update a session in the service. + + Can be used when using the context manager instead of calling create_session with these same arguments. Args: settings: Prompt execution settings. @@ -120,11 +119,9 @@ async def start_listening( raise NotImplementedError @abstractmethod - async def start_sending( - self, - ) -> None: - """Start sending items from the input_buffer to the service.""" - raise NotImplementedError + async def close_session(self) -> None: + """Close the session in the service.""" + pass def _update_function_choice_settings_callback( self, @@ -135,3 +132,15 @@ def _update_function_choice_settings_callback( update the settings from a function call configuration. 
""" return lambda configuration, settings, choice_type: None + + async def __aenter__(self) -> "RealtimeClientBase": + """Enter the context manager. + + Default implementation calls the create session method. + """ + await self.create_session() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb) -> None: + """Exit the context manager.""" + await self.close_session() From a6d317d538c88c69356878169adddfb778ce68f6 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Mon, 20 Jan 2025 16:49:51 +0100 Subject: [PATCH 16/50] fix import --- .../connectors/ai/open_ai/services/open_ai_realtime.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py index 9ba373cce6ff..076c46396ed7 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py @@ -1,8 +1,7 @@ # Copyright (c) Microsoft. All rights reserved. 
-from ast import TypeVar from collections.abc import Mapping -from typing import Any, ClassVar, Literal +from typing import Any, ClassVar, Literal, TypeVar from openai import AsyncOpenAI from pydantic import ValidationError From b8ff264e4e14e862167a9827eebc835b11aac8c8 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Tue, 21 Jan 2025 15:29:21 +0100 Subject: [PATCH 17/50] small optimization in code --- .../audio/04-chat_with_realtime_api.py | 43 ++--- .../ai/open_ai/services/open_ai_realtime.py | 12 +- .../realtime/open_ai_realtime_base.py | 5 +- .../realtime/open_ai_realtime_webrtc.py | 11 +- .../realtime/open_ai_realtime_websocket.py | 9 +- .../connectors/ai/utils/__init__.py | 0 .../ai/{ => utils}/realtime_helpers.py | 148 +++++++++++------- 7 files changed, 130 insertions(+), 98 deletions(-) create mode 100644 python/semantic_kernel/connectors/ai/utils/__init__.py rename python/semantic_kernel/connectors/ai/{ => utils}/realtime_helpers.py (53%) diff --git a/python/samples/concepts/audio/04-chat_with_realtime_api.py b/python/samples/concepts/audio/04-chat_with_realtime_api.py index af4024e12849..6259d06f7061 100644 --- a/python/samples/concepts/audio/04-chat_with_realtime_api.py +++ b/python/samples/concepts/audio/04-chat_with_realtime_api.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft. All rights reserved. 
import asyncio import logging -import signal +from datetime import datetime from random import randint import sounddevice as sd @@ -15,7 +15,7 @@ ) from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_websocket import ListenEvents from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase -from semantic_kernel.connectors.ai.realtime_helpers import SKSimplePlayer +from semantic_kernel.connectors.ai.utils.realtime_helpers import SKAudioPlayer from semantic_kernel.contents import ChatHistory from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent from semantic_kernel.functions import kernel_function @@ -61,7 +61,7 @@ class ReceivingStreamHandler: It can also be used to act on other events from the service. """ - def __init__(self, realtime_client: RealtimeClientBase, audio_player: SKSimplePlayer | None = None): + def __init__(self, realtime_client: RealtimeClientBase, audio_player: SKAudioPlayer | None = None): self.audio_player = audio_player self.realtime_client = realtime_client @@ -92,12 +92,6 @@ async def listen( print("\nThanks for talking to Mosscap!") -# this function is used to stop the processes when ctrl + c is pressed -def signal_handler(): - for task in asyncio.all_tasks(): - task.cancel() - - weather_conditions = ["sunny", "hot", "cloudy", "raining", "freezing", "snowing"] @@ -109,20 +103,26 @@ def get_weather(location: str) -> str: return f"The weather in {location} is {weather}." -async def main() -> None: - # setup the asyncio loop with the signal event handler - loop = asyncio.get_event_loop() - loop.add_signal_handler(signal.SIGINT, signal_handler) +@kernel_function +def get_date_time() -> str: + """Get the current date and time.""" + return f"The current date and time is {datetime.now().isoformat()}." + +async def main() -> None: # create the Kernel and add a simple function for function calling. 
kernel = Kernel() kernel.add_function(plugin_name="weather", function_name="get_weather", function=get_weather) + kernel.add_function(plugin_name="time", function_name="get_date_time", function=get_date_time) # create the realtime client and optionally add the audio output function, this is optional - audio_player = SKSimplePlayer() - realtime_client = OpenAIRealtime(protocol="webrtc", audio_output=audio_player.realtime_client_callback) + audio_player = SKAudioPlayer() + # you can define the protocol to use, either "websocket" or "webrtc" + # they will behave the same way, even though the underlying protocol is quite different + realtime_client = OpenAIRealtime(protocol="webrtc", audio_output_callback=audio_player.client_callback) - # create stream receiver, this can play the audio, if the audio_player is passed + # create stream receiver (defined above), this can play the audio, + # if the audio_player is passed (commented out here) # and allows you to print the transcript of the conversation # and review or act on other events from the service stream_handler = ReceivingStreamHandler(realtime_client) # SimplePlayer(device_id=None) @@ -148,7 +148,7 @@ async def main() -> None: settings = OpenAIRealtimeExecutionSettings( instructions=instructions, - voice="sage", + voice="alloy", turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8), function_choice_behavior=FunctionChoiceBehavior.Auto(), ) @@ -157,11 +157,12 @@ async def main() -> None: await realtime_client.update_session( settings=settings, chat_history=chat_history, kernel=kernel, create_response=True ) - # you can also send other events to the service, like this - # await realtime_client.send_buffer.put(( + # you can also send other events to the service, like this (the first has content, the second does not) + # await realtime_client.send( # SendEvents.CONVERSATION_ITEM_CREATE, - # {"item": ChatMessageContent(role="user", content="Hi there, who are 
you?")}, - # )) + # item=ChatMessageContent(role="user", content="Hi there, who are you?"), + # ) + # await realtime_client.send(SendEvents.RESPONSE_CREATE) async with asyncio.TaskGroup() as tg: tg.create_task(realtime_client.start_streaming()) tg.create_task(stream_handler.listen()) diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py index 076c46396ed7..1a7c5acc330d 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py @@ -1,8 +1,9 @@ # Copyright (c) Microsoft. All rights reserved. -from collections.abc import Mapping +from collections.abc import Callable, Coroutine, Mapping from typing import Any, ClassVar, Literal, TypeVar +from numpy import ndarray from openai import AsyncOpenAI from pydantic import ValidationError @@ -31,6 +32,7 @@ def __new__(cls: type["_T"], *args: Any, **kwargs: Any) -> "_T": def __init__( self, protocol: Literal["websocket", "webrtc"] = "websocket", + audio_output_callback: Callable[[ndarray], Coroutine[Any, Any, None]] | None = None, ai_model_id: str | None = None, api_key: str | None = None, org_id: str | None = None, @@ -45,6 +47,13 @@ def __init__( Args: protocol: The protocol to use, can be either "websocket" or "webrtc". + audio_output_callback: The audio output callback, optional. + This should be a coroutine, that takes a ndarray with audio as input. + The goal of this function is to allow you to play the audio with the + least amount of latency possible. + It is called first in both websockets and webrtc. + Even when passed, the audio content will still be + added to the receiving queue. ai_model_id (str | None): OpenAI model name, see https://platform.openai.com/docs/models service_id (str | None): Service ID tied to the execution settings.
@@ -74,6 +83,7 @@ def __init__( raise ServiceInitializationError("The OpenAI text model ID is required.") super().__init__( protocol=protocol, + audio_output_callback=audio_output_callback, ai_model_id=openai_settings.realtime_model_id, service_id=service_id, api_key=openai_settings.api_key.get_secret_value() if openai_settings.api_key else None, diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py index 9f72ee1fd5d1..2865138cf0bb 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py @@ -10,6 +10,7 @@ else: from typing_extensions import override # pragma: no cover +from numpy import ndarray from openai.types.beta.realtime.realtime_server_event import RealtimeServerEvent from openai.types.beta.realtime.response_function_call_arguments_done_event import ( ResponseFunctionCallArgumentsDoneEvent, @@ -49,7 +50,7 @@ class OpenAIRealtimeBase(OpenAIHandler, RealtimeClientBase): protocol: ClassVar[Literal["websocket", "webrtc"]] = "websocket" SUPPORTS_FUNCTION_CALLING: ClassVar[bool] = True - audio_output: Callable[[Any], Coroutine[Any, Any, None] | None] | None = None + audio_output_callback: Callable[[ndarray], Coroutine[Any, Any, None]] | None = None kernel: Kernel | None = None _current_settings: PromptExecutionSettings | None = PrivateAttr(None) @@ -60,8 +61,6 @@ async def _handle_event(self, event: RealtimeServerEvent) -> None: Audio delta has to be handled by the implementation of the protocol as some protocols have different ways of handling audio. 
- - """ match event.type: case ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA.value: diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py index 583e34bfd997..1cfd68db0aaa 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py @@ -5,8 +5,6 @@ import json import logging import sys -from collections.abc import Callable, Coroutine -from inspect import isawaitable from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_base import OpenAIRealtimeBase @@ -55,7 +53,6 @@ class OpenAIRealtimeWebRTCBase(OpenAIRealtimeBase): protocol: ClassVar[Literal["webrtc"]] = "webrtc" peer_connection: RTCPeerConnection | None = None data_channel: RTCDataChannel | None = None - audio_output: Callable[[AudioFrame], Coroutine[Any, Any, None] | None] | None = None # region public methods @@ -168,7 +165,7 @@ async def create_session( ) -> None: """Create a session in the service.""" if not audio_track: - from semantic_kernel.connectors.ai.realtime_helpers import SKAudioTrack + from semantic_kernel.connectors.ai.utils.realtime_helpers import SKAudioTrack audio_track = SKAudioTrack() @@ -245,10 +242,8 @@ async def _on_track(self, track: "MediaStreamTrack") -> None: break try: - if self.audio_output: - out = self.audio_output(frame) - if isawaitable(out): - await out + if self.audio_output_callback: + await self.audio_output_callback(frame.to_ndarray()) except Exception as e: logger.error(f"Error playing remote audio frame: {e!s}") diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py index 
95ff1ab3a6b8..85048a4bfaef 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py @@ -5,9 +5,10 @@ import json import logging import sys -from inspect import isawaitable from typing import TYPE_CHECKING, Any, ClassVar, Literal +import numpy as np + from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_base import OpenAIRealtimeBase if sys.version_info >= (3, 12): @@ -62,10 +63,8 @@ async def start_listening( async for event in self.connection: if event.type == ListenEvents.RESPONSE_AUDIO_DELTA.value: - if self.audio_output: - out = self.audio_output(event) - if isawaitable(out): - await out + if self.audio_output_callback: + await self.audio_output_callback(np.frombuffer(base64.b64decode(event.delta), dtype=np.int16)) try: await self.receive_buffer.put(( event.type, diff --git a/python/semantic_kernel/connectors/ai/utils/__init__.py b/python/semantic_kernel/connectors/ai/utils/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/python/semantic_kernel/connectors/ai/realtime_helpers.py b/python/semantic_kernel/connectors/ai/utils/realtime_helpers.py similarity index 53% rename from python/semantic_kernel/connectors/ai/realtime_helpers.py rename to python/semantic_kernel/connectors/ai/utils/realtime_helpers.py index b89988f90ab3..dd6d0e5fe16f 100644 --- a/python/semantic_kernel/connectors/ai/realtime_helpers.py +++ b/python/semantic_kernel/connectors/ai/utils/realtime_helpers.py @@ -5,6 +5,7 @@ from typing import Any, Final import numpy as np +import numpy.typing as npt from aiortc.mediastreams import MediaStreamError, MediaStreamTrack from av.audio.frame import AudioFrame from av.frame import Frame @@ -20,25 +21,25 @@ TRACK_CHANNELS: Final[int] = 1 PLAYER_CHANNELS: Final[int] = 2 FRAME_DURATION: Final[int] = 20 -DTYPE: Final[np.dtype] = np.int16 +DTYPE: 
Final[npt.DTypeLike] = np.int16 class SKAudioTrack(KernelBaseModel, MediaStreamTrack): - """A simple class using sounddevice to record audio from the default input device. + """A simple class that implements the WebRTC MediaStreamTrack for audio from sounddevice. - And implementing the MediaStreamTrack interface for use with aiortc. + Make sure the device_id is set to the correct device for your system. """ kind: str = "audio" sample_rate: int = SAMPLE_RATE channels: int = TRACK_CHANNELS frame_duration: int = FRAME_DURATION - dtype: np.dtype = DTYPE + dtype: npt.DTypeLike = DTYPE device: str | int | None = None queue: asyncio.Queue[Frame] = Field(default_factory=asyncio.Queue) is_recording: bool = False - stream: InputStream | None = None frame_size: int = 0 + _stream: InputStream | None = None _recording_task: asyncio.Task | None = None _loop: asyncio.AbstractEventLoop | None = None _pts: int = 0 # Add this to track the pts @@ -62,11 +63,36 @@ async def recv(self) -> Frame: self._recording_task = asyncio.create_task(self.start_recording()) try: - return await self.queue.get() + frame = await self.queue.get() + self.queue.task_done() + return frame except Exception as e: logger.error(f"Error receiving audio frame: {e!s}") raise MediaStreamError("Failed to receive audio frame") + def _sounddevice_callback(self, indata: np.ndarray, frames: int, time: Any, status: Any) -> None: + if status: + logger.warning(f"Audio input status: {status}") + if self._loop and self._loop.is_running(): + asyncio.run_coroutine_threadsafe(self.queue.put(self._create_frame(indata)), self._loop) + + def _create_frame(self, indata: np.ndarray) -> Frame: + audio_data = indata.copy() + if audio_data.dtype != self.dtype: + audio_data = ( + (audio_data * 32767).astype(self.dtype) if self.dtype == np.int16 else audio_data.astype(self.dtype) + ) + frame = AudioFrame( + format="s16", + layout="mono", + samples=len(audio_data), + ) + frame.rate = self.sample_rate + frame.pts = self._pts + 
frame.planes[0].update(audio_data.tobytes()) + self._pts += len(audio_data) + return frame + async def start_recording(self): """Start recording audio from the input device.""" if self.is_recording: @@ -77,39 +103,15 @@ async def start_recording(self): self._pts = 0 # Reset pts when starting recording try: - - def callback(indata: np.ndarray, frames: int, time: Any, status: Any) -> None: - if status: - logger.warning(f"Audio input status: {status}") - - audio_data = indata.copy() - if audio_data.dtype != self.dtype: - if self.dtype == np.int16: - audio_data = (audio_data * 32767).astype(self.dtype) - else: - audio_data = audio_data.astype(self.dtype) - - frame = AudioFrame( - format="s16", - layout="mono", - samples=len(audio_data), - ) - frame.rate = self.sample_rate - frame.pts = self._pts - frame.planes[0].update(audio_data.tobytes()) - self._pts += len(audio_data) - if self._loop and self._loop.is_running(): - asyncio.run_coroutine_threadsafe(self.queue.put(frame), self._loop) - - self.stream = InputStream( + self._stream = InputStream( device=self.device, channels=self.channels, samplerate=self.sample_rate, dtype=self.dtype, blocksize=self.frame_size, - callback=callback, + callback=self._sounddevice_callback, ) - self.stream.start() + self._stream.start() while self.is_recording: await asyncio.sleep(0.1) @@ -121,7 +123,7 @@ def callback(indata: np.ndarray, frames: int, time: Any, status: Any) -> None: self.is_recording = False -class SKSimplePlayer(KernelBaseModel): +class SKAudioPlayer(KernelBaseModel): """Simple class that plays audio using sounddevice. Make sure the device_id is set to the correct device for your system. 
@@ -132,22 +134,12 @@ class SKSimplePlayer(KernelBaseModel): device_id: int | None = None sample_rate: int = SAMPLE_RATE + dtype: npt.DTypeLike = DTYPE channels: int = PLAYER_CHANNELS frame_duration_ms: int = FRAME_DURATION - queue: asyncio.Queue[np.ndarray] = Field(default_factory=asyncio.Queue) + _queue: asyncio.Queue[np.ndarray] | None = None _stream: OutputStream | None = PrivateAttr(None) - def model_post_init(self, __context: Any) -> None: - """Initialize the audio stream.""" - self._stream = OutputStream( - callback=self.callback, - samplerate=self.sample_rate, - channels=self.channels, - dtype=np.int16, - blocksize=int(self.sample_rate * self.frame_duration_ms / 1000), - device=self.device_id, - ) - async def __aenter__(self): """Start the audio stream when entering a context.""" self.start() @@ -159,32 +151,68 @@ async def __aexit__(self, exc_type, exc, tb): def start(self): """Start the audio stream.""" - if self._stream: + self._queue = asyncio.Queue() + self._stream = OutputStream( + callback=self._sounddevice_callback, + samplerate=self.sample_rate, + channels=self.channels, + dtype=self.dtype, + blocksize=int(self.sample_rate * self.frame_duration_ms / 1000), + device=self.device_id, + ) + if self._stream and self._queue: self._stream.start() def stop(self): """Stop the audio stream.""" if self._stream: self._stream.stop() + self._stream = None + self._queue = None - def callback(self, outdata, frames, time, status): + def _sounddevice_callback(self, outdata, frames, time, status): """This callback is called by sounddevice when it needs more audio data to play.""" if status: logger.info(f"Audio output status: {status}") - if self.queue.empty(): - return - data: np.ndarray = self.queue.get_nowait() - outdata[:] = data.reshape(outdata.shape) - - async def realtime_client_callback(self, frame: AudioFrame): - """This function is used by the RealtimeClientBase to play audio.""" - await self.queue.put(frame.to_ndarray()) + if self._queue: + if 
self._queue.empty(): + return + data: np.ndarray = self._queue.get_nowait() + outdata[:] = data.reshape(outdata.shape) + self._queue.task_done() + + async def client_callback(self, content: np.ndarray): + """This function can be passed to the audio_output_callback field of the RealtimeClientBase.""" + if self._queue: + await self._queue.put(content) + else: + logger.error( + "Audio queue not initialized, make sure to call start before " + "using the player, or use the context manager." + ) - async def add_audio(self, audio_content: AudioContent): + async def add_audio(self, audio_content: AudioContent) -> None: """This function is used to add audio to the queue for playing. - It uses a shortcut for this sample, because we know a AudioFrame is in the inner_content field. + It first checks if there is a AudioFrame in the inner_content of the AudioContent. + If not, it checks if the data is a numpy array, bytes, or a string and converts it to a numpy array. """ + if not self._queue: + logger.error( + "Audio queue not initialized, make sure to call start before " + "using the player, or use the context manager." 
+ ) + return if audio_content.inner_content and isinstance(audio_content.inner_content, AudioFrame): - await self.queue.put(audio_content.inner_content.to_ndarray()) - # TODO (eavanvalkenburg): check ndarray + await self._queue.put(audio_content.inner_content.to_ndarray()) + return + if isinstance(audio_content.data, np.ndarray): + await self._queue.put(audio_content.data) + return + if isinstance(audio_content.data, bytes): + await self._queue.put(np.frombuffer(audio_content.data, dtype=self.dtype)) + return + if isinstance(audio_content.data, str): + await self._queue.put(np.frombuffer(audio_content.data.encode(), dtype=self.dtype)) + return + logger.error(f"Unknown audio content: {audio_content}") From 89f1988f9e63fb197e411c53c4abad509a9a3258 Mon Sep 17 00:00:00 2001 From: Eduard van Valkenburg Date: Wed, 22 Jan 2025 13:09:30 +0100 Subject: [PATCH 18/50] updates to the ADR --- docs/decisions/00XX-realtime-api-clients.md | 49 ++++++++++----------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/docs/decisions/00XX-realtime-api-clients.md b/docs/decisions/00XX-realtime-api-clients.md index 6fcf0972aea2..a51744d9d400 100644 --- a/docs/decisions/00XX-realtime-api-clients.md +++ b/docs/decisions/00XX-realtime-api-clients.md @@ -12,25 +12,26 @@ informed: ## Context and Problem Statement -Multiple model providers are starting to enable realtime voice-to-voice communication with their models, this includes OpenAI with their [Realtime API](https://openai.com/index/introducing-the-realtime-api/) and [Google Gemini](https://ai.google.dev/api/multimodal-live). These API's promise some very interesting new ways of using LLM's in different settings, which we want to enable with Semantic Kernel. The key addition that Semantic Kernel brings into this system is the ability to (re)use Semantic Kernel function as tools with these API's. 
There are also options for Google to use video and images as input, so really it is multimodal, but for now we are focusing on the voice-to-voice part, while keeping in mind that video is coming. +Multiple model providers are starting to enable realtime voice-to-voice or even multi-modal-to-voice communication with their models, this includes OpenAI with their [Realtime API](https://openai.com/index/introducing-the-realtime-api/) and [Google Gemini](https://ai.google.dev/api/multimodal-live). These API's promise some very interesting new ways of using LLM's in different settings, which we want to enable with Semantic Kernel. + The key feature that Semantic Kernel brings into this system is the ability to (re)use Semantic Kernel functions as tools with these API's. There are also options for Google to use video and images as input, but for now we are focusing on the voice-to-voice part, while keeping in mind that video is coming. -The way these API's work at this time is through either Websockets or WebRTC. +The protocols that these API's use at this time are Websockets and WebRTC. In both cases there are events being sent to and from the service, some events contain content, text, audio, or video (so far only sending, not receiving), while some events are "control" events, like content created, function call requested, etc. Sending events include, sending content, either voice, text or function call output, or events, like committing the input audio and requesting a response. ### Websocket -Websocket has been around for a while and is a well known technology, it is a full-duplex communication protocol over a single, long-lived connection. It is used for sending and receiving messages between client and server in real-time. Each event can contain a message, which might contain a content item, or a control event. +Websocket has been around for a while and is a well known technology, it is a full-duplex communication protocol over a single, long-lived connection.
It is used for sending and receiving messages between client and server in real-time. Each event can contain a message, which might contain a content item, or a control event. Audio is sent as a base64 encoded string. ### WebRTC -WebRTC is a Mozilla project that provides web browsers and mobile applications with real-time communication via simple application programming interfaces (APIs). It allows audio and video communication to work inside web pages by allowing direct peer-to-peer communication, eliminating the need to install plugins or download native apps. It is used for sending and receiving audio and video streams, and can be used for sending messages as well. The big difference compared to websockets is that it does explicitly create a channel for audio and video, and a separate channel for "data", which are events but also things like Function calls. +WebRTC is a Mozilla project that provides web browsers and mobile applications with real-time communication via simple application programming interfaces (APIs). It allows audio and video communication to work inside web pages and other applications by allowing direct peer-to-peer communication, eliminating the need to install plugins or download native apps. It is used for sending and receiving audio and video streams, and can be used for sending (data-)messages as well. The big difference compared to websockets is that it explicitly creates a channel for audio and video, and a separate channel for "data", which are events but in this space also things like Function calls. Both the OpenAI and Google realtime api's are in preview/beta, this means there might be breaking changes in the way they work coming in the future, therefore the clients built to support these API's are going to be experimental until the API's stabilize.
One feature that we need to consider if and how to deal with is whether or not a service uses Voice Activated Detection, OpenAI supports turning that off and allows parameters for how it behaves, while Google has it on by default and it cannot be configured. -### Event types (websocket and partially webrtc) +### Event types (Websocket and partially WebRTC) -Client side events: +#### Client side events: | **Content/Control event** | **Event Description** | **OpenAI Event** | **Google Event** | | ------------------------- | --------------------------------- | ---------------------------- | ---------------------------------- | | Control | Configure session | `session.update` | `BidiGenerateContentSetup` | @@ -44,7 +45,7 @@ Client side events: | Control | Ask for response | `response.create` | `-` | | Control | Cancel response | `response.cancel` | `-` | -Server side events: +#### Server side events: | **Content/Control event** | **Event Description** | **OpenAI Event** | **Google Event** | | ------------------------- | -------------------------------------- | ------------------------------------------------------- | ----------------------------------------- | | Control | Error | `error` | `-` | @@ -78,13 +79,10 @@ Server side events: | Control | Rate limits updated | `rate_limits.updated` | `-` | -## Decision Drivers -- Simple programming model that is likely able to handle future realtime api's and evolution of the existing ones. +## Overall Decision Drivers +- Simple programming model that is likely able to handle future realtime api's and the evolution of the existing ones. - Whenever possible we transform incoming content into Semantic Kernel content, but surface everything, so it's extensible -- Protocol agnostic, should be able to use different types of protocols under the covers, like websocket and WebRTC, without changing the client code (unless the protocol requires it). 
- -## Decision driver questions -- For WebRTC, a audio device can be passed, should this be a requirement for the client also for websockets? +- Protocol agnostic, should be able to use different types of protocols under the covers, like websocket and WebRTC, without changing the client code (unless the protocol requires it), there will be slight differences in behavior depending on the protocol. There are multiple areas where we need to make decisions, these are: - Content and Events @@ -94,7 +92,7 @@ There are multiple areas where we need to make decisions, these are: # Content and Events ## Considered Options - Content and Events -Both the sending and receiving side of these integrations need to decide how to deal with the api's. +Both the sending and receiving side of these integrations need to decide how to deal with the events. 1. Treat content events separate from control events 1. Treat everything as content items @@ -163,6 +161,16 @@ This would mean that the there are two queues, one for sending and one for recei - Con: - potentially causes audio delays because of the queueing mechanism +### 2b. Same as option 2, but with priority handling of audio content +This would mean that the audio content is handled, and passed to the developer code, and then all other events are processed. + +- Pro: + - mitigates audio delays + - easy to understand, as queues are a well known concept + - developers can just skip events they are not interested in +- Con: + - Two separate mechanisms used for audio content and events + ## Decision Outcome - Programming model Chosen option: ... @@ -172,7 +180,7 @@ Chosen option: ... ## Considered Options - Audio speaker/microphone handling 1. Create abstraction in SK for audio handlers, that can be passed into the realtime client to record and play audio -2. Send and receive AudioContent (wrapped in StreamingChatMessageContent) to the client, and let the client handle the audio recording and playing +2. 
Send and receive AudioContent (potentially wrapped in StreamingChatMessageContent) to the client, and let the client handle the audio recording and playing ### 1. Create abstraction in SK for audio handlers, that can be passed into the realtime client to record and play audio This would mean that the client would have a mechanism to register audio handlers, and the integration would call these handlers when audio is received or needs to be sent. A additional abstraction for this would have to be created in Semantic Kernel (or potentially taken from a standard). @@ -191,17 +199,8 @@ This would mean that the client would receive AudioContent items, and would have - no extra code in SK that needs to be maintained - Con: - extra burden on the developer to deal with the audio + - harder to get started with ## Decision Outcome - Audio speaker/microphone handling Chosen option: ... - - - -## More Information - -{You might want to provide additional evidence/confidence for the decision outcome here and/or -document the team agreement on the decision and/or -define when this decision when and how the decision should be realized and if/when it should be re-visited and/or -how the decision is validated. 
-Links to other decisions and resources might appear here as well.} From ee1ce022c3c28e363eac8823359e37d38e9de7ff Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Thu, 23 Jan 2025 11:06:40 +0100 Subject: [PATCH 19/50] import improvements --- .../audio/04-chat_with_realtime_api.py | 44 ++++++++++--------- .../connectors/ai/open_ai/__init__.py | 3 ++ .../connectors/ai/utils/__init__.py | 5 +++ 3 files changed, 31 insertions(+), 21 deletions(-) diff --git a/python/samples/concepts/audio/04-chat_with_realtime_api.py b/python/samples/concepts/audio/04-chat_with_realtime_api.py index 6259d06f7061..a1884349b3e7 100644 --- a/python/samples/concepts/audio/04-chat_with_realtime_api.py +++ b/python/samples/concepts/audio/04-chat_with_realtime_api.py @@ -1,23 +1,21 @@ # Copyright (c) Microsoft. All rights reserved. + import asyncio import logging from datetime import datetime from random import randint -import sounddevice as sd - from semantic_kernel import Kernel from semantic_kernel.connectors.ai import FunctionChoiceBehavior from semantic_kernel.connectors.ai.open_ai import ( + ListenEvents, OpenAIRealtime, OpenAIRealtimeExecutionSettings, TurnDetection, ) -from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_websocket import ListenEvents from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase -from semantic_kernel.connectors.ai.utils.realtime_helpers import SKAudioPlayer -from semantic_kernel.contents import ChatHistory -from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent +from semantic_kernel.connectors.ai.utils import SKAudioPlayer +from semantic_kernel.contents import ChatHistory, StreamingChatMessageContent from semantic_kernel.functions import kernel_function logging.basicConfig(level=logging.WARNING) @@ -47,7 +45,9 @@ def check_audio_devices(): - logger.info(sd.query_devices()) + import sounddevice as sd + + logger.debug(sd.query_devices()) check_audio_devices() @@ 
-87,25 +87,26 @@ async def listen( case ListenEvents.RESPONSE_CREATED: if print_transcript: print("") + # case ....: + # # add other event handling here await asyncio.sleep(0.01) except asyncio.CancelledError: print("\nThanks for talking to Mosscap!") -weather_conditions = ["sunny", "hot", "cloudy", "raining", "freezing", "snowing"] - - @kernel_function def get_weather(location: str) -> str: """Get the weather for a location.""" - weather = weather_conditions[randint(0, len(weather_conditions))] # nosec - logger.warning(f"Getting weather for {location}: {weather}") + weather_conditions = ("sunny", "hot", "cloudy", "raining", "freezing", "snowing") + weather = weather_conditions[randint(0, len(weather_conditions) - 1)] # nosec + logger.info(f"Getting weather for {location}: {weather}") return f"The weather in {location} is {weather}." @kernel_function def get_date_time() -> str: """Get the current date and time.""" + logger.info("Getting current datetime") return f"The current date and time is {datetime.now().isoformat()}." @@ -128,10 +129,6 @@ async def main() -> None: stream_handler = ReceivingStreamHandler(realtime_client) # SimplePlayer(device_id=None) # Create the settings for the session - # the key thing to decide on is to enable the server_vad turn detection - # if turn is turned off (by setting turn_detection=None), you will have to send - # the "input_audio_buffer.commit" and "response.create" event to the realtime api - # to signal the end of the user's turn and start the response. # The realtime api, does not use a system message, but takes instructions as a parameter for a session instructions = """ You are a chat bot. Your name is Mosscap and @@ -141,17 +138,22 @@ async def main() -> None: effectively, but you tend to answer with long flowery prose. 
""" - # and we can add a chat history to conversation after starting it - chat_history = ChatHistory() - chat_history.add_user_message("Hi there, who are you?") - chat_history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.") - + # the key thing to decide on is to enable the server_vad turn detection + # if turn is turned off (by setting turn_detection=None), you will have to send + # the "input_audio_buffer.commit" and "response.create" event to the realtime api + # to signal the end of the user's turn and start the response. + # manual VAD is not part of this sample settings = OpenAIRealtimeExecutionSettings( instructions=instructions, voice="alloy", turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8), function_choice_behavior=FunctionChoiceBehavior.Auto(), ) + # and we can add a chat history to conversation after starting it + chat_history = ChatHistory() + chat_history.add_user_message("Hi there, who are you?") + chat_history.add_assistant_message("I am Mosscap, a chat bot. 
I'm trying to figure out what people need.") + # the context manager calls the create_session method on the client and start listening to the audio stream async with realtime_client, audio_player: await realtime_client.update_session( diff --git a/python/semantic_kernel/connectors/ai/open_ai/__init__.py b/python/semantic_kernel/connectors/ai/open_ai/__init__.py index 27d36ea30d34..4241ec1e49f3 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/__init__.py +++ b/python/semantic_kernel/connectors/ai/open_ai/__init__.py @@ -45,6 +45,7 @@ from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_embedding import OpenAITextEmbedding from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_to_audio import OpenAITextToAudio from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_to_image import OpenAITextToImage +from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents, SendEvents from semantic_kernel.connectors.ai.open_ai.settings.azure_open_ai_settings import AzureOpenAISettings from semantic_kernel.connectors.ai.open_ai.settings.open_ai_settings import OpenAISettings @@ -68,6 +69,7 @@ "DataSourceFieldsMapping", "DataSourceFieldsMapping", "ExtraBody", + "ListenEvents", "OpenAIAudioToText", "OpenAIAudioToTextExecutionSettings", "OpenAIChatCompletion", @@ -84,5 +86,6 @@ "OpenAITextToAudioExecutionSettings", "OpenAITextToImage", "OpenAITextToImageExecutionSettings", + "SendEvents", "TurnDetection", ] diff --git a/python/semantic_kernel/connectors/ai/utils/__init__.py b/python/semantic_kernel/connectors/ai/utils/__init__.py index e69de29bb2d1..2cd59106a8a0 100644 --- a/python/semantic_kernel/connectors/ai/utils/__init__.py +++ b/python/semantic_kernel/connectors/ai/utils/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Microsoft. All rights reserved. 
+ +from semantic_kernel.connectors.ai.utils.realtime_helpers import SKAudioPlayer, SKAudioTrack + +__all__ = ["SKAudioPlayer", "SKAudioTrack"] From da370c3212a3c72276ca615e864b4296fc17003f Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Tue, 28 Jan 2025 16:02:31 +0100 Subject: [PATCH 20/50] updated code and ADR --- docs/decisions/00XX-realtime-api-clients.md | 41 ++++++++- .../audio/04-chat_with_realtime_api_simple.py | 88 ++++++++++++++++++ ...y => 05-chat_with_realtime_api_complex.py} | 73 +++++---------- .../realtime/open_ai_realtime_base.py | 92 +++++++++++-------- .../realtime/open_ai_realtime_webrtc.py | 62 +++++++------ .../realtime/open_ai_realtime_websocket.py | 45 ++++----- .../connectors/ai/realtime_client_base.py | 19 ++-- .../contents/realtime_event.py | 56 +++++++++++ 8 files changed, 323 insertions(+), 153 deletions(-) create mode 100644 python/samples/concepts/audio/04-chat_with_realtime_api_simple.py rename python/samples/concepts/audio/{04-chat_with_realtime_api.py => 05-chat_with_realtime_api_complex.py} (68%) create mode 100644 python/semantic_kernel/contents/realtime_event.py diff --git a/docs/decisions/00XX-realtime-api-clients.md b/docs/decisions/00XX-realtime-api-clients.md index a51744d9d400..94c7351ba7fe 100644 --- a/docs/decisions/00XX-realtime-api-clients.md +++ b/docs/decisions/00XX-realtime-api-clients.md @@ -128,7 +128,30 @@ This would mean that all events are retained and returned to the developer as is ## Decision Outcome - Content and Events -Chosen option: ... +Chosen option: 3 Treat Everything as Events + +This option was chosen to allow abstraction away from the raw events, while still allowing the developer to access the raw events if needed. This allows for a simple programming model, while still allowing for complex interactions. 
+A set of events is defined for the basic types, like 'audio', 'text', 'function_call' and 'function_result'; each of these has two other fields: service_event, which is filled with the event type from the service, and a field for the actual content, with a name that makes sense: + +```python +AudioEvent( + event_type="audio", + service_event="response.audio.delta", + audio=AudioContent(...) +) +``` + +Next to these we will have a generic event, called ServiceEvent. This is the catch-all, which has event_type "service", the service_event field filled with the event type from the service, and a field called 'event' which contains the raw event from the service. + +```python +ServiceEvent( + event_type="service", + service_event="conversation.item.create", + event={ ... } +) +``` + +This allows you to easily filter on the event_type, and then use the service_event to filter on the specific event type, and then use the content field to get the content, or the event field to get the raw event. # Programming model @@ -137,10 +160,11 @@ The programming model for the clients needs to be simple and easy to use, while _In this section we will refer to events for both content and events, regardless of the decision made in the previous section._ -1. Async generator for receiving events, that yields contents, combined with a event handler/callback mechanism for receiving events and a function for sending events +1. Async generator for receiving events, that yields Events, combined with a event handler/callback mechanism for receiving events and a function for sending events - 1a: Single event handlers, where each event is passed to the handler - 1b: Multiple event handlers, where each event type has its own handler 2. Event buffers/queues that are exposed to the developer, start sending and start receiving methods, that just initiate the sending and receiving of events and thereby the filling of the buffers +3. 
Purely a start listening method that yields Events, and a send method that sends events ### 1. Async generator for receiving events, that yields contents, combined with a event handler/callback mechanism for receiving events and a function for sending events This would mean that the client would have a mechanism to register event handlers, and the integration would call these handlers when an event is received. For sending events, a function would be created that sends the event to the service. @@ -173,7 +197,18 @@ This would mean that the audio content is handled, and passed to the developer c ## Decision Outcome - Programming model -Chosen option: ... +Chosen option: Purely a start listening method that yields Events, and a send method that sends events + +This makes the programming model very easy, a minimal setup that should work for every service and protocol would look like this: +```python +async for event in realtime_client.start_streaming(): + match event.event_type: + case "audio": + await audio_player.add_audio(event.audio) + case "text": + print(event.text.text) +``` + # Audio speaker/microphone handling diff --git a/python/samples/concepts/audio/04-chat_with_realtime_api_simple.py b/python/samples/concepts/audio/04-chat_with_realtime_api_simple.py new file mode 100644 index 000000000000..116f9e3f8d81 --- /dev/null +++ b/python/samples/concepts/audio/04-chat_with_realtime_api_simple.py @@ -0,0 +1,88 @@ +# Copyright (c) Microsoft. All rights reserved. 
+ +import asyncio +import logging + +from semantic_kernel.connectors.ai.open_ai import ( + OpenAIRealtime, + OpenAIRealtimeExecutionSettings, + TurnDetection, +) +from semantic_kernel.connectors.ai.utils import SKAudioPlayer + +logging.basicConfig(level=logging.WARNING) +aiortc_log = logging.getLogger("aiortc") +aiortc_log.setLevel(logging.WARNING) +aioice_log = logging.getLogger("aioice") +aioice_log.setLevel(logging.WARNING) +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +# This simple sample demonstrates how to use the OpenAI Realtime API to create +# a chat bot that can listen and respond directly through audio. +# It requires installing: +# - semantic-kernel[openai_realtime] +# - pyaudio +# - sounddevice +# - pydub +# - aiortc +# e.g. pip install pyaudio sounddevice pydub + +# The characteristics of your speaker and microphone are a big factor in a smooth conversation +# so you may need to try out different devices for each. +# you can also play around with the turn_detection settings to get the best results. +# It has device id's set in the AudioRecorderStream and AudioPlayerAsync classes, +# so you may need to adjust these for your system. +# you can check the available devices by uncommenting the line below the function + + +def check_audio_devices(): + import sounddevice as sd + + logger.debug(sd.query_devices()) + + +check_audio_devices() + + +async def main() -> None: + # create the realtime client and optionally add the audio output function, this is optional + # you can define the protocol to use, either "websocket" or "webrtc" + # they will behave the same way, even though the underlying protocol is quite different + realtime_client = OpenAIRealtime(protocol="webrtc") + # Create the settings for the session + settings = OpenAIRealtimeExecutionSettings( + instructions=""" + You are a chat bot. Your name is Mosscap and + you have one goal: figure out what people need. 
+ Your full name, should you need to know it, is + Splendid Speckled Mosscap. You communicate + effectively, but you tend to answer with long + flowery prose. + """, + voice="alloy", + turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8), + ) + # the context manager calls the create_session method on the client and start listening to the audio stream + audio_player = SKAudioPlayer() + async with realtime_client, audio_player: + await realtime_client.update_session(settings=settings, create_response=True) + async for event in realtime_client.start_streaming(): + match event.event_type: + case "audio": + await audio_player.add_audio(event.audio) + case "text": + print(event.text.text) + case "service": + if event.service_type == "session.update": + print("Session updated") + if event.service_type == "error": + logger.error(event.event) + + +if __name__ == "__main__": + print( + "Instruction: start speaking, when you stop the API should detect you finished and start responding. " + "Press ctrl + c to stop the program." 
+ ) + asyncio.run(main()) diff --git a/python/samples/concepts/audio/04-chat_with_realtime_api.py b/python/samples/concepts/audio/05-chat_with_realtime_api_complex.py similarity index 68% rename from python/samples/concepts/audio/04-chat_with_realtime_api.py rename to python/samples/concepts/audio/05-chat_with_realtime_api_complex.py index a1884349b3e7..18785b81348d 100644 --- a/python/samples/concepts/audio/04-chat_with_realtime_api.py +++ b/python/samples/concepts/audio/05-chat_with_realtime_api_complex.py @@ -13,9 +13,8 @@ OpenAIRealtimeExecutionSettings, TurnDetection, ) -from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase from semantic_kernel.connectors.ai.utils import SKAudioPlayer -from semantic_kernel.contents import ChatHistory, StreamingChatMessageContent +from semantic_kernel.contents import ChatHistory from semantic_kernel.functions import kernel_function logging.basicConfig(level=logging.WARNING) @@ -53,47 +52,6 @@ def check_audio_devices(): check_audio_devices() -class ReceivingStreamHandler: - """This is a simple class that listens to the received buffer of the RealtimeClientBase. - - It can be used to play audio and print the transcript of the conversation. - - It can also be used to act on other events from the service. 
- """ - - def __init__(self, realtime_client: RealtimeClientBase, audio_player: SKAudioPlayer | None = None): - self.audio_player = audio_player - self.realtime_client = realtime_client - - async def listen( - self, - play_audio: bool = True, - print_transcript: bool = True, - ) -> None: - # print the start message of the transcript - if print_transcript: - print("Mosscap (transcript): ", end="") - try: - # start listening for events - while True: - event_type, event = await self.realtime_client.receive_buffer.get() - match event_type: - case ListenEvents.RESPONSE_AUDIO_DELTA: - if play_audio and self.audio_player and isinstance(event, StreamingChatMessageContent): - await self.audio_player.add_audio(event.items[0]) - case ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA: - if print_transcript and isinstance(event, StreamingChatMessageContent): - print(event.content, end="") - case ListenEvents.RESPONSE_CREATED: - if print_transcript: - print("") - # case ....: - # # add other event handling here - await asyncio.sleep(0.01) - except asyncio.CancelledError: - print("\nThanks for talking to Mosscap!") - - @kernel_function def get_weather(location: str) -> str: """Get the weather for a location.""" @@ -111,6 +69,7 @@ def get_date_time() -> str: async def main() -> None: + print_transcript = True # create the Kernel and add a simple function for function calling. 
kernel = Kernel() kernel.add_function(plugin_name="weather", function_name="get_weather", function=get_weather) @@ -122,12 +81,6 @@ async def main() -> None: # they will behave the same way, even though the underlying protocol is quite different realtime_client = OpenAIRealtime(protocol="webrtc", audio_output_callback=audio_player.client_callback) - # create stream receiver (defined above), this can play the audio, - # if the audio_player is passed (commented out here) - # and allows you to print the transcript of the conversation - # and review or act on other events from the service - stream_handler = ReceivingStreamHandler(realtime_client) # SimplePlayer(device_id=None) - # Create the settings for the session # The realtime api, does not use a system message, but takes instructions as a parameter for a session instructions = """ @@ -165,9 +118,25 @@ async def main() -> None: # item=ChatMessageContent(role="user", content="Hi there, who are you?")}, # ) # await realtime_client.send(SendEvents.RESPONSE_CREATE) - async with asyncio.TaskGroup() as tg: - tg.create_task(realtime_client.start_streaming()) - tg.create_task(stream_handler.listen()) + print("Mosscap (transcript): ", end="") + async for event in realtime_client.start_streaming(): + match event.event_type: + # case "audio": + # if play_audio and audio_player: + # await audio_player.add_audio(event.audio) + case "text": + if print_transcript: + print(event.text.text, end="") + case "service": + # OpenAI Specific events + match event.service_type: + case ListenEvents.RESPONSE_CREATED: + if print_transcript: + print("") + case ListenEvents.ERROR: + logger.error(event.event) + # case ....: + # # add other event handling here if __name__ == "__main__": diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py index 2865138cf0bb..5f2b49020fb6 100644 --- 
a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py @@ -2,9 +2,19 @@ import logging import sys -from collections.abc import Callable, Coroutine +from collections.abc import AsyncGenerator, Callable, Coroutine from typing import TYPE_CHECKING, Any, ClassVar, Literal +from semantic_kernel.contents.function_result_content import FunctionResultContent +from semantic_kernel.contents.realtime_event import ( + FunctionCallEvent, + FunctionResultEvent, + RealtimeEvent, + ServiceEvent, + TextEvent, +) +from semantic_kernel.contents.streaming_text_content import StreamingTextContent + if sys.version_info >= (3, 12): from typing import override # pragma: no cover else: @@ -31,8 +41,6 @@ from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase from semantic_kernel.contents.chat_history import ChatHistory from semantic_kernel.contents.function_call_content import FunctionCallContent -from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent -from semantic_kernel.contents.utils.author_role import AuthorRole from semantic_kernel.kernel import Kernel from semantic_kernel.utils.experimental_decorator import experimental_class @@ -56,7 +64,7 @@ class OpenAIRealtimeBase(OpenAIHandler, RealtimeClientBase): _current_settings: PromptExecutionSettings | None = PrivateAttr(None) _call_id_to_function_map: dict[str, str] = PrivateAttr(default_factory=dict) - async def _handle_event(self, event: RealtimeServerEvent) -> None: + async def _parse_event(self, event: RealtimeServerEvent) -> AsyncGenerator[RealtimeEvent, None]: """Handle all events but audio delta. 
Audio delta has to be handled by the implementation of the protocol as some @@ -64,38 +72,35 @@ async def _handle_event(self, event: RealtimeServerEvent) -> None: """ match event.type: case ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA.value: - await self.receive_buffer.put(( - event.type, - StreamingChatMessageContent( - role=AuthorRole.ASSISTANT, - content=event.delta, - choice_index=event.content_index, + yield TextEvent( + event_type="text", + service_type=event.type, + text=StreamingTextContent( inner_content=event, + text=event.delta, + choice_index=0, ), - )) + ) case ListenEvents.RESPONSE_OUTPUT_ITEM_ADDED.value: if event.item.type == "function_call" and event.item.call_id and event.item.name: self._call_id_to_function_map[event.item.call_id] = event.item.name case ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DELTA.value: - await self.receive_buffer.put(( - event.type, - StreamingChatMessageContent( - role=AuthorRole.ASSISTANT, - items=[ - FunctionCallContent( - id=event.item_id, - name=event.call_id, - arguments=event.delta, - index=event.output_index, - metadata={"call_id": event.call_id}, - ) - ], - choice_index=0, + yield FunctionCallEvent( + event_type="function_call", + service_type=event.type, + function_call=FunctionCallContent( + id=event.item_id, + name=self._call_id_to_function_map[event.call_id], + arguments=event.delta, + index=event.output_index, + metadata={"call_id": event.call_id}, inner_content=event, ), - )) + ) case ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE.value: - await self._handle_function_call_arguments_done(event) + async for parsed_event in self._parse_function_call_arguments_done(event): + if parsed_event: + yield parsed_event case ListenEvents.ERROR.value: logger.error("Error received: %s", event.error) case ListenEvents.SESSION_CREATED.value, ListenEvents.SESSION_UPDATED.value: @@ -105,7 +110,7 @@ async def _handle_event(self, event: RealtimeServerEvent) -> None: # we put all event in the output buffer, but after the 
interpreted one. # so when dealing with them, make sure to check the type of the event, since they # might be of different types. - await self.receive_buffer.put((event.type, event)) + yield ServiceEvent(event_type="service", service_type=event.type, event=event) @override async def update_session( @@ -126,12 +131,16 @@ async def update_session( self._update_function_choice_settings_callback(), kernel=self.kernel, # type: ignore ) - await self.send(SendEvents.SESSION_UPDATE, settings=self._current_settings) + await self.send( + ServiceEvent(event_type="service", service_type=SendEvents.SESSION_UPDATE, event=self._current_settings) + ) if chat_history and len(chat_history) > 0: for msg in chat_history.messages: - await self.send(SendEvents.CONVERSATION_ITEM_CREATE, item=msg) + await self.send( + ServiceEvent(event_type="service", service_type=SendEvents.CONVERSATION_ITEM_CREATE, event=msg) + ) if create_response: - await self.send(SendEvents.RESPONSE_CREATE) + await self.send(ServiceEvent(event_type="service", service_type=SendEvents.RESPONSE_CREATE)) @override def get_prompt_execution_settings_class(self) -> type["PromptExecutionSettings"]: @@ -147,20 +156,22 @@ def _update_function_choice_settings_callback( ) -> Callable[[FunctionCallChoiceConfiguration, "PromptExecutionSettings", FunctionChoiceType], None]: return update_settings_from_function_call_configuration - async def _handle_function_call_arguments_done( + async def _parse_function_call_arguments_done( self, event: ResponseFunctionCallArgumentsDoneEvent, - ) -> None: + ) -> AsyncGenerator[RealtimeEvent | None]: """Handle response function call done.""" if not self.kernel or ( self._current_settings and self._current_settings.function_choice_behavior and not self._current_settings.function_choice_behavior.auto_invoke_kernel_functions ): + yield None return plugin_name, function_name = self._call_id_to_function_map.pop(event.call_id, "-").split("-", 1) if not plugin_name or not function_name: 
logger.error("Function call needs to have a plugin name and function name") + yield None return item = FunctionCallContent( id=event.item_id, @@ -170,15 +181,22 @@ async def _handle_function_call_arguments_done( index=event.output_index, metadata={"call_id": event.call_id}, ) + yield FunctionCallEvent( + event_type="function_call", + service_type=ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE, + function_call=item, + ) chat_history = ChatHistory() await self.kernel.invoke_function_call(item, chat_history) - created_output = chat_history.messages[-1] + created_output: FunctionResultContent = chat_history.messages[-1].items[0] # type: ignore # This returns the output to the service - await self.send(SendEvents.CONVERSATION_ITEM_CREATE, item=created_output) + await self.send( + ServiceEvent(event_type="service", service_type=SendEvents.CONVERSATION_ITEM_CREATE, event=created_output) + ) # The model doesn't start responding to the tool call automatically, so triggering it here. - await self.send(SendEvents.RESPONSE_CREATE) + await self.send(ServiceEvent(event_type="service", service_type=SendEvents.RESPONSE_CREATE)) # This allows a user to have a full conversation in his code - await self.receive_buffer.put((ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE, created_output)) + yield FunctionResultEvent(event_type="function_result", function_result=created_output) @override async def start_listening( diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py index 1cfd68db0aaa..1225c2927345 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py @@ -5,9 +5,10 @@ import json import logging import sys +from collections.abc import AsyncGenerator from typing import TYPE_CHECKING, Any, ClassVar, 
Literal, cast -from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_base import OpenAIRealtimeBase +from pydantic import Field if sys.version_info >= (3, 12): from typing import override # pragma: no cover @@ -28,12 +29,13 @@ from openai.types.beta.realtime.realtime_server_event import RealtimeServerEvent from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents, SendEvents +from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_base import OpenAIRealtimeBase +from semantic_kernel.connectors.ai.realtime_client_base import RealtimeEvent from semantic_kernel.contents.audio_content import AudioContent from semantic_kernel.contents.function_call_content import FunctionCallContent from semantic_kernel.contents.function_result_content import FunctionResultContent -from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent +from semantic_kernel.contents.realtime_event import AudioEvent from semantic_kernel.contents.text_content import TextContent -from semantic_kernel.contents.utils.author_role import AuthorRole from semantic_kernel.utils.experimental_decorator import experimental_class if TYPE_CHECKING: @@ -53,6 +55,7 @@ class OpenAIRealtimeWebRTCBase(OpenAIRealtimeBase): protocol: ClassVar[Literal["webrtc"]] = "webrtc" peer_connection: RTCPeerConnection | None = None data_channel: RTCDataChannel | None = None + receive_buffer: asyncio.Queue[RealtimeEvent] = Field(default_factory=asyncio.Queue) # region public methods @@ -63,9 +66,12 @@ async def start_listening( chat_history: "ChatHistory | None" = None, create_response: bool = False, **kwargs: Any, - ) -> None: + ) -> AsyncGenerator[RealtimeEvent, None]: if chat_history or settings or create_response: await self.update_session(settings=settings, chat_history=chat_history, create_response=create_response) + while True: + event = await self.receive_buffer.get() + yield event @override async def 
start_sending(self, **kwargs: Any) -> None: @@ -75,20 +81,18 @@ async def start_sending(self, **kwargs: Any) -> None: while self.data_channel.readyState != "open": await asyncio.sleep(0.1) while True: - event, data = await self.send_buffer.get() - if not isinstance(event, SendEvents): - event = SendEvents(event) - response: dict[str, Any] = {"type": event.value} - match event: + event = await self.send_buffer.get() + response: dict[str, Any] = {"type": event.event_type} + match event.event_type: case SendEvents.SESSION_UPDATE: - if "settings" not in data: + if "settings" not in event.data: logger.error("Event data does not contain 'settings'") - response["session"] = data["settings"].prepare_settings_dict() + response["session"] = event.data["settings"].prepare_settings_dict() case SendEvents.CONVERSATION_ITEM_CREATE: - if "item" not in data: + if "item" not in event.data: logger.error("Event data does not contain 'item'") return - content = data["item"] + content = event.data["item"] for item in content.items: match item: case TextContent(): @@ -131,24 +135,24 @@ async def start_sending(self, **kwargs: Any) -> None: ) case SendEvents.CONVERSATION_ITEM_TRUNCATE: - if "item_id" not in data: + if "item_id" not in event.data: logger.error("Event data does not contain 'item_id'") return - response["item_id"] = data["item_id"] + response["item_id"] = event.data["item_id"] response["content_index"] = 0 - response["audio_end_ms"] = data.get("audio_end_ms", 0) + response["audio_end_ms"] = event.data.get("audio_end_ms", 0) case SendEvents.CONVERSATION_ITEM_DELETE: - if "item_id" not in data: + if "item_id" not in event.data: logger.error("Event data does not contain 'item_id'") return - response["item_id"] = data["item_id"] + response["item_id"] = event.data["item_id"] case SendEvents.RESPONSE_CREATE: - if "response" in data: - response["response"] = data["response"] + if "response" in event.data: + response["response"] = event.data["response"] case 
SendEvents.RESPONSE_CANCEL: - if "response_id" in data: - response["response_id"] = data["response_id"] + if "response_id" in event.data: + response["response_id"] = event.data["response_id"] try: self.data_channel.send(json.dumps(response)) @@ -249,13 +253,10 @@ async def _on_track(self, track: "MediaStreamTrack") -> None: logger.error(f"Error playing remote audio frame: {e!s}") try: await self.receive_buffer.put( - ( - ListenEvents.RESPONSE_AUDIO_DELTA, - StreamingChatMessageContent( - role=AuthorRole.ASSISTANT, - items=[AudioContent(data=frame.to_ndarray(), data_format="np.int16", inner_content=frame)], # type: ignore - choice_index=0, - ), + AudioEvent( + event_type="audio", + service_type=ListenEvents.RESPONSE_AUDIO_DELTA, + audio=AudioContent(data=frame.to_ndarray(), data_format="np.int16", inner_content=frame), # type: ignore ), ) except Exception as e: @@ -276,7 +277,8 @@ async def _on_data(self, data: str) -> None: except Exception as e: logger.error(f"Failed to parse event {data} with error: {e!s}") return - await self._handle_event(event) + async for parsed_event in self._parse_event(event): + await self.receive_buffer.put(parsed_event) async def _get_ephemeral_token(self) -> str: """Get an ephemeral token from OpenAI.""" diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py index 85048a4bfaef..4f32067ba3cc 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py @@ -5,12 +5,11 @@ import json import logging import sys +from collections.abc import AsyncGenerator from typing import TYPE_CHECKING, Any, ClassVar, Literal import numpy as np -from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_base import OpenAIRealtimeBase - if sys.version_info 
>= (3, 12): from typing import override # pragma: no cover else: @@ -21,6 +20,7 @@ from pydantic import Field from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents, SendEvents +from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_base import OpenAIRealtimeBase from semantic_kernel.contents.audio_content import AudioContent from semantic_kernel.contents.function_call_content import FunctionCallContent from semantic_kernel.contents.function_result_content import FunctionResultContent @@ -53,7 +53,7 @@ async def start_listening( chat_history: "ChatHistory | None" = None, create_response: bool = False, **kwargs: Any, - ) -> None: + ) -> AsyncGenerator[tuple[str, Any], None]: await self.connected.wait() if not self.connection: raise ValueError("Connection is not established.") @@ -66,7 +66,7 @@ async def start_listening( if self.audio_output_callback: await self.audio_output_callback(np.frombuffer(base64.b64decode(event.delta), dtype=np.int16)) try: - await self.receive_buffer.put(( + yield ( event.type, StreamingChatMessageContent( role=AuthorRole.ASSISTANT, @@ -79,11 +79,12 @@ async def start_listening( ], # type: ignore choice_index=event.content_index, ), - )) + ) except Exception as e: logger.error(f"Error processing remote audio frame: {e!s}") else: - await self._handle_event(event) + async for event in self._parse_event(event): + yield event @override async def start_sending(self, **kwargs: Any) -> None: @@ -91,26 +92,26 @@ async def start_sending(self, **kwargs: Any) -> None: if not self.connection: raise ValueError("Connection is not established.") while True: - event, data = await self.send_buffer.get() - match event: + event = await self.send_buffer.get() + match event.event_type: case SendEvents.SESSION_UPDATE: - if "settings" not in data: + if "settings" not in event.data: logger.error("Event data does not contain 'settings'") - await 
self.connection.session.update(session=data["settings"].prepare_settings_dict()) + await self.connection.session.update(session=event.data["settings"].prepare_settings_dict()) case SendEvents.INPUT_AUDIO_BUFFER_APPEND: - if "content" not in data: + if "content" not in event.data: logger.error("Event data does not contain 'content'") return - await self.connection.input_audio_buffer.append(audio=data["content"].data.decode("utf-8")) + await self.connection.input_audio_buffer.append(audio=event.data["content"].data.decode("utf-8")) case SendEvents.INPUT_AUDIO_BUFFER_COMMIT: await self.connection.input_audio_buffer.commit() case SendEvents.INPUT_AUDIO_BUFFER_CLEAR: await self.connection.input_audio_buffer.clear() case SendEvents.CONVERSATION_ITEM_CREATE: - if "item" not in data: + if "item" not in event.data: logger.error("Event data does not contain 'item'") return - content = data["item"] + content = event.data["item"] for item in content.items: match item: case TextContent(): @@ -156,25 +157,25 @@ async def start_sending(self, **kwargs: Any) -> None: ) ) case SendEvents.CONVERSATION_ITEM_TRUNCATE: - if "item_id" not in data: + if "item_id" not in event.data: logger.error("Event data does not contain 'item_id'") return await self.connection.conversation.item.truncate( - item_id=data["item_id"], content_index=0, audio_end_ms=data.get("audio_end_ms", 0) + item_id=event.data["item_id"], content_index=0, audio_end_ms=event.data.get("audio_end_ms", 0) ) case SendEvents.CONVERSATION_ITEM_DELETE: - if "item_id" not in data: + if "item_id" not in event.data: logger.error("Event data does not contain 'item_id'") return - await self.connection.conversation.item.delete(item_id=data["item_id"]) + await self.connection.conversation.item.delete(item_id=event.data["item_id"]) case SendEvents.RESPONSE_CREATE: - if "response" in data: - await self.connection.response.create(response=data["response"]) + if "response" in event.data: + await 
self.connection.response.create(response=event.data["response"]) else: await self.connection.response.create() case SendEvents.RESPONSE_CANCEL: - if "response_id" in data: - await self.connection.response.cancel(response_id=data["response_id"]) + if "response_id" in event.data: + await self.connection.response.cancel(response_id=event.data["response_id"]) else: await self.connection.response.cancel() diff --git a/python/semantic_kernel/connectors/ai/realtime_client_base.py b/python/semantic_kernel/connectors/ai/realtime_client_base.py index 5f6fa302d545..a6a332791293 100644 --- a/python/semantic_kernel/connectors/ai/realtime_client_base.py +++ b/python/semantic_kernel/connectors/ai/realtime_client_base.py @@ -3,7 +3,7 @@ import sys from abc import ABC, abstractmethod from asyncio import Queue -from collections.abc import Callable +from collections.abc import AsyncGenerator, Callable from typing import TYPE_CHECKING, Any, ClassVar from pydantic import Field @@ -15,6 +15,7 @@ from semantic_kernel.connectors.ai.function_call_choice_configuration import FunctionCallChoiceConfiguration from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceType +from semantic_kernel.contents.realtime_event import RealtimeEvent from semantic_kernel.services.ai_service_client_base import AIServiceClientBase from semantic_kernel.utils.experimental_decorator import experimental_class @@ -28,24 +29,23 @@ class RealtimeClientBase(AIServiceClientBase, ABC): """Base class for a realtime client.""" SUPPORTS_FUNCTION_CALLING: ClassVar[bool] = False - send_buffer: Queue[tuple[str, Any]] = Field(default_factory=Queue) - receive_buffer: Queue[tuple[str, Any]] = Field(default_factory=Queue) + send_buffer: Queue[RealtimeEvent] = Field(default_factory=Queue) - async def send(self, event: str, **kwargs: Any) -> None: + async def send(self, event: RealtimeEvent) -> None: """Send an event to the service. Args: event: The event to send. kwargs: Additional arguments. 
""" - await self.send_buffer.put((event, kwargs)) + await self.send_buffer.put(event) async def start_streaming( self, settings: "PromptExecutionSettings | None" = None, chat_history: "ChatHistory | None" = None, **kwargs: Any, - ) -> None: + ) -> AsyncGenerator[RealtimeEvent, None]: """Start streaming, will start both listening and sending. This method, start tasks for both listening and sending. @@ -57,9 +57,10 @@ async def start_streaming( chat_history: Chat history. kwargs: Additional arguments. """ + await self.update_session(settings=settings, chat_history=chat_history, **kwargs) async with TaskGroup() as tg: - tg.create_task(self.start_listening(settings=settings, chat_history=chat_history, **kwargs)) tg.create_task(self.start_sending(**kwargs)) + yield from tg.create_task(self.start_listening()) @abstractmethod async def start_listening( @@ -67,8 +68,8 @@ async def start_listening( settings: "PromptExecutionSettings | None" = None, chat_history: "ChatHistory | None" = None, **kwargs: Any, - ) -> None: - """Starts listening for messages from the service, adds them to the output_buffer. + ) -> AsyncGenerator[RealtimeEvent, None]: + """Starts listening for messages from the service, generates events. Args: settings: Prompt execution settings. diff --git a/python/semantic_kernel/contents/realtime_event.py b/python/semantic_kernel/contents/realtime_event.py new file mode 100644 index 000000000000..7de87f078ff6 --- /dev/null +++ b/python/semantic_kernel/contents/realtime_event.py @@ -0,0 +1,56 @@ +# Copyright (c) Microsoft. All rights reserved. 
+ +from typing import Annotated, Any, Literal, TypeAlias, Union + +from pydantic import Field + +from semantic_kernel.contents.audio_content import AudioContent +from semantic_kernel.contents.function_call_content import FunctionCallContent +from semantic_kernel.contents.function_result_content import FunctionResultContent +from semantic_kernel.contents.text_content import TextContent +from semantic_kernel.kernel_pydantic import KernelBaseModel + +RealtimeEvent: TypeAlias = Annotated[ + Union["ServiceEvent", "AudioEvent", "TextEvent", "FunctionCallEvent", "FunctionResultEvent"], + Field(discriminator="event_type"), +] + + +class ServiceEvent(KernelBaseModel): + """Base class for all service events.""" + + event_type: Literal["service"] + service_type: str + event: Any | None = None + + +class AudioEvent(KernelBaseModel): + """Audio event type.""" + + event_type: Literal["audio"] + service_type: str | None = None + audio: AudioContent + + +class TextEvent(KernelBaseModel): + """Text event type.""" + + event_type: Literal["text"] + service_type: str | None = None + text: TextContent + + +class FunctionCallEvent(KernelBaseModel): + """Function call event type.""" + + event_type: Literal["function_call"] + service_type: str | None = None + function_call: FunctionCallContent + + +class FunctionResultEvent(KernelBaseModel): + """Function result event type.""" + + event_type: Literal["function_result"] + service_type: str | None = None + function_result: FunctionResultContent From 9fb0eb7a8a922b9debb5c9119a7266875123617b Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Wed, 29 Jan 2025 17:00:21 +0100 Subject: [PATCH 21/50] wip on redoing the api --- docs/decisions/00XX-realtime-api-clients.md | 111 ++++++++- .../audio/04-chat_with_realtime_api_simple.py | 23 +- .../05-chat_with_realtime_api_complex.py | 33 +-- python/samples/concepts/audio/utils.py | 11 + .../ai/open_ai/services/open_ai_realtime.py | 11 +- .../realtime/open_ai_realtime_base.py | 233 +++++++++++++++--- 
.../realtime/open_ai_realtime_webrtc.py | 133 ++-------- .../realtime/open_ai_realtime_websocket.py | 118 +-------- .../ai/open_ai/services/realtime/utils.py | 80 ++++++ .../connectors/ai/realtime_client_base.py | 50 +--- .../connectors/ai/utils/realtime_helpers.py | 91 ++++--- .../contents/events/__init__.py | 19 ++ .../contents/{ => events}/realtime_event.py | 0 13 files changed, 551 insertions(+), 362 deletions(-) create mode 100644 python/samples/concepts/audio/utils.py create mode 100644 python/semantic_kernel/contents/events/__init__.py rename python/semantic_kernel/contents/{ => events}/realtime_event.py (100%) diff --git a/docs/decisions/00XX-realtime-api-clients.md b/docs/decisions/00XX-realtime-api-clients.md index 94c7351ba7fe..bde864d79b52 100644 --- a/docs/decisions/00XX-realtime-api-clients.md +++ b/docs/decisions/00XX-realtime-api-clients.md @@ -130,8 +130,8 @@ This would mean that all events are retained and returned to the developer as is Chosen option: 3 Treat Everything as Events -This option was chosen to allow abstraction away from the raw events, while still allowing the developer to access the raw events if needed. This allows for a simple programming model, while still allowing for complex interactions. -A set of events are defined, for basic types, like 'audio', 'text', 'function_call', 'function_result', it then has two other fields, service_event which is filled with the event type from the service and a field for the actual content, with a name that makes sense: +This option was chosen to allow abstraction away from the raw events, while still allowing the developer to access the raw events if needed. 
+A set of events are defined, for basic types, like 'audio', 'text', 'function_call', 'function_result', it then has two other fields, service_event which is filled with the event type from the service and a field for the actual content, with a name that corresponds to the event type: ```python AudioEvent( @@ -153,6 +153,15 @@ ServiceEvent( This allows you to easily filter on the event_type, and then use the service_event to filter on the specific event type, and then use the content field to get the content, or the event field to get the raw event. +Collectively these are known as *RealtimeEvents*, and are returned as an async generator from the client, so you can easily loop over them. And they are passed to the send method. + +Initially RealtimeEvents are: +- AudioEvent +- TextEvent +- FunctionCallEvent +- FunctionResultEvent +- ServiceEvent + # Programming model ## Considered Options - Programming model @@ -176,7 +185,7 @@ This would mean that the client would have a mechanism to register event handler - developer judgement needs to be made (or exposed with parameters) on what is returned through the async generator and what is passed to the event handlers ### 2. Event buffers/queues that are exposed to the developer, start sending and start receiving methods, that just initiate the sending and receiving of events and thereby the filling of the buffers -This would mean that the there are two queues, one for sending and one for receiving, and the developer can listen to the receiving queue and send to the sending queue. Internal things like parsing events to content types and auto-function calling are processed first, and the result is put in the queue, the content type should use inner_content to capture the full event and these might add a message to the send queue as well. +This would mean that there are two queues, one for sending and one for receiving, and the developer can listen to the receiving queue and send to the sending queue. 
Internal things like parsing events to content types and auto-function calling are processed first, and the result is put in the queue, the content type should use inner_content to capture the full event and these might add a message to the send queue as well. - Pro: - simple to use, just start sending and start receiving @@ -239,3 +248,99 @@ This would mean that the client would receive AudioContent items, and would have ## Decision Outcome - Audio speaker/microphone handling Chosen option: ... + +# Interface design + +## Considered Options - Interface design + +1. Use a single class for everything +2. Split the service class from a session class. + +The following methods will need to be supported: +- create session +- update session +- close session +- listen for/receive events +- send events + +### 1. Use a single class for everything + +Each implementation would have to implement all of the above methods. This means that non-protocol specific elements are in the same class as the protocol specific elements and will lead to code duplication between them. + +### 2. Split the service class from a session class. + +Two interfaces are created: +- Service: create session, update session, delete session, list sessions +- Session: listen for/receive events, send events, update session, close session + +Currently neither the google nor the openai APIs support restarting sessions, so the advantage of splitting is mostly an implementation question but will not add any benefits to the user. + +This means that the split would be far simpler: +- Service: create session +- Session: listen for/receive events, send events, update session, close session + +## Naming + +The send and listen/receive methods need to be clear in the way they are named and this can become confusing when dealing with these APIs. The following options are considered: + +Options for sending events to the service from your code: +- google uses .send in their client. 
+- OpenAI uses .send in their client as well +- send or send_message is used in other clients, like Azure Communication Services + +Options for listening for events from the service in your code: +- google uses .receive in their client. +- openai uses .recv in their client. +- others use receive or receive_messages in their clients. + +### Decision Outcome - Interface design + +Chosen option: Use a single class for everything +Chosen for send and receive as verbs. + +This means that the interface will look like this: +```python + +class RealtimeClient: + async def create_session(self, settings: PromptExecutionSettings, chat_history: ChatHistory, **kwargs) -> None: + ... + + async def update_session(self, settings: PromptExecutionSettings, chat_history: ChatHistory, **kwargs) -> None: + ... + + async def close_session(self, **kwargs) -> None: + ... + + async def receive(self, **kwargs) -> AsyncGenerator[RealtimeEvent, None]: + ... + + async def send(self, event: RealtimeEvent) -> None: + ... +``` + +In most cases, create_session should call update_session with the same parameters, since update session can also be done separately later on with the same inputs. + +For Python a default __aenter__ and __aexit__ method should be added to the class, so it can be used in an `async with` statement, which calls create_session and close_session respectively. + +It is advisable, but not required, to implement the send method through a buffer/queue so that events can be 'sent' before the session has been established without losing them or raising exceptions, this might take a few seconds and in that time a single send call would block the application. + +For receiving, an internal implementation might also rely on a buffer/queue, but this is up to the developer and what makes sense for that service. 
For instance webrtc relies on defining the callback at create session time, so the create_session method adds a function that adds events to the queue and the receive method starts reading from and yielding from that queue. + +The send method should handle all events types, but it might have to handle the same thing in two ways, for instance: +```python +audio = AudioContent(...) + +await client.send(AudioEvent(event_type='audio', audio=audio)) +``` + +is equivalent to (at least in the case of OpenAI): +```python +audio = AudioContent(...) + +await client.send(ServiceEvent(event_type='service', service_event='input_audio_buffer.append', event=audio)) +``` + +The first version allows one to have the exact same code for all services, while the second version is also correct and should be handled correctly as well, this once again allows for flexibility and simplicity, when audio needs to be sent to with a different event type, that is still possible in the second way, while the first uses the "default" event type for that particular service. + + + diff --git a/python/samples/concepts/audio/04-chat_with_realtime_api_simple.py b/python/samples/concepts/audio/04-chat_with_realtime_api_simple.py index 116f9e3f8d81..5dda1dc6d308 100644 --- a/python/samples/concepts/audio/04-chat_with_realtime_api_simple.py +++ b/python/samples/concepts/audio/04-chat_with_realtime_api_simple.py @@ -3,7 +3,9 @@ import asyncio import logging +from samples.concepts.audio.utils import check_audio_devices from semantic_kernel.connectors.ai.open_ai import ( + ListenEvents, OpenAIRealtime, OpenAIRealtimeExecutionSettings, TurnDetection, @@ -34,14 +36,6 @@ # It has device id's set in the AudioRecorderStream and AudioPlayerAsync classes, # so you may need to adjust these for your system. 
# you can check the available devices by uncommenting line below the function - - -def check_audio_devices(): - import sounddevice as sd - - logger.debug(sd.query_devices()) - - check_audio_devices() @@ -65,18 +59,23 @@ async def main() -> None: ) # the context manager calls the create_session method on the client and start listening to the audio stream audio_player = SKAudioPlayer() + print("Mosscap (transcript): ", end="") async with realtime_client, audio_player: await realtime_client.update_session(settings=settings, create_response=True) - async for event in realtime_client.start_streaming(): + + async for event in realtime_client.receive(): match event.event_type: case "audio": await audio_player.add_audio(event.audio) case "text": - print(event.text.text) + print(event.text.text, end="") case "service": - if event.service_type == "session.update": + # OpenAI Specific events + if event.service_type == ListenEvents.SESSION_UPDATED: print("Session updated") - if event.service_type == "error": + if event.service_type == ListenEvents.RESPONSE_CREATED: + print("") + if event.service_type == ListenEvents.ERROR: logger.error(event.event) diff --git a/python/samples/concepts/audio/05-chat_with_realtime_api_complex.py b/python/samples/concepts/audio/05-chat_with_realtime_api_complex.py index 18785b81348d..77406af4f355 100644 --- a/python/samples/concepts/audio/05-chat_with_realtime_api_complex.py +++ b/python/samples/concepts/audio/05-chat_with_realtime_api_complex.py @@ -5,6 +5,7 @@ from datetime import datetime from random import randint +from samples.concepts.audio.utils import check_audio_devices from semantic_kernel import Kernel from semantic_kernel.connectors.ai import FunctionChoiceBehavior from semantic_kernel.connectors.ai.open_ai import ( @@ -13,7 +14,7 @@ OpenAIRealtimeExecutionSettings, TurnDetection, ) -from semantic_kernel.connectors.ai.utils import SKAudioPlayer +from semantic_kernel.connectors.ai.utils import SKAudioPlayer, SKAudioTrack from 
semantic_kernel.contents import ChatHistory from semantic_kernel.functions import kernel_function @@ -43,12 +44,6 @@ # you can check the available devices by uncommenting line below the function -def check_audio_devices(): - import sounddevice as sd - - logger.debug(sd.query_devices()) - - check_audio_devices() @@ -75,11 +70,18 @@ async def main() -> None: kernel.add_function(plugin_name="weather", function_name="get_weather", function=get_weather) kernel.add_function(plugin_name="time", function_name="get_date_time", function=get_date_time) - # create the realtime client and optionally add the audio output function, this is optional + # create the audio player and audio track + # both take a device_id parameter, which is the index of the device to use, if None the default device is used audio_player = SKAudioPlayer() + audio_track = SKAudioTrack() + # create the realtime client and optionally add the audio output function, this is optional # you can define the protocol to use, either "websocket" or "webrtc" # they will behave the same way, even though the underlying protocol is quite different - realtime_client = OpenAIRealtime(protocol="webrtc", audio_output_callback=audio_player.client_callback) + realtime_client = OpenAIRealtime( + protocol="webrtc", + audio_output_callback=audio_player.client_callback, + audio_track=audio_track, + ) # Create the settings for the session # The realtime api, does not use a system message, but takes instructions as a parameter for a session @@ -112,18 +114,9 @@ async def main() -> None: await realtime_client.update_session( settings=settings, chat_history=chat_history, kernel=kernel, create_response=True ) - # you can also send other events to the service, like this (the first has content, the second does not) - # await realtime_client.send( - # SendEvents.CONVERSATION_ITEM_CREATE, - # item=ChatMessageContent(role="user", content="Hi there, who are you?")}, - # ) - # await realtime_client.send(SendEvents.RESPONSE_CREATE) 
print("Mosscap (transcript): ", end="") - async for event in realtime_client.start_streaming(): + async for event in realtime_client.receive(): match event.event_type: - # case "audio": - # if play_audio and audio_player: - # await audio_player.add_audio(event.audio) case "text": if print_transcript: print(event.text.text, end="") @@ -135,8 +128,6 @@ async def main() -> None: print("") case ListenEvents.ERROR: logger.error(event.event) - # case ....: - # # add other event handling here if __name__ == "__main__": diff --git a/python/samples/concepts/audio/utils.py b/python/samples/concepts/audio/utils.py new file mode 100644 index 000000000000..fda9ecb7d772 --- /dev/null +++ b/python/samples/concepts/audio/utils.py @@ -0,0 +1,11 @@ +# Copyright (c) Microsoft. All rights reserved. + +import logging + +import sounddevice as sd + +logger = logging.getLogger(__name__) + + +def check_audio_devices(): + logger.debug(sd.query_devices()) diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py index 1a7c5acc330d..af0d1bd8b8bd 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft. All rights reserved. 
from collections.abc import Callable, Coroutine, Mapping -from typing import Any, ClassVar, Literal, TypeVar +from typing import TYPE_CHECKING, Any, ClassVar, Literal, TypeVar from numpy import ndarray from openai import AsyncOpenAI @@ -17,6 +17,9 @@ from semantic_kernel.connectors.ai.open_ai.settings.open_ai_settings import OpenAISettings from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError +if TYPE_CHECKING: + from aiortc.mediastreams import MediaStreamTrack + _T = TypeVar("_T", bound="OpenAIRealtime") @@ -33,6 +36,7 @@ def __init__( self, protocol: Literal["websocket", "webrtc"] = "websocket", audio_output_callback: Callable[[ndarray], Coroutine[Any, Any, None]] | None = None, + audio_track: "MediaStreamTrack | None" = None, ai_model_id: str | None = None, api_key: str | None = None, org_id: str | None = None, @@ -54,6 +58,9 @@ def __init__( It is called first in both websockets and webrtc. Even when passed, the audio content will still be added to the receiving queue. + audio_track: The audio track to use for the service, only used by WebRTC. + A default is supplied if not provided. + It can be any class that implements the AudioStreamTrack interface. ai_model_id (str | None): OpenAI model name, see https://platform.openai.com/docs/models service_id (str | None): Service ID tied to the execution settings. 
@@ -81,6 +88,7 @@ def __init__( raise ServiceInitializationError("Failed to create OpenAI settings.", ex) from ex if not openai_settings.realtime_model_id: raise ServiceInitializationError("The OpenAI text model ID is required.") + kwargs = {"audio_track": audio_track} if protocol == "webrtc" and audio_track else {} super().__init__( protocol=protocol, audio_output_callback=audio_output_callback, @@ -91,6 +99,7 @@ def __init__( ai_model_type=OpenAIModelTypes.REALTIME, default_headers=default_headers, client=async_client, + **kwargs, ) diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py index 5f2b49020fb6..f7344b5262ee 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py @@ -1,28 +1,22 @@ # Copyright (c) Microsoft. All rights reserved. 
+import base64 +import json import logging import sys +from abc import abstractmethod from collections.abc import AsyncGenerator, Callable, Coroutine from typing import TYPE_CHECKING, Any, ClassVar, Literal -from semantic_kernel.contents.function_result_content import FunctionResultContent -from semantic_kernel.contents.realtime_event import ( - FunctionCallEvent, - FunctionResultEvent, - RealtimeEvent, - ServiceEvent, - TextEvent, -) -from semantic_kernel.contents.streaming_text_content import StreamingTextContent - if sys.version_info >= (3, 12): from typing import override # pragma: no cover else: from typing_extensions import override # pragma: no cover from numpy import ndarray -from openai.types.beta.realtime.realtime_server_event import RealtimeServerEvent -from openai.types.beta.realtime.response_function_call_arguments_done_event import ( +from openai.types.beta.realtime import ( + RealtimeClientEvent, + RealtimeServerEvent, ResponseFunctionCallArgumentsDoneEvent, ) from pydantic import PrivateAttr @@ -35,12 +29,24 @@ from semantic_kernel.connectors.ai.open_ai.services.open_ai_handler import OpenAIHandler from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents, SendEvents from semantic_kernel.connectors.ai.open_ai.services.realtime.utils import ( + _create_realtime_client_event, update_settings_from_function_call_configuration, ) from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase from semantic_kernel.contents.chat_history import ChatHistory +from semantic_kernel.contents.chat_message_content import ChatMessageContent +from semantic_kernel.contents.events.realtime_event import ( + FunctionCallEvent, + FunctionResultEvent, + RealtimeEvent, + ServiceEvent, + TextEvent, +) from semantic_kernel.contents.function_call_content import FunctionCallContent +from semantic_kernel.contents.function_result_content 
import FunctionResultContent +from semantic_kernel.contents.streaming_text_content import StreamingTextContent +from semantic_kernel.contents.text_content import TextContent from semantic_kernel.kernel import Kernel from semantic_kernel.utils.experimental_decorator import experimental_class @@ -61,7 +67,7 @@ class OpenAIRealtimeBase(OpenAIHandler, RealtimeClientBase): audio_output_callback: Callable[[ndarray], Coroutine[Any, Any, None]] | None = None kernel: Kernel | None = None - _current_settings: PromptExecutionSettings | None = PrivateAttr(None) + _current_settings: PromptExecutionSettings | None = PrivateAttr(default=None) _call_id_to_function_map: dict[str, str] = PrivateAttr(default_factory=dict) async def _parse_event(self, event: RealtimeServerEvent) -> AsyncGenerator[RealtimeEvent, None]: @@ -132,7 +138,11 @@ async def update_session( kernel=self.kernel, # type: ignore ) await self.send( - ServiceEvent(event_type="service", service_type=SendEvents.SESSION_UPDATE, event=self._current_settings) + ServiceEvent( + event_type="service", + service_type=SendEvents.SESSION_UPDATE, + event={"settings": self._current_settings}, + ) ) if chat_history and len(chat_history) > 0: for msg in chat_history.messages: @@ -198,22 +208,185 @@ async def _parse_function_call_arguments_done( # This allows a user to have a full conversation in his code yield FunctionResultEvent(event_type="function_result", function_result=created_output) - @override - async def start_listening( - self, settings: PromptExecutionSettings | None = None, chat_history: ChatHistory | None = None, **kwargs: Any - ) -> None: - pass - - @override - async def start_sending(self, **kwargs: Any) -> None: - pass + @abstractmethod + async def _send(self, event: RealtimeClientEvent) -> None: + """Send an event to the service.""" + raise NotImplementedError @override - async def create_session( - self, settings: PromptExecutionSettings | None = None, chat_history: ChatHistory | None = None, **kwargs: Any - ) -> 
None: - pass + async def send(self, event: RealtimeEvent, **kwargs: Any) -> None: + match event.event_type: + case "audio": + if isinstance(event.audio.data, ndarray): + audio_data = base64.b64encode(event.audio.data.tobytes()).decode("utf-8") + else: + audio_data = event.audio.data.decode("utf-8") + await self._send( + _create_realtime_client_event( + event_type=SendEvents.INPUT_AUDIO_BUFFER_APPEND, + audio=audio_data, + ) + ) + case "text": + await self._send( + _create_realtime_client_event( + event_type=SendEvents.CONVERSATION_ITEM_CREATE, + **dict( + type="message", + content=[ + { + "type": "input_text", + "text": event.text.text, + } + ], + role="user", + ), + ) + ) + case "function_call": + await self._send( + _create_realtime_client_event( + event_type=SendEvents.CONVERSATION_ITEM_CREATE, + **dict( + type="function_call", + name=event.function_call.name or event.function_call.function_name, + arguments="" + if not event.function_call.arguments + else event.function_call.arguments + if isinstance(event.function_call.arguments, str) + else json.dumps(event.function_call.arguments), + call_id=event.function_call.metadata.get("call_id"), + ), + ) + ) + case "function_result": + await self._send( + _create_realtime_client_event( + event_type=SendEvents.CONVERSATION_ITEM_CREATE, + **dict( + type="function_call_output", + output=event.function_result.result, + call_id=event.function_result.metadata.get("call_id"), + ), + ) + ) + case "service": + data = event.event + match event.service_type: + case SendEvents.SESSION_UPDATE: + if not data: + logger.error("Event data is empty") + return + settings = data.get("settings", None) + if not settings or not isinstance(settings, PromptExecutionSettings): + logger.error("Event data does not contain 'settings'") + return + if not settings.ai_model_id: + settings.ai_model_id = self.ai_model_id + await self._send( + _create_realtime_client_event( + event_type=event.service_type, + **settings.prepare_settings_dict(), + ) + ) 
+ case SendEvents.INPUT_AUDIO_BUFFER_APPEND: + if not data or "audio" not in data: + logger.error("Event data does not contain 'audio'") + return + await self._send( + _create_realtime_client_event( + event_type=event.service_type, + audio=data["audio"], + ) + ) + case SendEvents.INPUT_AUDIO_BUFFER_COMMIT: + await self._send(_create_realtime_client_event(event_type=event.service_type)) + case SendEvents.INPUT_AUDIO_BUFFER_CLEAR: + await self._send(_create_realtime_client_event(event_type=event.service_type)) + case SendEvents.CONVERSATION_ITEM_CREATE: + if not data or "item" not in data: + logger.error("Event data does not contain 'item'") + return + content = data["item"] + contents = content.items if isinstance(content, ChatMessageContent) else [content] + for item in contents: + match item: + case TextContent(): + await self._send( + _create_realtime_client_event( + event_type=event.service_type, + **dict( + type="message", + content=[ + { + "type": "input_text", + "text": item.text, + } + ], + role="user", + ), + ) + ) + case FunctionCallContent(): + await self._send( + _create_realtime_client_event( + event_type=event.service_type, + **dict( + type="function_call", + name=item.name or item.function_name, + arguments="" + if not item.arguments + else item.arguments + if isinstance(item.arguments, str) + else json.dumps(item.arguments), + call_id=item.metadata.get("call_id"), + ), + ) + ) - @override - async def close_session(self) -> None: - pass + case FunctionResultContent(): + await self._send( + _create_realtime_client_event( + event_type=event.service_type, + **dict( + type="function_call_output", + output=item.result, + call_id=item.metadata.get("call_id"), + ), + ) + ) + case SendEvents.CONVERSATION_ITEM_TRUNCATE: + if not data or "item_id" not in data: + logger.error("Event data does not contain 'item_id'") + return + await self._send( + _create_realtime_client_event( + event_type=event.service_type, + item_id=data["item_id"], + content_index=0, + 
audio_end_ms=data.get("audio_end_ms", 0), + ) + ) + case SendEvents.CONVERSATION_ITEM_DELETE: + if not data or "item_id" not in data: + logger.error("Event data does not contain 'item_id'") + return + await self._send( + _create_realtime_client_event( + event_type=event.service_type, + item_id=data["item_id"], + ) + ) + case SendEvents.RESPONSE_CREATE: + await self._send( + _create_realtime_client_event( + event_type=event.service_type, event_id=data.get("event_id", None) if data else None + ) + ) + case SendEvents.RESPONSE_CANCEL: + await self._send( + _create_realtime_client_event( + event_type=event.service_type, + response_id=data.get("response_id", None) if data else None, + ) + ) diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py index 1225c2927345..11d8676f45b6 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py @@ -8,8 +8,6 @@ from collections.abc import AsyncGenerator from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast -from pydantic import Field - if sys.version_info >= (3, 12): from typing import override # pragma: no cover else: @@ -17,6 +15,7 @@ from aiohttp import ClientSession from aiortc import ( + MediaStreamTrack, RTCConfiguration, RTCDataChannel, RTCIceServer, @@ -25,22 +24,19 @@ ) from av.audio.frame import AudioFrame from openai._models import construct_type_unchecked -from openai.types.beta.realtime.conversation_item_param import ConversationItemParam +from openai.types.beta.realtime.realtime_client_event import RealtimeClientEvent from openai.types.beta.realtime.realtime_server_event import RealtimeServerEvent +from pydantic import Field, PrivateAttr -from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents, 
SendEvents +from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_base import OpenAIRealtimeBase from semantic_kernel.connectors.ai.realtime_client_base import RealtimeEvent +from semantic_kernel.connectors.ai.utils.realtime_helpers import SKAudioTrack from semantic_kernel.contents.audio_content import AudioContent -from semantic_kernel.contents.function_call_content import FunctionCallContent -from semantic_kernel.contents.function_result_content import FunctionResultContent -from semantic_kernel.contents.realtime_event import AudioEvent -from semantic_kernel.contents.text_content import TextContent +from semantic_kernel.contents.events.realtime_event import AudioEvent from semantic_kernel.utils.experimental_decorator import experimental_class if TYPE_CHECKING: - from aiortc import MediaStreamTrack - from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings from semantic_kernel.contents.chat_history import ChatHistory @@ -55,137 +51,50 @@ class OpenAIRealtimeWebRTCBase(OpenAIRealtimeBase): protocol: ClassVar[Literal["webrtc"]] = "webrtc" peer_connection: RTCPeerConnection | None = None data_channel: RTCDataChannel | None = None - receive_buffer: asyncio.Queue[RealtimeEvent] = Field(default_factory=asyncio.Queue) - - # region public methods + audio_track: MediaStreamTrack = Field(default_factory=SKAudioTrack) + _receive_buffer: asyncio.Queue[RealtimeEvent] = PrivateAttr(default_factory=asyncio.Queue) @override - async def start_listening( + async def receive( self, - settings: "PromptExecutionSettings | None" = None, - chat_history: "ChatHistory | None" = None, - create_response: bool = False, **kwargs: Any, ) -> AsyncGenerator[RealtimeEvent, None]: - if chat_history or settings or create_response: - await self.update_session(settings=settings, chat_history=chat_history, create_response=create_response) while True: - event 
= await self.receive_buffer.get() + event = await self._receive_buffer.get() yield event - @override - async def start_sending(self, **kwargs: Any) -> None: + async def _send(self, event: RealtimeClientEvent) -> None: if not self.data_channel: logger.error("Data channel not initialized") return while self.data_channel.readyState != "open": await asyncio.sleep(0.1) - while True: - event = await self.send_buffer.get() - response: dict[str, Any] = {"type": event.event_type} - match event.event_type: - case SendEvents.SESSION_UPDATE: - if "settings" not in event.data: - logger.error("Event data does not contain 'settings'") - response["session"] = event.data["settings"].prepare_settings_dict() - case SendEvents.CONVERSATION_ITEM_CREATE: - if "item" not in event.data: - logger.error("Event data does not contain 'item'") - return - content = event.data["item"] - for item in content.items: - match item: - case TextContent(): - response["item"] = ConversationItemParam( - type="message", - content=[ - { - "type": "input_text", - "text": item.text, - } - ], - role="user", - ) - - case FunctionCallContent(): - call_id = item.metadata.get("call_id") - if not call_id: - logger.error("Function call needs to have a call_id") - continue - response["item"] = ConversationItemParam( - type="function_call", - name=item.name or item.function_name, - arguments="" - if not item.arguments - else item.arguments - if isinstance(item.arguments, str) - else json.dumps(item.arguments), - call_id=call_id, - ) - - case FunctionResultContent(): - call_id = item.metadata.get("call_id") - if not call_id: - logger.error("Function result needs to have a call_id") - continue - response["item"] = ConversationItemParam( - type="function_call_output", - output=item.result, - call_id=call_id, - ) - - case SendEvents.CONVERSATION_ITEM_TRUNCATE: - if "item_id" not in event.data: - logger.error("Event data does not contain 'item_id'") - return - response["item_id"] = event.data["item_id"] - 
response["content_index"] = 0 - response["audio_end_ms"] = event.data.get("audio_end_ms", 0) - - case SendEvents.CONVERSATION_ITEM_DELETE: - if "item_id" not in event.data: - logger.error("Event data does not contain 'item_id'") - return - response["item_id"] = event.data["item_id"] - case SendEvents.RESPONSE_CREATE: - if "response" in event.data: - response["response"] = event.data["response"] - case SendEvents.RESPONSE_CANCEL: - if "response_id" in event.data: - response["response_id"] = event.data["response_id"] - - try: - self.data_channel.send(json.dumps(response)) - except Exception as e: - logger.error(f"Failed to send event {event} with error: {e!s}") + try: + self.data_channel.send(event.model_dump_json(exclude_none=True)) + except Exception as e: + logger.error(f"Failed to send event {event} with error: {e!s}") @override async def create_session( self, settings: "PromptExecutionSettings | None" = None, chat_history: "ChatHistory | None" = None, - audio_track: "MediaStreamTrack | None" = None, **kwargs: Any, ) -> None: """Create a session in the service.""" - if not audio_track: - from semantic_kernel.connectors.ai.utils.realtime_helpers import SKAudioTrack - - audio_track = SKAudioTrack() - self.peer_connection = RTCPeerConnection( configuration=RTCConfiguration(iceServers=[RTCIceServer(urls="stun:stun.l.google.com:19302")]) ) # track is the audio track being returned from the service - self.peer_connection.on("track")(self._on_track) + self.peer_connection.add_listener("track", self._on_track) # data channel is used to send and receive messages self.data_channel = self.peer_connection.createDataChannel("oai-events", protocol="json") - self.data_channel.on("message")(self._on_data) + self.data_channel.add_listener("message", self._on_data) # this is the incoming audio, which sends audio to the service - self.peer_connection.addTransceiver(audio_track) + self.peer_connection.addTransceiver(self.audio_track) offer = await self.peer_connection.createOffer() 
await self.peer_connection.setLocalDescription(offer) @@ -230,8 +139,6 @@ async def close_session(self) -> None: self.data_channel.close() self.data_channel = None - # region implementation specifics - async def _on_track(self, track: "MediaStreamTrack") -> None: logger.info(f"Received {track.kind} track from remote") if track.kind != "audio": @@ -252,7 +159,7 @@ async def _on_track(self, track: "MediaStreamTrack") -> None: except Exception as e: logger.error(f"Error playing remote audio frame: {e!s}") try: - await self.receive_buffer.put( + await self._receive_buffer.put( AudioEvent( event_type="audio", service_type=ListenEvents.RESPONSE_AUDIO_DELTA, @@ -278,7 +185,7 @@ async def _on_data(self, data: str) -> None: logger.error(f"Failed to parse event {data} with error: {e!s}") return async for parsed_event in self._parse_event(event): - await self.receive_buffer.put(parsed_event) + await self._receive_buffer.put(parsed_event) async def _get_ephemeral_token(self) -> str: """Get an ephemeral token from OpenAI.""" diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py index 4f32067ba3cc..db2d0cfea51d 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py @@ -2,30 +2,26 @@ import asyncio import base64 -import json import logging import sys from collections.abc import AsyncGenerator from typing import TYPE_CHECKING, Any, ClassVar, Literal -import numpy as np - if sys.version_info >= (3, 12): from typing import override # pragma: no cover else: from typing_extensions import override # pragma: no cover +import numpy as np from openai.resources.beta.realtime.realtime import AsyncRealtimeConnection -from openai.types.beta.realtime.conversation_item_param import ConversationItemParam 
+from openai.types.beta.realtime.realtime_client_event import RealtimeClientEvent from pydantic import Field -from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents, SendEvents +from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_base import OpenAIRealtimeBase from semantic_kernel.contents.audio_content import AudioContent -from semantic_kernel.contents.function_call_content import FunctionCallContent -from semantic_kernel.contents.function_result_content import FunctionResultContent +from semantic_kernel.contents.events.realtime_event import RealtimeEvent from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent -from semantic_kernel.contents.text_content import TextContent from semantic_kernel.contents.utils.author_role import AuthorRole from semantic_kernel.utils.experimental_decorator import experimental_class @@ -35,8 +31,6 @@ logger: logging.Logger = logging.getLogger(__name__) -# region Websocket - @experimental_class class OpenAIRealtimeWebsocketBase(OpenAIRealtimeBase): @@ -47,20 +41,14 @@ class OpenAIRealtimeWebsocketBase(OpenAIRealtimeBase): connected: asyncio.Event = Field(default_factory=asyncio.Event) @override - async def start_listening( + async def receive( self, - settings: "PromptExecutionSettings | None" = None, - chat_history: "ChatHistory | None" = None, - create_response: bool = False, **kwargs: Any, - ) -> AsyncGenerator[tuple[str, Any], None]: + ) -> AsyncGenerator[RealtimeEvent, None]: await self.connected.wait() if not self.connection: raise ValueError("Connection is not established.") - if chat_history or settings or create_response: - await self.update_session(settings=settings, chat_history=chat_history, create_response=create_response) - async for event in self.connection: if event.type == ListenEvents.RESPONSE_AUDIO_DELTA.value: if 
self.audio_output_callback: @@ -86,98 +74,14 @@ async def start_listening( async for event in self._parse_event(event): yield event - @override - async def start_sending(self, **kwargs: Any) -> None: + async def _send(self, event: RealtimeClientEvent) -> None: await self.connected.wait() if not self.connection: raise ValueError("Connection is not established.") - while True: - event = await self.send_buffer.get() - match event.event_type: - case SendEvents.SESSION_UPDATE: - if "settings" not in event.data: - logger.error("Event data does not contain 'settings'") - await self.connection.session.update(session=event.data["settings"].prepare_settings_dict()) - case SendEvents.INPUT_AUDIO_BUFFER_APPEND: - if "content" not in event.data: - logger.error("Event data does not contain 'content'") - return - await self.connection.input_audio_buffer.append(audio=event.data["content"].data.decode("utf-8")) - case SendEvents.INPUT_AUDIO_BUFFER_COMMIT: - await self.connection.input_audio_buffer.commit() - case SendEvents.INPUT_AUDIO_BUFFER_CLEAR: - await self.connection.input_audio_buffer.clear() - case SendEvents.CONVERSATION_ITEM_CREATE: - if "item" not in event.data: - logger.error("Event data does not contain 'item'") - return - content = event.data["item"] - for item in content.items: - match item: - case TextContent(): - await self.connection.conversation.item.create( - item=ConversationItemParam( - type="message", - content=[ - { - "type": "input_text", - "text": item.text, - } - ], - role="user", - ) - ) - case FunctionCallContent(): - call_id = item.metadata.get("call_id") - if not call_id: - logger.error("Function call needs to have a call_id") - continue - await self.connection.conversation.item.create( - item=ConversationItemParam( - type="function_call", - name=item.name or item.function_name, - arguments="" - if not item.arguments - else item.arguments - if isinstance(item.arguments, str) - else json.dumps(item.arguments), - call_id=call_id, - ) - ) - case 
FunctionResultContent(): - call_id = item.metadata.get("call_id") - if not call_id: - logger.error("Function result needs to have a call_id") - continue - await self.connection.conversation.item.create( - item=ConversationItemParam( - type="function_call_output", - output=item.result, - call_id=call_id, - ) - ) - case SendEvents.CONVERSATION_ITEM_TRUNCATE: - if "item_id" not in event.data: - logger.error("Event data does not contain 'item_id'") - return - await self.connection.conversation.item.truncate( - item_id=event.data["item_id"], content_index=0, audio_end_ms=event.data.get("audio_end_ms", 0) - ) - case SendEvents.CONVERSATION_ITEM_DELETE: - if "item_id" not in event.data: - logger.error("Event data does not contain 'item_id'") - return - await self.connection.conversation.item.delete(item_id=event.data["item_id"]) - case SendEvents.RESPONSE_CREATE: - if "response" in event.data: - await self.connection.response.create(response=event.data["response"]) - else: - await self.connection.response.create() - case SendEvents.RESPONSE_CANCEL: - if "response_id" in event.data: - await self.connection.response.cancel(response_id=event.data["response_id"]) - else: - await self.connection.response.cancel() + try: + await self.connection.send(event) + except Exception as e: + logger.error(f"Error sending response: {e!s}") @override async def create_session( diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/utils.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/utils.py index ada8d42924c0..9aa061e44bc5 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/utils.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/utils.py @@ -2,6 +2,24 @@ from typing import TYPE_CHECKING, Any +from openai.types.beta.realtime import ( + ConversationItem, + ConversationItemCreateEvent, + ConversationItemDeleteEvent, + ConversationItemTruncateEvent, + InputAudioBufferAppendEvent, + 
InputAudioBufferClearEvent, + InputAudioBufferCommitEvent, + RealtimeClientEvent, + ResponseCancelEvent, + ResponseCreateEvent, + SessionUpdateEvent, +) +from openai.types.beta.realtime.response_create_event import Response +from openai.types.beta.realtime.session_update_event import Session + +from semantic_kernel.connectors.ai.open_ai.services.realtime.const import SendEvents + if TYPE_CHECKING: from semantic_kernel.connectors.ai.function_choice_behavior import ( FunctionCallChoiceConfiguration, @@ -45,3 +63,65 @@ def kernel_function_metadata_to_function_call_format( "required": [p.name for p in metadata.parameters if p.is_required and p.include_in_function_choices], }, } + + +def _create_realtime_client_event(event_type: SendEvents, **kwargs: Any) -> RealtimeClientEvent: + match event_type: + case SendEvents.SESSION_UPDATE: + event_kwargs = {"event_id": kwargs.pop("event_id")} if "event_id" in kwargs else {} + return SessionUpdateEvent( + type=event_type, + session=Session.model_validate(kwargs), + **event_kwargs, + ) + case SendEvents.INPUT_AUDIO_BUFFER_APPEND: + return InputAudioBufferAppendEvent( + type=event_type, + **kwargs, + ) + case SendEvents.INPUT_AUDIO_BUFFER_COMMIT: + return InputAudioBufferCommitEvent( + type=event_type, + **kwargs, + ) + case SendEvents.INPUT_AUDIO_BUFFER_CLEAR: + return InputAudioBufferClearEvent( + type=event_type, + **kwargs, + ) + case SendEvents.CONVERSATION_ITEM_CREATE: + if "event_id" in kwargs: + event_id = kwargs.pop("event_id") + if "previous_item_id" in kwargs: + previous_item_id = kwargs.pop("previous_item_id") + event_kwargs = {"event_id": event_id} if "event_id" in kwargs else {} + event_kwargs.update({"previous_item_id": previous_item_id} if "previous_item_id" in kwargs else {}) + return ConversationItemCreateEvent( + type=event_type, + item=ConversationItem.model_validate(kwargs), + **event_kwargs, + ) + case SendEvents.CONVERSATION_ITEM_TRUNCATE: + return ConversationItemTruncateEvent( + type=event_type, + 
**kwargs, + ) + case SendEvents.CONVERSATION_ITEM_DELETE: + return ConversationItemDeleteEvent( + type=event_type, + **kwargs, + ) + case SendEvents.RESPONSE_CREATE: + event_kwargs = {"event_id": kwargs.pop("event_id")} if "event_id" in kwargs else {} + return ResponseCreateEvent( + type=event_type, + response=Response.model_validate(kwargs), + **event_kwargs, + ) + case SendEvents.RESPONSE_CANCEL: + return ResponseCancelEvent( + type=event_type, + **kwargs, + ) + case _: + raise ValueError(f"Unknown event type: {event_type}") diff --git a/python/semantic_kernel/connectors/ai/realtime_client_base.py b/python/semantic_kernel/connectors/ai/realtime_client_base.py index a6a332791293..0ad1fc13a089 100644 --- a/python/semantic_kernel/connectors/ai/realtime_client_base.py +++ b/python/semantic_kernel/connectors/ai/realtime_client_base.py @@ -2,20 +2,17 @@ import sys from abc import ABC, abstractmethod -from asyncio import Queue from collections.abc import AsyncGenerator, Callable from typing import TYPE_CHECKING, Any, ClassVar -from pydantic import Field - if sys.version_info >= (3, 11): - from asyncio import TaskGroup + from typing import Self # pragma: no cover else: - from taskgroup import TaskGroup + from typing_extensions import Self # pragma: no cover from semantic_kernel.connectors.ai.function_call_choice_configuration import FunctionCallChoiceConfiguration from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceType -from semantic_kernel.contents.realtime_event import RealtimeEvent +from semantic_kernel.contents.events.realtime_event import RealtimeEvent from semantic_kernel.services.ai_service_client_base import AIServiceClientBase from semantic_kernel.utils.experimental_decorator import experimental_class @@ -29,8 +26,8 @@ class RealtimeClientBase(AIServiceClientBase, ABC): """Base class for a realtime client.""" SUPPORTS_FUNCTION_CALLING: ClassVar[bool] = False - send_buffer: Queue[RealtimeEvent] = Field(default_factory=Queue) + 
@abstractmethod async def send(self, event: RealtimeEvent) -> None: """Send an event to the service. @@ -38,53 +35,20 @@ async def send(self, event: RealtimeEvent) -> None: event: The event to send. kwargs: Additional arguments. """ - await self.send_buffer.put(event) - - async def start_streaming( - self, - settings: "PromptExecutionSettings | None" = None, - chat_history: "ChatHistory | None" = None, - **kwargs: Any, - ) -> AsyncGenerator[RealtimeEvent, None]: - """Start streaming, will start both listening and sending. - - This method, start tasks for both listening and sending. - - The arguments are passed to the start_listening method. - - Args: - settings: Prompt execution settings. - chat_history: Chat history. - kwargs: Additional arguments. - """ - await self.update_session(settings=settings, chat_history=chat_history, **kwargs) - async with TaskGroup() as tg: - tg.create_task(self.start_sending(**kwargs)) - yield from tg.create_task(self.start_listening()) + raise NotImplementedError @abstractmethod - async def start_listening( + def receive( self, - settings: "PromptExecutionSettings | None" = None, - chat_history: "ChatHistory | None" = None, **kwargs: Any, ) -> AsyncGenerator[RealtimeEvent, None]: """Starts listening for messages from the service, generates events. Args: - settings: Prompt execution settings. - chat_history: Chat history. kwargs: Additional arguments. """ raise NotImplementedError - @abstractmethod - async def start_sending( - self, - ) -> None: - """Start sending items from the input_buffer to the service.""" - raise NotImplementedError - @abstractmethod async def create_session( self, @@ -134,7 +98,7 @@ def _update_function_choice_settings_callback( """ return lambda configuration, settings, choice_type: None - async def __aenter__(self) -> "RealtimeClientBase": + async def __aenter__(self) -> "Self": """Enter the context manager. Default implementation calls the create session method. 
diff --git a/python/semantic_kernel/connectors/ai/utils/realtime_helpers.py b/python/semantic_kernel/connectors/ai/utils/realtime_helpers.py index dd6d0e5fe16f..ed2ef294c716 100644 --- a/python/semantic_kernel/connectors/ai/utils/realtime_helpers.py +++ b/python/semantic_kernel/connectors/ai/utils/realtime_helpers.py @@ -2,14 +2,14 @@ import asyncio import logging -from typing import Any, Final +from typing import Any, ClassVar, Final import numpy as np import numpy.typing as npt from aiortc.mediastreams import MediaStreamError, MediaStreamTrack from av.audio.frame import AudioFrame from av.frame import Frame -from pydantic import Field, PrivateAttr +from pydantic import PrivateAttr from sounddevice import InputStream, OutputStream from semantic_kernel.contents.audio_content import AudioContent @@ -25,36 +25,54 @@ class SKAudioTrack(KernelBaseModel, MediaStreamTrack): - """A simple class that implements the WebRTC MediaStreamTrack for audio from sounddevice. + """A simple class that implements the WebRTC MediaStreamTrack for audio from sounddevice.""" - Make sure the device_id is set to the correct device for your system. - """ - - kind: str = "audio" + kind: ClassVar[str] = "audio" + device_id: str | int | None = None sample_rate: int = SAMPLE_RATE channels: int = TRACK_CHANNELS frame_duration: int = FRAME_DURATION dtype: npt.DTypeLike = DTYPE - device: str | int | None = None - queue: asyncio.Queue[Frame] = Field(default_factory=asyncio.Queue) - is_recording: bool = False frame_size: int = 0 + _queue: asyncio.Queue[Frame] = PrivateAttr(default_factory=asyncio.Queue) + _is_recording: bool = False _stream: InputStream | None = None _recording_task: asyncio.Task | None = None _loop: asyncio.AbstractEventLoop | None = None - _pts: int = 0 # Add this to track the pts + _pts: int = 0 - def __init__(self, **kwargs: Any): - """Initialize the audio track. 
+ def __init__( + self, + *, + device_id: str | int | None = None, + sample_rate: int = SAMPLE_RATE, + channels: int = TRACK_CHANNELS, + frame_duration: int = FRAME_DURATION, + dtype: npt.DTypeLike = DTYPE, + ): + """A simple class that implements the WebRTC MediaStreamTrack for audio from sounddevice. + + Make sure the device_id is set to the correct device for your system. Args: + device_id: The device id to use for recording audio. + sample_rate: The sample rate for the audio. + channels: The number of channels for the audio. + frame_duration: The duration of each audio frame in milliseconds. + dtype: The data type for the audio. **kwargs: Additional keyword arguments. - """ - kwargs["frame_size"] = int( - kwargs.get("sample_rate", SAMPLE_RATE) * kwargs.get("frame_duration", FRAME_DURATION) / 1000 + args = { + "device_id": device_id, + "sample_rate": sample_rate, + "channels": channels, + "frame_duration": frame_duration, + "dtype": dtype, + } + args["frame_size"] = int( + args.get("sample_rate", SAMPLE_RATE) * args.get("frame_duration", FRAME_DURATION) / 1000 ) - super().__init__(**kwargs) + super().__init__(**args) MediaStreamTrack.__init__(self) async def recv(self) -> Frame: @@ -63,8 +81,8 @@ async def recv(self) -> Frame: self._recording_task = asyncio.create_task(self.start_recording()) try: - frame = await self.queue.get() - self.queue.task_done() + frame = await self._queue.get() + self._queue.task_done() return frame except Exception as e: logger.error(f"Error receiving audio frame: {e!s}") @@ -74,7 +92,7 @@ def _sounddevice_callback(self, indata: np.ndarray, frames: int, time: Any, stat if status: logger.warning(f"Audio input status: {status}") if self._loop and self._loop.is_running(): - asyncio.run_coroutine_threadsafe(self.queue.put(self._create_frame(indata)), self._loop) + asyncio.run_coroutine_threadsafe(self._queue.put(self._create_frame(indata)), self._loop) def _create_frame(self, indata: np.ndarray) -> Frame: audio_data = indata.copy() @@ 
-95,16 +113,16 @@ def _create_frame(self, indata: np.ndarray) -> Frame: async def start_recording(self): """Start recording audio from the input device.""" - if self.is_recording: + if self._is_recording: return - self.is_recording = True + self._is_recording = True self._loop = asyncio.get_running_loop() self._pts = 0 # Reset pts when starting recording try: self._stream = InputStream( - device=self.device, + device=self.device_id, channels=self.channels, samplerate=self.sample_rate, dtype=self.dtype, @@ -113,14 +131,14 @@ async def start_recording(self): ) self._stream.start() - while self.is_recording: + while self._is_recording: await asyncio.sleep(0.1) except Exception as e: logger.error(f"Error in audio recording: {e!s}") raise finally: - self.is_recording = False + self._is_recording = False class SKAudioPlayer(KernelBaseModel): @@ -128,17 +146,26 @@ class SKAudioPlayer(KernelBaseModel): Make sure the device_id is set to the correct device for your system. - The sample rate, channels and frame duration should be set to match the audio you - are receiving, the defaults are for WebRTC. + The sample rate, channels and frame duration + should be set to match the audio you + are receiving. + + Args: + device_id: The device id to use for playing audio. + sample_rate: The sample rate for the audio. + channels: The number of channels for the audio. + dtype: The data type for the audio. 
+ frame_duration: The duration of each audio frame in milliseconds + """ device_id: int | None = None sample_rate: int = SAMPLE_RATE - dtype: npt.DTypeLike = DTYPE channels: int = PLAYER_CHANNELS - frame_duration_ms: int = FRAME_DURATION - _queue: asyncio.Queue[np.ndarray] | None = None - _stream: OutputStream | None = PrivateAttr(None) + dtype: npt.DTypeLike = DTYPE + frame_duration: int = FRAME_DURATION + _queue: asyncio.Queue[np.ndarray] | None = PrivateAttr(default=None) + _stream: OutputStream | None = PrivateAttr(default=None) async def __aenter__(self): """Start the audio stream when entering a context.""" @@ -157,7 +184,7 @@ def start(self): samplerate=self.sample_rate, channels=self.channels, dtype=self.dtype, - blocksize=int(self.sample_rate * self.frame_duration_ms / 1000), + blocksize=int(self.sample_rate * self.frame_duration / 1000), device=self.device_id, ) if self._stream and self._queue: diff --git a/python/semantic_kernel/contents/events/__init__.py b/python/semantic_kernel/contents/events/__init__.py new file mode 100644 index 000000000000..7466a652364b --- /dev/null +++ b/python/semantic_kernel/contents/events/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) Microsoft. All rights reserved. 
+ +from semantic_kernel.contents.events.realtime_event import ( + AudioEvent, + FunctionCallEvent, + FunctionResultEvent, + RealtimeEvent, + ServiceEvent, + TextEvent, +) + +__all__ = [ + "AudioEvent", + "FunctionCallEvent", + "FunctionResultEvent", + "RealtimeEvent", + "ServiceEvent", + "TextEvent", +] diff --git a/python/semantic_kernel/contents/realtime_event.py b/python/semantic_kernel/contents/events/realtime_event.py similarity index 100% rename from python/semantic_kernel/contents/realtime_event.py rename to python/semantic_kernel/contents/events/realtime_event.py From f02e5d8cccab699869827024656fe1021fe69e24 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Thu, 30 Jan 2025 19:59:53 +0100 Subject: [PATCH 22/50] WIP --- .../audio/04-chat_with_realtime_api_simple.py | 2 +- .../05-chat_with_realtime_api_complex.py | 22 +++-- .../ai/open_ai/services/open_ai_realtime.py | 84 ++++++++++++++++--- .../realtime/open_ai_realtime_base.py | 49 ++++++----- .../realtime/open_ai_realtime_webrtc.py | 3 +- .../realtime/open_ai_realtime_websocket.py | 21 ++--- .../ai/open_ai/services/realtime/utils.py | 5 +- .../connectors/ai/utils/realtime_helpers.py | 44 +++++++--- .../contents/binary_content.py | 10 +++ .../contents/events/realtime_event.py | 10 +-- 10 files changed, 170 insertions(+), 80 deletions(-) diff --git a/python/samples/concepts/audio/04-chat_with_realtime_api_simple.py b/python/samples/concepts/audio/04-chat_with_realtime_api_simple.py index 5dda1dc6d308..06ee11807a81 100644 --- a/python/samples/concepts/audio/04-chat_with_realtime_api_simple.py +++ b/python/samples/concepts/audio/04-chat_with_realtime_api_simple.py @@ -43,7 +43,7 @@ async def main() -> None: # create the realtime client and optionally add the audio output function, this is optional # you can define the protocol to use, either "websocket" or "webrtc" # they will behave the same way, even though the underlying protocol is quite different - realtime_client = OpenAIRealtime(protocol="webrtc") + 
realtime_client = OpenAIRealtime("webrtc") # Create the settings for the session settings = OpenAIRealtimeExecutionSettings( instructions=""" diff --git a/python/samples/concepts/audio/05-chat_with_realtime_api_complex.py b/python/samples/concepts/audio/05-chat_with_realtime_api_complex.py index 77406af4f355..b567c0028178 100644 --- a/python/samples/concepts/audio/05-chat_with_realtime_api_complex.py +++ b/python/samples/concepts/audio/05-chat_with_realtime_api_complex.py @@ -52,35 +52,41 @@ def get_weather(location: str) -> str: """Get the weather for a location.""" weather_conditions = ("sunny", "hot", "cloudy", "raining", "freezing", "snowing") weather = weather_conditions[randint(0, len(weather_conditions) - 1)] # nosec - logger.info(f"Getting weather for {location}: {weather}") + logger.info(f"@ Getting weather for {location}: {weather}") return f"The weather in {location} is {weather}." @kernel_function def get_date_time() -> str: """Get the current date and time.""" - logger.info("Getting current datetime") + logger.info("@ Getting current datetime") return f"The current date and time is {datetime.now().isoformat()}." +@kernel_function +def goodbye(): + """When the user is done, say goodbye and then call this function.""" + logger.info("@ Goodbye has been called!") + raise KeyboardInterrupt + + async def main() -> None: print_transcript = True # create the Kernel and add a simple function for function calling. 
kernel = Kernel() - kernel.add_function(plugin_name="weather", function_name="get_weather", function=get_weather) - kernel.add_function(plugin_name="time", function_name="get_date_time", function=get_date_time) + kernel.add_functions(plugin_name="helpers", functions=[goodbye, get_weather, get_date_time]) # create the audio player and audio track # both take a device_id parameter, which is the index of the device to use, if None the default device is used - audio_player = SKAudioPlayer() + audio_player = SKAudioPlayer(sample_rate=24000, frame_duration=100, channels=1) audio_track = SKAudioTrack() # create the realtime client and optionally add the audio output function, this is optional # you can define the protocol to use, either "websocket" or "webrtc" # they will behave the same way, even though the underlying protocol is quite different realtime_client = OpenAIRealtime( - protocol="webrtc", + protocol="websocket", audio_output_callback=audio_player.client_callback, - audio_track=audio_track, + # audio_track=audio_track, ) # Create the settings for the session @@ -110,7 +116,7 @@ async def main() -> None: chat_history.add_assistant_message("I am Mosscap, a chat bot. 
I'm trying to figure out what people need.") # the context manager calls the create_session method on the client and start listening to the audio stream - async with realtime_client, audio_player: + async with realtime_client, audio_player, audio_track.stream_to_realtime_client(realtime_client): await realtime_client.update_session( settings=settings, chat_history=chat_history, kernel=kernel, create_response=True ) diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py index af0d1bd8b8bd..b9e809d4e396 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py @@ -1,6 +1,6 @@ # Copyright (c) Microsoft. All rights reserved. -from collections.abc import Callable, Coroutine, Mapping +from collections.abc import AsyncGenerator, Callable, Coroutine, Mapping from typing import TYPE_CHECKING, Any, ClassVar, Literal, TypeVar from numpy import ndarray @@ -15,26 +15,66 @@ OpenAIRealtimeWebsocketBase, ) from semantic_kernel.connectors.ai.open_ai.settings.open_ai_settings import OpenAISettings +from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings +from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase +from semantic_kernel.contents.chat_history import ChatHistory +from semantic_kernel.contents.events.realtime_event import RealtimeEvent from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError if TYPE_CHECKING: from aiortc.mediastreams import MediaStreamTrack + from semantic_kernel.connectors.ai import PromptExecutionSettings + from semantic_kernel.contents import ChatHistory + _T = TypeVar("_T", bound="OpenAIRealtime") -class OpenAIRealtime(OpenAIConfigBase, OpenAIRealtimeBase): +__all__ = ["OpenAIRealtime"] + + +class RealtimeClientStub(RealtimeClientBase): + """This class 
makes sure that IDE's don't complain about missing methods in the below superclass.""" + + async def send(self, event: Any) -> None: + pass + + async def create_session( + self, + settings: "PromptExecutionSettings | None" = None, + chat_history: "ChatHistory | None" = None, + **kwargs: Any, + ) -> None: + pass + + def receive(self, **kwargs: Any) -> AsyncGenerator[RealtimeEvent, None]: + pass + + async def update_session( + self, + settings: "PromptExecutionSettings | None" = None, + chat_history: "ChatHistory | None" = None, + **kwargs: Any, + ) -> None: + pass + + async def close_session(self) -> None: + pass + + +class OpenAIRealtime(OpenAIRealtimeBase, RealtimeClientStub): """OpenAI Realtime service.""" - def __new__(cls: type["_T"], *args: Any, **kwargs: Any) -> "_T": + def __new__(cls: type["_T"], protocol: str, *args: Any, **kwargs: Any) -> "_T": """Pick the right subclass, based on protocol.""" subclass_map = {subcl.protocol: subcl for subcl in cls.__subclasses__()} - subclass = subclass_map[kwargs.pop("protocol", "websocket")] + subclass = subclass_map[protocol] return super(OpenAIRealtime, subclass).__new__(subclass) def __init__( self, - protocol: Literal["websocket", "webrtc"] = "websocket", + protocol: Literal["websocket", "webrtc"], + *, audio_output_callback: Callable[[ndarray], Coroutine[Any, Any, None]] | None = None, audio_track: "MediaStreamTrack | None" = None, ai_model_id: str | None = None, @@ -42,7 +82,7 @@ def __init__( org_id: str | None = None, service_id: str | None = None, default_headers: Mapping[str, str] | None = None, - async_client: AsyncOpenAI | None = None, + client: AsyncOpenAI | None = None, env_file_path: str | None = None, env_file_encoding: str | None = None, **kwargs: Any, @@ -50,7 +90,7 @@ def __init__( """Initialize an OpenAIRealtime service. Args: - protocol: The protocol to use, can be either "websocket" or "webrtc". + protocol: The protocol to use, must be either "websocket" or "webrtc". 
audio_output_callback: The audio output callback, optional. This should be a coroutine, that takes a ndarray with audio as input. The goal of this function is to allow you to play the audio with the @@ -70,7 +110,7 @@ def __init__( the env vars or .env file value. default_headers: The default headers mapping of string keys to string values for HTTP requests. (Optional) - async_client (Optional[AsyncOpenAI]): An existing client to use. (Optional) + client (Optional[AsyncOpenAI]): An existing client to use. (Optional) env_file_path (str | None): Use the environment settings file as a fallback to environment variables. (Optional) env_file_encoding (str | None): The encoding of the environment settings file. (Optional) @@ -88,7 +128,6 @@ def __init__( raise ServiceInitializationError("Failed to create OpenAI settings.", ex) from ex if not openai_settings.realtime_model_id: raise ServiceInitializationError("The OpenAI text model ID is required.") - kwargs = {"audio_track": audio_track} if protocol == "webrtc" and audio_track else {} super().__init__( protocol=protocol, audio_output_callback=audio_output_callback, @@ -98,12 +137,12 @@ def __init__( org_id=openai_settings.org_id, ai_model_type=OpenAIModelTypes.REALTIME, default_headers=default_headers, - client=async_client, + client=client, **kwargs, ) -class OpenAIRealtimeWebRTC(OpenAIRealtime, OpenAIRealtimeWebRTCBase): +class OpenAIRealtimeWebRTC(OpenAIRealtime, OpenAIRealtimeWebRTCBase, OpenAIConfigBase): """OpenAI Realtime service using WebRTC protocol. This should not be used directly, use OpenAIRealtime instead. 
@@ -112,8 +151,19 @@ class OpenAIRealtimeWebRTC(OpenAIRealtime, OpenAIRealtimeWebRTCBase): protocol: ClassVar[Literal["webrtc"]] = "webrtc" + def __init__( + self, + *args: Any, + **kwargs: Any, + ) -> None: + """Initialize an OpenAIRealtime service using WebRTC protocol.""" + super().__init__( + *args, + **kwargs, + ) + -class OpenAIRealtimeWebSocket(OpenAIRealtime, OpenAIRealtimeWebsocketBase): +class OpenAIRealtimeWebSocket(OpenAIRealtime, OpenAIRealtimeWebsocketBase, OpenAIConfigBase): """OpenAI Realtime service using WebSocket protocol. This should not be used directly, use OpenAIRealtime instead. @@ -121,3 +171,13 @@ class OpenAIRealtimeWebSocket(OpenAIRealtime, OpenAIRealtimeWebsocketBase): """ protocol: ClassVar[Literal["websocket"]] = "websocket" + + def __init__( + self, + *args: Any, + **kwargs: Any, + ) -> None: + super().__init__( + *args, + **kwargs, + ) diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py index f7344b5262ee..0e94dd9c6854 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py @@ -1,10 +1,8 @@ # Copyright (c) Microsoft. All rights reserved. 
-import base64 import json import logging import sys -from abc import abstractmethod from collections.abc import AsyncGenerator, Callable, Coroutine from typing import TYPE_CHECKING, Any, ClassVar, Literal @@ -146,11 +144,24 @@ async def update_session( ) if chat_history and len(chat_history) > 0: for msg in chat_history.messages: - await self.send( - ServiceEvent(event_type="service", service_type=SendEvents.CONVERSATION_ITEM_CREATE, event=msg) - ) + for item in msg.items: + match item: + case TextContent(): + await self.send(TextEvent(service_type=SendEvents.CONVERSATION_ITEM_CREATE, text=item)) + case FunctionCallContent(): + await self.send( + FunctionCallEvent(service_type=SendEvents.CONVERSATION_ITEM_CREATE, function_call=item) + ) + case FunctionResultContent(): + await self.send( + FunctionResultEvent( + service_type=SendEvents.CONVERSATION_ITEM_CREATE, function_result=item + ) + ) + case _: + logger.error("Unsupported item type: %s", item) if create_response: - await self.send(ServiceEvent(event_type="service", service_type=SendEvents.RESPONSE_CREATE)) + await self.send(ServiceEvent(service_type=SendEvents.RESPONSE_CREATE)) @override def get_prompt_execution_settings_class(self) -> type["PromptExecutionSettings"]: @@ -191,24 +202,21 @@ async def _parse_function_call_arguments_done( index=event.output_index, metadata={"call_id": event.call_id}, ) - yield FunctionCallEvent( - event_type="function_call", - service_type=ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE, - function_call=item, - ) + yield FunctionCallEvent(service_type=ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE, function_call=item) chat_history = ChatHistory() await self.kernel.invoke_function_call(item, chat_history) created_output: FunctionResultContent = chat_history.messages[-1].items[0] # type: ignore # This returns the output to the service - await self.send( - ServiceEvent(event_type="service", service_type=SendEvents.CONVERSATION_ITEM_CREATE, event=created_output) + result = 
FunctionResultEvent( + service_type=SendEvents.CONVERSATION_ITEM_CREATE, + function_result=created_output, ) + await self.send(result) # The model doesn't start responding to the tool call automatically, so triggering it here. - await self.send(ServiceEvent(event_type="service", service_type=SendEvents.RESPONSE_CREATE)) + await self.send(ServiceEvent(service_type=SendEvents.RESPONSE_CREATE)) # This allows a user to have a full conversation in his code - yield FunctionResultEvent(event_type="function_result", function_result=created_output) + yield result - @abstractmethod async def _send(self, event: RealtimeClientEvent) -> None: """Send an event to the service.""" raise NotImplementedError @@ -217,14 +225,9 @@ async def _send(self, event: RealtimeClientEvent) -> None: async def send(self, event: RealtimeEvent, **kwargs: Any) -> None: match event.event_type: case "audio": - if isinstance(event.audio.data, ndarray): - audio_data = base64.b64encode(event.audio.data.tobytes()).decode("utf-8") - else: - audio_data = event.audio.data.decode("utf-8") await self._send( _create_realtime_client_event( - event_type=SendEvents.INPUT_AUDIO_BUFFER_APPEND, - audio=audio_data, + event_type=SendEvents.INPUT_AUDIO_BUFFER_APPEND, audio=event.audio.to_base64_bytestring() ) ) case "text": @@ -286,7 +289,7 @@ async def send(self, event: RealtimeEvent, **kwargs: Any) -> None: await self._send( _create_realtime_client_event( event_type=event.service_type, - **settings.prepare_settings_dict(), + session=settings.prepare_settings_dict(), ) ) case SendEvents.INPUT_AUDIO_BUFFER_APPEND: diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py index 11d8676f45b6..731ff423011b 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py +++ 
b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py @@ -161,9 +161,8 @@ async def _on_track(self, track: "MediaStreamTrack") -> None: try: await self._receive_buffer.put( AudioEvent( - event_type="audio", service_type=ListenEvents.RESPONSE_AUDIO_DELTA, - audio=AudioContent(data=frame.to_ndarray(), data_format="np.int16", inner_content=frame), # type: ignore + audio=AudioContent(data=frame.to_ndarray(), data_format="np.int16", inner_content=frame), ), ) except Exception as e: diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py index db2d0cfea51d..8adee40db02d 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py @@ -20,9 +20,7 @@ from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_base import OpenAIRealtimeBase from semantic_kernel.contents.audio_content import AudioContent -from semantic_kernel.contents.events.realtime_event import RealtimeEvent -from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent -from semantic_kernel.contents.utils.author_role import AuthorRole +from semantic_kernel.contents.events.realtime_event import AudioEvent, RealtimeEvent from semantic_kernel.utils.experimental_decorator import experimental_class if TYPE_CHECKING: @@ -54,18 +52,11 @@ async def receive( if self.audio_output_callback: await self.audio_output_callback(np.frombuffer(base64.b64decode(event.delta), dtype=np.int16)) try: - yield ( - event.type, - StreamingChatMessageContent( - role=AuthorRole.ASSISTANT, - items=[ - AudioContent( - data=base64.b64decode(event.delta), - data_format="base64", - 
inner_content=event, - ) - ], # type: ignore - choice_index=event.content_index, + yield AudioEvent( + audio=AudioContent( + data=base64.b64decode(event.delta), + data_format="base64", + inner_content=event, ), ) except Exception as e: diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/utils.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/utils.py index 9aa061e44bc5..a33531ca19c7 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/utils.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/utils.py @@ -68,11 +68,10 @@ def kernel_function_metadata_to_function_call_format( def _create_realtime_client_event(event_type: SendEvents, **kwargs: Any) -> RealtimeClientEvent: match event_type: case SendEvents.SESSION_UPDATE: - event_kwargs = {"event_id": kwargs.pop("event_id")} if "event_id" in kwargs else {} return SessionUpdateEvent( type=event_type, - session=Session.model_validate(kwargs), - **event_kwargs, + session=Session.model_validate(kwargs.pop("session")), + **kwargs, ) case SendEvents.INPUT_AUDIO_BUFFER_APPEND: return InputAudioBufferAppendEvent( diff --git a/python/semantic_kernel/connectors/ai/utils/realtime_helpers.py b/python/semantic_kernel/connectors/ai/utils/realtime_helpers.py index ed2ef294c716..33fd09ce7f66 100644 --- a/python/semantic_kernel/connectors/ai/utils/realtime_helpers.py +++ b/python/semantic_kernel/connectors/ai/utils/realtime_helpers.py @@ -2,6 +2,7 @@ import asyncio import logging +from contextlib import asynccontextmanager from typing import Any, ClassVar, Final import numpy as np @@ -12,7 +13,9 @@ from pydantic import PrivateAttr from sounddevice import InputStream, OutputStream +from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase from semantic_kernel.contents.audio_content import AudioContent +from semantic_kernel.contents.events.realtime_event import AudioEvent from semantic_kernel.kernel_pydantic import KernelBaseModel 
logger = logging.getLogger(__name__) @@ -28,7 +31,7 @@ class SKAudioTrack(KernelBaseModel, MediaStreamTrack): """A simple class that implements the WebRTC MediaStreamTrack for audio from sounddevice.""" kind: ClassVar[str] = "audio" - device_id: str | int | None = None + device: str | int | None = None sample_rate: int = SAMPLE_RATE channels: int = TRACK_CHANNELS frame_duration: int = FRAME_DURATION @@ -44,7 +47,7 @@ class SKAudioTrack(KernelBaseModel, MediaStreamTrack): def __init__( self, *, - device_id: str | int | None = None, + device: str | int | None = None, sample_rate: int = SAMPLE_RATE, channels: int = TRACK_CHANNELS, frame_duration: int = FRAME_DURATION, @@ -52,10 +55,10 @@ def __init__( ): """A simple class that implements the WebRTC MediaStreamTrack for audio from sounddevice. - Make sure the device_id is set to the correct device for your system. + Make sure the device is set to the correct device for your system. Args: - device_id: The device id to use for recording audio. + device: The device id to use for recording audio. sample_rate: The sample rate for the audio. channels: The number of channels for the audio. frame_duration: The duration of each audio frame in milliseconds. @@ -63,7 +66,7 @@ def __init__( **kwargs: Additional keyword arguments. 
""" args = { - "device_id": device_id, + "device": device, "sample_rate": sample_rate, "channels": channels, "frame_duration": frame_duration, @@ -88,6 +91,15 @@ async def recv(self) -> Frame: logger.error(f"Error receiving audio frame: {e!s}") raise MediaStreamError("Failed to receive audio frame") + @asynccontextmanager + async def stream_to_realtime_client(self, realtime_client: RealtimeClientBase): + """Stream audio data to a RealtimeClientBase.""" + while True: + frame = await self.recv() + await realtime_client.send(AudioEvent(audio=AudioContent(data=frame.to_ndarray(), data_format="np.int16"))) + yield + await asyncio.sleep(0.01) + def _sounddevice_callback(self, indata: np.ndarray, frames: int, time: Any, status: Any) -> None: if status: logger.warning(f"Audio input status: {status}") @@ -122,7 +134,7 @@ async def start_recording(self): try: self._stream = InputStream( - device=self.device_id, + device=self.device, channels=self.channels, samplerate=self.sample_rate, dtype=self.dtype, @@ -151,7 +163,7 @@ class SKAudioPlayer(KernelBaseModel): are receiving. Args: - device_id: The device id to use for playing audio. + device: The device id to use for playing audio. sample_rate: The sample rate for the audio. channels: The number of channels for the audio. dtype: The data type for the audio. 
@@ -159,7 +171,7 @@ class SKAudioPlayer(KernelBaseModel): """ - device_id: int | None = None + device: int | None = None sample_rate: int = SAMPLE_RATE channels: int = PLAYER_CHANNELS dtype: npt.DTypeLike = DTYPE @@ -185,7 +197,7 @@ def start(self): channels=self.channels, dtype=self.dtype, blocksize=int(self.sample_rate * self.frame_duration / 1000), - device=self.device_id, + device=self.device, ) if self._stream and self._queue: self._stream.start() @@ -205,8 +217,18 @@ def _sounddevice_callback(self, outdata, frames, time, status): if self._queue.empty(): return data: np.ndarray = self._queue.get_nowait() - outdata[:] = data.reshape(outdata.shape) - self._queue.task_done() + if data.size == frames: + outdata[:] = data.reshape(outdata.shape) + self._queue.task_done() + else: + if data.size > frames: + self._queue.put_nowait(data[frames:]) + outdata[:] = np.concatenate((np.empty(0, dtype=np.int16), data[:frames])).reshape(outdata.shape) + else: + outdata[:] = np.concatenate((data, np.zeros(frames - len(data), dtype=np.int16))).reshape( + outdata.shape + ) + self._queue.task_done() async def client_callback(self, content: np.ndarray): """This function can be passed to the audio_output_callback field of the RealtimeClientBase.""" diff --git a/python/semantic_kernel/contents/binary_content.py b/python/semantic_kernel/contents/binary_content.py index 85fbf4e38cb5..b2b47dc6e0ef 100644 --- a/python/semantic_kernel/contents/binary_content.py +++ b/python/semantic_kernel/contents/binary_content.py @@ -193,3 +193,13 @@ def write_to_file(self, path: str | FilePath) -> None: def to_dict(self) -> dict[str, Any]: """Convert the instance to a dictionary.""" return {"type": "binary", "binary": {"uri": str(self)}} + + def to_base64_bytestring(self, encoding: str = "utf-8") -> str: + """Convert the instance to a bytestring.""" + if self._data_uri and self._data_uri.data_array is not None: + return b64encode(self._data_uri.data_array.tobytes()).decode(encoding) + if self._data_uri 
and self._data_uri.data_bytes: + return self._data_uri.data_bytes.decode(encoding) + if self._data_uri and self._data_uri.data_str: + return self._data_uri.data_str + return "" diff --git a/python/semantic_kernel/contents/events/realtime_event.py b/python/semantic_kernel/contents/events/realtime_event.py index 7de87f078ff6..edb2c5917778 100644 --- a/python/semantic_kernel/contents/events/realtime_event.py +++ b/python/semantic_kernel/contents/events/realtime_event.py @@ -19,7 +19,7 @@ class ServiceEvent(KernelBaseModel): """Base class for all service events.""" - event_type: Literal["service"] + event_type: Literal["service"] = "service" service_type: str event: Any | None = None @@ -27,7 +27,7 @@ class ServiceEvent(KernelBaseModel): class AudioEvent(KernelBaseModel): """Audio event type.""" - event_type: Literal["audio"] + event_type: Literal["audio"] = "audio" service_type: str | None = None audio: AudioContent @@ -35,7 +35,7 @@ class AudioEvent(KernelBaseModel): class TextEvent(KernelBaseModel): """Text event type.""" - event_type: Literal["text"] + event_type: Literal["text"] = "text" service_type: str | None = None text: TextContent @@ -43,7 +43,7 @@ class TextEvent(KernelBaseModel): class FunctionCallEvent(KernelBaseModel): """Function call event type.""" - event_type: Literal["function_call"] + event_type: Literal["function_call"] = "function_call" service_type: str | None = None function_call: FunctionCallContent @@ -51,6 +51,6 @@ class FunctionCallEvent(KernelBaseModel): class FunctionResultEvent(KernelBaseModel): """Function result event type.""" - event_type: Literal["function_result"] + event_type: Literal["function_result"] = "function_result" service_type: str | None = None function_result: FunctionResultContent From 7434c70e9bd1535315f1d8bbcf8916fcdabc9630 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Fri, 31 Jan 2025 15:49:52 +0100 Subject: [PATCH 23/50] removed built-in audio players, split for websocket and rtc --- 
docs/decisions/00XX-realtime-api-clients.md | 346 ------------- python/samples/concepts/audio/utils.py | 11 - .../01-chat_with_realtime_webrtc.py} | 18 +- .../01-chat_with_realtime_websocket.py | 95 ++++ .../02-chat_with_function_calling.py} | 15 +- python/samples/concepts/realtime/utils.py | 470 ++++++++++++++++++ .../ai/open_ai/services/open_ai_realtime.py | 41 +- .../realtime/open_ai_realtime_base.py | 103 ++-- .../realtime/open_ai_realtime_webrtc.py | 11 +- .../realtime/open_ai_realtime_websocket.py | 10 +- .../ai/open_ai/services/realtime/utils.py | 2 +- .../connectors/ai/realtime_client_base.py | 5 +- .../connectors/ai/utils/__init__.py | 5 - .../connectors/ai/utils/realtime_helpers.py | 267 ---------- .../contents/events/realtime_event.py | 33 +- 15 files changed, 689 insertions(+), 743 deletions(-) delete mode 100644 docs/decisions/00XX-realtime-api-clients.md delete mode 100644 python/samples/concepts/audio/utils.py rename python/samples/concepts/{audio/04-chat_with_realtime_api_simple.py => realtime/01-chat_with_realtime_webrtc.py} (86%) create mode 100644 python/samples/concepts/realtime/01-chat_with_realtime_websocket.py rename python/samples/concepts/{audio/05-chat_with_realtime_api_complex.py => realtime/02-chat_with_function_calling.py} (92%) create mode 100644 python/samples/concepts/realtime/utils.py delete mode 100644 python/semantic_kernel/connectors/ai/utils/__init__.py delete mode 100644 python/semantic_kernel/connectors/ai/utils/realtime_helpers.py diff --git a/docs/decisions/00XX-realtime-api-clients.md b/docs/decisions/00XX-realtime-api-clients.md deleted file mode 100644 index bde864d79b52..000000000000 --- a/docs/decisions/00XX-realtime-api-clients.md +++ /dev/null @@ -1,346 +0,0 @@ ---- -# These are optional elements. Feel free to remove any of them. 
-status: {proposed } -contact: {Eduard van Valkenburg} -date: {2025-01-10} -deciders: { Eduard van Valkenburg, Mark Wallace, Ben Thomas, Roger Barreto} -consulted: -informed: ---- - -# Realtime API Clients - -## Context and Problem Statement - -Multiple model providers are starting to enable realtime voice-to-voice or even multi-model-to-voice communication with their models, this includes OpenAI with their [Realtime API](https://openai.com/index/introducing-the-realtime-api/) and [Google Gemini](https://ai.google.dev/api/multimodal-live). These API's promise some very interesting new ways of using LLM's in different settings, which we want to enable with Semantic Kernel. - The key feature that Semantic Kernel brings into this system is the ability to (re)use Semantic Kernel function as tools with these API's. There are also options for Google to use video and images as input, but for now we are focusing on the voice-to-voice part, while keeping in mind that video is coming. - -The protocols that these API's use at this time are Websockets and WebRTC. - -In both cases there are events being sent to and from the service, some events contain content, text, audio, or video (so far only sending, not receiving), while some events are "control" events, like content created, function call requested, etc. Sending events include, sending content, either voice, text or function call output, or events, like committing the input audio and requesting a response. - -### Websocket -Websocket has been around for a while and is a well known technology, it is a full-duplex communication protocol over a single, long-lived connection. It is used for sending and receiving messages between client and server in real-time. Each event can contain a message, which might contain a content item, or a control event. Audio is sent as a base64 encoded string. 
- -### WebRTC -WebRTC is a Mozilla project that provides web browsers and mobile applications with real-time communication via simple application programming interfaces (APIs). It allows audio and video communication to work inside web pages and other applications by allowing direct peer-to-peer communication, eliminating the need to install plugins or download native apps. It is used for sending and receiving audio and video streams, and can be used for sending (data-)messages as well. The big difference compared to websockets is that it explicitly create a channel for audio and video, and a separate channel for "data", which are events but in this space also things like Function calls. - -Both the OpenAI and Google realtime api's are in preview/beta, this means there might be breaking changes in the way they work coming in the future, therefore the clients built to support these API's are going to be experimental until the API's stabilize. - -One feature that we need to consider if and how to deal with is whether or not a service uses Voice Activated Detection, OpenAI supports turning that off and allows parameters for how it behaves, while Google has it on by default and it cannot be configured. 
- -### Event types (Websocket and partially WebRTC) - -#### Client side events: -| **Content/Control event** | **Event Description** | **OpenAI Event** | **Google Event** | -| ------------------------- | --------------------------------- | ---------------------------- | ---------------------------------- | -| Control | Configure session | `session.update` | `BidiGenerateContentSetup` | -| Content | Send voice input | `input_audio_buffer.append` | `BidiGenerateContentRealtimeInput` | -| Control | Commit input and request response | `input_audio_buffer.commit` | `-` | -| Control | Clean audio input buffer | `input_audio_buffer.clear` | `-` | -| Content | Send text input | `conversation.item.create` | `BidiGenerateContentClientContent` | -| Control | Interrupt audio | `conversation.item.truncate` | `-` | -| Control | Delete content | `conversation.item.delete` | `-` | -| Control | Respond to function call request | `conversation.item.create` | `BidiGenerateContentToolResponse` | -| Control | Ask for response | `response.create` | `-` | -| Control | Cancel response | `response.cancel` | `-` | - -#### Server side events: -| **Content/Control event** | **Event Description** | **OpenAI Event** | **Google Event** | -| ------------------------- | -------------------------------------- | ------------------------------------------------------- | ----------------------------------------- | -| Control | Error | `error` | `-` | -| Control | Session created | `session.created` | `BidiGenerateContentSetupComplete` | -| Control | Session updated | `session.updated` | `BidiGenerateContentSetupComplete` | -| Control | Conversation created | `conversation.created` | `-` | -| Control | Input audio buffer committed | `input_audio_buffer.committed` | `-` | -| Control | Input audio buffer cleared | `input_audio_buffer.cleared` | `-` | -| Control | Input audio buffer speech started | `input_audio_buffer.speech_started` | `-` | -| Control | Input audio buffer speech stopped | 
`input_audio_buffer.speech_stopped` | `-` | -| Content | Conversation item created | `conversation.item.created` | `-` | -| Content | Input audio transcription completed | `conversation.item.input_audio_transcription.completed` | | -| Content | Input audio transcription failed | `conversation.item.input_audio_transcription.failed` | | -| Control | Conversation item truncated | `conversation.item.truncated` | `-` | -| Control | Conversation item deleted | `conversation.item.deleted` | `-` | -| Control | Response created | `response.created` | `-` | -| Control | Response done | `response.done` | `-` | -| Content | Response output item added | `response.output_item.added` | `-` | -| Content | Response output item done | `response.output_item.done` | `-` | -| Content | Response content part added | `response.content_part.added` | `-` | -| Content | Response content part done | `response.content_part.done` | `-` | -| Content | Response text delta | `response.text.delta` | `BidiGenerateContentServerContent` | -| Content | Response text done | `response.text.done` | `-` | -| Content | Response audio transcript delta | `response.audio_transcript.delta` | `BidiGenerateContentServerContent` | -| Content | Response audio transcript done | `response.audio_transcript.done` | `-` | -| Content | Response audio delta | `response.audio.delta` | `BidiGenerateContentServerContent` | -| Content | Response audio done | `response.audio.done` | `-` | -| Content | Response function call arguments delta | `response.function_call_arguments.delta` | `BidiGenerateContentToolCall` | -| Content | Response function call arguments done | `response.function_call_arguments.done` | `-` | -| Control | Function call cancelled | `-` | `BidiGenerateContentToolCallCancellation` | -| Control | Rate limits updated | `rate_limits.updated` | `-` | - - -## Overall Decision Drivers -- Simple programming model that is likely able to handle future realtime api's and the evolution of the existing ones. 
-- Whenever possible we transform incoming content into Semantic Kernel content, but surface everything, so it's extensible -- Protocol agnostic, should be able to use different types of protocols under the covers, like websocket and WebRTC, without changing the client code (unless the protocol requires it), there will be slight differences in behavior depending on the protocol. - -There are multiple areas where we need to make decisions, these are: -- Content and Events -- Programming model -- Audio speaker/microphone handling - -# Content and Events - -## Considered Options - Content and Events -Both the sending and receiving side of these integrations need to decide how to deal with the events. - -1. Treat content events separate from control events -1. Treat everything as content items -1. Treat everything as events - -### 1. Treat content events separate from control events -This would mean there are two mechanisms in the clients, one deals with content, and one with control events. - -- Pro: - - strongly typed responses for known content - - easy to use as the main interactions are clear with familiar SK content types, the rest goes through a separate mechanism -- Con: - - new content support requires updates in the codebase and can be considered breaking (potentially sending additional types back) - - additional complexity in dealing with two streams of data - -### 2. Treat everything as content items -This would mean that all events are turned into Semantic Kernel content items, and would also mean that we need to define additional content types for the control events. - -- Pro: - - everything is a content item, so it's easy to deal with -- Con: - - overkill for simple control events - -### 3. Treat everything as events -This would mean that all events are retained and returned to the developer as is, without any transformation. 
- -- Pro: - - no transformation needed - - easy to maintain -- Con: - - nothing easing the burden on the developer, they need to deal with the raw events - - no way to easily switch between one provider and another - -## Decision Outcome - Content and Events - -Chosen option: 3 Treat Everything as Events - -This option was chosen to allow abstraction away from the raw events, while still allowing the developer to access the raw events if needed. -A set of events are defined, for basic types, like 'audio', 'text', 'function_call', 'function_result', it then has two other fields, service_event which is filled with the event type from the service and a field for the actual content, with a name that corresponds to the event type: - -```python -AudioEvent( - event_type="audio", - service_event= "response.audio.delta", - audio: AudioContent(...) -) -``` - -Next to these we will have a generic event, called ServiceEvent, this is the catch-all, which has event_type: "service", the service_event field filled with the event type from the service and a field called 'event' which contains the raw event from the service. - -```python -ServiceEvent( - event_type="service", - service_event= "conversation.item.create", - event: { ... } -) -``` - -This allows you to easily filter on the event_type, and then use the service_event to filter on the specific event type, and then use the content field to get the content, or the event field to get the raw event. - -Collectively these are known as *RealtimeEvents*, and are returned as an async generator from the client, so you can easily loop over them. And they are passed to the send method. - -Initially RealtimeEvents are: -- AudioEvent -- TextEvent -- FunctionCallEvent -- FunctionResultEvent -- ServiceEvent - -# Programming model - -## Considered Options - Programming model -The programming model for the clients needs to be simple and easy to use, while also being able to handle the complexity of the realtime api's. 
- -_In this section we will refer to events for both content and events, regardless of the decision made in the previous section._ - -1. Async generator for receiving events, that yields Events, combined with a event handler/callback mechanism for receiving events and a function for sending events - - 1a: Single event handlers, where each event is passed to the handler - - 1b: Multiple event handlers, where each event type has its own handler -2. Event buffers/queues that are exposed to the developer, start sending and start receiving methods, that just initiate the sending and receiving of events and thereby the filling of the buffers -3. Purely a start listening method that yields Events, and a send method that sends events - -### 1. Async generator for receiving events, that yields contents, combined with a event handler/callback mechanism for receiving events and a function for sending events -This would mean that the client would have a mechanism to register event handlers, and the integration would call these handlers when an event is received. For sending events, a function would be created that sends the event to the service. - -- Pro: - - without any additional setup you get content back, just as with "regular" chat models - - event handlers are mostly for more complex interactions, so ok to be slightly more complex -- Con: - - developer judgement needs to be made (or exposed with parameters) on what is returned through the async generator and what is passed to the event handlers - -### 2. Event buffers/queues that are exposed to the developer, start sending and start receiving methods, that just initiate the sending and receiving of events and thereby the filling of the buffers -This would mean that there are two queues, one for sending and one for receiving, and the developer can listen to the receiving queue and send to the sending queue. 
Internal things like parsing events to content types and auto-function calling are processed first, and the result is put in the queue, the content type should use inner_content to capture the full event and these might add a message to the send queue as well. - -- Pro: - - simple to use, just start sending and start receiving - - easy to understand, as queues are a well known concept - - developers can just skip events they are not interested in -- Con: - - potentially causes audio delays because of the queueing mechanism - -### 2b. Same as option 2, but with priority handling of audio content -This would mean that the audio content is handled, and passed to the developer code, and then all other events are processed. - -- Pro: - - mitigates audio delays - - easy to understand, as queues are a well known concept - - developers can just skip events they are not interested in -- Con: - - Two separate mechanisms used for audio content and events - -## Decision Outcome - Programming model - -Chosen option: Purely a start listening method that yields Events, and a send method that sends events - -This makes the programming model very easy, a minimal setup that should work for every service and protocol would look like this: -```python -async for event in realtime_client.start_streaming(): - match event.event_type: - case "audio": - await audio_player.add_audio(event.audio) - case "text": - print(event.text.text) -``` - - -# Audio speaker/microphone handling - -## Considered Options - Audio speaker/microphone handling - -1. Create abstraction in SK for audio handlers, that can be passed into the realtime client to record and play audio -2. Send and receive AudioContent (potentially wrapped in StreamingChatMessageContent) to the client, and let the client handle the audio recording and playing - -### 1. 
Create abstraction in SK for audio handlers, that can be passed into the realtime client to record and play audio -This would mean that the client would have a mechanism to register audio handlers, and the integration would call these handlers when audio is received or needs to be sent. An additional abstraction for this would have to be created in Semantic Kernel (or potentially taken from a standard). - -- Pro: - - simple/local audio handlers can be shipped with SK making it easy to use - - extensible by third parties to integrate into other systems (like Azure Communication Services) - - could mitigate buffer issues by prioritizing audio content being sent to the handlers -- Con: - - extra code in SK that needs to be maintained, potentially relying on third party code - -### 2. Send and receive AudioContent (wrapped in StreamingChatMessageContent) to the client, and let the client handle the audio recording and playing -This would mean that the client would receive AudioContent items, and would have to deal with them itself, including recording and playing the audio. - -- Pro: - - no extra code in SK that needs to be maintained -- Con: - - extra burden on the developer to deal with the audio - - harder to get started with - -## Decision Outcome - Audio speaker/microphone handling - -Chosen option: ... - -# Interface design - -## Considered Options - Interface design - -1. Use a single class for everything -2. Split the service class from a session class. - -The following methods will need to be supported: -- create session -- update session -- close session -- listen for/receive events -- send events - -### 1. Use a single class for everything - -Each implementation would have to implement all of the above methods. This means that non-protocol specific elements are in the same class as the protocol specific elements and will lead to code duplication between them. - -### 2. Split the service class from a session class. 
- -Two interfaces are created: -- Service: create session, update session, delete session, list sessions -- Session: listen for/receive events, send events, update session, close session - -Currently neither the Google nor the OpenAI API's support restarting sessions, so the advantage of splitting is mostly an implementation question but will not add any benefits to the user. - -This means that the split would be far simpler: -- Service: create session -- Session: listen for/receive events, send events, update session, close session - -## Naming - -The send and listen/receive methods need to be clear in the way they are named and this can become confusing when dealing with these api's. The following options are considered: - -Options for sending events to the service from your code: -- Google uses .send in their client. -- OpenAI uses .send in their client as well -- send or send_message is used in other clients, like Azure Communication Services - -Options for listening for events from the service in your code: -- Google uses .receive in their client. -- OpenAI uses .recv in their client. -- others use receive or receive_messages in their clients. - -### Decision Outcome - Interface design - -Chosen option: Use a single class for everything -Chosen for send and receive as verbs. - -This means that the interface will look like this: -```python - -class RealtimeClient: - async def create_session(self, settings: PromptExecutionSettings, chat_history: ChatHistory, **kwargs) -> None: - ... - - async def update_session(self, settings: PromptExecutionSettings, chat_history: ChatHistory, **kwargs) -> None: - ... - - async def close_session(self, **kwargs) -> None: - ... - - async def receive(self, **kwargs) -> AsyncGenerator[RealtimeEvent, None]: - ... - - async def send(self, event: RealtimeEvent) -> None: - ... 
-``` - -In most cases, create_session should call update_session with the same parameters, since update session can also be done separately later on with the same inputs. - -For Python a default __aenter__ and __aexit__ method should be added to the class, so it can be used in an `async with` statement, which calls create_session and close_session respectively. - -It is advisable, but not required, to implement the send method through a buffer/queue so that events can be 'sent' before the session has been established without losing them or raising exceptions, this might take a few seconds and in that time a single send call would block the application. - -For receiving, an internal implementation might also rely on a buffer/queue, but this is up to the developer and what makes sense for that service. For instance webrtc relies on defining the callback at create session time, so the create_session method adds a function that adds events to the queue and the receive method starts reading from and yielding from that queue. - -The send method should handle all event types, but it might have to handle the same thing in two ways, for instance: -```python -audio = AudioContent(...) - -await client.send(AudioEvent(event_type='audio', audio=audio)) -``` - -is equivalent to (at least in the case of OpenAI): -```python -audio = AudioContent(...) - -await client.send(ServiceEvent(event_type='service', service_event='input_audio_buffer.append', event=audio)) -``` - -The first version allows one to have the exact same code for all services, while the second version is also correct and should be handled correctly as well, this once again allows for flexibility and simplicity, when audio needs to be sent with a different event type, that is still possible in the second way, while the first uses the "default" event type for that particular service. 
- - - diff --git a/python/samples/concepts/audio/utils.py b/python/samples/concepts/audio/utils.py deleted file mode 100644 index fda9ecb7d772..000000000000 --- a/python/samples/concepts/audio/utils.py +++ /dev/null @@ -1,11 +0,0 @@ -# Copyright (c) Microsoft. All rights reserved. - -import logging - -import sounddevice as sd - -logger = logging.getLogger(__name__) - - -def check_audio_devices(): - logger.debug(sd.query_devices()) diff --git a/python/samples/concepts/audio/04-chat_with_realtime_api_simple.py b/python/samples/concepts/realtime/01-chat_with_realtime_webrtc.py similarity index 86% rename from python/samples/concepts/audio/04-chat_with_realtime_api_simple.py rename to python/samples/concepts/realtime/01-chat_with_realtime_webrtc.py index 06ee11807a81..38d3803a737b 100644 --- a/python/samples/concepts/audio/04-chat_with_realtime_api_simple.py +++ b/python/samples/concepts/realtime/01-chat_with_realtime_webrtc.py @@ -3,16 +3,17 @@ import asyncio import logging -from samples.concepts.audio.utils import check_audio_devices +from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC, check_audio_devices from semantic_kernel.connectors.ai.open_ai import ( ListenEvents, OpenAIRealtime, OpenAIRealtimeExecutionSettings, TurnDetection, ) -from semantic_kernel.connectors.ai.utils import SKAudioPlayer logging.basicConfig(level=logging.WARNING) +utils_log = logging.getLogger("samples.concepts.realtime.utils") +utils_log.setLevel(logging.INFO) aiortc_log = logging.getLogger("aiortc") aiortc_log.setLevel(logging.WARNING) aioice_log = logging.getLogger("aioice") @@ -43,7 +44,12 @@ async def main() -> None: # create the realtime client and optionally add the audio output function, this is optional # you can define the protocol to use, either "websocket" or "webrtc" # they will behave the same way, even though the underlying protocol is quite different - realtime_client = OpenAIRealtime("webrtc") + audio_player = AudioPlayerWebRTC() + 
realtime_client = OpenAIRealtime( + "webrtc", + audio_output_callback=audio_player.client_callback, + audio_track=AudioRecorderWebRTC(), + ) # Create the settings for the session settings = OpenAIRealtimeExecutionSettings( instructions=""" @@ -58,15 +64,15 @@ async def main() -> None: turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8), ) # the context manager calls the create_session method on the client and start listening to the audio stream - audio_player = SKAudioPlayer() + print("Mosscap (transcript): ", end="") async with realtime_client, audio_player: await realtime_client.update_session(settings=settings, create_response=True) async for event in realtime_client.receive(): match event.event_type: - case "audio": - await audio_player.add_audio(event.audio) + # case "audio": + # await audio_player.add_audio(event.audio) case "text": print(event.text.text, end="") case "service": diff --git a/python/samples/concepts/realtime/01-chat_with_realtime_websocket.py b/python/samples/concepts/realtime/01-chat_with_realtime_websocket.py new file mode 100644 index 000000000000..e647da6ff4a9 --- /dev/null +++ b/python/samples/concepts/realtime/01-chat_with_realtime_websocket.py @@ -0,0 +1,95 @@ +# Copyright (c) Microsoft. All rights reserved. 
+ +import asyncio +import logging + +from samples.concepts.realtime.utils import AudioPlayerWebsocket, AudioRecorderWebsocket, check_audio_devices +from semantic_kernel.connectors.ai.open_ai import ( + ListenEvents, + OpenAIRealtime, + OpenAIRealtimeExecutionSettings, + TurnDetection, +) + +logging.basicConfig(level=logging.WARNING) +utils_log = logging.getLogger("samples.concepts.realtime.utils") +utils_log.setLevel(logging.INFO) +aiortc_log = logging.getLogger("aiortc") +aiortc_log.setLevel(logging.WARNING) +aioice_log = logging.getLogger("aioice") +aioice_log.setLevel(logging.WARNING) +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +# This simple sample demonstrates how to use the OpenAI Realtime API to create +# a chat bot that can listen and respond directly through audio. +# It requires installing: +# - semantic-kernel[openai_realtime] +# - pyaudio +# - sounddevice +# - pydub +# - aiortc +# e.g. pip install pyaudio sounddevice pydub + +# The characterics of your speaker and microphone are a big factor in a smooth conversation +# so you may need to try out different devices for each. +# you can also play around with the turn_detection settings to get the best results. +# It has device id's set in the AudioRecorderStream and AudioPlayerAsync classes, +# so you may need to adjust these for your system. 
+# you can check the available devices by uncommenting line below the function +check_audio_devices() + + +async def main() -> None: + # create the realtime client and optionally add the audio output function, this is optional + # you can define the protocol to use, either "websocket" or "webrtc" + # they will behave the same way, even though the underlying protocol is quite different + audio_player = AudioPlayerWebsocket() + realtime_client = OpenAIRealtime( + "websocket", + audio_output_callback=audio_player.client_callback, + ) + audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client) + # Create the settings for the session + settings = OpenAIRealtimeExecutionSettings( + instructions=""" + You are a chat bot. Your name is Mosscap and + you have one goal: figure out what people need. + Your full name, should you need to know it, is + Splendid Speckled Mosscap. You communicate + effectively, but you tend to answer with long + flowery prose. + """, + voice="shimmer", + turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8), + ) + # the context manager calls the create_session method on the client and start listening to the audio stream + print("Mosscap (transcript): ", end="") + + async with realtime_client, audio_player, audio_recorder: + await realtime_client.update_session(settings=settings, create_response=True) + + async for event in realtime_client.receive(): + match event.event_type: + # this can be used as an alternative to the callback function used above, + # the callback is faster and smoother + # case "audio": + # await audio_player.add_audio(event.audio) + case "text": + print(event.text.text, end="") + case "service": + # OpenAI Specific events + if event.service_type == ListenEvents.SESSION_UPDATED: + print("Session updated") + if event.service_type == ListenEvents.RESPONSE_CREATED: + print("") + if event.service_type == ListenEvents.ERROR: + logger.error(event.event) + + +if 
__name__ == "__main__": + print( + "Instruction: start speaking, when you stop the API should detect you finished and start responding. " + "Press ctrl + c to stop the program." + ) + asyncio.run(main()) diff --git a/python/samples/concepts/audio/05-chat_with_realtime_api_complex.py b/python/samples/concepts/realtime/02-chat_with_function_calling.py similarity index 92% rename from python/samples/concepts/audio/05-chat_with_realtime_api_complex.py rename to python/samples/concepts/realtime/02-chat_with_function_calling.py index b567c0028178..c74b6b583d23 100644 --- a/python/samples/concepts/audio/05-chat_with_realtime_api_complex.py +++ b/python/samples/concepts/realtime/02-chat_with_function_calling.py @@ -5,7 +5,7 @@ from datetime import datetime from random import randint -from samples.concepts.audio.utils import check_audio_devices +from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC, check_audio_devices from semantic_kernel import Kernel from semantic_kernel.connectors.ai import FunctionChoiceBehavior from semantic_kernel.connectors.ai.open_ai import ( @@ -14,11 +14,12 @@ OpenAIRealtimeExecutionSettings, TurnDetection, ) -from semantic_kernel.connectors.ai.utils import SKAudioPlayer, SKAudioTrack from semantic_kernel.contents import ChatHistory from semantic_kernel.functions import kernel_function logging.basicConfig(level=logging.WARNING) +utils_log = logging.getLogger("samples.concepts.realtime.utils") +utils_log.setLevel(logging.INFO) aiortc_log = logging.getLogger("aiortc") aiortc_log.setLevel(logging.WARNING) aioice_log = logging.getLogger("aioice") @@ -78,15 +79,15 @@ async def main() -> None: # create the audio player and audio track # both take a device_id parameter, which is the index of the device to use, if None the default device is used - audio_player = SKAudioPlayer(sample_rate=24000, frame_duration=100, channels=1) - audio_track = SKAudioTrack() + audio_player = AudioPlayerWebRTC() + audio_track = 
AudioRecorderWebRTC() # create the realtime client and optionally add the audio output function, this is optional # you can define the protocol to use, either "websocket" or "webrtc" # they will behave the same way, even though the underlying protocol is quite different realtime_client = OpenAIRealtime( - protocol="websocket", + protocol="webrtc", audio_output_callback=audio_player.client_callback, - # audio_track=audio_track, + audio_track=audio_track, ) # Create the settings for the session @@ -116,7 +117,7 @@ async def main() -> None: chat_history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.") # the context manager calls the create_session method on the client and start listening to the audio stream - async with realtime_client, audio_player, audio_track.stream_to_realtime_client(realtime_client): + async with realtime_client, audio_player: await realtime_client.update_session( settings=settings, chat_history=chat_history, kernel=kernel, create_response=True ) diff --git a/python/samples/concepts/realtime/utils.py b/python/samples/concepts/realtime/utils.py new file mode 100644 index 000000000000..d7f39369a0d4 --- /dev/null +++ b/python/samples/concepts/realtime/utils.py @@ -0,0 +1,470 @@ +# Copyright (c) Microsoft. All rights reserved. 
+ +import asyncio +import base64 +import logging +import threading +from typing import Any, ClassVar, Final, cast + +import numpy as np +import numpy.typing as npt +import sounddevice as sd +from aiortc.mediastreams import MediaStreamError, MediaStreamTrack +from av.audio.frame import AudioFrame +from av.frame import Frame +from pydantic import PrivateAttr +from sounddevice import InputStream, OutputStream + +from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase +from semantic_kernel.contents.audio_content import AudioContent +from semantic_kernel.contents.events.realtime_event import AudioEvent +from semantic_kernel.kernel_pydantic import KernelBaseModel + +logger = logging.getLogger(__name__) + +SAMPLE_RATE: Final[int] = 24000 +RECORDER_CHANNELS: Final[int] = 1 +PLAYER_CHANNELS: Final[int] = 1 +FRAME_DURATION: Final[int] = 100 +SAMPLE_RATE_WEBRTC: Final[int] = 48000 +RECORDER_CHANNELS_WEBRTC: Final[int] = 1 +PLAYER_CHANNELS_WEBRTC: Final[int] = 2 +FRAME_DURATION_WEBRTC: Final[int] = 20 +DTYPE: Final[npt.DTypeLike] = np.int16 + + +def check_audio_devices(): + logger.info(sd.query_devices()) + + +# region: Recorders + + +class AudioRecorderWebRTC(KernelBaseModel, MediaStreamTrack): + """A simple class that implements the WebRTC MediaStreamTrack for audio from sounddevice.""" + + kind: ClassVar[str] = "audio" + device: str | int | None = None + sample_rate: int + channels: int + frame_duration: int + dtype: npt.DTypeLike = DTYPE + frame_size: int = 0 + _queue: asyncio.Queue[Frame] = PrivateAttr(default_factory=asyncio.Queue) + _is_recording: bool = False + _stream: InputStream | None = None + _recording_task: asyncio.Task | None = None + _loop: asyncio.AbstractEventLoop | None = None + _pts: int = 0 + + def __init__( + self, + *, + device: str | int | None = None, + sample_rate: int = SAMPLE_RATE_WEBRTC, + channels: int = RECORDER_CHANNELS_WEBRTC, + frame_duration: int = FRAME_DURATION_WEBRTC, + dtype: npt.DTypeLike = DTYPE, + ): + """A 
simple class that implements the WebRTC MediaStreamTrack for audio from sounddevice. + + Make sure the device is set to the correct device for your system. + + Args: + device: The device id to use for recording audio. + sample_rate: The sample rate for the audio. + channels: The number of channels for the audio. + frame_duration: The duration of each audio frame in milliseconds. + dtype: The data type for the audio. + """ + super().__init__(**{ + "device": device, + "sample_rate": sample_rate, + "channels": channels, + "frame_duration": frame_duration, + "dtype": dtype, + "frame_size": int(sample_rate * frame_duration / 1000), + }) + MediaStreamTrack.__init__(self) + + async def recv(self) -> Frame: + """Receive the next frame of audio data.""" + if not self._recording_task: + self._recording_task = asyncio.create_task(self.start_recording()) + + try: + frame = await self._queue.get() + self._queue.task_done() + return frame + except Exception as e: + logger.error(f"Error receiving audio frame: {e!s}") + raise MediaStreamError("Failed to receive audio frame") + + def _sounddevice_callback(self, indata: np.ndarray, frames: int, time: Any, status: Any) -> None: + if status: + logger.warning(f"Audio input status: {status}") + if self._loop and self._loop.is_running(): + asyncio.run_coroutine_threadsafe(self._queue.put(self._create_frame(indata)), self._loop) + + def _create_frame(self, indata: np.ndarray) -> Frame: + audio_data = indata.copy() + if audio_data.dtype != self.dtype: + audio_data = ( + (audio_data * 32767).astype(self.dtype) if self.dtype == np.int16 else audio_data.astype(self.dtype) + ) + frame = AudioFrame( + format="s16", + layout="mono", + samples=len(audio_data), + ) + frame.rate = self.sample_rate + frame.pts = self._pts + frame.planes[0].update(audio_data.tobytes()) + self._pts += len(audio_data) + return frame + + async def start_recording(self): + """Start recording audio from the input device.""" + if self._is_recording: + return + + 
self._is_recording = True + self._loop = asyncio.get_running_loop() + self._pts = 0 # Reset pts when starting recording + + try: + self._stream = InputStream( + device=self.device, + channels=self.channels, + samplerate=self.sample_rate, + dtype=self.dtype, + blocksize=self.frame_size, + callback=self._sounddevice_callback, + ) + self._stream.start() + + while self._is_recording: + await asyncio.sleep(0.1) + except asyncio.CancelledError | KeyboardInterrupt: + logger.debug("Recording task was stopped.") + except Exception as e: + logger.error(f"Error in audio recording: {e!s}") + raise + finally: + self._is_recording = False + + +class AudioRecorderWebsocket(KernelBaseModel): + """A simple class that implements a sounddevice for use with websockets.""" + + realtime_client: RealtimeClientBase + device: str | int | None = None + sample_rate: int + channels: int + frame_duration: int + dtype: npt.DTypeLike = DTYPE + frame_size: int = 0 + _stream: InputStream | None = None + _pts: int = 0 + _stream_task: asyncio.Task | None = None + + def __init__( + self, + *, + realtime_client: RealtimeClientBase, + device: str | int | None = None, + sample_rate: int = SAMPLE_RATE, + channels: int = RECORDER_CHANNELS, + frame_duration: int = FRAME_DURATION, + dtype: npt.DTypeLike = DTYPE, + ): + """A simple class that implements the WebRTC MediaStreamTrack for audio from sounddevice. + + Make sure the device is set to the correct device for your system. + + Args: + realtime_client: The RealtimeClientBase to use for streaming audio. + device: The device id to use for recording audio. + sample_rate: The sample rate for the audio. + channels: The number of channels for the audio. + frame_duration: The duration of each audio frame in milliseconds. + dtype: The data type for the audio. + **kwargs: Additional keyword arguments. 
+ """ + super().__init__(**{ + "realtime_client": realtime_client, + "device": device, + "sample_rate": sample_rate, + "channels": channels, + "frame_duration": frame_duration, + "dtype": dtype, + "frame_size": int(sample_rate * frame_duration / 1000), + }) + + async def __aenter__(self): + """Stream audio data to a RealtimeClientBase.""" + if not self._stream_task: + self._stream_task = asyncio.create_task(self._start_stream()) + return self + + async def _start_stream(self): + self._pts = 0 # Reset pts when starting recording + self._stream = InputStream( + device=self.device, + channels=self.channels, + samplerate=self.sample_rate, + dtype=self.dtype, + blocksize=self.frame_size, + ) + self._stream.start() + try: + while True: + if self._stream.read_available < self.frame_size: + await asyncio.sleep(0) + continue + data, _ = self._stream.read(self.frame_size) + + await self.realtime_client.send( + AudioEvent(audio=AudioContent(data=base64.b64encode(cast(Any, data)).decode("utf-8"))) + ) + + await asyncio.sleep(0) + except asyncio.CancelledError: + pass + + async def __aexit__(self, exc_type, exc, tb): + """Stop recording audio.""" + if self._stream_task: + self._stream_task.cancel() + await self._stream_task + if self._stream: + self._stream.stop() + self._stream.close() + + +# region: Players + + +class AudioPlayerWebRTC(KernelBaseModel): + """Simple class that plays audio using sounddevice. + + Make sure the device_id is set to the correct device for your system. + + The sample rate, channels and frame duration + should be set to match the audio you + are receiving. + + Args: + device: The device id to use for playing audio. + sample_rate: The sample rate for the audio. + channels: The number of channels for the audio. + dtype: The data type for the audio. 
+ frame_duration: The duration of each audio frame in milliseconds + + """ + + device: int | None = None + sample_rate: int = SAMPLE_RATE_WEBRTC + channels: int = PLAYER_CHANNELS_WEBRTC + dtype: npt.DTypeLike = DTYPE + frame_duration: int = FRAME_DURATION_WEBRTC + _queue: asyncio.Queue[np.ndarray] | None = PrivateAttr(default=None) + _stream: OutputStream | None = PrivateAttr(default=None) + + async def __aenter__(self): + """Start the audio stream when entering a context.""" + self.start() + return self + + async def __aexit__(self, exc_type, exc, tb): + """Stop the audio stream when exiting a context.""" + self.stop() + + def start(self): + """Start the audio stream.""" + self._queue = asyncio.Queue() + self._stream = OutputStream( + callback=self._sounddevice_callback, + samplerate=self.sample_rate, + channels=self.channels, + dtype=self.dtype, + blocksize=int(self.sample_rate * self.frame_duration / 1000), + device=self.device, + ) + if self._stream and self._queue: + self._stream.start() + + def stop(self): + """Stop the audio stream.""" + if self._stream: + self._stream.stop() + self._stream = None + self._queue = None + + def _sounddevice_callback(self, outdata, frames, time, status): + """This callback is called by sounddevice when it needs more audio data to play.""" + if status: + logger.debug(f"Audio output status: {status}") + if self._queue: + if self._queue.empty(): + outdata.fill(0) # nothing queued: play silence, not the uninitialized buffer + return + outdata[:] = self._queue.get_nowait().reshape(outdata.shape) + self._queue.task_done() + else: + logger.error( + "Audio queue not initialized, make sure to call start before " + "using the player, or use the context manager." 
+ ) + + async def client_callback(self, content: np.ndarray): + """This function can be passed to the audio_output_callback field of the RealtimeClientBase.""" + if self._queue: + await self._queue.put(content) + else: + logger.error( + "Audio queue not initialized, make sure to call start before " + "using the player, or use the context manager." + ) + + async def add_audio(self, audio_content: AudioContent) -> None: + """This function is used to add audio to the queue for playing. + + It first checks if there is a AudioFrame in the inner_content of the AudioContent. + If not, it checks if the data is a numpy array, bytes, or a string and converts it to a numpy array. + """ + if not self._queue: + logger.error( + "Audio queue not initialized, make sure to call start before " + "using the player, or use the context manager." + ) + return + if audio_content.inner_content and isinstance(audio_content.inner_content, AudioFrame): + await self._queue.put(audio_content.inner_content.to_ndarray()) + return + if isinstance(audio_content.data, np.ndarray): + await self._queue.put(audio_content.data) + return + if isinstance(audio_content.data, bytes): + await self._queue.put(np.frombuffer(audio_content.data, dtype=self.dtype)) + return + if isinstance(audio_content.data, str): + await self._queue.put(np.frombuffer(audio_content.data.encode(), dtype=self.dtype)) + return + logger.error(f"Unknown audio content: {audio_content}") + + +class AudioPlayerWebsocket(KernelBaseModel): + """Simple class that plays audio using sounddevice. + + Make sure the device_id is set to the correct device for your system. + + The sample rate, channels and frame duration + should be set to match the audio you + are receiving. + + Args: + device: The device id to use for playing audio. + sample_rate: The sample rate for the audio. + channels: The number of channels for the audio. + dtype: The data type for the audio. 
+ frame_duration: The duration of each audio frame in milliseconds + + """ + + device: int | None = None + sample_rate: int = SAMPLE_RATE + channels: int = PLAYER_CHANNELS + dtype: npt.DTypeLike = DTYPE + frame_duration: int = FRAME_DURATION + _lock: Any = PrivateAttr(default_factory=threading.Lock) + _queue: list[np.ndarray] = PrivateAttr(default_factory=list) + _stream: OutputStream | None = PrivateAttr(default=None) + _frame_count: int = 0 + + async def __aenter__(self): + """Start the audio stream when entering a context.""" + self.start() + return self + + async def __aexit__(self, exc_type, exc, tb): + """Stop the audio stream when exiting a context.""" + self.stop() + + def start(self): + """Start the audio stream.""" + with self._lock: + self._queue = [] + self._stream = OutputStream( + callback=self._sounddevice_callback, + samplerate=self.sample_rate, + channels=self.channels, + dtype=self.dtype, + blocksize=int(self.sample_rate * self.frame_duration / 1000), + device=self.device, + ) + if self._stream: + self._stream.start() + + def stop(self): + """Stop the audio stream.""" + if self._stream: + self._stream.stop() + self._stream = None + with self._lock: + self._queue = [] + + def _sounddevice_callback(self, outdata, frames, time, status): + """This callback is called by sounddevice when it needs more audio data to play.""" + with self._lock: + if status: + logger.debug(f"Audio output status: {status}") + data = np.empty(0, dtype=np.int16) + + # get next item from queue if there is still space in the buffer + while len(data) < frames and len(self._queue) > 0: + item = self._queue.pop(0) + frames_needed = frames - len(data) + data = np.concatenate((data, item[:frames_needed])) + if len(item) > frames_needed: + self._queue.insert(0, item[frames_needed:]) + + self._frame_count += len(data) + + # fill the rest of the frames with zeros if there is no more data + if len(data) < frames: + data = np.concatenate((data, np.zeros(frames - len(data), 
dtype=np.int16))) + + outdata[:] = data.reshape(-1, 1) + + def reset_frame_count(self): + self._frame_count = 0 + + def get_frame_count(self): + return self._frame_count + + async def client_callback(self, content: np.ndarray): + """This function can be passed to the audio_output_callback field of the RealtimeClientBase.""" + with self._lock: + self._queue.append(content) + + async def add_audio(self, audio_content: AudioContent) -> None: + """This function is used to add audio to the queue for playing. + + It first checks if there is a AudioFrame in the inner_content of the AudioContent. + If not, it checks if the data is a numpy array, bytes, or a string and converts it to a numpy array. + """ + with self._lock: + if audio_content.inner_content and isinstance(audio_content.inner_content, AudioFrame): + self._queue.append(audio_content.inner_content.to_ndarray()) + return + if isinstance(audio_content.data, np.ndarray): + self._queue.append(audio_content.data) + return + if isinstance(audio_content.data, bytes): + self._queue.append(np.frombuffer(audio_content.data, dtype=self.dtype)) + return + if isinstance(audio_content.data, str): + self._queue.append(np.frombuffer(audio_content.data.encode(), dtype=self.dtype)) + return + logger.error(f"Unknown audio content: {audio_content}") diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py index b9e809d4e396..7d6f60eafbd2 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py @@ -1,6 +1,6 @@ # Copyright (c) Microsoft. All rights reserved. 
-from collections.abc import AsyncGenerator, Callable, Coroutine, Mapping +from collections.abc import Callable, Coroutine, Mapping from typing import TYPE_CHECKING, Any, ClassVar, Literal, TypeVar from numpy import ndarray @@ -15,17 +15,11 @@ OpenAIRealtimeWebsocketBase, ) from semantic_kernel.connectors.ai.open_ai.settings.open_ai_settings import OpenAISettings -from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings -from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase -from semantic_kernel.contents.chat_history import ChatHistory -from semantic_kernel.contents.events.realtime_event import RealtimeEvent from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError if TYPE_CHECKING: from aiortc.mediastreams import MediaStreamTrack - from semantic_kernel.connectors.ai import PromptExecutionSettings - from semantic_kernel.contents import ChatHistory _T = TypeVar("_T", bound="OpenAIRealtime") @@ -33,36 +27,7 @@ __all__ = ["OpenAIRealtime"] -class RealtimeClientStub(RealtimeClientBase): - """This class makes sure that IDE's don't complain about missing methods in the below superclass.""" - - async def send(self, event: Any) -> None: - pass - - async def create_session( - self, - settings: "PromptExecutionSettings | None" = None, - chat_history: "ChatHistory | None" = None, - **kwargs: Any, - ) -> None: - pass - - def receive(self, **kwargs: Any) -> AsyncGenerator[RealtimeEvent, None]: - pass - - async def update_session( - self, - settings: "PromptExecutionSettings | None" = None, - chat_history: "ChatHistory | None" = None, - **kwargs: Any, - ) -> None: - pass - - async def close_session(self) -> None: - pass - - -class OpenAIRealtime(OpenAIRealtimeBase, RealtimeClientStub): +class OpenAIRealtime(OpenAIRealtimeBase): """OpenAI Realtime service.""" def __new__(cls: type["_T"], protocol: str, *args: Any, **kwargs: Any) -> "_T": @@ -128,6 +93,8 @@ def __init__( raise 
ServiceInitializationError("Failed to create OpenAI settings.", ex) from ex if not openai_settings.realtime_model_id: raise ServiceInitializationError("The OpenAI text model ID is required.") + if audio_track: + kwargs["audio_track"] = audio_track super().__init__( protocol=protocol, audio_output_callback=audio_output_callback, diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py index 0e94dd9c6854..2789bf0d16e2 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py @@ -3,15 +3,18 @@ import json import logging import sys -from collections.abc import AsyncGenerator, Callable, Coroutine +from collections.abc import AsyncGenerator, Callable from typing import TYPE_CHECKING, Any, ClassVar, Literal +from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( + OpenAIRealtimeExecutionSettings, +) + if sys.version_info >= (3, 12): from typing import override # pragma: no cover else: from typing_extensions import override # pragma: no cover -from numpy import ndarray from openai.types.beta.realtime import ( RealtimeClientEvent, RealtimeServerEvent, @@ -27,7 +30,7 @@ from semantic_kernel.connectors.ai.open_ai.services.open_ai_handler import OpenAIHandler from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents, SendEvents from semantic_kernel.connectors.ai.open_ai.services.realtime.utils import ( - _create_realtime_client_event, + _create_openai_realtime_client_event, update_settings_from_function_call_configuration, ) from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings @@ -60,9 +63,8 @@ class OpenAIRealtimeBase(OpenAIHandler, RealtimeClientBase): """OpenAI Realtime service.""" - 
protocol: ClassVar[Literal["websocket", "webrtc"]] = "websocket" SUPPORTS_FUNCTION_CALLING: ClassVar[bool] = True - audio_output_callback: Callable[[ndarray], Coroutine[Any, Any, None]] | None = None + protocol: ClassVar[Literal["websocket", "webrtc"]] = "websocket" kernel: Kernel | None = None _current_settings: PromptExecutionSettings | None = PrivateAttr(default=None) @@ -77,7 +79,6 @@ async def _parse_event(self, event: RealtimeServerEvent) -> AsyncGenerator[Realt match event.type: case ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA.value: yield TextEvent( - event_type="text", service_type=event.type, text=StreamingTextContent( inner_content=event, @@ -90,7 +91,6 @@ async def _parse_event(self, event: RealtimeServerEvent) -> AsyncGenerator[Realt self._call_id_to_function_map[event.item.call_id] = event.item.name case ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DELTA.value: yield FunctionCallEvent( - event_type="function_call", service_type=event.type, function_call=FunctionCallContent( id=event.item_id, @@ -114,7 +114,7 @@ async def _parse_event(self, event: RealtimeServerEvent) -> AsyncGenerator[Realt # we put all event in the output buffer, but after the interpreted one. # so when dealing with them, make sure to check the type of the event, since they # might be of different types. 
- yield ServiceEvent(event_type="service", service_type=event.type, event=event) + yield ServiceEvent(service_type=event.type, event=event) @override async def update_session( @@ -137,7 +137,6 @@ async def update_session( ) await self.send( ServiceEvent( - event_type="service", service_type=SendEvents.SESSION_UPDATE, event={"settings": self._current_settings}, ) @@ -163,20 +162,6 @@ async def update_session( if create_response: await self.send(ServiceEvent(service_type=SendEvents.RESPONSE_CREATE)) - @override - def get_prompt_execution_settings_class(self) -> type["PromptExecutionSettings"]: - from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( # noqa - OpenAIRealtimeExecutionSettings, - ) - - return OpenAIRealtimeExecutionSettings - - @override - def _update_function_choice_settings_callback( - self, - ) -> Callable[[FunctionCallChoiceConfiguration, "PromptExecutionSettings", FunctionChoiceType], None]: - return update_settings_from_function_call_configuration - async def _parse_function_call_arguments_done( self, event: ResponseFunctionCallArgumentsDoneEvent, @@ -226,13 +211,13 @@ async def send(self, event: RealtimeEvent, **kwargs: Any) -> None: match event.event_type: case "audio": await self._send( - _create_realtime_client_event( + _create_openai_realtime_client_event( event_type=SendEvents.INPUT_AUDIO_BUFFER_APPEND, audio=event.audio.to_base64_bytestring() ) ) case "text": await self._send( - _create_realtime_client_event( + _create_openai_realtime_client_event( event_type=SendEvents.CONVERSATION_ITEM_CREATE, **dict( type="message", @@ -248,7 +233,7 @@ async def send(self, event: RealtimeEvent, **kwargs: Any) -> None: ) case "function_call": await self._send( - _create_realtime_client_event( + _create_openai_realtime_client_event( event_type=SendEvents.CONVERSATION_ITEM_CREATE, **dict( type="function_call", @@ -264,7 +249,7 @@ async def send(self, event: RealtimeEvent, **kwargs: Any) -> None: ) 
case "function_result": await self._send( - _create_realtime_client_event( + _create_openai_realtime_client_event( event_type=SendEvents.CONVERSATION_ITEM_CREATE, **dict( type="function_call_output", @@ -281,13 +266,22 @@ async def send(self, event: RealtimeEvent, **kwargs: Any) -> None: logger.error("Event data is empty") return settings = data.get("settings", None) - if not settings or not isinstance(settings, PromptExecutionSettings): + if not settings: logger.error("Event data does not contain 'settings'") return + if not isinstance(settings, OpenAIRealtimeExecutionSettings): + try: + settings = self.get_prompt_execution_settings_from_settings(settings) + except Exception as e: + logger.error( + f"Failed to properly create settings from passed settings: {settings}, error: {e}" + ) + return + assert isinstance(settings, OpenAIRealtimeExecutionSettings) # nosec if not settings.ai_model_id: settings.ai_model_id = self.ai_model_id await self._send( - _create_realtime_client_event( + _create_openai_realtime_client_event( event_type=event.service_type, session=settings.prepare_settings_dict(), ) @@ -297,15 +291,15 @@ async def send(self, event: RealtimeEvent, **kwargs: Any) -> None: logger.error("Event data does not contain 'audio'") return await self._send( - _create_realtime_client_event( + _create_openai_realtime_client_event( event_type=event.service_type, audio=data["audio"], ) ) case SendEvents.INPUT_AUDIO_BUFFER_COMMIT: - await self._send(_create_realtime_client_event(event_type=event.service_type)) + await self._send(_create_openai_realtime_client_event(event_type=event.service_type)) case SendEvents.INPUT_AUDIO_BUFFER_CLEAR: - await self._send(_create_realtime_client_event(event_type=event.service_type)) + await self._send(_create_openai_realtime_client_event(event_type=event.service_type)) case SendEvents.CONVERSATION_ITEM_CREATE: if not data or "item" not in data: logger.error("Event data does not contain 'item'") @@ -316,7 +310,7 @@ async def send(self, 
event: RealtimeEvent, **kwargs: Any) -> None: match item: case TextContent(): await self._send( - _create_realtime_client_event( + _create_openai_realtime_client_event( event_type=event.service_type, **dict( type="message", @@ -332,7 +326,7 @@ async def send(self, event: RealtimeEvent, **kwargs: Any) -> None: ) case FunctionCallContent(): await self._send( - _create_realtime_client_event( + _create_openai_realtime_client_event( event_type=event.service_type, **dict( type="function_call", @@ -349,7 +343,7 @@ async def send(self, event: RealtimeEvent, **kwargs: Any) -> None: case FunctionResultContent(): await self._send( - _create_realtime_client_event( + _create_openai_realtime_client_event( event_type=event.service_type, **dict( type="function_call_output", @@ -363,7 +357,7 @@ async def send(self, event: RealtimeEvent, **kwargs: Any) -> None: logger.error("Event data does not contain 'item_id'") return await self._send( - _create_realtime_client_event( + _create_openai_realtime_client_event( event_type=event.service_type, item_id=data["item_id"], content_index=0, @@ -375,21 +369,52 @@ async def send(self, event: RealtimeEvent, **kwargs: Any) -> None: logger.error("Event data does not contain 'item_id'") return await self._send( - _create_realtime_client_event( + _create_openai_realtime_client_event( event_type=event.service_type, item_id=data["item_id"], ) ) case SendEvents.RESPONSE_CREATE: await self._send( - _create_realtime_client_event( + _create_openai_realtime_client_event( event_type=event.service_type, event_id=data.get("event_id", None) if data else None ) ) case SendEvents.RESPONSE_CANCEL: await self._send( - _create_realtime_client_event( + _create_openai_realtime_client_event( event_type=event.service_type, response_id=data.get("response_id", None) if data else None, ) ) + + @override + def get_prompt_execution_settings_class(self) -> type["PromptExecutionSettings"]: + from 
semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( # noqa + OpenAIRealtimeExecutionSettings, + ) + + return OpenAIRealtimeExecutionSettings + + @override + def _update_function_choice_settings_callback( + self, + ) -> Callable[[FunctionCallChoiceConfiguration, "PromptExecutionSettings", FunctionChoiceType], None]: + return update_settings_from_function_call_configuration + + @override + async def create_session( + self, + settings: "PromptExecutionSettings | None" = None, + chat_history: "ChatHistory | None" = None, + **kwargs: Any, + ) -> None: + pass + + @override + def receive(self, **kwargs: Any) -> AsyncGenerator[RealtimeEvent, None]: + pass + + @override + async def close_session(self) -> None: + pass diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py index 731ff423011b..2a6bf71dfd68 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py @@ -26,13 +26,12 @@ from openai._models import construct_type_unchecked from openai.types.beta.realtime.realtime_client_event import RealtimeClientEvent from openai.types.beta.realtime.realtime_server_event import RealtimeServerEvent -from pydantic import Field, PrivateAttr +from pydantic import PrivateAttr from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_base import OpenAIRealtimeBase -from semantic_kernel.connectors.ai.realtime_client_base import RealtimeEvent -from semantic_kernel.connectors.ai.utils.realtime_helpers import SKAudioTrack from semantic_kernel.contents.audio_content import AudioContent +from semantic_kernel.contents.events import RealtimeEvent from 
semantic_kernel.contents.events.realtime_event import AudioEvent from semantic_kernel.utils.experimental_decorator import experimental_class @@ -51,7 +50,7 @@ class OpenAIRealtimeWebRTCBase(OpenAIRealtimeBase): protocol: ClassVar[Literal["webrtc"]] = "webrtc" peer_connection: RTCPeerConnection | None = None data_channel: RTCDataChannel | None = None - audio_track: MediaStreamTrack = Field(default_factory=SKAudioTrack) + audio_track: MediaStreamTrack | None = None _receive_buffer: asyncio.Queue[RealtimeEvent] = PrivateAttr(default_factory=asyncio.Queue) @override @@ -82,6 +81,8 @@ async def create_session( **kwargs: Any, ) -> None: """Create a session in the service.""" + if not self.audio_track: + raise Exception("Audio track not initialized") self.peer_connection = RTCPeerConnection( configuration=RTCConfiguration(iceServers=[RTCIceServer(urls="stun:stun.l.google.com:19302")]) ) @@ -161,8 +162,8 @@ async def _on_track(self, track: "MediaStreamTrack") -> None: try: await self._receive_buffer.put( AudioEvent( - service_type=ListenEvents.RESPONSE_AUDIO_DELTA, audio=AudioContent(data=frame.to_ndarray(), data_format="np.int16", inner_content=frame), + service_type=ListenEvents.RESPONSE_AUDIO_DELTA, ), ) except Exception as e: diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py index 8adee40db02d..3b476d96d3c0 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py @@ -49,15 +49,13 @@ async def receive( async for event in self.connection: if event.type == ListenEvents.RESPONSE_AUDIO_DELTA.value: + audio_bytes = base64.b64decode(event.delta) if self.audio_output_callback: - await self.audio_output_callback(np.frombuffer(base64.b64decode(event.delta), dtype=np.int16)) + await 
self.audio_output_callback(np.frombuffer(audio_bytes, dtype=np.int16)) try: yield AudioEvent( - audio=AudioContent( - data=base64.b64decode(event.delta), - data_format="base64", - inner_content=event, - ), + audio=AudioContent(data=audio_bytes, data_format="base64", inner_content=event), + service_type=event.type, ) except Exception as e: logger.error(f"Error processing remote audio frame: {e!s}") diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/utils.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/utils.py index a33531ca19c7..bb815eead6dd 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/utils.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/utils.py @@ -65,7 +65,7 @@ def kernel_function_metadata_to_function_call_format( } -def _create_realtime_client_event(event_type: SendEvents, **kwargs: Any) -> RealtimeClientEvent: +def _create_openai_realtime_client_event(event_type: SendEvents, **kwargs: Any) -> RealtimeClientEvent: match event_type: case SendEvents.SESSION_UPDATE: return SessionUpdateEvent( diff --git a/python/semantic_kernel/connectors/ai/realtime_client_base.py b/python/semantic_kernel/connectors/ai/realtime_client_base.py index 0ad1fc13a089..cc70df1f3c90 100644 --- a/python/semantic_kernel/connectors/ai/realtime_client_base.py +++ b/python/semantic_kernel/connectors/ai/realtime_client_base.py @@ -2,7 +2,7 @@ import sys from abc import ABC, abstractmethod -from collections.abc import AsyncGenerator, Callable +from collections.abc import AsyncGenerator, Callable, Coroutine from typing import TYPE_CHECKING, Any, ClassVar if sys.version_info >= (3, 11): @@ -10,6 +10,8 @@ else: from typing_extensions import Self # pragma: no cover +from numpy import ndarray + from semantic_kernel.connectors.ai.function_call_choice_configuration import FunctionCallChoiceConfiguration from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceType from 
semantic_kernel.contents.events.realtime_event import RealtimeEvent @@ -26,6 +28,7 @@ class RealtimeClientBase(AIServiceClientBase, ABC): """Base class for a realtime client.""" SUPPORTS_FUNCTION_CALLING: ClassVar[bool] = False + audio_output_callback: Callable[[ndarray], Coroutine[Any, Any, None]] | None = None @abstractmethod async def send(self, event: RealtimeEvent) -> None: diff --git a/python/semantic_kernel/connectors/ai/utils/__init__.py b/python/semantic_kernel/connectors/ai/utils/__init__.py deleted file mode 100644 index 2cd59106a8a0..000000000000 --- a/python/semantic_kernel/connectors/ai/utils/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) Microsoft. All rights reserved. - -from semantic_kernel.connectors.ai.utils.realtime_helpers import SKAudioPlayer, SKAudioTrack - -__all__ = ["SKAudioPlayer", "SKAudioTrack"] diff --git a/python/semantic_kernel/connectors/ai/utils/realtime_helpers.py b/python/semantic_kernel/connectors/ai/utils/realtime_helpers.py deleted file mode 100644 index 33fd09ce7f66..000000000000 --- a/python/semantic_kernel/connectors/ai/utils/realtime_helpers.py +++ /dev/null @@ -1,267 +0,0 @@ -# Copyright (c) Microsoft. All rights reserved. 
- -import asyncio -import logging -from contextlib import asynccontextmanager -from typing import Any, ClassVar, Final - -import numpy as np -import numpy.typing as npt -from aiortc.mediastreams import MediaStreamError, MediaStreamTrack -from av.audio.frame import AudioFrame -from av.frame import Frame -from pydantic import PrivateAttr -from sounddevice import InputStream, OutputStream - -from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase -from semantic_kernel.contents.audio_content import AudioContent -from semantic_kernel.contents.events.realtime_event import AudioEvent -from semantic_kernel.kernel_pydantic import KernelBaseModel - -logger = logging.getLogger(__name__) - -SAMPLE_RATE: Final[int] = 48000 -TRACK_CHANNELS: Final[int] = 1 -PLAYER_CHANNELS: Final[int] = 2 -FRAME_DURATION: Final[int] = 20 -DTYPE: Final[npt.DTypeLike] = np.int16 - - -class SKAudioTrack(KernelBaseModel, MediaStreamTrack): - """A simple class that implements the WebRTC MediaStreamTrack for audio from sounddevice.""" - - kind: ClassVar[str] = "audio" - device: str | int | None = None - sample_rate: int = SAMPLE_RATE - channels: int = TRACK_CHANNELS - frame_duration: int = FRAME_DURATION - dtype: npt.DTypeLike = DTYPE - frame_size: int = 0 - _queue: asyncio.Queue[Frame] = PrivateAttr(default_factory=asyncio.Queue) - _is_recording: bool = False - _stream: InputStream | None = None - _recording_task: asyncio.Task | None = None - _loop: asyncio.AbstractEventLoop | None = None - _pts: int = 0 - - def __init__( - self, - *, - device: str | int | None = None, - sample_rate: int = SAMPLE_RATE, - channels: int = TRACK_CHANNELS, - frame_duration: int = FRAME_DURATION, - dtype: npt.DTypeLike = DTYPE, - ): - """A simple class that implements the WebRTC MediaStreamTrack for audio from sounddevice. - - Make sure the device is set to the correct device for your system. - - Args: - device: The device id to use for recording audio. 
- sample_rate: The sample rate for the audio. - channels: The number of channels for the audio. - frame_duration: The duration of each audio frame in milliseconds. - dtype: The data type for the audio. - **kwargs: Additional keyword arguments. - """ - args = { - "device": device, - "sample_rate": sample_rate, - "channels": channels, - "frame_duration": frame_duration, - "dtype": dtype, - } - args["frame_size"] = int( - args.get("sample_rate", SAMPLE_RATE) * args.get("frame_duration", FRAME_DURATION) / 1000 - ) - super().__init__(**args) - MediaStreamTrack.__init__(self) - - async def recv(self) -> Frame: - """Receive the next frame of audio data.""" - if not self._recording_task: - self._recording_task = asyncio.create_task(self.start_recording()) - - try: - frame = await self._queue.get() - self._queue.task_done() - return frame - except Exception as e: - logger.error(f"Error receiving audio frame: {e!s}") - raise MediaStreamError("Failed to receive audio frame") - - @asynccontextmanager - async def stream_to_realtime_client(self, realtime_client: RealtimeClientBase): - """Stream audio data to a RealtimeClientBase.""" - while True: - frame = await self.recv() - await realtime_client.send(AudioEvent(audio=AudioContent(data=frame.to_ndarray(), data_format="np.int16"))) - yield - await asyncio.sleep(0.01) - - def _sounddevice_callback(self, indata: np.ndarray, frames: int, time: Any, status: Any) -> None: - if status: - logger.warning(f"Audio input status: {status}") - if self._loop and self._loop.is_running(): - asyncio.run_coroutine_threadsafe(self._queue.put(self._create_frame(indata)), self._loop) - - def _create_frame(self, indata: np.ndarray) -> Frame: - audio_data = indata.copy() - if audio_data.dtype != self.dtype: - audio_data = ( - (audio_data * 32767).astype(self.dtype) if self.dtype == np.int16 else audio_data.astype(self.dtype) - ) - frame = AudioFrame( - format="s16", - layout="mono", - samples=len(audio_data), - ) - frame.rate = self.sample_rate - 
frame.pts = self._pts - frame.planes[0].update(audio_data.tobytes()) - self._pts += len(audio_data) - return frame - - async def start_recording(self): - """Start recording audio from the input device.""" - if self._is_recording: - return - - self._is_recording = True - self._loop = asyncio.get_running_loop() - self._pts = 0 # Reset pts when starting recording - - try: - self._stream = InputStream( - device=self.device, - channels=self.channels, - samplerate=self.sample_rate, - dtype=self.dtype, - blocksize=self.frame_size, - callback=self._sounddevice_callback, - ) - self._stream.start() - - while self._is_recording: - await asyncio.sleep(0.1) - - except Exception as e: - logger.error(f"Error in audio recording: {e!s}") - raise - finally: - self._is_recording = False - - -class SKAudioPlayer(KernelBaseModel): - """Simple class that plays audio using sounddevice. - - Make sure the device_id is set to the correct device for your system. - - The sample rate, channels and frame duration - should be set to match the audio you - are receiving. - - Args: - device: The device id to use for playing audio. - sample_rate: The sample rate for the audio. - channels: The number of channels for the audio. - dtype: The data type for the audio. 
- frame_duration: The duration of each audio frame in milliseconds - - """ - - device: int | None = None - sample_rate: int = SAMPLE_RATE - channels: int = PLAYER_CHANNELS - dtype: npt.DTypeLike = DTYPE - frame_duration: int = FRAME_DURATION - _queue: asyncio.Queue[np.ndarray] | None = PrivateAttr(default=None) - _stream: OutputStream | None = PrivateAttr(default=None) - - async def __aenter__(self): - """Start the audio stream when entering a context.""" - self.start() - return self - - async def __aexit__(self, exc_type, exc, tb): - """Stop the audio stream when exiting a context.""" - self.stop() - - def start(self): - """Start the audio stream.""" - self._queue = asyncio.Queue() - self._stream = OutputStream( - callback=self._sounddevice_callback, - samplerate=self.sample_rate, - channels=self.channels, - dtype=self.dtype, - blocksize=int(self.sample_rate * self.frame_duration / 1000), - device=self.device, - ) - if self._stream and self._queue: - self._stream.start() - - def stop(self): - """Stop the audio stream.""" - if self._stream: - self._stream.stop() - self._stream = None - self._queue = None - - def _sounddevice_callback(self, outdata, frames, time, status): - """This callback is called by sounddevice when it needs more audio data to play.""" - if status: - logger.info(f"Audio output status: {status}") - if self._queue: - if self._queue.empty(): - return - data: np.ndarray = self._queue.get_nowait() - if data.size == frames: - outdata[:] = data.reshape(outdata.shape) - self._queue.task_done() - else: - if data.size > frames: - self._queue.put_nowait(data[frames:]) - outdata[:] = np.concatenate((np.empty(0, dtype=np.int16), data[:frames])).reshape(outdata.shape) - else: - outdata[:] = np.concatenate((data, np.zeros(frames - len(data), dtype=np.int16))).reshape( - outdata.shape - ) - self._queue.task_done() - - async def client_callback(self, content: np.ndarray): - """This function can be passed to the audio_output_callback field of the 
RealtimeClientBase.""" - if self._queue: - await self._queue.put(content) - else: - logger.error( - "Audio queue not initialized, make sure to call start before " - "using the player, or use the context manager." - ) - - async def add_audio(self, audio_content: AudioContent) -> None: - """This function is used to add audio to the queue for playing. - - It first checks if there is a AudioFrame in the inner_content of the AudioContent. - If not, it checks if the data is a numpy array, bytes, or a string and converts it to a numpy array. - """ - if not self._queue: - logger.error( - "Audio queue not initialized, make sure to call start before " - "using the player, or use the context manager." - ) - return - if audio_content.inner_content and isinstance(audio_content.inner_content, AudioFrame): - await self._queue.put(audio_content.inner_content.to_ndarray()) - return - if isinstance(audio_content.data, np.ndarray): - await self._queue.put(audio_content.data) - return - if isinstance(audio_content.data, bytes): - await self._queue.put(np.frombuffer(audio_content.data, dtype=self.dtype)) - return - if isinstance(audio_content.data, str): - await self._queue.put(np.frombuffer(audio_content.data.encode(), dtype=self.dtype)) - return - logger.error(f"Unknown audio content: {audio_content}") diff --git a/python/semantic_kernel/contents/events/realtime_event.py b/python/semantic_kernel/contents/events/realtime_event.py index edb2c5917778..682c3b4d4e79 100644 --- a/python/semantic_kernel/contents/events/realtime_event.py +++ b/python/semantic_kernel/contents/events/realtime_event.py @@ -1,17 +1,18 @@ # Copyright (c) Microsoft. All rights reserved. 
-from typing import Annotated, Any, Literal, TypeAlias, Union +from typing import Annotated, Any, ClassVar, Literal, TypeAlias, Union from pydantic import Field from semantic_kernel.contents.audio_content import AudioContent from semantic_kernel.contents.function_call_content import FunctionCallContent from semantic_kernel.contents.function_result_content import FunctionResultContent +from semantic_kernel.contents.image_content import ImageContent from semantic_kernel.contents.text_content import TextContent from semantic_kernel.kernel_pydantic import KernelBaseModel RealtimeEvent: TypeAlias = Annotated[ - Union["ServiceEvent", "AudioEvent", "TextEvent", "FunctionCallEvent", "FunctionResultEvent"], + Union["ServiceEvent", "AudioEvent", "TextEvent", "FunctionCallEvent", "FunctionResultEvent", "ImageEvent"], Field(discriminator="event_type"), ] @@ -19,38 +20,46 @@ class ServiceEvent(KernelBaseModel): """Base class for all service events.""" - event_type: Literal["service"] = "service" + event: Any | None = Field(default=None, description="The event content.") service_type: str - event: Any | None = None + event_type: ClassVar[Literal["service"]] = "service" class AudioEvent(KernelBaseModel): """Audio event type.""" - event_type: Literal["audio"] = "audio" + audio: AudioContent = Field(..., description="Audio content.") service_type: str | None = None - audio: AudioContent + event_type: ClassVar[Literal["audio"]] = "audio" class TextEvent(KernelBaseModel): """Text event type.""" - event_type: Literal["text"] = "text" + text: TextContent = Field(..., description="Text content.") service_type: str | None = None - text: TextContent + event_type: ClassVar[Literal["text"]] = "text" class FunctionCallEvent(KernelBaseModel): """Function call event type.""" - event_type: Literal["function_call"] = "function_call" + function_call: FunctionCallContent = Field(..., description="Function call content.") service_type: str | None = None - function_call: FunctionCallContent + 
event_type: ClassVar[Literal["function_call"]] = "function_call" class FunctionResultEvent(KernelBaseModel): """Function result event type.""" - event_type: Literal["function_result"] = "function_result" + function_result: FunctionResultContent = Field(..., description="Function result content.") service_type: str | None = None - function_result: FunctionResultContent + event_type: ClassVar[Literal["function_result"]] = "function_result" + + +class ImageEvent(KernelBaseModel): + """Image event type.""" + + image: ImageContent = Field(..., description="Image content.") + service_type: str | None = None + event_type: ClassVar[Literal["image"]] = "image" From 0911c0456cb3579e5aab3faeb8078b2b54eecfb5 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Fri, 31 Jan 2025 15:54:56 +0100 Subject: [PATCH 24/50] add image event import --- python/semantic_kernel/contents/events/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/semantic_kernel/contents/events/__init__.py b/python/semantic_kernel/contents/events/__init__.py index 7466a652364b..432c4a9c0851 100644 --- a/python/semantic_kernel/contents/events/__init__.py +++ b/python/semantic_kernel/contents/events/__init__.py @@ -4,6 +4,7 @@ AudioEvent, FunctionCallEvent, FunctionResultEvent, + ImageEvent, RealtimeEvent, ServiceEvent, TextEvent, @@ -13,6 +14,7 @@ "AudioEvent", "FunctionCallEvent", "FunctionResultEvent", + "ImageEvent", "RealtimeEvent", "ServiceEvent", "TextEvent", From d9e5fe6c09d5acafccc811c16f619bf45dde5f51 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Wed, 12 Feb 2025 10:04:27 +0100 Subject: [PATCH 25/50] naming updates and added call --- .../realtime/01-chat_with_realtime_webrtc.py | 8 +- .../01-chat_with_realtime_websocket.py | 14 ++-- .../realtime/02-chat_with_function_calling.py | 16 ++-- python/samples/concepts/realtime/utils.py | 4 +- .../connectors/ai/function_calling_utils.py | 13 --- .../open_ai_realtime_execution_settings.py | 30 +++++-- 
.../realtime/open_ai_realtime_base.py | 80 ++++++++++++------- .../realtime/open_ai_realtime_webrtc.py | 6 +- .../realtime/open_ai_realtime_websocket.py | 6 +- .../connectors/ai/realtime_client_base.py | 38 +++++++-- .../contents/events/__init__.py | 24 +++--- .../contents/events/realtime_event.py | 21 +++-- 12 files changed, 159 insertions(+), 101 deletions(-) diff --git a/python/samples/concepts/realtime/01-chat_with_realtime_webrtc.py b/python/samples/concepts/realtime/01-chat_with_realtime_webrtc.py index 38d3803a737b..d7804226b1b6 100644 --- a/python/samples/concepts/realtime/01-chat_with_realtime_webrtc.py +++ b/python/samples/concepts/realtime/01-chat_with_realtime_webrtc.py @@ -64,11 +64,7 @@ async def main() -> None: turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8), ) # the context manager calls the create_session method on the client and start listening to the audio stream - - print("Mosscap (transcript): ", end="") - async with realtime_client, audio_player: - await realtime_client.update_session(settings=settings, create_response=True) - + async with audio_player, realtime_client(settings=settings, create_response=True): async for event in realtime_client.receive(): match event.event_type: # case "audio": @@ -80,7 +76,7 @@ async def main() -> None: if event.service_type == ListenEvents.SESSION_UPDATED: print("Session updated") if event.service_type == ListenEvents.RESPONSE_CREATED: - print("") + print("\nMosscap (transcript): ", end="") if event.service_type == ListenEvents.ERROR: logger.error(event.event) diff --git a/python/samples/concepts/realtime/01-chat_with_realtime_websocket.py b/python/samples/concepts/realtime/01-chat_with_realtime_websocket.py index e647da6ff4a9..f7ab8f1e850e 100644 --- a/python/samples/concepts/realtime/01-chat_with_realtime_websocket.py +++ b/python/samples/concepts/realtime/01-chat_with_realtime_websocket.py @@ -5,11 +5,13 @@ from samples.concepts.realtime.utils 
import AudioPlayerWebsocket, AudioRecorderWebsocket, check_audio_devices from semantic_kernel.connectors.ai.open_ai import ( - ListenEvents, OpenAIRealtime, OpenAIRealtimeExecutionSettings, +) +from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( TurnDetection, ) +from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents logging.basicConfig(level=logging.WARNING) utils_log = logging.getLogger("samples.concepts.realtime.utils") @@ -61,14 +63,10 @@ async def main() -> None: flowery prose. """, voice="shimmer", - turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8), + turn_detection=TurnDetection(create_response=True, silence_duration_ms=800, threshold=0.8), ) # the context manager calls the create_session method on the client and start listening to the audio stream - print("Mosscap (transcript): ", end="") - - async with realtime_client, audio_player, audio_recorder: - await realtime_client.update_session(settings=settings, create_response=True) - + async with realtime_client(settings=settings, create_response=True), audio_player, audio_recorder: async for event in realtime_client.receive(): match event.event_type: # this can be used as an alternative to the callback function used above, @@ -82,7 +80,7 @@ async def main() -> None: if event.service_type == ListenEvents.SESSION_UPDATED: print("Session updated") if event.service_type == ListenEvents.RESPONSE_CREATED: - print("") + print("\nMosscap (transcript): ", end="") if event.service_type == ListenEvents.ERROR: logger.error(event.event) diff --git a/python/samples/concepts/realtime/02-chat_with_function_calling.py b/python/samples/concepts/realtime/02-chat_with_function_calling.py index c74b6b583d23..c1579488af41 100644 --- a/python/samples/concepts/realtime/02-chat_with_function_calling.py +++ b/python/samples/concepts/realtime/02-chat_with_function_calling.py @@ 
-117,11 +117,15 @@ async def main() -> None: chat_history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.") # the context manager calls the create_session method on the client and start listening to the audio stream - async with realtime_client, audio_player: - await realtime_client.update_session( - settings=settings, chat_history=chat_history, kernel=kernel, create_response=True - ) - print("Mosscap (transcript): ", end="") + async with ( + audio_player, + realtime_client( + settings=settings, + chat_history=chat_history, + kernel=kernel, + create_response=True, + ), + ): async for event in realtime_client.receive(): match event.event_type: case "text": @@ -132,7 +136,7 @@ async def main() -> None: match event.service_type: case ListenEvents.RESPONSE_CREATED: if print_transcript: - print("") + print("\nMosscap (transcript): ", end="") case ListenEvents.ERROR: logger.error(event.event) diff --git a/python/samples/concepts/realtime/utils.py b/python/samples/concepts/realtime/utils.py index d7f39369a0d4..290080413719 100644 --- a/python/samples/concepts/realtime/utils.py +++ b/python/samples/concepts/realtime/utils.py @@ -17,7 +17,7 @@ from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase from semantic_kernel.contents.audio_content import AudioContent -from semantic_kernel.contents.events.realtime_event import AudioEvent +from semantic_kernel.contents.events.realtime_event import RealtimeAudioEvent from semantic_kernel.kernel_pydantic import KernelBaseModel logger = logging.getLogger(__name__) @@ -225,7 +225,7 @@ async def _start_stream(self): data, _ = self._stream.read(self.frame_size) await self.realtime_client.send( - AudioEvent(audio=AudioContent(data=base64.b64encode(cast(Any, data)).decode("utf-8"))) + RealtimeAudioEvent(audio=AudioContent(data=base64.b64encode(cast(Any, data)).decode("utf-8"))) ) await asyncio.sleep(0) diff --git 
a/python/semantic_kernel/connectors/ai/function_calling_utils.py b/python/semantic_kernel/connectors/ai/function_calling_utils.py index ec09b4d2850f..11d2c0a2eeb8 100644 --- a/python/semantic_kernel/connectors/ai/function_calling_utils.py +++ b/python/semantic_kernel/connectors/ai/function_calling_utils.py @@ -162,19 +162,6 @@ def prepare_settings_for_function_calling( if not isinstance(settings, settings_class): settings = settings_class.from_prompt_execution_settings(settings) - # For backwards compatibility we need to convert the `FunctionCallBehavior` to `FunctionChoiceBehavior` - # if this method is called with a `FunctionCallBehavior` object as part of the settings - - from semantic_kernel.connectors.ai.function_call_behavior import FunctionCallBehavior - from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceBehavior - - if hasattr(settings, "function_call_behavior") and isinstance( - settings.function_call_behavior, FunctionCallBehavior - ): - settings.function_choice_behavior = FunctionChoiceBehavior.from_function_call_behavior( - settings.function_call_behavior - ) - if settings.function_choice_behavior: # Configure the function choice behavior into the settings object # that will become part of the request to the AI service diff --git a/python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py b/python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py index a26237b78b84..446161e365d1 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py +++ b/python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py @@ -1,6 +1,6 @@ # Copyright (c) Microsoft. All rights reserved. 
-from collections.abc import Sequence +from collections.abc import Mapping, Sequence from typing import Annotated, Any, Literal from pydantic import Field @@ -10,16 +10,34 @@ class InputAudioTranscription(KernelBaseModel): - """Input audio transcription settings.""" + """Input audio transcription settings. + + Args: + model: The model to use for transcription, currently only "whisper-1" is supported. + language: The language of the audio, should be in ISO-639-1 format, like 'en'. + prompt: An optional text to guide the model's style or continue a previous audio segment. + The prompt should match the audio language. + """ model: Literal["whisper-1"] | None = None + language: str | None = None + prompt: str | None = None class TurnDetection(KernelBaseModel): - """Turn detection settings.""" + """Turn detection settings. + + Args: + type: The type of turn detection, currently only "server_vad" is supported. + threshold: The threshold for voice activity detection, should be between 0 and 1. + prefix_padding_ms: The padding before the detected voice activity, in milliseconds. + silence_duration_ms: The duration of silence to detect the end of a turn, in milliseconds. + create_response: Whether to create a response for each detected turn. 
+ + """ - type: Literal["server_vad"] | None = None - threshold: Annotated[float | None, Field(ge=0, le=1)] = None + type: Literal["server_vad"] = "server_vad" + threshold: Annotated[float | None, Field(ge=0.0, le=1.0)] = None prefix_padding_ms: Annotated[int | None, Field(ge=0)] = None silence_duration_ms: Annotated[int | None, Field(ge=0)] = None create_response: bool | None = None @@ -34,7 +52,7 @@ class OpenAIRealtimeExecutionSettings(PromptExecutionSettings): voice: str | None = None input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | None = None output_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | None = None - input_audio_transcription: InputAudioTranscription | None = None + input_audio_transcription: Annotated[InputAudioTranscription | Mapping[str, str] | None, Field()] = None turn_detection: TurnDetection | None = None tools: Annotated[ list[dict[str, Any]] | None, diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py index 2789bf0d16e2..41bc0bf8e2e1 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py @@ -38,11 +38,11 @@ from semantic_kernel.contents.chat_history import ChatHistory from semantic_kernel.contents.chat_message_content import ChatMessageContent from semantic_kernel.contents.events.realtime_event import ( - FunctionCallEvent, - FunctionResultEvent, RealtimeEvent, - ServiceEvent, - TextEvent, + RealtimeFunctionCallEvent, + RealtimeFunctionResultEvent, + RealtimeServiceEvent, + RealtimeTextEvent, ) from semantic_kernel.contents.function_call_content import FunctionCallContent from semantic_kernel.contents.function_result_content import FunctionResultContent @@ -78,7 +78,7 @@ async def _parse_event(self, event: RealtimeServerEvent) -> 
AsyncGenerator[Realt """ match event.type: case ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA.value: - yield TextEvent( + yield RealtimeTextEvent( service_type=event.type, text=StreamingTextContent( inner_content=event, @@ -90,7 +90,7 @@ async def _parse_event(self, event: RealtimeServerEvent) -> AsyncGenerator[Realt if event.item.type == "function_call" and event.item.call_id and event.item.name: self._call_id_to_function_map[event.item.call_id] = event.item.name case ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DELTA.value: - yield FunctionCallEvent( + yield RealtimeFunctionCallEvent( service_type=event.type, function_call=FunctionCallContent( id=event.item_id, @@ -114,53 +114,77 @@ async def _parse_event(self, event: RealtimeServerEvent) -> AsyncGenerator[Realt # we put all event in the output buffer, but after the interpreted one. # so when dealing with them, make sure to check the type of the event, since they # might be of different types. - yield ServiceEvent(service_type=event.type, event=event) + yield RealtimeServiceEvent(service_type=event.type, event=event) @override async def update_session( self, - settings: PromptExecutionSettings | None = None, chat_history: ChatHistory | None = None, + settings: PromptExecutionSettings | None = None, create_response: bool = False, **kwargs: Any, ) -> None: - if "kernel" in kwargs: - self.kernel = kwargs["kernel"] + """Update the session in the service. + + Args: + chat_history: Chat history. + settings: Prompt execution settings, if kernel is linked to the service or passed as + Kwargs, it will be used to update the settings for function calling. + create_response: Create a response, get the model to start responding, default is False. + kwargs: Additional arguments, if 'kernel' is passed, it will be used to update the + settings for function calling, others will be ignored. 
+ + """ + if kwargs: + if self._create_kwargs: + kwargs = {**self._create_kwargs, **kwargs} + else: + kwargs = self._create_kwargs or {} if settings: self._current_settings = settings - if self._current_settings and self.kernel: - self._current_settings = prepare_settings_for_function_calling( - self._current_settings, - self.get_prompt_execution_settings_class(), - self._update_function_choice_settings_callback(), - kernel=self.kernel, # type: ignore - ) + if "kernel" in kwargs: + self.kernel = kwargs["kernel"] + + if self._current_settings: + if self.kernel: + self._current_settings = prepare_settings_for_function_calling( + self._current_settings, + self.get_prompt_execution_settings_class(), + self._update_function_choice_settings_callback(), + kernel=self.kernel, # type: ignore + ) await self.send( - ServiceEvent( + RealtimeServiceEvent( service_type=SendEvents.SESSION_UPDATE, event={"settings": self._current_settings}, ) ) + if chat_history and len(chat_history) > 0: for msg in chat_history.messages: for item in msg.items: match item: case TextContent(): - await self.send(TextEvent(service_type=SendEvents.CONVERSATION_ITEM_CREATE, text=item)) + await self.send( + RealtimeTextEvent(service_type=SendEvents.CONVERSATION_ITEM_CREATE, text=item) + ) case FunctionCallContent(): await self.send( - FunctionCallEvent(service_type=SendEvents.CONVERSATION_ITEM_CREATE, function_call=item) + RealtimeFunctionCallEvent( + service_type=SendEvents.CONVERSATION_ITEM_CREATE, function_call=item + ) ) case FunctionResultContent(): await self.send( - FunctionResultEvent( + RealtimeFunctionResultEvent( service_type=SendEvents.CONVERSATION_ITEM_CREATE, function_result=item ) ) case _: logger.error("Unsupported item type: %s", item) - if create_response: - await self.send(ServiceEvent(service_type=SendEvents.RESPONSE_CREATE)) + + if create_response or kwargs.get("create_response", False) is True: + await self.send(RealtimeServiceEvent(service_type=SendEvents.RESPONSE_CREATE)) async 
def _parse_function_call_arguments_done( self, @@ -187,18 +211,20 @@ async def _parse_function_call_arguments_done( index=event.output_index, metadata={"call_id": event.call_id}, ) - yield FunctionCallEvent(service_type=ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE, function_call=item) + yield RealtimeFunctionCallEvent( + service_type=ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE, function_call=item + ) chat_history = ChatHistory() await self.kernel.invoke_function_call(item, chat_history) created_output: FunctionResultContent = chat_history.messages[-1].items[0] # type: ignore # This returns the output to the service - result = FunctionResultEvent( + result = RealtimeFunctionResultEvent( service_type=SendEvents.CONVERSATION_ITEM_CREATE, function_result=created_output, ) await self.send(result) # The model doesn't start responding to the tool call automatically, so triggering it here. - await self.send(ServiceEvent(service_type=SendEvents.RESPONSE_CREATE)) + await self.send(RealtimeServiceEvent(service_type=SendEvents.RESPONSE_CREATE)) # This allows a user to have a full conversation in his code yield result @@ -405,8 +431,8 @@ def _update_function_choice_settings_callback( @override async def create_session( self, - settings: "PromptExecutionSettings | None" = None, chat_history: "ChatHistory | None" = None, + settings: "PromptExecutionSettings | None" = None, **kwargs: Any, ) -> None: pass diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py index 2a6bf71dfd68..f9f12e38ba7f 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py @@ -32,7 +32,7 @@ from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_base import OpenAIRealtimeBase from 
semantic_kernel.contents.audio_content import AudioContent from semantic_kernel.contents.events import RealtimeEvent -from semantic_kernel.contents.events.realtime_event import AudioEvent +from semantic_kernel.contents.events.realtime_event import RealtimeAudioEvent from semantic_kernel.utils.experimental_decorator import experimental_class if TYPE_CHECKING: @@ -76,8 +76,8 @@ async def _send(self, event: RealtimeClientEvent) -> None: @override async def create_session( self, - settings: "PromptExecutionSettings | None" = None, chat_history: "ChatHistory | None" = None, + settings: "PromptExecutionSettings | None" = None, **kwargs: Any, ) -> None: """Create a session in the service.""" @@ -161,7 +161,7 @@ async def _on_track(self, track: "MediaStreamTrack") -> None: logger.error(f"Error playing remote audio frame: {e!s}") try: await self._receive_buffer.put( - AudioEvent( + RealtimeAudioEvent( audio=AudioContent(data=frame.to_ndarray(), data_format="np.int16", inner_content=frame), service_type=ListenEvents.RESPONSE_AUDIO_DELTA, ), diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py index 3b476d96d3c0..508d307464d9 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py @@ -20,7 +20,7 @@ from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_base import OpenAIRealtimeBase from semantic_kernel.contents.audio_content import AudioContent -from semantic_kernel.contents.events.realtime_event import AudioEvent, RealtimeEvent +from semantic_kernel.contents.events.realtime_event import RealtimeAudioEvent, RealtimeEvent from semantic_kernel.utils.experimental_decorator 
import experimental_class if TYPE_CHECKING: @@ -53,7 +53,7 @@ async def receive( if self.audio_output_callback: await self.audio_output_callback(np.frombuffer(audio_bytes, dtype=np.int16)) try: - yield AudioEvent( + yield RealtimeAudioEvent( audio=AudioContent(data=audio_bytes, data_format="base64", inner_content=event), service_type=event.type, ) @@ -75,8 +75,8 @@ async def _send(self, event: RealtimeClientEvent) -> None: @override async def create_session( self, - settings: "PromptExecutionSettings | None" = None, chat_history: "ChatHistory | None" = None, + settings: "PromptExecutionSettings | None" = None, **kwargs: Any, ) -> None: """Create a session in the service.""" diff --git a/python/semantic_kernel/connectors/ai/realtime_client_base.py b/python/semantic_kernel/connectors/ai/realtime_client_base.py index cc70df1f3c90..c77782a3a578 100644 --- a/python/semantic_kernel/connectors/ai/realtime_client_base.py +++ b/python/semantic_kernel/connectors/ai/realtime_client_base.py @@ -3,7 +3,9 @@ import sys from abc import ABC, abstractmethod from collections.abc import AsyncGenerator, Callable, Coroutine -from typing import TYPE_CHECKING, Any, ClassVar +from typing import Any, ClassVar + +from pydantic import PrivateAttr if sys.version_info >= (3, 11): from typing import Self # pragma: no cover @@ -14,14 +16,12 @@ from semantic_kernel.connectors.ai.function_call_choice_configuration import FunctionCallChoiceConfiguration from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceType +from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings +from semantic_kernel.contents.chat_history import ChatHistory from semantic_kernel.contents.events.realtime_event import RealtimeEvent from semantic_kernel.services.ai_service_client_base import AIServiceClientBase from semantic_kernel.utils.experimental_decorator import experimental_class -if TYPE_CHECKING: - from semantic_kernel.connectors.ai.prompt_execution_settings 
import PromptExecutionSettings - from semantic_kernel.contents.chat_history import ChatHistory - @experimental_class class RealtimeClientBase(AIServiceClientBase, ABC): @@ -29,6 +29,9 @@ class RealtimeClientBase(AIServiceClientBase, ABC): SUPPORTS_FUNCTION_CALLING: ClassVar[bool] = False audio_output_callback: Callable[[ndarray], Coroutine[Any, Any, None]] | None = None + _chat_history: ChatHistory | None = PrivateAttr(default=None) + _settings: PromptExecutionSettings | None = PrivateAttr(default=None) + _create_kwargs: dict[str, Any] | None = PrivateAttr(default=None) @abstractmethod async def send(self, event: RealtimeEvent) -> None: @@ -55,8 +58,8 @@ def receive( @abstractmethod async def create_session( self, - settings: "PromptExecutionSettings | None" = None, chat_history: "ChatHistory | None" = None, + settings: "PromptExecutionSettings | None" = None, **kwargs: Any, ) -> None: """Create a session in the service. @@ -71,8 +74,8 @@ async def create_session( @abstractmethod async def update_session( self, - settings: "PromptExecutionSettings | None" = None, chat_history: "ChatHistory | None" = None, + settings: "PromptExecutionSettings | None" = None, **kwargs: Any, ) -> None: """Update a session in the service. @@ -106,9 +109,28 @@ async def __aenter__(self) -> "Self": Default implementation calls the create session method. """ - await self.create_session() + await self.create_session(self._chat_history, self._settings) return self async def __aexit__(self, exc_type, exc_val, exc_tb) -> None: """Exit the context manager.""" await self.close_session() + + def __call__( + self, + chat_history: "ChatHistory | None" = None, + settings: "PromptExecutionSettings | None" = None, + **kwargs: Any, + ) -> Self: + """Call the service and set the chat history and settings. + + Args: + chat_history: Chat history. + settings: Prompt execution settings. + kwargs: Additional arguments, can include `kernel` or specific settings for the service. 
+ Check the update_session method for the specific service for more details. + """ + self._chat_history = chat_history + self._settings = settings + self._create_kwargs = kwargs + return self diff --git a/python/semantic_kernel/contents/events/__init__.py b/python/semantic_kernel/contents/events/__init__.py index 432c4a9c0851..5d6dd52d44bb 100644 --- a/python/semantic_kernel/contents/events/__init__.py +++ b/python/semantic_kernel/contents/events/__init__.py @@ -1,21 +1,21 @@ # Copyright (c) Microsoft. All rights reserved. from semantic_kernel.contents.events.realtime_event import ( - AudioEvent, - FunctionCallEvent, - FunctionResultEvent, - ImageEvent, + RealtimeAudioEvent, RealtimeEvent, - ServiceEvent, - TextEvent, + RealtimeFunctionCallEvent, + RealtimeFunctionResultEvent, + RealtimeImageEvent, + RealtimeServiceEvent, + RealtimeTextEvent, ) __all__ = [ - "AudioEvent", - "FunctionCallEvent", - "FunctionResultEvent", - "ImageEvent", + "RealtimeAudioEvent", "RealtimeEvent", - "ServiceEvent", - "TextEvent", + "RealtimeFunctionCallEvent", + "RealtimeFunctionResultEvent", + "RealtimeImageEvent", + "RealtimeServiceEvent", + "RealtimeTextEvent", ] diff --git a/python/semantic_kernel/contents/events/realtime_event.py b/python/semantic_kernel/contents/events/realtime_event.py index 682c3b4d4e79..6ce3698f26dc 100644 --- a/python/semantic_kernel/contents/events/realtime_event.py +++ b/python/semantic_kernel/contents/events/realtime_event.py @@ -12,12 +12,19 @@ from semantic_kernel.kernel_pydantic import KernelBaseModel RealtimeEvent: TypeAlias = Annotated[ - Union["ServiceEvent", "AudioEvent", "TextEvent", "FunctionCallEvent", "FunctionResultEvent", "ImageEvent"], + Union[ + "RealtimeServiceEvent", + "RealtimeAudioEvent", + "RealtimeTextEvent", + "RealtimeFunctionCallEvent", + "RealtimeFunctionResultEvent", + "RealtimeImageEvent", + ], Field(discriminator="event_type"), ] -class ServiceEvent(KernelBaseModel): +class RealtimeServiceEvent(KernelBaseModel): """Base class for 
all service events.""" event: Any | None = Field(default=None, description="The event content.") @@ -25,7 +32,7 @@ class ServiceEvent(KernelBaseModel): event_type: ClassVar[Literal["service"]] = "service" -class AudioEvent(KernelBaseModel): +class RealtimeAudioEvent(KernelBaseModel): """Audio event type.""" audio: AudioContent = Field(..., description="Audio content.") @@ -33,7 +40,7 @@ class AudioEvent(KernelBaseModel): event_type: ClassVar[Literal["audio"]] = "audio" -class TextEvent(KernelBaseModel): +class RealtimeTextEvent(KernelBaseModel): """Text event type.""" text: TextContent = Field(..., description="Text content.") @@ -41,7 +48,7 @@ class TextEvent(KernelBaseModel): event_type: ClassVar[Literal["text"]] = "text" -class FunctionCallEvent(KernelBaseModel): +class RealtimeFunctionCallEvent(KernelBaseModel): """Function call event type.""" function_call: FunctionCallContent = Field(..., description="Function call content.") @@ -49,7 +56,7 @@ class FunctionCallEvent(KernelBaseModel): event_type: ClassVar[Literal["function_call"]] = "function_call" -class FunctionResultEvent(KernelBaseModel): +class RealtimeFunctionResultEvent(KernelBaseModel): """Function result event type.""" function_result: FunctionResultContent = Field(..., description="Function result content.") @@ -57,7 +64,7 @@ class FunctionResultEvent(KernelBaseModel): event_type: ClassVar[Literal["function_result"]] = "function_result" -class ImageEvent(KernelBaseModel): +class RealtimeImageEvent(KernelBaseModel): """Image event type.""" image: ImageContent = Field(..., description="Image content.") From 43e5fb16eee205cb60179d15aa5c41ef3d61d506 Mon Sep 17 00:00:00 2001 From: Eduard van Valkenburg Date: Thu, 13 Feb 2025 12:52:45 +0100 Subject: [PATCH 26/50] redid realtimeevents --- .../realtime/01-chat_with_realtime_webrtc.py | 11 +- .../01-chat_with_realtime_websocket.py | 13 +- .../realtime/02-chat_with_function_calling.py | 9 +- python/samples/concepts/realtime/utils.py | 4 +- 
.../ai/open_ai/services/open_ai_realtime.py | 2 +- .../realtime/open_ai_realtime_base.py | 73 +++++--- .../realtime/open_ai_realtime_webrtc.py | 5 +- .../realtime/open_ai_realtime_websocket.py | 20 +-- .../contents/events/__init__.py | 2 - .../contents/events/realtime_event.py | 43 ++--- python/uv.lock | 165 +++++++++--------- 11 files changed, 174 insertions(+), 173 deletions(-) diff --git a/python/samples/concepts/realtime/01-chat_with_realtime_webrtc.py b/python/samples/concepts/realtime/01-chat_with_realtime_webrtc.py index d7804226b1b6..298f7b242072 100644 --- a/python/samples/concepts/realtime/01-chat_with_realtime_webrtc.py +++ b/python/samples/concepts/realtime/01-chat_with_realtime_webrtc.py @@ -10,6 +10,7 @@ OpenAIRealtimeExecutionSettings, TurnDetection, ) +from semantic_kernel.contents.events.realtime_event import RealtimeTextEvent logging.basicConfig(level=logging.WARNING) utils_log = logging.getLogger("samples.concepts.realtime.utils") @@ -66,19 +67,17 @@ async def main() -> None: # the context manager calls the create_session method on the client and start listening to the audio stream async with audio_player, realtime_client(settings=settings, create_response=True): async for event in realtime_client.receive(): - match event.event_type: - # case "audio": + match event: + # case RealtimeAudioEvent(): # await audio_player.add_audio(event.audio) - case "text": + case RealtimeTextEvent(): print(event.text.text, end="") - case "service": + case _: # OpenAI Specific events if event.service_type == ListenEvents.SESSION_UPDATED: print("Session updated") if event.service_type == ListenEvents.RESPONSE_CREATED: print("\nMosscap (transcript): ", end="") - if event.service_type == ListenEvents.ERROR: - logger.error(event.event) if __name__ == "__main__": diff --git a/python/samples/concepts/realtime/01-chat_with_realtime_websocket.py b/python/samples/concepts/realtime/01-chat_with_realtime_websocket.py index f7ab8f1e850e..f276ed4c54a6 100644 --- 
a/python/samples/concepts/realtime/01-chat_with_realtime_websocket.py +++ b/python/samples/concepts/realtime/01-chat_with_realtime_websocket.py @@ -12,6 +12,7 @@ TurnDetection, ) from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents +from semantic_kernel.contents.events.realtime_event import RealtimeTextEvent logging.basicConfig(level=logging.WARNING) utils_log = logging.getLogger("samples.concepts.realtime.utils") @@ -66,23 +67,21 @@ async def main() -> None: turn_detection=TurnDetection(create_response=True, silence_duration_ms=800, threshold=0.8), ) # the context manager calls the create_session method on the client and start listening to the audio stream - async with realtime_client(settings=settings, create_response=True), audio_player, audio_recorder: + async with audio_player, audio_recorder, realtime_client(settings=settings, create_response=True): async for event in realtime_client.receive(): - match event.event_type: + match event: # this can be used as an alternative to the callback function used above, # the callback is faster and smoother - # case "audio": + # case RealtimeAudioEvent(): # await audio_player.add_audio(event.audio) - case "text": + case RealtimeTextEvent(): print(event.text.text, end="") - case "service": + case _: # OpenAI Specific events if event.service_type == ListenEvents.SESSION_UPDATED: print("Session updated") if event.service_type == ListenEvents.RESPONSE_CREATED: print("\nMosscap (transcript): ", end="") - if event.service_type == ListenEvents.ERROR: - logger.error(event.event) if __name__ == "__main__": diff --git a/python/samples/concepts/realtime/02-chat_with_function_calling.py b/python/samples/concepts/realtime/02-chat_with_function_calling.py index c1579488af41..fe7f94870133 100644 --- a/python/samples/concepts/realtime/02-chat_with_function_calling.py +++ b/python/samples/concepts/realtime/02-chat_with_function_calling.py @@ -15,6 +15,7 @@ TurnDetection, ) from semantic_kernel.contents 
import ChatHistory +from semantic_kernel.contents.events import RealtimeTextEvent from semantic_kernel.functions import kernel_function logging.basicConfig(level=logging.WARNING) @@ -127,18 +128,18 @@ async def main() -> None: ), ): async for event in realtime_client.receive(): - match event.event_type: - case "text": + match event: + case RealtimeTextEvent(): if print_transcript: print(event.text.text, end="") - case "service": + case _: # OpenAI Specific events match event.service_type: case ListenEvents.RESPONSE_CREATED: if print_transcript: print("\nMosscap (transcript): ", end="") case ListenEvents.ERROR: - logger.error(event.event) + logger.error(event.service_event) if __name__ == "__main__": diff --git a/python/samples/concepts/realtime/utils.py b/python/samples/concepts/realtime/utils.py index 290080413719..2adc87a88a20 100644 --- a/python/samples/concepts/realtime/utils.py +++ b/python/samples/concepts/realtime/utils.py @@ -145,7 +145,9 @@ async def start_recording(self): while self._is_recording: await asyncio.sleep(0.1) - except asyncio.CancelledError | KeyboardInterrupt: + except asyncio.CancelledError: + logger.debug("Recording task was stopped.") + except KeyboardInterrupt: logger.debug("Recording task was stopped.") except Exception as e: logger.error(f"Error in audio recording: {e!s}") diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py index 7d6f60eafbd2..9d4c86ccd211 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py @@ -92,7 +92,7 @@ def __init__( except ValidationError as ex: raise ServiceInitializationError("Failed to create OpenAI settings.", ex) from ex if not openai_settings.realtime_model_id: - raise ServiceInitializationError("The OpenAI text model ID is required.") + raise ServiceInitializationError("The OpenAI 
realtime model ID is required.") if audio_track: kwargs["audio_track"] = audio_track super().__init__( diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py index 41bc0bf8e2e1..e5776c0f8190 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py @@ -6,10 +6,6 @@ from collections.abc import AsyncGenerator, Callable from typing import TYPE_CHECKING, Any, ClassVar, Literal -from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( - OpenAIRealtimeExecutionSettings, -) - if sys.version_info >= (3, 12): from typing import override # pragma: no cover else: @@ -27,6 +23,9 @@ prepare_settings_for_function_calling, ) from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceType +from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( + OpenAIRealtimeExecutionSettings, +) from semantic_kernel.connectors.ai.open_ai.services.open_ai_handler import OpenAIHandler from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents, SendEvents from semantic_kernel.connectors.ai.open_ai.services.realtime.utils import ( @@ -38,10 +37,10 @@ from semantic_kernel.contents.chat_history import ChatHistory from semantic_kernel.contents.chat_message_content import ChatMessageContent from semantic_kernel.contents.events.realtime_event import ( + RealtimeAudioEvent, RealtimeEvent, RealtimeFunctionCallEvent, RealtimeFunctionResultEvent, - RealtimeServiceEvent, RealtimeTextEvent, ) from semantic_kernel.contents.function_call_content import FunctionCallContent @@ -75,11 +74,16 @@ async def _parse_event(self, event: RealtimeServerEvent) -> AsyncGenerator[Realt Audio 
delta has to be handled by the implementation of the protocol as some protocols have different ways of handling audio. + + We put all event in the output buffer, but after the interpreted one. + so when dealing with them, make sure to check the type of the event, since they + might be of different types. """ match event.type: case ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA.value: yield RealtimeTextEvent( service_type=event.type, + service_event=event, text=StreamingTextContent( inner_content=event, text=event.delta, @@ -89,9 +93,11 @@ async def _parse_event(self, event: RealtimeServerEvent) -> AsyncGenerator[Realt case ListenEvents.RESPONSE_OUTPUT_ITEM_ADDED.value: if event.item.type == "function_call" and event.item.call_id and event.item.name: self._call_id_to_function_map[event.item.call_id] = event.item.name + yield RealtimeEvent(service_type=event.type, service_event=event) case ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DELTA.value: yield RealtimeFunctionCallEvent( service_type=event.type, + service_event=event, function_call=FunctionCallContent( id=event.item_id, name=self._call_id_to_function_map[event.call_id], @@ -107,14 +113,13 @@ async def _parse_event(self, event: RealtimeServerEvent) -> AsyncGenerator[Realt yield parsed_event case ListenEvents.ERROR.value: logger.error("Error received: %s", event.error) - case ListenEvents.SESSION_CREATED.value, ListenEvents.SESSION_UPDATED.value: + yield RealtimeEvent(service_type=event.type, service_event=event) + case ListenEvents.SESSION_CREATED.value | ListenEvents.SESSION_UPDATED.value: logger.info("Session created or updated, session: %s", event.session) + yield RealtimeEvent(service_type=event.type, service_event=event) case _: logger.debug(f"Received event: {event}") - # we put all event in the output buffer, but after the interpreted one. - # so when dealing with them, make sure to check the type of the event, since they - # might be of different types. 
- yield RealtimeServiceEvent(service_type=event.type, event=event) + yield RealtimeEvent(service_type=event.type, service_event=event) @override async def update_session( @@ -154,9 +159,9 @@ async def update_session( kernel=self.kernel, # type: ignore ) await self.send( - RealtimeServiceEvent( + RealtimeEvent( service_type=SendEvents.SESSION_UPDATE, - event={"settings": self._current_settings}, + service_event={"settings": self._current_settings}, ) ) @@ -184,25 +189,35 @@ async def update_session( logger.error("Unsupported item type: %s", item) if create_response or kwargs.get("create_response", False) is True: - await self.send(RealtimeServiceEvent(service_type=SendEvents.RESPONSE_CREATE)) + await self.send(RealtimeEvent(service_type=SendEvents.RESPONSE_CREATE)) async def _parse_function_call_arguments_done( self, event: ResponseFunctionCallArgumentsDoneEvent, ) -> AsyncGenerator[RealtimeEvent | None]: - """Handle response function call done.""" + """Handle response function call done. + + This always yields at least 1 event, either a RealtimeEvent or a RealtimeFunctionResultEvent with the raw event. + + It then also yields any function results both back to the service, through `send` and to the developer. + + """ + # Step 1: check if function calling enabled: if not self.kernel or ( self._current_settings and self._current_settings.function_choice_behavior and not self._current_settings.function_choice_behavior.auto_invoke_kernel_functions ): - yield None + yield RealtimeEvent(service_type=event.type, service_event=event) return + # Step 2: check if there is a function that can be found. plugin_name, function_name = self._call_id_to_function_map.pop(event.call_id, "-").split("-", 1) if not plugin_name or not function_name: logger.error("Function call needs to have a plugin name and function name") - yield None + yield RealtimeEvent(service_type=event.type, service_event=event) return + + # Step 3: Parse into the function call content, and yield that. 
item = FunctionCallContent( id=event.item_id, plugin_name=plugin_name, @@ -212,20 +227,22 @@ async def _parse_function_call_arguments_done( metadata={"call_id": event.call_id}, ) yield RealtimeFunctionCallEvent( - service_type=ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE, function_call=item + service_type=ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE, function_call=item, service_event=event ) + + # Step 4: Invoke the function call chat_history = ChatHistory() await self.kernel.invoke_function_call(item, chat_history) created_output: FunctionResultContent = chat_history.messages[-1].items[0] # type: ignore - # This returns the output to the service + # Step 5: Create the function result event result = RealtimeFunctionResultEvent( service_type=SendEvents.CONVERSATION_ITEM_CREATE, function_result=created_output, ) + # Step 6: send the result to the service and call `create response` await self.send(result) - # The model doesn't start responding to the tool call automatically, so triggering it here. 
- await self.send(RealtimeServiceEvent(service_type=SendEvents.RESPONSE_CREATE)) - # This allows a user to have a full conversation in his code + await self.send(RealtimeEvent(service_type=SendEvents.RESPONSE_CREATE)) + # Step 7: yield the function result back to the developer as well yield result async def _send(self, event: RealtimeClientEvent) -> None: @@ -234,14 +251,14 @@ async def _send(self, event: RealtimeClientEvent) -> None: @override async def send(self, event: RealtimeEvent, **kwargs: Any) -> None: - match event.event_type: - case "audio": + match event: + case RealtimeAudioEvent(): await self._send( _create_openai_realtime_client_event( event_type=SendEvents.INPUT_AUDIO_BUFFER_APPEND, audio=event.audio.to_base64_bytestring() ) ) - case "text": + case RealtimeTextEvent(): await self._send( _create_openai_realtime_client_event( event_type=SendEvents.CONVERSATION_ITEM_CREATE, @@ -257,7 +274,7 @@ async def send(self, event: RealtimeEvent, **kwargs: Any) -> None: ), ) ) - case "function_call": + case RealtimeFunctionCallEvent(): await self._send( _create_openai_realtime_client_event( event_type=SendEvents.CONVERSATION_ITEM_CREATE, @@ -273,7 +290,7 @@ async def send(self, event: RealtimeEvent, **kwargs: Any) -> None: ), ) ) - case "function_result": + case RealtimeFunctionResultEvent(): await self._send( _create_openai_realtime_client_event( event_type=SendEvents.CONVERSATION_ITEM_CREATE, @@ -284,8 +301,8 @@ async def send(self, event: RealtimeEvent, **kwargs: Any) -> None: ), ) ) - case "service": - data = event.event + case _: + data = event.service_event match event.service_type: case SendEvents.SESSION_UPDATE: if not data: diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py index f9f12e38ba7f..003a8699544d 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py +++ 
b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py @@ -47,7 +47,7 @@ class OpenAIRealtimeWebRTCBase(OpenAIRealtimeBase): """OpenAI WebRTC Realtime service.""" - protocol: ClassVar[Literal["webrtc"]] = "webrtc" + protocol: ClassVar[Literal["webrtc"]] = "webrtc" # type: ignore peer_connection: RTCPeerConnection | None = None data_channel: RTCDataChannel | None = None audio_track: MediaStreamTrack | None = None @@ -149,6 +149,8 @@ async def _on_track(self, track: "MediaStreamTrack") -> None: # This is a MediaStreamTrack, so the type is AudioFrame # this might need to be updated if video becomes part of this frame: AudioFrame = await track.recv() # type: ignore + except asyncio.CancelledError: + break except Exception as e: logger.error(f"Error getting audio frame: {e!s}") break @@ -163,6 +165,7 @@ async def _on_track(self, track: "MediaStreamTrack") -> None: await self._receive_buffer.put( RealtimeAudioEvent( audio=AudioContent(data=frame.to_ndarray(), data_format="np.int16", inner_content=frame), + service_event=frame, service_type=ListenEvents.RESPONSE_AUDIO_DELTA, ), ) diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py index 508d307464d9..79fdd0371a90 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py @@ -34,7 +34,7 @@ class OpenAIRealtimeWebsocketBase(OpenAIRealtimeBase): """OpenAI Realtime service.""" - protocol: ClassVar[Literal["websocket"]] = "websocket" + protocol: ClassVar[Literal["websocket"]] = "websocket" # type: ignore connection: AsyncRealtimeConnection | None = None connected: asyncio.Event = Field(default_factory=asyncio.Event) @@ -52,16 +52,14 @@ async def receive( audio_bytes = base64.b64decode(event.delta) if 
self.audio_output_callback: await self.audio_output_callback(np.frombuffer(audio_bytes, dtype=np.int16)) - try: - yield RealtimeAudioEvent( - audio=AudioContent(data=audio_bytes, data_format="base64", inner_content=event), - service_type=event.type, - ) - except Exception as e: - logger.error(f"Error processing remote audio frame: {e!s}") - else: - async for event in self._parse_event(event): - yield event + yield RealtimeAudioEvent( + audio=AudioContent(data=audio_bytes, data_format="base64", inner_content=event), + service_type=event.type, + service_event=event, + ) + continue + async for event in self._parse_event(event): + yield event async def _send(self, event: RealtimeClientEvent) -> None: await self.connected.wait() diff --git a/python/semantic_kernel/contents/events/__init__.py b/python/semantic_kernel/contents/events/__init__.py index 5d6dd52d44bb..1da1f993c4c3 100644 --- a/python/semantic_kernel/contents/events/__init__.py +++ b/python/semantic_kernel/contents/events/__init__.py @@ -6,7 +6,6 @@ RealtimeFunctionCallEvent, RealtimeFunctionResultEvent, RealtimeImageEvent, - RealtimeServiceEvent, RealtimeTextEvent, ) @@ -16,6 +15,5 @@ "RealtimeFunctionCallEvent", "RealtimeFunctionResultEvent", "RealtimeImageEvent", - "RealtimeServiceEvent", "RealtimeTextEvent", ] diff --git a/python/semantic_kernel/contents/events/realtime_event.py b/python/semantic_kernel/contents/events/realtime_event.py index 6ce3698f26dc..f96f4684ae14 100644 --- a/python/semantic_kernel/contents/events/realtime_event.py +++ b/python/semantic_kernel/contents/events/realtime_event.py @@ -1,6 +1,6 @@ # Copyright (c) Microsoft. All rights reserved. 
-from typing import Annotated, Any, ClassVar, Literal, TypeAlias, Union +from typing import Any, ClassVar, Literal from pydantic import Field @@ -11,62 +11,45 @@ from semantic_kernel.contents.text_content import TextContent from semantic_kernel.kernel_pydantic import KernelBaseModel -RealtimeEvent: TypeAlias = Annotated[ - Union[ - "RealtimeServiceEvent", - "RealtimeAudioEvent", - "RealtimeTextEvent", - "RealtimeFunctionCallEvent", - "RealtimeFunctionResultEvent", - "RealtimeImageEvent", - ], - Field(discriminator="event_type"), -] - -class RealtimeServiceEvent(KernelBaseModel): +class RealtimeEvent(KernelBaseModel): """Base class for all service events.""" - event: Any | None = Field(default=None, description="The event content.") + service_event: Any | None = Field(default=None, description="The event content.") service_type: str event_type: ClassVar[Literal["service"]] = "service" -class RealtimeAudioEvent(KernelBaseModel): +class RealtimeAudioEvent(RealtimeEvent): """Audio event type.""" + event_type: ClassVar[Literal["audio"]] = "audio" # type: ignore audio: AudioContent = Field(..., description="Audio content.") - service_type: str | None = None - event_type: ClassVar[Literal["audio"]] = "audio" -class RealtimeTextEvent(KernelBaseModel): +class RealtimeTextEvent(RealtimeEvent): """Text event type.""" + event_type: ClassVar[Literal["text"]] = "text" # type: ignore text: TextContent = Field(..., description="Text content.") - service_type: str | None = None - event_type: ClassVar[Literal["text"]] = "text" -class RealtimeFunctionCallEvent(KernelBaseModel): +class RealtimeFunctionCallEvent(RealtimeEvent): """Function call event type.""" + event_type: ClassVar[Literal["function_call"]] = "function_call" # type: ignore function_call: FunctionCallContent = Field(..., description="Function call content.") - service_type: str | None = None - event_type: ClassVar[Literal["function_call"]] = "function_call" -class RealtimeFunctionResultEvent(KernelBaseModel): +class 
RealtimeFunctionResultEvent(RealtimeEvent): """Function result event type.""" + event_type: ClassVar[Literal["function_result"]] = "function_result" # type: ignore function_result: FunctionResultContent = Field(..., description="Function result content.") - service_type: str | None = None - event_type: ClassVar[Literal["function_result"]] = "function_result" -class RealtimeImageEvent(KernelBaseModel): +class RealtimeImageEvent(RealtimeEvent): """Image event type.""" + event_type: ClassVar[Literal["image"]] = "image" # type: ignore image: ImageContent = Field(..., description="Image content.") - service_type: str | None = None - event_type: ClassVar[Literal["image"]] = "image" diff --git a/python/uv.lock b/python/uv.lock index b4507e84efff..b676cf25572f 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -2,18 +2,18 @@ version = 1 revision = 1 requires-python = ">=3.10" resolution-markers = [ - "python_full_version < '3.11' and sys_platform == 'darwin'", - "python_full_version == '3.11.*' and sys_platform == 'darwin'", - "python_full_version == '3.12.*' and sys_platform == 'darwin'", "python_full_version >= '3.13' and sys_platform == 'darwin'", - "python_full_version < '3.11' and sys_platform == 'linux'", - "python_full_version == '3.11.*' and sys_platform == 'linux'", - "python_full_version == '3.12.*' and sys_platform == 'linux'", + "python_full_version == '3.12.*' and sys_platform == 'darwin'", + "python_full_version == '3.11.*' and sys_platform == 'darwin'", + "python_full_version < '3.11' and sys_platform == 'darwin'", "python_full_version >= '3.13' and sys_platform == 'linux'", - "python_full_version < '3.11' and sys_platform == 'win32'", - "python_full_version == '3.11.*' and sys_platform == 'win32'", - "python_full_version == '3.12.*' and sys_platform == 'win32'", + "python_full_version == '3.12.*' and sys_platform == 'linux'", + "python_full_version == '3.11.*' and sys_platform == 'linux'", + "python_full_version < '3.11' and sys_platform == 'linux'", 
"python_full_version >= '3.13' and sys_platform == 'win32'", + "python_full_version == '3.12.*' and sys_platform == 'win32'", + "python_full_version == '3.11.*' and sys_platform == 'win32'", + "python_full_version < '3.11' and sys_platform == 'win32'", ] supported-markers = [ "sys_platform == 'darwin'", @@ -145,7 +145,7 @@ wheels = [ [[package]] name = "aiortc" -version = "1.9.0" +version = "1.10.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aioice", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, @@ -157,21 +157,15 @@ dependencies = [ { name = "pylibsrtp", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "pyopenssl", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/71/32/e9b01e2271124643e5dc15c273f2bb8155efebf5bc2115407441ac62f4c5/aiortc-1.9.0.tar.gz", hash = "sha256:03faa76d76ef0e5989ac10386898b029369756102217230e2fcd4b029c50b303", size = 1168973 } +sdist = { url = "https://files.pythonhosted.org/packages/8a/f8/408e092748521889c9d33dddcef920afd9891cf6db4615ba6b6bfe114ff8/aiortc-1.10.1.tar.gz", hash = "sha256:64926ad86bde20c1a4dacb7c3a164e57b522606b70febe261fada4acf79641b5", size = 1179406 } wheels = [ - { url = "https://files.pythonhosted.org/packages/93/01/db89910fc4dfb72ca25fd9a41326762a490d93d39d2fc4aac3f86c05857d/aiortc-1.9.0-cp38-abi3-macosx_10_9_x86_64.whl", hash = "sha256:e3e67c1970c2cffacac53c8f161df264efc62b22721c64a621940935028ee087", size = 1216069 }, - { url = "https://files.pythonhosted.org/packages/4c/6d/76ed96521080492c7264eacf73a8cba2202f1ff9f59af1776c5a2532f332/aiortc-1.9.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:d893cb3d4ffa0ff4f9bb03a88f0a700cdbcd4c0dc060a46c59a27ccd1c890663", size = 896012 }, - { url = 
"https://files.pythonhosted.org/packages/8c/87/1f666108764fa5b557bed4f0fd5e2acccd739bb2cca2b766dcacb53e5669/aiortc-1.9.0-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:176b4eb38d833667f87cf719a7a3e105e25a35b138b30893294418c1c96e38db", size = 1779113 }, - { url = "https://files.pythonhosted.org/packages/32/03/f3233e936f7a81549bd95f33f3d304e2a9211cb35d819d74570c0718b1ac/aiortc-1.9.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c44b610f36b8d17123855dfbe915fa6874201765b8a2c7fd9cf72d14cf417740", size = 1896322 }, - { url = "https://files.pythonhosted.org/packages/96/99/6672cf57777801c6ddacc13e1ee07f8c2151d0847a4f81455eeec998eaed/aiortc-1.9.0-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:55505adb31d56cba19a1ef8ad6aa9b727ccdba2a83bfbfb4aa79ef3c472026a6", size = 1918600 }, - { url = "https://files.pythonhosted.org/packages/76/e3/bdb76e7e51bc4fc7a5869597de2effad073ccf5ef14de3aed742d7384107/aiortc-1.9.0-cp38-abi3-win32.whl", hash = "sha256:680b703e35870e301535c930bfe32e7d012224a91ce51531aba45a3124ef07cc", size = 923055 }, - { url = "https://files.pythonhosted.org/packages/6a/df/de098b31a3fbf1117f6d4cb84c14518636054e3c95a9d9f693a1123c95b3/aiortc-1.9.0-cp38-abi3-win_amd64.whl", hash = "sha256:de5e7020cfc2d2d9fb95690926ff2e3b3c30cd4f5f5bc68d5b6756a8eebb686e", size = 1009610 }, - { url = "https://files.pythonhosted.org/packages/95/26/c382db590897fe638254f948d8514772d13ff59b5ada0a71d87322f48c52/aiortc-1.9.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:34c516ae4e70e8f64494305057af09311444325722fe6938ec38dd1e111adca9", size = 1209093 }, - { url = "https://files.pythonhosted.org/packages/68/48/2fe7de04461fdc4aee8c78c67cfe03579eaa72fb215c4b063acaeb4fd118/aiortc-1.9.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:40e61c1b84914d6f4c2968ff49353a22eed9419de74b151237cdb71af431209c", size = 888818 }, - { url = 
"https://files.pythonhosted.org/packages/da/d5/94bf7ed6189c316ffef930787cba009387f9bcd2f1c482392b71cca3918d/aiortc-1.9.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1924e130a441507b1315956aff05c504a274f1a09802def225d0f3a3d1870320", size = 1732549 }, - { url = "https://files.pythonhosted.org/packages/e7/0a/6495c696cd7f806bafe511fb27203ce918947c4461398384a4e6bd4b7e57/aiortc-1.9.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cbb62950e396c311e398925149fa76bc90b8d6525b4eccf28cba704e7ded8bf5", size = 1843911 }, - { url = "https://files.pythonhosted.org/packages/82/36/ffd0f74c73fa6abca0b76bd38473ed7d82dfbada7e57c6efe2a37ee40483/aiortc-1.9.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5234177e8d3126a0190ed9b6f8d0288daedcc0158c45cc279b4e6ac7d97f43f8", size = 1868240 }, - { url = "https://files.pythonhosted.org/packages/fb/46/8cb087a11f2f2d1139bd7e21615cc082097bffc4990d43c9f45f9cf6c8bf/aiortc-1.9.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:0e31575eb050aa68e0ea4c519aef101770b2297954f49e64a5c3d73ef27702ea", size = 1004186 }, + { url = "https://files.pythonhosted.org/packages/0a/6b/74547a30d1ddcc81f905ef4ff7fcc2c89b7482cb2045688f2aaa4fa918aa/aiortc-1.10.1-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:3bef536f38394b518aefae9dbf9cdd08f39e4c425f316f9692f0d8dc724810bd", size = 1218457 }, + { url = "https://files.pythonhosted.org/packages/46/92/b4ccf39cd18e366ace2a11dc7d98ed55967b4b325707386b5788149db15e/aiortc-1.10.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:8842c02e38513d9432ef22982572833487bb015f23348fa10a690616dbf55143", size = 898855 }, + { url = "https://files.pythonhosted.org/packages/a4/e9/2676de48b493787d8b03129713e6bb2dfbacca2a565090f2a89cbad71f96/aiortc-1.10.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:954a420de01c0bf6b07a0c58b662029b1c4204ddbd8f5c4162bbdebd43f882b1", size = 
1750403 }, + { url = "https://files.pythonhosted.org/packages/c3/9d/ab6d09183cdaf5df060923d9bd5c9ed5fb1802661d9401dba35f3c85a57b/aiortc-1.10.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e7c0d46fb30307a9d7deb4b7d66f0b0e73b77a7221b063fb6dc78821a5d2aa1e", size = 1867886 }, + { url = "https://files.pythonhosted.org/packages/c2/71/0b5666e6b965dbd9a7f331aa827a6c3ab3eb4d582fefb686a7f4227b7954/aiortc-1.10.1-cp39-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:89582f6923046f79f15d9045f432bc78191eacc95f6bed18714e86ec935188d9", size = 1893709 }, + { url = "https://files.pythonhosted.org/packages/9d/0a/8c0c78fad79ef595a0ed6e2ab413900e6bd0eac65fc5c31c9d8736bff909/aiortc-1.10.1-cp39-abi3-win32.whl", hash = "sha256:d1cbe87f740b33ffaa8e905f21092773e74916be338b64b81c8b79af4c3847eb", size = 923265 }, + { url = "https://files.pythonhosted.org/packages/73/12/a27dd588a4988021da88cb4d338d8ee65ac097afc14e9193ab0be4a48790/aiortc-1.10.1-cp39-abi3-win_amd64.whl", hash = "sha256:c9a5a0b23f8a77540068faec8837fa0a65b0396c20f09116bdb874b75e0b6abe", size = 1009488 }, ] [[package]] @@ -731,7 +725,7 @@ wheels = [ [[package]] name = "chromadb" -version = "0.6.2" +version = "0.6.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "bcrypt", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, @@ -763,9 +757,9 @@ dependencies = [ { name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "uvicorn", extra = ["standard"], marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/d1/c5/d2b4219fdee424e881608da681c3c63b73d68dc6667bd2df14a4d9bb308d/chromadb-0.6.2.tar.gz", hash = "sha256:e9e11f04d3850796711ee05dad4e918c75ec7b62ab9cbe7b4588b68a26aaea06", size = 19979649 } +sdist = { url = 
"https://files.pythonhosted.org/packages/39/cd/f0f2de3f466ff514fb6b58271c14f6d22198402bb5b71b8d890231265946/chromadb-0.6.3.tar.gz", hash = "sha256:c8f34c0b704b9108b04491480a36d42e894a960429f87c6516027b5481d59ed3", size = 29297929 } wheels = [ - { url = "https://files.pythonhosted.org/packages/bb/1c/2b77093f4191ad2d1ab70b9215cb6bc9f43350aa3e9e54a44304c8379335/chromadb-0.6.2-py3-none-any.whl", hash = "sha256:77a5e07097e36cdd49d8d2925d0c4d28291cabc9677787423d2cc7c426e8895b", size = 606162 }, + { url = "https://files.pythonhosted.org/packages/28/8e/5c186c77bf749b6fe0528385e507e463f1667543328d76fd00a49e1a4e6a/chromadb-0.6.3-py3-none-any.whl", hash = "sha256:4851258489a3612b558488d98d09ae0fe0a28d5cad6bd1ba64b96fdc419dc0e5", size = 611129 }, ] [[package]] @@ -773,7 +767,7 @@ name = "click" version = "8.1.8" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "colorama", marker = "platform_system == 'Windows' and sys_platform == 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/b9/2e/0090cbf739cee7d23781ad4b89a9894a41538e4fcf4c31dcdd705b78eb8b/click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a", size = 226593 } wheels = [ @@ -964,27 +958,27 @@ wheels = [ [[package]] name = "debugpy" -version = "1.8.11" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/bc/e7/666f4c9b0e24796af50aadc28d36d21c2e01e831a934535f956e09b3650c/debugpy-1.8.11.tar.gz", hash = "sha256:6ad2688b69235c43b020e04fecccdf6a96c8943ca9c2fb340b8adc103c655e57", size = 1640124 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/26/e6/4cf7422eaa591b4c7d6a9fde224095dac25283fdd99d90164f28714242b0/debugpy-1.8.11-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:2b26fefc4e31ff85593d68b9022e35e8925714a10ab4858fb1b577a8a48cb8cd", size = 2075100 }, - { url = 
"https://files.pythonhosted.org/packages/83/3a/e163de1df5995d95760a4d748b02fbefb1c1bf19e915b664017c40435dbf/debugpy-1.8.11-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:61bc8b3b265e6949855300e84dc93d02d7a3a637f2aec6d382afd4ceb9120c9f", size = 3559724 }, - { url = "https://files.pythonhosted.org/packages/27/6c/327e19fd1bf428a1efe1a6f97b306689c54c2cebcf871b66674ead718756/debugpy-1.8.11-cp310-cp310-win32.whl", hash = "sha256:c928bbf47f65288574b78518449edaa46c82572d340e2750889bbf8cd92f3737", size = 5178068 }, - { url = "https://files.pythonhosted.org/packages/49/80/359ff8aa388f0bd4a48f0fa9ce3606396d576657ac149c6fba3cc7de8adb/debugpy-1.8.11-cp310-cp310-win_amd64.whl", hash = "sha256:8da1db4ca4f22583e834dcabdc7832e56fe16275253ee53ba66627b86e304da1", size = 5210109 }, - { url = "https://files.pythonhosted.org/packages/7c/58/8e3f7ec86c1b7985a232667b5df8f3b1b1c8401028d8f4d75e025c9556cd/debugpy-1.8.11-cp311-cp311-macosx_14_0_universal2.whl", hash = "sha256:85de8474ad53ad546ff1c7c7c89230db215b9b8a02754d41cb5a76f70d0be296", size = 2173656 }, - { url = "https://files.pythonhosted.org/packages/d2/03/95738a68ade2358e5a4d63a2fd8e7ed9ad911001cfabbbb33a7f81343945/debugpy-1.8.11-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ffc382e4afa4aee367bf413f55ed17bd91b191dcaf979890af239dda435f2a1", size = 3132464 }, - { url = "https://files.pythonhosted.org/packages/ca/f4/18204891ab67300950615a6ad09b9de236203a9138f52b3b596fa17628ca/debugpy-1.8.11-cp311-cp311-win32.whl", hash = "sha256:40499a9979c55f72f4eb2fc38695419546b62594f8af194b879d2a18439c97a9", size = 5103637 }, - { url = "https://files.pythonhosted.org/packages/3b/90/3775e301cfa573b51eb8a108285681f43f5441dc4c3916feed9f386ef861/debugpy-1.8.11-cp311-cp311-win_amd64.whl", hash = "sha256:987bce16e86efa86f747d5151c54e91b3c1e36acc03ce1ddb50f9d09d16ded0e", size = 5127862 }, - { url = 
"https://files.pythonhosted.org/packages/c6/ae/2cf26f3111e9d94384d9c01e9d6170188b0aeda15b60a4ac6457f7c8a26f/debugpy-1.8.11-cp312-cp312-macosx_14_0_universal2.whl", hash = "sha256:84e511a7545d11683d32cdb8f809ef63fc17ea2a00455cc62d0a4dbb4ed1c308", size = 2498756 }, - { url = "https://files.pythonhosted.org/packages/b0/16/ec551789d547541a46831a19aa15c147741133da188e7e6acf77510545a7/debugpy-1.8.11-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce291a5aca4985d82875d6779f61375e959208cdf09fcec40001e65fb0a54768", size = 4219136 }, - { url = "https://files.pythonhosted.org/packages/72/6f/b2b3ce673c55f882d27a6eb04a5f0c68bcad6b742ac08a86d8392ae58030/debugpy-1.8.11-cp312-cp312-win32.whl", hash = "sha256:28e45b3f827d3bf2592f3cf7ae63282e859f3259db44ed2b129093ca0ac7940b", size = 5224440 }, - { url = "https://files.pythonhosted.org/packages/77/09/b1f05be802c1caef5b3efc042fc6a7cadd13d8118b072afd04a9b9e91e06/debugpy-1.8.11-cp312-cp312-win_amd64.whl", hash = "sha256:44b1b8e6253bceada11f714acf4309ffb98bfa9ac55e4fce14f9e5d4484287a1", size = 5264578 }, - { url = "https://files.pythonhosted.org/packages/2e/66/931dc2479aa8fbf362dc6dcee707d895a84b0b2d7b64020135f20b8db1ed/debugpy-1.8.11-cp313-cp313-macosx_14_0_universal2.whl", hash = "sha256:8988f7163e4381b0da7696f37eec7aca19deb02e500245df68a7159739bbd0d3", size = 2483651 }, - { url = "https://files.pythonhosted.org/packages/10/07/6c171d0fe6b8d237e35598b742f20ba062511b3a4631938cc78eefbbf847/debugpy-1.8.11-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c1f6a173d1140e557347419767d2b14ac1c9cd847e0b4c5444c7f3144697e4e", size = 4213770 }, - { url = "https://files.pythonhosted.org/packages/89/f1/0711da6ac250d4fe3bf7b3e9b14b4a86e82a98b7825075c07e19bab8da3d/debugpy-1.8.11-cp313-cp313-win32.whl", hash = "sha256:bb3b15e25891f38da3ca0740271e63ab9db61f41d4d8541745cfc1824252cb28", size = 5223911 }, - { url = 
"https://files.pythonhosted.org/packages/56/98/5e27fa39050749ed460025bcd0034a0a5e78a580a14079b164cc3abdeb98/debugpy-1.8.11-cp313-cp313-win_amd64.whl", hash = "sha256:d8768edcbeb34da9e11bcb8b5c2e0958d25218df7a6e56adf415ef262cd7b6d1", size = 5264166 }, - { url = "https://files.pythonhosted.org/packages/77/0a/d29a5aacf47b4383ed569b8478c02d59ee3a01ad91224d2cff8562410e43/debugpy-1.8.11-py2.py3-none-any.whl", hash = "sha256:0e22f846f4211383e6a416d04b4c13ed174d24cc5d43f5fd52e7821d0ebc8920", size = 5226874 }, +version = "1.8.12" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/68/25/c74e337134edf55c4dfc9af579eccb45af2393c40960e2795a94351e8140/debugpy-1.8.12.tar.gz", hash = "sha256:646530b04f45c830ceae8e491ca1c9320a2d2f0efea3141487c82130aba70dce", size = 1641122 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/56/19/dd58334c0a1ec07babf80bf29fb8daf1a7ca4c1a3bbe61548e40616ac087/debugpy-1.8.12-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:a2ba7ffe58efeae5b8fad1165357edfe01464f9aef25e814e891ec690e7dd82a", size = 2076091 }, + { url = "https://files.pythonhosted.org/packages/4c/37/bde1737da15f9617d11ab7b8d5267165f1b7dae116b2585a6643e89e1fa2/debugpy-1.8.12-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cbbd4149c4fc5e7d508ece083e78c17442ee13b0e69bfa6bd63003e486770f45", size = 3560717 }, + { url = "https://files.pythonhosted.org/packages/d9/ca/bc67f5a36a7de072908bc9e1156c0f0b272a9a2224cf21540ab1ffd71a1f/debugpy-1.8.12-cp310-cp310-win32.whl", hash = "sha256:b202f591204023b3ce62ff9a47baa555dc00bb092219abf5caf0e3718ac20e7c", size = 5180672 }, + { url = "https://files.pythonhosted.org/packages/c1/b9/e899c0a80dfa674dbc992f36f2b1453cd1ee879143cdb455bc04fce999da/debugpy-1.8.12-cp310-cp310-win_amd64.whl", hash = "sha256:9649eced17a98ce816756ce50433b2dd85dfa7bc92ceb60579d68c053f98dff9", size = 5212702 }, + { url = 
"https://files.pythonhosted.org/packages/af/9f/5b8af282253615296264d4ef62d14a8686f0dcdebb31a669374e22fff0a4/debugpy-1.8.12-cp311-cp311-macosx_14_0_universal2.whl", hash = "sha256:36f4829839ef0afdfdd208bb54f4c3d0eea86106d719811681a8627ae2e53dd5", size = 2174643 }, + { url = "https://files.pythonhosted.org/packages/ef/31/f9274dcd3b0f9f7d1e60373c3fa4696a585c55acb30729d313bb9d3bcbd1/debugpy-1.8.12-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a28ed481d530e3138553be60991d2d61103ce6da254e51547b79549675f539b7", size = 3133457 }, + { url = "https://files.pythonhosted.org/packages/ab/ca/6ee59e9892e424477e0c76e3798046f1fd1288040b927319c7a7b0baa484/debugpy-1.8.12-cp311-cp311-win32.whl", hash = "sha256:4ad9a94d8f5c9b954e0e3b137cc64ef3f579d0df3c3698fe9c3734ee397e4abb", size = 5106220 }, + { url = "https://files.pythonhosted.org/packages/d5/1a/8ab508ab05ede8a4eae3b139bbc06ea3ca6234f9e8c02713a044f253be5e/debugpy-1.8.12-cp311-cp311-win_amd64.whl", hash = "sha256:4703575b78dd697b294f8c65588dc86874ed787b7348c65da70cfc885efdf1e1", size = 5130481 }, + { url = "https://files.pythonhosted.org/packages/ba/e6/0f876ecfe5831ebe4762b19214364753c8bc2b357d28c5d739a1e88325c7/debugpy-1.8.12-cp312-cp312-macosx_14_0_universal2.whl", hash = "sha256:7e94b643b19e8feb5215fa508aee531387494bf668b2eca27fa769ea11d9f498", size = 2500846 }, + { url = "https://files.pythonhosted.org/packages/19/64/33f41653a701f3cd2cbff8b41ebaad59885b3428b5afd0d93d16012ecf17/debugpy-1.8.12-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:086b32e233e89a2740c1615c2f775c34ae951508b28b308681dbbb87bba97d06", size = 4222181 }, + { url = "https://files.pythonhosted.org/packages/32/a6/02646cfe50bfacc9b71321c47dc19a46e35f4e0aceea227b6d205e900e34/debugpy-1.8.12-cp312-cp312-win32.whl", hash = "sha256:2ae5df899732a6051b49ea2632a9ea67f929604fd2b036613a9f12bc3163b92d", size = 5227017 }, + { url = 
"https://files.pythonhosted.org/packages/da/a6/10056431b5c47103474312cf4a2ec1001f73e0b63b1216706d5fef2531eb/debugpy-1.8.12-cp312-cp312-win_amd64.whl", hash = "sha256:39dfbb6fa09f12fae32639e3286112fc35ae976114f1f3d37375f3130a820969", size = 5267555 }, + { url = "https://files.pythonhosted.org/packages/cf/4d/7c3896619a8791effd5d8c31f0834471fc8f8fb3047ec4f5fc69dd1393dd/debugpy-1.8.12-cp313-cp313-macosx_14_0_universal2.whl", hash = "sha256:696d8ae4dff4cbd06bf6b10d671e088b66669f110c7c4e18a44c43cf75ce966f", size = 2485246 }, + { url = "https://files.pythonhosted.org/packages/99/46/bc6dcfd7eb8cc969a5716d858e32485eb40c72c6a8dc88d1e3a4d5e95813/debugpy-1.8.12-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:898fba72b81a654e74412a67c7e0a81e89723cfe2a3ea6fcd3feaa3395138ca9", size = 4218616 }, + { url = "https://files.pythonhosted.org/packages/03/dd/d7fcdf0381a9b8094da1f6a1c9f19fed493a4f8576a2682349b3a8b20ec7/debugpy-1.8.12-cp313-cp313-win32.whl", hash = "sha256:22a11c493c70413a01ed03f01c3c3a2fc4478fc6ee186e340487b2edcd6f4180", size = 5226540 }, + { url = "https://files.pythonhosted.org/packages/25/bd/ecb98f5b5fc7ea0bfbb3c355bc1dd57c198a28780beadd1e19915bf7b4d9/debugpy-1.8.12-cp313-cp313-win_amd64.whl", hash = "sha256:fdb3c6d342825ea10b90e43d7f20f01535a72b3a1997850c0c3cefa5c27a4a2c", size = 5267134 }, + { url = "https://files.pythonhosted.org/packages/38/c4/5120ad36405c3008f451f94b8f92ef1805b1e516f6ff870f331ccb3c4cc0/debugpy-1.8.12-py2.py3-none-any.whl", hash = "sha256:274b6a2040349b5c9864e475284bce5bb062e63dce368a394b8cc865ae3b00c6", size = 5229490 }, ] [[package]] @@ -1970,7 +1964,7 @@ name = "ipykernel" version = "6.29.5" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "appnope", marker = "sys_platform == 'darwin'" }, + { name = "appnope", marker = "platform_system == 'Darwin' and sys_platform == 'darwin'" }, { name = "comm", marker = "sys_platform == 'darwin' or sys_platform == 
'linux' or sys_platform == 'win32'" }, { name = "debugpy", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "ipython", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, @@ -2815,6 +2809,7 @@ name = "nvidia-cublas-cu12" version = "12.4.5.8" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/7f/7f/7fbae15a3982dc9595e49ce0f19332423b260045d0a6afe93cdbe2f1f624/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0f8aa1706812e00b9f19dfe0cdb3999b092ccb8ca168c0db5b8ea712456fd9b3", size = 363333771 }, { url = "https://files.pythonhosted.org/packages/ae/71/1c91302526c45ab494c23f61c7a84aa568b8c1f9d196efa5993957faf906/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl", hash = "sha256:2fc8da60df463fdefa81e323eef2e36489e1c94335b5358bcb38360adf75ac9b", size = 363438805 }, ] @@ -2823,6 +2818,7 @@ name = "nvidia-cuda-cupti-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/93/b5/9fb3d00386d3361b03874246190dfec7b206fd74e6e287b26a8fcb359d95/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:79279b35cf6f91da114182a5ce1864997fd52294a87a16179ce275773799458a", size = 12354556 }, { url = "https://files.pythonhosted.org/packages/67/42/f4f60238e8194a3106d06a058d494b18e006c10bb2b915655bd9f6ea4cb1/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:9dec60f5ac126f7bb551c055072b69d85392b13311fcc1bcda2202d172df30fb", size = 13813957 }, ] @@ -2831,6 +2827,7 @@ name = "nvidia-cuda-nvrtc-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/77/aa/083b01c427e963ad0b314040565ea396f914349914c298556484f799e61b/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", 
hash = "sha256:0eedf14185e04b76aa05b1fea04133e59f465b6f960c0cbf4e37c3cb6b0ea198", size = 24133372 }, { url = "https://files.pythonhosted.org/packages/2c/14/91ae57cd4db3f9ef7aa99f4019cfa8d54cb4caa7e00975df6467e9725a9f/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a178759ebb095827bd30ef56598ec182b85547f1508941a3d560eb7ea1fbf338", size = 24640306 }, ] @@ -2839,6 +2836,7 @@ name = "nvidia-cuda-runtime-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/a1/aa/b656d755f474e2084971e9a297def515938d56b466ab39624012070cb773/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:961fe0e2e716a2a1d967aab7caee97512f71767f852f67432d572e36cb3a11f3", size = 894177 }, { url = "https://files.pythonhosted.org/packages/ea/27/1795d86fe88ef397885f2e580ac37628ed058a92ed2c39dc8eac3adf0619/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:64403288fa2136ee8e467cdc9c9427e0434110899d07c779f25b5c068934faa5", size = 883737 }, ] @@ -2861,6 +2859,7 @@ dependencies = [ { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, ] wheels = [ + { url = "https://files.pythonhosted.org/packages/7a/8a/0e728f749baca3fbeffad762738276e5df60851958be7783af121a7221e7/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5dad8008fc7f92f5ddfa2101430917ce2ffacd86824914c82e28990ad7f00399", size = 211422548 }, { url = "https://files.pythonhosted.org/packages/27/94/3266821f65b92b3138631e9c8e7fe1fb513804ac934485a8d05776e1dd43/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9", size = 211459117 }, ] @@ -2869,6 +2868,7 @@ name = "nvidia-curand-cu12" version = "10.3.5.147" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = 
"https://files.pythonhosted.org/packages/80/9c/a79180e4d70995fdf030c6946991d0171555c6edf95c265c6b2bf7011112/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1f173f09e3e3c76ab084aba0de819c49e56614feae5c12f69883f4ae9bb5fad9", size = 56314811 }, { url = "https://files.pythonhosted.org/packages/8a/6d/44ad094874c6f1b9c654f8ed939590bdc408349f137f9b98a3a23ccec411/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a88f583d4e0bb643c49743469964103aa59f7f708d862c3ddb0fc07f851e3b8b", size = 56305206 }, ] @@ -2882,6 +2882,7 @@ dependencies = [ { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, ] wheels = [ + { url = "https://files.pythonhosted.org/packages/46/6b/a5c33cf16af09166845345275c34ad2190944bcc6026797a39f8e0a282e0/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d338f155f174f90724bbde3758b7ac375a70ce8e706d70b018dd3375545fc84e", size = 127634111 }, { url = "https://files.pythonhosted.org/packages/3a/e1/5b9089a4b2a4790dfdea8b3a006052cfecff58139d5a4e34cb1a51df8d6f/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260", size = 127936057 }, ] @@ -2893,6 +2894,7 @@ dependencies = [ { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, ] wheels = [ + { url = "https://files.pythonhosted.org/packages/96/a9/c0d2f83a53d40a4a41be14cea6a0bf9e668ffcf8b004bd65633f433050c0/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_aarch64.whl", hash = "sha256:9d32f62896231ebe0480efd8a7f702e143c98cfaa0e8a76df3386c1ba2b54df3", size = 207381987 }, { url = "https://files.pythonhosted.org/packages/db/f7/97a9ea26ed4bbbfc2d470994b8b4f338ef663be97b8f677519ac195e113d/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1", size = 207454763 }, ] @@ -2901,6 +2903,7 @@ name = 
"nvidia-cusparselt-cu12" version = "0.6.2" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/98/8e/675498726c605c9441cf46653bd29cb1b8666da1fb1469ffa25f67f20c58/nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_aarch64.whl", hash = "sha256:067a7f6d03ea0d4841c85f0c6f1991c5dda98211f6302cb83a4ab234ee95bef8", size = 149422781 }, { url = "https://files.pythonhosted.org/packages/78/a8/bcbb63b53a4b1234feeafb65544ee55495e1bb37ec31b999b963cbccfd1d/nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_x86_64.whl", hash = "sha256:df2c24502fd76ebafe7457dbc4716b2fec071aabaed4fb7691a201cde03704d9", size = 150057751 }, ] @@ -2917,6 +2920,7 @@ name = "nvidia-nvjitlink-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/02/45/239d52c05074898a80a900f49b1615d81c07fceadd5ad6c4f86a987c0bc4/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:4abe7fef64914ccfa909bc2ba39739670ecc9e820c83ccc7a6ed414122599b83", size = 20552510 }, { url = "https://files.pythonhosted.org/packages/ff/ff/847841bacfbefc97a00036e0fce5a0f086b640756dc38caea5e1bb002655/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:06b3b9b25bf3f8af351d664978ca26a16d2c5127dbd53c0497e28d1fb9611d57", size = 21066810 }, ] @@ -2925,6 +2929,7 @@ name = "nvidia-nvtx-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/06/39/471f581edbb7804b39e8063d92fc8305bdc7a80ae5c07dbe6ea5c50d14a5/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7959ad635db13edf4fc65c06a6e9f9e55fc2f92596db928d169c0bb031e88ef3", size = 100417 }, { url = "https://files.pythonhosted.org/packages/87/20/199b8713428322a2f22b722c62b8cc278cc53dffa9705d744484b5035ee9/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = 
"sha256:781e950d9b9f60d8241ccea575b32f5105a5baf4c2351cab5256a24869f12a1a", size = 99144 }, ] @@ -3522,7 +3527,7 @@ name = "portalocker" version = "2.10.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pywin32", marker = "sys_platform == 'win32'" }, + { name = "pywin32", marker = "platform_system == 'Windows' and sys_platform == 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/ed/d3/c6c64067759e87af98cc668c1cc75171347d0f1577fab7ca3749134e3cd4/portalocker-2.10.1.tar.gz", hash = "sha256:ef1bf844e878ab08aee7e40184156e1151f228f103aa5c6bd0724cc330960f8f", size = 40891 } wheels = [ @@ -4042,25 +4047,20 @@ crypto = [ [[package]] name = "pylibsrtp" -version = "0.10.0" +version = "0.11.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "cffi", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/6b/ae/c95199144eed954976223bdce3f94564eb6c43567111aff8048a26a429bd/pylibsrtp-0.10.0.tar.gz", hash = "sha256:d8001912d7f51bd05b4ea3551747930631777fd37892cf3bfe0e541a742e699f", size = 10557 } +sdist = { url = "https://files.pythonhosted.org/packages/2e/49/1c5101ecfeda540699e0754dddfc91c401fbf736ebe99d66e59fe3dad2ba/pylibsrtp-0.11.0.tar.gz", hash = "sha256:5a8d19b1448baebde5ae3cedfa51f10e8ada3d9d99f43046ced0ecf1c105b8ec", size = 10786 } wheels = [ - { url = "https://files.pythonhosted.org/packages/1c/d2/ffc24f80e83a54d9b309cdae6b31cf9294b4f3a85ab107827fd272d1e687/pylibsrtp-0.10.0-cp38-abi3-macosx_10_9_x86_64.whl", hash = "sha256:6a1121ceea3339e0a84842a4a9da0fcf57cc8f99eb60dbf31a46d978b4170e7c", size = 1704188 }, - { url = "https://files.pythonhosted.org/packages/66/3e/db86a09a5cb290a274f76ce25f4fae3a7e3c4a4dbc64baf7e2aaa57a32bb/pylibsrtp-0.10.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:ca1994e73c6857b0a695fdde94cc5ac846c1b0d5d8766255a1dc2db40857f667", size = 2028580 }, - { url = 
"https://files.pythonhosted.org/packages/21/ab/9b2b5ad2ceaa1660de16e0a2e3c54a2043a9c4a3eef7718930c78dc84e77/pylibsrtp-0.10.0-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb7640b524544603d07bd4373b04c9582c8cfe41d9789d3f492081f053bed9c1", size = 2484470 }, - { url = "https://files.pythonhosted.org/packages/ab/e6/b0a30e79aa2312834b33f5e9c0ad459fc94e195c610634ee9665fafb1fc8/pylibsrtp-0.10.0-cp38-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f13aa945e1dcf8c138bf3d4a6e34056c4c2f69bf9934bc53b320ef14c7317ccc", size = 2078367 }, - { url = "https://files.pythonhosted.org/packages/16/78/9ea0c88490ad4fe9683ddf3bbee702c7a2331e83a333bb3aa52e8d7d909b/pylibsrtp-0.10.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1b2ef1c32d1145239dd0fe7b7fbe083334d345df6b4597fc66faf914a32682d9", size = 2134898 }, - { url = "https://files.pythonhosted.org/packages/00/f6/c76fa5401f9d95c14db70de0cf4fad922ad61686843bc3e7411178a64bc8/pylibsrtp-0.10.0-cp38-abi3-win32.whl", hash = "sha256:8c6fe2576b2ab13942b47db6c2ffe71f5eb1edc1dc3bdd7283169fecd5249e74", size = 1130881 }, - { url = "https://files.pythonhosted.org/packages/4c/31/85a58625edc0b6967fe0904c9d89d019bcece3f3e3bf775b9151a8cf9d0d/pylibsrtp-0.10.0-cp38-abi3-win_amd64.whl", hash = "sha256:cd965d4b0e9a77b362526cab119f4d9ce39b83f1f20f46c6af8e694b86fa19a7", size = 1448840 }, - { url = "https://files.pythonhosted.org/packages/66/b5/30b57cac6adf93dfee20cceba6cd91e216c81b723df2bc9dcfe781456263/pylibsrtp-0.10.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:582e9771be7ffd060faea215cb4248afdad1356da473df1b8f35c7e382ca3871", size = 1699981 }, - { url = "https://files.pythonhosted.org/packages/16/e8/3846ac56ae4a2de91e9b3e67dff5363b2b07148616d283416fd8dd8c6ca6/pylibsrtp-0.10.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:70111eeb87e5d3ffb9623e1ea036329dc81fed1282aa93c1f32377862ca0a0d8", size = 2441012 }, - { url = 
"https://files.pythonhosted.org/packages/b1/9f/c611fc47ef5d84dfffca0292bcfb2d78ee5fc1a98d50cf22dfcda3eee171/pylibsrtp-0.10.0-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eda06947ab42fd3737f01a7b98537a5d5908434d37c70488d10e7bd2ff0d520c", size = 2019497 }, - { url = "https://files.pythonhosted.org/packages/d8/38/90c897fc2f2929290ada1032fa3e0bd39eca9190503250f6724a7bc22b5b/pylibsrtp-0.10.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:511158499309c3f7e97e1ebeffbf3dd939e641ea553de43cfc02d3576aad5c15", size = 2074919 }, - { url = "https://files.pythonhosted.org/packages/2c/46/e92f8a8d7cb5c1d68ec85254a8535aad922efa15646c7ba0c7746b42c4ea/pylibsrtp-0.10.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:4033481f332331bf14b9705dca69efd09d3809ba4a2ff69914c53dddf39c20c1", size = 1446426 }, + { url = "https://files.pythonhosted.org/packages/b5/95/65650bf56e1080beb5f7c963a0bb11a6ee7599bfd89b33ff4525d2b5824b/pylibsrtp-0.11.0-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:36c6b33347d47c889b7dd465c6ae1f44d7705d00436ca613fd2a8f5dd401b104", size = 1727506 }, + { url = "https://files.pythonhosted.org/packages/4e/b0/f12c489ea8716e74343559abc5d0dfb94d66bcfe1924d64d58424a50f496/pylibsrtp-0.11.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:cf18b80f9513484a70e55136ece6ec80e7d21c03cc69abbb428e4f2745ca3cee", size = 2058008 }, + { url = "https://files.pythonhosted.org/packages/e1/2e/6040cd6da6f82f3aa1763c8c45f7fcfdfe08db5560c73f5e1deb4c36c2bb/pylibsrtp-0.11.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:81bbe0cd777979f7fc45c85f0c619c9cbe709faffbf91675d9dcce560734b353", size = 2566705 }, + { url = "https://files.pythonhosted.org/packages/2b/c9/fd313ac3a23e9c45493131d9fa3463770289e59bb8422c6c6877ab3add40/pylibsrtp-0.11.0-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:78fcdfe63925ea9a5017884c31fe9687b9b8b9f7d9beb7e25e3be47aa6ece495", size = 2168163 }, + { url = 
"https://files.pythonhosted.org/packages/f9/b3/ae0bac50cc0cca4b8c14de8063ba410ed3edd82c71a2315f284c9be7d679/pylibsrtp-0.11.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1909f7e781a7675d5c92cbad9e7ed3642e626e2bea5834243e423976e5420ac3", size = 2224343 }, + { url = "https://files.pythonhosted.org/packages/51/c4/650c2cecd5810f84adc89f3a94a28ea02d7ac8eaf3ee718a629c6f8ebf09/pylibsrtp-0.11.0-cp39-abi3-win32.whl", hash = "sha256:15123cecd377248747c95de9305ac314f3bcccdae46022bb4b9d60a552a26a10", size = 1156330 }, + { url = "https://files.pythonhosted.org/packages/fe/78/724307095b95c937e54c48133be3e85779cebea770f7536be555217b31f2/pylibsrtp-0.11.0-cp39-abi3-win_amd64.whl", hash = "sha256:bea2fb98029d19de516538b13c4827b6474d6f85d9ea50fae349e9671b946f7a", size = 1486448 }, ] [[package]] @@ -4970,6 +4970,7 @@ wheels = [ [[package]] name = "semantic-kernel" +version = "1.21.0" source = { editable = "." } dependencies = [ { name = "aiohttp", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, @@ -5027,7 +5028,7 @@ hugging-face = [ { name = "transformers", extra = ["torch"], marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] milvus = [ - { name = "milvus", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, + { name = "milvus", marker = "(platform_system != 'Windows' and sys_platform == 'darwin') or (platform_system != 'Windows' and sys_platform == 'linux') or (platform_system != 'Windows' and sys_platform == 'win32')" }, { name = "pymilvus", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] mistralai = [ @@ -5117,7 +5118,7 @@ requires-dist = [ { name = "google-generativeai", marker = "extra == 'google'", specifier = "~=0.8" }, { name = "ipykernel", marker = "extra == 'notebooks'", specifier = "~=6.29" }, { name = "jinja2", specifier = "~=3.1" }, - { name = "milvus", marker = "sys_platform != 'win32' and extra 
== 'milvus'", specifier = ">=2.3,<2.3.8" }, + { name = "milvus", marker = "platform_system != 'Windows' and extra == 'milvus'", specifier = ">=2.3,<2.3.8" }, { name = "mistralai", marker = "extra == 'mistralai'", specifier = ">=1.2,<2.0" }, { name = "motor", marker = "extra == 'mongo'", specifier = ">=3.3.2,<3.8.0" }, { name = "nest-asyncio", specifier = "~=1.6" }, @@ -5580,22 +5581,22 @@ dependencies = [ { name = "fsspec", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "jinja2", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "networkx", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusparselt-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and 
sys_platform == 'linux'" }, - { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux' and sys_platform == 'linux'" }, + { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux' and sys_platform == 'linux'" }, + { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux' and sys_platform == 'linux'" }, + { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux' and sys_platform == 'linux'" }, + { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux' and sys_platform == 'linux'" }, + { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux' and sys_platform == 'linux'" }, + { name = "nvidia-cusparselt-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux' and sys_platform == 'linux'" }, + { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux' and sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux' and sys_platform == 'linux'" }, + { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux' and sys_platform == 'linux'" }, { name = "setuptools", marker = "(python_full_version >= '3.12' and sys_platform == 'darwin') 
or (python_full_version >= '3.12' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform == 'win32')" }, { name = "sympy", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "triton", marker = "platform_machine == 'x86_64' and platform_system == 'Linux' and sys_platform == 'linux'" }, { name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] wheels = [ @@ -5640,7 +5641,7 @@ name = "tqdm" version = "4.67.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "colorama", marker = "platform_system == 'Windows' and sys_platform == 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737 } wheels = [ From b4d5482f273d9858a52b44a8dc2546bfc0a0390a Mon Sep 17 00:00:00 2001 From: Eduard van Valkenburg Date: Thu, 13 Feb 2025 17:10:42 +0100 Subject: [PATCH 27/50] WIP azure --- .../01-chat_with_realtime_websocket.py | 4 +- .../connectors/ai/open_ai/__init__.py | 2 + .../ai/open_ai/services/azure_config_base.py | 20 ++- .../ai/open_ai/services/azure_realtime.py | 167 ++++++++++++++++++ .../settings/azure_open_ai_settings.py | 7 + .../contents/events/realtime_event.py | 2 +- 6 files changed, 195 insertions(+), 7 deletions(-) create mode 100644 python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py diff --git a/python/samples/concepts/realtime/01-chat_with_realtime_websocket.py b/python/samples/concepts/realtime/01-chat_with_realtime_websocket.py index f276ed4c54a6..daf6133fdc3b 100644 --- 
a/python/samples/concepts/realtime/01-chat_with_realtime_websocket.py +++ b/python/samples/concepts/realtime/01-chat_with_realtime_websocket.py @@ -5,7 +5,7 @@ from samples.concepts.realtime.utils import AudioPlayerWebsocket, AudioRecorderWebsocket, check_audio_devices from semantic_kernel.connectors.ai.open_ai import ( - OpenAIRealtime, + AzureRealtime, OpenAIRealtimeExecutionSettings, ) from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( @@ -48,7 +48,7 @@ async def main() -> None: # you can define the protocol to use, either "websocket" or "webrtc" # they will behave the same way, even though the underlying protocol is quite different audio_player = AudioPlayerWebsocket() - realtime_client = OpenAIRealtime( + realtime_client = AzureRealtime( "websocket", audio_output_callback=audio_player.client_callback, ) diff --git a/python/semantic_kernel/connectors/ai/open_ai/__init__.py b/python/semantic_kernel/connectors/ai/open_ai/__init__.py index 4241ec1e49f3..6ff0a85dd341 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/__init__.py +++ b/python/semantic_kernel/connectors/ai/open_ai/__init__.py @@ -34,6 +34,7 @@ ) from semantic_kernel.connectors.ai.open_ai.services.azure_audio_to_text import AzureAudioToText from semantic_kernel.connectors.ai.open_ai.services.azure_chat_completion import AzureChatCompletion +from semantic_kernel.connectors.ai.open_ai.services.azure_realtime import AzureRealtime from semantic_kernel.connectors.ai.open_ai.services.azure_text_completion import AzureTextCompletion from semantic_kernel.connectors.ai.open_ai.services.azure_text_embedding import AzureTextEmbedding from semantic_kernel.connectors.ai.open_ai.services.azure_text_to_audio import AzureTextToAudio @@ -61,6 +62,7 @@ "AzureDataSourceParameters", "AzureEmbeddingDependency", "AzureOpenAISettings", + "AzureRealtime", "AzureTextCompletion", "AzureTextEmbedding", "AzureTextToAudio", diff --git 
a/python/semantic_kernel/connectors/ai/open_ai/services/azure_config_base.py b/python/semantic_kernel/connectors/ai/open_ai/services/azure_config_base.py index da50e4ee56b6..c60b08b030f1 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/azure_config_base.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/azure_config_base.py @@ -3,9 +3,11 @@ import logging from collections.abc import Awaitable, Callable, Mapping from copy import copy +from typing import Any from openai import AsyncAzureOpenAI from pydantic import ConfigDict, validate_call +from pydantic_core import Url from semantic_kernel.connectors.ai.open_ai.const import DEFAULT_AZURE_API_VERSION from semantic_kernel.connectors.ai.open_ai.services.open_ai_handler import OpenAIHandler, OpenAIModelTypes @@ -27,7 +29,7 @@ def __init__( deployment_name: str, ai_model_type: OpenAIModelTypes, endpoint: HttpsUrl | None = None, - base_url: HttpsUrl | None = None, + base_url: Url | None = None, api_version: str = DEFAULT_AZURE_API_VERSION, service_id: str | None = None, api_key: str | None = None, @@ -37,6 +39,7 @@ def __init__( default_headers: Mapping[str, str] | None = None, client: AsyncAzureOpenAI | None = None, instruction_role: str | None = None, + **kwargs: Any, ) -> None: """Internal class for configuring a connection to an Azure OpenAI service. @@ -47,7 +50,7 @@ def __init__( deployment_name (str): Name of the deployment. ai_model_type (OpenAIModelTypes): The type of OpenAI model to deploy. endpoint (HttpsUrl): The specific endpoint URL for the deployment. (Optional) - base_url (HttpsUrl): The base URL for Azure services. (Optional) + base_url (Url): The base URL for Azure services. (Optional) api_version (str): Azure API version. Defaults to the defined DEFAULT_AZURE_API_VERSION. service_id (str): Service ID for the deployment. (Optional) api_key (str): API key for Azure services. (Optional) @@ -59,6 +62,7 @@ def __init__( client (AsyncAzureOpenAI): An existing client to use. 
(Optional) instruction_role (str | None): The role to use for 'instruction' messages, for example, summarization prompts could use `developer` or `system`. (Optional) + kwargs: Additional keyword arguments. """ # Merge APP_INFO into the headers if it exists @@ -82,9 +86,17 @@ def __init__( if not base_url: if not endpoint: raise ServiceInitializationError("Please provide an endpoint or a base_url") - base_url = HttpsUrl(f"{str(endpoint).rstrip('/')}/openai/deployments/{deployment_name}") + if ai_model_type == OpenAIModelTypes.REALTIME: + # wss://my-eastus2-openai-resource.openai.azure.com/openai/realtime?api-version=2024-12-17&deployment=gpt-4o-mini-realtime-preview-deployment-name + temp_url = f"{str(endpoint).replace('https', 'wss').rstrip('/')}.openai.azure.com/openai/realtime" + else: + temp_url = f"{str(endpoint).rstrip('/')}/openai/deployments/{deployment_name}" + else: + # when supplying the base url instead of the endpoint, the developer should know what to use, + # when doing realtime, that includes using the wss protocol + temp_url = str(base_url) client = AsyncAzureOpenAI( - base_url=str(base_url), + base_url=temp_url, api_version=api_version, api_key=api_key, azure_ad_token=ad_token, diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py b/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py new file mode 100644 index 000000000000..f2556456ff24 --- /dev/null +++ b/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py @@ -0,0 +1,167 @@ +# Copyright (c) Microsoft. All rights reserved. 
+ +from collections.abc import Callable, Coroutine, Mapping +from typing import TYPE_CHECKING, Any, ClassVar, Literal, TypeVar + +from numpy import ndarray +from openai import AsyncAzureOpenAI +from openai.lib.azure import AsyncAzureADTokenProvider +from pydantic import ValidationError + +from semantic_kernel.connectors.ai.open_ai.services.azure_config_base import AzureOpenAIConfigBase +from semantic_kernel.connectors.ai.open_ai.services.open_ai_model_types import OpenAIModelTypes +from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_base import OpenAIRealtimeBase +from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_webrtc import OpenAIRealtimeWebRTCBase +from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_websocket import ( + OpenAIRealtimeWebsocketBase, +) +from semantic_kernel.connectors.ai.open_ai.settings.azure_open_ai_settings import AzureOpenAISettings +from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError + +if TYPE_CHECKING: + from aiortc.mediastreams import MediaStreamTrack + + +_T = TypeVar("_T", bound="AzureRealtime") + + +__all__ = ["AzureRealtime"] + + +class AzureRealtime(OpenAIRealtimeBase): + """Azure OpenAI Realtime service.""" + + def __new__(cls: type["_T"], protocol: str, *args: Any, **kwargs: Any) -> "_T": + """Pick the right subclass, based on protocol.""" + subclass_map = {subcl.protocol: subcl for subcl in cls.__subclasses__()} + subclass = subclass_map[protocol] + return super(AzureRealtime, subclass).__new__(subclass) + + def __init__( + self, + protocol: Literal["websocket", "webrtc"], + *, + audio_output_callback: Callable[[ndarray], Coroutine[Any, Any, None]] | None = None, + audio_track: "MediaStreamTrack | None" = None, + service_id: str | None = None, + api_key: str | None = None, + deployment_name: str | None = None, + endpoint: str | None = None, + base_url: str | None = None, + api_version: str | None = None, + ad_token: 
str | None = None, + ad_token_provider: AsyncAzureADTokenProvider | None = None, + token_endpoint: str | None = None, + default_headers: Mapping[str, str] | None = None, + async_client: AsyncAzureOpenAI | None = None, + env_file_path: str | None = None, + env_file_encoding: str | None = None, + **kwargs: Any, + ) -> None: + """Initialize an OpenAIRealtime service. + + Args: + protocol: The protocol to use, must be either "websocket" or "webrtc". + audio_output_callback: The audio output callback, optional. + This should be a coroutine, that takes a ndarray with audio as input. + The goal of this function is to allow you to play the audio with the + least amount of latency possible. + It is called first in both websockets and webrtc. + Even when passed, the audio content will still be + added to the receiving queue. + audio_track: The audio track to use for the service, only used by WebRTC. + A default is supplied if not provided. + It can be any class that implements the AudioStreamTrack interface. + service_id (str | None): The service ID for the Azure deployment. (Optional) + api_key (str | None): The optional api key. If provided, will override the value in the + env vars or .env file. + deployment_name (str | None): The optional deployment. If provided, will override the value + (chat_deployment_name) in the env vars or .env file. + endpoint (str | None): The optional deployment endpoint. If provided will override the value + in the env vars or .env file. + base_url (str | None): The optional deployment base_url. If provided will override the value + in the env vars or .env file. + api_version (str | None): The optional deployment api version. If provided will override the value + in the env vars or .env file. + ad_token (str | None): The Azure Active Directory token. (Optional) + ad_token_provider (AsyncAzureADTokenProvider): The Azure Active Directory token provider. (Optional) + token_endpoint (str | None): The token endpoint to request an Azure token. 
(Optional) + default_headers (Mapping[str, str]): The default headers mapping of string keys to + string values for HTTP requests. (Optional) + async_client (AsyncAzureOpenAI | None): An existing client to use. (Optional) + env_file_path (str | None): Use the environment settings file as a fallback to + environment variables. (Optional) + env_file_encoding (str | None): The encoding of the environment settings file. (Optional) + kwargs: Additional arguments. + """ + try: + azure_openai_settings = AzureOpenAISettings.create( + api_key=api_key, + base_url=base_url, + endpoint=endpoint, + realtime_model_id=deployment_name, + api_version=api_version, + token_endpoint=token_endpoint, + env_file_path=env_file_path, + env_file_encoding=env_file_encoding, + ) + except ValidationError as ex: + raise ServiceInitializationError("Failed to create OpenAI settings.", ex) from ex + if not azure_openai_settings.realtime_deployment_name: + raise ServiceInitializationError("The OpenAI realtime model ID is required.") + if audio_track: + kwargs["audio_track"] = audio_track + super().__init__( + protocol=protocol, + audio_output_callback=audio_output_callback, + deployment_name=azure_openai_settings.realtime_deployment_name, + endpoint=azure_openai_settings.endpoint, + base_url=azure_openai_settings.base_url, + api_version=azure_openai_settings.api_version, + ad_token=ad_token, + ad_token_provider=ad_token_provider, + token_endpoint=azure_openai_settings.token_endpoint, + ai_model_type=OpenAIModelTypes.REALTIME, + service_id=service_id, + default_headers=default_headers, + client=async_client, + **kwargs, + ) + + +class AzureRealtimeWebRTC(AzureRealtime, OpenAIRealtimeWebRTCBase, AzureOpenAIConfigBase): + """OpenAI Realtime service using WebRTC protocol. + + This should not be used directly, use OpenAIRealtime instead. + Set protocol="webrtc" to use this class. 
+ """ + + protocol: ClassVar[Literal["webrtc"]] = "webrtc" + + def __init__( + self, + *args: Any, + **kwargs: Any, + ) -> None: + """Initialize an OpenAIRealtime service using WebRTC protocol.""" + raise NotImplementedError("Azure Realtime WebRTC is not yet supported.") + + +class AzureRealtimeWebsocket(AzureRealtime, OpenAIRealtimeWebsocketBase, AzureOpenAIConfigBase): + """OpenAI Realtime service using WebSocket protocol. + + This should not be used directly, use OpenAIRealtime instead. + Set protocol="websocket" to use this class. + """ + + protocol: ClassVar[Literal["websocket"]] = "websocket" + + def __init__( + self, + *args: Any, + **kwargs: Any, + ) -> None: + super().__init__( + *args, + **kwargs, + ) diff --git a/python/semantic_kernel/connectors/ai/open_ai/settings/azure_open_ai_settings.py b/python/semantic_kernel/connectors/ai/open_ai/settings/azure_open_ai_settings.py index a943757048c5..e6bb49cf3da0 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/settings/azure_open_ai_settings.py +++ b/python/semantic_kernel/connectors/ai/open_ai/settings/azure_open_ai_settings.py @@ -55,6 +55,12 @@ class AzureOpenAISettings(KernelBaseSettings): Resource Management > Deployments in the Azure portal or, alternatively, under Management > Deployments in Azure OpenAI Studio. (Env var AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME) + - realtime_deployment_name: str - The name of the Azure Realtime deployment. This value + will correspond to the custom name you chose for your deployment + when you deployed a model. This value can be found under + Resource Management > Deployments in the Azure portal or, alternatively, + under Management > Deployments in Azure OpenAI Studio. + (Env var AZURE_OPENAI_REALTIME_DEPLOYMENT_NAME) - api_key: SecretStr - The API key for the Azure deployment. This value can be found in the Keys & Endpoint section when examining your resource in the Azure portal. You can use either KEY1 or KEY2. 
@@ -85,6 +91,7 @@ class AzureOpenAISettings(KernelBaseSettings): text_to_image_deployment_name: str | None = None audio_to_text_deployment_name: str | None = None text_to_audio_deployment_name: str | None = None + realtime_deployment_name: str | None = None endpoint: HttpsUrl | None = None base_url: HttpsUrl | None = None api_key: SecretStr | None = None diff --git a/python/semantic_kernel/contents/events/realtime_event.py b/python/semantic_kernel/contents/events/realtime_event.py index f96f4684ae14..7ebfa231fa08 100644 --- a/python/semantic_kernel/contents/events/realtime_event.py +++ b/python/semantic_kernel/contents/events/realtime_event.py @@ -16,7 +16,7 @@ class RealtimeEvent(KernelBaseModel): """Base class for all service events.""" service_event: Any | None = Field(default=None, description="The event content.") - service_type: str + service_type: str | None = None event_type: ClassVar[Literal["service"]] = "service" From ca80839a9658da5e43d6c75f27d9bb8323770561 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Fri, 14 Feb 2025 12:06:37 +0100 Subject: [PATCH 28/50] working azure realtime websockets --- python/.vscode/launch.json | 2 +- .../ai/open_ai/services/azure_config_base.py | 45 ++++++++-------- .../ai/open_ai/services/azure_realtime.py | 4 +- .../settings/azure_open_ai_settings.py | 3 +- python/uv.lock | 52 +++++++------------ 5 files changed, 49 insertions(+), 57 deletions(-) diff --git a/python/.vscode/launch.json b/python/.vscode/launch.json index 831aaf5149bc..80145e18a817 100644 --- a/python/.vscode/launch.json +++ b/python/.vscode/launch.json @@ -10,7 +10,7 @@ "request": "launch", "program": "${file}", "console": "integratedTerminal", - "justMyCode": true + "justMyCode": false }, { "name": "Python FastAPI app with Dapr", diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/azure_config_base.py b/python/semantic_kernel/connectors/ai/open_ai/services/azure_config_base.py index c60b08b030f1..94d8691534fa 100644 --- 
a/python/semantic_kernel/connectors/ai/open_ai/services/azure_config_base.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/azure_config_base.py @@ -83,26 +83,29 @@ def __init__( "Please provide either api_key, ad_token or ad_token_provider or a client." ) - if not base_url: - if not endpoint: - raise ServiceInitializationError("Please provide an endpoint or a base_url") - if ai_model_type == OpenAIModelTypes.REALTIME: - # wss://my-eastus2-openai-resource.openai.azure.com/openai/realtime?api-version=2024-12-17&deployment=gpt-4o-mini-realtime-preview-deployment-name - temp_url = f"{str(endpoint).replace('https', 'wss').rstrip('/')}.openai.azure.com/openai/realtime" - else: - temp_url = f"{str(endpoint).rstrip('/')}/openai/deployments/{deployment_name}" - else: - # when supplying the base url instead of the endpoint, the developer should know what to use, - # when doing realtime, that includes using the wss protocol - temp_url = str(base_url) - client = AsyncAzureOpenAI( - base_url=temp_url, - api_version=api_version, - api_key=api_key, - azure_ad_token=ad_token, - azure_ad_token_provider=ad_token_provider, - default_headers=merged_headers, - ) + if not endpoint and not base_url: + raise ServiceInitializationError("Please provide an endpoint or a base_url") + + args: dict[str, Any] = { + "default_headers": merged_headers, + } + if api_version: + args["api_version"] = api_version + if ad_token: + args["azure_ad_token"] = ad_token + if ad_token_provider: + args["azure_ad_token_provider"] = ad_token_provider + if api_key: + args["api_key"] = api_key + if base_url: + args["base_url"] = str(base_url) + if endpoint and not base_url: + args["azure_endpoint"] = str(endpoint) + # TODO (eavanvalkenburg): Remove the check on model type when the package fixes: https://github.com/openai/openai-python/issues/2120 + if deployment_name and ai_model_type != OpenAIModelTypes.REALTIME: + args["azure_deployment"] = deployment_name + + client = AsyncAzureOpenAI(**args) args 
= { "ai_model_id": deployment_name, "client": client, @@ -112,7 +115,7 @@ def __init__( args["service_id"] = service_id if instruction_role: args["instruction_role"] = instruction_role - super().__init__(**args) + super().__init__(**args, **kwargs) def to_dict(self) -> dict[str, str]: """Convert the configuration to a dictionary.""" diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py b/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py index f2556456ff24..b8281fc950ec 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py @@ -99,7 +99,7 @@ def __init__( api_key=api_key, base_url=base_url, endpoint=endpoint, - realtime_model_id=deployment_name, + realtime_deployment_name=deployment_name, api_version=api_version, token_endpoint=token_endpoint, env_file_path=env_file_path, @@ -144,7 +144,7 @@ def __init__( **kwargs: Any, ) -> None: """Initialize an OpenAIRealtime service using WebRTC protocol.""" - raise NotImplementedError("Azure Realtime WebRTC is not yet supported.") + raise NotImplementedError("Azure Realtime with WebRTC is not yet supported.") class AzureRealtimeWebsocket(AzureRealtime, OpenAIRealtimeWebsocketBase, AzureOpenAIConfigBase): diff --git a/python/semantic_kernel/connectors/ai/open_ai/settings/azure_open_ai_settings.py b/python/semantic_kernel/connectors/ai/open_ai/settings/azure_open_ai_settings.py index e6bb49cf3da0..47ebc4c2b7b7 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/settings/azure_open_ai_settings.py +++ b/python/semantic_kernel/connectors/ai/open_ai/settings/azure_open_ai_settings.py @@ -3,6 +3,7 @@ from typing import ClassVar from pydantic import SecretStr +from pydantic_core import Url from semantic_kernel.connectors.ai.open_ai.const import DEFAULT_AZURE_API_VERSION from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError @@ -93,7 +94,7 @@ 
class AzureOpenAISettings(KernelBaseSettings): text_to_audio_deployment_name: str | None = None realtime_deployment_name: str | None = None endpoint: HttpsUrl | None = None - base_url: HttpsUrl | None = None + base_url: Url | None = None api_key: SecretStr | None = None api_version: str = DEFAULT_AZURE_API_VERSION token_endpoint: str = "https://cognitiveservices.azure.com/.default" diff --git a/python/uv.lock b/python/uv.lock index b676cf25572f..bd37dfbcc86f 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -767,7 +767,7 @@ name = "click" version = "8.1.8" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "platform_system == 'Windows' and sys_platform == 'win32'" }, + { name = "colorama", marker = "sys_platform == 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/b9/2e/0090cbf739cee7d23781ad4b89a9894a41538e4fcf4c31dcdd705b78eb8b/click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a", size = 226593 } wheels = [ @@ -1964,7 +1964,7 @@ name = "ipykernel" version = "6.29.5" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "appnope", marker = "platform_system == 'Darwin' and sys_platform == 'darwin'" }, + { name = "appnope", marker = "sys_platform == 'darwin'" }, { name = "comm", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "debugpy", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "ipython", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, @@ -2809,7 +2809,6 @@ name = "nvidia-cublas-cu12" version = "12.4.5.8" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/7f/7f/7fbae15a3982dc9595e49ce0f19332423b260045d0a6afe93cdbe2f1f624/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_aarch64.whl", hash = 
"sha256:0f8aa1706812e00b9f19dfe0cdb3999b092ccb8ca168c0db5b8ea712456fd9b3", size = 363333771 }, { url = "https://files.pythonhosted.org/packages/ae/71/1c91302526c45ab494c23f61c7a84aa568b8c1f9d196efa5993957faf906/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl", hash = "sha256:2fc8da60df463fdefa81e323eef2e36489e1c94335b5358bcb38360adf75ac9b", size = 363438805 }, ] @@ -2818,7 +2817,6 @@ name = "nvidia-cuda-cupti-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/93/b5/9fb3d00386d3361b03874246190dfec7b206fd74e6e287b26a8fcb359d95/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:79279b35cf6f91da114182a5ce1864997fd52294a87a16179ce275773799458a", size = 12354556 }, { url = "https://files.pythonhosted.org/packages/67/42/f4f60238e8194a3106d06a058d494b18e006c10bb2b915655bd9f6ea4cb1/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:9dec60f5ac126f7bb551c055072b69d85392b13311fcc1bcda2202d172df30fb", size = 13813957 }, ] @@ -2827,7 +2825,6 @@ name = "nvidia-cuda-nvrtc-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/77/aa/083b01c427e963ad0b314040565ea396f914349914c298556484f799e61b/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0eedf14185e04b76aa05b1fea04133e59f465b6f960c0cbf4e37c3cb6b0ea198", size = 24133372 }, { url = "https://files.pythonhosted.org/packages/2c/14/91ae57cd4db3f9ef7aa99f4019cfa8d54cb4caa7e00975df6467e9725a9f/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a178759ebb095827bd30ef56598ec182b85547f1508941a3d560eb7ea1fbf338", size = 24640306 }, ] @@ -2836,7 +2833,6 @@ name = "nvidia-cuda-runtime-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/a1/aa/b656d755f474e2084971e9a297def515938d56b466ab39624012070cb773/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:961fe0e2e716a2a1d967aab7caee97512f71767f852f67432d572e36cb3a11f3", size = 894177 }, { url = "https://files.pythonhosted.org/packages/ea/27/1795d86fe88ef397885f2e580ac37628ed058a92ed2c39dc8eac3adf0619/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:64403288fa2136ee8e467cdc9c9427e0434110899d07c779f25b5c068934faa5", size = 883737 }, ] @@ -2859,7 +2855,6 @@ dependencies = [ { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/7a/8a/0e728f749baca3fbeffad762738276e5df60851958be7783af121a7221e7/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5dad8008fc7f92f5ddfa2101430917ce2ffacd86824914c82e28990ad7f00399", size = 211422548 }, { url = "https://files.pythonhosted.org/packages/27/94/3266821f65b92b3138631e9c8e7fe1fb513804ac934485a8d05776e1dd43/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9", size = 211459117 }, ] @@ -2868,7 +2863,6 @@ name = "nvidia-curand-cu12" version = "10.3.5.147" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/80/9c/a79180e4d70995fdf030c6946991d0171555c6edf95c265c6b2bf7011112/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1f173f09e3e3c76ab084aba0de819c49e56614feae5c12f69883f4ae9bb5fad9", size = 56314811 }, { url = "https://files.pythonhosted.org/packages/8a/6d/44ad094874c6f1b9c654f8ed939590bdc408349f137f9b98a3a23ccec411/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a88f583d4e0bb643c49743469964103aa59f7f708d862c3ddb0fc07f851e3b8b", size = 56305206 }, ] @@ -2882,7 +2876,6 @@ dependencies = [ 
{ name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/46/6b/a5c33cf16af09166845345275c34ad2190944bcc6026797a39f8e0a282e0/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d338f155f174f90724bbde3758b7ac375a70ce8e706d70b018dd3375545fc84e", size = 127634111 }, { url = "https://files.pythonhosted.org/packages/3a/e1/5b9089a4b2a4790dfdea8b3a006052cfecff58139d5a4e34cb1a51df8d6f/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260", size = 127936057 }, ] @@ -2894,7 +2887,6 @@ dependencies = [ { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/96/a9/c0d2f83a53d40a4a41be14cea6a0bf9e668ffcf8b004bd65633f433050c0/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_aarch64.whl", hash = "sha256:9d32f62896231ebe0480efd8a7f702e143c98cfaa0e8a76df3386c1ba2b54df3", size = 207381987 }, { url = "https://files.pythonhosted.org/packages/db/f7/97a9ea26ed4bbbfc2d470994b8b4f338ef663be97b8f677519ac195e113d/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1", size = 207454763 }, ] @@ -2903,7 +2895,6 @@ name = "nvidia-cusparselt-cu12" version = "0.6.2" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/98/8e/675498726c605c9441cf46653bd29cb1b8666da1fb1469ffa25f67f20c58/nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_aarch64.whl", hash = "sha256:067a7f6d03ea0d4841c85f0c6f1991c5dda98211f6302cb83a4ab234ee95bef8", size = 149422781 }, { url = "https://files.pythonhosted.org/packages/78/a8/bcbb63b53a4b1234feeafb65544ee55495e1bb37ec31b999b963cbccfd1d/nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_x86_64.whl", hash = 
"sha256:df2c24502fd76ebafe7457dbc4716b2fec071aabaed4fb7691a201cde03704d9", size = 150057751 }, ] @@ -2920,7 +2911,6 @@ name = "nvidia-nvjitlink-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/02/45/239d52c05074898a80a900f49b1615d81c07fceadd5ad6c4f86a987c0bc4/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:4abe7fef64914ccfa909bc2ba39739670ecc9e820c83ccc7a6ed414122599b83", size = 20552510 }, { url = "https://files.pythonhosted.org/packages/ff/ff/847841bacfbefc97a00036e0fce5a0f086b640756dc38caea5e1bb002655/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:06b3b9b25bf3f8af351d664978ca26a16d2c5127dbd53c0497e28d1fb9611d57", size = 21066810 }, ] @@ -2929,7 +2919,6 @@ name = "nvidia-nvtx-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/06/39/471f581edbb7804b39e8063d92fc8305bdc7a80ae5c07dbe6ea5c50d14a5/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7959ad635db13edf4fc65c06a6e9f9e55fc2f92596db928d169c0bb031e88ef3", size = 100417 }, { url = "https://files.pythonhosted.org/packages/87/20/199b8713428322a2f22b722c62b8cc278cc53dffa9705d744484b5035ee9/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:781e950d9b9f60d8241ccea575b32f5105a5baf4c2351cab5256a24869f12a1a", size = 99144 }, ] @@ -3527,7 +3516,7 @@ name = "portalocker" version = "2.10.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pywin32", marker = "platform_system == 'Windows' and sys_platform == 'win32'" }, + { name = "pywin32", marker = "sys_platform == 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/ed/d3/c6c64067759e87af98cc668c1cc75171347d0f1577fab7ca3749134e3cd4/portalocker-2.10.1.tar.gz", hash = 
"sha256:ef1bf844e878ab08aee7e40184156e1151f228f103aa5c6bd0724cc330960f8f", size = 40891 } wheels = [ @@ -4970,7 +4959,6 @@ wheels = [ [[package]] name = "semantic-kernel" -version = "1.21.0" source = { editable = "." } dependencies = [ { name = "aiohttp", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, @@ -5028,7 +5016,7 @@ hugging-face = [ { name = "transformers", extra = ["torch"], marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] milvus = [ - { name = "milvus", marker = "(platform_system != 'Windows' and sys_platform == 'darwin') or (platform_system != 'Windows' and sys_platform == 'linux') or (platform_system != 'Windows' and sys_platform == 'win32')" }, + { name = "milvus", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "pymilvus", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] mistralai = [ @@ -5118,7 +5106,7 @@ requires-dist = [ { name = "google-generativeai", marker = "extra == 'google'", specifier = "~=0.8" }, { name = "ipykernel", marker = "extra == 'notebooks'", specifier = "~=6.29" }, { name = "jinja2", specifier = "~=3.1" }, - { name = "milvus", marker = "platform_system != 'Windows' and extra == 'milvus'", specifier = ">=2.3,<2.3.8" }, + { name = "milvus", marker = "sys_platform != 'win32' and extra == 'milvus'", specifier = ">=2.3,<2.3.8" }, { name = "mistralai", marker = "extra == 'mistralai'", specifier = ">=1.2,<2.0" }, { name = "motor", marker = "extra == 'mongo'", specifier = ">=3.3.2,<3.8.0" }, { name = "nest-asyncio", specifier = "~=1.6" }, @@ -5581,22 +5569,22 @@ dependencies = [ { name = "fsspec", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "jinja2", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "networkx", marker = "sys_platform == 'darwin' or sys_platform == 
'linux' or sys_platform == 'win32'" }, - { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux' and sys_platform == 'linux'" }, - { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux' and sys_platform == 'linux'" }, - { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux' and sys_platform == 'linux'" }, - { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux' and sys_platform == 'linux'" }, - { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux' and sys_platform == 'linux'" }, - { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux' and sys_platform == 'linux'" }, - { name = "nvidia-cusparselt-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux' and sys_platform == 'linux'" }, - { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux' and sys_platform == 'linux'" }, - { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux' and sys_platform == 'linux'" }, - { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux' and sys_platform == 'linux'" }, + { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and 
sys_platform == 'linux'" }, + { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparselt-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "setuptools", marker = "(python_full_version >= '3.12' and sys_platform == 'darwin') or (python_full_version >= '3.12' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform == 'win32')" }, { name = "sympy", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "triton", marker = "platform_machine == 'x86_64' and platform_system == 'Linux' and sys_platform == 'linux'" }, + { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] wheels = [ @@ -5641,7 +5629,7 @@ name = "tqdm" version = "4.67.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { 
name = "colorama", marker = "platform_system == 'Windows' and sys_platform == 'win32'" }, + { name = "colorama", marker = "sys_platform == 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737 } wheels = [ From 1643196a2d460c1b1e34a49099c9849b0ce6807b Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Fri, 14 Feb 2025 15:25:45 +0100 Subject: [PATCH 29/50] added call automation sample --- .../demos/call_automation/create_kernel.py | 33 +++ python/samples/demos/call_automation/main.py | 229 ++++++++++++++++++ .../samples/demos/call_automation/readme.md | 60 +++++ .../demos/call_automation/requirements.txt | 5 + .../connectors/ai/open_ai/__init__.py | 2 + 5 files changed, 329 insertions(+) create mode 100644 python/samples/demos/call_automation/create_kernel.py create mode 100644 python/samples/demos/call_automation/main.py create mode 100644 python/samples/demos/call_automation/readme.md create mode 100644 python/samples/demos/call_automation/requirements.txt diff --git a/python/samples/demos/call_automation/create_kernel.py b/python/samples/demos/call_automation/create_kernel.py new file mode 100644 index 000000000000..dc8aaa948b1d --- /dev/null +++ b/python/samples/demos/call_automation/create_kernel.py @@ -0,0 +1,33 @@ +# Copyright (c) Microsoft. All rights reserved. + +from datetime import datetime +from random import randint + +from semantic_kernel import Kernel +from semantic_kernel.functions import kernel_function + + +@kernel_function +def get_weather(location: str) -> str: + """Get the weather for a location.""" + weather_conditions = ("sunny", "hot", "cloudy", "raining", "freezing", "snowing") + weather = weather_conditions[randint(0, len(weather_conditions) - 1)] # nosec + return f"The weather in {location} is {weather}." 
+ + +@kernel_function +def get_date_time() -> str: + """Get the current date and time.""" + return f"The current date and time is {datetime.now().isoformat()}." + + +@kernel_function +def goodbye(): + """When the user is done, say goodbye and then call this function.""" + raise KeyboardInterrupt + + +def create_kernel() -> Kernel: + kernel = Kernel() + kernel.add_functions(plugin_name="helpers", functions=[goodbye, get_weather, get_date_time]) + return kernel diff --git a/python/samples/demos/call_automation/main.py b/python/samples/demos/call_automation/main.py new file mode 100644 index 000000000000..302386d96a32 --- /dev/null +++ b/python/samples/demos/call_automation/main.py @@ -0,0 +1,229 @@ +# Copyright (c) Microsoft. All rights reserved. + +import asyncio +import os +import uuid +from logging import INFO +from urllib.parse import urlencode, urlparse, urlunparse + +from azure.communication.callautomation import ( + AudioFormat, + MediaStreamingAudioChannelType, + MediaStreamingContentType, + MediaStreamingOptions, + MediaStreamingTransportType, +) +from azure.communication.callautomation.aio import CallAutomationClient +from azure.eventgrid import EventGridEvent, SystemEventNames +from quart import Quart, Response, json, request, websocket + +from samples.demos.call_automation.create_kernel import create_kernel +from semantic_kernel import Kernel +from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceBehavior +from semantic_kernel.connectors.ai.open_ai import ( + AzureRealtime, + InputAudioTranscription, + OpenAIRealtimeExecutionSettings, + SendEvents, + TurnDetection, +) +from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase +from semantic_kernel.contents.events import RealtimeAudioEvent, RealtimeEvent + +# Your ACS resource connection string +ACS_CONNECTION_STRING = os.environ["ACS_CONNECTION_STRING"] +# Callback events URI to handle callback events. 
+CALLBACK_URI_HOST = os.environ["CALLBACK_URI_HOST"] +CALLBACK_EVENTS_URI = CALLBACK_URI_HOST + "/api/callbacks" + +acs_client = CallAutomationClient.from_connection_string(ACS_CONNECTION_STRING) +app = Quart(__name__) + + +@app.route("/api/incomingCall", methods=["POST"]) +async def incoming_call_handler() -> Response: + app.logger.info("incoming event data") + for event_dict in await request.json: + event = EventGridEvent.from_dict(event_dict) + app.logger.info("incoming event data --> %s", event.data) + if event.event_type == SystemEventNames.EventGridSubscriptionValidationEventName: + app.logger.info("Validating subscription") + validation_code = event.data["validationCode"] + validation_response = {"validationResponse": validation_code} + return Response(response=json.dumps(validation_response), status=200) + if event.event_type == "Microsoft.Communication.IncomingCall": + app.logger.info("Incoming call received: data=%s", event.data) + if event.data["from"]["kind"] == "phoneNumber": + caller_id = event.data["from"]["phoneNumber"]["value"] + else: + caller_id = event.data["from"]["rawId"] + app.logger.info("incoming call handler caller id: %s", caller_id) + incoming_call_context = event.data["incomingCallContext"] + guid = uuid.uuid4() + query_parameters = urlencode({"callerId": caller_id}) + callback_uri = f"{CALLBACK_EVENTS_URI}/{guid}?{query_parameters}" + + parsed_url = urlparse(CALLBACK_EVENTS_URI) + websocket_url = urlunparse(("wss", parsed_url.netloc, "/ws", "", "", "")) + + app.logger.info("callback url: %s", callback_uri) + app.logger.info("websocket url: %s", websocket_url) + + media_streaming_options = MediaStreamingOptions( + transport_url=websocket_url, + transport_type=MediaStreamingTransportType.WEBSOCKET, + content_type=MediaStreamingContentType.AUDIO, + audio_channel_type=MediaStreamingAudioChannelType.MIXED, + start_media_streaming=True, + enable_bidirectional=True, + audio_format=AudioFormat.PCM24_K_MONO, + ) + + answer_call_result = await 
acs_client.answer_call( + incoming_call_context=incoming_call_context, + operation_context="incomingCall", + callback_url=callback_uri, + media_streaming=media_streaming_options, + ) + app.logger.info("Answered call for connection id: %s", answer_call_result.call_connection_id) + return Response(status=200) + return Response(status=200) + + +@app.route("/api/callbacks/", methods=["POST"]) +async def callbacks(contextId): + for event in await request.json: + # Parsing callback events + global call_connection_id + event_data = event["data"] + call_connection_id = event_data["callConnectionId"] + app.logger.info( + f"Received Event:-> {event['type']}, Correlation Id:-> {event_data['correlationId']}, CallConnectionId:-> {call_connection_id}" # noqa: E501 + ) + if event["type"] == "Microsoft.Communication.CallConnected": + call_connection_properties = await acs_client.get_call_connection(call_connection_id).get_call_properties() + media_streaming_subscription = call_connection_properties.media_streaming_subscription + app.logger.info(f"MediaStreamingSubscription:--> {media_streaming_subscription}") + app.logger.info(f"Received CallConnected event for connection id: {call_connection_id}") + app.logger.info("CORRELATION ID:--> %s", event_data["correlationId"]) + app.logger.info("CALL CONNECTION ID:--> %s", event_data["callConnectionId"]) + elif ( + event["type"] == "Microsoft.Communication.MediaStreamingStarted" + or event["type"] == "Microsoft.Communication.MediaStreamingStopped" + ): + app.logger.info(f"Media streaming content type:--> {event_data['mediaStreamingUpdate']['contentType']}") + app.logger.info(f"Media streaming status:--> {event_data['mediaStreamingUpdate']['mediaStreamingStatus']}") + app.logger.info( + f"Media streaming status details:--> {event_data['mediaStreamingUpdate']['mediaStreamingStatusDetails']}" # noqa: E501 + ) + elif event["type"] == "Microsoft.Communication.MediaStreamingFailed": + app.logger.info( + 
f"Code:->{event_data['resultInformation']['code']}, Subcode:-> {event_data['resultInformation']['subCode']}" # noqa: E501 + ) + app.logger.info(f"Message:->{event_data['resultInformation']['message']}") + elif event["type"] == "Microsoft.Communication.CallDisconnected": + pass + return Response(status=200) + + +# WebSocket. +@app.websocket("/ws") +async def ws(): + print("Client connected to WebSocket") + kernel = create_kernel() + + client = AzureRealtime("websocket") + settings = OpenAIRealtimeExecutionSettings( + instructions="""You are a chat bot. Your name is Mosscap and + you have one goal: figure out what people need. + Your full name, should you need to know it, is + Splendid Speckled Mosscap. You communicate + effectively, but you tend to answer with long + flowery prose.""", + turn_detection=TurnDetection(type="server_vad"), + voice="shimmer", + input_audio_format="pcm16", + output_audio_format="pcm16", + input_audio_transcription=InputAudioTranscription(model="whisper-1"), + function_choice_behavior=FunctionChoiceBehavior.Auto(), + ) + receive_task = asyncio.create_task(receive_messages(client, settings, kernel)) + while True: + try: + # Receive data from the client + stream_data = await websocket.receive() + data = json.loads(stream_data) + kind = data["kind"] + if kind == "AudioData": + await client.send( + event=RealtimeEvent( + service_type=SendEvents.INPUT_AUDIO_BUFFER_APPEND, + service_event={"audio": data["audioData"]["data"]}, + ) + ) + except Exception: + print("Websocket connection closed.") + break + receive_task.cancel() + + +async def receive_messages( + client: RealtimeClientBase, + settings: OpenAIRealtimeExecutionSettings, + kernel: Kernel, +): + async with client( + settings=settings, + create_response=True, + kernel=kernel, + ): + async for event in client.receive(): + match event: + case RealtimeAudioEvent(): + await websocket.send( + json.dumps({"kind": "AudioData", "audioData": {"data": event.service_event.delta}}) + ) + case _: + 
match event.service_type: + case "session.created": + print("Session Created Message") + print(f" Session Id: {event.service_event.session.id}") + pass + case "error": + print(f" Error: {event.service_event.error}") + pass + case "input_audio_buffer.cleared": + print("Input Audio Buffer Cleared Message") + pass + case "input_audio_buffer.speech_started": + print(f"Voice activity detection started at {event.service_event.audio_start_ms} [ms]") + await websocket.send(json.dumps({"Kind": "StopAudio", "AudioData": None, "StopAudio": {}})) + pass + case "input_audio_buffer.speech_stopped": + pass + case "conversation.item.input_audio_transcription.completed": + print(f" User:-- {event.service_event.transcript}") + case "conversation.item.input_audio_transcription.failed": + print(f" Error: {event.service_event.error}") + case "response.done": + print("Response Done Message") + print(f" Response Id: {event.service_event.response.id}") + if event.service_event.response.status_details: + print( + f" Status Details: {event.service_event.response.status_details.model_dump_json()}" + ) + case "response.audio_transcript.done": + print(f" AI:-- {event.service_event.transcript}") + + case _: + pass + + +@app.route("/") +def home(): + return "Hello ACS CallAutomation!" + + +if __name__ == "__main__": + app.logger.setLevel(INFO) + app.run(port=8080) diff --git a/python/samples/demos/call_automation/readme.md b/python/samples/demos/call_automation/readme.md new file mode 100644 index 000000000000..066c922f4d35 --- /dev/null +++ b/python/samples/demos/call_automation/readme.md @@ -0,0 +1,60 @@ +| page_type | languages | products | +| --------- | --------------------------------------- | --------------------------------------------------------------------------- | +| sample |
Python
|
azureazure-communication-services
| + +# Call Automation - Quick Start Sample + +This is a sample application demonstrated during Microsoft Ignite 2024. It highlights an integration of Azure Communication Services with Azure OpenAI Service to enable intelligent conversational agents. + +## Prerequisites + +- An Azure account with an active subscription. [Create an account for free](https://azure.microsoft.com/free/?WT.mc_id=A261C142F). +- A deployed Communication Services resource. [Create a Communication Services resource](https://docs.microsoft.com/azure/communication-services/quickstarts/create-communication-resource). +- A [phone number](https://learn.microsoft.com/en-us/azure/communication-services/quickstarts/telephony/get-phone-number) in your Azure Communication Services resource that can get inbound calls. NB: phone numbers are not available in free subscriptions. +- [Python](https://www.python.org/downloads/) 3.7 or above. +- An Azure OpenAI Resource and Deployed Model. See [instructions](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/create-resource?pivots=web-portal). + +## Before running the sample for the first time + +1. Open an instance of PowerShell, Windows Terminal, Command Prompt or equivalent and navigate to the directory that you would like to clone the sample to. +2. git clone `https://github.com/Azure-Samples/communication-services-python-quickstarts.git`. +3. Navigate to `callautomation-azure-openai-voice` folder and open `main.py` file. + +### Setup the Python environment + +Create and activate python virtual environment and install required packages using following command +``` +pip install -r requirements.txt +pip install -r ./aoai-whl/rtclient-0.5.1-py3-none-any.whl +``` + +### Setup and host your Azure DevTunnel + +[Azure DevTunnels](https://learn.microsoft.com/en-us/azure/developer/dev-tunnels/overview) is an Azure service that enables you to share local web services hosted on the internet. 
Use the commands below to connect your local development environment to the public internet. This creates a tunnel with a persistent endpoint URL and which allows anonymous access. We will then use this endpoint to notify your application of calling events from the ACS Call Automation service. + +```bash +devtunnel create --allow-anonymous +devtunnel port create -p 8080 +devtunnel host +``` + +### Configuring application + +Open `main.py` file to configure the following settings + +1. `ACS_CONNECTION_STRING`: Azure Communication Service resource's connection string. +2. `CALLBACK_URI_HOST`: Base url of the app. (For local development use dev tunnel url) + +Open `azureOpenAIService.py` file to configure the following settings + +1. `AZURE_OPENAI_SERVICE_ENDPOINT`: Azure Open AI service endpoint +2. `AZURE_OPENAI_SERVICE_KEY`: Azure Open AI service key +3. `AZURE_OPENAI_DEPLOYMENT_MODEL_NAME`: Azure Open AI deployment name + +## Run app locally + +1. Navigate to `callautomation-azure-openai-voice` folder and run `main.py` in debug mode or use command `python ./main.py` to run it from PowerShell, Command Prompt or Unix Terminal +2. Browser should pop up with the below page. If not navigate it to `http://localhost:8080/`or your dev tunnel url. +3. Register an EventGrid Webhook for the IncomingCall(`https:///api/incomingCall`) event that points to your devtunnel URI. Instructions [here](https://learn.microsoft.com/en-us/azure/communication-services/concepts/call-automation/incoming-call-notification). + +Once that's completed you should have a running application. The best way to test this is to place a call to your ACS phone number and talk to your intelligent agent. 
\ No newline at end of file diff --git a/python/samples/demos/call_automation/requirements.txt b/python/samples/demos/call_automation/requirements.txt new file mode 100644 index 000000000000..5c024fddac08 --- /dev/null +++ b/python/samples/demos/call_automation/requirements.txt @@ -0,0 +1,5 @@ +Quart>=0.19.6 +azure-eventgrid==4.11.0 +aiohttp>= 3.11.9 +azure-communication-callautomation==1.4.0b1 +semantic-kernel \ No newline at end of file diff --git a/python/semantic_kernel/connectors/ai/open_ai/__init__.py b/python/semantic_kernel/connectors/ai/open_ai/__init__.py index 6ff0a85dd341..b96b72322cd4 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/__init__.py +++ b/python/semantic_kernel/connectors/ai/open_ai/__init__.py @@ -23,6 +23,7 @@ OpenAITextPromptExecutionSettings, ) from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( + InputAudioTranscription, OpenAIRealtimeExecutionSettings, TurnDetection, ) @@ -71,6 +72,7 @@ "DataSourceFieldsMapping", "DataSourceFieldsMapping", "ExtraBody", + "InputAudioTranscription", "ListenEvents", "OpenAIAudioToText", "OpenAIAudioToTextExecutionSettings", From 363f9db971ebad148a15029bd2b7b20cae52ac4b Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Mon, 17 Feb 2025 11:46:06 +0100 Subject: [PATCH 30/50] added function calling sample with azure --- .../01-chat_with_realtime_websocket.py | 10 +- .../02-azure_chat_with_function_calling.py | 134 ++++++++++++++++++ 2 files changed, 136 insertions(+), 8 deletions(-) create mode 100644 python/samples/concepts/realtime/02-azure_chat_with_function_calling.py diff --git a/python/samples/concepts/realtime/01-chat_with_realtime_websocket.py b/python/samples/concepts/realtime/01-chat_with_realtime_websocket.py index daf6133fdc3b..d5d3624b9316 100644 --- a/python/samples/concepts/realtime/01-chat_with_realtime_websocket.py +++ b/python/samples/concepts/realtime/01-chat_with_realtime_websocket.py @@ -6,21 +6,15 @@ from 
samples.concepts.realtime.utils import AudioPlayerWebsocket, AudioRecorderWebsocket, check_audio_devices from semantic_kernel.connectors.ai.open_ai import ( AzureRealtime, + ListenEvents, OpenAIRealtimeExecutionSettings, -) -from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( TurnDetection, ) -from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents -from semantic_kernel.contents.events.realtime_event import RealtimeTextEvent +from semantic_kernel.contents.events import RealtimeTextEvent logging.basicConfig(level=logging.WARNING) utils_log = logging.getLogger("samples.concepts.realtime.utils") utils_log.setLevel(logging.INFO) -aiortc_log = logging.getLogger("aiortc") -aiortc_log.setLevel(logging.WARNING) -aioice_log = logging.getLogger("aioice") -aioice_log.setLevel(logging.WARNING) logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) diff --git a/python/samples/concepts/realtime/02-azure_chat_with_function_calling.py b/python/samples/concepts/realtime/02-azure_chat_with_function_calling.py new file mode 100644 index 000000000000..7d751eee6a7c --- /dev/null +++ b/python/samples/concepts/realtime/02-azure_chat_with_function_calling.py @@ -0,0 +1,134 @@ +# Copyright (c) Microsoft. All rights reserved. 
+ +import asyncio +import logging +from datetime import datetime +from random import randint + +from samples.concepts.realtime.utils import AudioPlayerWebsocket, AudioRecorderWebsocket +from semantic_kernel import Kernel +from semantic_kernel.connectors.ai import FunctionChoiceBehavior +from semantic_kernel.connectors.ai.open_ai import ( + AzureRealtime, + ListenEvents, + OpenAIRealtimeExecutionSettings, + TurnDetection, +) +from semantic_kernel.contents import ChatHistory +from semantic_kernel.contents.events import RealtimeTextEvent +from semantic_kernel.functions import kernel_function + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + +# This simple sample demonstrates how to use the OpenAI Realtime API to create +# a chat bot that can listen and respond directly through audio. +# It requires installing: +# - semantic-kernel[openai_realtime] +# - pyaudio +# - sounddevice +# - pydub +# - aiortc +# e.g. pip install pyaudio sounddevice pydub + + +@kernel_function +def get_weather(location: str) -> str: + """Get the weather for a location.""" + weather_conditions = ("sunny", "hot", "cloudy", "raining", "freezing", "snowing") + weather = weather_conditions[randint(0, len(weather_conditions) - 1)] # nosec + logger.info(f"@ Getting weather for {location}: {weather}") + return f"The weather in {location} is {weather}." + + +@kernel_function +def get_date_time() -> str: + """Get the current date and time.""" + logger.info("@ Getting current datetime") + return f"The current date and time is {datetime.now().isoformat()}." + + +@kernel_function +def goodbye(): + """When the user is done, say goodbye and then call this function.""" + logger.info("@ Goodbye has been called!") + raise KeyboardInterrupt + + +async def main() -> None: + print_transcript = True + # create the Kernel and add a simple function for function calling. 
+ kernel = Kernel() + kernel.add_functions(plugin_name="helpers", functions=[goodbye, get_weather, get_date_time]) + + # create the audio player and audio track + # both take a device_id parameter, which is the index of the device to use, if None the default device is used + audio_player = AudioPlayerWebsocket() + # create the realtime client and optionally add the audio output function, this is optional + # you can define the protocol to use, either "websocket" or "webrtc" + # they will behave the same way, even though the underlying protocol is quite different + realtime_client = AzureRealtime( + protocol="websocket", + audio_output_callback=audio_player.client_callback, + ) + audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client) + + # Create the settings for the session + # The realtime api, does not use a system message, but takes instructions as a parameter for a session + instructions = """ + You are a chat bot. Your name is Mosscap and + you have one goal: figure out what people need. + Your full name, should you need to know it, is + Splendid Speckled Mosscap. You communicate + effectively, but you tend to answer with long + flowery prose. + """ + # the key thing to decide on is to enable the server_vad turn detection + # if turn is turned off (by setting turn_detection=None), you will have to send + # the "input_audio_buffer.commit" and "response.create" event to the realtime api + # to signal the end of the user's turn and start the response. 
+ # manual VAD is not part of this sample + settings = OpenAIRealtimeExecutionSettings( + instructions=instructions, + voice="alloy", + turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8), + function_choice_behavior=FunctionChoiceBehavior.Auto(), + ) + # and we can add a chat history to conversation after starting it + chat_history = ChatHistory() + chat_history.add_user_message("Hi there, who are you?") + chat_history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.") + + # the context manager calls the create_session method on the client and start listening to the audio stream + async with ( + audio_player, + audio_recorder, + realtime_client( + settings=settings, + chat_history=chat_history, + kernel=kernel, + create_response=True, + ), + ): + async for event in realtime_client.receive(): + match event: + case RealtimeTextEvent(): + if print_transcript: + print(event.text.text, end="") + case _: + # OpenAI Specific events + match event.service_type: + case ListenEvents.RESPONSE_CREATED: + if print_transcript: + print("\nMosscap (transcript): ", end="") + case ListenEvents.ERROR: + print(event.service_event) + logger.error(event.service_event) + + +if __name__ == "__main__": + print( + "Instruction: start speaking, when you stop the API should detect you finished and start responding. " + "Press ctrl + c to stop the program." 
+ ) + asyncio.run(main()) From ba7e312f127d8a3c0e6d9463c958ff5f113426b5 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Mon, 17 Feb 2025 14:22:18 +0100 Subject: [PATCH 31/50] much improvement to the call automation sample --- .../demos/call_automation/.env.example | 8 + .../demos/call_automation/create_kernel.py | 33 -- python/samples/demos/call_automation/main.py | 323 +++++++++++------- .../samples/demos/call_automation/readme.md | 27 +- .../realtime/open_ai_realtime_base.py | 2 +- .../realtime/open_ai_realtime_websocket.py | 5 +- .../contents/binary_content.py | 18 +- 7 files changed, 221 insertions(+), 195 deletions(-) create mode 100644 python/samples/demos/call_automation/.env.example delete mode 100644 python/samples/demos/call_automation/create_kernel.py mode change 100644 => 100755 python/samples/demos/call_automation/main.py diff --git a/python/samples/demos/call_automation/.env.example b/python/samples/demos/call_automation/.env.example new file mode 100644 index 000000000000..055528e2c2f3 --- /dev/null +++ b/python/samples/demos/call_automation/.env.example @@ -0,0 +1,8 @@ +ACS_CONNECTION_STRING= +CALLBACK_URI_HOST= + +AZURE_OPENAI_SERVICE_ENDPOINT= +AZURE_OPENAI_DEPLOYMENT_MODEL_NAME= +AZURE_OPENAI_API_VERSION= + +AZURE_OPENAI_SERVICE_KEY= \ No newline at end of file diff --git a/python/samples/demos/call_automation/create_kernel.py b/python/samples/demos/call_automation/create_kernel.py deleted file mode 100644 index dc8aaa948b1d..000000000000 --- a/python/samples/demos/call_automation/create_kernel.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) Microsoft. All rights reserved. 
- -from datetime import datetime -from random import randint - -from semantic_kernel import Kernel -from semantic_kernel.functions import kernel_function - - -@kernel_function -def get_weather(location: str) -> str: - """Get the weather for a location.""" - weather_conditions = ("sunny", "hot", "cloudy", "raining", "freezing", "snowing") - weather = weather_conditions[randint(0, len(weather_conditions) - 1)] # nosec - return f"The weather in {location} is {weather}." - - -@kernel_function -def get_date_time() -> str: - """Get the current date and time.""" - return f"The current date and time is {datetime.now().isoformat()}." - - -@kernel_function -def goodbye(): - """When the user is done, say goodbye and then call this function.""" - raise KeyboardInterrupt - - -def create_kernel() -> Kernel: - kernel = Kernel() - kernel.add_functions(plugin_name="helpers", functions=[goodbye, get_weather, get_date_time]) - return kernel diff --git a/python/samples/demos/call_automation/main.py b/python/samples/demos/call_automation/main.py old mode 100644 new mode 100755 index 302386d96a32..1acf650a6225 --- a/python/samples/demos/call_automation/main.py +++ b/python/samples/demos/call_automation/main.py @@ -1,9 +1,32 @@ -# Copyright (c) Microsoft. All rights reserved. - +#!/usr/bin/env uv run # noqa: CPY001 +#################################################################### +# Copyright (c) Microsoft. All rights reserved. 
# +# Sample Quart webapp with that connects to OpenAI or Azure OpenAI # +# If you have `uv` installed and the `OPENAI_API_KEY` # +# environment variable set, you can run this example with just # +# # +# `.python/samples/demo/call_automation/main.py` # +#################################################################### +# +# /// script +# requires-python = ">=3.9" +# dependencies = [ +# "Quart", +# "azure-eventgrid", +# "azure-communication-callautomation==1.4.0b1", +# "semantic-kernel[openai_realtime]", +# ] +# +# [tool.uv.sources] +# semantic-kernel = { path = "../../../", editable = true } +# /// import asyncio +import base64 import os import uuid +from datetime import datetime from logging import INFO +from random import randint from urllib.parse import urlencode, urlparse, urlunparse from azure.communication.callautomation import ( @@ -15,30 +38,157 @@ ) from azure.communication.callautomation.aio import CallAutomationClient from azure.eventgrid import EventGridEvent, SystemEventNames +from numpy import ndarray from quart import Quart, Response, json, request, websocket -from samples.demos.call_automation.create_kernel import create_kernel from semantic_kernel import Kernel from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceBehavior from semantic_kernel.connectors.ai.open_ai import ( - AzureRealtime, InputAudioTranscription, OpenAIRealtimeExecutionSettings, - SendEvents, TurnDetection, ) +from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime import OpenAIRealtime +from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase -from semantic_kernel.contents.events import RealtimeAudioEvent, RealtimeEvent +from semantic_kernel.contents.audio_content import AudioContent +from semantic_kernel.contents.events import RealtimeAudioEvent +from semantic_kernel.functions import kernel_function -# Your ACS resource 
connection string -ACS_CONNECTION_STRING = os.environ["ACS_CONNECTION_STRING"] # Callback events URI to handle callback events. CALLBACK_URI_HOST = os.environ["CALLBACK_URI_HOST"] CALLBACK_EVENTS_URI = CALLBACK_URI_HOST + "/api/callbacks" -acs_client = CallAutomationClient.from_connection_string(ACS_CONNECTION_STRING) +acs_client = CallAutomationClient.from_connection_string(os.environ["ACS_CONNECTION_STRING"]) app = Quart(__name__) +# region: Semantic Kernel + +kernel = Kernel() + + +class HelperPlugin: + """Helper plugin for the Semantic Kernel.""" + + @kernel_function + def get_weather(self, location: str) -> str: + """Get the weather for a location.""" + app.logger.info(f"@ Getting weather for {location}") + weather_conditions = ("sunny", "hot", "cloudy", "raining", "freezing", "snowing") + weather = weather_conditions[randint(0, len(weather_conditions) - 1)] # nosec + return f"The weather in {location} is {weather}." + + @kernel_function + def get_date_time(self) -> str: + """Get the current date and time.""" + app.logger.info("@ Getting current datetime") + return f"The current date and time is {datetime.now().isoformat()}." 
+ + @kernel_function + async def goodbye(self): + """When the user is done, say goodbye and then call this function.""" + app.logger.info("@ Goodbye has been called!") + global call_connection_id + await acs_client.get_call_connection(call_connection_id).hang_up(is_for_everyone=True) + + +kernel.add_plugin(plugin=HelperPlugin(), plugin_name="helpers", description="Helper functions for the realtime client.") + +# region: handlers for audio and data streams + + +async def from_realtime_to_acs(audio: ndarray): + """Function that forwards the audio from the model to the websocket of the ACS client.""" + await websocket.send( + json.dumps({"kind": "AudioData", "audioData": {"data": base64.b64encode(audio.tobytes()).decode("utf-8")}}) + ) + + +async def from_acs_to_realtime(client: RealtimeClientBase): + """Function that forwards the audio from the ACS client to the model.""" + while True: + try: + # Receive data from the ACS client + stream_data = await websocket.receive() + data = json.loads(stream_data) + if data["kind"] == "AudioData": + # send it to the Realtime service + await client.send( + event=RealtimeAudioEvent( + audio=AudioContent(data=data["audioData"]["data"], data_format="base64", inner_content=data), + ) + ) + except Exception: + print("Websocket connection closed.") + break + + +async def handle_realtime_messages(client: RealtimeClientBase): + """Function that handles the messages from the Realtime service. + + This function only handles the non-audio messages. + Audio is done through the callback so that it is faster and smoother. 
+ """ + async for event in client.receive(): + match event.service_type: + case ListenEvents.SESSION_CREATED: + print("Session Created Message") + print(f" Session Id: {event.service_event.session.id}") + case ListenEvents.ERROR: + print(f" Error: {event.service_event.error}") + case ListenEvents.INPUT_AUDIO_BUFFER_CLEARED: + print("Input Audio Buffer Cleared Message") + case ListenEvents.INPUT_AUDIO_BUFFER_SPEECH_STARTED: + print(f"Voice activity detection started at {event.service_event.audio_start_ms} [ms]") + await websocket.send(json.dumps({"Kind": "StopAudio", "AudioData": None, "StopAudio": {}})) + + case ListenEvents.CONVERSATION_ITEM_INPUT_AUDIO_TRANSCRIPTION_COMPLETED: + print(f" User:-- {event.service_event.transcript}") + case ListenEvents.CONVERSATION_ITEM_INPUT_AUDIO_TRANSCRIPTION_FAILED: + print(f" Error: {event.service_event.error}") + case ListenEvents.RESPONSE_DONE: + print("Response Done Message") + print(f" Response Id: {event.service_event.response.id}") + if event.service_event.response.status_details: + print(f" Status Details: {event.service_event.response.status_details.model_dump_json()}") + case ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DONE: + print(f" AI:-- {event.service_event.transcript}") + + +# region: Quart routes + + +# WebSocket. +@app.websocket("/ws") +async def ws(): + print("Client connected to WebSocket") + + # create the client, using the audio callback + client = OpenAIRealtime("websocket", audio_output_callback=from_realtime_to_acs) + settings = OpenAIRealtimeExecutionSettings( + instructions="""You are a chat bot. Your name is Mosscap and + you have one goal: figure out what people need. + Your full name, should you need to know it, is + Splendid Speckled Mosscap. 
You communicate + effectively, but you tend to answer with long + flowery prose.""", + turn_detection=TurnDetection(type="server_vad"), + voice="shimmer", + input_audio_format="pcm16", + output_audio_format="pcm16", + input_audio_transcription=InputAudioTranscription(model="whisper-1"), + function_choice_behavior=FunctionChoiceBehavior.Auto(), + ) + + # create the realtime client session + async with client(settings=settings, create_response=True, kernel=kernel): + # start handling the messages from the realtime client + # and allow the callback to be used to forward the audio to the acs client + receive_task = asyncio.create_task(handle_realtime_messages(client)) + # receive messages from the ACS client and send them to the realtime client + await from_acs_to_realtime(client) + receive_task.cancel() + @app.route("/api/incomingCall", methods=["POST"]) async def incoming_call_handler() -> Response: @@ -46,17 +196,20 @@ async def incoming_call_handler() -> Response: for event_dict in await request.json: event = EventGridEvent.from_dict(event_dict) app.logger.info("incoming event data --> %s", event.data) + if event.event_type == SystemEventNames.EventGridSubscriptionValidationEventName: app.logger.info("Validating subscription") validation_code = event.data["validationCode"] validation_response = {"validationResponse": validation_code} return Response(response=json.dumps(validation_response), status=200) + if event.event_type == "Microsoft.Communication.IncomingCall": app.logger.info("Incoming call received: data=%s", event.data) - if event.data["from"]["kind"] == "phoneNumber": - caller_id = event.data["from"]["phoneNumber"]["value"] - else: - caller_id = event.data["from"]["rawId"] + caller_id = ( + event.data["from"]["phoneNumber"]["value"] + if event.data["from"]["kind"] == "phoneNumber" + else event.data["from"]["rawId"] + ) app.logger.info("incoming call handler caller id: %s", caller_id) incoming_call_context = event.data["incomingCallContext"] guid = 
uuid.uuid4() @@ -78,7 +231,6 @@ async def incoming_call_handler() -> Response: enable_bidirectional=True, audio_format=AudioFormat.PCM24_K_MONO, ) - answer_call_result = await acs_client.answer_call( incoming_call_context=incoming_call_context, operation_context="incomingCall", @@ -100,128 +252,37 @@ async def callbacks(contextId): app.logger.info( f"Received Event:-> {event['type']}, Correlation Id:-> {event_data['correlationId']}, CallConnectionId:-> {call_connection_id}" # noqa: E501 ) - if event["type"] == "Microsoft.Communication.CallConnected": - call_connection_properties = await acs_client.get_call_connection(call_connection_id).get_call_properties() - media_streaming_subscription = call_connection_properties.media_streaming_subscription - app.logger.info(f"MediaStreamingSubscription:--> {media_streaming_subscription}") - app.logger.info(f"Received CallConnected event for connection id: {call_connection_id}") - app.logger.info("CORRELATION ID:--> %s", event_data["correlationId"]) - app.logger.info("CALL CONNECTION ID:--> %s", event_data["callConnectionId"]) - elif ( - event["type"] == "Microsoft.Communication.MediaStreamingStarted" - or event["type"] == "Microsoft.Communication.MediaStreamingStopped" - ): - app.logger.info(f"Media streaming content type:--> {event_data['mediaStreamingUpdate']['contentType']}") - app.logger.info(f"Media streaming status:--> {event_data['mediaStreamingUpdate']['mediaStreamingStatus']}") - app.logger.info( - f"Media streaming status details:--> {event_data['mediaStreamingUpdate']['mediaStreamingStatusDetails']}" # noqa: E501 - ) - elif event["type"] == "Microsoft.Communication.MediaStreamingFailed": - app.logger.info( - f"Code:->{event_data['resultInformation']['code']}, Subcode:-> {event_data['resultInformation']['subCode']}" # noqa: E501 - ) - app.logger.info(f"Message:->{event_data['resultInformation']['message']}") - elif event["type"] == "Microsoft.Communication.CallDisconnected": - pass - return Response(status=200) - - 
-# WebSocket. -@app.websocket("/ws") -async def ws(): - print("Client connected to WebSocket") - kernel = create_kernel() - - client = AzureRealtime("websocket") - settings = OpenAIRealtimeExecutionSettings( - instructions="""You are a chat bot. Your name is Mosscap and - you have one goal: figure out what people need. - Your full name, should you need to know it, is - Splendid Speckled Mosscap. You communicate - effectively, but you tend to answer with long - flowery prose.""", - turn_detection=TurnDetection(type="server_vad"), - voice="shimmer", - input_audio_format="pcm16", - output_audio_format="pcm16", - input_audio_transcription=InputAudioTranscription(model="whisper-1"), - function_choice_behavior=FunctionChoiceBehavior.Auto(), - ) - receive_task = asyncio.create_task(receive_messages(client, settings, kernel)) - while True: - try: - # Receive data from the client - stream_data = await websocket.receive() - data = json.loads(stream_data) - kind = data["kind"] - if kind == "AudioData": - await client.send( - event=RealtimeEvent( - service_type=SendEvents.INPUT_AUDIO_BUFFER_APPEND, - service_event={"audio": data["audioData"]["data"]}, - ) + match event["type"]: + case "Microsoft.Communication.CallConnected": + call_connection_properties = await acs_client.get_call_connection( + call_connection_id + ).get_call_properties() + media_streaming_subscription = call_connection_properties.media_streaming_subscription + app.logger.info(f"MediaStreamingSubscription:--> {media_streaming_subscription}") + app.logger.info(f"Received CallConnected event for connection id: {call_connection_id}") + app.logger.info("CORRELATION ID:--> %s", event_data["correlationId"]) + app.logger.info("CALL CONNECTION ID:--> %s", event_data["callConnectionId"]) + case "Microsoft.Communication.MediaStreamingStarted" | "Microsoft.Communication.MediaStreamingStopped": + app.logger.info(f"Media streaming content type:--> {event_data['mediaStreamingUpdate']['contentType']}") + app.logger.info( + 
f"Media streaming status:--> {event_data['mediaStreamingUpdate']['mediaStreamingStatus']}" ) - except Exception: - print("Websocket connection closed.") - break - receive_task.cancel() - - -async def receive_messages( - client: RealtimeClientBase, - settings: OpenAIRealtimeExecutionSettings, - kernel: Kernel, -): - async with client( - settings=settings, - create_response=True, - kernel=kernel, - ): - async for event in client.receive(): - match event: - case RealtimeAudioEvent(): - await websocket.send( - json.dumps({"kind": "AudioData", "audioData": {"data": event.service_event.delta}}) - ) - case _: - match event.service_type: - case "session.created": - print("Session Created Message") - print(f" Session Id: {event.service_event.session.id}") - pass - case "error": - print(f" Error: {event.service_event.error}") - pass - case "input_audio_buffer.cleared": - print("Input Audio Buffer Cleared Message") - pass - case "input_audio_buffer.speech_started": - print(f"Voice activity detection started at {event.service_event.audio_start_ms} [ms]") - await websocket.send(json.dumps({"Kind": "StopAudio", "AudioData": None, "StopAudio": {}})) - pass - case "input_audio_buffer.speech_stopped": - pass - case "conversation.item.input_audio_transcription.completed": - print(f" User:-- {event.service_event.transcript}") - case "conversation.item.input_audio_transcription.failed": - print(f" Error: {event.service_event.error}") - case "response.done": - print("Response Done Message") - print(f" Response Id: {event.service_event.response.id}") - if event.service_event.response.status_details: - print( - f" Status Details: {event.service_event.response.status_details.model_dump_json()}" - ) - case "response.audio_transcript.done": - print(f" AI:-- {event.service_event.transcript}") - - case _: - pass + app.logger.info( + f"Media streaming status details:--> {event_data['mediaStreamingUpdate']['mediaStreamingStatusDetails']}" # noqa: E501 + ) + case 
"Microsoft.Communication.MediaStreamingFailed": + app.logger.info( + f"Code:->{event_data['resultInformation']['code']}, Subcode:-> {event_data['resultInformation']['subCode']}" # noqa: E501 + ) + app.logger.info(f"Message:->{event_data['resultInformation']['message']}") + case "Microsoft.Communication.CallDisconnected": + pass + return Response(status=200) @app.route("/") def home(): - return "Hello ACS CallAutomation!" + return "Hello SKxACS CallAutomation!" if __name__ == "__main__": diff --git a/python/samples/demos/call_automation/readme.md b/python/samples/demos/call_automation/readme.md index 066c922f4d35..08d5ed6db385 100644 --- a/python/samples/demos/call_automation/readme.md +++ b/python/samples/demos/call_automation/readme.md @@ -1,31 +1,26 @@ -| page_type | languages | products | -| --------- | --------------------------------------- | --------------------------------------------------------------------------- | -| sample |
Python
|
azureazure-communication-services
| - # Call Automation - Quick Start Sample -This is a sample application demonstrated during Microsoft Ignite 2024. It highlights an integration of Azure Communication Services with Azure OpenAI Service to enable intelligent conversational agents. +This is a sample application. It highlights an integration of Azure Communication Services with Semantic Kernel, using the Azure OpenAI Service to enable intelligent conversational agents. ## Prerequisites - An Azure account with an active subscription. [Create an account for free](https://azure.microsoft.com/free/?WT.mc_id=A261C142F). - A deployed Communication Services resource. [Create a Communication Services resource](https://docs.microsoft.com/azure/communication-services/quickstarts/create-communication-resource). - A [phone number](https://learn.microsoft.com/en-us/azure/communication-services/quickstarts/telephony/get-phone-number) in your Azure Communication Services resource that can get inbound calls. NB: phone numbers are not available in free subscriptions. -- [Python](https://www.python.org/downloads/) 3.7 or above. +- [Python](https://www.python.org/downloads/) 3.9 or above. - An Azure OpenAI Resource and Deployed Model. See [instructions](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/create-resource?pivots=web-portal). ## Before running the sample for the first time 1. Open an instance of PowerShell, Windows Terminal, Command Prompt or equivalent and navigate to the directory that you would like to clone the sample to. -2. git clone `https://github.com/Azure-Samples/communication-services-python-quickstarts.git`. -3. Navigate to `callautomation-azure-openai-voice` folder and open `main.py` file. +2. git clone `https://github.com/microsoft/semantic-kernel.git`. +3. Navigate to `python/samples/demos/call_automation` folder and open `main.py` file. 
### Setup the Python environment Create and activate python virtual environment and install required packages using following command ``` pip install -r requirements.txt -pip install -r ./aoai-whl/rtclient-0.5.1-py3-none-any.whl ``` ### Setup and host your Azure DevTunnel @@ -40,21 +35,19 @@ devtunnel host ### Configuring application -Open `main.py` file to configure the following settings +Copy the `.env.example` file to `.env` and update the following values: 1. `ACS_CONNECTION_STRING`: Azure Communication Service resource's connection string. 2. `CALLBACK_URI_HOST`: Base url of the app. (For local development use dev tunnel url) - -Open `azureOpenAIService.py` file to configure the following settings - 1. `AZURE_OPENAI_SERVICE_ENDPOINT`: Azure Open AI service endpoint -2. `AZURE_OPENAI_SERVICE_KEY`: Azure Open AI service key -3. `AZURE_OPENAI_DEPLOYMENT_MODEL_NAME`: Azure Open AI deployment name +2. `AZURE_OPENAI_DEPLOYMENT_MODEL_NAME`: Azure Open AI deployment name +3. 'AZURE_OPENAI_API_VERSION': Azure Open AI API version, this should be one that includes the realtime api, for instance '2024-10-01-preview' +4. `AZURE_OPENAI_SERVICE_KEY`: Azure Open AI service key, optionally, you can also use Entra Auth. ## Run app locally -1. Navigate to `callautomation-azure-openai-voice` folder and run `main.py` in debug mode or use command `python ./main.py` to run it from PowerShell, Command Prompt or Unix Terminal +1. Navigate to `call_automation` folder and run `main.py` in debug mode or use command `python ./main.py` to run it from PowerShell, Command Prompt or Unix Terminal 2. Browser should pop up with the below page. If not navigate it to `http://localhost:8080/`or your dev tunnel url. 3. Register an EventGrid Webhook for the IncomingCall(`https:///api/incomingCall`) event that points to your devtunnel URI. Instructions [here](https://learn.microsoft.com/en-us/azure/communication-services/concepts/call-automation/incoming-call-notification). 
-Once that's completed you should have a running application. The best way to test this is to place a call to your ACS phone number and talk to your intelligent agent. \ No newline at end of file +Once that's completed you should have a running application. The best way to test this is to place a call to your ACS phone number and talk to your intelligent agent. diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py index e5776c0f8190..dbda3f3630af 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py @@ -255,7 +255,7 @@ async def send(self, event: RealtimeEvent, **kwargs: Any) -> None: case RealtimeAudioEvent(): await self._send( _create_openai_realtime_client_event( - event_type=SendEvents.INPUT_AUDIO_BUFFER_APPEND, audio=event.audio.to_base64_bytestring() + event_type=SendEvents.INPUT_AUDIO_BUFFER_APPEND, audio=event.audio.data_string ) ) case RealtimeTextEvent(): diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py index 79fdd0371a90..871ef1dd8599 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py @@ -49,11 +49,10 @@ async def receive( async for event in self.connection: if event.type == ListenEvents.RESPONSE_AUDIO_DELTA.value: - audio_bytes = base64.b64decode(event.delta) if self.audio_output_callback: - await self.audio_output_callback(np.frombuffer(audio_bytes, dtype=np.int16)) + await self.audio_output_callback(np.frombuffer(base64.b64decode(event.delta), dtype=np.int16)) yield 
RealtimeAudioEvent( - audio=AudioContent(data=audio_bytes, data_format="base64", inner_content=event), + audio=AudioContent(data=event.delta, data_format="base64", inner_content=event), service_type=event.type, service_event=event, ) diff --git a/python/semantic_kernel/contents/binary_content.py b/python/semantic_kernel/contents/binary_content.py index b2b47dc6e0ef..a749272f39e7 100644 --- a/python/semantic_kernel/contents/binary_content.py +++ b/python/semantic_kernel/contents/binary_content.py @@ -111,6 +111,14 @@ def data_uri(self) -> str: return self._data_uri.to_string(self.metadata) return "" + @computed_field # type: ignore + @property + def data_string(self) -> str: + """Returns the data as a string, using the data format.""" + if self._data_uri: + return self._data_uri._data_str() + return "" + @data_uri.setter def data_uri(self, value: str): """Set the data uri.""" @@ -193,13 +201,3 @@ def write_to_file(self, path: str | FilePath) -> None: def to_dict(self) -> dict[str, Any]: """Convert the instance to a dictionary.""" return {"type": "binary", "binary": {"uri": str(self)}} - - def to_base64_bytestring(self, encoding: str = "utf-8") -> str: - """Convert the instance to a bytestring.""" - if self._data_uri and self._data_uri.data_array is not None: - return b64encode(self._data_uri.data_array.tobytes()).decode(encoding) - if self._data_uri and self._data_uri.data_bytes: - return self._data_uri.data_bytes.decode(encoding) - if self._data_uri and self._data_uri.data_str: - return self._data_uri.data_str - return "" From b0334f27f9d816feba09e1d8f5eea764177e3281 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Mon, 17 Feb 2025 14:34:34 +0100 Subject: [PATCH 32/50] remove computed field --- python/semantic_kernel/contents/binary_content.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/semantic_kernel/contents/binary_content.py b/python/semantic_kernel/contents/binary_content.py index a749272f39e7..934b92acff65 100644 --- 
a/python/semantic_kernel/contents/binary_content.py +++ b/python/semantic_kernel/contents/binary_content.py @@ -111,7 +111,6 @@ def data_uri(self) -> str: return self._data_uri.to_string(self.metadata) return "" - @computed_field # type: ignore @property def data_string(self) -> str: """Returns the data as a string, using the data format.""" From b35661aa9a3d699f7dde76542a0a07b4d3891dc5 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Mon, 17 Feb 2025 16:50:38 +0100 Subject: [PATCH 33/50] cleanup --- python/pyproject.toml | 5 ++-- ...py => 01a-chat_with_realtime_websocket.py} | 18 +++++-------- ...tc.py => 01b-chat_with_realtime_webrtc.py} | 9 +++---- ...a-chat_with_function_calling_websocket.py} | 8 +++--- ... 02b-chat_with_function_calling_webrtc.py} | 5 ++-- python/samples/demos/call_automation/main.py | 17 +++++++----- .../samples/demos/call_automation/readme.md | 6 ++++- .../demos/call_automation/requirements.txt | 1 - .../realtime/open_ai_realtime_base.py | 9 ++++--- .../realtime/open_ai_realtime_webrtc.py | 7 +++-- .../realtime/open_ai_realtime_websocket.py | 4 +-- .../connectors/ai/realtime_client_base.py | 9 +++---- .../semantic_kernel/contents/audio_content.py | 5 ---- .../contents/events/__init__.py | 2 ++ .../contents/events/realtime_event.py | 14 +++++++++- python/uv.lock | 27 +++++-------------- 16 files changed, 68 insertions(+), 78 deletions(-) rename python/samples/concepts/realtime/{01-chat_with_realtime_websocket.py => 01a-chat_with_realtime_websocket.py} (88%) rename python/samples/concepts/realtime/{01-chat_with_realtime_webrtc.py => 01b-chat_with_realtime_webrtc.py} (93%) rename python/samples/concepts/realtime/{02-azure_chat_with_function_calling.py => 02a-chat_with_function_calling_websocket.py} (95%) rename python/samples/concepts/realtime/{02-chat_with_function_calling.py => 02b-chat_with_function_calling_webrtc.py} (98%) diff --git a/python/pyproject.toml b/python/pyproject.toml index a46cc1f92ec2..b6785a40dfb8 100644 --- 
a/python/pyproject.toml +++ b/python/pyproject.toml @@ -128,10 +128,9 @@ dapr = [ "dapr-ext-fastapi>=1.14.0", "flask-dapr>=1.14.0" ] -openai_realtime = [ - "openai[realtime] ~= 1.0", +realtime = [ + "websockets >= 13, < 15", "aiortc>=1.9.0", - "sounddevice>=0.5.1", ] [tool.uv] diff --git a/python/samples/concepts/realtime/01-chat_with_realtime_websocket.py b/python/samples/concepts/realtime/01a-chat_with_realtime_websocket.py similarity index 88% rename from python/samples/concepts/realtime/01-chat_with_realtime_websocket.py rename to python/samples/concepts/realtime/01a-chat_with_realtime_websocket.py index d5d3624b9316..7dbfa7e06ef5 100644 --- a/python/samples/concepts/realtime/01-chat_with_realtime_websocket.py +++ b/python/samples/concepts/realtime/01a-chat_with_realtime_websocket.py @@ -10,7 +10,7 @@ OpenAIRealtimeExecutionSettings, TurnDetection, ) -from semantic_kernel.contents.events import RealtimeTextEvent +from semantic_kernel.contents.events import RealtimeAudioEvent, RealtimeTextEvent logging.basicConfig(level=logging.WARNING) utils_log = logging.getLogger("samples.concepts.realtime.utils") @@ -21,12 +21,11 @@ # This simple sample demonstrates how to use the OpenAI Realtime API to create # a chat bot that can listen and respond directly through audio. # It requires installing: -# - semantic-kernel[openai_realtime] +# - semantic-kernel[realtime] # - pyaudio # - sounddevice # - pydub -# - aiortc -# e.g. pip install pyaudio sounddevice pydub +# e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime] # The characterics of your speaker and microphone are a big factor in a smooth conversation # so you may need to try out different devices for each. 
@@ -41,11 +40,8 @@ async def main() -> None: # create the realtime client and optionally add the audio output function, this is optional # you can define the protocol to use, either "websocket" or "webrtc" # they will behave the same way, even though the underlying protocol is quite different + realtime_client = AzureRealtime("websocket") audio_player = AudioPlayerWebsocket() - realtime_client = AzureRealtime( - "websocket", - audio_output_callback=audio_player.client_callback, - ) audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client) # Create the settings for the session settings = OpenAIRealtimeExecutionSettings( @@ -64,10 +60,10 @@ async def main() -> None: async with audio_player, audio_recorder, realtime_client(settings=settings, create_response=True): async for event in realtime_client.receive(): match event: - # this can be used as an alternative to the callback function used above, + # this can be used as an alternative to the callback function used in other samples, # the callback is faster and smoother - # case RealtimeAudioEvent(): - # await audio_player.add_audio(event.audio) + case RealtimeAudioEvent(): + await audio_player.add_audio(event.audio) case RealtimeTextEvent(): print(event.text.text, end="") case _: diff --git a/python/samples/concepts/realtime/01-chat_with_realtime_webrtc.py b/python/samples/concepts/realtime/01b-chat_with_realtime_webrtc.py similarity index 93% rename from python/samples/concepts/realtime/01-chat_with_realtime_webrtc.py rename to python/samples/concepts/realtime/01b-chat_with_realtime_webrtc.py index 298f7b242072..d6a6a6f37d81 100644 --- a/python/samples/concepts/realtime/01-chat_with_realtime_webrtc.py +++ b/python/samples/concepts/realtime/01b-chat_with_realtime_webrtc.py @@ -25,12 +25,11 @@ # This simple sample demonstrates how to use the OpenAI Realtime API to create # a chat bot that can listen and respond directly through audio. 
# It requires installing: -# - semantic-kernel[openai_realtime] +# - semantic-kernel[realtime] # - pyaudio # - sounddevice # - pydub -# - aiortc -# e.g. pip install pyaudio sounddevice pydub +# e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime] # The characterics of your speaker and microphone are a big factor in a smooth conversation # so you may need to try out different devices for each. @@ -67,9 +66,7 @@ async def main() -> None: # the context manager calls the create_session method on the client and start listening to the audio stream async with audio_player, realtime_client(settings=settings, create_response=True): async for event in realtime_client.receive(): - match event: - # case RealtimeAudioEvent(): - # await audio_player.add_audio(event.audio) + match event.service_type: case RealtimeTextEvent(): print(event.text.text, end="") case _: diff --git a/python/samples/concepts/realtime/02-azure_chat_with_function_calling.py b/python/samples/concepts/realtime/02a-chat_with_function_calling_websocket.py similarity index 95% rename from python/samples/concepts/realtime/02-azure_chat_with_function_calling.py rename to python/samples/concepts/realtime/02a-chat_with_function_calling_websocket.py index 7d751eee6a7c..63daa2d7f817 100644 --- a/python/samples/concepts/realtime/02-azure_chat_with_function_calling.py +++ b/python/samples/concepts/realtime/02a-chat_with_function_calling_websocket.py @@ -24,12 +24,11 @@ # This simple sample demonstrates how to use the OpenAI Realtime API to create # a chat bot that can listen and respond directly through audio. # It requires installing: -# - semantic-kernel[openai_realtime] +# - semantic-kernel[realtime] # - pyaudio # - sounddevice # - pydub -# - aiortc -# e.g. pip install pyaudio sounddevice pydub +# e.g. 
pip install pyaudio sounddevice pydub semantic_kernel[realtime] @kernel_function @@ -64,8 +63,9 @@ async def main() -> None: # create the audio player and audio track # both take a device_id parameter, which is the index of the device to use, if None the default device is used audio_player = AudioPlayerWebsocket() - # create the realtime client and optionally add the audio output function, this is optional + # create the realtime client and add the audio output function, this is optional # you can define the protocol to use, either "websocket" or "webrtc" + # (at this time Azure only supports websockets) # they will behave the same way, even though the underlying protocol is quite different realtime_client = AzureRealtime( protocol="websocket", diff --git a/python/samples/concepts/realtime/02-chat_with_function_calling.py b/python/samples/concepts/realtime/02b-chat_with_function_calling_webrtc.py similarity index 98% rename from python/samples/concepts/realtime/02-chat_with_function_calling.py rename to python/samples/concepts/realtime/02b-chat_with_function_calling_webrtc.py index fe7f94870133..8c3afdfefce3 100644 --- a/python/samples/concepts/realtime/02-chat_with_function_calling.py +++ b/python/samples/concepts/realtime/02b-chat_with_function_calling_webrtc.py @@ -31,12 +31,11 @@ # This simple sample demonstrates how to use the OpenAI Realtime API to create # a chat bot that can listen and respond directly through audio. # It requires installing: -# - semantic-kernel[openai_realtime] +# - semantic-kernel[realtime] # - pyaudio # - sounddevice # - pydub -# - aiortc -# e.g. pip install pyaudio sounddevice pydub +# e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime] # The characterics of your speaker and microphone are a big factor in a smooth conversation # so you may need to try out different devices for each. 
diff --git a/python/samples/demos/call_automation/main.py b/python/samples/demos/call_automation/main.py index 1acf650a6225..a21caabce12c 100755 --- a/python/samples/demos/call_automation/main.py +++ b/python/samples/demos/call_automation/main.py @@ -14,7 +14,7 @@ # "Quart", # "azure-eventgrid", # "azure-communication-callautomation==1.4.0b1", -# "semantic-kernel[openai_realtime]", +# "semantic-kernel[realtime]", # ] # # [tool.uv.sources] @@ -42,16 +42,16 @@ from quart import Quart, Response, json, request, websocket from semantic_kernel import Kernel -from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceBehavior +from semantic_kernel.connectors.ai import FunctionChoiceBehavior from semantic_kernel.connectors.ai.open_ai import ( InputAudioTranscription, + ListenEvents, + OpenAIRealtime, OpenAIRealtimeExecutionSettings, TurnDetection, ) -from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime import OpenAIRealtime -from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase -from semantic_kernel.contents.audio_content import AudioContent +from semantic_kernel.contents import AudioContent from semantic_kernel.contents.events import RealtimeAudioEvent from semantic_kernel.functions import kernel_function @@ -94,7 +94,7 @@ async def goodbye(self): kernel.add_plugin(plugin=HelperPlugin(), plugin_name="helpers", description="Helper functions for the realtime client.") -# region: handlers for audio and data streams +# region: Handlers for audio and data streams async def from_realtime_to_acs(audio: ndarray): @@ -155,7 +155,7 @@ async def handle_realtime_messages(client: RealtimeClientBase): print(f" AI:-- {event.service_event.transcript}") -# region: Quart routes +# region: Routes # WebSocket. @@ -285,6 +285,9 @@ def home(): return "Hello SKxACS CallAutomation!" 
+# region: Main + + if __name__ == "__main__": app.logger.setLevel(INFO) app.run(port=8080) diff --git a/python/samples/demos/call_automation/readme.md b/python/samples/demos/call_automation/readme.md index 08d5ed6db385..ffd8771471cd 100644 --- a/python/samples/demos/call_automation/readme.md +++ b/python/samples/demos/call_automation/readme.md @@ -22,6 +22,7 @@ Create and activate python virtual environment and install required packages usi ``` pip install -r requirements.txt ``` +Alternatively, if you have `uv` installed, you can skip this step. ### Setup and host your Azure DevTunnel @@ -46,7 +47,10 @@ Copy the `.env.example` file to `.env` and update the following values: ## Run app locally -1. Navigate to `call_automation` folder and run `main.py` in debug mode or use command `python ./main.py` to run it from PowerShell, Command Prompt or Unix Terminal +1. Navigate to `call_automation` folder and do one of the following to start the main application: + - run `main.py` in debug mode from your IDE + - use command `python ./main.py` to run it from PowerShell, Command Prompt or Unix Terminal. + - execute `./main.py` directly (this uses `uv`, which will then install the requirements in a temporary virtual environment). 2. Browser should pop up with the below page. If not navigate it to `http://localhost:8080/`or your dev tunnel url. 3. Register an EventGrid Webhook for the IncomingCall(`https:///api/incomingCall`) event that points to your devtunnel URI. Instructions [here](https://learn.microsoft.com/en-us/azure/communication-services/concepts/call-automation/incoming-call-notification). 
diff --git a/python/samples/demos/call_automation/requirements.txt b/python/samples/demos/call_automation/requirements.txt index 5c024fddac08..2bb034fdffc6 100644 --- a/python/samples/demos/call_automation/requirements.txt +++ b/python/samples/demos/call_automation/requirements.txt @@ -1,5 +1,4 @@ Quart>=0.19.6 azure-eventgrid==4.11.0 -aiohttp>= 3.11.9 azure-communication-callautomation==1.4.0b1 semantic-kernel \ No newline at end of file diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py index dbda3f3630af..30981624ebbe 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py @@ -39,6 +39,7 @@ from semantic_kernel.contents.events.realtime_event import ( RealtimeAudioEvent, RealtimeEvent, + RealtimeEvents, RealtimeFunctionCallEvent, RealtimeFunctionResultEvent, RealtimeTextEvent, @@ -69,7 +70,7 @@ class OpenAIRealtimeBase(OpenAIHandler, RealtimeClientBase): _current_settings: PromptExecutionSettings | None = PrivateAttr(default=None) _call_id_to_function_map: dict[str, str] = PrivateAttr(default_factory=dict) - async def _parse_event(self, event: RealtimeServerEvent) -> AsyncGenerator[RealtimeEvent, None]: + async def _parse_event(self, event: RealtimeServerEvent) -> AsyncGenerator[RealtimeEvents, None]: """Handle all events but audio delta. Audio delta has to be handled by the implementation of the protocol as some @@ -194,7 +195,7 @@ async def update_session( async def _parse_function_call_arguments_done( self, event: ResponseFunctionCallArgumentsDoneEvent, - ) -> AsyncGenerator[RealtimeEvent | None]: + ) -> AsyncGenerator[RealtimeEvents | None]: """Handle response function call done. 
This always yields at least 1 event, either a RealtimeEvent or a RealtimeFunctionResultEvent with the raw event. @@ -250,7 +251,7 @@ async def _send(self, event: RealtimeClientEvent) -> None: raise NotImplementedError @override - async def send(self, event: RealtimeEvent, **kwargs: Any) -> None: + async def send(self, event: RealtimeEvents, **kwargs: Any) -> None: match event: case RealtimeAudioEvent(): await self._send( @@ -455,7 +456,7 @@ async def create_session( pass @override - def receive(self, **kwargs: Any) -> AsyncGenerator[RealtimeEvent, None]: + def receive(self, **kwargs: Any) -> AsyncGenerator[RealtimeEvents, None]: pass @override diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py index 003a8699544d..ea4dff1df601 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py @@ -31,8 +31,7 @@ from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_base import OpenAIRealtimeBase from semantic_kernel.contents.audio_content import AudioContent -from semantic_kernel.contents.events import RealtimeEvent -from semantic_kernel.contents.events.realtime_event import RealtimeAudioEvent +from semantic_kernel.contents.events.realtime_event import RealtimeAudioEvent, RealtimeEvents from semantic_kernel.utils.experimental_decorator import experimental_class if TYPE_CHECKING: @@ -51,13 +50,13 @@ class OpenAIRealtimeWebRTCBase(OpenAIRealtimeBase): peer_connection: RTCPeerConnection | None = None data_channel: RTCDataChannel | None = None audio_track: MediaStreamTrack | None = None - _receive_buffer: asyncio.Queue[RealtimeEvent] = PrivateAttr(default_factory=asyncio.Queue) + 
_receive_buffer: asyncio.Queue[RealtimeEvents] = PrivateAttr(default_factory=asyncio.Queue) @override async def receive( self, **kwargs: Any, - ) -> AsyncGenerator[RealtimeEvent, None]: + ) -> AsyncGenerator[RealtimeEvents, None]: while True: event = await self._receive_buffer.get() yield event diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py index 871ef1dd8599..63485e4ca88c 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py @@ -20,7 +20,7 @@ from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_base import OpenAIRealtimeBase from semantic_kernel.contents.audio_content import AudioContent -from semantic_kernel.contents.events.realtime_event import RealtimeAudioEvent, RealtimeEvent +from semantic_kernel.contents.events.realtime_event import RealtimeAudioEvent, RealtimeEvents from semantic_kernel.utils.experimental_decorator import experimental_class if TYPE_CHECKING: @@ -42,7 +42,7 @@ class OpenAIRealtimeWebsocketBase(OpenAIRealtimeBase): async def receive( self, **kwargs: Any, - ) -> AsyncGenerator[RealtimeEvent, None]: + ) -> AsyncGenerator[RealtimeEvents, None]: await self.connected.wait() if not self.connection: raise ValueError("Connection is not established.") diff --git a/python/semantic_kernel/connectors/ai/realtime_client_base.py b/python/semantic_kernel/connectors/ai/realtime_client_base.py index c77782a3a578..7857bf1f707a 100644 --- a/python/semantic_kernel/connectors/ai/realtime_client_base.py +++ b/python/semantic_kernel/connectors/ai/realtime_client_base.py @@ -5,20 +5,19 @@ from collections.abc import AsyncGenerator, Callable, Coroutine from 
typing import Any, ClassVar -from pydantic import PrivateAttr - if sys.version_info >= (3, 11): from typing import Self # pragma: no cover else: from typing_extensions import Self # pragma: no cover from numpy import ndarray +from pydantic import PrivateAttr from semantic_kernel.connectors.ai.function_call_choice_configuration import FunctionCallChoiceConfiguration from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceType from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings from semantic_kernel.contents.chat_history import ChatHistory -from semantic_kernel.contents.events.realtime_event import RealtimeEvent +from semantic_kernel.contents.events.realtime_event import RealtimeEvents from semantic_kernel.services.ai_service_client_base import AIServiceClientBase from semantic_kernel.utils.experimental_decorator import experimental_class @@ -34,7 +33,7 @@ class RealtimeClientBase(AIServiceClientBase, ABC): _create_kwargs: dict[str, Any] | None = PrivateAttr(default=None) @abstractmethod - async def send(self, event: RealtimeEvent) -> None: + async def send(self, event: RealtimeEvents) -> None: """Send an event to the service. Args: @@ -47,7 +46,7 @@ async def send(self, event: RealtimeEvent) -> None: def receive( self, **kwargs: Any, - ) -> AsyncGenerator[RealtimeEvent, None]: + ) -> AsyncGenerator[RealtimeEvents, None]: """Starts listening for messages from the service, generates events. 
Args: diff --git a/python/semantic_kernel/contents/audio_content.py b/python/semantic_kernel/contents/audio_content.py index 5f356218ba2b..12bb47af9f64 100644 --- a/python/semantic_kernel/contents/audio_content.py +++ b/python/semantic_kernel/contents/audio_content.py @@ -86,8 +86,3 @@ def from_audio_file(cls: type[_T], path: str) -> _T: def to_dict(self) -> dict[str, Any]: """Convert the instance to a dictionary.""" return {"type": "audio_url", "audio_url": {"uri": str(self)}} - - @classmethod - def from_ndarray(cls: type[_T], data: ndarray, mime_type: str) -> _T: - """Create an instance from an ndarray.""" - return cls(data=data, mime_type=mime_type) diff --git a/python/semantic_kernel/contents/events/__init__.py b/python/semantic_kernel/contents/events/__init__.py index 1da1f993c4c3..445371ee6cbc 100644 --- a/python/semantic_kernel/contents/events/__init__.py +++ b/python/semantic_kernel/contents/events/__init__.py @@ -3,6 +3,7 @@ from semantic_kernel.contents.events.realtime_event import ( RealtimeAudioEvent, RealtimeEvent, + RealtimeEvents, RealtimeFunctionCallEvent, RealtimeFunctionResultEvent, RealtimeImageEvent, @@ -12,6 +13,7 @@ __all__ = [ "RealtimeAudioEvent", "RealtimeEvent", + "RealtimeEvents", "RealtimeFunctionCallEvent", "RealtimeFunctionResultEvent", "RealtimeImageEvent", diff --git a/python/semantic_kernel/contents/events/realtime_event.py b/python/semantic_kernel/contents/events/realtime_event.py index 7ebfa231fa08..d74287d5ccf4 100644 --- a/python/semantic_kernel/contents/events/realtime_event.py +++ b/python/semantic_kernel/contents/events/realtime_event.py @@ -1,6 +1,6 @@ # Copyright (c) Microsoft. All rights reserved. 
-from typing import Any, ClassVar, Literal +from typing import Annotated, Any, ClassVar, Literal, Union from pydantic import Field @@ -11,6 +11,18 @@ from semantic_kernel.contents.text_content import TextContent from semantic_kernel.kernel_pydantic import KernelBaseModel +RealtimeEvents = Annotated[ + Union[ + "RealtimeEvent", + "RealtimeAudioEvent", + "RealtimeTextEvent", + "RealtimeFunctionCallEvent", + "RealtimeFunctionResultEvent", + "RealtimeImageEvent", + ], + Field(discriminator="event_type"), +] + class RealtimeEvent(KernelBaseModel): """Base class for all service events.""" diff --git a/python/uv.lock b/python/uv.lock index bd37dfbcc86f..91de1ef9b2f9 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -5035,11 +5035,6 @@ ollama = [ onnx = [ { name = "onnxruntime-genai", marker = "(python_full_version < '3.13' and sys_platform == 'darwin') or (python_full_version < '3.13' and sys_platform == 'linux') or (python_full_version < '3.13' and sys_platform == 'win32')" }, ] -openai-realtime = [ - { name = "aiortc", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "openai", extra = ["realtime"], marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "sounddevice", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, -] pandas = [ { name = "pandas", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] @@ -5053,6 +5048,10 @@ qdrant = [ { name = "qdrant-client", version = "1.12.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.13' and sys_platform == 'darwin') or (python_full_version >= '3.13' and sys_platform == 'linux') or (python_full_version >= '3.13' and sys_platform == 'win32')" }, { name = "qdrant-client", version = "1.13.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.13' and sys_platform == 
'darwin') or (python_full_version < '3.13' and sys_platform == 'linux') or (python_full_version < '3.13' and sys_platform == 'win32')" }, ] +realtime = [ + { name = "aiortc", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "websockets", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, +] redis = [ { name = "redis", extra = ["hiredis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "redisvl", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, @@ -5085,7 +5084,7 @@ dev = [ [package.metadata] requires-dist = [ { name = "aiohttp", specifier = "~=3.8" }, - { name = "aiortc", marker = "extra == 'openai-realtime'", specifier = ">=1.9.0" }, + { name = "aiortc", marker = "extra == 'realtime'", specifier = ">=1.9.0" }, { name = "anthropic", marker = "extra == 'anthropic'", specifier = "~=0.32" }, { name = "autogen-agentchat", marker = "extra == 'autogen'", specifier = ">=0.2,<0.4" }, { name = "azure-ai-inference", marker = "extra == 'azure'", specifier = ">=1.0.0b6" }, @@ -5138,6 +5137,7 @@ requires-dist = [ { name = "types-redis", marker = "extra == 'redis'", specifier = "~=4.6.0.20240425" }, { name = "usearch", marker = "extra == 'usearch'", specifier = "~=2.16" }, { name = "weaviate-client", marker = "extra == 'weaviate'", specifier = ">=4.10,<5.0" }, + { name = "websockets", marker = "extra == 'realtime'", specifier = ">=13,<15" }, ] provides-extras = ["anthropic", "autogen", "aws", "azure", "chroma", "dapr", "google", "hugging-face", "milvus", "mistralai", "mongo", "notebooks", "ollama", "onnx", "pandas", "pinecone", "postgres", "qdrant", "redis", "usearch", "weaviate"] @@ -5337,21 +5337,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a5/93/84a16940c44f6ec62cf334f25aed3128a514dffc361397eee09421a1c7f2/snoop-0.6.0-py3-none-any.whl", hash = 
"sha256:f5ea9060e65594bf404e6841086b4a964cc27bc30569109c91a470f948b0f729", size = 27461 }, ] -[[package]] -name = "sounddevice" -version = "0.5.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "cffi", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/80/2d/b04ae180312b81dbb694504bee170eada5372242e186f6298139fd3a0513/sounddevice-0.5.1.tar.gz", hash = "sha256:09ca991daeda8ce4be9ac91e15a9a81c8f81efa6b695a348c9171ea0c16cb041", size = 52896 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/06/d1/464b5fca3decdd0cfec8c47f7b4161a0b12972453201c1bf03811f367c5e/sounddevice-0.5.1-py3-none-any.whl", hash = "sha256:e2017f182888c3f3c280d9fbac92e5dbddac024a7e3442f6e6116bd79dab8a9c", size = 32276 }, - { url = "https://files.pythonhosted.org/packages/6f/f6/6703fe7cf3d7b7279040c792aeec6334e7305956aba4a80f23e62c8fdc44/sounddevice-0.5.1-py3-none-macosx_10_6_x86_64.macosx_10_6_universal2.whl", hash = "sha256:d16cb23d92322526a86a9490c427bf8d49e273d9ccc0bd096feecd229cde6031", size = 107916 }, - { url = "https://files.pythonhosted.org/packages/57/a5/78a5e71f5ec0faedc54f4053775d61407bfbd7d0c18228c7f3d4252fd276/sounddevice-0.5.1-py3-none-win32.whl", hash = "sha256:d84cc6231526e7a08e89beff229c37f762baefe5e0cc2747cbe8e3a565470055", size = 312494 }, - { url = "https://files.pythonhosted.org/packages/af/9b/15217b04f3b36d30de55fef542389d722de63f1ad81f9c72d8afc98cb6ab/sounddevice-0.5.1-py3-none-win_amd64.whl", hash = "sha256:4313b63f2076552b23ac3e0abd3bcfc0c1c6a696fc356759a13bd113c9df90f1", size = 363634 }, -] - [[package]] name = "soupsieve" version = "2.6" From acc7e2006c9860f4a18a20debaa08b35440abc9c Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Mon, 17 Feb 2025 16:59:09 +0100 Subject: [PATCH 34/50] small fix in sample --- .../concepts/realtime/01b-chat_with_realtime_webrtc.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) 
diff --git a/python/samples/concepts/realtime/01b-chat_with_realtime_webrtc.py b/python/samples/concepts/realtime/01b-chat_with_realtime_webrtc.py index d6a6a6f37d81..524f3f7a2112 100644 --- a/python/samples/concepts/realtime/01b-chat_with_realtime_webrtc.py +++ b/python/samples/concepts/realtime/01b-chat_with_realtime_webrtc.py @@ -10,7 +10,6 @@ OpenAIRealtimeExecutionSettings, TurnDetection, ) -from semantic_kernel.contents.events.realtime_event import RealtimeTextEvent logging.basicConfig(level=logging.WARNING) utils_log = logging.getLogger("samples.concepts.realtime.utils") @@ -66,10 +65,10 @@ async def main() -> None: # the context manager calls the create_session method on the client and start listening to the audio stream async with audio_player, realtime_client(settings=settings, create_response=True): async for event in realtime_client.receive(): - match event.service_type: - case RealtimeTextEvent(): + match event.event_type: + case "text": print(event.text.text, end="") - case _: + case "service": # OpenAI Specific events if event.service_type == ListenEvents.SESSION_UPDATED: print("Session updated") From 35ce7936b42b3e342d478f5f7ba1f74fe38cf46a Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Mon, 17 Feb 2025 17:01:10 +0100 Subject: [PATCH 35/50] fix for binary content --- python/semantic_kernel/contents/binary_content.py | 2 +- python/semantic_kernel/contents/utils/data_uri.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/python/semantic_kernel/contents/binary_content.py b/python/semantic_kernel/contents/binary_content.py index 934b92acff65..1c43699fd999 100644 --- a/python/semantic_kernel/contents/binary_content.py +++ b/python/semantic_kernel/contents/binary_content.py @@ -130,7 +130,7 @@ def data_uri(self, value: str): @property def data(self) -> bytes | ndarray: """Get the data.""" - if self._data_uri and self._data_uri.data_array: + if self._data_uri and self._data_uri.data_array is not None: return 
self._data_uri.data_array.tobytes() if self._data_uri and self._data_uri.data_bytes: return self._data_uri.data_bytes diff --git a/python/semantic_kernel/contents/utils/data_uri.py b/python/semantic_kernel/contents/utils/data_uri.py index 03e75410d5e3..1695491e9110 100644 --- a/python/semantic_kernel/contents/utils/data_uri.py +++ b/python/semantic_kernel/contents/utils/data_uri.py @@ -152,8 +152,6 @@ def from_data_uri(cls: type[_T], data_uri: str | Url, default_mime_type: str = " def to_string(self, metadata: dict[str, str] = {}) -> str: """Return the data uri as a string.""" - if self.data_array: - data_str = self.data_array.tobytes().decode("utf-8") parameters = ";".join([f"{key}={val}" for key, val in metadata.items()]) parameters = f";{parameters}" if parameters else "" data_format = f"{self.data_format}" if self.data_format else "" From 7c6ad4911ea46b900f4ee13dd32d8242b3bac4e8 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Mon, 17 Feb 2025 17:03:57 +0100 Subject: [PATCH 36/50] additional experimental markers --- .../connectors/ai/open_ai/services/azure_realtime.py | 4 ++++ .../connectors/ai/open_ai/services/open_ai_realtime.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py b/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py index b8281fc950ec..4acb57c6cf84 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py @@ -17,6 +17,7 @@ ) from semantic_kernel.connectors.ai.open_ai.settings.azure_open_ai_settings import AzureOpenAISettings from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError +from semantic_kernel.utils.experimental_decorator import experimental_class if TYPE_CHECKING: from aiortc.mediastreams import MediaStreamTrack @@ -28,6 +29,7 @@ __all__ = ["AzureRealtime"] +@experimental_class class 
AzureRealtime(OpenAIRealtimeBase): """Azure OpenAI Realtime service.""" @@ -129,6 +131,7 @@ def __init__( ) +@experimental_class class AzureRealtimeWebRTC(AzureRealtime, OpenAIRealtimeWebRTCBase, AzureOpenAIConfigBase): """OpenAI Realtime service using WebRTC protocol. @@ -147,6 +150,7 @@ def __init__( raise NotImplementedError("Azure Realtime with WebRTC is not yet supported.") +@experimental_class class AzureRealtimeWebsocket(AzureRealtime, OpenAIRealtimeWebsocketBase, AzureOpenAIConfigBase): """OpenAI Realtime service using WebSocket protocol. diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py index 9d4c86ccd211..e1405a2b68e2 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py @@ -16,6 +16,7 @@ ) from semantic_kernel.connectors.ai.open_ai.settings.open_ai_settings import OpenAISettings from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError +from semantic_kernel.utils.experimental_decorator import experimental_class if TYPE_CHECKING: from aiortc.mediastreams import MediaStreamTrack @@ -27,6 +28,7 @@ __all__ = ["OpenAIRealtime"] +@experimental_class class OpenAIRealtime(OpenAIRealtimeBase): """OpenAI Realtime service.""" @@ -109,6 +111,7 @@ def __init__( ) +@experimental_class class OpenAIRealtimeWebRTC(OpenAIRealtime, OpenAIRealtimeWebRTCBase, OpenAIConfigBase): """OpenAI Realtime service using WebRTC protocol. @@ -130,6 +133,7 @@ def __init__( ) +@experimental_class class OpenAIRealtimeWebSocket(OpenAIRealtime, OpenAIRealtimeWebsocketBase, OpenAIConfigBase): """OpenAI Realtime service using WebSocket protocol. 
From ac856d454bc688fbf8c22a6cf158f6950c9895df Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Tue, 18 Feb 2025 09:02:32 +0100 Subject: [PATCH 37/50] fixed mypy --- .../ai/open_ai/services/azure_realtime.py | 2 +- .../ai/open_ai/services/open_ai_realtime.py | 2 +- .../realtime/open_ai_realtime_base.py | 39 ++++++------------- .../realtime/open_ai_realtime_websocket.py | 4 +- .../contents/binary_content.py | 14 +++---- 5 files changed, 22 insertions(+), 39 deletions(-) diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py b/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py index 4acb57c6cf84..a2e4b4fb5d81 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py @@ -33,7 +33,7 @@ class AzureRealtime(OpenAIRealtimeBase): """Azure OpenAI Realtime service.""" - def __new__(cls: type["_T"], protocol: str, *args: Any, **kwargs: Any) -> "_T": + def __new__(cls: type["_T"], protocol: Literal["websocket", "webrtc"], *args: Any, **kwargs: Any) -> "_T": """Pick the right subclass, based on protocol.""" subclass_map = {subcl.protocol: subcl for subcl in cls.__subclasses__()} subclass = subclass_map[protocol] diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py index e1405a2b68e2..ad1d0cd96957 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py @@ -32,7 +32,7 @@ class OpenAIRealtime(OpenAIRealtimeBase): """OpenAI Realtime service.""" - def __new__(cls: type["_T"], protocol: str, *args: Any, **kwargs: Any) -> "_T": + def __new__(cls: type["_T"], protocol: Literal["websocket", "webrtc"], *args: Any, **kwargs: Any) -> "_T": """Pick the right subclass, based on protocol.""" subclass_map = 
{subcl.protocol: subcl for subcl in cls.__subclasses__()} subclass = subclass_map[protocol] diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py index 30981624ebbe..53ef76aad811 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py @@ -87,36 +87,36 @@ async def _parse_event(self, event: RealtimeServerEvent) -> AsyncGenerator[Realt service_event=event, text=StreamingTextContent( inner_content=event, - text=event.delta, + text=event.delta, # type: ignore choice_index=0, ), ) case ListenEvents.RESPONSE_OUTPUT_ITEM_ADDED.value: - if event.item.type == "function_call" and event.item.call_id and event.item.name: - self._call_id_to_function_map[event.item.call_id] = event.item.name + if event.item.type == "function_call" and event.item.call_id and event.item.name: # type: ignore + self._call_id_to_function_map[event.item.call_id] = event.item.name # type: ignore yield RealtimeEvent(service_type=event.type, service_event=event) case ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DELTA.value: yield RealtimeFunctionCallEvent( service_type=event.type, service_event=event, function_call=FunctionCallContent( - id=event.item_id, - name=self._call_id_to_function_map[event.call_id], - arguments=event.delta, - index=event.output_index, - metadata={"call_id": event.call_id}, + id=event.item_id, # type: ignore + name=self._call_id_to_function_map[event.call_id], # type: ignore + arguments=event.delta, # type: ignore + index=event.output_index, # type: ignore + metadata={"call_id": event.call_id}, # type: ignore inner_content=event, ), ) case ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE.value: - async for parsed_event in self._parse_function_call_arguments_done(event): + async for parsed_event in 
self._parse_function_call_arguments_done(event): # type: ignore if parsed_event: yield parsed_event case ListenEvents.ERROR.value: - logger.error("Error received: %s", event.error) + logger.error("Error received: %s", event.error.model_dump_json()) # type: ignore yield RealtimeEvent(service_type=event.type, service_event=event) case ListenEvents.SESSION_CREATED.value | ListenEvents.SESSION_UPDATED.value: - logger.info("Session created or updated, session: %s", event.session) + logger.info("Session created or updated, session: %s", event.session.model_dump_json()) # type: ignore yield RealtimeEvent(service_type=event.type, service_event=event) case _: logger.debug(f"Received event: {event}") @@ -445,20 +445,3 @@ def _update_function_choice_settings_callback( self, ) -> Callable[[FunctionCallChoiceConfiguration, "PromptExecutionSettings", FunctionChoiceType], None]: return update_settings_from_function_call_configuration - - @override - async def create_session( - self, - chat_history: "ChatHistory | None" = None, - settings: "PromptExecutionSettings | None" = None, - **kwargs: Any, - ) -> None: - pass - - @override - def receive(self, **kwargs: Any) -> AsyncGenerator[RealtimeEvents, None]: - pass - - @override - async def close_session(self) -> None: - pass diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py index 63485e4ca88c..b31a6448a322 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py @@ -57,8 +57,8 @@ async def receive( service_event=event, ) continue - async for event in self._parse_event(event): - yield event + async for realtime_event in self._parse_event(event): + yield realtime_event async def _send(self, event: RealtimeClientEvent) -> None: await 
self.connected.wait() diff --git a/python/semantic_kernel/contents/binary_content.py b/python/semantic_kernel/contents/binary_content.py index 1c43699fd999..59c6ab71d6f4 100644 --- a/python/semantic_kernel/contents/binary_content.py +++ b/python/semantic_kernel/contents/binary_content.py @@ -111,13 +111,6 @@ def data_uri(self) -> str: return self._data_uri.to_string(self.metadata) return "" - @property - def data_string(self) -> str: - """Returns the data as a string, using the data format.""" - if self._data_uri: - return self._data_uri._data_str() - return "" - @data_uri.setter def data_uri(self, value: str): """Set the data uri.""" @@ -127,6 +120,13 @@ def data_uri(self, value: str): self._data_uri.update_data(value) self.metadata.update(self._data_uri.parameters) + @property + def data_string(self) -> str: + """Returns the data as a string, using the data format.""" + if self._data_uri: + return self._data_uri._data_str() + return "" + @property def data(self) -> bytes | ndarray: """Get the data.""" From 63147569eb420bee4cd195ab8b81c6a74a44b3c3 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Tue, 18 Feb 2025 09:08:13 +0100 Subject: [PATCH 38/50] binary content fix --- python/semantic_kernel/contents/binary_content.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/semantic_kernel/contents/binary_content.py b/python/semantic_kernel/contents/binary_content.py index 59c6ab71d6f4..aa161f78755f 100644 --- a/python/semantic_kernel/contents/binary_content.py +++ b/python/semantic_kernel/contents/binary_content.py @@ -195,6 +195,7 @@ def write_to_file(self, path: str | FilePath) -> None: self._data_uri.data_array.tofile(path) return with open(path, "wb") as file: + assert isinstance(self.data, bytes) # nosec file.write(self.data) def to_dict(self) -> dict[str, Any]: From 9d26bfae19cad1c8876149c9ea4e83d6b43190a8 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Thu, 20 Feb 2025 11:26:25 +0100 Subject: [PATCH 39/50] addressed comments --- 
.../01a-chat_with_realtime_websocket.py | 18 +-- .../realtime/01b-chat_with_realtime_webrtc.py | 23 ++-- ...2a-chat_with_function_calling_websocket.py | 12 +- .../02b-chat_with_function_calling_webrtc.py | 20 ++-- python/samples/concepts/realtime/utils.py | 37 ++++-- .../{main.py => call_automation.py} | 31 ++--- .../samples/demos/call_automation/readme.md | 16 +-- .../connectors/ai/open_ai/__init__.py | 12 +- .../ai/open_ai/services/azure_realtime.py | 101 +++------------- .../ai/open_ai/services/open_ai_realtime.py | 110 ++++++++++-------- .../realtime/open_ai_realtime_base.py | 6 +- .../realtime/open_ai_realtime_webrtc.py | 2 +- .../realtime/open_ai_realtime_websocket.py | 2 +- .../ai/open_ai/services/realtime/utils.py | 13 ++- .../connectors/ai/realtime_client_base.py | 2 +- .../{events => realtime_events}/__init__.py | 2 +- .../realtime_event.py | 0 17 files changed, 189 insertions(+), 218 deletions(-) rename python/samples/demos/call_automation/{main.py => call_automation.py} (91%) rename python/semantic_kernel/contents/{events => realtime_events}/__init__.py (85%) rename python/semantic_kernel/contents/{events => realtime_events}/realtime_event.py (100%) diff --git a/python/samples/concepts/realtime/01a-chat_with_realtime_websocket.py b/python/samples/concepts/realtime/01a-chat_with_realtime_websocket.py index 7dbfa7e06ef5..12fee34a0d07 100644 --- a/python/samples/concepts/realtime/01a-chat_with_realtime_websocket.py +++ b/python/samples/concepts/realtime/01a-chat_with_realtime_websocket.py @@ -5,12 +5,11 @@ from samples.concepts.realtime.utils import AudioPlayerWebsocket, AudioRecorderWebsocket, check_audio_devices from semantic_kernel.connectors.ai.open_ai import ( - AzureRealtime, + AzureRealtimeWebsocket, ListenEvents, OpenAIRealtimeExecutionSettings, - TurnDetection, ) -from semantic_kernel.contents.events import RealtimeAudioEvent, RealtimeTextEvent +from semantic_kernel.contents.realtime_events import RealtimeAudioEvent, RealtimeTextEvent 
logging.basicConfig(level=logging.WARNING) utils_log = logging.getLogger("samples.concepts.realtime.utils") @@ -32,7 +31,7 @@ # you can also play around with the turn_detection settings to get the best results. # It has device id's set in the AudioRecorderStream and AudioPlayerAsync classes, # so you may need to adjust these for your system. -# you can check the available devices by uncommenting line below the function +# you can disable the check for available devices by commenting the line below check_audio_devices() @@ -40,7 +39,7 @@ async def main() -> None: # create the realtime client and optionally add the audio output function, this is optional # you can define the protocol to use, either "websocket" or "webrtc" # they will behave the same way, even though the underlying protocol is quite different - realtime_client = AzureRealtime("websocket") + realtime_client = AzureRealtimeWebsocket() audio_player = AudioPlayerWebsocket() audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client) # Create the settings for the session @@ -53,10 +52,12 @@ async def main() -> None: effectively, but you tend to answer with long flowery prose. """, + # there are different voices to choose from, since that list is bound to change, it is not checked beforehand, + # see https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-voice + # for more details. 
voice="shimmer", - turn_detection=TurnDetection(create_response=True, silence_duration_ms=800, threshold=0.8), ) - # the context manager calls the create_session method on the client and start listening to the audio stream + # the context manager calls the create_session method on the client and starts listening to the audio stream async with audio_player, audio_recorder, realtime_client(settings=settings, create_response=True): async for event in realtime_client.receive(): match event: @@ -65,6 +66,7 @@ async def main() -> None: case RealtimeAudioEvent(): await audio_player.add_audio(event.audio) case RealtimeTextEvent(): + # the model returns both audio and transcript of the audio, which we will print print(event.text.text, end="") case _: # OpenAI Specific events @@ -76,7 +78,7 @@ async def main() -> None: if __name__ == "__main__": print( - "Instruction: start speaking, when you stop the API should detect you finished and start responding. " + "Instructions: Begin speaking. The API will detect when you stop and automatically generate a response. " "Press ctrl + c to stop the program." 
) asyncio.run(main()) diff --git a/python/samples/concepts/realtime/01b-chat_with_realtime_webrtc.py b/python/samples/concepts/realtime/01b-chat_with_realtime_webrtc.py index 524f3f7a2112..5af08b9923c6 100644 --- a/python/samples/concepts/realtime/01b-chat_with_realtime_webrtc.py +++ b/python/samples/concepts/realtime/01b-chat_with_realtime_webrtc.py @@ -6,18 +6,13 @@ from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC, check_audio_devices from semantic_kernel.connectors.ai.open_ai import ( ListenEvents, - OpenAIRealtime, OpenAIRealtimeExecutionSettings, - TurnDetection, + OpenAIRealtimeWebRTC, ) logging.basicConfig(level=logging.WARNING) utils_log = logging.getLogger("samples.concepts.realtime.utils") utils_log.setLevel(logging.INFO) -aiortc_log = logging.getLogger("aiortc") -aiortc_log.setLevel(logging.WARNING) -aioice_log = logging.getLogger("aioice") -aioice_log.setLevel(logging.WARNING) logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -35,7 +30,7 @@ # you can also play around with the turn_detection settings to get the best results. # It has device id's set in the AudioRecorderStream and AudioPlayerAsync classes, # so you may need to adjust these for your system. 
-# you can check the available devices by uncommenting line below the function +# you can disable the check for available devices by commenting the line below check_audio_devices() @@ -44,10 +39,9 @@ async def main() -> None: # you can define the protocol to use, either "websocket" or "webrtc" # they will behave the same way, even though the underlying protocol is quite different audio_player = AudioPlayerWebRTC() - realtime_client = OpenAIRealtime( - "webrtc", - audio_output_callback=audio_player.client_callback, + realtime_client = OpenAIRealtimeWebRTC( audio_track=AudioRecorderWebRTC(), + audio_output_callback=audio_player.client_callback, ) # Create the settings for the session settings = OpenAIRealtimeExecutionSettings( @@ -59,14 +53,17 @@ async def main() -> None: effectively, but you tend to answer with long flowery prose. """, + # there are different voices to choose from, since that list is bound to change, it is not checked beforehand, + # see https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-voice + # for more details. voice="alloy", - turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8), ) - # the context manager calls the create_session method on the client and start listening to the audio stream + # the context manager calls the create_session method on the client and starts listening to the audio stream async with audio_player, realtime_client(settings=settings, create_response=True): async for event in realtime_client.receive(): match event.event_type: case "text": + # the model returns both audio and transcript of the audio, which we will print print(event.text.text, end="") case "service": # OpenAI Specific events @@ -78,7 +75,7 @@ async def main() -> None: if __name__ == "__main__": print( - "Instruction: start speaking, when you stop the API should detect you finished and start responding. " + "Instructions: Begin speaking. 
The API will detect when you stop and automatically generate a response. " "Press ctrl + c to stop the program." ) asyncio.run(main()) diff --git a/python/samples/concepts/realtime/02a-chat_with_function_calling_websocket.py b/python/samples/concepts/realtime/02a-chat_with_function_calling_websocket.py index 63daa2d7f817..6b0a8efb9ee0 100644 --- a/python/samples/concepts/realtime/02a-chat_with_function_calling_websocket.py +++ b/python/samples/concepts/realtime/02a-chat_with_function_calling_websocket.py @@ -9,13 +9,13 @@ from semantic_kernel import Kernel from semantic_kernel.connectors.ai import FunctionChoiceBehavior from semantic_kernel.connectors.ai.open_ai import ( - AzureRealtime, + AzureRealtimeWebsocket, ListenEvents, OpenAIRealtimeExecutionSettings, TurnDetection, ) from semantic_kernel.contents import ChatHistory -from semantic_kernel.contents.events import RealtimeTextEvent +from semantic_kernel.contents.realtime_events import RealtimeTextEvent from semantic_kernel.functions import kernel_function logger = logging.getLogger(__name__) @@ -67,8 +67,7 @@ async def main() -> None: # you can define the protocol to use, either "websocket" or "webrtc" # (at this time Azure only support websockets) # they will behave the same way, even though the underlying protocol is quite different - realtime_client = AzureRealtime( - protocol="websocket", + realtime_client = AzureRealtimeWebsocket( audio_output_callback=audio_player.client_callback, ) audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client) @@ -88,6 +87,7 @@ async def main() -> None: # the "input_audio_buffer.commit" and "response.create" event to the realtime api # to signal the end of the user's turn and start the response. 
# manual VAD is not part of this sample + # for more info: https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-turn_detection settings = OpenAIRealtimeExecutionSettings( instructions=instructions, voice="alloy", @@ -99,7 +99,7 @@ async def main() -> None: chat_history.add_user_message("Hi there, who are you?") chat_history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.") - # the context manager calls the create_session method on the client and start listening to the audio stream + # the context manager calls the create_session method on the client and starts listening to the audio stream async with ( audio_player, audio_recorder, @@ -128,7 +128,7 @@ async def main() -> None: if __name__ == "__main__": print( - "Instruction: start speaking, when you stop the API should detect you finished and start responding. " + "Instructions: Begin speaking. The API will detect when you stop and automatically generate a response. " "Press ctrl + c to stop the program." 
) asyncio.run(main()) diff --git a/python/samples/concepts/realtime/02b-chat_with_function_calling_webrtc.py b/python/samples/concepts/realtime/02b-chat_with_function_calling_webrtc.py index 8c3afdfefce3..ea19458ba5dc 100644 --- a/python/samples/concepts/realtime/02b-chat_with_function_calling_webrtc.py +++ b/python/samples/concepts/realtime/02b-chat_with_function_calling_webrtc.py @@ -10,21 +10,17 @@ from semantic_kernel.connectors.ai import FunctionChoiceBehavior from semantic_kernel.connectors.ai.open_ai import ( ListenEvents, - OpenAIRealtime, OpenAIRealtimeExecutionSettings, + OpenAIRealtimeWebRTC, TurnDetection, ) from semantic_kernel.contents import ChatHistory -from semantic_kernel.contents.events import RealtimeTextEvent +from semantic_kernel.contents.realtime_events import RealtimeTextEvent from semantic_kernel.functions import kernel_function logging.basicConfig(level=logging.WARNING) utils_log = logging.getLogger("samples.concepts.realtime.utils") utils_log.setLevel(logging.INFO) -aiortc_log = logging.getLogger("aiortc") -aiortc_log.setLevel(logging.WARNING) -aioice_log = logging.getLogger("aioice") -aioice_log.setLevel(logging.WARNING) logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -42,9 +38,7 @@ # you can also play around with the turn_detection settings to get the best results. # It has device id's set in the AudioRecorderStream and AudioPlayerAsync classes, # so you may need to adjust these for your system. 
-# you can check the available devices by uncommenting line below the function - - +# you can disable the check for available devices by commenting the line below check_audio_devices() @@ -84,8 +78,7 @@ async def main() -> None: # create the realtime client and optionally add the audio output function, this is optional # you can define the protocol to use, either "websocket" or "webrtc" # they will behave the same way, even though the underlying protocol is quite different - realtime_client = OpenAIRealtime( - protocol="webrtc", + realtime_client = OpenAIRealtimeWebRTC( audio_output_callback=audio_player.client_callback, audio_track=audio_track, ) @@ -105,6 +98,7 @@ async def main() -> None: # the "input_audio_buffer.commit" and "response.create" event to the realtime api # to signal the end of the user's turn and start the response. # manual VAD is not part of this sample + # for more info: https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-turn_detection settings = OpenAIRealtimeExecutionSettings( instructions=instructions, voice="alloy", @@ -116,7 +110,7 @@ async def main() -> None: chat_history.add_user_message("Hi there, who are you?") chat_history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.") - # the context manager calls the create_session method on the client and start listening to the audio stream + # the context manager calls the create_session method on the client and starts listening to the audio stream async with ( audio_player, realtime_client( @@ -143,7 +137,7 @@ async def main() -> None: if __name__ == "__main__": print( - "Instruction: start speaking, when you stop the API should detect you finished and start responding. " + "Instructions: Begin speaking. The API will detect when you stop and automatically generate a response. " "Press ctrl + c to stop the program." 
) asyncio.run(main()) diff --git a/python/samples/concepts/realtime/utils.py b/python/samples/concepts/realtime/utils.py index 2adc87a88a20..b3056991d626 100644 --- a/python/samples/concepts/realtime/utils.py +++ b/python/samples/concepts/realtime/utils.py @@ -12,13 +12,12 @@ from aiortc.mediastreams import MediaStreamError, MediaStreamTrack from av.audio.frame import AudioFrame from av.frame import Frame -from pydantic import PrivateAttr +from pydantic import BaseModel, ConfigDict, PrivateAttr from sounddevice import InputStream, OutputStream from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase -from semantic_kernel.contents.audio_content import AudioContent -from semantic_kernel.contents.events.realtime_event import RealtimeAudioEvent -from semantic_kernel.kernel_pydantic import KernelBaseModel +from semantic_kernel.contents import AudioContent +from semantic_kernel.contents.realtime_events import RealtimeAudioEvent logger = logging.getLogger(__name__) @@ -40,8 +39,13 @@ def check_audio_devices(): # region: Recorders -class AudioRecorderWebRTC(KernelBaseModel, MediaStreamTrack): - """A simple class that implements the WebRTC MediaStreamTrack for audio from sounddevice.""" +class AudioRecorderWebRTC(BaseModel, MediaStreamTrack): + """A simple class that implements the WebRTC MediaStreamTrack for audio from sounddevice. + + This class is meant as a demo sample and is not meant for production use. + """ + + model_config = ConfigDict(populate_by_name=True, arbitrary_types_allowed=True, validate_assignment=True) kind: ClassVar[str] = "audio" device: str | int | None = None @@ -156,8 +160,13 @@ async def start_recording(self): self._is_recording = False -class AudioRecorderWebsocket(KernelBaseModel): - """A simple class that implements a sounddevice for use with websockets.""" +class AudioRecorderWebsocket(BaseModel): + """A simple class that implements a sounddevice for use with websockets. 
+ + This class is meant as a demo sample and is not meant for production use. + """ + + model_config = ConfigDict(populate_by_name=True, arbitrary_types_allowed=True, validate_assignment=True) realtime_client: RealtimeClientBase device: str | int | None = None @@ -247,9 +256,11 @@ async def __aexit__(self, exc_type, exc, tb): # region: Players -class AudioPlayerWebRTC(KernelBaseModel): +class AudioPlayerWebRTC(BaseModel): """Simple class that plays audio using sounddevice. + This class is meant as a demo sample and is not meant for production use. + Make sure the device_id is set to the correct device for your system. The sample rate, channels and frame duration @@ -265,6 +276,8 @@ class AudioPlayerWebRTC(KernelBaseModel): """ + model_config = ConfigDict(populate_by_name=True, arbitrary_types_allowed=True, validate_assignment=True) + device: int | None = None sample_rate: int = SAMPLE_RATE_WEBRTC channels: int = PLAYER_CHANNELS_WEBRTC @@ -356,9 +369,11 @@ async def add_audio(self, audio_content: AudioContent) -> None: logger.error(f"Unknown audio content: {audio_content}") -class AudioPlayerWebsocket(KernelBaseModel): +class AudioPlayerWebsocket(BaseModel): """Simple class that plays audio using sounddevice. + This class is meant as a demo sample and is not meant for production use. + Make sure the device_id is set to the correct device for your system. 
The sample rate, channels and frame duration @@ -374,6 +389,8 @@ class AudioPlayerWebsocket(KernelBaseModel): """ + model_config = ConfigDict(populate_by_name=True, arbitrary_types_allowed=True, validate_assignment=True) + device: int | None = None sample_rate: int = SAMPLE_RATE channels: int = PLAYER_CHANNELS diff --git a/python/samples/demos/call_automation/main.py b/python/samples/demos/call_automation/call_automation.py similarity index 91% rename from python/samples/demos/call_automation/main.py rename to python/samples/demos/call_automation/call_automation.py index a21caabce12c..6b69dd1168bc 100755 --- a/python/samples/demos/call_automation/main.py +++ b/python/samples/demos/call_automation/call_automation.py @@ -1,25 +1,30 @@ #!/usr/bin/env uv run # noqa: CPY001 #################################################################### # Copyright (c) Microsoft. All rights reserved. # -# Sample Quart webapp with that connects to OpenAI or Azure OpenAI # -# If you have `uv` installed and the `OPENAI_API_KEY` # -# environment variable set, you can run this example with just # +# Sample Quart webapp with that connects to Azure OpenAI # +# If you have `uv` installed and the environment variables set: # +# `ACS_CONNECTION_STRING` # +# `CALLBACK_URI_HOST` # +# `AZURE_OPENAI_ENDPOINT` # +# `AZURE_OPENAI_REALTIME_DEPLOYMENT_NAME` # +# `AZURE_OPENAI_API_VERSION` # +# `AZURE_OPENAI_API_KEY` (optionally) # +# See the readme.md for more info # +# You can run this example with just # # # -# `.python/samples/demo/call_automation/main.py` # +# `.call_automation.py` # #################################################################### # # /// script -# requires-python = ">=3.9" +# requires-python = ">=3.10" # dependencies = [ # "Quart", # "azure-eventgrid", # "azure-communication-callautomation==1.4.0b1", # "semantic-kernel[realtime]", # ] -# -# [tool.uv.sources] -# semantic-kernel = { path = "../../../", editable = true } # /// + import asyncio import base64 import os @@ -44,15 
+49,15 @@ from semantic_kernel import Kernel from semantic_kernel.connectors.ai import FunctionChoiceBehavior from semantic_kernel.connectors.ai.open_ai import ( + AzureRealtimeWebsocket, InputAudioTranscription, ListenEvents, - OpenAIRealtime, OpenAIRealtimeExecutionSettings, TurnDetection, ) from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase from semantic_kernel.contents import AudioContent -from semantic_kernel.contents.events import RealtimeAudioEvent +from semantic_kernel.contents.realtime_events import RealtimeAudioEvent from semantic_kernel.functions import kernel_function # Callback events URI to handle callback events. @@ -119,7 +124,7 @@ async def from_acs_to_realtime(client: RealtimeClientBase): ) ) except Exception: - print("Websocket connection closed.") + app.logger.info("Websocket connection closed.") break @@ -161,10 +166,10 @@ async def handle_realtime_messages(client: RealtimeClientBase): # WebSocket. @app.websocket("/ws") async def ws(): - print("Client connected to WebSocket") + app.logger.info("Client connected to WebSocket") # create the client, using the audio callback - client = OpenAIRealtime("websocket", audio_output_callback=from_realtime_to_acs) + client = AzureRealtimeWebsocket(audio_output_callback=from_realtime_to_acs) settings = OpenAIRealtimeExecutionSettings( instructions="""You are a chat bot. Your name is Mosscap and you have one goal: figure out what people need. diff --git a/python/samples/demos/call_automation/readme.md b/python/samples/demos/call_automation/readme.md index ffd8771471cd..9c366c6a66b5 100644 --- a/python/samples/demos/call_automation/readme.md +++ b/python/samples/demos/call_automation/readme.md @@ -2,6 +2,8 @@ This is a sample application. It highlights an integration of Azure Communication Services with Semantic Kernel, using the Azure OpenAI Service to enable intelligent conversational agents. 
+Original code for this sample can be found [here](https://github.com/Azure-Samples/communication-services-python-quickstarts/tree/main/callautomation-openai-sample). + ## Prerequisites - An Azure account with an active subscription. [Create an account for free](https://azure.microsoft.com/free/?WT.mc_id=A261C142F). @@ -14,7 +16,7 @@ This is a sample application. It highlights an integration of Azure Communicatio 1. Open an instance of PowerShell, Windows Terminal, Command Prompt or equivalent and navigate to the directory that you would like to clone the sample to. 2. git clone `https://github.com/microsoft/semantic-kernel.git`. -3. Navigate to `python/samples/demos/call_automation` folder and open `main.py` file. +3. Navigate to `python/samples/demos/call_automation` folder and open `call_automation.py` file. ### Setup the Python environment @@ -40,17 +42,17 @@ Copy the `.env.example` file to `.env` and update the following values: 1. `ACS_CONNECTION_STRING`: Azure Communication Service resource's connection string. 2. `CALLBACK_URI_HOST`: Base url of the app. (For local development use dev tunnel url) -1. `AZURE_OPENAI_SERVICE_ENDPOINT`: Azure Open AI service endpoint +1. `AZURE_OPENAI_ENDPOINT`: Azure Open AI service endpoint 2. `AZURE_OPENAI_DEPLOYMENT_MODEL_NAME`: Azure Open AI deployment name -3. 'AZURE_OPENAI_API_VERSION': Azure Open AI API version, this should be one that includes the realtime api, for instance '2024-10-01-preview' -4. `AZURE_OPENAI_SERVICE_KEY`: Azure Open AI service key, optionally, you can also use Entra Auth. +3. `AZURE_OPENAI_API_VERSION`: Azure Open AI API version, this should be one that includes the realtime api, for instance '2024-10-01-preview' +4. `AZURE_OPENAI_API_KEY`: Azure Open AI API key, optionally, you can also use Entra Auth. ## Run app locally 1. 
Navigate to `call_automation` folder and do one of the following to start the main application: - run `main.py` in debug from your IDE - use command `python ./main.py` to run it from PowerShell, Command Prompt or Unix Terminal. - execute `./main.py` directly (this uses `uv`, which will then install the requirements in a temporary virtual environment). + - run `call_automation.py` in debug from your IDE + - use command `python ./call_automation.py` to run it from PowerShell, Command Prompt or another Terminal. + - execute `./call_automation.py` directly in your terminal (this uses `uv`, which will then install the requirements in a temporary virtual environment, see [uv docs](https://docs.astral.sh/uv/guides/scripts) for more info). 2. Browser should pop up with the below page. If not navigate it to `http://localhost:8080/`or your dev tunnel url. 3. Register an EventGrid Webhook for the IncomingCall(`https://<devtunnelurl>/api/incomingCall`) event that points to your devtunnel URI. Instructions [here](https://learn.microsoft.com/en-us/azure/communication-services/concepts/call-automation/incoming-call-notification). 
diff --git a/python/semantic_kernel/connectors/ai/open_ai/__init__.py b/python/semantic_kernel/connectors/ai/open_ai/__init__.py index b96b72322cd4..30aa43f3bb4f 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/__init__.py +++ b/python/semantic_kernel/connectors/ai/open_ai/__init__.py @@ -35,14 +35,17 @@ ) from semantic_kernel.connectors.ai.open_ai.services.azure_audio_to_text import AzureAudioToText from semantic_kernel.connectors.ai.open_ai.services.azure_chat_completion import AzureChatCompletion -from semantic_kernel.connectors.ai.open_ai.services.azure_realtime import AzureRealtime +from semantic_kernel.connectors.ai.open_ai.services.azure_realtime import AzureRealtimeWebsocket from semantic_kernel.connectors.ai.open_ai.services.azure_text_completion import AzureTextCompletion from semantic_kernel.connectors.ai.open_ai.services.azure_text_embedding import AzureTextEmbedding from semantic_kernel.connectors.ai.open_ai.services.azure_text_to_audio import AzureTextToAudio from semantic_kernel.connectors.ai.open_ai.services.azure_text_to_image import AzureTextToImage from semantic_kernel.connectors.ai.open_ai.services.open_ai_audio_to_text import OpenAIAudioToText from semantic_kernel.connectors.ai.open_ai.services.open_ai_chat_completion import OpenAIChatCompletion -from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime import OpenAIRealtime +from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime import ( + OpenAIRealtimeWebRTC, + OpenAIRealtimeWebsocket, +) from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_completion import OpenAITextCompletion from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_embedding import OpenAITextEmbedding from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_to_audio import OpenAITextToAudio @@ -63,7 +66,7 @@ "AzureDataSourceParameters", "AzureEmbeddingDependency", "AzureOpenAISettings", - "AzureRealtime", + "AzureRealtimeWebsocket", "AzureTextCompletion", 
"AzureTextEmbedding", "AzureTextToAudio", @@ -80,8 +83,9 @@ "OpenAIChatPromptExecutionSettings", "OpenAIEmbeddingPromptExecutionSettings", "OpenAIPromptExecutionSettings", - "OpenAIRealtime", "OpenAIRealtimeExecutionSettings", + "OpenAIRealtimeWebRTC", + "OpenAIRealtimeWebsocket", "OpenAISettings", "OpenAITextCompletion", "OpenAITextEmbedding", diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py b/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py index a2e4b4fb5d81..d229f99ccfeb 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft. All rights reserved. from collections.abc import Callable, Coroutine, Mapping -from typing import TYPE_CHECKING, Any, ClassVar, Literal, TypeVar +from typing import Any from numpy import ndarray from openai import AsyncAzureOpenAI @@ -10,8 +10,6 @@ from semantic_kernel.connectors.ai.open_ai.services.azure_config_base import AzureOpenAIConfigBase from semantic_kernel.connectors.ai.open_ai.services.open_ai_model_types import OpenAIModelTypes -from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_base import OpenAIRealtimeBase -from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_webrtc import OpenAIRealtimeWebRTCBase from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_websocket import ( OpenAIRealtimeWebsocketBase, ) @@ -19,32 +17,14 @@ from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError from semantic_kernel.utils.experimental_decorator import experimental_class -if TYPE_CHECKING: - from aiortc.mediastreams import MediaStreamTrack - - -_T = TypeVar("_T", bound="AzureRealtime") - - -__all__ = ["AzureRealtime"] - @experimental_class -class AzureRealtime(OpenAIRealtimeBase): - """Azure OpenAI Realtime service.""" - - def __new__(cls: 
type["_T"], protocol: Literal["websocket", "webrtc"], *args: Any, **kwargs: Any) -> "_T": - """Pick the right subclass, based on protocol.""" - subclass_map = {subcl.protocol: subcl for subcl in cls.__subclasses__()} - subclass = subclass_map[protocol] - return super(AzureRealtime, subclass).__new__(subclass) +class AzureRealtimeWebsocket(OpenAIRealtimeWebsocketBase, AzureOpenAIConfigBase): + """Azure OpenAI Realtime service using WebSocket protocol.""" def __init__( self, - protocol: Literal["websocket", "webrtc"], - *, audio_output_callback: Callable[[ndarray], Coroutine[Any, Any, None]] | None = None, - audio_track: "MediaStreamTrack | None" = None, service_id: str | None = None, api_key: str | None = None, deployment_name: str | None = None, @@ -60,10 +40,9 @@ def __init__( env_file_encoding: str | None = None, **kwargs: Any, ) -> None: - """Initialize an OpenAIRealtime service. + """Initialize an AzureRealtimeWebsocket service. Args: - protocol: The protocol to use, must be either "websocket" or "webrtc". audio_output_callback: The audio output callback, optional. This should be a coroutine, that takes a ndarray with audio as input. The goal of this function is to allow you to play the audio with the @@ -71,29 +50,26 @@ def __init__( It is called first in both websockets and webrtc. Even when passed, the audio content will still be added to the receiving queue. - audio_track: The audio track to use for the service, only used by WebRTC. - A default is supplied if not provided. - It can be any class that implements the AudioStreamTrack interface. - service_id (str | None): The service ID for the Azure deployment. (Optional) - api_key (str | None): The optional api key. If provided, will override the value in the + service_id: The service ID for the Azure deployment. (Optional) + api_key: The optional api key. If provided, will override the value in the env vars or .env file. - deployment_name (str | None): The optional deployment. 
If provided, will override the value + deployment_name: The optional deployment. If provided, will override the value (chat_deployment_name) in the env vars or .env file. - endpoint (str | None): The optional deployment endpoint. If provided will override the value + endpoint: The optional deployment endpoint. If provided will override the value in the env vars or .env file. - base_url (str | None): The optional deployment base_url. If provided will override the value + base_url: The optional deployment base_url. If provided will override the value in the env vars or .env file. - api_version (str | None): The optional deployment api version. If provided will override the value + api_version: The optional deployment api version. If provided will override the value in the env vars or .env file. - ad_token (str | None): The Azure Active Directory token. (Optional) - ad_token_provider (AsyncAzureADTokenProvider): The Azure Active Directory token provider. (Optional) - token_endpoint (str | None): The token endpoint to request an Azure token. (Optional) - default_headers (Mapping[str, str]): The default headers mapping of string keys to + ad_token: The Azure Active Directory token. (Optional) + ad_token_provider: The Azure Active Directory token provider. (Optional) + token_endpoint: The token endpoint to request an Azure token. (Optional) + default_headers: The default headers mapping of string keys to string values for HTTP requests. (Optional) - async_client (AsyncAzureOpenAI | None): An existing client to use. (Optional) - env_file_path (str | None): Use the environment settings file as a fallback to + async_client: An existing client to use. (Optional) + env_file_path: Use the environment settings file as a fallback to environment variables. (Optional) - env_file_encoding (str | None): The encoding of the environment settings file. (Optional) + env_file_encoding: The encoding of the environment settings file. (Optional) kwargs: Additional arguments. 
""" try: @@ -111,10 +87,7 @@ def __init__( raise ServiceInitializationError("Failed to create OpenAI settings.", ex) from ex if not azure_openai_settings.realtime_deployment_name: raise ServiceInitializationError("The OpenAI realtime model ID is required.") - if audio_track: - kwargs["audio_track"] = audio_track super().__init__( - protocol=protocol, audio_output_callback=audio_output_callback, deployment_name=azure_openai_settings.realtime_deployment_name, endpoint=azure_openai_settings.endpoint, @@ -129,43 +102,3 @@ def __init__( client=async_client, **kwargs, ) - - -@experimental_class -class AzureRealtimeWebRTC(AzureRealtime, OpenAIRealtimeWebRTCBase, AzureOpenAIConfigBase): - """OpenAI Realtime service using WebRTC protocol. - - This should not be used directly, use OpenAIRealtime instead. - Set protocol="webrtc" to use this class. - """ - - protocol: ClassVar[Literal["webrtc"]] = "webrtc" - - def __init__( - self, - *args: Any, - **kwargs: Any, - ) -> None: - """Initialize an OpenAIRealtime service using WebRTC protocol.""" - raise NotImplementedError("Azure Realtime with WebRTC is not yet supported.") - - -@experimental_class -class AzureRealtimeWebsocket(AzureRealtime, OpenAIRealtimeWebsocketBase, AzureOpenAIConfigBase): - """OpenAI Realtime service using WebSocket protocol. - - This should not be used directly, use OpenAIRealtime instead. - Set protocol="websocket" to use this class. 
- """ - - protocol: ClassVar[Literal["websocket"]] = "websocket" - - def __init__( - self, - *args: Any, - **kwargs: Any, - ) -> None: - super().__init__( - *args, - **kwargs, - ) diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py index ad1d0cd96957..d0c9c485adf6 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft. All rights reserved. from collections.abc import Callable, Coroutine, Mapping -from typing import TYPE_CHECKING, Any, ClassVar, Literal, TypeVar +from typing import TYPE_CHECKING, Any from numpy import ndarray from openai import AsyncOpenAI @@ -9,7 +9,6 @@ from semantic_kernel.connectors.ai.open_ai.services.open_ai_config_base import OpenAIConfigBase from semantic_kernel.connectors.ai.open_ai.services.open_ai_model_types import OpenAIModelTypes -from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_base import OpenAIRealtimeBase from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_webrtc import OpenAIRealtimeWebRTCBase from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_websocket import ( OpenAIRealtimeWebsocketBase, @@ -22,28 +21,14 @@ from aiortc.mediastreams import MediaStreamTrack -_T = TypeVar("_T", bound="OpenAIRealtime") - - -__all__ = ["OpenAIRealtime"] - - @experimental_class -class OpenAIRealtime(OpenAIRealtimeBase): - """OpenAI Realtime service.""" - - def __new__(cls: type["_T"], protocol: Literal["websocket", "webrtc"], *args: Any, **kwargs: Any) -> "_T": - """Pick the right subclass, based on protocol.""" - subclass_map = {subcl.protocol: subcl for subcl in cls.__subclasses__()} - subclass = subclass_map[protocol] - return super(OpenAIRealtime, subclass).__new__(subclass) +class 
OpenAIRealtimeWebRTC(OpenAIRealtimeWebRTCBase, OpenAIConfigBase): + """OpenAI Realtime service using WebRTC protocol.""" def __init__( self, - protocol: Literal["websocket", "webrtc"], - *, + audio_track: "MediaStreamTrack", audio_output_callback: Callable[[ndarray], Coroutine[Any, Any, None]] | None = None, - audio_track: "MediaStreamTrack | None" = None, ai_model_id: str | None = None, api_key: str | None = None, org_id: str | None = None, @@ -98,7 +83,6 @@ def __init__( if audio_track: kwargs["audio_track"] = audio_track super().__init__( - protocol=protocol, audio_output_callback=audio_output_callback, ai_model_id=openai_settings.realtime_model_id, service_id=service_id, @@ -112,43 +96,67 @@ def __init__( @experimental_class -class OpenAIRealtimeWebRTC(OpenAIRealtime, OpenAIRealtimeWebRTCBase, OpenAIConfigBase): - """OpenAI Realtime service using WebRTC protocol. - - This should not be used directly, use OpenAIRealtime instead. - Set protocol="webrtc" to use this class. - """ - - protocol: ClassVar[Literal["webrtc"]] = "webrtc" +class OpenAIRealtimeWebsocket(OpenAIRealtimeWebsocketBase, OpenAIConfigBase): + """OpenAI Realtime service using WebSocket protocol.""" def __init__( self, - *args: Any, + audio_output_callback: Callable[[ndarray], Coroutine[Any, Any, None]] | None = None, + ai_model_id: str | None = None, + api_key: str | None = None, + org_id: str | None = None, + service_id: str | None = None, + default_headers: Mapping[str, str] | None = None, + client: AsyncOpenAI | None = None, + env_file_path: str | None = None, + env_file_encoding: str | None = None, **kwargs: Any, ) -> None: - """Initialize an OpenAIRealtime service using WebRTC protocol.""" - super().__init__( - *args, - **kwargs, - ) - - -@experimental_class -class OpenAIRealtimeWebSocket(OpenAIRealtime, OpenAIRealtimeWebsocketBase, OpenAIConfigBase): - """OpenAI Realtime service using WebSocket protocol. - - This should not be used directly, use OpenAIRealtime instead. 
- Set protocol="websocket" to use this class. - """ - - protocol: ClassVar[Literal["websocket"]] = "websocket" + """Initialize an OpenAIRealtime service. - def __init__( - self, - *args: Any, - **kwargs: Any, - ) -> None: + Args: + audio_output_callback: The audio output callback, optional. + This should be a coroutine, that takes a ndarray with audio as input. + The goal of this function is to allow you to play the audio with the + least amount of latency possible. + It is called first in both websockets and webrtc. + Even when passed, the audio content will still be + added to the receiving queue. + ai_model_id (str | None): OpenAI model name, see + https://platform.openai.com/docs/models + service_id (str | None): Service ID tied to the execution settings. + api_key (str | None): The optional API key to use. If provided will override, + the env vars or .env file value. + org_id (str | None): The optional org ID to use. If provided will override, + the env vars or .env file value. + default_headers: The default headers mapping of string keys to + string values for HTTP requests. (Optional) + client (Optional[AsyncOpenAI]): An existing client to use. (Optional) + env_file_path (str | None): Use the environment settings file as a fallback to + environment variables. (Optional) + env_file_encoding (str | None): The encoding of the environment settings file. (Optional) + kwargs: Additional arguments. 
+ """ + try: + openai_settings = OpenAISettings.create( + api_key=api_key, + org_id=org_id, + realtime_model_id=ai_model_id, + env_file_path=env_file_path, + env_file_encoding=env_file_encoding, + ) + except ValidationError as ex: + raise ServiceInitializationError("Failed to create OpenAI settings.", ex) from ex + if not openai_settings.realtime_model_id: + raise ServiceInitializationError("The OpenAI realtime model ID is required.") super().__init__( - *args, + audio_output_callback=audio_output_callback, + ai_model_id=openai_settings.realtime_model_id, + service_id=service_id, + api_key=openai_settings.api_key.get_secret_value() if openai_settings.api_key else None, + org_id=openai_settings.org_id, + ai_model_type=OpenAIModelTypes.REALTIME, + default_headers=default_headers, + client=client, **kwargs, ) diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py index 53ef76aad811..e9aafe397a8e 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py @@ -36,7 +36,9 @@ from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase from semantic_kernel.contents.chat_history import ChatHistory from semantic_kernel.contents.chat_message_content import ChatMessageContent -from semantic_kernel.contents.events.realtime_event import ( +from semantic_kernel.contents.function_call_content import FunctionCallContent +from semantic_kernel.contents.function_result_content import FunctionResultContent +from semantic_kernel.contents.realtime_events.realtime_event import ( RealtimeAudioEvent, RealtimeEvent, RealtimeEvents, @@ -44,8 +46,6 @@ RealtimeFunctionResultEvent, RealtimeTextEvent, ) -from semantic_kernel.contents.function_call_content import FunctionCallContent -from 
semantic_kernel.contents.function_result_content import FunctionResultContent from semantic_kernel.contents.streaming_text_content import StreamingTextContent from semantic_kernel.contents.text_content import TextContent from semantic_kernel.kernel import Kernel diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py index ea4dff1df601..ef389ccf8626 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py @@ -31,7 +31,7 @@ from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_base import OpenAIRealtimeBase from semantic_kernel.contents.audio_content import AudioContent -from semantic_kernel.contents.events.realtime_event import RealtimeAudioEvent, RealtimeEvents +from semantic_kernel.contents.realtime_events.realtime_event import RealtimeAudioEvent, RealtimeEvents from semantic_kernel.utils.experimental_decorator import experimental_class if TYPE_CHECKING: diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py index b31a6448a322..262b3b20d95c 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py @@ -20,7 +20,7 @@ from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_base import OpenAIRealtimeBase from semantic_kernel.contents.audio_content import AudioContent -from 
semantic_kernel.contents.events.realtime_event import RealtimeAudioEvent, RealtimeEvents +from semantic_kernel.contents.realtime_events.realtime_event import RealtimeAudioEvent, RealtimeEvents from semantic_kernel.utils.experimental_decorator import experimental_class if TYPE_CHECKING: diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/utils.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/utils.py index bb815eead6dd..cf57b6769ebc 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/utils.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/utils.py @@ -19,6 +19,7 @@ from openai.types.beta.realtime.session_update_event import Session from semantic_kernel.connectors.ai.open_ai.services.realtime.const import SendEvents +from semantic_kernel.exceptions import ContentException if TYPE_CHECKING: from semantic_kernel.connectors.ai.function_choice_behavior import ( @@ -50,7 +51,14 @@ def update_settings_from_function_call_configuration( def kernel_function_metadata_to_function_call_format( metadata: "KernelFunctionMetadata", ) -> dict[str, Any]: - """Convert the kernel function metadata to function calling format.""" + """Convert the kernel function metadata to function calling format. + + Function calling in the realtime API, uses a slightly different format than the chat completion API. + See https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-tools + for more details. + + TLDR: there is no "function" key, and the function details are at the same level as "type". 
+ """ return { "type": "function", "name": metadata.fully_qualified_name, @@ -66,6 +74,7 @@ def kernel_function_metadata_to_function_call_format( def _create_openai_realtime_client_event(event_type: SendEvents, **kwargs: Any) -> RealtimeClientEvent: + """Create an OpenAI Realtime client event from a event type and kwargs.""" match event_type: case SendEvents.SESSION_UPDATE: return SessionUpdateEvent( @@ -123,4 +132,4 @@ def _create_openai_realtime_client_event(event_type: SendEvents, **kwargs: Any) **kwargs, ) case _: - raise ValueError(f"Unknown event type: {event_type}") + raise ContentException(f"Unknown event type: {event_type}") diff --git a/python/semantic_kernel/connectors/ai/realtime_client_base.py b/python/semantic_kernel/connectors/ai/realtime_client_base.py index 7857bf1f707a..1daf6f310acd 100644 --- a/python/semantic_kernel/connectors/ai/realtime_client_base.py +++ b/python/semantic_kernel/connectors/ai/realtime_client_base.py @@ -17,7 +17,7 @@ from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceType from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings from semantic_kernel.contents.chat_history import ChatHistory -from semantic_kernel.contents.events.realtime_event import RealtimeEvents +from semantic_kernel.contents.realtime_events.realtime_event import RealtimeEvents from semantic_kernel.services.ai_service_client_base import AIServiceClientBase from semantic_kernel.utils.experimental_decorator import experimental_class diff --git a/python/semantic_kernel/contents/events/__init__.py b/python/semantic_kernel/contents/realtime_events/__init__.py similarity index 85% rename from python/semantic_kernel/contents/events/__init__.py rename to python/semantic_kernel/contents/realtime_events/__init__.py index 445371ee6cbc..e124fc49b2b3 100644 --- a/python/semantic_kernel/contents/events/__init__.py +++ b/python/semantic_kernel/contents/realtime_events/__init__.py @@ -1,6 +1,6 @@ # Copyright 
(c) Microsoft. All rights reserved. -from semantic_kernel.contents.events.realtime_event import ( +from semantic_kernel.contents.realtime_events.realtime_event import ( RealtimeAudioEvent, RealtimeEvent, RealtimeEvents, diff --git a/python/semantic_kernel/contents/events/realtime_event.py b/python/semantic_kernel/contents/realtime_events/realtime_event.py similarity index 100% rename from python/semantic_kernel/contents/events/realtime_event.py rename to python/semantic_kernel/contents/realtime_events/realtime_event.py From 7e4c88f2ad07fd2df536843b3bd974df7574a8b2 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Fri, 21 Feb 2025 11:13:21 +0100 Subject: [PATCH 40/50] moved events into a file --- .../realtime/open_ai_realtime_base.py | 2 +- .../realtime/open_ai_realtime_webrtc.py | 2 +- .../realtime/open_ai_realtime_websocket.py | 2 +- .../connectors/ai/realtime_client_base.py | 2 +- .../realtime_event.py => realtime_events.py} | 0 .../contents/realtime_events/__init__.py | 21 ------------------- 6 files changed, 4 insertions(+), 25 deletions(-) rename python/semantic_kernel/contents/{realtime_events/realtime_event.py => realtime_events.py} (100%) delete mode 100644 python/semantic_kernel/contents/realtime_events/__init__.py diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py index e9aafe397a8e..2bb3eb85d0e8 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py @@ -38,7 +38,7 @@ from semantic_kernel.contents.chat_message_content import ChatMessageContent from semantic_kernel.contents.function_call_content import FunctionCallContent from semantic_kernel.contents.function_result_content import FunctionResultContent -from semantic_kernel.contents.realtime_events.realtime_event import ( +from 
semantic_kernel.contents.realtime_events import ( RealtimeAudioEvent, RealtimeEvent, RealtimeEvents, diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py index ef389ccf8626..20ac25753e92 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py @@ -31,7 +31,7 @@ from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_base import OpenAIRealtimeBase from semantic_kernel.contents.audio_content import AudioContent -from semantic_kernel.contents.realtime_events.realtime_event import RealtimeAudioEvent, RealtimeEvents +from semantic_kernel.contents.realtime_events import RealtimeAudioEvent, RealtimeEvents from semantic_kernel.utils.experimental_decorator import experimental_class if TYPE_CHECKING: diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py index 262b3b20d95c..28d78ceac8a0 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py @@ -20,7 +20,7 @@ from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_base import OpenAIRealtimeBase from semantic_kernel.contents.audio_content import AudioContent -from semantic_kernel.contents.realtime_events.realtime_event import RealtimeAudioEvent, RealtimeEvents +from semantic_kernel.contents.realtime_events import RealtimeAudioEvent, 
RealtimeEvents from semantic_kernel.utils.experimental_decorator import experimental_class if TYPE_CHECKING: diff --git a/python/semantic_kernel/connectors/ai/realtime_client_base.py b/python/semantic_kernel/connectors/ai/realtime_client_base.py index 1daf6f310acd..2f81300f4625 100644 --- a/python/semantic_kernel/connectors/ai/realtime_client_base.py +++ b/python/semantic_kernel/connectors/ai/realtime_client_base.py @@ -17,7 +17,7 @@ from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceType from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings from semantic_kernel.contents.chat_history import ChatHistory -from semantic_kernel.contents.realtime_events.realtime_event import RealtimeEvents +from semantic_kernel.contents.realtime_events import RealtimeEvents from semantic_kernel.services.ai_service_client_base import AIServiceClientBase from semantic_kernel.utils.experimental_decorator import experimental_class diff --git a/python/semantic_kernel/contents/realtime_events/realtime_event.py b/python/semantic_kernel/contents/realtime_events.py similarity index 100% rename from python/semantic_kernel/contents/realtime_events/realtime_event.py rename to python/semantic_kernel/contents/realtime_events.py diff --git a/python/semantic_kernel/contents/realtime_events/__init__.py b/python/semantic_kernel/contents/realtime_events/__init__.py deleted file mode 100644 index e124fc49b2b3..000000000000 --- a/python/semantic_kernel/contents/realtime_events/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) Microsoft. All rights reserved. 
- -from semantic_kernel.contents.realtime_events.realtime_event import ( - RealtimeAudioEvent, - RealtimeEvent, - RealtimeEvents, - RealtimeFunctionCallEvent, - RealtimeFunctionResultEvent, - RealtimeImageEvent, - RealtimeTextEvent, -) - -__all__ = [ - "RealtimeAudioEvent", - "RealtimeEvent", - "RealtimeEvents", - "RealtimeFunctionCallEvent", - "RealtimeFunctionResultEvent", - "RealtimeImageEvent", - "RealtimeTextEvent", -] From 7903a13f03c109404969fae2d4bd96878cdc2d6c Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Fri, 21 Feb 2025 11:17:30 +0100 Subject: [PATCH 41/50] updated lock --- python/uv.lock | 70 +++++++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 32 deletions(-) diff --git a/python/uv.lock b/python/uv.lock index 91de1ef9b2f9..696e0d1cf6ef 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -1,5 +1,4 @@ version = 1 -revision = 1 requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.13' and sys_platform == 'darwin'", @@ -303,6 +302,44 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e2/27/198414c4c24e886770a03e0bed349582c40e3bfc2ec327034cc5d22c185f/autogen_agentchat-0.2.40-py3-none-any.whl", hash = "sha256:03f11ab89442a3b2408e7e46aa4a66d0be44e6f4447467efbb3ef4e35940176e", size = 382317 }, ] +[[package]] +name = "av" +version = "13.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0c/9d/486d31e76784cc0ad943f420c5e05867263b32b37e2f4b0f7f22fdc1ca3a/av-13.1.0.tar.gz", hash = "sha256:d3da736c55847d8596eb8c26c60e036f193001db3bc5c10da8665622d906c17e", size = 3957908 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/db/d6/1c4a8056a88e006681ac6a3d5ac6082f0a48e52bd565bfd350bfc7c6a37d/av-13.1.0-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:a2af44fae6d16c3a40dd1c85bda41b449be08a2c172d8f44fb63395ccf6e6fb4", size = 24260057 }, + { url = 
"https://files.pythonhosted.org/packages/23/be/cf89545117172d75a0c48066e6f368403237df623b2e3e93590fdeaef8bf/av-13.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0fea71fe06fd0dfe90a089200eb6468034797f860a321fa2d62e07d619c74749", size = 19475039 }, + { url = "https://files.pythonhosted.org/packages/4b/d0/8e261547f7763f320a4f5f68e139fea5f31814fddfe5503c8372123ebb8b/av-13.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:756997810dcca11811b598d209561cabd2071e5b472b867c295bb3e7022eecde", size = 31289005 }, + { url = "https://files.pythonhosted.org/packages/82/a3/00cacfe80ebbe0664876dd26558fb23b65d034ffd2ce0ddb12f1c746e7cb/av-13.1.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7f432102edaec4ee90087a675acf486bff0c81b47d98b85eb3218afe84575b60", size = 30705668 }, + { url = "https://files.pythonhosted.org/packages/d7/37/faa98dca1a8f6c2e3f4ad3a935037872aff49a679b76918c5258cf5a1c70/av-13.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d568c4d7a36df52c0774d52e6d730148775ead16daed81c10dafc2569b5a38d", size = 33122108 }, + { url = "https://files.pythonhosted.org/packages/25/81/c3a842477b558e23c7249f81cf723764c193636b6523267c2c02321da6b0/av-13.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:aa6f76e7c5e77bc5f99a27ada29f78c64fd4e0d42da2c4d203badc650bc0a686", size = 25775920 }, + { url = "https://files.pythonhosted.org/packages/39/54/c4227080c9700384db90072ace70d89b6a288b3748bd2ec0e32580a49e7f/av-13.1.0-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:867385e6701464a5c95903e24d2e0df1c7e0dbf211ed91d0ce639cd687373e10", size = 24255112 }, + { url = "https://files.pythonhosted.org/packages/32/4a/eb9348231655ca99b200b380f4edbceff7358c927a285badcc84b18fb1c9/av-13.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cb7a3f319401a46b0017771268ff4928501e77cf00b1a2aa0721e20b2fd1146e", size = 19467930 }, + { url = 
"https://files.pythonhosted.org/packages/14/c7/48c80252bdbc3a75a54dd205a7fab8f613914009b9e5416202757208e040/av-13.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ad904f860147bceaca65b0d3174a8153f35c570d465161d210f1879970b15559", size = 32207671 }, + { url = "https://files.pythonhosted.org/packages/f9/66/3332c7fa8c43b65680a94f279ea3e832b5500de3a1392bac6112881e984b/av-13.1.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a906e017b29d0eb80d9ccf7a98d19268122da792dbb68eb741cfebba156e6aed", size = 31520911 }, + { url = "https://files.pythonhosted.org/packages/e5/bb/2e03acb9b27591d97f700a3a6c27cfd1bc53fa148177747eda8a70cca1e9/av-13.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ce894d7847897da7be63277a0875bd93c51327134ac226c67978de014c7979f", size = 34048399 }, + { url = "https://files.pythonhosted.org/packages/85/44/527aa3b65947d42cfe829326026edf0cd1a8c459390076034be275616c36/av-13.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:384bcdb5fc3238a263a5a25cc9efc690859fa4148cc4b07e00fae927178db22a", size = 25779569 }, + { url = "https://files.pythonhosted.org/packages/9b/aa/4bdd8ce59173574fc6e0c282c71ee6f96fca82643d97bf172bc4cb5a5674/av-13.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:261dbc3f4b55f4f8f3375b10b2258fca7f2ab7a6365c01bc65e77a0d5327a195", size = 24268674 }, + { url = "https://files.pythonhosted.org/packages/17/b4/b267dd5bad99eed49ec6731827c6bcb5ab03864bf732a7ebb81e3df79911/av-13.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:83d259ef86b9054eb914bc7c6a7f6092a6d75cb939295e70ee979cfd92a67b99", size = 19475617 }, + { url = "https://files.pythonhosted.org/packages/68/32/4209e51f54d7b54a1feb576d309c671ed1ff437b54fcc4ec68c239199e0a/av-13.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3b4d3ca159eceab97e3c0fb08fe756520fb95508417f76e48198fda2a5b0806", size = 32468873 }, + { url = 
"https://files.pythonhosted.org/packages/b6/d8/c174da5f06b24f3c9e36f91fd02a7411c39da9ce792c17964260d4be675e/av-13.1.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40e8f757e373b73a2dc4640852a00cce4a4a92ef19b2e642a96d6994cd1fffbf", size = 31818484 }, + { url = "https://files.pythonhosted.org/packages/7f/22/0dd8d1d5cad415772bb707d16aea8b81cf75d340d11d3668eea43468c730/av-13.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d8aaec2c0bfd024359db3821d679009d4e637e1bee0321d20f61c54ed6b20f41", size = 34398652 }, + { url = "https://files.pythonhosted.org/packages/7b/ff/48fa68888b8d5bae36d915556ff18f9e5fdc6b5ff5ae23dc4904c9713168/av-13.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:5ea0deab0e6a739cb742fba2a3983d8102f7516a3cdf3c46669f3cac0ed1f351", size = 25781343 }, + { url = "https://files.pythonhosted.org/packages/82/6e/cdce12e534570df37d3fdcb3a74851d39e9ab79d388f3174dea9785a011a/av-13.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:47642ebaebfe20519b2391bd5b7c38b596efcd052bfd09c8d33058f94ddd0fd6", size = 24229340 }, + { url = "https://files.pythonhosted.org/packages/7c/88/5359aeada9ea509426f2db63b6531833824a1b02470667b103479ddea7ae/av-13.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2f079c2daa3ae06557b3f6e9bed4fb9c876e8012175bec645ccd007199a302db", size = 19436445 }, + { url = "https://files.pythonhosted.org/packages/b4/d4/64995e5b800476c86dae4ea1444a0eac44e2c4985fac6401b08401e2df11/av-13.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f0de8252deeeb1887637e88d4d9d18514e5cfe276bdb9e6ca8e9eef89d1667a", size = 32120549 }, + { url = "https://files.pythonhosted.org/packages/68/76/9910694cf87d2d308d851f5b2b5c5b20f7f55411f596e2c158fb13bf84a3/av-13.1.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9ad0024f4def11b0cedfeee478fa6c6fd7ed3955e13387e0f27261fdda6121b4", size = 31495305 }, + { url = 
"https://files.pythonhosted.org/packages/6a/a8/cd92de947b9595a0eb2c64e6f7ba295aac2687972050ae092173c2f6ea0c/av-13.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb88e2590eaed45233eb117f1dfab1a43ed9a997b2c46da9f08468dd00f14895", size = 34065325 }, + { url = "https://files.pythonhosted.org/packages/9d/d0/9869fcbd66422df2033d4b78a663e3c64aa6fe7eb9189c811d60f69d9871/av-13.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:c927e4fa4f6aeed4340b3e3b16b237d7cb743e5c1a55b92307407590ca4112aa", size = 25754728 }, + { url = "https://files.pythonhosted.org/packages/63/62/09859d91bc2309918d548ac4585973c53e7db27010c432d050f02206f9bd/av-13.1.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:fc5118f78ee712b2c396f345e4c51e60e61e28f1f606adbd4060c4dc44b0b652", size = 23861117 }, + { url = "https://files.pythonhosted.org/packages/c7/43/f186435a0acad3a2bdf271ce51d3af97ac3153a410e54a623529d39a1818/av-13.1.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:81bcbd3545e523e7a350613be1866b515a5ee3fafa1d9d257d7ed02531fc2636", size = 19115008 }, + { url = "https://files.pythonhosted.org/packages/31/eb/a1b4af95a615ba73dfc3cfcb9387e40826c92d7d6d383a1b68685a7ef920/av-13.1.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83b2bc641e8e16bbf058de35f1ba79ebed358ac6fe3cb5a665366294774fdb18", size = 22852637 }, + { url = "https://files.pythonhosted.org/packages/0b/a6/94a34aa672af7fef2939e4a5d6c4c6c28e33da0c623aaa9485d977eeaa95/av-13.1.0-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d956ae3e68fabdc45eb2b986c2e842a31df084d8cfc90336509f07a727a9df62", size = 22703888 }, + { url = "https://files.pythonhosted.org/packages/b9/69/08a72ceed2c8a6e689dea2ef8e941df9469cbe144a600b83d45f821477fc/av-13.1.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ef076fcdf172aafcb21ea3ef7bd68cc9151b050016a8ace13b3dae3d08a4427", size = 24657784 }, + { url = 
"https://files.pythonhosted.org/packages/b7/8c/c20894580a4341a76c7c74b59c43e26e6652b0fc60f7248f2c1bc5fdbb5e/av-13.1.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:bbf90397b7a466ff2879bd0944d55f796ad76c073fce50304315b83ad00113bd", size = 25562492 }, +] + [[package]] name = "azure-ai-inference" version = "1.0.0b9" @@ -1480,22 +1517,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/14/fb/54deefe679b7d1c1cc81d83396fcf28ad1a66d213bddeb275a8d28665918/google_crc32c-1.6.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18e311c64008f1f1379158158bb3f0c8d72635b9eb4f9545f8cf990c5668e59d", size = 27866 }, ] -[[package]] -name = "google-genai" -version = "0.4.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "google-auth", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "pillow", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "pydantic", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "requests", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "websockets", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/8f/fa/e8c81d37ffe7d8aa05573494735cdc432a97b77f641a08caa959de19523d/google_genai-0.4.0.tar.gz", hash = "sha256:d14ce2e941063092cfc98726aeabcae44f179456e3a4906ee5f28dc91b0663fb", size = 107625 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/9d/ac/cf91960fc842f8c3387be8abeaa01deb0e6b20a72a028b70107f58e13150/google_genai-0.4.0-py3-none-any.whl", hash = "sha256:2cbfea3cb47d4ac54ee3d3f9ecd79ff72298cac13e150828afdc5ed62768ed00", size = 113562 }, -] - [[package]] name = "google-generativeai" version = "0.8.4" @@ -5007,7 +5028,6 @@ dapr = [ ] google = [ { name 
= "google-cloud-aiplatform", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "google-genai", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "google-generativeai", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] hugging-face = [ @@ -5139,7 +5159,6 @@ requires-dist = [ { name = "weaviate-client", marker = "extra == 'weaviate'", specifier = ">=4.10,<5.0" }, { name = "websockets", marker = "extra == 'realtime'", specifier = ">=13,<15" }, ] -provides-extras = ["anthropic", "autogen", "aws", "azure", "chroma", "dapr", "google", "hugging-face", "milvus", "mistralai", "mongo", "notebooks", "ollama", "onnx", "pandas", "pinecone", "postgres", "qdrant", "redis", "usearch", "weaviate"] [package.metadata.requires-dev] dev = [ @@ -5393,19 +5412,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252 }, ] -[[package]] -name = "taskgroup" -version = "0.2.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "exceptiongroup", marker = "(python_full_version < '3.11' and sys_platform == 'darwin') or (python_full_version < '3.11' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform == 'win32')" }, - { name = "typing-extensions", marker = "(python_full_version < '3.11' and sys_platform == 'darwin') or (python_full_version < '3.11' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform == 'win32')" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/f0/8d/e218e0160cc1b692e6e0e5ba34e8865dbb171efeb5fc9a704544b3020605/taskgroup-0.2.2.tar.gz", hash = "sha256:078483ac3e78f2e3f973e2edbf6941374fbea81b9c5d0a96f51d297717f4752d", size = 11504 } 
-wheels = [ - { url = "https://files.pythonhosted.org/packages/d1/b1/74babcc824a57904e919f3af16d86c08b524c0691504baf038ef2d7f655c/taskgroup-0.2.2-py2.py3-none-any.whl", hash = "sha256:e2c53121609f4ae97303e9ea1524304b4de6faf9eb2c9280c7f87976479a52fb", size = 14237 }, -] - [[package]] name = "tenacity" version = "9.0.0" From f5e24ec5a8131d87704985b0bdaedef07ce40940 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Fri, 21 Feb 2025 12:10:24 +0100 Subject: [PATCH 42/50] fix typo --- .../samples/concepts/realtime/01b-chat_with_realtime_webrtc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/samples/concepts/realtime/01b-chat_with_realtime_webrtc.py b/python/samples/concepts/realtime/01b-chat_with_realtime_webrtc.py index 5af08b9923c6..c17d4518fc55 100644 --- a/python/samples/concepts/realtime/01b-chat_with_realtime_webrtc.py +++ b/python/samples/concepts/realtime/01b-chat_with_realtime_webrtc.py @@ -25,7 +25,7 @@ # - pydub # e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime] -# The characterics of your speaker and microphone are a big factor in a smooth conversation +# The characteristics of your speaker and microphone are a big factor in a smooth conversation # so you may need to try out different devices for each. # you can also play around with the turn_detection settings to get the best results. 
# It has device id's set in the AudioRecorderStream and AudioPlayerAsync classes, From 9249877612467cb1e245ea7714e2b8bc91155c28 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Mon, 24 Feb 2025 11:27:55 +0100 Subject: [PATCH 43/50] restructured realtime --- .../connectors/ai/open_ai/__init__.py | 3 +- .../ai/open_ai/services/azure_realtime.py | 4 +- .../ai/open_ai/services/open_ai_realtime.py | 864 +++++++++++++++++- .../ai/open_ai/services/realtime/__init__.py | 0 .../ai/open_ai/services/realtime/const.py | 54 -- .../realtime/open_ai_realtime_base.py | 447 --------- .../realtime/open_ai_realtime_webrtc.py | 213 ----- .../realtime/open_ai_realtime_websocket.py | 91 -- .../ai/open_ai/services/realtime/utils.py | 135 --- 9 files changed, 860 insertions(+), 951 deletions(-) delete mode 100644 python/semantic_kernel/connectors/ai/open_ai/services/realtime/__init__.py delete mode 100644 python/semantic_kernel/connectors/ai/open_ai/services/realtime/const.py delete mode 100644 python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py delete mode 100644 python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py delete mode 100644 python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py delete mode 100644 python/semantic_kernel/connectors/ai/open_ai/services/realtime/utils.py diff --git a/python/semantic_kernel/connectors/ai/open_ai/__init__.py b/python/semantic_kernel/connectors/ai/open_ai/__init__.py index 30aa43f3bb4f..919310f448ad 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/__init__.py +++ b/python/semantic_kernel/connectors/ai/open_ai/__init__.py @@ -43,14 +43,15 @@ from semantic_kernel.connectors.ai.open_ai.services.open_ai_audio_to_text import OpenAIAudioToText from semantic_kernel.connectors.ai.open_ai.services.open_ai_chat_completion import OpenAIChatCompletion from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime import ( + 
ListenEvents, OpenAIRealtimeWebRTC, OpenAIRealtimeWebsocket, + SendEvents, ) from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_completion import OpenAITextCompletion from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_embedding import OpenAITextEmbedding from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_to_audio import OpenAITextToAudio from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_to_image import OpenAITextToImage -from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents, SendEvents from semantic_kernel.connectors.ai.open_ai.settings.azure_open_ai_settings import AzureOpenAISettings from semantic_kernel.connectors.ai.open_ai.settings.open_ai_settings import OpenAISettings diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py b/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py index d229f99ccfeb..560062b95a0e 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py @@ -10,9 +10,7 @@ from semantic_kernel.connectors.ai.open_ai.services.azure_config_base import AzureOpenAIConfigBase from semantic_kernel.connectors.ai.open_ai.services.open_ai_model_types import OpenAIModelTypes -from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_websocket import ( - OpenAIRealtimeWebsocketBase, -) +from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime import OpenAIRealtimeWebsocketBase from semantic_kernel.connectors.ai.open_ai.settings.azure_open_ai_settings import AzureOpenAISettings from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError from semantic_kernel.utils.experimental_decorator import experimental_class diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py 
b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py index d0c9c485adf6..dbe10c252212 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py @@ -1,25 +1,814 @@ # Copyright (c) Microsoft. All rights reserved. -from collections.abc import Callable, Coroutine, Mapping -from typing import TYPE_CHECKING, Any +import asyncio +import base64 +import contextlib +import json +import logging +import sys +from collections.abc import AsyncGenerator, Callable, Coroutine, Mapping +from enum import Enum +from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast +if sys.version_info >= (3, 12): + from typing import override # pragma: no cover +else: + from typing_extensions import override # pragma: no cover + +import numpy as np +from aiohttp import ClientSession +from aiortc import ( + MediaStreamTrack, + RTCConfiguration, + RTCDataChannel, + RTCIceServer, + RTCPeerConnection, + RTCSessionDescription, +) +from av.audio.frame import AudioFrame from numpy import ndarray from openai import AsyncOpenAI -from pydantic import ValidationError +from openai._models import construct_type_unchecked +from openai.resources.beta.realtime.realtime import AsyncRealtimeConnection +from openai.types.beta.realtime import ( + ConversationItem, + ConversationItemCreateEvent, + ConversationItemDeleteEvent, + ConversationItemTruncateEvent, + InputAudioBufferAppendEvent, + InputAudioBufferClearEvent, + InputAudioBufferCommitEvent, + RealtimeClientEvent, + RealtimeServerEvent, + ResponseCancelEvent, + ResponseCreateEvent, + ResponseFunctionCallArgumentsDoneEvent, + Session, + SessionUpdateEvent, +) +from openai.types.beta.realtime.response_create_event import Response +from pydantic import Field, PrivateAttr, ValidationError +from semantic_kernel.connectors.ai.function_call_choice_configuration import FunctionCallChoiceConfiguration +from 
semantic_kernel.connectors.ai.function_calling_utils import ( + prepare_settings_for_function_calling, +) +from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceType +from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( + OpenAIRealtimeExecutionSettings, +) from semantic_kernel.connectors.ai.open_ai.services.open_ai_config_base import OpenAIConfigBase +from semantic_kernel.connectors.ai.open_ai.services.open_ai_handler import OpenAIHandler from semantic_kernel.connectors.ai.open_ai.services.open_ai_model_types import OpenAIModelTypes -from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_webrtc import OpenAIRealtimeWebRTCBase -from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_websocket import ( - OpenAIRealtimeWebsocketBase, -) from semantic_kernel.connectors.ai.open_ai.settings.open_ai_settings import OpenAISettings +from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings +from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase +from semantic_kernel.contents.audio_content import AudioContent +from semantic_kernel.contents.chat_history import ChatHistory +from semantic_kernel.contents.chat_message_content import ChatMessageContent +from semantic_kernel.contents.function_call_content import FunctionCallContent +from semantic_kernel.contents.function_result_content import FunctionResultContent +from semantic_kernel.contents.realtime_events import ( + RealtimeAudioEvent, + RealtimeEvent, + RealtimeEvents, + RealtimeFunctionCallEvent, + RealtimeFunctionResultEvent, + RealtimeTextEvent, +) +from semantic_kernel.contents.streaming_text_content import StreamingTextContent +from semantic_kernel.contents.text_content import TextContent +from semantic_kernel.exceptions import ContentException from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError 
+from semantic_kernel.kernel import Kernel from semantic_kernel.utils.experimental_decorator import experimental_class if TYPE_CHECKING: from aiortc.mediastreams import MediaStreamTrack + from semantic_kernel.connectors.ai.function_choice_behavior import ( + FunctionCallChoiceConfiguration, + FunctionChoiceType, + ) + from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings + from semantic_kernel.contents.chat_history import ChatHistory + from semantic_kernel.functions.kernel_function_metadata import KernelFunctionMetadata + +logger: logging.Logger = logging.getLogger(__name__) + + +@experimental_class +class SendEvents(str, Enum): + """Events that can be sent.""" + + SESSION_UPDATE = "session.update" + INPUT_AUDIO_BUFFER_APPEND = "input_audio_buffer.append" + INPUT_AUDIO_BUFFER_COMMIT = "input_audio_buffer.commit" + INPUT_AUDIO_BUFFER_CLEAR = "input_audio_buffer.clear" + CONVERSATION_ITEM_CREATE = "conversation.item.create" + CONVERSATION_ITEM_TRUNCATE = "conversation.item.truncate" + CONVERSATION_ITEM_DELETE = "conversation.item.delete" + RESPONSE_CREATE = "response.create" + RESPONSE_CANCEL = "response.cancel" + + +@experimental_class +class ListenEvents(str, Enum): + """Events that can be listened to.""" + + ERROR = "error" + SESSION_CREATED = "session.created" + SESSION_UPDATED = "session.updated" + CONVERSATION_CREATED = "conversation.created" + INPUT_AUDIO_BUFFER_COMMITTED = "input_audio_buffer.committed" + INPUT_AUDIO_BUFFER_CLEARED = "input_audio_buffer.cleared" + INPUT_AUDIO_BUFFER_SPEECH_STARTED = "input_audio_buffer.speech_started" + INPUT_AUDIO_BUFFER_SPEECH_STOPPED = "input_audio_buffer.speech_stopped" + CONVERSATION_ITEM_CREATED = "conversation.item.created" + CONVERSATION_ITEM_INPUT_AUDIO_TRANSCRIPTION_COMPLETED = "conversation.item.input_audio_transcription.completed" + CONVERSATION_ITEM_INPUT_AUDIO_TRANSCRIPTION_FAILED = "conversation.item.input_audio_transcription.failed" + CONVERSATION_ITEM_TRUNCATED = 
"conversation.item.truncated" + CONVERSATION_ITEM_DELETED = "conversation.item.deleted" + RESPONSE_CREATED = "response.created" + RESPONSE_DONE = "response.done" # contains usage info -> log + RESPONSE_OUTPUT_ITEM_ADDED = "response.output_item.added" + RESPONSE_OUTPUT_ITEM_DONE = "response.output_item.done" + RESPONSE_CONTENT_PART_ADDED = "response.content_part.added" + RESPONSE_CONTENT_PART_DONE = "response.content_part.done" + RESPONSE_TEXT_DELTA = "response.text.delta" + RESPONSE_TEXT_DONE = "response.text.done" + RESPONSE_AUDIO_TRANSCRIPT_DELTA = "response.audio_transcript.delta" + RESPONSE_AUDIO_TRANSCRIPT_DONE = "response.audio_transcript.done" + RESPONSE_AUDIO_DELTA = "response.audio.delta" + RESPONSE_AUDIO_DONE = "response.audio.done" + RESPONSE_FUNCTION_CALL_ARGUMENTS_DELTA = "response.function_call_arguments.delta" + RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE = "response.function_call_arguments.done" + RATE_LIMITS_UPDATED = "rate_limits.updated" + + +def update_settings_from_function_call_configuration( + function_choice_configuration: "FunctionCallChoiceConfiguration", + settings: "PromptExecutionSettings", + type: "FunctionChoiceType", +) -> None: + """Update the settings from a FunctionChoiceConfiguration.""" + if ( + function_choice_configuration.available_functions + and hasattr(settings, "tool_choice") + and hasattr(settings, "tools") + ): + settings.tool_choice = type + settings.tools = [ + kernel_function_metadata_to_function_call_format(f) + for f in function_choice_configuration.available_functions + ] + + +def kernel_function_metadata_to_function_call_format( + metadata: "KernelFunctionMetadata", +) -> dict[str, Any]: + """Convert the kernel function metadata to function calling format. + + Function calling in the realtime API, uses a slightly different format than the chat completion API. + See https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-tools + for more details. 
+ + TLDR: there is no "function" key, and the function details are at the same level as "type". + """ + return { + "type": "function", + "name": metadata.fully_qualified_name, + "description": metadata.description or "", + "parameters": { + "type": "object", + "properties": { + param.name: param.schema_data for param in metadata.parameters if param.include_in_function_choices + }, + "required": [p.name for p in metadata.parameters if p.is_required and p.include_in_function_choices], + }, + } + + +def _create_openai_realtime_client_event(event_type: SendEvents, **kwargs: Any) -> RealtimeClientEvent: + """Create an OpenAI Realtime client event from a event type and kwargs.""" + match event_type: + case SendEvents.SESSION_UPDATE: + return SessionUpdateEvent( + type=event_type, + session=Session.model_validate(kwargs.pop("session")), + **kwargs, + ) + case SendEvents.INPUT_AUDIO_BUFFER_APPEND: + return InputAudioBufferAppendEvent( + type=event_type, + **kwargs, + ) + case SendEvents.INPUT_AUDIO_BUFFER_COMMIT: + return InputAudioBufferCommitEvent( + type=event_type, + **kwargs, + ) + case SendEvents.INPUT_AUDIO_BUFFER_CLEAR: + return InputAudioBufferClearEvent( + type=event_type, + **kwargs, + ) + case SendEvents.CONVERSATION_ITEM_CREATE: + if "event_id" in kwargs: + event_id = kwargs.pop("event_id") + if "previous_item_id" in kwargs: + previous_item_id = kwargs.pop("previous_item_id") + event_kwargs = {"event_id": event_id} if "event_id" in kwargs else {} + event_kwargs.update({"previous_item_id": previous_item_id} if "previous_item_id" in kwargs else {}) + return ConversationItemCreateEvent( + type=event_type, + item=ConversationItem.model_validate(kwargs), + **event_kwargs, + ) + case SendEvents.CONVERSATION_ITEM_TRUNCATE: + return ConversationItemTruncateEvent( + type=event_type, + **kwargs, + ) + case SendEvents.CONVERSATION_ITEM_DELETE: + return ConversationItemDeleteEvent( + type=event_type, + **kwargs, + ) + case SendEvents.RESPONSE_CREATE: + event_kwargs = 
{"event_id": kwargs.pop("event_id")} if "event_id" in kwargs else {} + return ResponseCreateEvent( + type=event_type, + response=Response.model_validate(kwargs), + **event_kwargs, + ) + case SendEvents.RESPONSE_CANCEL: + return ResponseCancelEvent( + type=event_type, + **kwargs, + ) + case _: + raise ContentException(f"Unknown event type: {event_type}") + + +@experimental_class +class OpenAIRealtimeBase(OpenAIHandler, RealtimeClientBase): + """OpenAI Realtime service.""" + + SUPPORTS_FUNCTION_CALLING: ClassVar[bool] = True + protocol: ClassVar[Literal["websocket", "webrtc"]] = "websocket" + kernel: Kernel | None = None + + _current_settings: PromptExecutionSettings | None = PrivateAttr(default=None) + _call_id_to_function_map: dict[str, str] = PrivateAttr(default_factory=dict) + + async def _parse_event(self, event: RealtimeServerEvent) -> AsyncGenerator[RealtimeEvents, None]: + """Handle all events but audio delta. + + Audio delta has to be handled by the implementation of the protocol as some + protocols have different ways of handling audio. + + We put all event in the output buffer, but after the interpreted one. + so when dealing with them, make sure to check the type of the event, since they + might be of different types. 
+ """ + match event.type: + case ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA.value: + yield RealtimeTextEvent( + service_type=event.type, + service_event=event, + text=StreamingTextContent( + inner_content=event, + text=event.delta, # type: ignore + choice_index=0, + ), + ) + case ListenEvents.RESPONSE_OUTPUT_ITEM_ADDED.value: + if event.item.type == "function_call" and event.item.call_id and event.item.name: # type: ignore + self._call_id_to_function_map[event.item.call_id] = event.item.name # type: ignore + yield RealtimeEvent(service_type=event.type, service_event=event) + case ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DELTA.value: + yield RealtimeFunctionCallEvent( + service_type=event.type, + service_event=event, + function_call=FunctionCallContent( + id=event.item_id, # type: ignore + name=self._call_id_to_function_map[event.call_id], # type: ignore + arguments=event.delta, # type: ignore + index=event.output_index, # type: ignore + metadata={"call_id": event.call_id}, # type: ignore + inner_content=event, + ), + ) + case ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE.value: + async for parsed_event in self._parse_function_call_arguments_done(event): # type: ignore + if parsed_event: + yield parsed_event + case ListenEvents.ERROR.value: + logger.error("Error received: %s", event.error.model_dump_json()) # type: ignore + yield RealtimeEvent(service_type=event.type, service_event=event) + case ListenEvents.SESSION_CREATED.value | ListenEvents.SESSION_UPDATED.value: + logger.info("Session created or updated, session: %s", event.session.model_dump_json()) # type: ignore + yield RealtimeEvent(service_type=event.type, service_event=event) + case _: + logger.debug(f"Received event: {event}") + yield RealtimeEvent(service_type=event.type, service_event=event) + + @override + async def update_session( + self, + chat_history: ChatHistory | None = None, + settings: PromptExecutionSettings | None = None, + create_response: bool = False, + **kwargs: Any, + ) -> 
None: + """Update the session in the service. + + Args: + chat_history: Chat history. + settings: Prompt execution settings, if kernel is linked to the service or passed as + Kwargs, it will be used to update the settings for function calling. + create_response: Create a response, get the model to start responding, default is False. + kwargs: Additional arguments, if 'kernel' is passed, it will be used to update the + settings for function calling, others will be ignored. + + """ + if kwargs: + if self._create_kwargs: + kwargs = {**self._create_kwargs, **kwargs} + else: + kwargs = self._create_kwargs or {} + if settings: + self._current_settings = settings + if "kernel" in kwargs: + self.kernel = kwargs["kernel"] + + if self._current_settings: + if self.kernel: + self._current_settings = prepare_settings_for_function_calling( + self._current_settings, + self.get_prompt_execution_settings_class(), + self._update_function_choice_settings_callback(), + kernel=self.kernel, # type: ignore + ) + await self.send( + RealtimeEvent( + service_type=SendEvents.SESSION_UPDATE, + service_event={"settings": self._current_settings}, + ) + ) + + if chat_history and len(chat_history) > 0: + for msg in chat_history.messages: + for item in msg.items: + match item: + case TextContent(): + await self.send( + RealtimeTextEvent(service_type=SendEvents.CONVERSATION_ITEM_CREATE, text=item) + ) + case FunctionCallContent(): + await self.send( + RealtimeFunctionCallEvent( + service_type=SendEvents.CONVERSATION_ITEM_CREATE, function_call=item + ) + ) + case FunctionResultContent(): + await self.send( + RealtimeFunctionResultEvent( + service_type=SendEvents.CONVERSATION_ITEM_CREATE, function_result=item + ) + ) + case _: + logger.error("Unsupported item type: %s", item) + + if create_response or kwargs.get("create_response", False) is True: + await self.send(RealtimeEvent(service_type=SendEvents.RESPONSE_CREATE)) + + async def _parse_function_call_arguments_done( + self, + event: 
ResponseFunctionCallArgumentsDoneEvent, + ) -> AsyncGenerator[RealtimeEvents | None]: + """Handle response function call done. + + This always yields at least 1 event, either a RealtimeEvent or a RealtimeFunctionResultEvent with the raw event. + + It then also yields any function results both back to the service, through `send` and to the developer. + + """ + # Step 1: check if function calling enabled: + if not self.kernel or ( + self._current_settings + and self._current_settings.function_choice_behavior + and not self._current_settings.function_choice_behavior.auto_invoke_kernel_functions + ): + yield RealtimeEvent(service_type=event.type, service_event=event) + return + # Step 2: check if there is a function that can be found. + plugin_name, function_name = self._call_id_to_function_map.pop(event.call_id, "-").split("-", 1) + if not plugin_name or not function_name: + logger.error("Function call needs to have a plugin name and function name") + yield RealtimeEvent(service_type=event.type, service_event=event) + return + + # Step 3: Parse into the function call content, and yield that. 
+ item = FunctionCallContent( + id=event.item_id, + plugin_name=plugin_name, + function_name=function_name, + arguments=event.arguments, + index=event.output_index, + metadata={"call_id": event.call_id}, + ) + yield RealtimeFunctionCallEvent( + service_type=ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE, function_call=item, service_event=event + ) + + # Step 4: Invoke the function call + chat_history = ChatHistory() + await self.kernel.invoke_function_call(item, chat_history) + created_output: FunctionResultContent = chat_history.messages[-1].items[0] # type: ignore + # Step 5: Create the function result event + result = RealtimeFunctionResultEvent( + service_type=SendEvents.CONVERSATION_ITEM_CREATE, + function_result=created_output, + ) + # Step 6: send the result to the service and call `create response` + await self.send(result) + await self.send(RealtimeEvent(service_type=SendEvents.RESPONSE_CREATE)) + # Step 7: yield the function result back to the developer as well + yield result + + async def _send(self, event: RealtimeClientEvent) -> None: + """Send an event to the service.""" + raise NotImplementedError + + @override + async def send(self, event: RealtimeEvents, **kwargs: Any) -> None: + match event: + case RealtimeAudioEvent(): + await self._send( + _create_openai_realtime_client_event( + event_type=SendEvents.INPUT_AUDIO_BUFFER_APPEND, audio=event.audio.data_string + ) + ) + case RealtimeTextEvent(): + await self._send( + _create_openai_realtime_client_event( + event_type=SendEvents.CONVERSATION_ITEM_CREATE, + **dict( + type="message", + content=[ + { + "type": "input_text", + "text": event.text.text, + } + ], + role="user", + ), + ) + ) + case RealtimeFunctionCallEvent(): + await self._send( + _create_openai_realtime_client_event( + event_type=SendEvents.CONVERSATION_ITEM_CREATE, + **dict( + type="function_call", + name=event.function_call.name or event.function_call.function_name, + arguments="" + if not event.function_call.arguments + else 
event.function_call.arguments + if isinstance(event.function_call.arguments, str) + else json.dumps(event.function_call.arguments), + call_id=event.function_call.metadata.get("call_id"), + ), + ) + ) + case RealtimeFunctionResultEvent(): + await self._send( + _create_openai_realtime_client_event( + event_type=SendEvents.CONVERSATION_ITEM_CREATE, + **dict( + type="function_call_output", + output=event.function_result.result, + call_id=event.function_result.metadata.get("call_id"), + ), + ) + ) + case _: + data = event.service_event + match event.service_type: + case SendEvents.SESSION_UPDATE: + if not data: + logger.error("Event data is empty") + return + settings = data.get("settings", None) + if not settings: + logger.error("Event data does not contain 'settings'") + return + if not isinstance(settings, OpenAIRealtimeExecutionSettings): + try: + settings = self.get_prompt_execution_settings_from_settings(settings) + except Exception as e: + logger.error( + f"Failed to properly create settings from passed settings: {settings}, error: {e}" + ) + return + assert isinstance(settings, OpenAIRealtimeExecutionSettings) # nosec + if not settings.ai_model_id: + settings.ai_model_id = self.ai_model_id + await self._send( + _create_openai_realtime_client_event( + event_type=event.service_type, + session=settings.prepare_settings_dict(), + ) + ) + case SendEvents.INPUT_AUDIO_BUFFER_APPEND: + if not data or "audio" not in data: + logger.error("Event data does not contain 'audio'") + return + await self._send( + _create_openai_realtime_client_event( + event_type=event.service_type, + audio=data["audio"], + ) + ) + case SendEvents.INPUT_AUDIO_BUFFER_COMMIT: + await self._send(_create_openai_realtime_client_event(event_type=event.service_type)) + case SendEvents.INPUT_AUDIO_BUFFER_CLEAR: + await self._send(_create_openai_realtime_client_event(event_type=event.service_type)) + case SendEvents.CONVERSATION_ITEM_CREATE: + if not data or "item" not in data: + logger.error("Event data 
does not contain 'item'") + return + content = data["item"] + contents = content.items if isinstance(content, ChatMessageContent) else [content] + for item in contents: + match item: + case TextContent(): + await self._send( + _create_openai_realtime_client_event( + event_type=event.service_type, + **dict( + type="message", + content=[ + { + "type": "input_text", + "text": item.text, + } + ], + role="user", + ), + ) + ) + case FunctionCallContent(): + await self._send( + _create_openai_realtime_client_event( + event_type=event.service_type, + **dict( + type="function_call", + name=item.name or item.function_name, + arguments="" + if not item.arguments + else item.arguments + if isinstance(item.arguments, str) + else json.dumps(item.arguments), + call_id=item.metadata.get("call_id"), + ), + ) + ) + + case FunctionResultContent(): + await self._send( + _create_openai_realtime_client_event( + event_type=event.service_type, + **dict( + type="function_call_output", + output=item.result, + call_id=item.metadata.get("call_id"), + ), + ) + ) + case SendEvents.CONVERSATION_ITEM_TRUNCATE: + if not data or "item_id" not in data: + logger.error("Event data does not contain 'item_id'") + return + await self._send( + _create_openai_realtime_client_event( + event_type=event.service_type, + item_id=data["item_id"], + content_index=0, + audio_end_ms=data.get("audio_end_ms", 0), + ) + ) + case SendEvents.CONVERSATION_ITEM_DELETE: + if not data or "item_id" not in data: + logger.error("Event data does not contain 'item_id'") + return + await self._send( + _create_openai_realtime_client_event( + event_type=event.service_type, + item_id=data["item_id"], + ) + ) + case SendEvents.RESPONSE_CREATE: + await self._send( + _create_openai_realtime_client_event( + event_type=event.service_type, event_id=data.get("event_id", None) if data else None + ) + ) + case SendEvents.RESPONSE_CANCEL: + await self._send( + _create_openai_realtime_client_event( + event_type=event.service_type, + 
response_id=data.get("response_id", None) if data else None, + ) + ) + + @override + def get_prompt_execution_settings_class(self) -> type["PromptExecutionSettings"]: + from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( # noqa + OpenAIRealtimeExecutionSettings, + ) + + return OpenAIRealtimeExecutionSettings + + @override + def _update_function_choice_settings_callback( + self, + ) -> Callable[[FunctionCallChoiceConfiguration, "PromptExecutionSettings", FunctionChoiceType], None]: + return update_settings_from_function_call_configuration + + +@experimental_class +class OpenAIRealtimeWebRTCBase(OpenAIRealtimeBase): + """OpenAI WebRTC Realtime service.""" + + protocol: ClassVar[Literal["webrtc"]] = "webrtc" # type: ignore + peer_connection: RTCPeerConnection | None = None + data_channel: RTCDataChannel | None = None + audio_track: MediaStreamTrack | None = None + _receive_buffer: asyncio.Queue[RealtimeEvents] = PrivateAttr(default_factory=asyncio.Queue) + + @override + async def receive( + self, + **kwargs: Any, + ) -> AsyncGenerator[RealtimeEvents, None]: + while True: + event = await self._receive_buffer.get() + yield event + + async def _send(self, event: RealtimeClientEvent) -> None: + if not self.data_channel: + logger.error("Data channel not initialized") + return + while self.data_channel.readyState != "open": + await asyncio.sleep(0.1) + try: + self.data_channel.send(event.model_dump_json(exclude_none=True)) + except Exception as e: + logger.error(f"Failed to send event {event} with error: {e!s}") + + @override + async def create_session( + self, + chat_history: "ChatHistory | None" = None, + settings: "PromptExecutionSettings | None" = None, + **kwargs: Any, + ) -> None: + """Create a session in the service.""" + if not self.audio_track: + raise Exception("Audio track not initialized") + self.peer_connection = RTCPeerConnection( + 
configuration=RTCConfiguration(iceServers=[RTCIceServer(urls="stun:stun.l.google.com:19302")]) + ) + + # track is the audio track being returned from the service + self.peer_connection.add_listener("track", self._on_track) + + # data channel is used to send and receive messages + self.data_channel = self.peer_connection.createDataChannel("oai-events", protocol="json") + self.data_channel.add_listener("message", self._on_data) + + # this is the incoming audio, which sends audio to the service + self.peer_connection.addTransceiver(self.audio_track) + + offer = await self.peer_connection.createOffer() + await self.peer_connection.setLocalDescription(offer) + + try: + ephemeral_token = await self._get_ephemeral_token() + headers = {"Authorization": f"Bearer {ephemeral_token}", "Content-Type": "application/sdp"} + + async with ( + ClientSession() as session, + session.post( + f"{self.client.beta.realtime._client.base_url}realtime?model={self.ai_model_id}", + headers=headers, + data=offer.sdp, + ) as response, + ): + if response.status not in [200, 201]: + error_text = await response.text() + raise Exception(f"OpenAI WebRTC error: {error_text}") + + sdp_answer = await response.text() + answer = RTCSessionDescription(sdp=sdp_answer, type="answer") + await self.peer_connection.setRemoteDescription(answer) + logger.info("Connected to OpenAI WebRTC") + + except Exception as e: + logger.error(f"Failed to connect to OpenAI: {e!s}") + raise + + if settings or chat_history or kwargs: + await self.update_session(settings=settings, chat_history=chat_history, **kwargs) + + @override + async def close_session(self) -> None: + """Close the session in the service.""" + if self.peer_connection: + with contextlib.suppress(asyncio.CancelledError): + await self.peer_connection.close() + self.peer_connection = None + if self.data_channel: + with contextlib.suppress(asyncio.CancelledError): + self.data_channel.close() + self.data_channel = None + + async def _on_track(self, track: 
"MediaStreamTrack") -> None: + logger.info(f"Received {track.kind} track from remote") + if track.kind != "audio": + return + while True: + try: + # This is a MediaStreamTrack, so the type is AudioFrame + # this might need to be updated if video becomes part of this + frame: AudioFrame = await track.recv() # type: ignore + except asyncio.CancelledError: + break + except Exception as e: + logger.error(f"Error getting audio frame: {e!s}") + break + + try: + if self.audio_output_callback: + await self.audio_output_callback(frame.to_ndarray()) + + except Exception as e: + logger.error(f"Error playing remote audio frame: {e!s}") + try: + await self._receive_buffer.put( + RealtimeAudioEvent( + audio=AudioContent(data=frame.to_ndarray(), data_format="np.int16", inner_content=frame), + service_event=frame, + service_type=ListenEvents.RESPONSE_AUDIO_DELTA, + ), + ) + except Exception as e: + logger.error(f"Error processing remote audio frame: {e!s}") + await asyncio.sleep(0.01) + + async def _on_data(self, data: str) -> None: + """This method is called whenever a data channel message is received. + + The data is parsed into a RealtimeServerEvent (by OpenAI code) and then processed. + Audio data is not send through this channel, use _on_track for that. 
+ """ + try: + event = cast( + RealtimeServerEvent, + construct_type_unchecked(value=json.loads(data), type_=cast(Any, RealtimeServerEvent)), + ) + except Exception as e: + logger.error(f"Failed to parse event {data} with error: {e!s}") + return + async for parsed_event in self._parse_event(event): + await self._receive_buffer.put(parsed_event) + + async def _get_ephemeral_token(self) -> str: + """Get an ephemeral token from OpenAI.""" + headers = {"Authorization": f"Bearer {self.client.api_key}", "Content-Type": "application/json"} + data = {"model": self.ai_model_id, "voice": "echo"} + + try: + async with ( + ClientSession() as session, + session.post( + f"{self.client.beta.realtime._client.base_url}/realtime/sessions", headers=headers, json=data + ) as response, + ): + if response.status not in [200, 201]: + error_text = await response.text() + raise Exception(f"Failed to get ephemeral token: {error_text}") + + result = await response.json() + return result["client_secret"]["value"] + + except Exception as e: + logger.error(f"Failed to get ephemeral token: {e!s}") + raise + @experimental_class class OpenAIRealtimeWebRTC(OpenAIRealtimeWebRTCBase, OpenAIConfigBase): @@ -95,6 +884,67 @@ def __init__( ) +@experimental_class +class OpenAIRealtimeWebsocketBase(OpenAIRealtimeBase): + """OpenAI Realtime service.""" + + protocol: ClassVar[Literal["websocket"]] = "websocket" # type: ignore + connection: AsyncRealtimeConnection | None = None + connected: asyncio.Event = Field(default_factory=asyncio.Event) + + @override + async def receive( + self, + **kwargs: Any, + ) -> AsyncGenerator[RealtimeEvents, None]: + await self.connected.wait() + if not self.connection: + raise ValueError("Connection is not established.") + + async for event in self.connection: + if event.type == ListenEvents.RESPONSE_AUDIO_DELTA.value: + if self.audio_output_callback: + await self.audio_output_callback(np.frombuffer(base64.b64decode(event.delta), dtype=np.int16)) + yield RealtimeAudioEvent( + 
audio=AudioContent(data=event.delta, data_format="base64", inner_content=event), + service_type=event.type, + service_event=event, + ) + continue + async for realtime_event in self._parse_event(event): + yield realtime_event + + async def _send(self, event: RealtimeClientEvent) -> None: + await self.connected.wait() + if not self.connection: + raise ValueError("Connection is not established.") + try: + await self.connection.send(event) + except Exception as e: + logger.error(f"Error sending response: {e!s}") + + @override + async def create_session( + self, + chat_history: "ChatHistory | None" = None, + settings: "PromptExecutionSettings | None" = None, + **kwargs: Any, + ) -> None: + """Create a session in the service.""" + self.connection = await self.client.beta.realtime.connect(model=self.ai_model_id).enter() + self.connected.set() + if settings or chat_history or kwargs: + await self.update_session(settings=settings, chat_history=chat_history, **kwargs) + + @override + async def close_session(self) -> None: + """Close the session in the service.""" + if self.connected.is_set() and self.connection: + await self.connection.close() + self.connection = None + self.connected.clear() + + @experimental_class class OpenAIRealtimeWebsocket(OpenAIRealtimeWebsocketBase, OpenAIConfigBase): """OpenAI Realtime service using WebSocket protocol.""" diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/__init__.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/const.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/const.py deleted file mode 100644 index 533e00d24d53..000000000000 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/const.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright (c) Microsoft. All rights reserved. 
- -from enum import Enum - -from semantic_kernel.utils.experimental_decorator import experimental_class - - -@experimental_class -class SendEvents(str, Enum): - """Events that can be sent.""" - - SESSION_UPDATE = "session.update" - INPUT_AUDIO_BUFFER_APPEND = "input_audio_buffer.append" - INPUT_AUDIO_BUFFER_COMMIT = "input_audio_buffer.commit" - INPUT_AUDIO_BUFFER_CLEAR = "input_audio_buffer.clear" - CONVERSATION_ITEM_CREATE = "conversation.item.create" - CONVERSATION_ITEM_TRUNCATE = "conversation.item.truncate" - CONVERSATION_ITEM_DELETE = "conversation.item.delete" - RESPONSE_CREATE = "response.create" - RESPONSE_CANCEL = "response.cancel" - - -@experimental_class -class ListenEvents(str, Enum): - """Events that can be listened to.""" - - ERROR = "error" - SESSION_CREATED = "session.created" - SESSION_UPDATED = "session.updated" - CONVERSATION_CREATED = "conversation.created" - INPUT_AUDIO_BUFFER_COMMITTED = "input_audio_buffer.committed" - INPUT_AUDIO_BUFFER_CLEARED = "input_audio_buffer.cleared" - INPUT_AUDIO_BUFFER_SPEECH_STARTED = "input_audio_buffer.speech_started" - INPUT_AUDIO_BUFFER_SPEECH_STOPPED = "input_audio_buffer.speech_stopped" - CONVERSATION_ITEM_CREATED = "conversation.item.created" - CONVERSATION_ITEM_INPUT_AUDIO_TRANSCRIPTION_COMPLETED = "conversation.item.input_audio_transcription.completed" - CONVERSATION_ITEM_INPUT_AUDIO_TRANSCRIPTION_FAILED = "conversation.item.input_audio_transcription.failed" - CONVERSATION_ITEM_TRUNCATED = "conversation.item.truncated" - CONVERSATION_ITEM_DELETED = "conversation.item.deleted" - RESPONSE_CREATED = "response.created" - RESPONSE_DONE = "response.done" # contains usage info -> log - RESPONSE_OUTPUT_ITEM_ADDED = "response.output_item.added" - RESPONSE_OUTPUT_ITEM_DONE = "response.output_item.done" - RESPONSE_CONTENT_PART_ADDED = "response.content_part.added" - RESPONSE_CONTENT_PART_DONE = "response.content_part.done" - RESPONSE_TEXT_DELTA = "response.text.delta" - RESPONSE_TEXT_DONE = "response.text.done" - 
RESPONSE_AUDIO_TRANSCRIPT_DELTA = "response.audio_transcript.delta" - RESPONSE_AUDIO_TRANSCRIPT_DONE = "response.audio_transcript.done" - RESPONSE_AUDIO_DELTA = "response.audio.delta" - RESPONSE_AUDIO_DONE = "response.audio.done" - RESPONSE_FUNCTION_CALL_ARGUMENTS_DELTA = "response.function_call_arguments.delta" - RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE = "response.function_call_arguments.done" - RATE_LIMITS_UPDATED = "rate_limits.updated" diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py deleted file mode 100644 index 2bb3eb85d0e8..000000000000 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py +++ /dev/null @@ -1,447 +0,0 @@ -# Copyright (c) Microsoft. All rights reserved. - -import json -import logging -import sys -from collections.abc import AsyncGenerator, Callable -from typing import TYPE_CHECKING, Any, ClassVar, Literal - -if sys.version_info >= (3, 12): - from typing import override # pragma: no cover -else: - from typing_extensions import override # pragma: no cover - -from openai.types.beta.realtime import ( - RealtimeClientEvent, - RealtimeServerEvent, - ResponseFunctionCallArgumentsDoneEvent, -) -from pydantic import PrivateAttr - -from semantic_kernel.connectors.ai.function_call_choice_configuration import FunctionCallChoiceConfiguration -from semantic_kernel.connectors.ai.function_calling_utils import ( - prepare_settings_for_function_calling, -) -from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceType -from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( - OpenAIRealtimeExecutionSettings, -) -from semantic_kernel.connectors.ai.open_ai.services.open_ai_handler import OpenAIHandler -from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents, SendEvents 
-from semantic_kernel.connectors.ai.open_ai.services.realtime.utils import ( - _create_openai_realtime_client_event, - update_settings_from_function_call_configuration, -) -from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings -from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase -from semantic_kernel.contents.chat_history import ChatHistory -from semantic_kernel.contents.chat_message_content import ChatMessageContent -from semantic_kernel.contents.function_call_content import FunctionCallContent -from semantic_kernel.contents.function_result_content import FunctionResultContent -from semantic_kernel.contents.realtime_events import ( - RealtimeAudioEvent, - RealtimeEvent, - RealtimeEvents, - RealtimeFunctionCallEvent, - RealtimeFunctionResultEvent, - RealtimeTextEvent, -) -from semantic_kernel.contents.streaming_text_content import StreamingTextContent -from semantic_kernel.contents.text_content import TextContent -from semantic_kernel.kernel import Kernel -from semantic_kernel.utils.experimental_decorator import experimental_class - -if TYPE_CHECKING: - from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings - from semantic_kernel.contents.chat_history import ChatHistory - - -logger: logging.Logger = logging.getLogger(__name__) - - -@experimental_class -class OpenAIRealtimeBase(OpenAIHandler, RealtimeClientBase): - """OpenAI Realtime service.""" - - SUPPORTS_FUNCTION_CALLING: ClassVar[bool] = True - protocol: ClassVar[Literal["websocket", "webrtc"]] = "websocket" - kernel: Kernel | None = None - - _current_settings: PromptExecutionSettings | None = PrivateAttr(default=None) - _call_id_to_function_map: dict[str, str] = PrivateAttr(default_factory=dict) - - async def _parse_event(self, event: RealtimeServerEvent) -> AsyncGenerator[RealtimeEvents, None]: - """Handle all events but audio delta. 
- - Audio delta has to be handled by the implementation of the protocol as some - protocols have different ways of handling audio. - - We put all event in the output buffer, but after the interpreted one. - so when dealing with them, make sure to check the type of the event, since they - might be of different types. - """ - match event.type: - case ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA.value: - yield RealtimeTextEvent( - service_type=event.type, - service_event=event, - text=StreamingTextContent( - inner_content=event, - text=event.delta, # type: ignore - choice_index=0, - ), - ) - case ListenEvents.RESPONSE_OUTPUT_ITEM_ADDED.value: - if event.item.type == "function_call" and event.item.call_id and event.item.name: # type: ignore - self._call_id_to_function_map[event.item.call_id] = event.item.name # type: ignore - yield RealtimeEvent(service_type=event.type, service_event=event) - case ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DELTA.value: - yield RealtimeFunctionCallEvent( - service_type=event.type, - service_event=event, - function_call=FunctionCallContent( - id=event.item_id, # type: ignore - name=self._call_id_to_function_map[event.call_id], # type: ignore - arguments=event.delta, # type: ignore - index=event.output_index, # type: ignore - metadata={"call_id": event.call_id}, # type: ignore - inner_content=event, - ), - ) - case ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE.value: - async for parsed_event in self._parse_function_call_arguments_done(event): # type: ignore - if parsed_event: - yield parsed_event - case ListenEvents.ERROR.value: - logger.error("Error received: %s", event.error.model_dump_json()) # type: ignore - yield RealtimeEvent(service_type=event.type, service_event=event) - case ListenEvents.SESSION_CREATED.value | ListenEvents.SESSION_UPDATED.value: - logger.info("Session created or updated, session: %s", event.session.model_dump_json()) # type: ignore - yield RealtimeEvent(service_type=event.type, service_event=event) - 
case _: - logger.debug(f"Received event: {event}") - yield RealtimeEvent(service_type=event.type, service_event=event) - - @override - async def update_session( - self, - chat_history: ChatHistory | None = None, - settings: PromptExecutionSettings | None = None, - create_response: bool = False, - **kwargs: Any, - ) -> None: - """Update the session in the service. - - Args: - chat_history: Chat history. - settings: Prompt execution settings, if kernel is linked to the service or passed as - Kwargs, it will be used to update the settings for function calling. - create_response: Create a response, get the model to start responding, default is False. - kwargs: Additional arguments, if 'kernel' is passed, it will be used to update the - settings for function calling, others will be ignored. - - """ - if kwargs: - if self._create_kwargs: - kwargs = {**self._create_kwargs, **kwargs} - else: - kwargs = self._create_kwargs or {} - if settings: - self._current_settings = settings - if "kernel" in kwargs: - self.kernel = kwargs["kernel"] - - if self._current_settings: - if self.kernel: - self._current_settings = prepare_settings_for_function_calling( - self._current_settings, - self.get_prompt_execution_settings_class(), - self._update_function_choice_settings_callback(), - kernel=self.kernel, # type: ignore - ) - await self.send( - RealtimeEvent( - service_type=SendEvents.SESSION_UPDATE, - service_event={"settings": self._current_settings}, - ) - ) - - if chat_history and len(chat_history) > 0: - for msg in chat_history.messages: - for item in msg.items: - match item: - case TextContent(): - await self.send( - RealtimeTextEvent(service_type=SendEvents.CONVERSATION_ITEM_CREATE, text=item) - ) - case FunctionCallContent(): - await self.send( - RealtimeFunctionCallEvent( - service_type=SendEvents.CONVERSATION_ITEM_CREATE, function_call=item - ) - ) - case FunctionResultContent(): - await self.send( - RealtimeFunctionResultEvent( - 
service_type=SendEvents.CONVERSATION_ITEM_CREATE, function_result=item - ) - ) - case _: - logger.error("Unsupported item type: %s", item) - - if create_response or kwargs.get("create_response", False) is True: - await self.send(RealtimeEvent(service_type=SendEvents.RESPONSE_CREATE)) - - async def _parse_function_call_arguments_done( - self, - event: ResponseFunctionCallArgumentsDoneEvent, - ) -> AsyncGenerator[RealtimeEvents | None]: - """Handle response function call done. - - This always yields at least 1 event, either a RealtimeEvent or a RealtimeFunctionResultEvent with the raw event. - - It then also yields any function results both back to the service, through `send` and to the developer. - - """ - # Step 1: check if function calling enabled: - if not self.kernel or ( - self._current_settings - and self._current_settings.function_choice_behavior - and not self._current_settings.function_choice_behavior.auto_invoke_kernel_functions - ): - yield RealtimeEvent(service_type=event.type, service_event=event) - return - # Step 2: check if there is a function that can be found. - plugin_name, function_name = self._call_id_to_function_map.pop(event.call_id, "-").split("-", 1) - if not plugin_name or not function_name: - logger.error("Function call needs to have a plugin name and function name") - yield RealtimeEvent(service_type=event.type, service_event=event) - return - - # Step 3: Parse into the function call content, and yield that. 
- item = FunctionCallContent( - id=event.item_id, - plugin_name=plugin_name, - function_name=function_name, - arguments=event.arguments, - index=event.output_index, - metadata={"call_id": event.call_id}, - ) - yield RealtimeFunctionCallEvent( - service_type=ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE, function_call=item, service_event=event - ) - - # Step 4: Invoke the function call - chat_history = ChatHistory() - await self.kernel.invoke_function_call(item, chat_history) - created_output: FunctionResultContent = chat_history.messages[-1].items[0] # type: ignore - # Step 5: Create the function result event - result = RealtimeFunctionResultEvent( - service_type=SendEvents.CONVERSATION_ITEM_CREATE, - function_result=created_output, - ) - # Step 6: send the result to the service and call `create response` - await self.send(result) - await self.send(RealtimeEvent(service_type=SendEvents.RESPONSE_CREATE)) - # Step 7: yield the function result back to the developer as well - yield result - - async def _send(self, event: RealtimeClientEvent) -> None: - """Send an event to the service.""" - raise NotImplementedError - - @override - async def send(self, event: RealtimeEvents, **kwargs: Any) -> None: - match event: - case RealtimeAudioEvent(): - await self._send( - _create_openai_realtime_client_event( - event_type=SendEvents.INPUT_AUDIO_BUFFER_APPEND, audio=event.audio.data_string - ) - ) - case RealtimeTextEvent(): - await self._send( - _create_openai_realtime_client_event( - event_type=SendEvents.CONVERSATION_ITEM_CREATE, - **dict( - type="message", - content=[ - { - "type": "input_text", - "text": event.text.text, - } - ], - role="user", - ), - ) - ) - case RealtimeFunctionCallEvent(): - await self._send( - _create_openai_realtime_client_event( - event_type=SendEvents.CONVERSATION_ITEM_CREATE, - **dict( - type="function_call", - name=event.function_call.name or event.function_call.function_name, - arguments="" - if not event.function_call.arguments - else 
event.function_call.arguments - if isinstance(event.function_call.arguments, str) - else json.dumps(event.function_call.arguments), - call_id=event.function_call.metadata.get("call_id"), - ), - ) - ) - case RealtimeFunctionResultEvent(): - await self._send( - _create_openai_realtime_client_event( - event_type=SendEvents.CONVERSATION_ITEM_CREATE, - **dict( - type="function_call_output", - output=event.function_result.result, - call_id=event.function_result.metadata.get("call_id"), - ), - ) - ) - case _: - data = event.service_event - match event.service_type: - case SendEvents.SESSION_UPDATE: - if not data: - logger.error("Event data is empty") - return - settings = data.get("settings", None) - if not settings: - logger.error("Event data does not contain 'settings'") - return - if not isinstance(settings, OpenAIRealtimeExecutionSettings): - try: - settings = self.get_prompt_execution_settings_from_settings(settings) - except Exception as e: - logger.error( - f"Failed to properly create settings from passed settings: {settings}, error: {e}" - ) - return - assert isinstance(settings, OpenAIRealtimeExecutionSettings) # nosec - if not settings.ai_model_id: - settings.ai_model_id = self.ai_model_id - await self._send( - _create_openai_realtime_client_event( - event_type=event.service_type, - session=settings.prepare_settings_dict(), - ) - ) - case SendEvents.INPUT_AUDIO_BUFFER_APPEND: - if not data or "audio" not in data: - logger.error("Event data does not contain 'audio'") - return - await self._send( - _create_openai_realtime_client_event( - event_type=event.service_type, - audio=data["audio"], - ) - ) - case SendEvents.INPUT_AUDIO_BUFFER_COMMIT: - await self._send(_create_openai_realtime_client_event(event_type=event.service_type)) - case SendEvents.INPUT_AUDIO_BUFFER_CLEAR: - await self._send(_create_openai_realtime_client_event(event_type=event.service_type)) - case SendEvents.CONVERSATION_ITEM_CREATE: - if not data or "item" not in data: - logger.error("Event data 
does not contain 'item'") - return - content = data["item"] - contents = content.items if isinstance(content, ChatMessageContent) else [content] - for item in contents: - match item: - case TextContent(): - await self._send( - _create_openai_realtime_client_event( - event_type=event.service_type, - **dict( - type="message", - content=[ - { - "type": "input_text", - "text": item.text, - } - ], - role="user", - ), - ) - ) - case FunctionCallContent(): - await self._send( - _create_openai_realtime_client_event( - event_type=event.service_type, - **dict( - type="function_call", - name=item.name or item.function_name, - arguments="" - if not item.arguments - else item.arguments - if isinstance(item.arguments, str) - else json.dumps(item.arguments), - call_id=item.metadata.get("call_id"), - ), - ) - ) - - case FunctionResultContent(): - await self._send( - _create_openai_realtime_client_event( - event_type=event.service_type, - **dict( - type="function_call_output", - output=item.result, - call_id=item.metadata.get("call_id"), - ), - ) - ) - case SendEvents.CONVERSATION_ITEM_TRUNCATE: - if not data or "item_id" not in data: - logger.error("Event data does not contain 'item_id'") - return - await self._send( - _create_openai_realtime_client_event( - event_type=event.service_type, - item_id=data["item_id"], - content_index=0, - audio_end_ms=data.get("audio_end_ms", 0), - ) - ) - case SendEvents.CONVERSATION_ITEM_DELETE: - if not data or "item_id" not in data: - logger.error("Event data does not contain 'item_id'") - return - await self._send( - _create_openai_realtime_client_event( - event_type=event.service_type, - item_id=data["item_id"], - ) - ) - case SendEvents.RESPONSE_CREATE: - await self._send( - _create_openai_realtime_client_event( - event_type=event.service_type, event_id=data.get("event_id", None) if data else None - ) - ) - case SendEvents.RESPONSE_CANCEL: - await self._send( - _create_openai_realtime_client_event( - event_type=event.service_type, - 
response_id=data.get("response_id", None) if data else None, - ) - ) - - @override - def get_prompt_execution_settings_class(self) -> type["PromptExecutionSettings"]: - from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( # noqa - OpenAIRealtimeExecutionSettings, - ) - - return OpenAIRealtimeExecutionSettings - - @override - def _update_function_choice_settings_callback( - self, - ) -> Callable[[FunctionCallChoiceConfiguration, "PromptExecutionSettings", FunctionChoiceType], None]: - return update_settings_from_function_call_configuration diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py deleted file mode 100644 index 20ac25753e92..000000000000 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py +++ /dev/null @@ -1,213 +0,0 @@ -# Copyright (c) Microsoft. All rights reserved. 
- -import asyncio -import contextlib -import json -import logging -import sys -from collections.abc import AsyncGenerator -from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast - -if sys.version_info >= (3, 12): - from typing import override # pragma: no cover -else: - from typing_extensions import override # pragma: no cover - -from aiohttp import ClientSession -from aiortc import ( - MediaStreamTrack, - RTCConfiguration, - RTCDataChannel, - RTCIceServer, - RTCPeerConnection, - RTCSessionDescription, -) -from av.audio.frame import AudioFrame -from openai._models import construct_type_unchecked -from openai.types.beta.realtime.realtime_client_event import RealtimeClientEvent -from openai.types.beta.realtime.realtime_server_event import RealtimeServerEvent -from pydantic import PrivateAttr - -from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents -from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_base import OpenAIRealtimeBase -from semantic_kernel.contents.audio_content import AudioContent -from semantic_kernel.contents.realtime_events import RealtimeAudioEvent, RealtimeEvents -from semantic_kernel.utils.experimental_decorator import experimental_class - -if TYPE_CHECKING: - from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings - from semantic_kernel.contents.chat_history import ChatHistory - - -logger: logging.Logger = logging.getLogger(__name__) - - -@experimental_class -class OpenAIRealtimeWebRTCBase(OpenAIRealtimeBase): - """OpenAI WebRTC Realtime service.""" - - protocol: ClassVar[Literal["webrtc"]] = "webrtc" # type: ignore - peer_connection: RTCPeerConnection | None = None - data_channel: RTCDataChannel | None = None - audio_track: MediaStreamTrack | None = None - _receive_buffer: asyncio.Queue[RealtimeEvents] = PrivateAttr(default_factory=asyncio.Queue) - - @override - async def receive( - self, - **kwargs: Any, - ) -> AsyncGenerator[RealtimeEvents, 
None]: - while True: - event = await self._receive_buffer.get() - yield event - - async def _send(self, event: RealtimeClientEvent) -> None: - if not self.data_channel: - logger.error("Data channel not initialized") - return - while self.data_channel.readyState != "open": - await asyncio.sleep(0.1) - try: - self.data_channel.send(event.model_dump_json(exclude_none=True)) - except Exception as e: - logger.error(f"Failed to send event {event} with error: {e!s}") - - @override - async def create_session( - self, - chat_history: "ChatHistory | None" = None, - settings: "PromptExecutionSettings | None" = None, - **kwargs: Any, - ) -> None: - """Create a session in the service.""" - if not self.audio_track: - raise Exception("Audio track not initialized") - self.peer_connection = RTCPeerConnection( - configuration=RTCConfiguration(iceServers=[RTCIceServer(urls="stun:stun.l.google.com:19302")]) - ) - - # track is the audio track being returned from the service - self.peer_connection.add_listener("track", self._on_track) - - # data channel is used to send and receive messages - self.data_channel = self.peer_connection.createDataChannel("oai-events", protocol="json") - self.data_channel.add_listener("message", self._on_data) - - # this is the incoming audio, which sends audio to the service - self.peer_connection.addTransceiver(self.audio_track) - - offer = await self.peer_connection.createOffer() - await self.peer_connection.setLocalDescription(offer) - - try: - ephemeral_token = await self._get_ephemeral_token() - headers = {"Authorization": f"Bearer {ephemeral_token}", "Content-Type": "application/sdp"} - - async with ( - ClientSession() as session, - session.post( - f"{self.client.beta.realtime._client.base_url}realtime?model={self.ai_model_id}", - headers=headers, - data=offer.sdp, - ) as response, - ): - if response.status not in [200, 201]: - error_text = await response.text() - raise Exception(f"OpenAI WebRTC error: {error_text}") - - sdp_answer = await 
response.text() - answer = RTCSessionDescription(sdp=sdp_answer, type="answer") - await self.peer_connection.setRemoteDescription(answer) - logger.info("Connected to OpenAI WebRTC") - - except Exception as e: - logger.error(f"Failed to connect to OpenAI: {e!s}") - raise - - if settings or chat_history or kwargs: - await self.update_session(settings=settings, chat_history=chat_history, **kwargs) - - @override - async def close_session(self) -> None: - """Close the session in the service.""" - if self.peer_connection: - with contextlib.suppress(asyncio.CancelledError): - await self.peer_connection.close() - self.peer_connection = None - if self.data_channel: - with contextlib.suppress(asyncio.CancelledError): - self.data_channel.close() - self.data_channel = None - - async def _on_track(self, track: "MediaStreamTrack") -> None: - logger.info(f"Received {track.kind} track from remote") - if track.kind != "audio": - return - while True: - try: - # This is a MediaStreamTrack, so the type is AudioFrame - # this might need to be updated if video becomes part of this - frame: AudioFrame = await track.recv() # type: ignore - except asyncio.CancelledError: - break - except Exception as e: - logger.error(f"Error getting audio frame: {e!s}") - break - - try: - if self.audio_output_callback: - await self.audio_output_callback(frame.to_ndarray()) - - except Exception as e: - logger.error(f"Error playing remote audio frame: {e!s}") - try: - await self._receive_buffer.put( - RealtimeAudioEvent( - audio=AudioContent(data=frame.to_ndarray(), data_format="np.int16", inner_content=frame), - service_event=frame, - service_type=ListenEvents.RESPONSE_AUDIO_DELTA, - ), - ) - except Exception as e: - logger.error(f"Error processing remote audio frame: {e!s}") - await asyncio.sleep(0.01) - - async def _on_data(self, data: str) -> None: - """This method is called whenever a data channel message is received. 
- - The data is parsed into a RealtimeServerEvent (by OpenAI code) and then processed. - Audio data is not send through this channel, use _on_track for that. - """ - try: - event = cast( - RealtimeServerEvent, - construct_type_unchecked(value=json.loads(data), type_=cast(Any, RealtimeServerEvent)), - ) - except Exception as e: - logger.error(f"Failed to parse event {data} with error: {e!s}") - return - async for parsed_event in self._parse_event(event): - await self._receive_buffer.put(parsed_event) - - async def _get_ephemeral_token(self) -> str: - """Get an ephemeral token from OpenAI.""" - headers = {"Authorization": f"Bearer {self.client.api_key}", "Content-Type": "application/json"} - data = {"model": self.ai_model_id, "voice": "echo"} - - try: - async with ( - ClientSession() as session, - session.post( - f"{self.client.beta.realtime._client.base_url}/realtime/sessions", headers=headers, json=data - ) as response, - ): - if response.status not in [200, 201]: - error_text = await response.text() - raise Exception(f"Failed to get ephemeral token: {error_text}") - - result = await response.json() - return result["client_secret"]["value"] - - except Exception as e: - logger.error(f"Failed to get ephemeral token: {e!s}") - raise diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py deleted file mode 100644 index 28d78ceac8a0..000000000000 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (c) Microsoft. All rights reserved. 
- -import asyncio -import base64 -import logging -import sys -from collections.abc import AsyncGenerator -from typing import TYPE_CHECKING, Any, ClassVar, Literal - -if sys.version_info >= (3, 12): - from typing import override # pragma: no cover -else: - from typing_extensions import override # pragma: no cover - -import numpy as np -from openai.resources.beta.realtime.realtime import AsyncRealtimeConnection -from openai.types.beta.realtime.realtime_client_event import RealtimeClientEvent -from pydantic import Field - -from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents -from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_base import OpenAIRealtimeBase -from semantic_kernel.contents.audio_content import AudioContent -from semantic_kernel.contents.realtime_events import RealtimeAudioEvent, RealtimeEvents -from semantic_kernel.utils.experimental_decorator import experimental_class - -if TYPE_CHECKING: - from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings - from semantic_kernel.contents.chat_history import ChatHistory - -logger: logging.Logger = logging.getLogger(__name__) - - -@experimental_class -class OpenAIRealtimeWebsocketBase(OpenAIRealtimeBase): - """OpenAI Realtime service.""" - - protocol: ClassVar[Literal["websocket"]] = "websocket" # type: ignore - connection: AsyncRealtimeConnection | None = None - connected: asyncio.Event = Field(default_factory=asyncio.Event) - - @override - async def receive( - self, - **kwargs: Any, - ) -> AsyncGenerator[RealtimeEvents, None]: - await self.connected.wait() - if not self.connection: - raise ValueError("Connection is not established.") - - async for event in self.connection: - if event.type == ListenEvents.RESPONSE_AUDIO_DELTA.value: - if self.audio_output_callback: - await self.audio_output_callback(np.frombuffer(base64.b64decode(event.delta), dtype=np.int16)) - yield RealtimeAudioEvent( - 
audio=AudioContent(data=event.delta, data_format="base64", inner_content=event), - service_type=event.type, - service_event=event, - ) - continue - async for realtime_event in self._parse_event(event): - yield realtime_event - - async def _send(self, event: RealtimeClientEvent) -> None: - await self.connected.wait() - if not self.connection: - raise ValueError("Connection is not established.") - try: - await self.connection.send(event) - except Exception as e: - logger.error(f"Error sending response: {e!s}") - - @override - async def create_session( - self, - chat_history: "ChatHistory | None" = None, - settings: "PromptExecutionSettings | None" = None, - **kwargs: Any, - ) -> None: - """Create a session in the service.""" - self.connection = await self.client.beta.realtime.connect(model=self.ai_model_id).enter() - self.connected.set() - if settings or chat_history or kwargs: - await self.update_session(settings=settings, chat_history=chat_history, **kwargs) - - @override - async def close_session(self) -> None: - """Close the session in the service.""" - if self.connected.is_set() and self.connection: - await self.connection.close() - self.connection = None - self.connected.clear() diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/utils.py b/python/semantic_kernel/connectors/ai/open_ai/services/realtime/utils.py deleted file mode 100644 index cf57b6769ebc..000000000000 --- a/python/semantic_kernel/connectors/ai/open_ai/services/realtime/utils.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) Microsoft. All rights reserved. 
- -from typing import TYPE_CHECKING, Any - -from openai.types.beta.realtime import ( - ConversationItem, - ConversationItemCreateEvent, - ConversationItemDeleteEvent, - ConversationItemTruncateEvent, - InputAudioBufferAppendEvent, - InputAudioBufferClearEvent, - InputAudioBufferCommitEvent, - RealtimeClientEvent, - ResponseCancelEvent, - ResponseCreateEvent, - SessionUpdateEvent, -) -from openai.types.beta.realtime.response_create_event import Response -from openai.types.beta.realtime.session_update_event import Session - -from semantic_kernel.connectors.ai.open_ai.services.realtime.const import SendEvents -from semantic_kernel.exceptions import ContentException - -if TYPE_CHECKING: - from semantic_kernel.connectors.ai.function_choice_behavior import ( - FunctionCallChoiceConfiguration, - FunctionChoiceType, - ) - from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings - from semantic_kernel.functions.kernel_function_metadata import KernelFunctionMetadata - - -def update_settings_from_function_call_configuration( - function_choice_configuration: "FunctionCallChoiceConfiguration", - settings: "PromptExecutionSettings", - type: "FunctionChoiceType", -) -> None: - """Update the settings from a FunctionChoiceConfiguration.""" - if ( - function_choice_configuration.available_functions - and hasattr(settings, "tool_choice") - and hasattr(settings, "tools") - ): - settings.tool_choice = type - settings.tools = [ - kernel_function_metadata_to_function_call_format(f) - for f in function_choice_configuration.available_functions - ] - - -def kernel_function_metadata_to_function_call_format( - metadata: "KernelFunctionMetadata", -) -> dict[str, Any]: - """Convert the kernel function metadata to function calling format. - - Function calling in the realtime API, uses a slightly different format than the chat completion API. 
- See https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-tools - for more details. - - TLDR: there is no "function" key, and the function details are at the same level as "type". - """ - return { - "type": "function", - "name": metadata.fully_qualified_name, - "description": metadata.description or "", - "parameters": { - "type": "object", - "properties": { - param.name: param.schema_data for param in metadata.parameters if param.include_in_function_choices - }, - "required": [p.name for p in metadata.parameters if p.is_required and p.include_in_function_choices], - }, - } - - -def _create_openai_realtime_client_event(event_type: SendEvents, **kwargs: Any) -> RealtimeClientEvent: - """Create an OpenAI Realtime client event from a event type and kwargs.""" - match event_type: - case SendEvents.SESSION_UPDATE: - return SessionUpdateEvent( - type=event_type, - session=Session.model_validate(kwargs.pop("session")), - **kwargs, - ) - case SendEvents.INPUT_AUDIO_BUFFER_APPEND: - return InputAudioBufferAppendEvent( - type=event_type, - **kwargs, - ) - case SendEvents.INPUT_AUDIO_BUFFER_COMMIT: - return InputAudioBufferCommitEvent( - type=event_type, - **kwargs, - ) - case SendEvents.INPUT_AUDIO_BUFFER_CLEAR: - return InputAudioBufferClearEvent( - type=event_type, - **kwargs, - ) - case SendEvents.CONVERSATION_ITEM_CREATE: - if "event_id" in kwargs: - event_id = kwargs.pop("event_id") - if "previous_item_id" in kwargs: - previous_item_id = kwargs.pop("previous_item_id") - event_kwargs = {"event_id": event_id} if "event_id" in kwargs else {} - event_kwargs.update({"previous_item_id": previous_item_id} if "previous_item_id" in kwargs else {}) - return ConversationItemCreateEvent( - type=event_type, - item=ConversationItem.model_validate(kwargs), - **event_kwargs, - ) - case SendEvents.CONVERSATION_ITEM_TRUNCATE: - return ConversationItemTruncateEvent( - type=event_type, - **kwargs, - ) - case SendEvents.CONVERSATION_ITEM_DELETE: 
- return ConversationItemDeleteEvent( - type=event_type, - **kwargs, - ) - case SendEvents.RESPONSE_CREATE: - event_kwargs = {"event_id": kwargs.pop("event_id")} if "event_id" in kwargs else {} - return ResponseCreateEvent( - type=event_type, - response=Response.model_validate(kwargs), - **event_kwargs, - ) - case SendEvents.RESPONSE_CANCEL: - return ResponseCancelEvent( - type=event_type, - **kwargs, - ) - case _: - raise ContentException(f"Unknown event type: {event_type}") From aed73803061b350fd0c9df848ed0b1aaa2ce505c Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Tue, 25 Feb 2025 10:28:39 +0100 Subject: [PATCH 44/50] first set of tests --- .../ai/open_ai/services/open_ai_realtime.py | 35 ++- python/tests/conftest.py | 2 + .../open_ai/services/test_openai_realtime.py | 219 ++++++++++++++++++ 3 files changed, 245 insertions(+), 11 deletions(-) create mode 100644 python/tests/unit/connectors/ai/open_ai/services/test_openai_realtime.py diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py index dbe10c252212..4048aaf98ae2 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py @@ -43,7 +43,6 @@ ResponseCancelEvent, ResponseCreateEvent, ResponseFunctionCallArgumentsDoneEvent, - Session, SessionUpdateEvent, ) from openai.types.beta.realtime.response_create_event import Response @@ -193,12 +192,16 @@ def _create_openai_realtime_client_event(event_type: SendEvents, **kwargs: Any) """Create an OpenAI Realtime client event from a event type and kwargs.""" match event_type: case SendEvents.SESSION_UPDATE: + if "session" not in kwargs: + raise ContentException("Session is required for SessionUpdateEvent") return SessionUpdateEvent( type=event_type, - session=Session.model_validate(kwargs.pop("session")), + session=kwargs.pop("session"), **kwargs, ) 
case SendEvents.INPUT_AUDIO_BUFFER_APPEND: + if "audio" not in kwargs: + raise ContentException("Audio is required for InputAudioBufferAppendEvent") return InputAudioBufferAppendEvent( type=event_type, **kwargs, @@ -214,33 +217,43 @@ def _create_openai_realtime_client_event(event_type: SendEvents, **kwargs: Any) **kwargs, ) case SendEvents.CONVERSATION_ITEM_CREATE: - if "event_id" in kwargs: - event_id = kwargs.pop("event_id") - if "previous_item_id" in kwargs: - previous_item_id = kwargs.pop("previous_item_id") - event_kwargs = {"event_id": event_id} if "event_id" in kwargs else {} - event_kwargs.update({"previous_item_id": previous_item_id} if "previous_item_id" in kwargs else {}) + if "item" not in kwargs: + raise ContentException("Item is required for ConversationItemCreateEvent") + event_kwargs = {} + event_id = kwargs.pop("event_id", None) + if event_id: + event_kwargs["event_id"] = event_id + previous_item_id = kwargs.pop("previous_item_id", None) + if previous_item_id: + event_kwargs["previous_item_id"] = previous_item_id return ConversationItemCreateEvent( type=event_type, item=ConversationItem.model_validate(kwargs), **event_kwargs, ) case SendEvents.CONVERSATION_ITEM_TRUNCATE: + if "content_index" not in kwargs: + kwargs["content_index"] = 0 return ConversationItemTruncateEvent( type=event_type, **kwargs, ) case SendEvents.CONVERSATION_ITEM_DELETE: + if "item_id" not in kwargs: + raise ContentException("Item ID is required for ConversationItemDeleteEvent") return ConversationItemDeleteEvent( type=event_type, **kwargs, ) case SendEvents.RESPONSE_CREATE: - event_kwargs = {"event_id": kwargs.pop("event_id")} if "event_id" in kwargs else {} + if "response" in kwargs: + response: Response | None = Response.model_validate(kwargs.pop("response")) + else: + response = None return ResponseCreateEvent( type=event_type, - response=Response.model_validate(kwargs), - **event_kwargs, + response=response, + **kwargs, ) case SendEvents.RESPONSE_CANCEL: return 
ResponseCancelEvent( diff --git a/python/tests/conftest.py b/python/tests/conftest.py index 6cef400a8cf1..60bb1bda97da 100644 --- a/python/tests/conftest.py +++ b/python/tests/conftest.py @@ -220,6 +220,7 @@ def azure_openai_unit_test_env(monkeypatch, exclude_list, override_env_param_dic "AZURE_OPENAI_TEXT_TO_IMAGE_DEPLOYMENT_NAME": "test_text_to_image_deployment", "AZURE_OPENAI_AUDIO_TO_TEXT_DEPLOYMENT_NAME": "test_audio_to_text_deployment", "AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME": "test_text_to_audio_deployment", + "AZURE_OPENAI_REALTIME_DEPLOYMENT_NAME": "test_realtime_deployment", "AZURE_OPENAI_API_KEY": "test_api_key", "AZURE_OPENAI_ENDPOINT": "https://test-endpoint.com", "AZURE_OPENAI_API_VERSION": "2023-03-15-preview", @@ -256,6 +257,7 @@ def openai_unit_test_env(monkeypatch, exclude_list, override_env_param_dict): "OPENAI_TEXT_TO_IMAGE_MODEL_ID": "test_text_to_image_model_id", "OPENAI_AUDIO_TO_TEXT_MODEL_ID": "test_audio_to_text_model_id", "OPENAI_TEXT_TO_AUDIO_MODEL_ID": "test_text_to_audio_model_id", + "OPENAI_REALTIME_MODEL_ID": "test_realtime_model_id", } env_vars.update(override_env_param_dict) diff --git a/python/tests/unit/connectors/ai/open_ai/services/test_openai_realtime.py b/python/tests/unit/connectors/ai/open_ai/services/test_openai_realtime.py new file mode 100644 index 000000000000..c074b0b3fd65 --- /dev/null +++ b/python/tests/unit/connectors/ai/open_ai/services/test_openai_realtime.py @@ -0,0 +1,219 @@ +# Copyright (c) Microsoft. All rights reserved. 
+ +import asyncio +from typing import Any + +from aiortc import AudioStreamTrack +from openai.types.beta.realtime import ( + ConversationItem, + ConversationItemCreatedEvent, + ConversationItemCreateEvent, + ConversationItemDeletedEvent, + ConversationItemDeleteEvent, + ConversationItemTruncatedEvent, + ConversationItemTruncateEvent, + InputAudioBufferAppendEvent, + InputAudioBufferClearEvent, + InputAudioBufferCommitEvent, + ResponseAudioDeltaEvent, + ResponseAudioDoneEvent, + ResponseAudioTranscriptDeltaEvent, + ResponseCancelEvent, + ResponseCreateEvent, + ResponseFunctionCallArgumentsDoneEvent, + Session, + SessionCreatedEvent, + SessionUpdatedEvent, + SessionUpdateEvent, +) +from pytest import fixture, mark, param, raises + +from semantic_kernel.connectors.ai.function_call_choice_configuration import FunctionCallChoiceConfiguration +from semantic_kernel.connectors.ai.function_choice_type import FunctionChoiceType +from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( + OpenAIRealtimeExecutionSettings, +) +from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime import ( + ListenEvents, + OpenAIRealtimeWebRTC, + OpenAIRealtimeWebsocket, + SendEvents, + _create_openai_realtime_client_event, + update_settings_from_function_call_configuration, +) +from semantic_kernel.exceptions.content_exceptions import ContentException +from semantic_kernel.functions.kernel_function_metadata import KernelFunctionMetadata + + +@fixture +async def websocket_stream(): + await asyncio.sleep(0) + yield SessionCreatedEvent(type=ListenEvents.SESSION_CREATED, session=Session(session_id="session_id"), event_id="1") + yield SessionUpdatedEvent(type=ListenEvents.SESSION_UPDATED, session=Session(session_id="session_id"), event_id="2") + yield ConversationItemCreatedEvent( + type=ListenEvents.CONVERSATION_ITEM_CREATED, + item=ConversationItem(id="item_id"), + event_id="3", + previous_item_id="2", + ) + yield 
ConversationItemDeletedEvent(type=ListenEvents.CONVERSATION_ITEM_DELETED, item_id="item_id", event_id="4") + yield ConversationItemTruncatedEvent(type=ListenEvents.CONVERSATION_ITEM_TRUNCATED, event_id="5") + yield InputAudioBufferClearEvent(type=ListenEvents.INPUT_AUDIO_BUFFER_CLEARED, event_id="7") + yield InputAudioBufferCommitEvent(type=ListenEvents.INPUT_AUDIO_BUFFER_COMMITTED, event_id="8") + yield ResponseCancelEvent(type=ListenEvents.RESPONSE_CANCELLED, event_id="9") + yield ResponseCreateEvent(type=ListenEvents.RESPONSE_CREATED, event_id="10") + yield ResponseFunctionCallArgumentsDoneEvent(type=ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE, event_id="11") + yield ResponseAudioTranscriptDeltaEvent(type=ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA, event_id="12") + yield ResponseAudioDoneEvent(type=ListenEvents.RESPONSE_AUDIO_DONE, event_id="13") + yield ResponseAudioDeltaEvent(type=ListenEvents.RESPONSE_AUDIO_DELTA, event_id="14") + + +@fixture +def audio_track(): + class AudioTrack(AudioStreamTrack): + kind = "audio" + + async def recv(self): + await asyncio.sleep(0) + return + + return AudioTrack() + + +def test_update_settings_from_function_call_config(): + config = FunctionCallChoiceConfiguration( + available_functions=[ + KernelFunctionMetadata(name="function_name", description="function_description", is_prompt=False) + ] + ) + + settings = OpenAIRealtimeExecutionSettings() + + update_settings_from_function_call_configuration(config, settings, FunctionChoiceType.AUTO) + + assert len(settings.tools) == 1 + assert settings.tools[0]["type"] == "function" + assert settings.tools[0]["name"] == "function_name" + assert settings.tools[0]["description"] == "function_description" + assert settings.tool_choice == FunctionChoiceType.AUTO.value + + +def test_openai_realtime_websocket(openai_unit_test_env): + realtime_client = OpenAIRealtimeWebsocket() + assert realtime_client is not None + + +def test_openai_realtime_webrtc(openai_unit_test_env, audio_track): 
+ realtime_client = OpenAIRealtimeWebRTC(audio_track=audio_track) + assert realtime_client is not None + + +@mark.parametrize( + ["event_type", "event_kwargs", "expected_event", "expected_exception"], + [ + param( + SendEvents.SESSION_UPDATE, + {"session": {"id": "session_id"}}, + SessionUpdateEvent, + None, + id="session_update", + ), + param( + SendEvents.SESSION_UPDATE, + {}, + SessionUpdateEvent, + ContentException, + id="session_update_missing", + ), + param( + SendEvents.INPUT_AUDIO_BUFFER_APPEND, + {"audio": "audio_buffer_as_string"}, + InputAudioBufferAppendEvent, + None, + id="input_audio_buffer_append", + ), + param( + SendEvents.INPUT_AUDIO_BUFFER_APPEND, + {}, + InputAudioBufferAppendEvent, + ContentException, + id="input_audio_buffer_append_missing_audio", + ), + param( + SendEvents.INPUT_AUDIO_BUFFER_COMMIT, + {}, + InputAudioBufferCommitEvent, + None, + id="input_audio_buffer_commit", + ), + param( + SendEvents.INPUT_AUDIO_BUFFER_CLEAR, + {}, + InputAudioBufferClearEvent, + None, + id="input_audio_buffer_Clear", + ), + param( + SendEvents.CONVERSATION_ITEM_CREATE, + { + "event_id": "event_id", + "previous_item_id": "previous_item_id", + "item": {"id": "item_id"}, + }, + ConversationItemCreateEvent, + None, + id="conversation_item_create_event", + ), + param( + SendEvents.CONVERSATION_ITEM_CREATE, + {}, + ConversationItemCreateEvent, + ContentException, + id="conversation_item_create_event_no_item", + ), + param( + SendEvents.CONVERSATION_ITEM_TRUNCATE, + {"audio_end_ms": 1000, "item_id": "item_id"}, + ConversationItemTruncateEvent, + None, + id="conversation_item_truncate", + ), + param( + SendEvents.CONVERSATION_ITEM_DELETE, + {"item_id": "item_id"}, + ConversationItemDeleteEvent, + None, + id="conversation_item_delete", + ), + param( + SendEvents.CONVERSATION_ITEM_DELETE, + {}, + ConversationItemDeleteEvent, + ContentException, + id="conversation_item_delete_fail", + ), + param( + SendEvents.RESPONSE_CREATE, + {"response": {"instructions": 
"instructions"}}, + ResponseCreateEvent, + None, + id="response_create", + ), + param( + SendEvents.RESPONSE_CANCEL, + {}, + ResponseCancelEvent, + None, + id="response_cancel", + ), + ], +) +def test_create_openai_realtime_event( + event_type: SendEvents, event_kwargs: dict[str, Any], expected_event: Any, expected_exception: Exception | None +): + if expected_exception: + with raises(expected_exception): + _create_openai_realtime_client_event(event_type, **event_kwargs) + else: + event = _create_openai_realtime_client_event(event_type, **event_kwargs) + assert isinstance(event, expected_event) From d39869569ae1565f04e3021e46c1de9dce344d98 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Tue, 25 Feb 2025 15:19:35 +0100 Subject: [PATCH 45/50] added more tests --- .../ai/open_ai/services/open_ai_realtime.py | 99 ++++--- python/semantic_kernel/kernel.py | 2 +- .../open_ai/services/test_openai_realtime.py | 261 ++++++++++++++++++ 3 files changed, 311 insertions(+), 51 deletions(-) diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py index 4048aaf98ae2..dcc7cc840c3a 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py @@ -31,7 +31,6 @@ from openai._models import construct_type_unchecked from openai.resources.beta.realtime.realtime import AsyncRealtimeConnection from openai.types.beta.realtime import ( - ConversationItem, ConversationItemCreateEvent, ConversationItemDeleteEvent, ConversationItemTruncateEvent, @@ -95,6 +94,8 @@ logger: logging.Logger = logging.getLogger(__name__) +# region constants + @experimental_class class SendEvents(str, Enum): @@ -145,6 +146,9 @@ class ListenEvents(str, Enum): RATE_LIMITS_UPDATED = "rate_limits.updated" +# region utils + + def update_settings_from_function_call_configuration( function_choice_configuration: 
"FunctionCallChoiceConfiguration", settings: "PromptExecutionSettings", @@ -219,18 +223,8 @@ def _create_openai_realtime_client_event(event_type: SendEvents, **kwargs: Any) case SendEvents.CONVERSATION_ITEM_CREATE: if "item" not in kwargs: raise ContentException("Item is required for ConversationItemCreateEvent") - event_kwargs = {} - event_id = kwargs.pop("event_id", None) - if event_id: - event_kwargs["event_id"] = event_id - previous_item_id = kwargs.pop("previous_item_id", None) - if previous_item_id: - event_kwargs["previous_item_id"] = previous_item_id - return ConversationItemCreateEvent( - type=event_type, - item=ConversationItem.model_validate(kwargs), - **event_kwargs, - ) + kwargs["type"] = event_type + return ConversationItemCreateEvent(**kwargs) case SendEvents.CONVERSATION_ITEM_TRUNCATE: if "content_index" not in kwargs: kwargs["content_index"] = 0 @@ -260,8 +254,9 @@ def _create_openai_realtime_client_event(event_type: SendEvents, **kwargs: Any) type=event_type, **kwargs, ) - case _: - raise ContentException(f"Unknown event type: {event_type}") + + +# region Base @experimental_class @@ -269,7 +264,6 @@ class OpenAIRealtimeBase(OpenAIHandler, RealtimeClientBase): """OpenAI Realtime service.""" SUPPORTS_FUNCTION_CALLING: ClassVar[bool] = True - protocol: ClassVar[Literal["websocket", "webrtc"]] = "websocket" kernel: Kernel | None = None _current_settings: PromptExecutionSettings | None = PrivateAttr(default=None) @@ -417,8 +411,9 @@ async def _parse_function_call_arguments_done( yield RealtimeEvent(service_type=event.type, service_event=event) return # Step 2: check if there is a function that can be found. 
- plugin_name, function_name = self._call_id_to_function_map.pop(event.call_id, "-").split("-", 1) - if not plugin_name or not function_name: + try: + plugin_name, function_name = self._call_id_to_function_map.pop(event.call_id, "-").split("-", 1) + except ValueError: logger.error("Function call needs to have a plugin name and function name") yield RealtimeEvent(service_type=event.type, service_event=event) return @@ -468,43 +463,43 @@ async def send(self, event: RealtimeEvents, **kwargs: Any) -> None: await self._send( _create_openai_realtime_client_event( event_type=SendEvents.CONVERSATION_ITEM_CREATE, - **dict( - type="message", - content=[ + item={ + "type": "message", + "content": [ { "type": "input_text", "text": event.text.text, } ], - role="user", - ), + "role": "user", + }, ) ) case RealtimeFunctionCallEvent(): await self._send( _create_openai_realtime_client_event( event_type=SendEvents.CONVERSATION_ITEM_CREATE, - **dict( - type="function_call", - name=event.function_call.name or event.function_call.function_name, - arguments="" + item={ + "type": "function_call", + "name": event.function_call.name or event.function_call.function_name, + "arguments": "" if not event.function_call.arguments else event.function_call.arguments if isinstance(event.function_call.arguments, str) else json.dumps(event.function_call.arguments), - call_id=event.function_call.metadata.get("call_id"), - ), + "call_id": event.function_call.metadata.get("call_id"), + }, ) ) case RealtimeFunctionResultEvent(): await self._send( _create_openai_realtime_client_event( event_type=SendEvents.CONVERSATION_ITEM_CREATE, - **dict( - type="function_call_output", - output=event.function_result.result, - call_id=event.function_result.metadata.get("call_id"), - ), + item={ + "type": "function_call_output", + "output": event.function_result.result, + "call_id": event.function_result.metadata.get("call_id"), + }, ) ) case _: @@ -561,32 +556,32 @@ async def send(self, event: RealtimeEvents, **kwargs: 
Any) -> None: await self._send( _create_openai_realtime_client_event( event_type=event.service_type, - **dict( - type="message", - content=[ + item={ + "type": "message", + "content": [ { "type": "input_text", "text": item.text, } ], - role="user", - ), + "role": "user", + }, ) ) case FunctionCallContent(): await self._send( _create_openai_realtime_client_event( event_type=event.service_type, - **dict( - type="function_call", - name=item.name or item.function_name, - arguments="" + item={ + "type": "function_call", + "name": item.name or item.function_name, + "arguments": "" if not item.arguments else item.arguments if isinstance(item.arguments, str) else json.dumps(item.arguments), - call_id=item.metadata.get("call_id"), - ), + "call_id": item.metadata.get("call_id"), + }, ) ) @@ -594,11 +589,11 @@ async def send(self, event: RealtimeEvents, **kwargs: Any) -> None: await self._send( _create_openai_realtime_client_event( event_type=event.service_type, - **dict( - type="function_call_output", - output=item.result, - call_id=item.metadata.get("call_id"), - ), + item={ + "type": "function_call_output", + "output": item.result, + "call_id": item.metadata.get("call_id"), + }, ) ) case SendEvents.CONVERSATION_ITEM_TRUNCATE: @@ -652,6 +647,7 @@ def _update_function_choice_settings_callback( return update_settings_from_function_call_configuration +# region WebRTC @experimental_class class OpenAIRealtimeWebRTCBase(OpenAIRealtimeBase): """OpenAI WebRTC Realtime service.""" @@ -897,6 +893,9 @@ def __init__( ) +# region Websocket + + @experimental_class class OpenAIRealtimeWebsocketBase(OpenAIRealtimeBase): """OpenAI Realtime service.""" diff --git a/python/semantic_kernel/kernel.py b/python/semantic_kernel/kernel.py index 03e3a48d75f5..ad71ffccfedb 100644 --- a/python/semantic_kernel/kernel.py +++ b/python/semantic_kernel/kernel.py @@ -321,7 +321,7 @@ async def invoke_function_call( function_call_count: int | None = None, request_index: int | None = None, is_streaming: bool = 
False, - function_behavior: "FunctionChoiceBehavior" = None, # type: ignore + function_behavior: "FunctionChoiceBehavior | None" = None, ) -> "AutoFunctionInvocationContext | None": """Processes the provided FunctionCallContent and updates the chat history.""" args_cloned = copy(arguments) if arguments else KernelArguments() diff --git a/python/tests/unit/connectors/ai/open_ai/services/test_openai_realtime.py b/python/tests/unit/connectors/ai/open_ai/services/test_openai_realtime.py index c074b0b3fd65..65dbd6f5f7fe 100644 --- a/python/tests/unit/connectors/ai/open_ai/services/test_openai_realtime.py +++ b/python/tests/unit/connectors/ai/open_ai/services/test_openai_realtime.py @@ -2,6 +2,7 @@ import asyncio from typing import Any +from unittest.mock import patch from aiortc import AudioStreamTrack from openai.types.beta.realtime import ( @@ -12,15 +13,20 @@ ConversationItemDeleteEvent, ConversationItemTruncatedEvent, ConversationItemTruncateEvent, + ErrorEvent, InputAudioBufferAppendEvent, InputAudioBufferClearEvent, InputAudioBufferCommitEvent, + InputAudioBufferSpeechStartedEvent, + RealtimeServerEvent, ResponseAudioDeltaEvent, ResponseAudioDoneEvent, ResponseAudioTranscriptDeltaEvent, ResponseCancelEvent, ResponseCreateEvent, + ResponseFunctionCallArgumentsDeltaEvent, ResponseFunctionCallArgumentsDoneEvent, + ResponseOutputItemAddedEvent, Session, SessionCreatedEvent, SessionUpdatedEvent, @@ -29,6 +35,7 @@ from pytest import fixture, mark, param, raises from semantic_kernel.connectors.ai.function_call_choice_configuration import FunctionCallChoiceConfiguration +from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceBehavior from semantic_kernel.connectors.ai.function_choice_type import FunctionChoiceType from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( OpenAIRealtimeExecutionSettings, @@ -41,7 +48,22 @@ _create_openai_realtime_client_event, 
update_settings_from_function_call_configuration, ) +from semantic_kernel.contents.audio_content import AudioContent +from semantic_kernel.contents.chat_history import ChatHistory +from semantic_kernel.contents.chat_message_content import ChatMessageContent +from semantic_kernel.contents.function_call_content import FunctionCallContent +from semantic_kernel.contents.function_result_content import FunctionResultContent +from semantic_kernel.contents.image_content import ImageContent +from semantic_kernel.contents.realtime_events import ( + RealtimeAudioEvent, + RealtimeEvent, + RealtimeFunctionCallEvent, + RealtimeFunctionResultEvent, + RealtimeTextEvent, +) +from semantic_kernel.contents.text_content import TextContent from semantic_kernel.exceptions.content_exceptions import ContentException +from semantic_kernel.functions import kernel_function from semantic_kernel.functions.kernel_function_metadata import KernelFunctionMetadata @@ -80,6 +102,13 @@ async def recv(self): return AudioTrack() +@fixture +def OpenAIWebsocket(openai_unit_test_env): + client = OpenAIRealtimeWebsocket() + client._call_id_to_function_map["call_id"] = "function_name" + return client + + def test_update_settings_from_function_call_config(): config = FunctionCallChoiceConfiguration( available_functions=[ @@ -217,3 +246,235 @@ def test_create_openai_realtime_event( else: event = _create_openai_realtime_client_event(event_type, **event_kwargs) assert isinstance(event, expected_event) + + +@mark.parametrize( + ["event", "expected_type"], + [ + param( + ResponseAudioTranscriptDeltaEvent( + content_index=0, + delta="text", + item_id="item_id", + event_id="event_id", + output_index=0, + response_id="response_id", + type="response.audio_transcript.delta", + ), + [RealtimeTextEvent], + id="response_audio_transcript_delta", + ), + param( + ResponseOutputItemAddedEvent( + item=ConversationItem(id="item_id"), + event_id="event_id", + output_index=0, + response_id="response_id", + 
type="response.output_item.added", + ), + [RealtimeEvent], + id="response_output_item_added", + ), + param( + ResponseOutputItemAddedEvent( + item=ConversationItem(id="item_id", type="function_call", call_id="call_id", name="function_to_call"), + event_id="event_id", + output_index=0, + response_id="response_id", + type="response.output_item.added", + ), + [RealtimeEvent], + id="response_output_item_added_function_call", + ), + param( + ResponseFunctionCallArgumentsDeltaEvent( + call_id="call_id", + delta="argument delta", + event_id="event_id", + output_index=0, + item_id="item_id", + response_id="response_id", + type="response.function_call_arguments.delta", + ), + [RealtimeFunctionCallEvent], + id="response_function_call_arguments_delta", + ), + param( + ResponseFunctionCallArgumentsDoneEvent( + call_id="call_id", + arguments="argument delta", + event_id="event_id", + output_index=0, + item_id="item_id", + response_id="response_id", + type="response.function_call_arguments.done", + ), + [RealtimeEvent], + id="response_function_call_arguments_done_no_kernel", + ), + param( + ErrorEvent( + error={"code": "error_code", "message": "error_message", "type": "invalid_request_error"}, + event_id="event_id", + type="error", + ), + [RealtimeEvent], + id="error", + ), + param( + SessionCreatedEvent( + session=Session(id="session_id"), + event_id="event_id", + type="session.created", + ), + [RealtimeEvent], + id="session_created", + ), + param( + SessionUpdatedEvent( + session=Session(id="session_id"), + event_id="event_id", + type="session.updated", + ), + [RealtimeEvent], + id="session_updated", + ), + param( + InputAudioBufferSpeechStartedEvent( + audio_start_ms=0, + event_id="event_id", + item_id="item_id", + type="input_audio_buffer.speech_started", + ), + [RealtimeEvent], + id="other", + ), + ], +) +async def test_parse_event(OpenAIWebsocket, event: RealtimeServerEvent, expected_type: list[type]): + iter = 0 + async for result in OpenAIWebsocket._parse_event(event): + 
assert isinstance(result, expected_type[iter]) + iter += 1 + + +async def test_update_session(OpenAIWebsocket, kernel): + chat_history = ChatHistory( + messages=[ + ChatMessageContent(role="user", content="Hello"), + ChatMessageContent( + role="assistant", + items=[ + FunctionCallContent( + function_name="function_name", plugin_name="plugin", arguments={"arg1": "value"}, id="1" + ) + ], + ), + ChatMessageContent( + role="tool", + items=[ + FunctionResultContent(function_name="function_name", plugin_name="plugin", result="result", id="1") + ], + ), + ChatMessageContent( + role="user", + items=[ + TextContent(text="Hello again"), + ImageContent(uri="https://example.com/image.png"), + ], + ), + ] + ) + settings = OpenAIRealtimeExecutionSettings(instructions="instructions", ai_model_id="gpt-4o-realtime-preview") + with patch.object(OpenAIWebsocket, "_send") as mock_send: + await OpenAIWebsocket.update_session( + chat_history=chat_history, settings=settings, create_response=True, kernel=kernel + ) + mock_send.assert_awaited() + # session update, 4 conversation item create events, response create + # images are not supported, so ignored + assert len(mock_send.await_args_list) == 6 + assert OpenAIWebsocket._current_settings == settings + assert OpenAIWebsocket.kernel == kernel + + +async def test_parse_function_call_arguments_done(OpenAIWebsocket, kernel): + func_result = "result" + event = ResponseFunctionCallArgumentsDoneEvent( + call_id="call_id", + arguments='{"x": "' + func_result + '"}', + event_id="event_id", + output_index=0, + item_id="item_id", + response_id="response_id", + type="response.function_call_arguments.done", + ) + response_events = [RealtimeFunctionCallEvent, RealtimeFunctionResultEvent] + OpenAIWebsocket._current_settings = OpenAIRealtimeExecutionSettings( + instructions="instructions", ai_model_id="gpt-4o-realtime-preview" + ) + OpenAIWebsocket._current_settings.function_choice_behavior = FunctionChoiceBehavior.Auto() + 
OpenAIWebsocket._call_id_to_function_map["call_id"] = "plugin_name-function_name" + func = kernel_function(name="function_name", description="function_description")(lambda x: x) + kernel.add_function(plugin_name="plugin_name", function_name="function_name", function=func) + OpenAIWebsocket.kernel = kernel + iter = 0 + with patch.object(OpenAIWebsocket, "_send") as mock_send: + async for event in OpenAIWebsocket._parse_function_call_arguments_done(event): + assert isinstance(event, response_events[iter]) + iter += 1 + mock_send.assert_awaited() + assert len(mock_send.await_args_list) == 2 + mock_send.assert_any_await( + ConversationItemCreateEvent( + type="conversation.item.create", + item=ConversationItem( + type="function_call_output", + output=func_result, + call_id="call_id", + ), + ) + ) + + +async def test_parse_function_call_arguments_done_fail(OpenAIWebsocket, kernel): + func_result = "result" + event = ResponseFunctionCallArgumentsDoneEvent( + call_id="call_id", + arguments='{"x": "' + func_result + '"}', + event_id="event_id", + output_index=0, + item_id="item_id", + response_id="response_id", + type="response.function_call_arguments.done", + ) + response_events = [RealtimeEvent] + OpenAIWebsocket._current_settings = OpenAIRealtimeExecutionSettings( + instructions="instructions", ai_model_id="gpt-4o-realtime-preview" + ) + OpenAIWebsocket._current_settings.function_choice_behavior = FunctionChoiceBehavior.Auto() + # This function name is invalid + OpenAIWebsocket._call_id_to_function_map["call_id"] = "function_name" + func = kernel_function(name="function_name", description="function_description")(lambda x: x) + kernel.add_function(plugin_name="plugin_name", function_name="function_name", function=func) + OpenAIWebsocket.kernel = kernel + iter = 0 + async for event in OpenAIWebsocket._parse_function_call_arguments_done(event): + assert isinstance(event, response_events[iter]) + iter += 1 + + +async def test_send_audio(OpenAIWebsocket): + audio_event = 
RealtimeAudioEvent( + audio=AudioContent(data=b"audio data", mime_type="audio/wav"), + ) + with patch.object(OpenAIWebsocket, "_send") as mock_send: + await OpenAIWebsocket.send(audio_event) + mock_send.assert_awaited() + assert len(mock_send.await_args_list) == 1 + mock_send.assert_any_await( + InputAudioBufferAppendEvent( + audio="audio data", + type="input_audio_buffer.append", + ) + ) From 39739b7822d79967543ea326ca709e615a74ed34 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Wed, 26 Feb 2025 21:35:08 +0100 Subject: [PATCH 46/50] added audio callback to receive --- .../realtime/01a-chat_with_realtime_websocket.py | 2 +- .../ai/open_ai/services/open_ai_realtime.py | 6 ++++++ .../connectors/ai/realtime_client_base.py | 10 ++++++++++ python/semantic_kernel/contents/__init__.py | 16 ++++++++++++++++ 4 files changed, 33 insertions(+), 1 deletion(-) diff --git a/python/samples/concepts/realtime/01a-chat_with_realtime_websocket.py b/python/samples/concepts/realtime/01a-chat_with_realtime_websocket.py index 12fee34a0d07..d857cb834773 100644 --- a/python/samples/concepts/realtime/01a-chat_with_realtime_websocket.py +++ b/python/samples/concepts/realtime/01a-chat_with_realtime_websocket.py @@ -9,7 +9,7 @@ ListenEvents, OpenAIRealtimeExecutionSettings, ) -from semantic_kernel.contents.realtime_events import RealtimeAudioEvent, RealtimeTextEvent +from semantic_kernel.contents import RealtimeAudioEvent, RealtimeTextEvent logging.basicConfig(level=logging.WARNING) utils_log = logging.getLogger("samples.concepts.realtime.utils") diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py index dcc7cc840c3a..894edb9892c5 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py @@ -661,8 +661,11 @@ class OpenAIRealtimeWebRTCBase(OpenAIRealtimeBase): @override 
async def receive( self, + audio_output_callback: Callable[[ndarray], Coroutine[Any, Any, None]] | None = None, **kwargs: Any, ) -> AsyncGenerator[RealtimeEvents, None]: + if audio_output_callback: + self.audio_output_callback = audio_output_callback while True: event = await self._receive_buffer.get() yield event @@ -907,8 +910,11 @@ class OpenAIRealtimeWebsocketBase(OpenAIRealtimeBase): @override async def receive( self, + audio_output_callback: Callable[[ndarray], Coroutine[Any, Any, None]] | None = None, **kwargs: Any, ) -> AsyncGenerator[RealtimeEvents, None]: + if audio_output_callback: + self.audio_output_callback = audio_output_callback await self.connected.wait() if not self.connection: raise ValueError("Connection is not established.") diff --git a/python/semantic_kernel/connectors/ai/realtime_client_base.py b/python/semantic_kernel/connectors/ai/realtime_client_base.py index 2f81300f4625..71c16ac210e6 100644 --- a/python/semantic_kernel/connectors/ai/realtime_client_base.py +++ b/python/semantic_kernel/connectors/ai/realtime_client_base.py @@ -45,11 +45,21 @@ async def send(self, event: RealtimeEvents) -> None: @abstractmethod def receive( self, + audio_output_callback: Callable[[ndarray], Coroutine[Any, Any, None]] | None = None, **kwargs: Any, ) -> AsyncGenerator[RealtimeEvents, None]: """Starts listening for messages from the service, generates events. Args: + audio_output_callback: The audio output callback, optional. + This should be a coroutine, that takes a ndarray with audio as input. + The goal of this function is to allow you to play the audio with the + least amount of latency possible. + It is called first in both websockets and webrtc. + Even when passed, the audio content will still be + added to the receiving queue. + This can also be set in the constructor. + When supplied here it will override any value in the class. kwargs: Additional arguments. 
""" raise NotImplementedError diff --git a/python/semantic_kernel/contents/__init__.py b/python/semantic_kernel/contents/__init__.py index 5d70a49c1f93..cb69b29ac6c3 100644 --- a/python/semantic_kernel/contents/__init__.py +++ b/python/semantic_kernel/contents/__init__.py @@ -11,6 +11,15 @@ from semantic_kernel.contents.history_reducer.chat_history_summarization_reducer import ChatHistorySummarizationReducer from semantic_kernel.contents.history_reducer.chat_history_truncation_reducer import ChatHistoryTruncationReducer from semantic_kernel.contents.image_content import ImageContent +from semantic_kernel.contents.realtime_events import ( + RealtimeAudioEvent, + RealtimeEvent, + RealtimeEvents, + RealtimeFunctionCallEvent, + RealtimeFunctionResultEvent, + RealtimeImageEvent, + RealtimeTextEvent, +) from semantic_kernel.contents.streaming_annotation_content import StreamingAnnotationContent from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent from semantic_kernel.contents.streaming_file_reference_content import StreamingFileReferenceContent @@ -33,6 +42,13 @@ "FunctionCallContent", "FunctionResultContent", "ImageContent", + "RealtimeAudioEvent", + "RealtimeEvent", + "RealtimeEvents", + "RealtimeFunctionCallEvent", + "RealtimeFunctionResultEvent", + "RealtimeImageEvent", + "RealtimeTextEvent", "StreamingAnnotationContent", "StreamingChatMessageContent", "StreamingFileReferenceContent", From db01504c4f697055447ab84c6766054c16a8f985 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Fri, 28 Feb 2025 14:00:51 +0100 Subject: [PATCH 47/50] added tests and improved samples --- .../01a-chat_with_realtime_websocket.py | 8 +- .../realtime/01b-chat_with_realtime_webrtc.py | 13 +- ...2a-chat_with_function_calling_websocket.py | 49 ++-- .../02b-chat_with_function_calling_webrtc.py | 34 ++- .../demos/call_automation/call_automation.py | 38 ++-- .../samples/demos/call_automation/readme.md | 38 ++-- 
.../demos/call_automation/requirements.txt | 4 - .../services/anthropic_chat_completion.py | 6 +- .../azure_ai_inference_chat_completion.py | 6 +- .../services/bedrock_chat_completion.py | 6 +- .../connectors/ai/function_calling_utils.py | 4 +- .../services/google_ai_chat_completion.py | 6 +- .../services/vertex_ai_chat_completion.py | 6 +- .../ollama/services/ollama_chat_completion.py | 8 +- .../connectors/ai/open_ai/__init__.py | 2 + .../open_ai_realtime_execution_settings.py | 10 +- .../ai/open_ai/services/azure_realtime.py | 22 +- .../ai/open_ai/services/open_ai_realtime.py | 56 ++--- .../connectors/ai/realtime_client_base.py | 4 +- .../contents/chat_message_content.py | 14 +- .../contents/function_call_content.py | 3 - .../contents/function_result_content.py | 3 - .../streaming_chat_message_content.py | 26 ++- .../open_ai/services/test_openai_realtime.py | 214 ++++++++++++++++-- 24 files changed, 376 insertions(+), 204 deletions(-) delete mode 100644 python/samples/demos/call_automation/requirements.txt diff --git a/python/samples/concepts/realtime/01a-chat_with_realtime_websocket.py b/python/samples/concepts/realtime/01a-chat_with_realtime_websocket.py index d857cb834773..e81f1ff5268c 100644 --- a/python/samples/concepts/realtime/01a-chat_with_realtime_websocket.py +++ b/python/samples/concepts/realtime/01a-chat_with_realtime_websocket.py @@ -5,9 +5,9 @@ from samples.concepts.realtime.utils import AudioPlayerWebsocket, AudioRecorderWebsocket, check_audio_devices from semantic_kernel.connectors.ai.open_ai import ( + AzureRealtimeExecutionSettings, AzureRealtimeWebsocket, ListenEvents, - OpenAIRealtimeExecutionSettings, ) from semantic_kernel.contents import RealtimeAudioEvent, RealtimeTextEvent @@ -43,7 +43,7 @@ async def main() -> None: audio_player = AudioPlayerWebsocket() audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client) # Create the settings for the session - settings = OpenAIRealtimeExecutionSettings( + settings = 
AzureRealtimeExecutionSettings( instructions=""" You are a chat bot. Your name is Mosscap and you have one goal: figure out what people need. @@ -78,7 +78,9 @@ async def main() -> None: if __name__ == "__main__": print( - "Instructions: Begin speaking. The API will detect when you stop and automatically generate a response. " + "Instructions: The model will start speaking immediately," + "this can be turned off by removing `create_response=True` above." + "The model will detect when you stop and automatically generate a response. " "Press ctrl + c to stop the program." ) asyncio.run(main()) diff --git a/python/samples/concepts/realtime/01b-chat_with_realtime_webrtc.py b/python/samples/concepts/realtime/01b-chat_with_realtime_webrtc.py index c17d4518fc55..516c40a22b8c 100644 --- a/python/samples/concepts/realtime/01b-chat_with_realtime_webrtc.py +++ b/python/samples/concepts/realtime/01b-chat_with_realtime_webrtc.py @@ -38,11 +38,7 @@ async def main() -> None: # create the realtime client and optionally add the audio output function, this is optional # you can define the protocol to use, either "websocket" or "webrtc" # they will behave the same way, even though the underlying protocol is quite different - audio_player = AudioPlayerWebRTC() - realtime_client = OpenAIRealtimeWebRTC( - audio_track=AudioRecorderWebRTC(), - audio_output_callback=audio_player.client_callback, - ) + realtime_client = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC()) # Create the settings for the session settings = OpenAIRealtimeExecutionSettings( instructions=""" @@ -58,9 +54,10 @@ async def main() -> None: # for more details. 
voice="alloy", ) + audio_player = AudioPlayerWebRTC() # the context manager calls the create_session method on the client and starts listening to the audio stream async with audio_player, realtime_client(settings=settings, create_response=True): - async for event in realtime_client.receive(): + async for event in realtime_client.receive(audio_output_callback=audio_player.client_callback): match event.event_type: case "text": # the model returns both audio and transcript of the audio, which we will print @@ -75,7 +72,9 @@ async def main() -> None: if __name__ == "__main__": print( - "Instructions: Begin speaking. The API will detect when you stop and automatically generate a response. " + "Instructions: The model will start speaking immediately," + "this can be turned off by removing `create_response=True` above." + "The model will detect when you stop and automatically generate a response. " "Press ctrl + c to stop the program." ) asyncio.run(main()) diff --git a/python/samples/concepts/realtime/02a-chat_with_function_calling_websocket.py b/python/samples/concepts/realtime/02a-chat_with_function_calling_websocket.py index 6b0a8efb9ee0..812502c7a5b5 100644 --- a/python/samples/concepts/realtime/02a-chat_with_function_calling_websocket.py +++ b/python/samples/concepts/realtime/02a-chat_with_function_calling_websocket.py @@ -9,9 +9,9 @@ from semantic_kernel import Kernel from semantic_kernel.connectors.ai import FunctionChoiceBehavior from semantic_kernel.connectors.ai.open_ai import ( + AzureRealtimeExecutionSettings, AzureRealtimeWebsocket, ListenEvents, - OpenAIRealtimeExecutionSettings, TurnDetection, ) from semantic_kernel.contents import ChatHistory @@ -60,44 +60,43 @@ async def main() -> None: kernel = Kernel() kernel.add_functions(plugin_name="helpers", functions=[goodbye, get_weather, get_date_time]) + # create the realtime client, in this the Azure Websocket client, there are also OpenAI Websocket and WebRTC clients + # See 
02b-chat_with_function_calling_webrtc.py for an example of the WebRTC client + realtime_client = AzureRealtimeWebsocket() # create the audio player and audio track # both take a device_id parameter, which is the index of the device to use, if None the default device is used audio_player = AudioPlayerWebsocket() - # create the realtime client and add the audio output function, this is optional - # you can define the protocol to use, either "websocket" or "webrtc" - # (at this time Azure only support websockets) - # they will behave the same way, even though the underlying protocol is quite different - realtime_client = AzureRealtimeWebsocket( - audio_output_callback=audio_player.client_callback, - ) audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client) # Create the settings for the session # The realtime api, does not use a system message, but takes instructions as a parameter for a session - instructions = """ + # Another important setting is to tune the server_vad turn detection + # if this is turned off (by setting turn_detection=None), you will have to send + # the "input_audio_buffer.commit" and "response.create" event to the realtime api + # to signal the end of the user's turn and start the response. + # manual VAD is not part of this sample + # for more info: https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-turn_detection + settings = AzureRealtimeExecutionSettings( + instructions=""" You are a chat bot. Your name is Mosscap and you have one goal: figure out what people need. Your full name, should you need to know it, is Splendid Speckled Mosscap. You communicate effectively, but you tend to answer with long flowery prose. 
- """ - # the key thing to decide on is to enable the server_vad turn detection - # if turn is turned off (by setting turn_detection=None), you will have to send - # the "input_audio_buffer.commit" and "response.create" event to the realtime api - # to signal the end of the user's turn and start the response. - # manual VAD is not part of this sample - # for more info: https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-turn_detection - settings = OpenAIRealtimeExecutionSettings( - instructions=instructions, + """, + # see https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-voice for the full list of voices # noqa: E501 voice="alloy", turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8), function_choice_behavior=FunctionChoiceBehavior.Auto(), ) - # and we can add a chat history to conversation after starting it + # and we can add a chat history to conversation to seed the conversation chat_history = ChatHistory() - chat_history.add_user_message("Hi there, who are you?") - chat_history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.") + chat_history.add_user_message("Hi there, I'm based in Amsterdam.") + chat_history.add_assistant_message( + "I am Mosscap, a chat bot. I'm trying to figure out what people need, " + "I can tell you what the weather is or the time." 
+ ) # the context manager calls the create_session method on the client and starts listening to the audio stream async with ( @@ -110,7 +109,9 @@ async def main() -> None: create_response=True, ), ): - async for event in realtime_client.receive(): + # the audio_output_callback can be added here or in the client constructor + # using this gives the smoothest experience + async for event in realtime_client.receive(audio_output_callback=audio_player.client_callback): match event: case RealtimeTextEvent(): if print_transcript: @@ -128,7 +129,9 @@ async def main() -> None: if __name__ == "__main__": print( - "Instructions: Begin speaking. The API will detect when you stop and automatically generate a response. " + "Instructions: The model will start speaking immediately," + "this can be turned off by removing `create_response=True` above." + "The model will detect when you stop and automatically generate a response. " "Press ctrl + c to stop the program." ) asyncio.run(main()) diff --git a/python/samples/concepts/realtime/02b-chat_with_function_calling_webrtc.py b/python/samples/concepts/realtime/02b-chat_with_function_calling_webrtc.py index ea19458ba5dc..b03908e721ca 100644 --- a/python/samples/concepts/realtime/02b-chat_with_function_calling_webrtc.py +++ b/python/samples/concepts/realtime/02b-chat_with_function_calling_webrtc.py @@ -74,33 +74,27 @@ async def main() -> None: # create the audio player and audio track # both take a device_id parameter, which is the index of the device to use, if None the default device is used audio_player = AudioPlayerWebRTC() - audio_track = AudioRecorderWebRTC() # create the realtime client and optionally add the audio output function, this is optional - # you can define the protocol to use, either "websocket" or "webrtc" - # they will behave the same way, even though the underlying protocol is quite different - realtime_client = OpenAIRealtimeWebRTC( - audio_output_callback=audio_player.client_callback, - audio_track=audio_track, - 
) + # and can also be passed in the receive method + realtime_client = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC()) # Create the settings for the session # The realtime api, does not use a system message, but takes instructions as a parameter for a session - instructions = """ + # Another important setting is to tune the server_vad turn detection + # if this is turned off (by setting turn_detection=None), you will have to send + # the "input_audio_buffer.commit" and "response.create" event to the realtime api + # to signal the end of the user's turn and start the response. + # manual VAD is not part of this sample + # for more info: https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-turn_detection + settings = OpenAIRealtimeExecutionSettings( + instructions=""" You are a chat bot. Your name is Mosscap and you have one goal: figure out what people need. Your full name, should you need to know it, is Splendid Speckled Mosscap. You communicate effectively, but you tend to answer with long flowery prose. - """ - # the key thing to decide on is to enable the server_vad turn detection - # if turn is turned off (by setting turn_detection=None), you will have to send - # the "input_audio_buffer.commit" and "response.create" event to the realtime api - # to signal the end of the user's turn and start the response. 
- # manual VAD is not part of this sample - # for more info: https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-turn_detection - settings = OpenAIRealtimeExecutionSettings( - instructions=instructions, + """, voice="alloy", turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8), function_choice_behavior=FunctionChoiceBehavior.Auto(), @@ -120,7 +114,7 @@ async def main() -> None: create_response=True, ), ): - async for event in realtime_client.receive(): + async for event in realtime_client.receive(audio_output_callback=audio_player.client_callback): match event: case RealtimeTextEvent(): if print_transcript: @@ -137,7 +131,9 @@ async def main() -> None: if __name__ == "__main__": print( - "Instructions: Begin speaking. The API will detect when you stop and automatically generate a response. " + "Instructions: The model will start speaking immediately," + "this can be turned off by removing `create_response=True` above." + "The model will detect when you stop and automatically generate a response. " "Press ctrl + c to stop the program." ) asyncio.run(main()) diff --git a/python/samples/demos/call_automation/call_automation.py b/python/samples/demos/call_automation/call_automation.py index 6b69dd1168bc..2ea8058167d9 100755 --- a/python/samples/demos/call_automation/call_automation.py +++ b/python/samples/demos/call_automation/call_automation.py @@ -1,18 +1,13 @@ -#!/usr/bin/env uv run # noqa: CPY001 +# Copyright (c) Microsoft. All rights reserved. + #################################################################### -# Copyright (c) Microsoft. All rights reserved. 
# # Sample Quart webapp with that connects to Azure OpenAI # -# If you have `uv` installed and the environment variables set: # -# `ACS_CONNECTION_STRING` # -# `CALLBACK_URI_HOST` # -# `AZURE_OPENAI_ENDPOINT` # -# `AZURE_OPENAI_REALTIME_DEPLOYMENT_NAME` # -# `AZURE_OPENAI_API_VERSION` # -# `AZURE_OPENAI_API_KEY` (optionally) # -# See the readme.md for more info # -# You can run this example with just # -# # -# `.call_automation.py` # +# Make sure to install `uv`, see: # +# https://docs.astral.sh/uv/getting-started/installation/ # +# and rename .env.example to .env and fill in the values. # +# Follow the guidance in README.md for more info. # +# To run the app, use: # +# `uv run --env-file .env call_automation.py` # #################################################################### # # /// script @@ -49,15 +44,12 @@ from semantic_kernel import Kernel from semantic_kernel.connectors.ai import FunctionChoiceBehavior from semantic_kernel.connectors.ai.open_ai import ( + AzureRealtimeExecutionSettings, AzureRealtimeWebsocket, - InputAudioTranscription, ListenEvents, - OpenAIRealtimeExecutionSettings, - TurnDetection, ) from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase -from semantic_kernel.contents import AudioContent -from semantic_kernel.contents.realtime_events import RealtimeAudioEvent +from semantic_kernel.contents import AudioContent, RealtimeAudioEvent from semantic_kernel.functions import kernel_function # Callback events URI to handle callback events. @@ -134,7 +126,7 @@ async def handle_realtime_messages(client: RealtimeClientBase): This function only handles the non-audio messages. Audio is done through the callback so that it is faster and smoother. 
""" - async for event in client.receive(): + async for event in client.receive(audio_output_callback=from_realtime_to_acs): match event.service_type: case ListenEvents.SESSION_CREATED: print("Session Created Message") @@ -169,19 +161,19 @@ async def ws(): app.logger.info("Client connected to WebSocket") # create the client, using the audio callback - client = AzureRealtimeWebsocket(audio_output_callback=from_realtime_to_acs) - settings = OpenAIRealtimeExecutionSettings( + client = AzureRealtimeWebsocket() + settings = AzureRealtimeExecutionSettings( instructions="""You are a chat bot. Your name is Mosscap and you have one goal: figure out what people need. Your full name, should you need to know it, is Splendid Speckled Mosscap. You communicate effectively, but you tend to answer with long flowery prose.""", - turn_detection=TurnDetection(type="server_vad"), + turn_detection={"type": "server_vad"}, voice="shimmer", input_audio_format="pcm16", output_audio_format="pcm16", - input_audio_transcription=InputAudioTranscription(model="whisper-1"), + input_audio_transcription={"model": "whisper-1"}, function_choice_behavior=FunctionChoiceBehavior.Auto(), ) diff --git a/python/samples/demos/call_automation/readme.md b/python/samples/demos/call_automation/readme.md index 9c366c6a66b5..ca69b39e0a3b 100644 --- a/python/samples/demos/call_automation/readme.md +++ b/python/samples/demos/call_automation/readme.md @@ -6,25 +6,18 @@ Original code for this sample can be found [here](https://github.com/Azure-Sampl ## Prerequisites -- An Azure account with an active subscription. [Create an account for free](https://azure.microsoft.com/free/?WT.mc_id=A261C142F). +- An Azure account with an active subscription. [Create an account for free](https://azure.microsoft.com/free/?WT.mc_id=A261C142F). - A deployed Communication Services resource. [Create a Communication Services resource](https://docs.microsoft.com/azure/communication-services/quickstarts/create-communication-resource). 
- A [phone number](https://learn.microsoft.com/en-us/azure/communication-services/quickstarts/telephony/get-phone-number) in your Azure Communication Services resource that can get inbound calls. NB: phone numbers are not available in free subscriptions. - [Python](https://www.python.org/downloads/) 3.9 or above. - An Azure OpenAI Resource and Deployed Model. See [instructions](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/create-resource?pivots=web-portal). +- Install `uv`, see [the uv docs](https://docs.astral.sh/uv/getting-started/installation/). -## Before running the sample for the first time +## To run the app 1. Open an instance of PowerShell, Windows Terminal, Command Prompt or equivalent and navigate to the directory that you would like to clone the sample to. 2. git clone `https://github.com/microsoft/semantic-kernel.git`. -3. Navigate to `python/samples/demos/call_automation` folder and open `call_automation.py` file. - -### Setup the Python environment - -Create and activate python virtual environment and install required packages using following command -``` -pip install -r requirements.txt -``` -Alternatively, if you have `uv` installed, you can ship this step. +3. Navigate to `python/samples/demos/call_automation` folder ### Setup and host your Azure DevTunnel @@ -41,19 +34,20 @@ devtunnel host Copy the `.env.example` file to `.env` and update the following values: 1. `ACS_CONNECTION_STRING`: Azure Communication Service resource's connection string. -2. `CALLBACK_URI_HOST`: Base url of the app. (For local development use dev tunnel url) -1. `AZURE_OPENAI_ENDPOINT`: Azure Open AI service endpoint -2. `AZURE_OPENAI_DEPLOYMENT_MODEL_NAME`: Azure Open AI deployment name -3. `AZURE_OPENAI_API_VERSION`: Azure Open AI API version, this should be one that includes the realtime api, for instance '2024-10-01-preview' -4. `AZURE_OPENAI_API_KEY`: Azure Open AI API key, optionally, you can also use Entra Auth. +2. 
`CALLBACK_URI_HOST`: Base url of the app. (For local development use the dev tunnel url from the step above) +3. `AZURE_OPENAI_ENDPOINT`: Azure OpenAI service endpoint +4. `AZURE_OPENAI_DEPLOYMENT_MODEL_NAME`: Azure OpenAI deployment name +5. `AZURE_OPENAI_API_VERSION`: Azure OpenAI API version, this should be one that includes the realtime api, for instance '2024-10-01-preview' +6. `AZURE_OPENAI_API_KEY`: Azure OpenAI API key, optionally, you can also use Entra Auth. -## Run app locally +## Run the app 1. Navigate to `call_automation` folder and do one of the following to start the main application: - - run `call_automation.py` in debug from your IDE - - use command `python ./call_automation.py` to run it from PowerShell, Command Prompt or another Terminal. - - execute `./call_automation.py` directly in your terminal (this uses `uv`, which will then install the requirements in a temporary virtual environment, see [uv docs](https://docs.astral.sh/uv/guides/scripts) for more info). -2. Browser should pop up with the below page. If not navigate it to `http://localhost:8080/`or your dev tunnel url. + - run `call_automation.py` in debug mode from your IDE (VSCode will load your .env variables into the environment automatically; other IDEs might need an extra step). + - execute `uv run --env-file .env call_automation.py` directly in your terminal (this uses `uv`, which will then install the requirements in a temporary virtual environment, see [uv docs](https://docs.astral.sh/uv/guides/scripts) for more info). +2. Browser should pop up with a simple page. If not, navigate to `http://localhost:8080/` or your dev tunnel url. 3. Register an EventGrid Webhook for the IncomingCall(`https:///api/incomingCall`) event that points to your devtunnel URI. Instructions [here](https://learn.microsoft.com/en-us/azure/communication-services/concepts/call-automation/incoming-call-notification). -Once that's completed you should have a running application.
The best way to test this is to place a call to your ACS phone number and talk to your intelligent agent. +Once that's completed you should have a running application. The way to test this is to place a call to your ACS phone number and talk to your intelligent agent! + +In the terminal you should see all sorts of logs from both ACS and Semantic Kernel. diff --git a/python/samples/demos/call_automation/requirements.txt b/python/samples/demos/call_automation/requirements.txt deleted file mode 100644 index 2bb034fdffc6..000000000000 --- a/python/samples/demos/call_automation/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -Quart>=0.19.6 -azure-eventgrid==4.11.0 -azure-communication-callautomation==1.4.0b1 -semantic-kernel \ No newline at end of file diff --git a/python/semantic_kernel/connectors/ai/anthropic/services/anthropic_chat_completion.py b/python/semantic_kernel/connectors/ai/anthropic/services/anthropic_chat_completion.py index e64136fe9736..1c5d670c57f1 100644 --- a/python/semantic_kernel/connectors/ai/anthropic/services/anthropic_chat_completion.py +++ b/python/semantic_kernel/connectors/ai/anthropic/services/anthropic_chat_completion.py @@ -35,9 +35,9 @@ from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceType from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings from semantic_kernel.contents.chat_history import ChatHistory -from semantic_kernel.contents.chat_message_content import ITEM_TYPES, ChatMessageContent +from semantic_kernel.contents.chat_message_content import CMC_ITEM_TYPES, ChatMessageContent from semantic_kernel.contents.function_call_content import FunctionCallContent -from semantic_kernel.contents.streaming_chat_message_content import ITEM_TYPES as STREAMING_ITEM_TYPES +from semantic_kernel.contents.streaming_chat_message_content import STREAMING_CMC_ITEM_TYPES as STREAMING_ITEM_TYPES from semantic_kernel.contents.streaming_chat_message_content import 
StreamingChatMessageContent from semantic_kernel.contents.streaming_text_content import StreamingTextContent from semantic_kernel.contents.text_content import TextContent @@ -261,7 +261,7 @@ def _create_chat_message_content( self, response: Message, response_metadata: dict[str, Any] ) -> "ChatMessageContent": """Create a chat message content object.""" - items: list[ITEM_TYPES] = [] + items: list[CMC_ITEM_TYPES] = [] items += self._get_tool_calls_from_message(response) for content_block in response.content: diff --git a/python/semantic_kernel/connectors/ai/azure_ai_inference/services/azure_ai_inference_chat_completion.py b/python/semantic_kernel/connectors/ai/azure_ai_inference/services/azure_ai_inference_chat_completion.py index 74e2b82e650f..7baabd0ee6ba 100644 --- a/python/semantic_kernel/connectors/ai/azure_ai_inference/services/azure_ai_inference_chat_completion.py +++ b/python/semantic_kernel/connectors/ai/azure_ai_inference/services/azure_ai_inference_chat_completion.py @@ -33,9 +33,9 @@ from semantic_kernel.connectors.ai.function_calling_utils import update_settings_from_function_call_configuration from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceType from semantic_kernel.contents.chat_history import ChatHistory -from semantic_kernel.contents.chat_message_content import ITEM_TYPES, ChatMessageContent +from semantic_kernel.contents.chat_message_content import CMC_ITEM_TYPES, ChatMessageContent from semantic_kernel.contents.function_call_content import FunctionCallContent -from semantic_kernel.contents.streaming_chat_message_content import ITEM_TYPES as STREAMING_ITEM_TYPES +from semantic_kernel.contents.streaming_chat_message_content import STREAMING_CMC_ITEM_TYPES as STREAMING_ITEM_TYPES from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent from semantic_kernel.contents.streaming_text_content import StreamingTextContent from semantic_kernel.contents.text_content import TextContent @@ 
-240,7 +240,7 @@ def _create_chat_message_content( Returns: A chat message content object. """ - items: list[ITEM_TYPES] = [] + items: list[CMC_ITEM_TYPES] = [] if choice.message.content: items.append( TextContent( diff --git a/python/semantic_kernel/connectors/ai/bedrock/services/bedrock_chat_completion.py b/python/semantic_kernel/connectors/ai/bedrock/services/bedrock_chat_completion.py index 5c4f3e6cd192..685deb7c436c 100644 --- a/python/semantic_kernel/connectors/ai/bedrock/services/bedrock_chat_completion.py +++ b/python/semantic_kernel/connectors/ai/bedrock/services/bedrock_chat_completion.py @@ -31,10 +31,10 @@ from semantic_kernel.connectors.ai.chat_completion_client_base import ChatCompletionClientBase from semantic_kernel.connectors.ai.completion_usage import CompletionUsage from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceType -from semantic_kernel.contents.chat_message_content import ITEM_TYPES, ChatMessageContent +from semantic_kernel.contents.chat_message_content import CMC_ITEM_TYPES, ChatMessageContent from semantic_kernel.contents.function_call_content import FunctionCallContent from semantic_kernel.contents.image_content import ImageContent -from semantic_kernel.contents.streaming_chat_message_content import ITEM_TYPES as STREAMING_ITEM_TYPES +from semantic_kernel.contents.streaming_chat_message_content import STREAMING_CMC_ITEM_TYPES as STREAMING_ITEM_TYPES from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent from semantic_kernel.contents.streaming_text_content import StreamingTextContent from semantic_kernel.contents.text_content import TextContent @@ -240,7 +240,7 @@ def _create_chat_message_content(self, response: dict[str, Any]) -> ChatMessageC prompt_tokens=response["usage"]["inputTokens"], completion_tokens=response["usage"]["outputTokens"], ) - items: list[ITEM_TYPES] = [] + items: list[CMC_ITEM_TYPES] = [] for content in response["output"]["message"]["content"]: if 
"text" in content: items.append(TextContent(text=content["text"], inner_content=content)) diff --git a/python/semantic_kernel/connectors/ai/function_calling_utils.py b/python/semantic_kernel/connectors/ai/function_calling_utils.py index 11d2c0a2eeb8..1e65fa59e537 100644 --- a/python/semantic_kernel/connectors/ai/function_calling_utils.py +++ b/python/semantic_kernel/connectors/ai/function_calling_utils.py @@ -7,7 +7,7 @@ from semantic_kernel.contents.utils.author_role import AuthorRole from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError -from semantic_kernel.utils.experimental_decorator import experimental_function +from semantic_kernel.utils.feature_stage_decorator import experimental if TYPE_CHECKING: from semantic_kernel.connectors.ai.function_choice_behavior import ( @@ -140,7 +140,7 @@ def merge_streaming_function_results( ] -@experimental_function +@experimental def prepare_settings_for_function_calling( settings: "PromptExecutionSettings", settings_class: type["PromptExecutionSettings"], diff --git a/python/semantic_kernel/connectors/ai/google/google_ai/services/google_ai_chat_completion.py b/python/semantic_kernel/connectors/ai/google/google_ai/services/google_ai_chat_completion.py index ef50f4f0ef1b..9b538b26ebec 100644 --- a/python/semantic_kernel/connectors/ai/google/google_ai/services/google_ai_chat_completion.py +++ b/python/semantic_kernel/connectors/ai/google/google_ai/services/google_ai_chat_completion.py @@ -36,9 +36,9 @@ format_gemini_function_name_to_kernel_function_fully_qualified_name, ) from semantic_kernel.contents.chat_history import ChatHistory -from semantic_kernel.contents.chat_message_content import ITEM_TYPES, ChatMessageContent +from semantic_kernel.contents.chat_message_content import CMC_ITEM_TYPES, ChatMessageContent from semantic_kernel.contents.function_call_content import FunctionCallContent -from semantic_kernel.contents.streaming_chat_message_content import ITEM_TYPES as STREAMING_ITEM_TYPES 
+from semantic_kernel.contents.streaming_chat_message_content import STREAMING_CMC_ITEM_TYPES as STREAMING_ITEM_TYPES from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent from semantic_kernel.contents.streaming_text_content import StreamingTextContent from semantic_kernel.contents.text_content import TextContent @@ -244,7 +244,7 @@ def _create_chat_message_content( response_metadata = self._get_metadata_from_response(response) response_metadata.update(self._get_metadata_from_candidate(candidate)) - items: list[ITEM_TYPES] = [] + items: list[CMC_ITEM_TYPES] = [] for idx, part in enumerate(candidate.content.parts): if part.text: items.append(TextContent(text=part.text, inner_content=response, metadata=response_metadata)) diff --git a/python/semantic_kernel/connectors/ai/google/vertex_ai/services/vertex_ai_chat_completion.py b/python/semantic_kernel/connectors/ai/google/vertex_ai/services/vertex_ai_chat_completion.py index bd7c1346accf..beec827bfb2f 100644 --- a/python/semantic_kernel/connectors/ai/google/vertex_ai/services/vertex_ai_chat_completion.py +++ b/python/semantic_kernel/connectors/ai/google/vertex_ai/services/vertex_ai_chat_completion.py @@ -34,9 +34,9 @@ ) from semantic_kernel.connectors.ai.google.vertex_ai.vertex_ai_settings import VertexAISettings from semantic_kernel.contents.chat_history import ChatHistory -from semantic_kernel.contents.chat_message_content import ITEM_TYPES, ChatMessageContent +from semantic_kernel.contents.chat_message_content import CMC_ITEM_TYPES, ChatMessageContent from semantic_kernel.contents.function_call_content import FunctionCallContent -from semantic_kernel.contents.streaming_chat_message_content import ITEM_TYPES as STREAMING_ITEM_TYPES +from semantic_kernel.contents.streaming_chat_message_content import STREAMING_CMC_ITEM_TYPES as STREAMING_ITEM_TYPES from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent from 
semantic_kernel.contents.streaming_text_content import StreamingTextContent from semantic_kernel.contents.text_content import TextContent @@ -235,7 +235,7 @@ def _create_chat_message_content(self, response: GenerationResponse, candidate: response_metadata = self._get_metadata_from_response(response) response_metadata.update(self._get_metadata_from_candidate(candidate)) - items: list[ITEM_TYPES] = [] + items: list[CMC_ITEM_TYPES] = [] for idx, part in enumerate(candidate.content.parts): part_dict = part.to_dict() if "text" in part_dict: diff --git a/python/semantic_kernel/connectors/ai/ollama/services/ollama_chat_completion.py b/python/semantic_kernel/connectors/ai/ollama/services/ollama_chat_completion.py index 103133af2c9f..68a62e434423 100644 --- a/python/semantic_kernel/connectors/ai/ollama/services/ollama_chat_completion.py +++ b/python/semantic_kernel/connectors/ai/ollama/services/ollama_chat_completion.py @@ -27,9 +27,9 @@ ) from semantic_kernel.contents import AuthorRole from semantic_kernel.contents.chat_history import ChatHistory -from semantic_kernel.contents.chat_message_content import ITEM_TYPES, ChatMessageContent +from semantic_kernel.contents.chat_message_content import CMC_ITEM_TYPES, ChatMessageContent from semantic_kernel.contents.function_call_content import FunctionCallContent -from semantic_kernel.contents.streaming_chat_message_content import ITEM_TYPES as STREAMING_ITEM_TYPES +from semantic_kernel.contents.streaming_chat_message_content import STREAMING_CMC_ITEM_TYPES as STREAMING_ITEM_TYPES from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent from semantic_kernel.contents.streaming_text_content import StreamingTextContent from semantic_kernel.contents.text_content import TextContent @@ -255,7 +255,7 @@ def _parse_tool_calls(self, tool_calls: Sequence[Message.ToolCall] | None, items def _create_chat_message_content_from_chat_response(self, response: ChatResponse) -> ChatMessageContent: """Create a 
chat message content from the response.""" - items: list[ITEM_TYPES] = [] + items: list[CMC_ITEM_TYPES] = [] if response.message.content: items.append( TextContent( @@ -274,7 +274,7 @@ def _create_chat_message_content_from_chat_response(self, response: ChatResponse def _create_chat_message_content(self, response: Mapping[str, Any]) -> ChatMessageContent: """Create a chat message content from the response.""" - items: list[ITEM_TYPES] = [] + items: list[CMC_ITEM_TYPES] = [] if not (message := response.get("message", None)): raise ServiceInvalidResponseError("No message content found in response.") diff --git a/python/semantic_kernel/connectors/ai/open_ai/__init__.py b/python/semantic_kernel/connectors/ai/open_ai/__init__.py index 919310f448ad..34e11756fdb7 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/__init__.py +++ b/python/semantic_kernel/connectors/ai/open_ai/__init__.py @@ -23,6 +23,7 @@ OpenAITextPromptExecutionSettings, ) from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( + AzureRealtimeExecutionSettings, InputAudioTranscription, OpenAIRealtimeExecutionSettings, TurnDetection, @@ -67,6 +68,7 @@ "AzureDataSourceParameters", "AzureEmbeddingDependency", "AzureOpenAISettings", + "AzureRealtimeExecutionSettings", "AzureRealtimeWebsocket", "AzureTextCompletion", "AzureTextEmbedding", diff --git a/python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py b/python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py index 446161e365d1..2c4fc74738b5 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py +++ b/python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py @@ -52,8 +52,8 @@ class OpenAIRealtimeExecutionSettings(PromptExecutionSettings): voice: str | None = None 
input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | None = None output_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | None = None - input_audio_transcription: Annotated[InputAudioTranscription | Mapping[str, str] | None, Field()] = None - turn_detection: TurnDetection | None = None + input_audio_transcription: InputAudioTranscription | Mapping[str, str] | None = None + turn_detection: TurnDetection | Mapping[str, str] | None = None tools: Annotated[ list[dict[str, Any]] | None, Field( @@ -70,3 +70,9 @@ class OpenAIRealtimeExecutionSettings(PromptExecutionSettings): ] = None temperature: Annotated[float | None, Field(ge=0.0, le=2.0)] = None max_response_output_tokens: Annotated[int | Literal["inf"] | None, Field(gt=0)] = None + + +class AzureRealtimeExecutionSettings(OpenAIRealtimeExecutionSettings): + """Request settings for Azure OpenAI realtime services.""" + + pass diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py b/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py index 560062b95a0e..39e5690fb3c1 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py @@ -1,22 +1,32 @@ # Copyright (c) Microsoft. All rights reserved. 
+import sys from collections.abc import Callable, Coroutine, Mapping from typing import Any +if sys.version_info >= (3, 12): + from typing import override # pragma: no cover +else: + from typing_extensions import override # pragma: no cover + from numpy import ndarray from openai import AsyncAzureOpenAI from openai.lib.azure import AsyncAzureADTokenProvider from pydantic import ValidationError +from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( + AzureRealtimeExecutionSettings, +) from semantic_kernel.connectors.ai.open_ai.services.azure_config_base import AzureOpenAIConfigBase from semantic_kernel.connectors.ai.open_ai.services.open_ai_model_types import OpenAIModelTypes from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime import OpenAIRealtimeWebsocketBase from semantic_kernel.connectors.ai.open_ai.settings.azure_open_ai_settings import AzureOpenAISettings +from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError -from semantic_kernel.utils.experimental_decorator import experimental_class +from semantic_kernel.utils.feature_stage_decorator import experimental -@experimental_class +@experimental class AzureRealtimeWebsocket(OpenAIRealtimeWebsocketBase, AzureOpenAIConfigBase): """Azure OpenAI Realtime service using WebSocket protocol.""" @@ -44,8 +54,8 @@ def __init__( audio_output_callback: The audio output callback, optional. This should be a coroutine, that takes a ndarray with audio as input. The goal of this function is to allow you to play the audio with the - least amount of latency possible. - It is called first in both websockets and webrtc. + least amount of latency possible, because it is called first before further processing. + It can also be set in the `receive` method. Even when passed, the audio content will still be added to the receiving queue. 
service_id: The service ID for the Azure deployment. (Optional) @@ -100,3 +110,7 @@ def __init__( client=async_client, **kwargs, ) + + @override + def get_prompt_execution_settings_class(self) -> type[PromptExecutionSettings]: + return AzureRealtimeExecutionSettings diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py index 894edb9892c5..d6422066394b 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py @@ -52,9 +52,6 @@ prepare_settings_for_function_calling, ) from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceType -from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( - OpenAIRealtimeExecutionSettings, -) from semantic_kernel.connectors.ai.open_ai.services.open_ai_config_base import OpenAIConfigBase from semantic_kernel.connectors.ai.open_ai.services.open_ai_handler import OpenAIHandler from semantic_kernel.connectors.ai.open_ai.services.open_ai_model_types import OpenAIModelTypes @@ -79,7 +76,7 @@ from semantic_kernel.exceptions import ContentException from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError from semantic_kernel.kernel import Kernel -from semantic_kernel.utils.experimental_decorator import experimental_class +from semantic_kernel.utils.feature_stage_decorator import experimental if TYPE_CHECKING: from aiortc.mediastreams import MediaStreamTrack @@ -97,7 +94,7 @@ # region constants -@experimental_class +@experimental class SendEvents(str, Enum): """Events that can be sent.""" @@ -112,7 +109,7 @@ class SendEvents(str, Enum): RESPONSE_CANCEL = "response.cancel" -@experimental_class +@experimental class ListenEvents(str, Enum): """Events that can be listened to.""" @@ -160,8 +157,8 @@ def 
update_settings_from_function_call_configuration( and hasattr(settings, "tool_choice") and hasattr(settings, "tools") ): - settings.tool_choice = type - settings.tools = [ + settings.tool_choice = type # type: ignore + settings.tools = [ # type: ignore kernel_function_metadata_to_function_call_format(f) for f in function_choice_configuration.available_functions ] @@ -259,7 +256,7 @@ def _create_openai_realtime_client_event(event_type: SendEvents, **kwargs: Any) # region Base -@experimental_class +@experimental class OpenAIRealtimeBase(OpenAIHandler, RealtimeClientBase): """OpenAI Realtime service.""" @@ -513,17 +510,16 @@ async def send(self, event: RealtimeEvents, **kwargs: Any) -> None: if not settings: logger.error("Event data does not contain 'settings'") return - if not isinstance(settings, OpenAIRealtimeExecutionSettings): - try: - settings = self.get_prompt_execution_settings_from_settings(settings) - except Exception as e: - logger.error( - f"Failed to properly create settings from passed settings: {settings}, error: {e}" - ) - return - assert isinstance(settings, OpenAIRealtimeExecutionSettings) # nosec - if not settings.ai_model_id: - settings.ai_model_id = self.ai_model_id + try: + settings = self.get_prompt_execution_settings_from_settings(settings) + except Exception as e: + logger.error( + f"Failed to properly create settings from passed settings: {settings}, error: {e}" + ) + return + assert isinstance(settings, self.get_prompt_execution_settings_class()) # nosec + if not settings.ai_model_id: # type: ignore + settings.ai_model_id = self.ai_model_id # type: ignore await self._send( _create_openai_realtime_client_event( event_type=event.service_type, @@ -648,11 +644,10 @@ def _update_function_choice_settings_callback( # region WebRTC -@experimental_class +@experimental class OpenAIRealtimeWebRTCBase(OpenAIRealtimeBase): """OpenAI WebRTC Realtime service.""" - protocol: ClassVar[Literal["webrtc"]] = "webrtc" # type: ignore peer_connection: 
RTCPeerConnection | None = None data_channel: RTCDataChannel | None = None audio_track: MediaStreamTrack | None = None @@ -749,7 +744,7 @@ async def close_session(self) -> None: self.data_channel = None async def _on_track(self, track: "MediaStreamTrack") -> None: - logger.info(f"Received {track.kind} track from remote") + logger.debug(f"Received {track.kind} track from remote") if track.kind != "audio": return while True: @@ -822,7 +817,7 @@ async def _get_ephemeral_token(self) -> str: raise -@experimental_class +@experimental class OpenAIRealtimeWebRTC(OpenAIRealtimeWebRTCBase, OpenAIConfigBase): """OpenAI Realtime service using WebRTC protocol.""" @@ -843,12 +838,11 @@ def __init__( """Initialize an OpenAIRealtime service. Args: - protocol: The protocol to use, must be either "websocket" or "webrtc". audio_output_callback: The audio output callback, optional. This should be a coroutine, that takes a ndarray with audio as input. The goal of this function is to allow you to play the audio with the - least amount of latency possible. - It is called first in both websockets and webrtc. + least amount of latency possible, because it is called first before further processing. + It can also be set in the `receive` method. Even when passed, the audio content will still be added to the receiving queue. audio_track: The audio track to use for the service, only used by WebRTC. @@ -899,7 +893,7 @@ def __init__( # region Websocket -@experimental_class +@experimental class OpenAIRealtimeWebsocketBase(OpenAIRealtimeBase): """OpenAI Realtime service.""" @@ -963,7 +957,7 @@ async def close_session(self) -> None: self.connected.clear() -@experimental_class +@experimental class OpenAIRealtimeWebsocket(OpenAIRealtimeWebsocketBase, OpenAIConfigBase): """OpenAI Realtime service using WebSocket protocol.""" @@ -986,8 +980,8 @@ def __init__( audio_output_callback: The audio output callback, optional. This should be a coroutine, that takes a ndarray with audio as input. 
The goal of this function is to allow you to play the audio with the - least amount of latency possible. - It is called first in both websockets and webrtc. + least amount of latency possible, because it is called first before further processing. + It can also be set in the `receive` method. Even when passed, the audio content will still be added to the receiving queue. ai_model_id (str | None): OpenAI model name, see diff --git a/python/semantic_kernel/connectors/ai/realtime_client_base.py b/python/semantic_kernel/connectors/ai/realtime_client_base.py index 71c16ac210e6..3992d116a4f7 100644 --- a/python/semantic_kernel/connectors/ai/realtime_client_base.py +++ b/python/semantic_kernel/connectors/ai/realtime_client_base.py @@ -19,10 +19,10 @@ from semantic_kernel.contents.chat_history import ChatHistory from semantic_kernel.contents.realtime_events import RealtimeEvents from semantic_kernel.services.ai_service_client_base import AIServiceClientBase -from semantic_kernel.utils.experimental_decorator import experimental_class +from semantic_kernel.utils.feature_stage_decorator import experimental -@experimental_class +@experimental class RealtimeClientBase(AIServiceClientBase, ABC): """Base class for a realtime client.""" diff --git a/python/semantic_kernel/contents/chat_message_content.py b/python/semantic_kernel/contents/chat_message_content.py index 861a168d142e..829b3f5c6aed 100644 --- a/python/semantic_kernel/contents/chat_message_content.py +++ b/python/semantic_kernel/contents/chat_message_content.py @@ -49,7 +49,7 @@ STREAMING_ANNOTATION_CONTENT_TAG: StreamingAnnotationContent, } -ITEM_TYPES = ( +CMC_ITEM_TYPES = Annotated[ AnnotationContent | BinaryContent | ImageContent @@ -59,8 +59,10 @@ | FileReferenceContent | StreamingAnnotationContent | StreamingFileReferenceContent - | AudioContent -) + | AudioContent, + Field(discriminator=DISCRIMINATOR_FIELD), +] + logger = logging.getLogger(__name__) @@ -89,7 +91,7 @@ class ChatMessageContent(KernelContent): tag: 
ClassVar[str] = CHAT_MESSAGE_CONTENT_TAG role: AuthorRole name: str | None = None - items: list[Annotated[ITEM_TYPES, Field(discriminator=DISCRIMINATOR_FIELD)]] = Field(default_factory=list) + items: list[CMC_ITEM_TYPES] = Field(default_factory=list) encoding: str | None = None finish_reason: FinishReason | None = None @@ -97,7 +99,7 @@ class ChatMessageContent(KernelContent): def __init__( self, role: AuthorRole, - items: list[ITEM_TYPES], + items: list[CMC_ITEM_TYPES], name: str | None = None, inner_content: Any | None = None, encoding: str | None = None, @@ -124,7 +126,7 @@ def __init__( def __init__( # type: ignore self, role: AuthorRole, - items: list[ITEM_TYPES] | None = None, + items: list[CMC_ITEM_TYPES] | None = None, content: str | None = None, inner_content: Any | None = None, name: str | None = None, diff --git a/python/semantic_kernel/contents/function_call_content.py b/python/semantic_kernel/contents/function_call_content.py index a8b2509336e1..863ba6dfbaf7 100644 --- a/python/semantic_kernel/contents/function_call_content.py +++ b/python/semantic_kernel/contents/function_call_content.py @@ -45,7 +45,6 @@ class FunctionCallContent(KernelContent): def __init__( self, - content_type: Literal[ContentTypes.FUNCTION_CALL_CONTENT] = FUNCTION_CALL_CONTENT_TAG, # type: ignore inner_content: Any | None = None, ai_model_id: str | None = None, id: str | None = None, @@ -60,7 +59,6 @@ def __init__( """Create function call content. Args: - content_type: The content type. inner_content (Any | None): The inner content. ai_model_id (str | None): The id of the AI model. id (str | None): The id of the function call. 
@@ -83,7 +81,6 @@ def __init__( else: function_name = name args = { - "content_type": content_type, "inner_content": inner_content, "ai_model_id": ai_model_id, "id": id, diff --git a/python/semantic_kernel/contents/function_result_content.py b/python/semantic_kernel/contents/function_result_content.py index c95460ae8596..b1d36b2bd5f8 100644 --- a/python/semantic_kernel/contents/function_result_content.py +++ b/python/semantic_kernel/contents/function_result_content.py @@ -42,7 +42,6 @@ class FunctionResultContent(KernelContent): def __init__( self, - content_type: Literal[ContentTypes.FUNCTION_RESULT_CONTENT] = FUNCTION_RESULT_CONTENT_TAG, # type: ignore inner_content: Any | None = None, ai_model_id: str | None = None, id: str | None = None, @@ -57,7 +56,6 @@ def __init__( """Create function result content. Args: - content_type: The content type. inner_content (Any | None): The inner content. ai_model_id (str | None): The id of the AI model. id (str | None): The id of the function call that the result relates to. @@ -80,7 +78,6 @@ def __init__( else: function_name = name args = { - "content_type": content_type, "inner_content": inner_content, "ai_model_id": ai_model_id, "id": id, diff --git a/python/semantic_kernel/contents/streaming_chat_message_content.py b/python/semantic_kernel/contents/streaming_chat_message_content.py index 2f7e27d32aaa..88c31ef31473 100644 --- a/python/semantic_kernel/contents/streaming_chat_message_content.py +++ b/python/semantic_kernel/contents/streaming_chat_message_content.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft. All rights reserved. 
from enum import Enum -from typing import Any, Union, overload +from typing import Annotated, Any, overload from xml.etree.ElementTree import Element # nosec from pydantic import Field @@ -9,6 +9,7 @@ from semantic_kernel.contents.audio_content import AudioContent from semantic_kernel.contents.binary_content import BinaryContent from semantic_kernel.contents.chat_message_content import ChatMessageContent +from semantic_kernel.contents.const import DISCRIMINATOR_FIELD from semantic_kernel.contents.function_call_content import FunctionCallContent from semantic_kernel.contents.function_result_content import FunctionResultContent from semantic_kernel.contents.image_content import ImageContent @@ -21,15 +22,16 @@ from semantic_kernel.contents.utils.hashing import make_hashable from semantic_kernel.exceptions import ContentAdditionException -ITEM_TYPES = Union[ - BinaryContent, - AudioContent, - ImageContent, - StreamingTextContent, - FunctionCallContent, - FunctionResultContent, - StreamingFileReferenceContent, - StreamingAnnotationContent, +STREAMING_CMC_ITEM_TYPES = Annotated[ + BinaryContent + | AudioContent + | ImageContent + | FunctionResultContent + | FunctionCallContent + | StreamingTextContent + | StreamingAnnotationContent + | StreamingFileReferenceContent, + Field(discriminator=DISCRIMINATOR_FIELD), ] @@ -68,7 +70,7 @@ class StreamingChatMessageContent(ChatMessageContent, StreamingContentMixin): def __init__( self, role: AuthorRole, - items: list[ITEM_TYPES], + items: list[STREAMING_CMC_ITEM_TYPES], choice_index: int, name: str | None = None, inner_content: Any | None = None, @@ -98,7 +100,7 @@ def __init__( # type: ignore self, role: AuthorRole, choice_index: int, - items: list[ITEM_TYPES] | None = None, + items: list[STREAMING_CMC_ITEM_TYPES] | None = None, content: str | None = None, inner_content: Any | None = None, name: str | None = None, diff --git a/python/tests/unit/connectors/ai/open_ai/services/test_openai_realtime.py 
b/python/tests/unit/connectors/ai/open_ai/services/test_openai_realtime.py index 65dbd6f5f7fe..a341f2bb5c4c 100644 --- a/python/tests/unit/connectors/ai/open_ai/services/test_openai_realtime.py +++ b/python/tests/unit/connectors/ai/open_ai/services/test_openai_realtime.py @@ -1,12 +1,15 @@ # Copyright (c) Microsoft. All rights reserved. import asyncio +from collections.abc import AsyncIterable from typing import Any -from unittest.mock import patch +from unittest.mock import AsyncMock, patch from aiortc import AudioStreamTrack +from openai.resources.beta.realtime.realtime import AsyncRealtimeConnection, AsyncRealtimeConnectionManager from openai.types.beta.realtime import ( ConversationItem, + ConversationItemContent, ConversationItemCreatedEvent, ConversationItemCreateEvent, ConversationItemDeletedEvent, @@ -15,14 +18,18 @@ ConversationItemTruncateEvent, ErrorEvent, InputAudioBufferAppendEvent, + InputAudioBufferClearedEvent, InputAudioBufferClearEvent, InputAudioBufferCommitEvent, + InputAudioBufferCommittedEvent, InputAudioBufferSpeechStartedEvent, + RealtimeResponse, RealtimeServerEvent, ResponseAudioDeltaEvent, ResponseAudioDoneEvent, ResponseAudioTranscriptDeltaEvent, ResponseCancelEvent, + ResponseCreatedEvent, ResponseCreateEvent, ResponseFunctionCallArgumentsDeltaEvent, ResponseFunctionCallArgumentsDoneEvent, @@ -48,6 +55,7 @@ _create_openai_realtime_client_event, update_settings_from_function_call_configuration, ) +from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings from semantic_kernel.contents.audio_content import AudioContent from semantic_kernel.contents.chat_history import ChatHistory from semantic_kernel.contents.chat_message_content import ChatMessageContent @@ -66,28 +74,69 @@ from semantic_kernel.functions import kernel_function from semantic_kernel.functions.kernel_function_metadata import KernelFunctionMetadata - -@fixture -async def websocket_stream(): - await asyncio.sleep(0) - yield 
SessionCreatedEvent(type=ListenEvents.SESSION_CREATED, session=Session(session_id="session_id"), event_id="1") - yield SessionUpdatedEvent(type=ListenEvents.SESSION_UPDATED, session=Session(session_id="session_id"), event_id="2") - yield ConversationItemCreatedEvent( +events = [ + SessionCreatedEvent(type=ListenEvents.SESSION_CREATED, session=Session(id="session_id"), event_id="1"), + SessionUpdatedEvent(type=ListenEvents.SESSION_UPDATED, session=Session(id="session_id"), event_id="2"), + ConversationItemCreatedEvent( type=ListenEvents.CONVERSATION_ITEM_CREATED, item=ConversationItem(id="item_id"), event_id="3", previous_item_id="2", - ) - yield ConversationItemDeletedEvent(type=ListenEvents.CONVERSATION_ITEM_DELETED, item_id="item_id", event_id="4") - yield ConversationItemTruncatedEvent(type=ListenEvents.CONVERSATION_ITEM_TRUNCATED, event_id="5") - yield InputAudioBufferClearEvent(type=ListenEvents.INPUT_AUDIO_BUFFER_CLEARED, event_id="7") - yield InputAudioBufferCommitEvent(type=ListenEvents.INPUT_AUDIO_BUFFER_COMMITTED, event_id="8") - yield ResponseCancelEvent(type=ListenEvents.RESPONSE_CANCELLED, event_id="9") - yield ResponseCreateEvent(type=ListenEvents.RESPONSE_CREATED, event_id="10") - yield ResponseFunctionCallArgumentsDoneEvent(type=ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE, event_id="11") - yield ResponseAudioTranscriptDeltaEvent(type=ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA, event_id="12") - yield ResponseAudioDoneEvent(type=ListenEvents.RESPONSE_AUDIO_DONE, event_id="13") - yield ResponseAudioDeltaEvent(type=ListenEvents.RESPONSE_AUDIO_DELTA, event_id="14") + ), + ConversationItemDeletedEvent(type=ListenEvents.CONVERSATION_ITEM_DELETED, item_id="item_id", event_id="4"), + ConversationItemTruncatedEvent( + type=ListenEvents.CONVERSATION_ITEM_TRUNCATED, event_id="5", audio_end_ms=0, content_index=0, item_id="item_id" + ), + InputAudioBufferClearedEvent(type=ListenEvents.INPUT_AUDIO_BUFFER_CLEARED, event_id="7"), + 
InputAudioBufferCommittedEvent( + type=ListenEvents.INPUT_AUDIO_BUFFER_COMMITTED, + event_id="8", + item_id="item_id", + previous_item_id="previous_item_id", + ), + ResponseCreatedEvent(type=ListenEvents.RESPONSE_CREATED, event_id="10", response=RealtimeResponse()), + ResponseFunctionCallArgumentsDoneEvent( + type=ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE, + event_id="11", + arguments="{}", + call_id="call_id", + item_id="item_id", + output_index=0, + response_id="response_id", + ), + ResponseAudioTranscriptDeltaEvent( + type=ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA, + event_id="12", + content_index=0, + delta="text", + item_id="item_id", + output_index=0, + response_id="response_id", + ), + ResponseAudioDoneEvent( + type=ListenEvents.RESPONSE_AUDIO_DONE, + event_id="13", + item_id="item_id", + output_index=0, + response_id="response_id", + content_index=0, + ), + ResponseAudioDeltaEvent( + type=ListenEvents.RESPONSE_AUDIO_DELTA, + event_id="14", + item_id="item_id", + output_index=0, + response_id="response_id", + content_index=0, + delta="audio data", + ), +] + + +async def websocket_stream(**kwargs) -> AsyncIterable[RealtimeServerEvent]: + for event in events: + yield event + await asyncio.sleep(0) @fixture @@ -109,6 +158,13 @@ def OpenAIWebsocket(openai_unit_test_env): return client +@fixture +def OpenAIWebRTC(openai_unit_test_env, audio_track): + client = OpenAIRealtimeWebRTC(audio_track=audio_track) + client._call_id_to_function_map["call_id"] = "function_name" + return client + + def test_update_settings_from_function_call_config(): config = FunctionCallChoiceConfiguration( available_functions=[ @@ -478,3 +534,123 @@ async def test_send_audio(OpenAIWebsocket): type="input_audio_buffer.append", ) ) + + +@mark.parametrize("client", ["OpenAIWebRTC", "OpenAIWebsocket"]) +async def test_send_session_update(client, OpenAIWebRTC, OpenAIWebsocket): + openai_client = OpenAIWebRTC if client == "OpenAIWebRTC" else OpenAIWebsocket + settings = 
PromptExecutionSettings(ai_model_id="gpt-4o-realtime-preview") + session_event = RealtimeEvent( + service_type=SendEvents.SESSION_UPDATE, + service_event={"settings": settings}, + ) + with patch.object(openai_client, "_send") as mock_send: + await openai_client.send(event=session_event) + mock_send.assert_awaited() + assert len(mock_send.await_args_list) == 1 + mock_send.assert_any_await( + SessionUpdateEvent( + session={"model": "gpt-4o-realtime-preview"}, + type="session.update", + ) + ) + + +@mark.parametrize("client", ["OpenAIWebRTC", "OpenAIWebsocket"]) +async def test_send_conversation_item_create(client, OpenAIWebRTC, OpenAIWebsocket): + openai_client = OpenAIWebRTC if client == "OpenAIWebRTC" else OpenAIWebsocket + event = RealtimeEvent( + service_type=SendEvents.CONVERSATION_ITEM_CREATE, + service_event={ + "item": ChatMessageContent( + role="user", + items=[ + TextContent(text="Hello"), + FunctionCallContent( + function_name="function_name", + plugin_name="plugin", + arguments={"arg1": "value"}, + id="1", + metadata={"call_id": "call_id"}, + ), + FunctionResultContent( + function_name="function_name", + plugin_name="plugin", + result="result", + id="1", + metadata={"call_id": "call_id"}, + ), + ], + ) + }, + ) + + with patch.object(openai_client, "_send") as mock_send: + await openai_client.send(event=event) + mock_send.assert_awaited() + assert len(mock_send.await_args_list) == 3 + mock_send.assert_any_await( + ConversationItemCreateEvent( + item=ConversationItem( + content=[ConversationItemContent(text="Hello", type="input_text")], + role="user", + type="message", + ), + type="conversation.item.create", + ) + ) + mock_send.assert_any_await( + ConversationItemCreateEvent( + item=ConversationItem( + arguments='{"arg1": "value"}', + call_id="call_id", + name="plugin-function_name", + type="function_call", + ), + type="conversation.item.create", + ) + ) + mock_send.assert_any_await( + ConversationItemCreateEvent( + item=ConversationItem( + 
call_id="call_id", + output="result", + type="function_call_output", + ), + type="conversation.item.create", + ) + ) + + +async def test_receive_websocket(OpenAIWebsocket): + connection_mock = AsyncMock(spec=AsyncRealtimeConnection) + connection_mock.recv = websocket_stream + + manager = AsyncMock(spec=AsyncRealtimeConnectionManager) + manager.enter.return_value = connection_mock + + with patch("openai.resources.beta.realtime.realtime.AsyncRealtime.connect") as mock_connect: + mock_connect.return_value = manager + async with OpenAIWebsocket(): + async for msg in OpenAIWebsocket.receive(): + assert isinstance(msg, RealtimeEvent) + + +async def test_receive_webrtc(OpenAIWebRTC): + counter = len(events) + with patch.object(OpenAIRealtimeWebRTC, "create_session"): + recv_task = asyncio.create_task(_stream_to_webrtc(OpenAIWebRTC)) + async with OpenAIWebRTC(): + async for msg in OpenAIWebRTC.receive(): + assert isinstance(msg, RealtimeEvent) + counter -= 1 + if counter == 0: + break + recv_task.cancel() + + +async def _stream_to_webrtc(client: OpenAIRealtimeWebRTC): + async for msg in websocket_stream(): + async for parsed_msg in client._parse_event(msg): + await client._receive_buffer.put(parsed_msg) + await asyncio.sleep(0) From f238ee8d64b4d5c431005271ebdf69d17f549795 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Mon, 3 Mar 2025 10:59:07 +0100 Subject: [PATCH 48/50] updated names of the samples and added readme --- python/samples/concepts/realtime/README.md | 50 +++++++++++++++++++ ...time_chat_with_function_calling_webrtc.py} | 0 ...e_chat_with_function_calling_websocket.py} | 0 ...brtc.py => simple_realtime_chat_webrtc.py} | 0 ...t.py => simple_realtime_chat_websocket.py} | 0 5 files changed, 50 insertions(+) create mode 100644 python/samples/concepts/realtime/README.md rename python/samples/concepts/realtime/{02b-chat_with_function_calling_webrtc.py => realtime_chat_with_function_calling_webrtc.py} (100%) rename 
python/samples/concepts/realtime/{02a-chat_with_function_calling_websocket.py => realtime_chat_with_function_calling_websocket.py} (100%) rename python/samples/concepts/realtime/{01b-chat_with_realtime_webrtc.py => simple_realtime_chat_webrtc.py} (100%) rename python/samples/concepts/realtime/{01a-chat_with_realtime_websocket.py => simple_realtime_chat_websocket.py} (100%) diff --git a/python/samples/concepts/realtime/README.md b/python/samples/concepts/realtime/README.md new file mode 100644 index 000000000000..c39e0448316a --- /dev/null +++ b/python/samples/concepts/realtime/README.md @@ -0,0 +1,50 @@ +# Realtime Multi-modal API Samples + +These samples are more complex then most because of the nature of these API's. They are designed to be run in real-time and require a microphone and speaker to be connected to your computer. + +To run these samples, you will need to have the following setup: + +- Environment variables for OpenAI (websocket or WebRTC), with your key and OPENAI_REALTIME_MODEL_ID set. +- Environemnt variables for Azure (websocket only), set with your endpoint, optionally a key and AZURE_OPENAI_REALTIME_DEPLOYMENT_NAME set. The API version needs to be at least `2024-10-01-preview`. +- To run the sample with a simple version of a class that handles the incoming and outgoing sound you need to install the following packages in your environment: + - semantic-kernel[realtime] + - pyaudio + - sounddevice + - pydub + e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime] + +The samples all run as python scripts, that can either be started directly or through your IDE. + +All demos have a similar output, where the instructions are printed, each new *response item* from the API is put into a new `Mosscap (transcript):` line. The nature of these api's is such that the transcript arrives before the spoken audio, so if you interrupt the audio the transcript will not match the audio. 
+ +The realtime APIs work by sending events from the server to you and sending events back to the server; this is fully asynchronous. The samples show how you can listen to the events being sent by the server; some are handled by the code in the samples, others are not. For instance, one could add a clause in the match case in the receive loop that logs the usage that is part of the `response.done` event. + +For more info on the events, go to our documentation, as well as the documentation of [OpenAI](https://platform.openai.com/docs/guides/realtime) and [Azure](https://learn.microsoft.com/en-us/azure/ai-services/openai/realtime-audio-quickstart?tabs=keyless%2Cmacos&pivots=programming-language-python). + +## Simple chat samples + +### [Simple chat with realtime websocket](./simple_realtime_chat_websocket.py) + +This sample uses the websocket api with Azure OpenAI to run a simple interaction based on voice. If you want to use this sample with OpenAI, just change AzureRealtimeWebsocket into OpenAIRealtimeWebsocket. + +### [Simple chat with realtime WebRTC](./simple_realtime_chat_webrtc.py) + +This sample uses the WebRTC api with OpenAI to run a simple interaction based on voice. Because of the way the WebRTC protocol works, this needs a different player and recorder than the websocket version. + +## Function calling samples + +The following two samples use function calling with the following functions: + +- get_weather: This function will return the weather for a given city; it is randomly generated and not based on any real data. +- get_time: This function will return the current time and date. +- goodbye: This function will end the conversation. + +A line is logged whenever one of these functions is called. + +### [Chat with function calling Websocket](./realtime_chat_with_function_calling_websocket.py) + +This sample uses the websocket api with Azure OpenAI to run the interaction with the voice model, but now with function calling. 
+ +### [Chat with function calling WebRTC](./realtime_chat_with_function_calling_webrtc.py) + +This sample uses the WebRTC api with OpenAI to run the interaction with the voice model, but now with function calling. diff --git a/python/samples/concepts/realtime/02b-chat_with_function_calling_webrtc.py b/python/samples/concepts/realtime/realtime_chat_with_function_calling_webrtc.py similarity index 100% rename from python/samples/concepts/realtime/02b-chat_with_function_calling_webrtc.py rename to python/samples/concepts/realtime/realtime_chat_with_function_calling_webrtc.py diff --git a/python/samples/concepts/realtime/02a-chat_with_function_calling_websocket.py b/python/samples/concepts/realtime/realtime_chat_with_function_calling_websocket.py similarity index 100% rename from python/samples/concepts/realtime/02a-chat_with_function_calling_websocket.py rename to python/samples/concepts/realtime/realtime_chat_with_function_calling_websocket.py diff --git a/python/samples/concepts/realtime/01b-chat_with_realtime_webrtc.py b/python/samples/concepts/realtime/simple_realtime_chat_webrtc.py similarity index 100% rename from python/samples/concepts/realtime/01b-chat_with_realtime_webrtc.py rename to python/samples/concepts/realtime/simple_realtime_chat_webrtc.py diff --git a/python/samples/concepts/realtime/01a-chat_with_realtime_websocket.py b/python/samples/concepts/realtime/simple_realtime_chat_websocket.py similarity index 100% rename from python/samples/concepts/realtime/01a-chat_with_realtime_websocket.py rename to python/samples/concepts/realtime/simple_realtime_chat_websocket.py From 83dbe686070869dc1f367f286eb871ec1846d150 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Mon, 3 Mar 2025 11:43:29 +0100 Subject: [PATCH 49/50] typo --- python/samples/concepts/realtime/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/samples/concepts/realtime/README.md b/python/samples/concepts/realtime/README.md index c39e0448316a..a2dbb5d349f5 
100644 --- a/python/samples/concepts/realtime/README.md +++ b/python/samples/concepts/realtime/README.md @@ -5,7 +5,7 @@ These samples are more complex then most because of the nature of these API's. T To run these samples, you will need to have the following setup: - Environment variables for OpenAI (websocket or WebRTC), with your key and OPENAI_REALTIME_MODEL_ID set. -- Environemnt variables for Azure (websocket only), set with your endpoint, optionally a key and AZURE_OPENAI_REALTIME_DEPLOYMENT_NAME set. The API version needs to be at least `2024-10-01-preview`. +- Environment variables for Azure (websocket only), set with your endpoint, optionally a key and AZURE_OPENAI_REALTIME_DEPLOYMENT_NAME set. The API version needs to be at least `2024-10-01-preview`. - To run the sample with a simple version of a class that handles the incoming and outgoing sound you need to install the following packages in your environment: - semantic-kernel[realtime] - pyaudio From ec90ca7970d76a7a96ab974350be14098b31b958 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Mon, 3 Mar 2025 16:54:02 +0100 Subject: [PATCH 50/50] updated sample instructions --- ...ltime_chat_with_function_calling_webrtc.py | 20 +++++++++++-------- ...me_chat_with_function_calling_websocket.py | 20 +++++++++++-------- .../realtime/simple_realtime_chat_webrtc.py | 20 +++++++++++-------- .../simple_realtime_chat_websocket.py | 20 +++++++++++-------- 4 files changed, 48 insertions(+), 32 deletions(-) diff --git a/python/samples/concepts/realtime/realtime_chat_with_function_calling_webrtc.py b/python/samples/concepts/realtime/realtime_chat_with_function_calling_webrtc.py index b03908e721ca..2131807a0eae 100644 --- a/python/samples/concepts/realtime/realtime_chat_with_function_calling_webrtc.py +++ b/python/samples/concepts/realtime/realtime_chat_with_function_calling_webrtc.py @@ -24,14 +24,18 @@ logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) -# This simple sample demonstrates how to use 
the OpenAI Realtime API to create -# a chat bot that can listen and respond directly through audio. -# It requires installing: -# - semantic-kernel[realtime] -# - pyaudio -# - sounddevice -# - pydub -# e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime] +""" +This simple sample demonstrates how to use the OpenAI Realtime API to create +a chat bot that can listen and respond directly through audio. +It requires installing: +- semantic-kernel[realtime] +- pyaudio +- sounddevice +- pydub +e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime] + +For more details of the exact setup, see the README.md in the realtime folder. +""" # The characterics of your speaker and microphone are a big factor in a smooth conversation # so you may need to try out different devices for each. diff --git a/python/samples/concepts/realtime/realtime_chat_with_function_calling_websocket.py b/python/samples/concepts/realtime/realtime_chat_with_function_calling_websocket.py index 812502c7a5b5..eaa83f250c54 100644 --- a/python/samples/concepts/realtime/realtime_chat_with_function_calling_websocket.py +++ b/python/samples/concepts/realtime/realtime_chat_with_function_calling_websocket.py @@ -21,14 +21,18 @@ logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) -# This simple sample demonstrates how to use the OpenAI Realtime API to create -# a chat bot that can listen and respond directly through audio. -# It requires installing: -# - semantic-kernel[realtime] -# - pyaudio -# - sounddevice -# - pydub -# e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime] +""" +This simple sample demonstrates how to use the OpenAI Realtime API to create +a chat bot that can listen and respond directly through audio. +It requires installing: +- semantic-kernel[realtime] +- pyaudio +- sounddevice +- pydub +e.g. 
pip install pyaudio sounddevice pydub semantic_kernel[realtime] + +For more details of the exact setup, see the README.md in the realtime folder. +""" @kernel_function diff --git a/python/samples/concepts/realtime/simple_realtime_chat_webrtc.py b/python/samples/concepts/realtime/simple_realtime_chat_webrtc.py index 516c40a22b8c..0b9c6a7e9485 100644 --- a/python/samples/concepts/realtime/simple_realtime_chat_webrtc.py +++ b/python/samples/concepts/realtime/simple_realtime_chat_webrtc.py @@ -16,14 +16,18 @@ logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) -# This simple sample demonstrates how to use the OpenAI Realtime API to create -# a chat bot that can listen and respond directly through audio. -# It requires installing: -# - semantic-kernel[realtime] -# - pyaudio -# - sounddevice -# - pydub -# e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime] +""" +This simple sample demonstrates how to use the OpenAI Realtime API to create +a chat bot that can listen and respond directly through audio. +It requires installing: +- semantic-kernel[realtime] +- pyaudio +- sounddevice +- pydub +e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime] + +For more details of the exact setup, see the README.md in the realtime folder. +""" # The characteristics of your speaker and microphone are a big factor in a smooth conversation # so you may need to try out different devices for each. 
diff --git a/python/samples/concepts/realtime/simple_realtime_chat_websocket.py b/python/samples/concepts/realtime/simple_realtime_chat_websocket.py index e81f1ff5268c..4a374c46518f 100644 --- a/python/samples/concepts/realtime/simple_realtime_chat_websocket.py +++ b/python/samples/concepts/realtime/simple_realtime_chat_websocket.py @@ -17,14 +17,18 @@ logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) -# This simple sample demonstrates how to use the OpenAI Realtime API to create -# a chat bot that can listen and respond directly through audio. -# It requires installing: -# - semantic-kernel[realtime] -# - pyaudio -# - sounddevice -# - pydub -# e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime] +""" +This simple sample demonstrates how to use the OpenAI Realtime API to create +a chat bot that can listen and respond directly through audio. +It requires installing: +- semantic-kernel[realtime] +- pyaudio +- sounddevice +- pydub +e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime] + +For more details of the exact setup, see the README.md in the realtime folder. +""" # The characterics of your speaker and microphone are a big factor in a smooth conversation # so you may need to try out different devices for each.