diff --git a/python/.cspell.json b/python/.cspell.json
index 1a99593354f1..485789ae22a1 100644
--- a/python/.cspell.json
+++ b/python/.cspell.json
@@ -47,6 +47,7 @@
     "logprobs",
     "mistralai",
     "mongocluster",
+    "nd",
     "ndarray",
     "nopep",
     "NOSQL",
@@ -73,4 +74,4 @@
     "vertexai",
     "Weaviate"
   ]
-}
+}
\ No newline at end of file
diff --git a/python/.vscode/launch.json b/python/.vscode/launch.json
index 831aaf5149bc..80145e18a817 100644
--- a/python/.vscode/launch.json
+++ b/python/.vscode/launch.json
@@ -10,7 +10,7 @@
       "request": "launch",
       "program": "${file}",
       "console": "integratedTerminal",
-      "justMyCode": true
+      "justMyCode": false
     },
     {
       "name": "Python FastAPI app with Dapr",
diff --git a/python/pyproject.toml b/python/pyproject.toml
index a7d7277eef61..b6785a40dfb8 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -128,6 +128,10 @@ dapr = [
     "dapr-ext-fastapi>=1.14.0",
     "flask-dapr>=1.14.0"
 ]
+realtime = [
+    "websockets >= 13, < 15",
+    "aiortc>=1.9.0",
+]
 
 [tool.uv]
 prerelease = "if-necessary-or-explicit"
@@ -225,5 +229,3 @@ name = "semantic_kernel"
 
 [build-system]
 requires = ["flit-core >= 3.9,<4.0"]
 build-backend = "flit_core.buildapi"
-
-
diff --git a/python/samples/concepts/realtime/README.md b/python/samples/concepts/realtime/README.md
new file mode 100644
index 000000000000..a2dbb5d349f5
--- /dev/null
+++ b/python/samples/concepts/realtime/README.md
@@ -0,0 +1,50 @@
+# Realtime Multi-modal API Samples
+
+These samples are more complex than most because of the nature of these APIs. They are designed to run in real time and require a microphone and speaker connected to your computer.
+
+To run these samples, you will need the following setup:
+
+- Environment variables for OpenAI (WebSocket or WebRTC), with your API key and OPENAI_REALTIME_MODEL_ID set.
+- Environment variables for Azure (WebSocket only), with your endpoint, AZURE_OPENAI_REALTIME_DEPLOYMENT_NAME, and optionally an API key set. The API version needs to be at least `2024-10-01-preview`.
+- To run the samples with a simple version of a class that handles the incoming and outgoing sound, you need to install the following packages in your environment:
+  - semantic-kernel[realtime]
+  - pyaudio
+  - sounddevice
+  - pydub
+  e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime]
+
+The samples all run as Python scripts that can be started directly or through your IDE.
+
+All demos produce similar output: the instructions are printed, and each new *response item* from the API is placed on a new `Mosscap (transcript):` line. These APIs deliver the transcript before the spoken audio, so if you interrupt the audio the transcript will not match what was actually played.
+
+The realtime APIs work by the server sending events to you and you sending events back to the server; this is fully asynchronous. The samples show how to listen to the events sent by the server; some are handled by the sample code, others are not. For instance, you could add a clause to the match statement in the receive loop that logs the usage information that is part of the `response.done` event, as sketched below.
+
+For more info on the events, see our documentation, as well as the documentation of [OpenAI](https://platform.openai.com/docs/guides/realtime) and [Azure](https://learn.microsoft.com/en-us/azure/ai-services/openai/realtime-audio-quickstart?tabs=keyless%2Cmacos&pivots=programming-language-python).
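+A minimal sketch of such a clause, for the receive loop used in the function calling samples below (this assumes the `response.done` payload exposes its token counts as `response.usage`; verify against the event you actually receive):
+
+```python
+async for event in realtime_client.receive(audio_output_callback=audio_player.client_callback):
+    match event.service_type:
+        case ListenEvents.RESPONSE_DONE:
+            # log the usage block the service attaches to response.done
+            logger.info(event.service_event.response.usage)
+```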
+
+## Simple chat samples
+
+### [Simple chat with realtime websocket](./simple_realtime_chat_websocket.py)
+
+This sample uses the WebSocket API with Azure OpenAI to run a simple interaction based on voice. If you want to use this sample with OpenAI, just change AzureRealtimeWebsocket into OpenAIRealtimeWebsocket.
+
+### [Simple chat with realtime WebRTC](./simple_realtime_chat_webrtc.py)
+
+This sample uses the WebRTC API with OpenAI to run a simple interaction based on voice. Because of the way the WebRTC protocol works, this needs a different player and recorder than the websocket version.
+
+## Function calling samples
+
+The following two samples use function calling with the following functions:
+
+- get_weather: This function returns the weather for a given city; it is randomly generated and not based on any real data.
+- get_date_time: This function returns the current time and date.
+- goodbye: This function ends the conversation.
+
+A line is logged whenever one of these functions is called.
+
+### [Chat with function calling Websocket](./realtime_chat_with_function_calling_websocket.py)
+
+This sample uses the WebSocket API with Azure OpenAI to run the interaction with the voice model, but now with function calling.
+
+### [Chat with function calling WebRTC](./realtime_chat_with_function_calling_webrtc.py)
+
+This sample uses the WebRTC API with OpenAI to run the interaction with the voice model, but now with function calling.
diff --git a/python/samples/concepts/realtime/realtime_chat_with_function_calling_webrtc.py b/python/samples/concepts/realtime/realtime_chat_with_function_calling_webrtc.py
new file mode 100644
index 000000000000..2131807a0eae
--- /dev/null
+++ b/python/samples/concepts/realtime/realtime_chat_with_function_calling_webrtc.py
@@ -0,0 +1,143 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+import asyncio
+import logging
+from datetime import datetime
+from random import randint
+
+from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC, check_audio_devices
+from semantic_kernel import Kernel
+from semantic_kernel.connectors.ai import FunctionChoiceBehavior
+from semantic_kernel.connectors.ai.open_ai import (
+    ListenEvents,
+    OpenAIRealtimeExecutionSettings,
+    OpenAIRealtimeWebRTC,
+    TurnDetection,
+)
+from semantic_kernel.contents import ChatHistory
+from semantic_kernel.contents.realtime_events import RealtimeTextEvent
+from semantic_kernel.functions import kernel_function
+
+logging.basicConfig(level=logging.WARNING)
+utils_log = logging.getLogger("samples.concepts.realtime.utils")
+utils_log.setLevel(logging.INFO)
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+"""
+This simple sample demonstrates how to use the OpenAI Realtime API to create
+a chat bot that can listen and respond directly through audio.
+It requires installing:
+- semantic-kernel[realtime]
+- pyaudio
+- sounddevice
+- pydub
+e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime]
+
+For more details of the exact setup, see the README.md in the realtime folder.
+"""
+
+# The characteristics of your speaker and microphone are a big factor in a smooth conversation,
+# so you may need to try out different devices for each.
+# You can also play around with the turn_detection settings to get the best results.
+# Device ids can be set in the AudioRecorderWebRTC and AudioPlayerWebRTC classes,
+# so you may need to adjust these for your system.
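+# For example (hypothetical device indices, pick yours from the check_audio_devices() output):
+#   audio_player = AudioPlayerWebRTC(device=3)
+#   realtime_client = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC(device=1))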
+# You can disable the check for available devices by commenting out the line below.
+check_audio_devices()
+
+
+@kernel_function
+def get_weather(location: str) -> str:
+    """Get the weather for a location."""
+    weather_conditions = ("sunny", "hot", "cloudy", "raining", "freezing", "snowing")
+    weather = weather_conditions[randint(0, len(weather_conditions) - 1)]  # nosec
+    logger.info(f"@ Getting weather for {location}: {weather}")
+    return f"The weather in {location} is {weather}."
+
+
+@kernel_function
+def get_date_time() -> str:
+    """Get the current date and time."""
+    logger.info("@ Getting current datetime")
+    return f"The current date and time is {datetime.now().isoformat()}."
+
+
+@kernel_function
+def goodbye():
+    """When the user is done, say goodbye and then call this function."""
+    logger.info("@ Goodbye has been called!")
+    raise KeyboardInterrupt
+
+
+async def main() -> None:
+    print_transcript = True
+    # create the Kernel and add a simple function for function calling
+    kernel = Kernel()
+    kernel.add_functions(plugin_name="helpers", functions=[goodbye, get_weather, get_date_time])
+
+    # create the audio player and audio track;
+    # both take a device_id parameter, which is the index of the device to use; if None, the default device is used
+    audio_player = AudioPlayerWebRTC()
+    # create the realtime client and optionally add the audio output function;
+    # this is optional and can also be passed in the receive method
+    realtime_client = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC())
+
+    # Create the settings for the session.
+    # The realtime API does not use a system message, but takes instructions as a parameter for a session.
+    # Another important setting is the tuning of the server_vad turn detection:
+    # if this is turned off (by setting turn_detection=None), you will have to send
+    # the "input_audio_buffer.commit" and "response.create" events to the realtime API
+    # to signal the end of the user's turn and start the response.
+    # Manual VAD is not part of this sample.
+    # For more info: https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-turn_detection
+    settings = OpenAIRealtimeExecutionSettings(
+        instructions="""
+    You are a chat bot. Your name is Mosscap and
+    you have one goal: figure out what people need.
+    Your full name, should you need to know it, is
+    Splendid Speckled Mosscap. You communicate
+    effectively, but you tend to answer with long
+    flowery prose.
+    """,
+        voice="alloy",
+        turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8),
+        function_choice_behavior=FunctionChoiceBehavior.Auto(),
+    )
+    # and we can add a chat history to the conversation after starting it
+    chat_history = ChatHistory()
+    chat_history.add_user_message("Hi there, who are you?")
+    chat_history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.")
+
+    # the context manager calls the create_session method on the client and starts listening to the audio stream
+    async with (
+        audio_player,
+        realtime_client(
+            settings=settings,
+            chat_history=chat_history,
+            kernel=kernel,
+            create_response=True,
+        ),
+    ):
+        async for event in realtime_client.receive(audio_output_callback=audio_player.client_callback):
+            match event:
+                case RealtimeTextEvent():
+                    if print_transcript:
+                        print(event.text.text, end="")
+                case _:
+                    # OpenAI specific events
+                    match event.service_type:
+                        case ListenEvents.RESPONSE_CREATED:
+                            if print_transcript:
+                                print("\nMosscap (transcript): ", end="")
+                        case ListenEvents.ERROR:
+                            logger.error(event.service_event)
+
+
+if __name__ == "__main__":
+    print(
+        "Instructions: The model will start speaking immediately; "
+        "this can be turned off by removing `create_response=True` above. "
+        "The model will detect when you stop and automatically generate a response. "
+        "Press ctrl + c to stop the program."
+    )
+    asyncio.run(main())
diff --git a/python/samples/concepts/realtime/realtime_chat_with_function_calling_websocket.py b/python/samples/concepts/realtime/realtime_chat_with_function_calling_websocket.py
new file mode 100644
index 000000000000..eaa83f250c54
--- /dev/null
+++ b/python/samples/concepts/realtime/realtime_chat_with_function_calling_websocket.py
@@ -0,0 +1,141 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+import asyncio
+import logging
+from datetime import datetime
+from random import randint
+
+from samples.concepts.realtime.utils import AudioPlayerWebsocket, AudioRecorderWebsocket
+from semantic_kernel import Kernel
+from semantic_kernel.connectors.ai import FunctionChoiceBehavior
+from semantic_kernel.connectors.ai.open_ai import (
+    AzureRealtimeExecutionSettings,
+    AzureRealtimeWebsocket,
+    ListenEvents,
+    TurnDetection,
+)
+from semantic_kernel.contents import ChatHistory
+from semantic_kernel.contents.realtime_events import RealtimeTextEvent
+from semantic_kernel.functions import kernel_function
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+
+"""
+This simple sample demonstrates how to use the OpenAI Realtime API to create
+a chat bot that can listen and respond directly through audio.
+It requires installing:
+- semantic-kernel[realtime]
+- pyaudio
+- sounddevice
+- pydub
+e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime]
+
+For more details of the exact setup, see the README.md in the realtime folder.
+"""
+
+
+@kernel_function
+def get_weather(location: str) -> str:
+    """Get the weather for a location."""
+    weather_conditions = ("sunny", "hot", "cloudy", "raining", "freezing", "snowing")
+    weather = weather_conditions[randint(0, len(weather_conditions) - 1)]  # nosec
+    logger.info(f"@ Getting weather for {location}: {weather}")
+    return f"The weather in {location} is {weather}."
+
+
+@kernel_function
+def get_date_time() -> str:
+    """Get the current date and time."""
+    logger.info("@ Getting current datetime")
+    return f"The current date and time is {datetime.now().isoformat()}."
+
+
+@kernel_function
+def goodbye():
+    """When the user is done, say goodbye and then call this function."""
+    logger.info("@ Goodbye has been called!")
+    raise KeyboardInterrupt
+
+
+async def main() -> None:
+    print_transcript = True
+    # create the Kernel and add a simple function for function calling
+    kernel = Kernel()
+    kernel.add_functions(plugin_name="helpers", functions=[goodbye, get_weather, get_date_time])
+
+    # create the realtime client, in this case the Azure WebSocket client; there are also OpenAI WebSocket and WebRTC clients
+    # See realtime_chat_with_function_calling_webrtc.py for an example of the WebRTC client
+    realtime_client = AzureRealtimeWebsocket()
+    # create the audio player and audio track;
+    # both take a device_id parameter, which is the index of the device to use; if None, the default device is used
+    audio_player = AudioPlayerWebsocket()
+    audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client)
+
+    # Create the settings for the session.
+    # The realtime API does not use a system message, but takes instructions as a parameter for a session.
+    # Another important setting is the tuning of the server_vad turn detection:
+    # if this is turned off (by setting turn_detection=None), you will have to send
+    # the "input_audio_buffer.commit" and "response.create" events to the realtime API
+    # to signal the end of the user's turn and start the response.
+    # Manual VAD is not part of this sample.
+    # For more info: https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-turn_detection
+    settings = AzureRealtimeExecutionSettings(
+        instructions="""
+    You are a chat bot. Your name is Mosscap and
+    you have one goal: figure out what people need.
+    Your full name, should you need to know it, is
+    Splendid Speckled Mosscap. You communicate
+    effectively, but you tend to answer with long
+    flowery prose.
+    """,
+        # see https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-voice for the full list of voices  # noqa: E501
+        voice="alloy",
+        turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8),
+        function_choice_behavior=FunctionChoiceBehavior.Auto(),
+    )
+    # and we can add a chat history to the conversation to seed it
+    chat_history = ChatHistory()
+    chat_history.add_user_message("Hi there, I'm based in Amsterdam.")
+    chat_history.add_assistant_message(
+        "I am Mosscap, a chat bot. I'm trying to figure out what people need, "
+        "I can tell you what the weather is or the time."
+    )
+
+    # the context manager calls the create_session method on the client and starts listening to the audio stream
+    async with (
+        audio_player,
+        audio_recorder,
+        realtime_client(
+            settings=settings,
+            chat_history=chat_history,
+            kernel=kernel,
+            create_response=True,
+        ),
+    ):
+        # the audio_output_callback can be added here or in the client constructor;
+        # using it here gives the smoothest experience
+        async for event in realtime_client.receive(audio_output_callback=audio_player.client_callback):
+            match event:
+                case RealtimeTextEvent():
+                    if print_transcript:
+                        print(event.text.text, end="")
+                case _:
+                    # OpenAI specific events
+                    match event.service_type:
+                        case ListenEvents.RESPONSE_CREATED:
+                            if print_transcript:
+                                print("\nMosscap (transcript): ", end="")
+                        case ListenEvents.ERROR:
+                            print(event.service_event)
+                            logger.error(event.service_event)
+
+
+if __name__ == "__main__":
+    print(
+        "Instructions: The model will start speaking immediately; "
+        "this can be turned off by removing `create_response=True` above. "
+        "The model will detect when you stop and automatically generate a response. "
+        "Press ctrl + c to stop the program."
+    )
+    asyncio.run(main())
diff --git a/python/samples/concepts/realtime/simple_realtime_chat_webrtc.py b/python/samples/concepts/realtime/simple_realtime_chat_webrtc.py
new file mode 100644
index 000000000000..0b9c6a7e9485
--- /dev/null
+++ b/python/samples/concepts/realtime/simple_realtime_chat_webrtc.py
@@ -0,0 +1,84 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+import asyncio
+import logging
+
+from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC, check_audio_devices
+from semantic_kernel.connectors.ai.open_ai import (
+    ListenEvents,
+    OpenAIRealtimeExecutionSettings,
+    OpenAIRealtimeWebRTC,
+)
+
+logging.basicConfig(level=logging.WARNING)
+utils_log = logging.getLogger("samples.concepts.realtime.utils")
+utils_log.setLevel(logging.INFO)
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+"""
+This simple sample demonstrates how to use the OpenAI Realtime API to create
+a chat bot that can listen and respond directly through audio.
+It requires installing:
+- semantic-kernel[realtime]
+- pyaudio
+- sounddevice
+- pydub
+e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime]
+
+For more details of the exact setup, see the README.md in the realtime folder.
+"""
+
+# The characteristics of your speaker and microphone are a big factor in a smooth conversation,
+# so you may need to try out different devices for each.
+# You can also play around with the turn_detection settings to get the best results.
+# Device ids can be set in the AudioRecorderWebRTC and AudioPlayerWebRTC classes,
+# so you may need to adjust these for your system.
+# You can disable the check for available devices by commenting out the line below.
+check_audio_devices()
+
+
+async def main() -> None:
+    # create the realtime client; this sample uses the WebRTC protocol,
+    # there is also a websocket client, and both behave the same way,
+    # even though the underlying protocols are quite different
+    realtime_client = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC())
+    # Create the settings for the session
+    settings = OpenAIRealtimeExecutionSettings(
+        instructions="""
+    You are a chat bot. Your name is Mosscap and
+    you have one goal: figure out what people need.
+    Your full name, should you need to know it, is
+    Splendid Speckled Mosscap. You communicate
+    effectively, but you tend to answer with long
+    flowery prose.
+    """,
+        # there are different voices to choose from; since that list is bound to change, it is not checked beforehand,
+        # see https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-voice
+        # for more details.
+        voice="alloy",
+    )
+    audio_player = AudioPlayerWebRTC()
+    # the context manager calls the create_session method on the client and starts listening to the audio stream
+    async with audio_player, realtime_client(settings=settings, create_response=True):
+        async for event in realtime_client.receive(audio_output_callback=audio_player.client_callback):
+            match event.event_type:
+                case "text":
+                    # the model returns both audio and a transcript of the audio, which we print here
+                    print(event.text.text, end="")
+                case "service":
+                    # OpenAI specific events
+                    if event.service_type == ListenEvents.SESSION_UPDATED:
+                        print("Session updated")
+                    if event.service_type == ListenEvents.RESPONSE_CREATED:
+                        print("\nMosscap (transcript): ", end="")
+
+
+if __name__ == "__main__":
+    print(
+        "Instructions: The model will start speaking immediately; "
+        "this can be turned off by removing `create_response=True` above. "
+        "The model will detect when you stop and automatically generate a response. "
+        "Press ctrl + c to stop the program."
+    )
+    asyncio.run(main())
diff --git a/python/samples/concepts/realtime/simple_realtime_chat_websocket.py b/python/samples/concepts/realtime/simple_realtime_chat_websocket.py
new file mode 100644
index 000000000000..4a374c46518f
--- /dev/null
+++ b/python/samples/concepts/realtime/simple_realtime_chat_websocket.py
@@ -0,0 +1,90 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+import asyncio
+import logging
+
+from samples.concepts.realtime.utils import AudioPlayerWebsocket, AudioRecorderWebsocket, check_audio_devices
+from semantic_kernel.connectors.ai.open_ai import (
+    AzureRealtimeExecutionSettings,
+    AzureRealtimeWebsocket,
+    ListenEvents,
+)
+from semantic_kernel.contents import RealtimeAudioEvent, RealtimeTextEvent
+
+logging.basicConfig(level=logging.WARNING)
+utils_log = logging.getLogger("samples.concepts.realtime.utils")
+utils_log.setLevel(logging.INFO)
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+"""
+This simple sample demonstrates how to use the OpenAI Realtime API to create
+a chat bot that can listen and respond directly through audio.
+It requires installing:
+- semantic-kernel[realtime]
+- pyaudio
+- sounddevice
+- pydub
+e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime]
+
+For more details of the exact setup, see the README.md in the realtime folder.
+"""
+
+# The characteristics of your speaker and microphone are a big factor in a smooth conversation,
+# so you may need to try out different devices for each.
+# You can also play around with the turn_detection settings to get the best results.
+# Device ids can be set in the AudioRecorderWebsocket and AudioPlayerWebsocket classes,
+# so you may need to adjust these for your system.
+# You can disable the check for available devices by commenting out the line below.
+check_audio_devices()
+
+
+async def main() -> None:
+    # create the realtime client; this sample uses the websocket protocol,
+    # there is also a WebRTC client, and both behave the same way,
+    # even though the underlying protocols are quite different
+    realtime_client = AzureRealtimeWebsocket()
+    audio_player = AudioPlayerWebsocket()
+    audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client)
+    # Create the settings for the session
+    settings = AzureRealtimeExecutionSettings(
+        instructions="""
+    You are a chat bot. Your name is Mosscap and
+    you have one goal: figure out what people need.
+    Your full name, should you need to know it, is
+    Splendid Speckled Mosscap. You communicate
+    effectively, but you tend to answer with long
+    flowery prose.
+    """,
+        # there are different voices to choose from; since that list is bound to change, it is not checked beforehand,
+        # see https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-voice
+        # for more details.
+        voice="shimmer",
+    )
+    # the context manager calls the create_session method on the client and starts listening to the audio stream
+    async with audio_player, audio_recorder, realtime_client(settings=settings, create_response=True):
+        async for event in realtime_client.receive():
+            match event:
+                # this can be used as an alternative to the callback function used in other samples;
+                # the callback is faster and smoother
+                case RealtimeAudioEvent():
+                    await audio_player.add_audio(event.audio)
+                case RealtimeTextEvent():
+                    # the model returns both audio and a transcript of the audio, which we print here
+                    print(event.text.text, end="")
+                case _:
+                    # OpenAI specific events
+                    if event.service_type == ListenEvents.SESSION_UPDATED:
+                        print("Session updated")
+                    if event.service_type == ListenEvents.RESPONSE_CREATED:
+                        print("\nMosscap (transcript): ", end="")
+
+
+if __name__ == "__main__":
+    print(
+        "Instructions: The model will start speaking immediately; "
+        "this can be turned off by removing `create_response=True` above. "
+        "The model will detect when you stop and automatically generate a response. "
+        "Press ctrl + c to stop the program."
+    )
+    asyncio.run(main())
diff --git a/python/samples/concepts/realtime/utils.py b/python/samples/concepts/realtime/utils.py
new file mode 100644
index 000000000000..b3056991d626
--- /dev/null
+++ b/python/samples/concepts/realtime/utils.py
@@ -0,0 +1,489 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+import asyncio
+import base64
+import logging
+import threading
+from typing import Any, ClassVar, Final, cast
+
+import numpy as np
+import numpy.typing as npt
+import sounddevice as sd
+from aiortc.mediastreams import MediaStreamError, MediaStreamTrack
+from av.audio.frame import AudioFrame
+from av.frame import Frame
+from pydantic import BaseModel, ConfigDict, PrivateAttr
+from sounddevice import InputStream, OutputStream
+
+from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase
+from semantic_kernel.contents import AudioContent
+from semantic_kernel.contents.realtime_events import RealtimeAudioEvent
+
+logger = logging.getLogger(__name__)
+
+SAMPLE_RATE: Final[int] = 24000
+RECORDER_CHANNELS: Final[int] = 1
+PLAYER_CHANNELS: Final[int] = 1
+FRAME_DURATION: Final[int] = 100
+SAMPLE_RATE_WEBRTC: Final[int] = 48000
+RECORDER_CHANNELS_WEBRTC: Final[int] = 1
+PLAYER_CHANNELS_WEBRTC: Final[int] = 2
+FRAME_DURATION_WEBRTC: Final[int] = 20
+DTYPE: Final[npt.DTypeLike] = np.int16
+
+
+def check_audio_devices():
+    logger.info(sd.query_devices())
+
+
+# region: Recorders
+
+
+class AudioRecorderWebRTC(BaseModel, MediaStreamTrack):
+    """A simple class that implements the WebRTC MediaStreamTrack for audio from sounddevice.
+
+    This class is meant as a demo sample and is not meant for production use.
+    """
+
+    model_config = ConfigDict(populate_by_name=True, arbitrary_types_allowed=True, validate_assignment=True)
+
+    kind: ClassVar[str] = "audio"
+    device: str | int | None = None
+    sample_rate: int
+    channels: int
+    frame_duration: int
+    dtype: npt.DTypeLike = DTYPE
+    frame_size: int = 0
+    _queue: asyncio.Queue[Frame] = PrivateAttr(default_factory=asyncio.Queue)
+    _is_recording: bool = False
+    _stream: InputStream | None = None
+    _recording_task: asyncio.Task | None = None
+    _loop: asyncio.AbstractEventLoop | None = None
+    _pts: int = 0
+
+    def __init__(
+        self,
+        *,
+        device: str | int | None = None,
+        sample_rate: int = SAMPLE_RATE_WEBRTC,
+        channels: int = RECORDER_CHANNELS_WEBRTC,
+        frame_duration: int = FRAME_DURATION_WEBRTC,
+        dtype: npt.DTypeLike = DTYPE,
+    ):
+        """A simple class that implements the WebRTC MediaStreamTrack for audio from sounddevice.
+
+        Make sure the device is set to the correct device for your system.
+
+        Args:
+            device: The device id to use for recording audio.
+            sample_rate: The sample rate for the audio.
+            channels: The number of channels for the audio.
+            frame_duration: The duration of each audio frame in milliseconds.
+            dtype: The data type for the audio.
+        """
+        super().__init__(**{
+            "device": device,
+            "sample_rate": sample_rate,
+            "channels": channels,
+            "frame_duration": frame_duration,
+            "dtype": dtype,
+            "frame_size": int(sample_rate * frame_duration / 1000),
+        })
+        MediaStreamTrack.__init__(self)
+
+    async def recv(self) -> Frame:
+        """Receive the next frame of audio data."""
+        if not self._recording_task:
+            self._recording_task = asyncio.create_task(self.start_recording())
+
+        try:
+            frame = await self._queue.get()
+            self._queue.task_done()
+            return frame
+        except Exception as e:
+            logger.error(f"Error receiving audio frame: {e!s}")
+            raise MediaStreamError("Failed to receive audio frame")
+
+    def _sounddevice_callback(self, indata: np.ndarray, frames: int, time: Any, status: Any) -> None:
+        if status:
+            logger.warning(f"Audio input status: {status}")
+        if self._loop and self._loop.is_running():
+            asyncio.run_coroutine_threadsafe(self._queue.put(self._create_frame(indata)), self._loop)
+
+    def _create_frame(self, indata: np.ndarray) -> Frame:
+        audio_data = indata.copy()
+        if audio_data.dtype != self.dtype:
+            audio_data = (
+                (audio_data * 32767).astype(self.dtype) if self.dtype == np.int16 else audio_data.astype(self.dtype)
+            )
+        frame = AudioFrame(
+            format="s16",
+            layout="mono",
+            samples=len(audio_data),
+        )
+        frame.rate = self.sample_rate
+        frame.pts = self._pts
+        frame.planes[0].update(audio_data.tobytes())
+        self._pts += len(audio_data)
+        return frame
+
+    async def start_recording(self):
+        """Start recording audio from the input device."""
+        if self._is_recording:
+            return
+
+        self._is_recording = True
+        self._loop = asyncio.get_running_loop()
+        self._pts = 0  # Reset pts when starting recording
+
+        try:
+            self._stream = InputStream(
+                device=self.device,
+                channels=self.channels,
+                samplerate=self.sample_rate,
+                dtype=self.dtype,
+                blocksize=self.frame_size,
+                callback=self._sounddevice_callback,
+            )
+            self._stream.start()
+
+            while self._is_recording:
+                await asyncio.sleep(0.1)
+        except asyncio.CancelledError:
+            logger.debug("Recording task was stopped.")
+        except KeyboardInterrupt:
+            logger.debug("Recording task was stopped.")
+        except Exception as e:
+            logger.error(f"Error in audio recording: {e!s}")
+            raise
+        finally:
+            self._is_recording = False
+
+
+class AudioRecorderWebsocket(BaseModel):
+    """A simple class that implements a recorder based on sounddevice for use with websockets.
+
+    This class is meant as a demo sample and is not meant for production use.
+    """
+
+    model_config = ConfigDict(populate_by_name=True, arbitrary_types_allowed=True, validate_assignment=True)
+
+    realtime_client: RealtimeClientBase
+    device: str | int | None = None
+    sample_rate: int
+    channels: int
+    frame_duration: int
+    dtype: npt.DTypeLike = DTYPE
+    frame_size: int = 0
+    _stream: InputStream | None = None
+    _pts: int = 0
+    _stream_task: asyncio.Task | None = None
+
+    def __init__(
+        self,
+        *,
+        realtime_client: RealtimeClientBase,
+        device: str | int | None = None,
+        sample_rate: int = SAMPLE_RATE,
+        channels: int = RECORDER_CHANNELS,
+        frame_duration: int = FRAME_DURATION,
+        dtype: npt.DTypeLike = DTYPE,
+    ):
+        """A simple class that records audio from sounddevice and streams it to a realtime client.
+
+        Make sure the device is set to the correct device for your system.
+
+        Args:
+            realtime_client: The RealtimeClientBase to use for streaming audio.
+            device: The device id to use for recording audio.
+            sample_rate: The sample rate for the audio.
+            channels: The number of channels for the audio.
+            frame_duration: The duration of each audio frame in milliseconds.
+            dtype: The data type for the audio.
+        """
+        super().__init__(**{
+            "realtime_client": realtime_client,
+            "device": device,
+            "sample_rate": sample_rate,
+            "channels": channels,
+            "frame_duration": frame_duration,
+            "dtype": dtype,
+            "frame_size": int(sample_rate * frame_duration / 1000),
+        })
+
+    async def __aenter__(self):
+        """Stream audio data to a RealtimeClientBase."""
+        if not self._stream_task:
+            self._stream_task = asyncio.create_task(self._start_stream())
+        return self
+
+    async def _start_stream(self):
+        self._pts = 0  # Reset pts when starting recording
+        self._stream = InputStream(
+            device=self.device,
+            channels=self.channels,
+            samplerate=self.sample_rate,
+            dtype=self.dtype,
+            blocksize=self.frame_size,
+        )
+        self._stream.start()
+        try:
+            while True:
+                if self._stream.read_available < self.frame_size:
+                    await asyncio.sleep(0)
+                    continue
+                data, _ = self._stream.read(self.frame_size)
+
+                await self.realtime_client.send(
+                    RealtimeAudioEvent(audio=AudioContent(data=base64.b64encode(cast(Any, data)).decode("utf-8")))
+                )
+
+                await asyncio.sleep(0)
+        except asyncio.CancelledError:
+            pass
+
+    async def __aexit__(self, exc_type, exc, tb):
+        """Stop recording audio."""
+        if self._stream_task:
+            self._stream_task.cancel()
+            await self._stream_task
+        if self._stream:
+            self._stream.stop()
+            self._stream.close()
+
+
+# region: Players
+
+
+class AudioPlayerWebRTC(BaseModel):
+    """Simple class that plays audio using sounddevice.
+
+    This class is meant as a demo sample and is not meant for production use.
+
+    Make sure the device is set to the correct device for your system.
+
+    The sample rate, channels and frame duration should be set to match the audio you are receiving.
+
+    Args:
+        device: The device id to use for playing audio.
+        sample_rate: The sample rate for the audio.
+        channels: The number of channels for the audio.
+        dtype: The data type for the audio.
+        frame_duration: The duration of each audio frame in milliseconds.
+
+    """
+
+    model_config = ConfigDict(populate_by_name=True, arbitrary_types_allowed=True, validate_assignment=True)
+
+    device: int | None = None
+    sample_rate: int = SAMPLE_RATE_WEBRTC
+    channels: int = PLAYER_CHANNELS_WEBRTC
+    dtype: npt.DTypeLike = DTYPE
+    frame_duration: int = FRAME_DURATION_WEBRTC
+    _queue: asyncio.Queue[np.ndarray] | None = PrivateAttr(default=None)
+    _stream: OutputStream | None = PrivateAttr(default=None)
+
+    async def __aenter__(self):
+        """Start the audio stream when entering a context."""
+        self.start()
+        return self
+
+    async def __aexit__(self, exc_type, exc, tb):
+        """Stop the audio stream when exiting a context."""
+        self.stop()
+
+    def start(self):
+        """Start the audio stream."""
+        self._queue = asyncio.Queue()
+        self._stream = OutputStream(
+            callback=self._sounddevice_callback,
+            samplerate=self.sample_rate,
+            channels=self.channels,
+            dtype=self.dtype,
+            blocksize=int(self.sample_rate * self.frame_duration / 1000),
+            device=self.device,
+        )
+        if self._stream and self._queue:
+            self._stream.start()
+
+    def stop(self):
+        """Stop the audio stream."""
+        if self._stream:
+            self._stream.stop()
+        self._stream = None
+        self._queue = None
+
+    def _sounddevice_callback(self, outdata, frames, time, status):
+        """This callback is called by sounddevice when it needs more audio data to play."""
+        if status:
+            logger.debug(f"Audio output status: {status}")
+        if self._queue:
+            if self._queue.empty():
+                # fill the buffer with silence when no data is queued, otherwise stale samples would replay
+                outdata.fill(0)
+                return
+            data = self._queue.get_nowait()
+            outdata[:] = data.reshape(outdata.shape)
+            self._queue.task_done()
+        else:
+            logger.error(
+                "Audio queue not initialized, make sure to call start before "
+                "using the player, or use the context manager."
+            )
+
+    async def client_callback(self, content: np.ndarray):
+        """This function can be passed to the audio_output_callback field of the RealtimeClientBase."""
+        if self._queue:
+            await self._queue.put(content)
+        else:
+            logger.error(
+                "Audio queue not initialized, make sure to call start before "
+                "using the player, or use the context manager."
+            )
+
+    async def add_audio(self, audio_content: AudioContent) -> None:
+        """This function is used to add audio to the queue for playing.
+
+        It first checks if there is an AudioFrame in the inner_content of the AudioContent.
+        If not, it checks if the data is a numpy array, bytes, or a string and converts it to a numpy array.
+        """
+        if not self._queue:
+            logger.error(
+                "Audio queue not initialized, make sure to call start before "
+                "using the player, or use the context manager."
+            )
+            return
+        if audio_content.inner_content and isinstance(audio_content.inner_content, AudioFrame):
+            await self._queue.put(audio_content.inner_content.to_ndarray())
+            return
+        if isinstance(audio_content.data, np.ndarray):
+            await self._queue.put(audio_content.data)
+            return
+        if isinstance(audio_content.data, bytes):
+            await self._queue.put(np.frombuffer(audio_content.data, dtype=self.dtype))
+            return
+        if isinstance(audio_content.data, str):
+            await self._queue.put(np.frombuffer(audio_content.data.encode(), dtype=self.dtype))
+            return
+        logger.error(f"Unknown audio content: {audio_content}")
+
+
+class AudioPlayerWebsocket(BaseModel):
+    """Simple class that plays audio using sounddevice.
+
+    This class is meant as a demo sample and is not meant for production use.
+
+    Make sure the device is set to the correct device for your system.
+
+    The sample rate, channels and frame duration should be set to match the audio you are receiving.
+
+    Args:
+        device: The device id to use for playing audio.
+        sample_rate: The sample rate for the audio.
+        channels: The number of channels for the audio.
+        dtype: The data type for the audio.
+        frame_duration: The duration of each audio frame in milliseconds.
+
+    """
+
+    model_config = ConfigDict(populate_by_name=True, arbitrary_types_allowed=True, validate_assignment=True)
+
+    device: int | None = None
+    sample_rate: int = SAMPLE_RATE
+    channels: int = PLAYER_CHANNELS
+    dtype: npt.DTypeLike = DTYPE
+    frame_duration: int = FRAME_DURATION
+    _lock: Any = PrivateAttr(default_factory=threading.Lock)
+    _queue: list[np.ndarray] = PrivateAttr(default_factory=list)
+    _stream: OutputStream | None = PrivateAttr(default=None)
+    _frame_count: int = 0
+
+    async def __aenter__(self):
+        """Start the audio stream when entering a context."""
+        self.start()
+        return self
+
+    async def __aexit__(self, exc_type, exc, tb):
+        """Stop the audio stream when exiting a context."""
+        self.stop()
+
+    def start(self):
+        """Start the audio stream."""
+        with self._lock:
+            self._queue = []
+        self._stream = OutputStream(
+            callback=self._sounddevice_callback,
+            samplerate=self.sample_rate,
+            channels=self.channels,
+            dtype=self.dtype,
+            blocksize=int(self.sample_rate * self.frame_duration / 1000),
+            device=self.device,
+        )
+        if self._stream:
+            self._stream.start()
+
+    def stop(self):
+        """Stop the audio stream."""
+        if self._stream:
+            self._stream.stop()
+        self._stream = None
+        with self._lock:
+            self._queue = []
+
+    def _sounddevice_callback(self, outdata, frames, time, status):
+        """This callback is called by sounddevice when it needs more audio data to play."""
+        with self._lock:
+            if status:
+                logger.debug(f"Audio output status: {status}")
+            data = np.empty(0, dtype=np.int16)
+
+            # get the next items from the queue while there is still space in the buffer
+            while len(data) < frames and len(self._queue) > 0:
+                item = self._queue.pop(0)
+                frames_needed = frames - len(data)
+                data = np.concatenate((data, item[:frames_needed]))
+                if len(item) > frames_needed:
+                    self._queue.insert(0, item[frames_needed:])
+
+            self._frame_count += len(data)
+
+            # fill the rest of the frames with zeros if there is no more data
+            if len(data) < frames:
+                data = np.concatenate((data, np.zeros(frames - len(data), dtype=np.int16)))
+
+        outdata[:] = data.reshape(-1, 1)
+
+    def reset_frame_count(self):
+        """Reset the number of frames played so far to zero."""
+        self._frame_count = 0
+
+    def get_frame_count(self):
+        """Return the number of frames played since the last reset."""
+        return self._frame_count
+
+    async def client_callback(self, content: np.ndarray):
+        """This function can be passed to the audio_output_callback field of the RealtimeClientBase."""
+        with self._lock:
+            self._queue.append(content)
+
+    async def add_audio(self, audio_content: AudioContent) -> None:
+        """This function is used to add audio to the queue for playing.
+
+        It first checks if there is an AudioFrame in the inner_content of the AudioContent.
+        If not, it checks if the data is a numpy array, bytes, or a string and converts it to a numpy array.
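+
+        Args:
+            audio_content: The audio to queue; an av AudioFrame in inner_content, or ndarray/bytes/str data.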
+        """
+        with self._lock:
+            if audio_content.inner_content and isinstance(audio_content.inner_content, AudioFrame):
+                self._queue.append(audio_content.inner_content.to_ndarray())
+                return
+            if isinstance(audio_content.data, np.ndarray):
+                self._queue.append(audio_content.data)
+                return
+            if isinstance(audio_content.data, bytes):
+                self._queue.append(np.frombuffer(audio_content.data, dtype=self.dtype))
+                return
+            if isinstance(audio_content.data, str):
+                self._queue.append(np.frombuffer(audio_content.data.encode(), dtype=self.dtype))
+                return
+        logger.error(f"Unknown audio content: {audio_content}")
diff --git a/python/samples/demos/call_automation/.env.example b/python/samples/demos/call_automation/.env.example
new file mode 100644
index 000000000000..055528e2c2f3
--- /dev/null
+++ b/python/samples/demos/call_automation/.env.example
@@ -0,0 +1,8 @@
+ACS_CONNECTION_STRING=
+CALLBACK_URI_HOST=
+
+AZURE_OPENAI_SERVICE_ENDPOINT=
+AZURE_OPENAI_DEPLOYMENT_MODEL_NAME=
+AZURE_OPENAI_API_VERSION=
+
+AZURE_OPENAI_SERVICE_KEY=
\ No newline at end of file
diff --git a/python/samples/demos/call_automation/call_automation.py b/python/samples/demos/call_automation/call_automation.py
new file mode 100755
index 000000000000..2ea8058167d9
--- /dev/null
+++ b/python/samples/demos/call_automation/call_automation.py
@@ -0,0 +1,290 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+####################################################################
+# Sample Quart webapp that connects to Azure OpenAI               #
+# Make sure to install `uv`, see:                                 #
+# https://docs.astral.sh/uv/getting-started/installation/         #
+# and rename .env.example to .env and fill in the values.         #
+# Follow the guidance in README.md for more info.                 #
+# To run the app, use:                                            #
+# `uv run --env-file .env call_automation.py`                     #
+####################################################################
+#
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#   "Quart",
+#   "azure-eventgrid",
+#   "azure-communication-callautomation==1.4.0b1",
+#   "semantic-kernel[realtime]",
+# ]
+# ///
+
+import asyncio
+import base64
+import os
+import uuid
+from datetime import datetime
+from logging import INFO
+from random import randint
+from urllib.parse import urlencode, urlparse, urlunparse
+
+from azure.communication.callautomation import (
+    AudioFormat,
+    MediaStreamingAudioChannelType,
+    MediaStreamingContentType,
+    MediaStreamingOptions,
+    MediaStreamingTransportType,
+)
+from azure.communication.callautomation.aio import CallAutomationClient
+from azure.eventgrid import EventGridEvent, SystemEventNames
+from numpy import ndarray
+from quart import Quart, Response, json, request, websocket
+
+from semantic_kernel import Kernel
+from semantic_kernel.connectors.ai import FunctionChoiceBehavior
+from semantic_kernel.connectors.ai.open_ai import (
+    AzureRealtimeExecutionSettings,
+    AzureRealtimeWebsocket,
+    ListenEvents,
+)
+from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase
+from semantic_kernel.contents import AudioContent, RealtimeAudioEvent
+from semantic_kernel.functions import kernel_function
+
+# Callback events URI to handle callback events.
+CALLBACK_URI_HOST = os.environ["CALLBACK_URI_HOST"]
+CALLBACK_EVENTS_URI = CALLBACK_URI_HOST + "/api/callbacks"
+
+acs_client = CallAutomationClient.from_connection_string(os.environ["ACS_CONNECTION_STRING"])
+app = Quart(__name__)
+
+# region: Semantic Kernel
+
+kernel = Kernel()
+
+
+class HelperPlugin:
+    """Helper plugin for the Semantic Kernel."""
+
+    @kernel_function
+    def get_weather(self, location: str) -> str:
+        """Get the weather for a location."""
+        app.logger.info(f"@ Getting weather for {location}")
+        weather_conditions = ("sunny", "hot", "cloudy", "raining", "freezing", "snowing")
+        weather = weather_conditions[randint(0, len(weather_conditions) - 1)]  # nosec
+        return f"The weather in {location} is {weather}."
+
+    @kernel_function
+    def get_date_time(self) -> str:
+        """Get the current date and time."""
+        app.logger.info("@ Getting current datetime")
+        return f"The current date and time is {datetime.now().isoformat()}."
+
+    @kernel_function
+    async def goodbye(self):
+        """When the user is done, say goodbye and then call this function."""
+        app.logger.info("@ Goodbye has been called!")
+        global call_connection_id
+        await acs_client.get_call_connection(call_connection_id).hang_up(is_for_everyone=True)
+
+
+kernel.add_plugin(plugin=HelperPlugin(), plugin_name="helpers", description="Helper functions for the realtime client.")
+
+# region: Handlers for audio and data streams
+
+
+async def from_realtime_to_acs(audio: ndarray):
+    """Function that forwards the audio from the model to the websocket of the ACS client."""
+    await websocket.send(
+        json.dumps({"kind": "AudioData", "audioData": {"data": base64.b64encode(audio.tobytes()).decode("utf-8")}})
+    )
+
+
+async def from_acs_to_realtime(client: RealtimeClientBase):
+    """Function that forwards the audio from the ACS client to the model."""
+    while True:
+        try:
+            # Receive data from the ACS client
+            stream_data = await websocket.receive()
+            data = json.loads(stream_data)
+            if data["kind"] == "AudioData":
+                # send it to the Realtime service
+                await client.send(
+                    event=RealtimeAudioEvent(
+                        audio=AudioContent(data=data["audioData"]["data"], data_format="base64", inner_content=data),
+                    )
+                )
+        except Exception:
+            app.logger.info("Websocket connection closed.")
+            break
+
+
+async def handle_realtime_messages(client: RealtimeClientBase):
+    """Function that handles the messages from the Realtime service.
+
+    This function only handles the non-audio messages.
+    Audio is done through the callback so that it is faster and smoother.
+    """
+    async for event in client.receive(audio_output_callback=from_realtime_to_acs):
+        match event.service_type:
+            case ListenEvents.SESSION_CREATED:
+                print("Session Created Message")
+                print(f"  Session Id: {event.service_event.session.id}")
+            case ListenEvents.ERROR:
+                print(f"  Error: {event.service_event.error}")
+            case ListenEvents.INPUT_AUDIO_BUFFER_CLEARED:
+                print("Input Audio Buffer Cleared Message")
+            case ListenEvents.INPUT_AUDIO_BUFFER_SPEECH_STARTED:
+                print(f"Voice activity detection started at {event.service_event.audio_start_ms} [ms]")
+                await websocket.send(json.dumps({"Kind": "StopAudio", "AudioData": None, "StopAudio": {}}))
+            case ListenEvents.CONVERSATION_ITEM_INPUT_AUDIO_TRANSCRIPTION_COMPLETED:
+                print(f" User:-- {event.service_event.transcript}")
+            case ListenEvents.CONVERSATION_ITEM_INPUT_AUDIO_TRANSCRIPTION_FAILED:
+                print(f"  Error: {event.service_event.error}")
+            case ListenEvents.RESPONSE_DONE:
+                print("Response Done Message")
+                print(f"  Response Id: {event.service_event.response.id}")
+                if event.service_event.response.status_details:
+                    print(f"  Status Details: {event.service_event.response.status_details.model_dump_json()}")
+            case ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DONE:
+                print(f" AI:-- {event.service_event.transcript}")
+
+
+# region: Routes
+
+
+# WebSocket.
+@app.websocket("/ws")
+async def ws():
+    app.logger.info("Client connected to WebSocket")
+
+    # create the client, using the audio callback
+    client = AzureRealtimeWebsocket()
+    settings = AzureRealtimeExecutionSettings(
+        instructions="""You are a chat bot. Your name is Mosscap and
+        you have one goal: figure out what people need.
+        Your full name, should you need to know it, is
+        Splendid Speckled Mosscap. You communicate
+        effectively, but you tend to answer with long
+        flowery prose.""",
+        turn_detection={"type": "server_vad"},
+        voice="shimmer",
+        input_audio_format="pcm16",
+        output_audio_format="pcm16",
+        input_audio_transcription={"model": "whisper-1"},
+        function_choice_behavior=FunctionChoiceBehavior.Auto(),
+    )
+
+    # create the realtime client session
+    async with client(settings=settings, create_response=True, kernel=kernel):
+        # start handling the messages from the realtime client
+        # and allow the callback to be used to forward the audio to the acs client
+        receive_task = asyncio.create_task(handle_realtime_messages(client))
+        # receive messages from the ACS client and send them to the realtime client
+        await from_acs_to_realtime(client)
+        receive_task.cancel()
+
+
+@app.route("/api/incomingCall", methods=["POST"])
+async def incoming_call_handler() -> Response:
+    app.logger.info("incoming event data")
+    for event_dict in await request.json:
+        event = EventGridEvent.from_dict(event_dict)
+        app.logger.info("incoming event data --> %s", event.data)
+
+        if event.event_type == SystemEventNames.EventGridSubscriptionValidationEventName:
+            app.logger.info("Validating subscription")
+            validation_code = event.data["validationCode"]
+            validation_response = {"validationResponse": validation_code}
+            return Response(response=json.dumps(validation_response), status=200)
+
+        if event.event_type == "Microsoft.Communication.IncomingCall":
+            app.logger.info("Incoming call received: data=%s", event.data)
+            caller_id = (
+                event.data["from"]["phoneNumber"]["value"]
+                if event.data["from"]["kind"] == "phoneNumber"
+                else event.data["from"]["rawId"]
+            )
+            app.logger.info("incoming call handler caller id: %s", caller_id)
+            incoming_call_context = event.data["incomingCallContext"]
+            guid = uuid.uuid4()
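+            # build the per-call callback URI; the guid keeps callbacks for concurrent calls apart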
+            query_parameters = urlencode({"callerId": caller_id})
+            callback_uri = f"{CALLBACK_EVENTS_URI}/{guid}?{query_parameters}"
+
+            parsed_url = urlparse(CALLBACK_EVENTS_URI)
+            websocket_url = urlunparse(("wss", parsed_url.netloc, "/ws", "", "", ""))
+
+            app.logger.info("callback url: %s", callback_uri)
+            app.logger.info("websocket url: %s", websocket_url)
+
+            media_streaming_options = MediaStreamingOptions(
+                transport_url=websocket_url,
+                transport_type=MediaStreamingTransportType.WEBSOCKET,
+                content_type=MediaStreamingContentType.AUDIO,
+                audio_channel_type=MediaStreamingAudioChannelType.MIXED,
+                start_media_streaming=True,
+                enable_bidirectional=True,
+                audio_format=AudioFormat.PCM24_K_MONO,
+            )
+            answer_call_result = await acs_client.answer_call(
+                incoming_call_context=incoming_call_context,
+                operation_context="incomingCall",
+                callback_url=callback_uri,
+                media_streaming=media_streaming_options,
+            )
+            app.logger.info("Answered call for connection id: %s", answer_call_result.call_connection_id)
+            return Response(status=200)
+    return Response(status=200)
+
+
+@app.route("/api/callbacks/<contextId>", methods=["POST"])
+async def callbacks(contextId):
+    for event in await request.json:
+        # Parsing callback events
+        global call_connection_id
+        event_data = event["data"]
+        call_connection_id = event_data["callConnectionId"]
+        app.logger.info(
+            f"Received Event:-> {event['type']}, Correlation Id:-> {event_data['correlationId']}, CallConnectionId:-> {call_connection_id}"  # noqa: E501
+        )
+        match event["type"]:
+            case "Microsoft.Communication.CallConnected":
+                call_connection_properties = await acs_client.get_call_connection(
+                    call_connection_id
+                ).get_call_properties()
+                media_streaming_subscription = call_connection_properties.media_streaming_subscription
+                app.logger.info(f"MediaStreamingSubscription:--> {media_streaming_subscription}")
+                app.logger.info(f"Received CallConnected event for connection id: {call_connection_id}")
+                app.logger.info("CORRELATION ID:--> %s", event_data["correlationId"])
+                app.logger.info("CALL CONNECTION ID:--> %s", event_data["callConnectionId"])
+            case "Microsoft.Communication.MediaStreamingStarted" | "Microsoft.Communication.MediaStreamingStopped":
+                app.logger.info(f"Media streaming content type:--> {event_data['mediaStreamingUpdate']['contentType']}")
+                app.logger.info(
+                    f"Media streaming status:--> {event_data['mediaStreamingUpdate']['mediaStreamingStatus']}"
+                )
+                app.logger.info(
+                    f"Media streaming status details:--> {event_data['mediaStreamingUpdate']['mediaStreamingStatusDetails']}"  # noqa: E501
+                )
+            case "Microsoft.Communication.MediaStreamingFailed":
+                app.logger.info(
+                    f"Code:->{event_data['resultInformation']['code']}, Subcode:-> {event_data['resultInformation']['subCode']}"  # noqa: E501
+                )
+                app.logger.info(f"Message:->{event_data['resultInformation']['message']}")
+            case "Microsoft.Communication.CallDisconnected":
+                pass
+    return Response(status=200)
+
+
+@app.route("/")
+def home():
+    return "Hello SKxACS CallAutomation!"
+
+
+# region: Main
+
+
+if __name__ == "__main__":
+    app.logger.setLevel(INFO)
+    app.run(port=8080)
diff --git a/python/samples/demos/call_automation/readme.md b/python/samples/demos/call_automation/readme.md
new file mode 100644
index 000000000000..ca69b39e0a3b
--- /dev/null
+++ b/python/samples/demos/call_automation/readme.md
@@ -0,0 +1,53 @@
+# Call Automation - Quick Start Sample
+
+This is a sample application. It highlights an integration of Azure Communication Services with Semantic Kernel, using the Azure OpenAI Service to enable intelligent conversational agents.
+
+Original code for this sample can be found [here](https://github.com/Azure-Samples/communication-services-python-quickstarts/tree/main/callautomation-openai-sample).
+
+## Prerequisites
+
+- An Azure account with an active subscription. [Create an account for free](https://azure.microsoft.com/free/?WT.mc_id=A261C142F).
+- A deployed Communication Services resource. [Create a Communication Services resource](https://docs.microsoft.com/azure/communication-services/quickstarts/create-communication-resource).
+- A [phone number](https://learn.microsoft.com/en-us/azure/communication-services/quickstarts/telephony/get-phone-number) in your Azure Communication Services resource that can receive inbound calls. NB: phone numbers are not available in free subscriptions.
+- [Python](https://www.python.org/downloads/) 3.9 or above.
+- An Azure OpenAI resource and deployed model. See [instructions](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/create-resource?pivots=web-portal).
+- Install `uv`, see [the uv docs](https://docs.astral.sh/uv/getting-started/installation/).
+
+## To run the app
+
+1. Open an instance of PowerShell, Windows Terminal, Command Prompt or equivalent and navigate to the directory that you would like to clone the sample to.
+2. Run `git clone https://github.com/microsoft/semantic-kernel.git`.
+3. Navigate to the `python/samples/demos/call_automation` folder.
+
+### Setup and host your Azure DevTunnel
+
+[Azure DevTunnels](https://learn.microsoft.com/en-us/azure/developer/dev-tunnels/overview) is an Azure service that enables you to expose local web services to the internet. Use the commands below to connect your local development environment to the public internet. This creates a tunnel with a persistent endpoint URL that allows anonymous access. We will then use this endpoint to notify your application of calling events from the ACS Call Automation service.
+
+```bash
+devtunnel create --allow-anonymous
+devtunnel port create -p 8080
+devtunnel host
+```
+
+### Configuring the application
+
+Copy the `.env.example` file to `.env` and update the following values:
+
+1. `ACS_CONNECTION_STRING`: Azure Communication Services resource's connection string.
+2. `CALLBACK_URI_HOST`: Base url of the app. (For local development use the dev tunnel url from the step above.)
+3. `AZURE_OPENAI_SERVICE_ENDPOINT`: Azure OpenAI service endpoint.
+4. `AZURE_OPENAI_DEPLOYMENT_MODEL_NAME`: Azure OpenAI deployment name.
+5. `AZURE_OPENAI_API_VERSION`: Azure OpenAI API version; this should be one that includes the realtime API, for instance '2024-10-01-preview'.
+6. `AZURE_OPENAI_SERVICE_KEY`: Azure OpenAI API key; optionally, you can also use Entra auth.
+
+## Run the app
+
+1. Navigate to the `call_automation` folder and do one of the following to start the main application:
+   - run `call_automation.py` in debug mode from your IDE (VSCode will load your .env variables into the environment automatically, other IDEs might need an extra step).
+   - execute `uv run --env-file .env call_automation.py` directly in your terminal (this uses `uv`, which will install the requirements in a temporary virtual environment, see [uv docs](https://docs.astral.sh/uv/guides/scripts) for more info).
+2. A browser should pop up with a simple page. If not, navigate to `http://localhost:8080/` or your dev tunnel url.
+3. Register an EventGrid webhook for the IncomingCall event (`https://<devtunnel-uri>/api/incomingCall`) that points to your devtunnel URI. Instructions [here](https://learn.microsoft.com/en-us/azure/communication-services/concepts/call-automation/incoming-call-notification).
+
+Once that's completed, you should have a running application. The way to test this is to place a call to your ACS phone number and talk to your intelligent agent!
+
+In the terminal you should see all sorts of logs from both ACS and Semantic Kernel.
diff --git a/python/semantic_kernel/connectors/ai/anthropic/services/anthropic_chat_completion.py b/python/semantic_kernel/connectors/ai/anthropic/services/anthropic_chat_completion.py
index e64136fe9736..1c5d670c57f1 100644
--- a/python/semantic_kernel/connectors/ai/anthropic/services/anthropic_chat_completion.py
+++ b/python/semantic_kernel/connectors/ai/anthropic/services/anthropic_chat_completion.py
@@ -35,9 +35,9 @@
 from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceType
 from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
 from semantic_kernel.contents.chat_history import ChatHistory
-from semantic_kernel.contents.chat_message_content import ITEM_TYPES, ChatMessageContent
+from semantic_kernel.contents.chat_message_content import CMC_ITEM_TYPES, ChatMessageContent
 from semantic_kernel.contents.function_call_content import FunctionCallContent
-from semantic_kernel.contents.streaming_chat_message_content import ITEM_TYPES as STREAMING_ITEM_TYPES
+from semantic_kernel.contents.streaming_chat_message_content import STREAMING_CMC_ITEM_TYPES as STREAMING_ITEM_TYPES
 from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent
 from semantic_kernel.contents.streaming_text_content import StreamingTextContent
 from semantic_kernel.contents.text_content import TextContent
@@ -261,7 +261,7 @@ def _create_chat_message_content(
         self, response: Message, response_metadata: dict[str, Any]
     ) -> "ChatMessageContent":
         """Create a chat message content object."""
-        items: list[ITEM_TYPES] = []
+        items: list[CMC_ITEM_TYPES] = []
         items += self._get_tool_calls_from_message(response)
 
         for content_block in response.content:
diff --git a/python/semantic_kernel/connectors/ai/azure_ai_inference/services/azure_ai_inference_chat_completion.py b/python/semantic_kernel/connectors/ai/azure_ai_inference/services/azure_ai_inference_chat_completion.py
index bb225052f383..88cb7ca5abd9 100644
--- a/python/semantic_kernel/connectors/ai/azure_ai_inference/services/azure_ai_inference_chat_completion.py
+++ b/python/semantic_kernel/connectors/ai/azure_ai_inference/services/azure_ai_inference_chat_completion.py
@@ -33,9 +33,9 @@
 from semantic_kernel.connectors.ai.function_calling_utils import update_settings_from_function_call_configuration
 from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceType
 from semantic_kernel.contents.chat_history import ChatHistory
-from semantic_kernel.contents.chat_message_content import ITEM_TYPES, ChatMessageContent
+from semantic_kernel.contents.chat_message_content import CMC_ITEM_TYPES, ChatMessageContent
 from semantic_kernel.contents.function_call_content import FunctionCallContent
-from semantic_kernel.contents.streaming_chat_message_content import ITEM_TYPES as STREAMING_ITEM_TYPES
+from semantic_kernel.contents.streaming_chat_message_content import STREAMING_CMC_ITEM_TYPES as STREAMING_ITEM_TYPES
 from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent
StreamingChatMessageContent from semantic_kernel.contents.streaming_text_content import StreamingTextContent from semantic_kernel.contents.text_content import TextContent @@ -240,7 +240,7 @@ def _create_chat_message_content( Returns: A chat message content object. """ - items: list[ITEM_TYPES] = [] + items: list[CMC_ITEM_TYPES] = [] if choice.message.content: items.append( TextContent( diff --git a/python/semantic_kernel/connectors/ai/bedrock/services/bedrock_chat_completion.py b/python/semantic_kernel/connectors/ai/bedrock/services/bedrock_chat_completion.py index 9266823e0988..64df31e5967b 100644 --- a/python/semantic_kernel/connectors/ai/bedrock/services/bedrock_chat_completion.py +++ b/python/semantic_kernel/connectors/ai/bedrock/services/bedrock_chat_completion.py @@ -28,10 +28,10 @@ from semantic_kernel.connectors.ai.chat_completion_client_base import ChatCompletionClientBase from semantic_kernel.connectors.ai.completion_usage import CompletionUsage from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceType -from semantic_kernel.contents.chat_message_content import ITEM_TYPES, ChatMessageContent +from semantic_kernel.contents.chat_message_content import CMC_ITEM_TYPES, ChatMessageContent from semantic_kernel.contents.function_call_content import FunctionCallContent from semantic_kernel.contents.image_content import ImageContent -from semantic_kernel.contents.streaming_chat_message_content import ITEM_TYPES as STREAMING_ITEM_TYPES +from semantic_kernel.contents.streaming_chat_message_content import STREAMING_CMC_ITEM_TYPES as STREAMING_ITEM_TYPES from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent from semantic_kernel.contents.streaming_text_content import StreamingTextContent from semantic_kernel.contents.text_content import TextContent @@ -238,7 +238,7 @@ def _create_chat_message_content(self, response: dict[str, Any]) -> ChatMessageC prompt_tokens=response["usage"]["inputTokens"], completion_tokens=response["usage"]["outputTokens"], ) - items: list[ITEM_TYPES] = [] + items: list[CMC_ITEM_TYPES] = [] for content in response["output"]["message"]["content"]: if "text" in content: items.append(TextContent(text=content["text"], inner_content=content)) diff --git a/python/semantic_kernel/connectors/ai/chat_completion_client_base.py b/python/semantic_kernel/connectors/ai/chat_completion_client_base.py index b7be735e95d8..974d59af92be 100644 --- a/python/semantic_kernel/connectors/ai/chat_completion_client_base.py +++ b/python/semantic_kernel/connectors/ai/chat_completion_client_base.py @@ -264,7 +264,9 @@ async def get_streaming_chat_message_contents( for msg in messages: if msg is not None: all_messages.append(msg) - if any(isinstance(item, FunctionCallContent) for item in msg.items): + if not function_call_returned and any( + isinstance(item, FunctionCallContent) for item in msg.items + ): function_call_returned = True yield messages @@ -432,7 +434,10 @@ def _get_ai_model_id(self, settings: "PromptExecutionSettings") -> str: return getattr(settings, "ai_model_id", self.ai_model_id) or self.ai_model_id def _yield_function_result_messages(self, function_result_messages: list) -> bool: - """Determine if the function result messages should be yielded.""" + """Determine if the function result messages should be yielded. + + If there are messages and if the first message has items, then yield the messages. 
+ """ return len(function_result_messages) > 0 and len(function_result_messages[0].items) > 0 # endregion diff --git a/python/semantic_kernel/connectors/ai/function_calling_utils.py b/python/semantic_kernel/connectors/ai/function_calling_utils.py index 48415aff9725..1e65fa59e537 100644 --- a/python/semantic_kernel/connectors/ai/function_calling_utils.py +++ b/python/semantic_kernel/connectors/ai/function_calling_utils.py @@ -1,10 +1,13 @@ # Copyright (c) Microsoft. All rights reserved. from collections import OrderedDict +from collections.abc import Callable +from copy import deepcopy from typing import TYPE_CHECKING, Any from semantic_kernel.contents.utils.author_role import AuthorRole from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError +from semantic_kernel.utils.feature_stage_decorator import experimental if TYPE_CHECKING: from semantic_kernel.connectors.ai.function_choice_behavior import ( @@ -15,6 +18,7 @@ from semantic_kernel.contents.chat_message_content import ChatMessageContent from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent from semantic_kernel.functions.kernel_function_metadata import KernelFunctionMetadata + from semantic_kernel.kernel import Kernel def update_settings_from_function_call_configuration( @@ -134,3 +138,36 @@ def merge_streaming_function_results( function_invoke_attempt=function_invoke_attempt, ) ] + + +@experimental +def prepare_settings_for_function_calling( + settings: "PromptExecutionSettings", + settings_class: type["PromptExecutionSettings"], + update_settings_callback: Callable[..., None], + kernel: "Kernel", +) -> "PromptExecutionSettings": + """Prepare settings for the service. + + Args: + settings: Prompt execution settings. + settings_class: The settings class. + update_settings_callback: The callback to update the settings. + kernel: Kernel instance. + + Returns: + PromptExecutionSettings of type settings_class. 
+ """ + settings = deepcopy(settings) + if not isinstance(settings, settings_class): + settings = settings_class.from_prompt_execution_settings(settings) + + if settings.function_choice_behavior: + # Configure the function choice behavior into the settings object + # that will become part of the request to the AI service + settings.function_choice_behavior.configure( + kernel=kernel, + update_settings_callback=update_settings_callback, + settings=settings, + ) + return settings diff --git a/python/semantic_kernel/connectors/ai/google/google_ai/services/google_ai_chat_completion.py b/python/semantic_kernel/connectors/ai/google/google_ai/services/google_ai_chat_completion.py index ef50f4f0ef1b..9b538b26ebec 100644 --- a/python/semantic_kernel/connectors/ai/google/google_ai/services/google_ai_chat_completion.py +++ b/python/semantic_kernel/connectors/ai/google/google_ai/services/google_ai_chat_completion.py @@ -36,9 +36,9 @@ format_gemini_function_name_to_kernel_function_fully_qualified_name, ) from semantic_kernel.contents.chat_history import ChatHistory -from semantic_kernel.contents.chat_message_content import ITEM_TYPES, ChatMessageContent +from semantic_kernel.contents.chat_message_content import CMC_ITEM_TYPES, ChatMessageContent from semantic_kernel.contents.function_call_content import FunctionCallContent -from semantic_kernel.contents.streaming_chat_message_content import ITEM_TYPES as STREAMING_ITEM_TYPES +from semantic_kernel.contents.streaming_chat_message_content import STREAMING_CMC_ITEM_TYPES as STREAMING_ITEM_TYPES from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent from semantic_kernel.contents.streaming_text_content import StreamingTextContent from semantic_kernel.contents.text_content import TextContent @@ -244,7 +244,7 @@ def _create_chat_message_content( response_metadata = self._get_metadata_from_response(response) response_metadata.update(self._get_metadata_from_candidate(candidate)) - items: list[ITEM_TYPES] = [] + items: list[CMC_ITEM_TYPES] = [] for idx, part in enumerate(candidate.content.parts): if part.text: items.append(TextContent(text=part.text, inner_content=response, metadata=response_metadata)) diff --git a/python/semantic_kernel/connectors/ai/google/vertex_ai/services/vertex_ai_chat_completion.py b/python/semantic_kernel/connectors/ai/google/vertex_ai/services/vertex_ai_chat_completion.py index bd7c1346accf..beec827bfb2f 100644 --- a/python/semantic_kernel/connectors/ai/google/vertex_ai/services/vertex_ai_chat_completion.py +++ b/python/semantic_kernel/connectors/ai/google/vertex_ai/services/vertex_ai_chat_completion.py @@ -34,9 +34,9 @@ ) from semantic_kernel.connectors.ai.google.vertex_ai.vertex_ai_settings import VertexAISettings from semantic_kernel.contents.chat_history import ChatHistory -from semantic_kernel.contents.chat_message_content import ITEM_TYPES, ChatMessageContent +from semantic_kernel.contents.chat_message_content import CMC_ITEM_TYPES, ChatMessageContent from semantic_kernel.contents.function_call_content import FunctionCallContent -from semantic_kernel.contents.streaming_chat_message_content import ITEM_TYPES as STREAMING_ITEM_TYPES +from semantic_kernel.contents.streaming_chat_message_content import STREAMING_CMC_ITEM_TYPES as STREAMING_ITEM_TYPES from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent from semantic_kernel.contents.streaming_text_content import StreamingTextContent from semantic_kernel.contents.text_content import TextContent @@ -235,7 
+235,7 @@ def _create_chat_message_content(self, response: GenerationResponse, candidate: response_metadata = self._get_metadata_from_response(response) response_metadata.update(self._get_metadata_from_candidate(candidate)) - items: list[ITEM_TYPES] = [] + items: list[CMC_ITEM_TYPES] = [] for idx, part in enumerate(candidate.content.parts): part_dict = part.to_dict() if "text" in part_dict: diff --git a/python/semantic_kernel/connectors/ai/ollama/services/ollama_chat_completion.py b/python/semantic_kernel/connectors/ai/ollama/services/ollama_chat_completion.py index 103133af2c9f..68a62e434423 100644 --- a/python/semantic_kernel/connectors/ai/ollama/services/ollama_chat_completion.py +++ b/python/semantic_kernel/connectors/ai/ollama/services/ollama_chat_completion.py @@ -27,9 +27,9 @@ ) from semantic_kernel.contents import AuthorRole from semantic_kernel.contents.chat_history import ChatHistory -from semantic_kernel.contents.chat_message_content import ITEM_TYPES, ChatMessageContent +from semantic_kernel.contents.chat_message_content import CMC_ITEM_TYPES, ChatMessageContent from semantic_kernel.contents.function_call_content import FunctionCallContent -from semantic_kernel.contents.streaming_chat_message_content import ITEM_TYPES as STREAMING_ITEM_TYPES +from semantic_kernel.contents.streaming_chat_message_content import STREAMING_CMC_ITEM_TYPES as STREAMING_ITEM_TYPES from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent from semantic_kernel.contents.streaming_text_content import StreamingTextContent from semantic_kernel.contents.text_content import TextContent @@ -255,7 +255,7 @@ def _parse_tool_calls(self, tool_calls: Sequence[Message.ToolCall] | None, items def _create_chat_message_content_from_chat_response(self, response: ChatResponse) -> ChatMessageContent: """Create a chat message content from the response.""" - items: list[ITEM_TYPES] = [] + items: list[CMC_ITEM_TYPES] = [] if response.message.content: items.append( TextContent( @@ -274,7 +274,7 @@ def _create_chat_message_content_from_chat_response(self, response: ChatResponse def _create_chat_message_content(self, response: Mapping[str, Any]) -> ChatMessageContent: """Create a chat message content from the response.""" - items: list[ITEM_TYPES] = [] + items: list[CMC_ITEM_TYPES] = [] if not (message := response.get("message", None)): raise ServiceInvalidResponseError("No message content found in response.") diff --git a/python/semantic_kernel/connectors/ai/open_ai/__init__.py b/python/semantic_kernel/connectors/ai/open_ai/__init__.py index a3103ae86446..34e11756fdb7 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/__init__.py +++ b/python/semantic_kernel/connectors/ai/open_ai/__init__.py @@ -22,6 +22,12 @@ OpenAIPromptExecutionSettings, OpenAITextPromptExecutionSettings, ) +from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( + AzureRealtimeExecutionSettings, + InputAudioTranscription, + OpenAIRealtimeExecutionSettings, + TurnDetection, +) from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_text_to_audio_execution_settings import ( OpenAITextToAudioExecutionSettings, ) @@ -30,12 +36,19 @@ ) from semantic_kernel.connectors.ai.open_ai.services.azure_audio_to_text import AzureAudioToText from semantic_kernel.connectors.ai.open_ai.services.azure_chat_completion import AzureChatCompletion +from semantic_kernel.connectors.ai.open_ai.services.azure_realtime import AzureRealtimeWebsocket from 
semantic_kernel.connectors.ai.open_ai.services.azure_text_completion import AzureTextCompletion from semantic_kernel.connectors.ai.open_ai.services.azure_text_embedding import AzureTextEmbedding from semantic_kernel.connectors.ai.open_ai.services.azure_text_to_audio import AzureTextToAudio from semantic_kernel.connectors.ai.open_ai.services.azure_text_to_image import AzureTextToImage from semantic_kernel.connectors.ai.open_ai.services.open_ai_audio_to_text import OpenAIAudioToText from semantic_kernel.connectors.ai.open_ai.services.open_ai_chat_completion import OpenAIChatCompletion +from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime import ( + ListenEvents, + OpenAIRealtimeWebRTC, + OpenAIRealtimeWebsocket, + SendEvents, +) from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_completion import OpenAITextCompletion from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_embedding import OpenAITextEmbedding from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_to_audio import OpenAITextToAudio @@ -55,6 +68,8 @@ "AzureDataSourceParameters", "AzureEmbeddingDependency", "AzureOpenAISettings", + "AzureRealtimeExecutionSettings", + "AzureRealtimeWebsocket", "AzureTextCompletion", "AzureTextEmbedding", "AzureTextToAudio", @@ -63,12 +78,17 @@ "DataSourceFieldsMapping", "DataSourceFieldsMapping", "ExtraBody", + "InputAudioTranscription", + "ListenEvents", "OpenAIAudioToText", "OpenAIAudioToTextExecutionSettings", "OpenAIChatCompletion", "OpenAIChatPromptExecutionSettings", "OpenAIEmbeddingPromptExecutionSettings", "OpenAIPromptExecutionSettings", + "OpenAIRealtimeExecutionSettings", + "OpenAIRealtimeWebRTC", + "OpenAIRealtimeWebsocket", "OpenAISettings", "OpenAITextCompletion", "OpenAITextEmbedding", @@ -77,4 +97,6 @@ "OpenAITextToAudioExecutionSettings", "OpenAITextToImage", "OpenAITextToImageExecutionSettings", + "SendEvents", + "TurnDetection", ] diff --git a/python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py b/python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py new file mode 100644 index 000000000000..2c4fc74738b5 --- /dev/null +++ b/python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py @@ -0,0 +1,78 @@ +# Copyright (c) Microsoft. All rights reserved. + +from collections.abc import Mapping, Sequence +from typing import Annotated, Any, Literal + +from pydantic import Field + +from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings +from semantic_kernel.kernel_pydantic import KernelBaseModel + + +class InputAudioTranscription(KernelBaseModel): + """Input audio transcription settings. + + Args: + model: The model to use for transcription, currently only "whisper-1" is supported. + language: The language of the audio, should be in ISO-639-1 format, like 'en'. + prompt: An optional text to guide the model's style or continue a previous audio segment. + The prompt should match the audio language. + """ + + model: Literal["whisper-1"] | None = None + language: str | None = None + prompt: str | None = None + + +class TurnDetection(KernelBaseModel): + """Turn detection settings. + + Args: + type: The type of turn detection, currently only "server_vad" is supported. + threshold: The threshold for voice activity detection, should be between 0 and 1. + prefix_padding_ms: The padding before the detected voice activity, in milliseconds. 
+ silence_duration_ms: The duration of silence to detect the end of a turn, in milliseconds. + create_response: Whether to create a response for each detected turn. + + """ + + type: Literal["server_vad"] = "server_vad" + threshold: Annotated[float | None, Field(ge=0.0, le=1.0)] = None + prefix_padding_ms: Annotated[int | None, Field(ge=0)] = None + silence_duration_ms: Annotated[int | None, Field(ge=0)] = None + create_response: bool | None = None + + +class OpenAIRealtimeExecutionSettings(PromptExecutionSettings): + """Request settings for OpenAI realtime services.""" + + modalities: Sequence[Literal["audio", "text"]] | None = None + ai_model_id: Annotated[str | None, Field(None, serialization_alias="model")] = None + instructions: str | None = None + voice: str | None = None + input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | None = None + output_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | None = None + input_audio_transcription: InputAudioTranscription | Mapping[str, str] | None = None + turn_detection: TurnDetection | Mapping[str, str] | None = None + tools: Annotated[ + list[dict[str, Any]] | None, + Field( + description="Do not set this manually. It is set by the service based " + "on the function choice configuration.", + ), + ] = None + tool_choice: Annotated[ + str | None, + Field( + description="Do not set this manually. It is set by the service based " + "on the function choice configuration.", + ), + ] = None + temperature: Annotated[float | None, Field(ge=0.0, le=2.0)] = None + max_response_output_tokens: Annotated[int | Literal["inf"] | None, Field(gt=0)] = None + + +class AzureRealtimeExecutionSettings(OpenAIRealtimeExecutionSettings): + """Request settings for Azure OpenAI realtime services.""" + + pass diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/azure_config_base.py b/python/semantic_kernel/connectors/ai/open_ai/services/azure_config_base.py index da50e4ee56b6..94d8691534fa 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/azure_config_base.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/azure_config_base.py @@ -3,9 +3,11 @@ import logging from collections.abc import Awaitable, Callable, Mapping from copy import copy +from typing import Any from openai import AsyncAzureOpenAI from pydantic import ConfigDict, validate_call +from pydantic_core import Url from semantic_kernel.connectors.ai.open_ai.const import DEFAULT_AZURE_API_VERSION from semantic_kernel.connectors.ai.open_ai.services.open_ai_handler import OpenAIHandler, OpenAIModelTypes @@ -27,7 +29,7 @@ def __init__( deployment_name: str, ai_model_type: OpenAIModelTypes, endpoint: HttpsUrl | None = None, - base_url: HttpsUrl | None = None, + base_url: Url | None = None, api_version: str = DEFAULT_AZURE_API_VERSION, service_id: str | None = None, api_key: str | None = None, @@ -37,6 +39,7 @@ def __init__( default_headers: Mapping[str, str] | None = None, client: AsyncAzureOpenAI | None = None, instruction_role: str | None = None, + **kwargs: Any, ) -> None: """Internal class for configuring a connection to an Azure OpenAI service. @@ -47,7 +50,7 @@ def __init__( deployment_name (str): Name of the deployment. ai_model_type (OpenAIModelTypes): The type of OpenAI model to deploy. endpoint (HttpsUrl): The specific endpoint URL for the deployment. (Optional) - base_url (HttpsUrl): The base URL for Azure services. (Optional) + base_url (Url): The base URL for Azure services. (Optional) api_version (str): Azure API version. 
Defaults to the defined DEFAULT_AZURE_API_VERSION. service_id (str): Service ID for the deployment. (Optional) api_key (str): API key for Azure services. (Optional) @@ -59,6 +62,7 @@ def __init__( client (AsyncAzureOpenAI): An existing client to use. (Optional) instruction_role (str | None): The role to use for 'instruction' messages, for example, summarization prompts could use `developer` or `system`. (Optional) + kwargs: Additional keyword arguments. """ # Merge APP_INFO into the headers if it exists @@ -79,18 +83,29 @@ def __init__( "Please provide either api_key, ad_token or ad_token_provider or a client." ) - if not base_url: - if not endpoint: - raise ServiceInitializationError("Please provide an endpoint or a base_url") - base_url = HttpsUrl(f"{str(endpoint).rstrip('/')}/openai/deployments/{deployment_name}") - client = AsyncAzureOpenAI( - base_url=str(base_url), - api_version=api_version, - api_key=api_key, - azure_ad_token=ad_token, - azure_ad_token_provider=ad_token_provider, - default_headers=merged_headers, - ) + if not endpoint and not base_url: + raise ServiceInitializationError("Please provide an endpoint or a base_url") + + args: dict[str, Any] = { + "default_headers": merged_headers, + } + if api_version: + args["api_version"] = api_version + if ad_token: + args["azure_ad_token"] = ad_token + if ad_token_provider: + args["azure_ad_token_provider"] = ad_token_provider + if api_key: + args["api_key"] = api_key + if base_url: + args["base_url"] = str(base_url) + if endpoint and not base_url: + args["azure_endpoint"] = str(endpoint) + # TODO (eavanvalkenburg): Remove the check on model type when the package fixes: https://github.com/openai/openai-python/issues/2120 + if deployment_name and ai_model_type != OpenAIModelTypes.REALTIME: + args["azure_deployment"] = deployment_name + + client = AsyncAzureOpenAI(**args) args = { "ai_model_id": deployment_name, "client": client, @@ -100,7 +115,7 @@ def __init__( args["service_id"] = service_id if instruction_role: args["instruction_role"] = instruction_role - super().__init__(**args) + super().__init__(**args, **kwargs) def to_dict(self) -> dict[str, str]: """Convert the configuration to a dictionary.""" diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py b/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py new file mode 100644 index 000000000000..39e5690fb3c1 --- /dev/null +++ b/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py @@ -0,0 +1,116 @@ +# Copyright (c) Microsoft. All rights reserved. 
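+
+# Usage sketch (illustrative; see the realtime samples for complete code):
+#
+#     client = AzureRealtimeWebsocket()  # settings resolved from env vars / .env
+#     await client.create_session(
+#         settings=AzureRealtimeExecutionSettings(voice="alloy"),  # placeholder voice
+#     )
+#     async for event in client.receive():
+#         ...  # handle RealtimeEvents: audio, text, function calls
+#     await client.close_session()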
+ +import sys +from collections.abc import Callable, Coroutine, Mapping +from typing import Any + +if sys.version_info >= (3, 12): + from typing import override # pragma: no cover +else: + from typing_extensions import override # pragma: no cover + +from numpy import ndarray +from openai import AsyncAzureOpenAI +from openai.lib.azure import AsyncAzureADTokenProvider +from pydantic import ValidationError + +from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( + AzureRealtimeExecutionSettings, +) +from semantic_kernel.connectors.ai.open_ai.services.azure_config_base import AzureOpenAIConfigBase +from semantic_kernel.connectors.ai.open_ai.services.open_ai_model_types import OpenAIModelTypes +from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime import OpenAIRealtimeWebsocketBase +from semantic_kernel.connectors.ai.open_ai.settings.azure_open_ai_settings import AzureOpenAISettings +from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings +from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError +from semantic_kernel.utils.feature_stage_decorator import experimental + + +@experimental +class AzureRealtimeWebsocket(OpenAIRealtimeWebsocketBase, AzureOpenAIConfigBase): + """Azure OpenAI Realtime service using WebSocket protocol.""" + + def __init__( + self, + audio_output_callback: Callable[[ndarray], Coroutine[Any, Any, None]] | None = None, + service_id: str | None = None, + api_key: str | None = None, + deployment_name: str | None = None, + endpoint: str | None = None, + base_url: str | None = None, + api_version: str | None = None, + ad_token: str | None = None, + ad_token_provider: AsyncAzureADTokenProvider | None = None, + token_endpoint: str | None = None, + default_headers: Mapping[str, str] | None = None, + async_client: AsyncAzureOpenAI | None = None, + env_file_path: str | None = None, + env_file_encoding: str | None = None, + **kwargs: Any, + ) -> None: + """Initialize an AzureRealtimeWebsocket service. + + Args: + audio_output_callback: The audio output callback, optional. + This should be a coroutine, that takes a ndarray with audio as input. + The goal of this function is to allow you to play the audio with the + least amount of latency possible, because it is called first before further processing. + It can also be set in the `receive` method. + Even when passed, the audio content will still be + added to the receiving queue. + service_id: The service ID for the Azure deployment. (Optional) + api_key: The optional api key. If provided, will override the value in the + env vars or .env file. + deployment_name: The optional deployment. If provided, will override the value + (chat_deployment_name) in the env vars or .env file. + endpoint: The optional deployment endpoint. If provided will override the value + in the env vars or .env file. + base_url: The optional deployment base_url. If provided will override the value + in the env vars or .env file. + api_version: The optional deployment api version. If provided will override the value + in the env vars or .env file. + ad_token: The Azure Active Directory token. (Optional) + ad_token_provider: The Azure Active Directory token provider. (Optional) + token_endpoint: The token endpoint to request an Azure token. (Optional) + default_headers: The default headers mapping of string keys to + string values for HTTP requests. (Optional) + async_client: An existing client to use. 
(Optional) + env_file_path: Use the environment settings file as a fallback to + environment variables. (Optional) + env_file_encoding: The encoding of the environment settings file. (Optional) + kwargs: Additional arguments. + """ + try: + azure_openai_settings = AzureOpenAISettings.create( + api_key=api_key, + base_url=base_url, + endpoint=endpoint, + realtime_deployment_name=deployment_name, + api_version=api_version, + token_endpoint=token_endpoint, + env_file_path=env_file_path, + env_file_encoding=env_file_encoding, + ) + except ValidationError as ex: + raise ServiceInitializationError("Failed to create OpenAI settings.", ex) from ex + if not azure_openai_settings.realtime_deployment_name: + raise ServiceInitializationError("The OpenAI realtime model ID is required.") + super().__init__( + audio_output_callback=audio_output_callback, + deployment_name=azure_openai_settings.realtime_deployment_name, + endpoint=azure_openai_settings.endpoint, + base_url=azure_openai_settings.base_url, + api_version=azure_openai_settings.api_version, + ad_token=ad_token, + ad_token_provider=ad_token_provider, + token_endpoint=azure_openai_settings.token_endpoint, + ai_model_type=OpenAIModelTypes.REALTIME, + service_id=service_id, + default_headers=default_headers, + client=async_client, + **kwargs, + ) + + @override + def get_prompt_execution_settings_class(self) -> type[PromptExecutionSettings]: + return AzureRealtimeExecutionSettings diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_config_base.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_config_base.py index d3d72795665b..7883be04f4ff 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_config_base.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_config_base.py @@ -3,6 +3,7 @@ import logging from collections.abc import Mapping from copy import copy +from typing import Any from openai import AsyncOpenAI from pydantic import ConfigDict, Field, validate_call @@ -30,6 +31,7 @@ def __init__( default_headers: Mapping[str, str] | None = None, client: AsyncOpenAI | None = None, instruction_role: str | None = None, + **kwargs: Any, ) -> None: """Initialize a client for OpenAI services. @@ -51,6 +53,7 @@ def __init__( client (AsyncOpenAI): An existing OpenAI client, optional. instruction_role (str): The role to use for 'instruction' messages, for example, summarization prompts could use `developer` or `system`. (Optional) + kwargs: Additional keyword arguments. 
""" # Merge APP_INFO into the headers if it exists @@ -76,7 +79,7 @@ def __init__( args["service_id"] = service_id if instruction_role: args["instruction_role"] = instruction_role - super().__init__(**args) + super().__init__(**args, **kwargs) def to_dict(self) -> dict[str, str]: """Create a dict of the service settings.""" diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_model_types.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_model_types.py index 7a1f43da234e..ea2e05deead7 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_model_types.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_model_types.py @@ -12,3 +12,4 @@ class OpenAIModelTypes(Enum): TEXT_TO_IMAGE = "text-to-image" AUDIO_TO_TEXT = "audio-to-text" TEXT_TO_AUDIO = "text-to-audio" + REALTIME = "realtime" diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py new file mode 100644 index 000000000000..d6422066394b --- /dev/null +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py @@ -0,0 +1,1024 @@ +# Copyright (c) Microsoft. All rights reserved. + +import asyncio +import base64 +import contextlib +import json +import logging +import sys +from collections.abc import AsyncGenerator, Callable, Coroutine, Mapping +from enum import Enum +from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast + +if sys.version_info >= (3, 12): + from typing import override # pragma: no cover +else: + from typing_extensions import override # pragma: no cover + +import numpy as np +from aiohttp import ClientSession +from aiortc import ( + MediaStreamTrack, + RTCConfiguration, + RTCDataChannel, + RTCIceServer, + RTCPeerConnection, + RTCSessionDescription, +) +from av.audio.frame import AudioFrame +from numpy import ndarray +from openai import AsyncOpenAI +from openai._models import construct_type_unchecked +from openai.resources.beta.realtime.realtime import AsyncRealtimeConnection +from openai.types.beta.realtime import ( + ConversationItemCreateEvent, + ConversationItemDeleteEvent, + ConversationItemTruncateEvent, + InputAudioBufferAppendEvent, + InputAudioBufferClearEvent, + InputAudioBufferCommitEvent, + RealtimeClientEvent, + RealtimeServerEvent, + ResponseCancelEvent, + ResponseCreateEvent, + ResponseFunctionCallArgumentsDoneEvent, + SessionUpdateEvent, +) +from openai.types.beta.realtime.response_create_event import Response +from pydantic import Field, PrivateAttr, ValidationError + +from semantic_kernel.connectors.ai.function_call_choice_configuration import FunctionCallChoiceConfiguration +from semantic_kernel.connectors.ai.function_calling_utils import ( + prepare_settings_for_function_calling, +) +from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceType +from semantic_kernel.connectors.ai.open_ai.services.open_ai_config_base import OpenAIConfigBase +from semantic_kernel.connectors.ai.open_ai.services.open_ai_handler import OpenAIHandler +from semantic_kernel.connectors.ai.open_ai.services.open_ai_model_types import OpenAIModelTypes +from semantic_kernel.connectors.ai.open_ai.settings.open_ai_settings import OpenAISettings +from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings +from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase +from semantic_kernel.contents.audio_content import AudioContent +from 
semantic_kernel.contents.chat_history import ChatHistory +from semantic_kernel.contents.chat_message_content import ChatMessageContent +from semantic_kernel.contents.function_call_content import FunctionCallContent +from semantic_kernel.contents.function_result_content import FunctionResultContent +from semantic_kernel.contents.realtime_events import ( + RealtimeAudioEvent, + RealtimeEvent, + RealtimeEvents, + RealtimeFunctionCallEvent, + RealtimeFunctionResultEvent, + RealtimeTextEvent, +) +from semantic_kernel.contents.streaming_text_content import StreamingTextContent +from semantic_kernel.contents.text_content import TextContent +from semantic_kernel.exceptions import ContentException +from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError +from semantic_kernel.kernel import Kernel +from semantic_kernel.utils.feature_stage_decorator import experimental + +if TYPE_CHECKING: + from aiortc.mediastreams import MediaStreamTrack + + from semantic_kernel.connectors.ai.function_choice_behavior import ( + FunctionCallChoiceConfiguration, + FunctionChoiceType, + ) + from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings + from semantic_kernel.contents.chat_history import ChatHistory + from semantic_kernel.functions.kernel_function_metadata import KernelFunctionMetadata + +logger: logging.Logger = logging.getLogger(__name__) + +# region constants + + +@experimental +class SendEvents(str, Enum): + """Events that can be sent.""" + + SESSION_UPDATE = "session.update" + INPUT_AUDIO_BUFFER_APPEND = "input_audio_buffer.append" + INPUT_AUDIO_BUFFER_COMMIT = "input_audio_buffer.commit" + INPUT_AUDIO_BUFFER_CLEAR = "input_audio_buffer.clear" + CONVERSATION_ITEM_CREATE = "conversation.item.create" + CONVERSATION_ITEM_TRUNCATE = "conversation.item.truncate" + CONVERSATION_ITEM_DELETE = "conversation.item.delete" + RESPONSE_CREATE = "response.create" + RESPONSE_CANCEL = "response.cancel" + + +@experimental +class ListenEvents(str, Enum): + """Events that can be listened to.""" + + ERROR = "error" + SESSION_CREATED = "session.created" + SESSION_UPDATED = "session.updated" + CONVERSATION_CREATED = "conversation.created" + INPUT_AUDIO_BUFFER_COMMITTED = "input_audio_buffer.committed" + INPUT_AUDIO_BUFFER_CLEARED = "input_audio_buffer.cleared" + INPUT_AUDIO_BUFFER_SPEECH_STARTED = "input_audio_buffer.speech_started" + INPUT_AUDIO_BUFFER_SPEECH_STOPPED = "input_audio_buffer.speech_stopped" + CONVERSATION_ITEM_CREATED = "conversation.item.created" + CONVERSATION_ITEM_INPUT_AUDIO_TRANSCRIPTION_COMPLETED = "conversation.item.input_audio_transcription.completed" + CONVERSATION_ITEM_INPUT_AUDIO_TRANSCRIPTION_FAILED = "conversation.item.input_audio_transcription.failed" + CONVERSATION_ITEM_TRUNCATED = "conversation.item.truncated" + CONVERSATION_ITEM_DELETED = "conversation.item.deleted" + RESPONSE_CREATED = "response.created" + RESPONSE_DONE = "response.done" # contains usage info -> log + RESPONSE_OUTPUT_ITEM_ADDED = "response.output_item.added" + RESPONSE_OUTPUT_ITEM_DONE = "response.output_item.done" + RESPONSE_CONTENT_PART_ADDED = "response.content_part.added" + RESPONSE_CONTENT_PART_DONE = "response.content_part.done" + RESPONSE_TEXT_DELTA = "response.text.delta" + RESPONSE_TEXT_DONE = "response.text.done" + RESPONSE_AUDIO_TRANSCRIPT_DELTA = "response.audio_transcript.delta" + RESPONSE_AUDIO_TRANSCRIPT_DONE = "response.audio_transcript.done" + RESPONSE_AUDIO_DELTA = "response.audio.delta" + RESPONSE_AUDIO_DONE = "response.audio.done" + 
RESPONSE_FUNCTION_CALL_ARGUMENTS_DELTA = "response.function_call_arguments.delta" + RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE = "response.function_call_arguments.done" + RATE_LIMITS_UPDATED = "rate_limits.updated" + + +# region utils + + +def update_settings_from_function_call_configuration( + function_choice_configuration: "FunctionCallChoiceConfiguration", + settings: "PromptExecutionSettings", + type: "FunctionChoiceType", +) -> None: + """Update the settings from a FunctionChoiceConfiguration.""" + if ( + function_choice_configuration.available_functions + and hasattr(settings, "tool_choice") + and hasattr(settings, "tools") + ): + settings.tool_choice = type # type: ignore + settings.tools = [ # type: ignore + kernel_function_metadata_to_function_call_format(f) + for f in function_choice_configuration.available_functions + ] + + +def kernel_function_metadata_to_function_call_format( + metadata: "KernelFunctionMetadata", +) -> dict[str, Any]: + """Convert the kernel function metadata to function calling format. + + Function calling in the realtime API, uses a slightly different format than the chat completion API. + See https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-tools + for more details. + + TLDR: there is no "function" key, and the function details are at the same level as "type". + """ + return { + "type": "function", + "name": metadata.fully_qualified_name, + "description": metadata.description or "", + "parameters": { + "type": "object", + "properties": { + param.name: param.schema_data for param in metadata.parameters if param.include_in_function_choices + }, + "required": [p.name for p in metadata.parameters if p.is_required and p.include_in_function_choices], + }, + } + + +def _create_openai_realtime_client_event(event_type: SendEvents, **kwargs: Any) -> RealtimeClientEvent: + """Create an OpenAI Realtime client event from a event type and kwargs.""" + match event_type: + case SendEvents.SESSION_UPDATE: + if "session" not in kwargs: + raise ContentException("Session is required for SessionUpdateEvent") + return SessionUpdateEvent( + type=event_type, + session=kwargs.pop("session"), + **kwargs, + ) + case SendEvents.INPUT_AUDIO_BUFFER_APPEND: + if "audio" not in kwargs: + raise ContentException("Audio is required for InputAudioBufferAppendEvent") + return InputAudioBufferAppendEvent( + type=event_type, + **kwargs, + ) + case SendEvents.INPUT_AUDIO_BUFFER_COMMIT: + return InputAudioBufferCommitEvent( + type=event_type, + **kwargs, + ) + case SendEvents.INPUT_AUDIO_BUFFER_CLEAR: + return InputAudioBufferClearEvent( + type=event_type, + **kwargs, + ) + case SendEvents.CONVERSATION_ITEM_CREATE: + if "item" not in kwargs: + raise ContentException("Item is required for ConversationItemCreateEvent") + kwargs["type"] = event_type + return ConversationItemCreateEvent(**kwargs) + case SendEvents.CONVERSATION_ITEM_TRUNCATE: + if "content_index" not in kwargs: + kwargs["content_index"] = 0 + return ConversationItemTruncateEvent( + type=event_type, + **kwargs, + ) + case SendEvents.CONVERSATION_ITEM_DELETE: + if "item_id" not in kwargs: + raise ContentException("Item ID is required for ConversationItemDeleteEvent") + return ConversationItemDeleteEvent( + type=event_type, + **kwargs, + ) + case SendEvents.RESPONSE_CREATE: + if "response" in kwargs: + response: Response | None = Response.model_validate(kwargs.pop("response")) + else: + response = None + return ResponseCreateEvent( + type=event_type, + response=response, + **kwargs, + ) + case 
SendEvents.RESPONSE_CANCEL: + return ResponseCancelEvent( + type=event_type, + **kwargs, + ) + + +# region Base + + +@experimental +class OpenAIRealtimeBase(OpenAIHandler, RealtimeClientBase): + """OpenAI Realtime service.""" + + SUPPORTS_FUNCTION_CALLING: ClassVar[bool] = True + kernel: Kernel | None = None + + _current_settings: PromptExecutionSettings | None = PrivateAttr(default=None) + _call_id_to_function_map: dict[str, str] = PrivateAttr(default_factory=dict) + + async def _parse_event(self, event: RealtimeServerEvent) -> AsyncGenerator[RealtimeEvents, None]: + """Handle all events but audio delta. + + Audio delta has to be handled by the implementation of the protocol as some + protocols have different ways of handling audio. + + We put all event in the output buffer, but after the interpreted one. + so when dealing with them, make sure to check the type of the event, since they + might be of different types. + """ + match event.type: + case ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA.value: + yield RealtimeTextEvent( + service_type=event.type, + service_event=event, + text=StreamingTextContent( + inner_content=event, + text=event.delta, # type: ignore + choice_index=0, + ), + ) + case ListenEvents.RESPONSE_OUTPUT_ITEM_ADDED.value: + if event.item.type == "function_call" and event.item.call_id and event.item.name: # type: ignore + self._call_id_to_function_map[event.item.call_id] = event.item.name # type: ignore + yield RealtimeEvent(service_type=event.type, service_event=event) + case ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DELTA.value: + yield RealtimeFunctionCallEvent( + service_type=event.type, + service_event=event, + function_call=FunctionCallContent( + id=event.item_id, # type: ignore + name=self._call_id_to_function_map[event.call_id], # type: ignore + arguments=event.delta, # type: ignore + index=event.output_index, # type: ignore + metadata={"call_id": event.call_id}, # type: ignore + inner_content=event, + ), + ) + case ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE.value: + async for parsed_event in self._parse_function_call_arguments_done(event): # type: ignore + if parsed_event: + yield parsed_event + case ListenEvents.ERROR.value: + logger.error("Error received: %s", event.error.model_dump_json()) # type: ignore + yield RealtimeEvent(service_type=event.type, service_event=event) + case ListenEvents.SESSION_CREATED.value | ListenEvents.SESSION_UPDATED.value: + logger.info("Session created or updated, session: %s", event.session.model_dump_json()) # type: ignore + yield RealtimeEvent(service_type=event.type, service_event=event) + case _: + logger.debug(f"Received event: {event}") + yield RealtimeEvent(service_type=event.type, service_event=event) + + @override + async def update_session( + self, + chat_history: ChatHistory | None = None, + settings: PromptExecutionSettings | None = None, + create_response: bool = False, + **kwargs: Any, + ) -> None: + """Update the session in the service. + + Args: + chat_history: Chat history. + settings: Prompt execution settings, if kernel is linked to the service or passed as + Kwargs, it will be used to update the settings for function calling. + create_response: Create a response, get the model to start responding, default is False. + kwargs: Additional arguments, if 'kernel' is passed, it will be used to update the + settings for function calling, others will be ignored. 
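+
+        Example (illustrative):
+
+            await client.update_session(
+                settings=OpenAIRealtimeExecutionSettings(voice="alloy"),  # placeholder voice
+                create_response=True,
+            )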
+ + """ + if kwargs: + if self._create_kwargs: + kwargs = {**self._create_kwargs, **kwargs} + else: + kwargs = self._create_kwargs or {} + if settings: + self._current_settings = settings + if "kernel" in kwargs: + self.kernel = kwargs["kernel"] + + if self._current_settings: + if self.kernel: + self._current_settings = prepare_settings_for_function_calling( + self._current_settings, + self.get_prompt_execution_settings_class(), + self._update_function_choice_settings_callback(), + kernel=self.kernel, # type: ignore + ) + await self.send( + RealtimeEvent( + service_type=SendEvents.SESSION_UPDATE, + service_event={"settings": self._current_settings}, + ) + ) + + if chat_history and len(chat_history) > 0: + for msg in chat_history.messages: + for item in msg.items: + match item: + case TextContent(): + await self.send( + RealtimeTextEvent(service_type=SendEvents.CONVERSATION_ITEM_CREATE, text=item) + ) + case FunctionCallContent(): + await self.send( + RealtimeFunctionCallEvent( + service_type=SendEvents.CONVERSATION_ITEM_CREATE, function_call=item + ) + ) + case FunctionResultContent(): + await self.send( + RealtimeFunctionResultEvent( + service_type=SendEvents.CONVERSATION_ITEM_CREATE, function_result=item + ) + ) + case _: + logger.error("Unsupported item type: %s", item) + + if create_response or kwargs.get("create_response", False) is True: + await self.send(RealtimeEvent(service_type=SendEvents.RESPONSE_CREATE)) + + async def _parse_function_call_arguments_done( + self, + event: ResponseFunctionCallArgumentsDoneEvent, + ) -> AsyncGenerator[RealtimeEvents | None]: + """Handle response function call done. + + This always yields at least 1 event, either a RealtimeEvent or a RealtimeFunctionResultEvent with the raw event. + + It then also yields any function results both back to the service, through `send` and to the developer. + + """ + # Step 1: check if function calling enabled: + if not self.kernel or ( + self._current_settings + and self._current_settings.function_choice_behavior + and not self._current_settings.function_choice_behavior.auto_invoke_kernel_functions + ): + yield RealtimeEvent(service_type=event.type, service_event=event) + return + # Step 2: check if there is a function that can be found. + try: + plugin_name, function_name = self._call_id_to_function_map.pop(event.call_id, "-").split("-", 1) + except ValueError: + logger.error("Function call needs to have a plugin name and function name") + yield RealtimeEvent(service_type=event.type, service_event=event) + return + + # Step 3: Parse into the function call content, and yield that. 
+ item = FunctionCallContent( + id=event.item_id, + plugin_name=plugin_name, + function_name=function_name, + arguments=event.arguments, + index=event.output_index, + metadata={"call_id": event.call_id}, + ) + yield RealtimeFunctionCallEvent( + service_type=ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE, function_call=item, service_event=event + ) + + # Step 4: Invoke the function call + chat_history = ChatHistory() + await self.kernel.invoke_function_call(item, chat_history) + created_output: FunctionResultContent = chat_history.messages[-1].items[0] # type: ignore + # Step 5: Create the function result event + result = RealtimeFunctionResultEvent( + service_type=SendEvents.CONVERSATION_ITEM_CREATE, + function_result=created_output, + ) + # Step 6: send the result to the service and call `create response` + await self.send(result) + await self.send(RealtimeEvent(service_type=SendEvents.RESPONSE_CREATE)) + # Step 7: yield the function result back to the developer as well + yield result + + async def _send(self, event: RealtimeClientEvent) -> None: + """Send an event to the service.""" + raise NotImplementedError + + @override + async def send(self, event: RealtimeEvents, **kwargs: Any) -> None: + match event: + case RealtimeAudioEvent(): + await self._send( + _create_openai_realtime_client_event( + event_type=SendEvents.INPUT_AUDIO_BUFFER_APPEND, audio=event.audio.data_string + ) + ) + case RealtimeTextEvent(): + await self._send( + _create_openai_realtime_client_event( + event_type=SendEvents.CONVERSATION_ITEM_CREATE, + item={ + "type": "message", + "content": [ + { + "type": "input_text", + "text": event.text.text, + } + ], + "role": "user", + }, + ) + ) + case RealtimeFunctionCallEvent(): + await self._send( + _create_openai_realtime_client_event( + event_type=SendEvents.CONVERSATION_ITEM_CREATE, + item={ + "type": "function_call", + "name": event.function_call.name or event.function_call.function_name, + "arguments": "" + if not event.function_call.arguments + else event.function_call.arguments + if isinstance(event.function_call.arguments, str) + else json.dumps(event.function_call.arguments), + "call_id": event.function_call.metadata.get("call_id"), + }, + ) + ) + case RealtimeFunctionResultEvent(): + await self._send( + _create_openai_realtime_client_event( + event_type=SendEvents.CONVERSATION_ITEM_CREATE, + item={ + "type": "function_call_output", + "output": event.function_result.result, + "call_id": event.function_result.metadata.get("call_id"), + }, + ) + ) + case _: + data = event.service_event + match event.service_type: + case SendEvents.SESSION_UPDATE: + if not data: + logger.error("Event data is empty") + return + settings = data.get("settings", None) + if not settings: + logger.error("Event data does not contain 'settings'") + return + try: + settings = self.get_prompt_execution_settings_from_settings(settings) + except Exception as e: + logger.error( + f"Failed to properly create settings from passed settings: {settings}, error: {e}" + ) + return + assert isinstance(settings, self.get_prompt_execution_settings_class()) # nosec + if not settings.ai_model_id: # type: ignore + settings.ai_model_id = self.ai_model_id # type: ignore + await self._send( + _create_openai_realtime_client_event( + event_type=event.service_type, + session=settings.prepare_settings_dict(), + ) + ) + case SendEvents.INPUT_AUDIO_BUFFER_APPEND: + if not data or "audio" not in data: + logger.error("Event data does not contain 'audio'") + return + await self._send( + 
_create_openai_realtime_client_event( + event_type=event.service_type, + audio=data["audio"], + ) + ) + case SendEvents.INPUT_AUDIO_BUFFER_COMMIT: + await self._send(_create_openai_realtime_client_event(event_type=event.service_type)) + case SendEvents.INPUT_AUDIO_BUFFER_CLEAR: + await self._send(_create_openai_realtime_client_event(event_type=event.service_type)) + case SendEvents.CONVERSATION_ITEM_CREATE: + if not data or "item" not in data: + logger.error("Event data does not contain 'item'") + return + content = data["item"] + contents = content.items if isinstance(content, ChatMessageContent) else [content] + for item in contents: + match item: + case TextContent(): + await self._send( + _create_openai_realtime_client_event( + event_type=event.service_type, + item={ + "type": "message", + "content": [ + { + "type": "input_text", + "text": item.text, + } + ], + "role": "user", + }, + ) + ) + case FunctionCallContent(): + await self._send( + _create_openai_realtime_client_event( + event_type=event.service_type, + item={ + "type": "function_call", + "name": item.name or item.function_name, + "arguments": "" + if not item.arguments + else item.arguments + if isinstance(item.arguments, str) + else json.dumps(item.arguments), + "call_id": item.metadata.get("call_id"), + }, + ) + ) + + case FunctionResultContent(): + await self._send( + _create_openai_realtime_client_event( + event_type=event.service_type, + item={ + "type": "function_call_output", + "output": item.result, + "call_id": item.metadata.get("call_id"), + }, + ) + ) + case SendEvents.CONVERSATION_ITEM_TRUNCATE: + if not data or "item_id" not in data: + logger.error("Event data does not contain 'item_id'") + return + await self._send( + _create_openai_realtime_client_event( + event_type=event.service_type, + item_id=data["item_id"], + content_index=0, + audio_end_ms=data.get("audio_end_ms", 0), + ) + ) + case SendEvents.CONVERSATION_ITEM_DELETE: + if not data or "item_id" not in data: + logger.error("Event data does not contain 'item_id'") + return + await self._send( + _create_openai_realtime_client_event( + event_type=event.service_type, + item_id=data["item_id"], + ) + ) + case SendEvents.RESPONSE_CREATE: + await self._send( + _create_openai_realtime_client_event( + event_type=event.service_type, event_id=data.get("event_id", None) if data else None + ) + ) + case SendEvents.RESPONSE_CANCEL: + await self._send( + _create_openai_realtime_client_event( + event_type=event.service_type, + response_id=data.get("response_id", None) if data else None, + ) + ) + + @override + def get_prompt_execution_settings_class(self) -> type["PromptExecutionSettings"]: + from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( # noqa + OpenAIRealtimeExecutionSettings, + ) + + return OpenAIRealtimeExecutionSettings + + @override + def _update_function_choice_settings_callback( + self, + ) -> Callable[[FunctionCallChoiceConfiguration, "PromptExecutionSettings", FunctionChoiceType], None]: + return update_settings_from_function_call_configuration + + +# region WebRTC +@experimental +class OpenAIRealtimeWebRTCBase(OpenAIRealtimeBase): + """OpenAI WebRTC Realtime service.""" + + peer_connection: RTCPeerConnection | None = None + data_channel: RTCDataChannel | None = None + audio_track: MediaStreamTrack | None = None + _receive_buffer: asyncio.Queue[RealtimeEvents] = PrivateAttr(default_factory=asyncio.Queue) + + @override + async def receive( + self, + audio_output_callback: Callable[[ndarray], 
Coroutine[Any, Any, None]] | None = None, + **kwargs: Any, + ) -> AsyncGenerator[RealtimeEvents, None]: + if audio_output_callback: + self.audio_output_callback = audio_output_callback + while True: + event = await self._receive_buffer.get() + yield event + + async def _send(self, event: RealtimeClientEvent) -> None: + if not self.data_channel: + logger.error("Data channel not initialized") + return + while self.data_channel.readyState != "open": + await asyncio.sleep(0.1) + try: + self.data_channel.send(event.model_dump_json(exclude_none=True)) + except Exception as e: + logger.error(f"Failed to send event {event} with error: {e!s}") + + @override + async def create_session( + self, + chat_history: "ChatHistory | None" = None, + settings: "PromptExecutionSettings | None" = None, + **kwargs: Any, + ) -> None: + """Create a session in the service.""" + if not self.audio_track: + raise Exception("Audio track not initialized") + self.peer_connection = RTCPeerConnection( + configuration=RTCConfiguration(iceServers=[RTCIceServer(urls="stun:stun.l.google.com:19302")]) + ) + + # track is the audio track being returned from the service + self.peer_connection.add_listener("track", self._on_track) + + # data channel is used to send and receive messages + self.data_channel = self.peer_connection.createDataChannel("oai-events", protocol="json") + self.data_channel.add_listener("message", self._on_data) + + # this is the incoming audio, which sends audio to the service + self.peer_connection.addTransceiver(self.audio_track) + + offer = await self.peer_connection.createOffer() + await self.peer_connection.setLocalDescription(offer) + + try: + ephemeral_token = await self._get_ephemeral_token() + headers = {"Authorization": f"Bearer {ephemeral_token}", "Content-Type": "application/sdp"} + + async with ( + ClientSession() as session, + session.post( + f"{self.client.beta.realtime._client.base_url}realtime?model={self.ai_model_id}", + headers=headers, + data=offer.sdp, + ) as response, + ): + if response.status not in [200, 201]: + error_text = await response.text() + raise Exception(f"OpenAI WebRTC error: {error_text}") + + sdp_answer = await response.text() + answer = RTCSessionDescription(sdp=sdp_answer, type="answer") + await self.peer_connection.setRemoteDescription(answer) + logger.info("Connected to OpenAI WebRTC") + + except Exception as e: + logger.error(f"Failed to connect to OpenAI: {e!s}") + raise + + if settings or chat_history or kwargs: + await self.update_session(settings=settings, chat_history=chat_history, **kwargs) + + @override + async def close_session(self) -> None: + """Close the session in the service.""" + if self.peer_connection: + with contextlib.suppress(asyncio.CancelledError): + await self.peer_connection.close() + self.peer_connection = None + if self.data_channel: + with contextlib.suppress(asyncio.CancelledError): + self.data_channel.close() + self.data_channel = None + + async def _on_track(self, track: "MediaStreamTrack") -> None: + logger.debug(f"Received {track.kind} track from remote") + if track.kind != "audio": + return + while True: + try: + # This is a MediaStreamTrack, so the type is AudioFrame + # this might need to be updated if video becomes part of this + frame: AudioFrame = await track.recv() # type: ignore + except asyncio.CancelledError: + break + except Exception as e: + logger.error(f"Error getting audio frame: {e!s}") + break + + try: + if self.audio_output_callback: + await self.audio_output_callback(frame.to_ndarray()) + + except Exception as e: + 
logger.error(f"Error playing remote audio frame: {e!s}") + try: + await self._receive_buffer.put( + RealtimeAudioEvent( + audio=AudioContent(data=frame.to_ndarray(), data_format="np.int16", inner_content=frame), + service_event=frame, + service_type=ListenEvents.RESPONSE_AUDIO_DELTA, + ), + ) + except Exception as e: + logger.error(f"Error processing remote audio frame: {e!s}") + await asyncio.sleep(0.01) + + async def _on_data(self, data: str) -> None: + """This method is called whenever a data channel message is received. + + The data is parsed into a RealtimeServerEvent (by OpenAI code) and then processed. + Audio data is not send through this channel, use _on_track for that. + """ + try: + event = cast( + RealtimeServerEvent, + construct_type_unchecked(value=json.loads(data), type_=cast(Any, RealtimeServerEvent)), + ) + except Exception as e: + logger.error(f"Failed to parse event {data} with error: {e!s}") + return + async for parsed_event in self._parse_event(event): + await self._receive_buffer.put(parsed_event) + + async def _get_ephemeral_token(self) -> str: + """Get an ephemeral token from OpenAI.""" + headers = {"Authorization": f"Bearer {self.client.api_key}", "Content-Type": "application/json"} + data = {"model": self.ai_model_id, "voice": "echo"} + + try: + async with ( + ClientSession() as session, + session.post( + f"{self.client.beta.realtime._client.base_url}/realtime/sessions", headers=headers, json=data + ) as response, + ): + if response.status not in [200, 201]: + error_text = await response.text() + raise Exception(f"Failed to get ephemeral token: {error_text}") + + result = await response.json() + return result["client_secret"]["value"] + + except Exception as e: + logger.error(f"Failed to get ephemeral token: {e!s}") + raise + + +@experimental +class OpenAIRealtimeWebRTC(OpenAIRealtimeWebRTCBase, OpenAIConfigBase): + """OpenAI Realtime service using WebRTC protocol.""" + + def __init__( + self, + audio_track: "MediaStreamTrack", + audio_output_callback: Callable[[ndarray], Coroutine[Any, Any, None]] | None = None, + ai_model_id: str | None = None, + api_key: str | None = None, + org_id: str | None = None, + service_id: str | None = None, + default_headers: Mapping[str, str] | None = None, + client: AsyncOpenAI | None = None, + env_file_path: str | None = None, + env_file_encoding: str | None = None, + **kwargs: Any, + ) -> None: + """Initialize an OpenAIRealtime service. + + Args: + audio_output_callback: The audio output callback, optional. + This should be a coroutine, that takes a ndarray with audio as input. + The goal of this function is to allow you to play the audio with the + least amount of latency possible, because it is called first before further processing. + It can also be set in the `receive` method. + Even when passed, the audio content will still be + added to the receiving queue. + audio_track: The audio track to use for the service, only used by WebRTC. + A default is supplied if not provided. + It can be any class that implements the AudioStreamTrack interface. + ai_model_id (str | None): OpenAI model name, see + https://platform.openai.com/docs/models + service_id (str | None): Service ID tied to the execution settings. + api_key (str | None): The optional API key to use. If provided will override, + the env vars or .env file value. + org_id (str | None): The optional org ID to use. If provided will override, + the env vars or .env file value. + default_headers: The default headers mapping of string keys to + string values for HTTP requests. 
(Optional) + client (Optional[AsyncOpenAI]): An existing client to use. (Optional) + env_file_path (str | None): Use the environment settings file as a fallback to + environment variables. (Optional) + env_file_encoding (str | None): The encoding of the environment settings file. (Optional) + kwargs: Additional arguments. + """ + try: + openai_settings = OpenAISettings.create( + api_key=api_key, + org_id=org_id, + realtime_model_id=ai_model_id, + env_file_path=env_file_path, + env_file_encoding=env_file_encoding, + ) + except ValidationError as ex: + raise ServiceInitializationError("Failed to create OpenAI settings.", ex) from ex + if not openai_settings.realtime_model_id: + raise ServiceInitializationError("The OpenAI realtime model ID is required.") + if audio_track: + kwargs["audio_track"] = audio_track + super().__init__( + audio_output_callback=audio_output_callback, + ai_model_id=openai_settings.realtime_model_id, + service_id=service_id, + api_key=openai_settings.api_key.get_secret_value() if openai_settings.api_key else None, + org_id=openai_settings.org_id, + ai_model_type=OpenAIModelTypes.REALTIME, + default_headers=default_headers, + client=client, + **kwargs, + ) + + +# region Websocket + + +@experimental +class OpenAIRealtimeWebsocketBase(OpenAIRealtimeBase): + """OpenAI Realtime service.""" + + protocol: ClassVar[Literal["websocket"]] = "websocket" # type: ignore + connection: AsyncRealtimeConnection | None = None + connected: asyncio.Event = Field(default_factory=asyncio.Event) + + @override + async def receive( + self, + audio_output_callback: Callable[[ndarray], Coroutine[Any, Any, None]] | None = None, + **kwargs: Any, + ) -> AsyncGenerator[RealtimeEvents, None]: + if audio_output_callback: + self.audio_output_callback = audio_output_callback + await self.connected.wait() + if not self.connection: + raise ValueError("Connection is not established.") + + async for event in self.connection: + if event.type == ListenEvents.RESPONSE_AUDIO_DELTA.value: + if self.audio_output_callback: + await self.audio_output_callback(np.frombuffer(base64.b64decode(event.delta), dtype=np.int16)) + yield RealtimeAudioEvent( + audio=AudioContent(data=event.delta, data_format="base64", inner_content=event), + service_type=event.type, + service_event=event, + ) + continue + async for realtime_event in self._parse_event(event): + yield realtime_event + + async def _send(self, event: RealtimeClientEvent) -> None: + await self.connected.wait() + if not self.connection: + raise ValueError("Connection is not established.") + try: + await self.connection.send(event) + except Exception as e: + logger.error(f"Error sending response: {e!s}") + + @override + async def create_session( + self, + chat_history: "ChatHistory | None" = None, + settings: "PromptExecutionSettings | None" = None, + **kwargs: Any, + ) -> None: + """Create a session in the service.""" + self.connection = await self.client.beta.realtime.connect(model=self.ai_model_id).enter() + self.connected.set() + if settings or chat_history or kwargs: + await self.update_session(settings=settings, chat_history=chat_history, **kwargs) + + @override + async def close_session(self) -> None: + """Close the session in the service.""" + if self.connected.is_set() and self.connection: + await self.connection.close() + self.connection = None + self.connected.clear() + + +@experimental +class OpenAIRealtimeWebsocket(OpenAIRealtimeWebsocketBase, OpenAIConfigBase): + """OpenAI Realtime service using WebSocket protocol.""" + + def __init__( + self, + 
audio_output_callback: Callable[[ndarray], Coroutine[Any, Any, None]] | None = None,
+        ai_model_id: str | None = None,
+        api_key: str | None = None,
+        org_id: str | None = None,
+        service_id: str | None = None,
+        default_headers: Mapping[str, str] | None = None,
+        client: AsyncOpenAI | None = None,
+        env_file_path: str | None = None,
+        env_file_encoding: str | None = None,
+        **kwargs: Any,
+    ) -> None:
+        """Initialize an OpenAIRealtime service.
+
+        Args:
+            audio_output_callback: The audio output callback, optional.
+                This should be a coroutine that takes an ndarray with audio as input.
+                The goal of this function is to allow you to play the audio with as
+                little latency as possible, because it is called before any further processing.
+                It can also be set in the `receive` method.
+                Even when passed, the audio content will still be
+                added to the receiving queue.
+            ai_model_id (str | None): OpenAI model name, see
+                https://platform.openai.com/docs/models
+            service_id (str | None): Service ID tied to the execution settings.
+            api_key (str | None): The optional API key to use. If provided, it will override
+                the env vars or .env file value.
+            org_id (str | None): The optional org ID to use. If provided, it will override
+                the env vars or .env file value.
+            default_headers: The default headers mapping of string keys to
+                string values for HTTP requests. (Optional)
+            client (Optional[AsyncOpenAI]): An existing client to use. (Optional)
+            env_file_path (str | None): Use the environment settings file as a fallback to
+                environment variables. (Optional)
+            env_file_encoding (str | None): The encoding of the environment settings file. (Optional)
+            kwargs: Additional arguments.
+        """
+        try:
+            openai_settings = OpenAISettings.create(
+                api_key=api_key,
+                org_id=org_id,
+                realtime_model_id=ai_model_id,
+                env_file_path=env_file_path,
+                env_file_encoding=env_file_encoding,
+            )
+        except ValidationError as ex:
+            raise ServiceInitializationError("Failed to create OpenAI settings.", ex) from ex
+        if not openai_settings.realtime_model_id:
+            raise ServiceInitializationError("The OpenAI realtime model ID is required.")
+        super().__init__(
+            audio_output_callback=audio_output_callback,
+            ai_model_id=openai_settings.realtime_model_id,
+            service_id=service_id,
+            api_key=openai_settings.api_key.get_secret_value() if openai_settings.api_key else None,
+            org_id=openai_settings.org_id,
+            ai_model_type=OpenAIModelTypes.REALTIME,
+            default_headers=default_headers,
+            client=client,
+            **kwargs,
+        )
diff --git a/python/semantic_kernel/connectors/ai/open_ai/settings/azure_open_ai_settings.py b/python/semantic_kernel/connectors/ai/open_ai/settings/azure_open_ai_settings.py
index a943757048c5..47ebc4c2b7b7 100644
--- a/python/semantic_kernel/connectors/ai/open_ai/settings/azure_open_ai_settings.py
+++ b/python/semantic_kernel/connectors/ai/open_ai/settings/azure_open_ai_settings.py
@@ -3,6 +3,7 @@
 from typing import ClassVar
 
 from pydantic import SecretStr
+from pydantic_core import Url
 
 from semantic_kernel.connectors.ai.open_ai.const import DEFAULT_AZURE_API_VERSION
 from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError
@@ -55,6 +56,12 @@ class AzureOpenAISettings(KernelBaseSettings):
           Resource Management > Deployments in the Azure portal or, alternatively,
           under Management > Deployments in Azure OpenAI Studio.
           (Env var AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME)
+    - realtime_deployment_name: str - The name of the Azure Realtime deployment.
This value + will correspond to the custom name you chose for your deployment + when you deployed a model. This value can be found under + Resource Management > Deployments in the Azure portal or, alternatively, + under Management > Deployments in Azure OpenAI Studio. + (Env var AZURE_OPENAI_REALTIME_DEPLOYMENT_NAME) - api_key: SecretStr - The API key for the Azure deployment. This value can be found in the Keys & Endpoint section when examining your resource in the Azure portal. You can use either KEY1 or KEY2. @@ -85,8 +92,9 @@ class AzureOpenAISettings(KernelBaseSettings): text_to_image_deployment_name: str | None = None audio_to_text_deployment_name: str | None = None text_to_audio_deployment_name: str | None = None + realtime_deployment_name: str | None = None endpoint: HttpsUrl | None = None - base_url: HttpsUrl | None = None + base_url: Url | None = None api_key: SecretStr | None = None api_version: str = DEFAULT_AZURE_API_VERSION token_endpoint: str = "https://cognitiveservices.azure.com/.default" diff --git a/python/semantic_kernel/connectors/ai/open_ai/settings/open_ai_settings.py b/python/semantic_kernel/connectors/ai/open_ai/settings/open_ai_settings.py index 6423a5385a33..7276af4b1f3b 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/settings/open_ai_settings.py +++ b/python/semantic_kernel/connectors/ai/open_ai/settings/open_ai_settings.py @@ -32,6 +32,9 @@ class OpenAISettings(KernelBaseSettings): (Env var OPENAI_AUDIO_TO_TEXT_MODEL_ID) - text_to_audio_model_id: str | None - The OpenAI text to audio model ID to use, for example, jukebox-1. (Env var OPENAI_TEXT_TO_AUDIO_MODEL_ID) + - realtime_model_id: str | None - The OpenAI realtime model ID to use, + for example, gpt-4o-realtime-preview-2024-12-17. + (Env var OPENAI_REALTIME_MODEL_ID) - env_file_path: str | None - if provided, the .env settings are read from this file path location """ @@ -45,3 +48,4 @@ class OpenAISettings(KernelBaseSettings): text_to_image_model_id: str | None = None audio_to_text_model_id: str | None = None text_to_audio_model_id: str | None = None + realtime_model_id: str | None = None diff --git a/python/semantic_kernel/connectors/ai/realtime_client_base.py b/python/semantic_kernel/connectors/ai/realtime_client_base.py new file mode 100644 index 000000000000..3992d116a4f7 --- /dev/null +++ b/python/semantic_kernel/connectors/ai/realtime_client_base.py @@ -0,0 +1,145 @@ +# Copyright (c) Microsoft. All rights reserved. 
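The new file below defines the abstract `RealtimeClientBase` that both connectors above implement. To make the intended call shape concrete, here is a minimal usage sketch, assuming the `OpenAIRealtimeWebsocket` connector and settings classes from this PR are exported as shown; the instructions string and printing are illustrative only:

```python
# Sketch of the RealtimeClientBase usage pattern, under the assumption that the
# package exports below match this PR; the prompt text is illustrative only.
import asyncio

from semantic_kernel.connectors.ai.open_ai import (
    OpenAIRealtimeExecutionSettings,
    OpenAIRealtimeWebsocket,
)
from semantic_kernel.contents import ChatHistory, RealtimeTextEvent


async def main() -> None:
    client = OpenAIRealtimeWebsocket()  # reads OPENAI_REALTIME_MODEL_ID etc. from the env
    settings = OpenAIRealtimeExecutionSettings(instructions="You are a helpful assistant.")
    # __call__ stores the history and settings; __aenter__ then passes them to create_session.
    async with client(chat_history=ChatHistory(), settings=settings):
        async for event in client.receive():
            if isinstance(event, RealtimeTextEvent):  # e.g. transcript deltas
                print(event.text.text, end="")


asyncio.run(main())
```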
+
+import sys
+from abc import ABC, abstractmethod
+from collections.abc import AsyncGenerator, Callable, Coroutine
+from typing import Any, ClassVar
+
+if sys.version_info >= (3, 11):
+    from typing import Self  # pragma: no cover
+else:
+    from typing_extensions import Self  # pragma: no cover
+
+from numpy import ndarray
+from pydantic import PrivateAttr
+
+from semantic_kernel.connectors.ai.function_call_choice_configuration import FunctionCallChoiceConfiguration
+from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceType
+from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
+from semantic_kernel.contents.chat_history import ChatHistory
+from semantic_kernel.contents.realtime_events import RealtimeEvents
+from semantic_kernel.services.ai_service_client_base import AIServiceClientBase
+from semantic_kernel.utils.feature_stage_decorator import experimental
+
+
+@experimental
+class RealtimeClientBase(AIServiceClientBase, ABC):
+    """Base class for a realtime client."""
+
+    SUPPORTS_FUNCTION_CALLING: ClassVar[bool] = False
+    audio_output_callback: Callable[[ndarray], Coroutine[Any, Any, None]] | None = None
+    _chat_history: ChatHistory | None = PrivateAttr(default=None)
+    _settings: PromptExecutionSettings | None = PrivateAttr(default=None)
+    _create_kwargs: dict[str, Any] | None = PrivateAttr(default=None)
+
+    @abstractmethod
+    async def send(self, event: RealtimeEvents) -> None:
+        """Send an event to the service.
+
+        Args:
+            event: The event to send.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def receive(
+        self,
+        audio_output_callback: Callable[[ndarray], Coroutine[Any, Any, None]] | None = None,
+        **kwargs: Any,
+    ) -> AsyncGenerator[RealtimeEvents, None]:
+        """Starts listening for messages from the service and generates events.
+
+        Args:
+            audio_output_callback: The audio output callback, optional.
+                This should be a coroutine that takes an ndarray with audio as input.
+                The goal of this function is to allow you to play the audio with the
+                least amount of latency possible.
+                It is called first in both the websocket and WebRTC implementations.
+                Even when passed, the audio content will still be
+                added to the receiving queue.
+                This can also be set in the constructor.
+                When supplied here it will override any value in the class.
+            kwargs: Additional arguments.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    async def create_session(
+        self,
+        chat_history: "ChatHistory | None" = None,
+        settings: "PromptExecutionSettings | None" = None,
+        **kwargs: Any,
+    ) -> None:
+        """Create a session in the service.
+
+        Args:
+            settings: Prompt execution settings.
+            chat_history: Chat history.
+            kwargs: Additional arguments.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    async def update_session(
+        self,
+        chat_history: "ChatHistory | None" = None,
+        settings: "PromptExecutionSettings | None" = None,
+        **kwargs: Any,
+    ) -> None:
+        """Update a session in the service.
+
+        Can be used with the context manager instead of calling create_session with these same arguments.
+
+        Args:
+            settings: Prompt execution settings.
+            chat_history: Chat history.
+            kwargs: Additional arguments.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    async def close_session(self) -> None:
+        """Close the session in the service."""
+        pass
+
+    def _update_function_choice_settings_callback(
+        self,
+    ) -> Callable[[FunctionCallChoiceConfiguration, "PromptExecutionSettings", FunctionChoiceType], None]:
+        """Return the callback function to update the settings from a function call configuration.
+
+        Override this method to provide a custom callback function to
+        update the settings from a function call configuration.
+        """
+        return lambda configuration, settings, choice_type: None
+
+    async def __aenter__(self) -> "Self":
+        """Enter the context manager.
+
+        Default implementation calls the create session method, passing
+        along anything set through `__call__`.
+        """
+        await self.create_session(self._chat_history, self._settings, **(self._create_kwargs or {}))
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
+        """Exit the context manager."""
+        await self.close_session()
+
+    def __call__(
+        self,
+        chat_history: "ChatHistory | None" = None,
+        settings: "PromptExecutionSettings | None" = None,
+        **kwargs: Any,
+    ) -> Self:
+        """Call the service and set the chat history and settings.
+
+        Args:
+            chat_history: Chat history.
+            settings: Prompt execution settings.
+            kwargs: Additional arguments, can include `kernel` or specific settings for the service.
+                Check the update_session method for the specific service for more details.
+        """
+        self._chat_history = chat_history
+        self._settings = settings
+        self._create_kwargs = kwargs
+        return self
diff --git a/python/semantic_kernel/contents/__init__.py b/python/semantic_kernel/contents/__init__.py
index 5d70a49c1f93..cb69b29ac6c3 100644
--- a/python/semantic_kernel/contents/__init__.py
+++ b/python/semantic_kernel/contents/__init__.py
@@ -11,6 +11,15 @@
 from semantic_kernel.contents.history_reducer.chat_history_summarization_reducer import ChatHistorySummarizationReducer
 from semantic_kernel.contents.history_reducer.chat_history_truncation_reducer import ChatHistoryTruncationReducer
 from semantic_kernel.contents.image_content import ImageContent
+from semantic_kernel.contents.realtime_events import (
+    RealtimeAudioEvent,
+    RealtimeEvent,
+    RealtimeEvents,
+    RealtimeFunctionCallEvent,
+    RealtimeFunctionResultEvent,
+    RealtimeImageEvent,
+    RealtimeTextEvent,
+)
 from semantic_kernel.contents.streaming_annotation_content import StreamingAnnotationContent
 from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent
 from semantic_kernel.contents.streaming_file_reference_content import StreamingFileReferenceContent
@@ -33,6 +42,13 @@
     "FunctionCallContent",
     "FunctionResultContent",
     "ImageContent",
+    "RealtimeAudioEvent",
+    "RealtimeEvent",
+    "RealtimeEvents",
+    "RealtimeFunctionCallEvent",
+    "RealtimeFunctionResultEvent",
+    "RealtimeImageEvent",
+    "RealtimeTextEvent",
     "StreamingAnnotationContent",
     "StreamingChatMessageContent",
     "StreamingFileReferenceContent",
diff --git a/python/semantic_kernel/contents/audio_content.py b/python/semantic_kernel/contents/audio_content.py
index ed32a7d0f595..12bb47af9f64 100644
--- a/python/semantic_kernel/contents/audio_content.py
+++ b/python/semantic_kernel/contents/audio_content.py
@@ -77,7 +77,7 @@ def __init__(
         )
 
     @classmethod
-    def from_audio_file(cls: type[_T], path: str) -> "AudioContent":
+    def from_audio_file(cls: type[_T], path: str) -> _T:
         """Create an instance from an audio file."""
         mime_type = mimetypes.guess_type(path)[0]
         with open(path, "rb") as audio_file:
diff --git a/python/semantic_kernel/contents/binary_content.py
b/python/semantic_kernel/contents/binary_content.py index 1a0a4850569f..aa161f78755f 100644 --- a/python/semantic_kernel/contents/binary_content.py +++ b/python/semantic_kernel/contents/binary_content.py @@ -121,9 +121,16 @@ def data_uri(self, value: str): self.metadata.update(self._data_uri.parameters) @property - def data(self) -> bytes: + def data_string(self) -> str: + """Returns the data as a string, using the data format.""" + if self._data_uri: + return self._data_uri._data_str() + return "" + + @property + def data(self) -> bytes | ndarray: """Get the data.""" - if self._data_uri and self._data_uri.data_array: + if self._data_uri and self._data_uri.data_array is not None: return self._data_uri.data_array.tobytes() if self._data_uri and self._data_uri.data_bytes: return self._data_uri.data_bytes @@ -188,6 +195,7 @@ def write_to_file(self, path: str | FilePath) -> None: self._data_uri.data_array.tofile(path) return with open(path, "wb") as file: + assert isinstance(self.data, bytes) # nosec file.write(self.data) def to_dict(self) -> dict[str, Any]: diff --git a/python/semantic_kernel/contents/chat_message_content.py b/python/semantic_kernel/contents/chat_message_content.py index 4a35e03457a7..829b3f5c6aed 100644 --- a/python/semantic_kernel/contents/chat_message_content.py +++ b/python/semantic_kernel/contents/chat_message_content.py @@ -10,6 +10,7 @@ from pydantic import Field from semantic_kernel.contents.annotation_content import AnnotationContent +from semantic_kernel.contents.audio_content import AudioContent from semantic_kernel.contents.binary_content import BinaryContent from semantic_kernel.contents.const import ( ANNOTATION_CONTENT_TAG, @@ -48,7 +49,7 @@ STREAMING_ANNOTATION_CONTENT_TAG: StreamingAnnotationContent, } -ITEM_TYPES = ( +CMC_ITEM_TYPES = Annotated[ AnnotationContent | BinaryContent | ImageContent @@ -58,7 +59,10 @@ | FileReferenceContent | StreamingAnnotationContent | StreamingFileReferenceContent -) + | AudioContent, + Field(discriminator=DISCRIMINATOR_FIELD), +] + logger = logging.getLogger(__name__) @@ -87,7 +91,7 @@ class ChatMessageContent(KernelContent): tag: ClassVar[str] = CHAT_MESSAGE_CONTENT_TAG role: AuthorRole name: str | None = None - items: list[Annotated[ITEM_TYPES, Field(discriminator=DISCRIMINATOR_FIELD)]] = Field(default_factory=list) + items: list[CMC_ITEM_TYPES] = Field(default_factory=list) encoding: str | None = None finish_reason: FinishReason | None = None @@ -95,7 +99,7 @@ class ChatMessageContent(KernelContent): def __init__( self, role: AuthorRole, - items: list[ITEM_TYPES], + items: list[CMC_ITEM_TYPES], name: str | None = None, inner_content: Any | None = None, encoding: str | None = None, @@ -122,7 +126,7 @@ def __init__( def __init__( # type: ignore self, role: AuthorRole, - items: list[ITEM_TYPES] | None = None, + items: list[CMC_ITEM_TYPES] | None = None, content: str | None = None, inner_content: Any | None = None, name: str | None = None, diff --git a/python/semantic_kernel/contents/function_call_content.py b/python/semantic_kernel/contents/function_call_content.py index 7067311f4c8a..863ba6dfbaf7 100644 --- a/python/semantic_kernel/contents/function_call_content.py +++ b/python/semantic_kernel/contents/function_call_content.py @@ -45,7 +45,6 @@ class FunctionCallContent(KernelContent): def __init__( self, - content_type: Literal[ContentTypes.FUNCTION_CALL_CONTENT] = FUNCTION_CALL_CONTENT_TAG, # type: ignore inner_content: Any | None = None, ai_model_id: str | None = None, id: str | None = None, @@ -60,7 +59,6 @@ def __init__( 
"""Create function call content. Args: - content_type: The content type. inner_content (Any | None): The inner content. ai_model_id (str | None): The id of the AI model. id (str | None): The id of the function call. @@ -83,7 +81,6 @@ def __init__( else: function_name = name args = { - "content_type": content_type, "inner_content": inner_content, "ai_model_id": ai_model_id, "id": id, @@ -124,6 +121,7 @@ def __add__(self, other: "FunctionCallContent | None") -> "FunctionCallContent": index=self.index or other.index, name=self.name or other.name, arguments=self.combine_arguments(self.arguments, other.arguments), + metadata=self.metadata | other.metadata, ) def combine_arguments( diff --git a/python/semantic_kernel/contents/function_result_content.py b/python/semantic_kernel/contents/function_result_content.py index c95460ae8596..b1d36b2bd5f8 100644 --- a/python/semantic_kernel/contents/function_result_content.py +++ b/python/semantic_kernel/contents/function_result_content.py @@ -42,7 +42,6 @@ class FunctionResultContent(KernelContent): def __init__( self, - content_type: Literal[ContentTypes.FUNCTION_RESULT_CONTENT] = FUNCTION_RESULT_CONTENT_TAG, # type: ignore inner_content: Any | None = None, ai_model_id: str | None = None, id: str | None = None, @@ -57,7 +56,6 @@ def __init__( """Create function result content. Args: - content_type: The content type. inner_content (Any | None): The inner content. ai_model_id (str | None): The id of the AI model. id (str | None): The id of the function call that the result relates to. @@ -80,7 +78,6 @@ def __init__( else: function_name = name args = { - "content_type": content_type, "inner_content": inner_content, "ai_model_id": ai_model_id, "id": id, diff --git a/python/semantic_kernel/contents/realtime_events.py b/python/semantic_kernel/contents/realtime_events.py new file mode 100644 index 000000000000..d74287d5ccf4 --- /dev/null +++ b/python/semantic_kernel/contents/realtime_events.py @@ -0,0 +1,67 @@ +# Copyright (c) Microsoft. All rights reserved. 
+ +from typing import Annotated, Any, ClassVar, Literal, Union + +from pydantic import Field + +from semantic_kernel.contents.audio_content import AudioContent +from semantic_kernel.contents.function_call_content import FunctionCallContent +from semantic_kernel.contents.function_result_content import FunctionResultContent +from semantic_kernel.contents.image_content import ImageContent +from semantic_kernel.contents.text_content import TextContent +from semantic_kernel.kernel_pydantic import KernelBaseModel + +RealtimeEvents = Annotated[ + Union[ + "RealtimeEvent", + "RealtimeAudioEvent", + "RealtimeTextEvent", + "RealtimeFunctionCallEvent", + "RealtimeFunctionResultEvent", + "RealtimeImageEvent", + ], + Field(discriminator="event_type"), +] + + +class RealtimeEvent(KernelBaseModel): + """Base class for all service events.""" + + service_event: Any | None = Field(default=None, description="The event content.") + service_type: str | None = None + event_type: ClassVar[Literal["service"]] = "service" + + +class RealtimeAudioEvent(RealtimeEvent): + """Audio event type.""" + + event_type: ClassVar[Literal["audio"]] = "audio" # type: ignore + audio: AudioContent = Field(..., description="Audio content.") + + +class RealtimeTextEvent(RealtimeEvent): + """Text event type.""" + + event_type: ClassVar[Literal["text"]] = "text" # type: ignore + text: TextContent = Field(..., description="Text content.") + + +class RealtimeFunctionCallEvent(RealtimeEvent): + """Function call event type.""" + + event_type: ClassVar[Literal["function_call"]] = "function_call" # type: ignore + function_call: FunctionCallContent = Field(..., description="Function call content.") + + +class RealtimeFunctionResultEvent(RealtimeEvent): + """Function result event type.""" + + event_type: ClassVar[Literal["function_result"]] = "function_result" # type: ignore + function_result: FunctionResultContent = Field(..., description="Function result content.") + + +class RealtimeImageEvent(RealtimeEvent): + """Image event type.""" + + event_type: ClassVar[Literal["image"]] = "image" # type: ignore + image: ImageContent = Field(..., description="Image content.") diff --git a/python/semantic_kernel/contents/streaming_chat_message_content.py b/python/semantic_kernel/contents/streaming_chat_message_content.py index 32b8bd55a3f6..88c31ef31473 100644 --- a/python/semantic_kernel/contents/streaming_chat_message_content.py +++ b/python/semantic_kernel/contents/streaming_chat_message_content.py @@ -1,13 +1,15 @@ # Copyright (c) Microsoft. All rights reserved. 
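With the full event union above in place, a receive loop can dispatch on the concrete event classes. A hedged sketch, assuming a connected client from this PR; error handling is elided:

```python
# Sketch: dispatching on the RealtimeEvents union; `client` is any connected
# RealtimeClientBase implementation from this PR.
from semantic_kernel.contents import (
    RealtimeAudioEvent,
    RealtimeFunctionCallEvent,
    RealtimeTextEvent,
)


async def consume(client) -> None:
    async for event in client.receive():
        match event:
            case RealtimeTextEvent():
                print(event.text.text, end="")
            case RealtimeFunctionCallEvent():
                print(f"\ncalling: {event.function_call.name}")
            case RealtimeAudioEvent():
                pass  # usually played via audio_output_callback instead
            case _:
                pass  # base RealtimeEvent: raw service payload in event.service_event
```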
from enum import Enum -from typing import Any, Union, overload +from typing import Annotated, Any, overload from xml.etree.ElementTree import Element # nosec from pydantic import Field +from semantic_kernel.contents.audio_content import AudioContent from semantic_kernel.contents.binary_content import BinaryContent from semantic_kernel.contents.chat_message_content import ChatMessageContent +from semantic_kernel.contents.const import DISCRIMINATOR_FIELD from semantic_kernel.contents.function_call_content import FunctionCallContent from semantic_kernel.contents.function_result_content import FunctionResultContent from semantic_kernel.contents.image_content import ImageContent @@ -20,14 +22,16 @@ from semantic_kernel.contents.utils.hashing import make_hashable from semantic_kernel.exceptions import ContentAdditionException -ITEM_TYPES = Union[ - BinaryContent, - ImageContent, - StreamingTextContent, - FunctionCallContent, - FunctionResultContent, - StreamingFileReferenceContent, - StreamingAnnotationContent, +STREAMING_CMC_ITEM_TYPES = Annotated[ + BinaryContent + | AudioContent + | ImageContent + | FunctionResultContent + | FunctionCallContent + | StreamingTextContent + | StreamingAnnotationContent + | StreamingFileReferenceContent, + Field(discriminator=DISCRIMINATOR_FIELD), ] @@ -66,7 +70,7 @@ class StreamingChatMessageContent(ChatMessageContent, StreamingContentMixin): def __init__( self, role: AuthorRole, - items: list[ITEM_TYPES], + items: list[STREAMING_CMC_ITEM_TYPES], choice_index: int, name: str | None = None, inner_content: Any | None = None, @@ -96,7 +100,7 @@ def __init__( # type: ignore self, role: AuthorRole, choice_index: int, - items: list[ITEM_TYPES] | None = None, + items: list[STREAMING_CMC_ITEM_TYPES] | None = None, content: str | None = None, inner_content: Any | None = None, name: str | None = None, diff --git a/python/semantic_kernel/kernel.py b/python/semantic_kernel/kernel.py index 03e3a48d75f5..ad71ffccfedb 100644 --- a/python/semantic_kernel/kernel.py +++ b/python/semantic_kernel/kernel.py @@ -321,7 +321,7 @@ async def invoke_function_call( function_call_count: int | None = None, request_index: int | None = None, is_streaming: bool = False, - function_behavior: "FunctionChoiceBehavior" = None, # type: ignore + function_behavior: "FunctionChoiceBehavior | None" = None, ) -> "AutoFunctionInvocationContext | None": """Processes the provided FunctionCallContent and updates the chat history.""" args_cloned = copy(arguments) if arguments else KernelArguments() diff --git a/python/tests/conftest.py b/python/tests/conftest.py index 6cef400a8cf1..60bb1bda97da 100644 --- a/python/tests/conftest.py +++ b/python/tests/conftest.py @@ -220,6 +220,7 @@ def azure_openai_unit_test_env(monkeypatch, exclude_list, override_env_param_dic "AZURE_OPENAI_TEXT_TO_IMAGE_DEPLOYMENT_NAME": "test_text_to_image_deployment", "AZURE_OPENAI_AUDIO_TO_TEXT_DEPLOYMENT_NAME": "test_audio_to_text_deployment", "AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME": "test_text_to_audio_deployment", + "AZURE_OPENAI_REALTIME_DEPLOYMENT_NAME": "test_realtime_deployment", "AZURE_OPENAI_API_KEY": "test_api_key", "AZURE_OPENAI_ENDPOINT": "https://test-endpoint.com", "AZURE_OPENAI_API_VERSION": "2023-03-15-preview", @@ -256,6 +257,7 @@ def openai_unit_test_env(monkeypatch, exclude_list, override_env_param_dict): "OPENAI_TEXT_TO_IMAGE_MODEL_ID": "test_text_to_image_model_id", "OPENAI_AUDIO_TO_TEXT_MODEL_ID": "test_audio_to_text_model_id", "OPENAI_TEXT_TO_AUDIO_MODEL_ID": "test_text_to_audio_model_id", + 
"OPENAI_REALTIME_MODEL_ID": "test_realtime_model_id", } env_vars.update(override_env_param_dict) diff --git a/python/tests/unit/connectors/ai/open_ai/services/test_openai_realtime.py b/python/tests/unit/connectors/ai/open_ai/services/test_openai_realtime.py new file mode 100644 index 000000000000..a341f2bb5c4c --- /dev/null +++ b/python/tests/unit/connectors/ai/open_ai/services/test_openai_realtime.py @@ -0,0 +1,656 @@ +# Copyright (c) Microsoft. All rights reserved. + +import asyncio +from collections.abc import AsyncIterable +from typing import Any +from unittest.mock import AsyncMock, patch + +from aiortc import AudioStreamTrack +from openai.resources.beta.realtime.realtime import AsyncRealtimeConnection, AsyncRealtimeConnectionManager +from openai.types.beta.realtime import ( + ConversationItem, + ConversationItemContent, + ConversationItemCreatedEvent, + ConversationItemCreateEvent, + ConversationItemDeletedEvent, + ConversationItemDeleteEvent, + ConversationItemTruncatedEvent, + ConversationItemTruncateEvent, + ErrorEvent, + InputAudioBufferAppendEvent, + InputAudioBufferClearedEvent, + InputAudioBufferClearEvent, + InputAudioBufferCommitEvent, + InputAudioBufferCommittedEvent, + InputAudioBufferSpeechStartedEvent, + RealtimeResponse, + RealtimeServerEvent, + ResponseAudioDeltaEvent, + ResponseAudioDoneEvent, + ResponseAudioTranscriptDeltaEvent, + ResponseCancelEvent, + ResponseCreatedEvent, + ResponseCreateEvent, + ResponseFunctionCallArgumentsDeltaEvent, + ResponseFunctionCallArgumentsDoneEvent, + ResponseOutputItemAddedEvent, + Session, + SessionCreatedEvent, + SessionUpdatedEvent, + SessionUpdateEvent, +) +from pytest import fixture, mark, param, raises + +from semantic_kernel.connectors.ai.function_call_choice_configuration import FunctionCallChoiceConfiguration +from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceBehavior +from semantic_kernel.connectors.ai.function_choice_type import FunctionChoiceType +from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( + OpenAIRealtimeExecutionSettings, +) +from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime import ( + ListenEvents, + OpenAIRealtimeWebRTC, + OpenAIRealtimeWebsocket, + SendEvents, + _create_openai_realtime_client_event, + update_settings_from_function_call_configuration, +) +from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings +from semantic_kernel.contents.audio_content import AudioContent +from semantic_kernel.contents.chat_history import ChatHistory +from semantic_kernel.contents.chat_message_content import ChatMessageContent +from semantic_kernel.contents.function_call_content import FunctionCallContent +from semantic_kernel.contents.function_result_content import FunctionResultContent +from semantic_kernel.contents.image_content import ImageContent +from semantic_kernel.contents.realtime_events import ( + RealtimeAudioEvent, + RealtimeEvent, + RealtimeFunctionCallEvent, + RealtimeFunctionResultEvent, + RealtimeTextEvent, +) +from semantic_kernel.contents.text_content import TextContent +from semantic_kernel.exceptions.content_exceptions import ContentException +from semantic_kernel.functions import kernel_function +from semantic_kernel.functions.kernel_function_metadata import KernelFunctionMetadata + +events = [ + SessionCreatedEvent(type=ListenEvents.SESSION_CREATED, session=Session(id="session_id"), event_id="1"), + SessionUpdatedEvent(type=ListenEvents.SESSION_UPDATED, 
session=Session(id="session_id"), event_id="2"), + ConversationItemCreatedEvent( + type=ListenEvents.CONVERSATION_ITEM_CREATED, + item=ConversationItem(id="item_id"), + event_id="3", + previous_item_id="2", + ), + ConversationItemDeletedEvent(type=ListenEvents.CONVERSATION_ITEM_DELETED, item_id="item_id", event_id="4"), + ConversationItemTruncatedEvent( + type=ListenEvents.CONVERSATION_ITEM_TRUNCATED, event_id="5", audio_end_ms=0, content_index=0, item_id="item_id" + ), + InputAudioBufferClearedEvent(type=ListenEvents.INPUT_AUDIO_BUFFER_CLEARED, event_id="7"), + InputAudioBufferCommittedEvent( + type=ListenEvents.INPUT_AUDIO_BUFFER_COMMITTED, + event_id="8", + item_id="item_id", + previous_item_id="previous_item_id", + ), + ResponseCreatedEvent(type=ListenEvents.RESPONSE_CREATED, event_id="10", response=RealtimeResponse()), + ResponseFunctionCallArgumentsDoneEvent( + type=ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE, + event_id="11", + arguments="{}", + call_id="call_id", + item_id="item_id", + output_index=0, + response_id="response_id", + ), + ResponseAudioTranscriptDeltaEvent( + type=ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA, + event_id="12", + content_index=0, + delta="text", + item_id="item_id", + output_index=0, + response_id="response_id", + ), + ResponseAudioDoneEvent( + type=ListenEvents.RESPONSE_AUDIO_DONE, + event_id="13", + item_id="item_id", + output_index=0, + response_id="response_id", + content_index=0, + ), + ResponseAudioDeltaEvent( + type=ListenEvents.RESPONSE_AUDIO_DELTA, + event_id="14", + item_id="item_id", + output_index=0, + response_id="response_id", + content_index=0, + delta="audio data", + ), +] + + +async def websocket_stream(**kwargs) -> AsyncIterable[RealtimeServerEvent]: + for event in events: + yield event + await asyncio.sleep(0) + + +@fixture +def audio_track(): + class AudioTrack(AudioStreamTrack): + kind = "audio" + + async def recv(self): + await asyncio.sleep(0) + return + + return AudioTrack() + + +@fixture +def OpenAIWebsocket(openai_unit_test_env): + client = OpenAIRealtimeWebsocket() + client._call_id_to_function_map["call_id"] = "function_name" + return client + + +@fixture +def OpenAIWebRTC(openai_unit_test_env, audio_track): + client = OpenAIRealtimeWebRTC(audio_track=audio_track) + client._call_id_to_function_map["call_id"] = "function_name" + return client + + +def test_update_settings_from_function_call_config(): + config = FunctionCallChoiceConfiguration( + available_functions=[ + KernelFunctionMetadata(name="function_name", description="function_description", is_prompt=False) + ] + ) + + settings = OpenAIRealtimeExecutionSettings() + + update_settings_from_function_call_configuration(config, settings, FunctionChoiceType.AUTO) + + assert len(settings.tools) == 1 + assert settings.tools[0]["type"] == "function" + assert settings.tools[0]["name"] == "function_name" + assert settings.tools[0]["description"] == "function_description" + assert settings.tool_choice == FunctionChoiceType.AUTO.value + + +def test_openai_realtime_websocket(openai_unit_test_env): + realtime_client = OpenAIRealtimeWebsocket() + assert realtime_client is not None + + +def test_openai_realtime_webrtc(openai_unit_test_env, audio_track): + realtime_client = OpenAIRealtimeWebRTC(audio_track=audio_track) + assert realtime_client is not None + + +@mark.parametrize( + ["event_type", "event_kwargs", "expected_event", "expected_exception"], + [ + param( + SendEvents.SESSION_UPDATE, + {"session": {"id": "session_id"}}, + SessionUpdateEvent, + None, + 
id="session_update", + ), + param( + SendEvents.SESSION_UPDATE, + {}, + SessionUpdateEvent, + ContentException, + id="session_update_missing", + ), + param( + SendEvents.INPUT_AUDIO_BUFFER_APPEND, + {"audio": "audio_buffer_as_string"}, + InputAudioBufferAppendEvent, + None, + id="input_audio_buffer_append", + ), + param( + SendEvents.INPUT_AUDIO_BUFFER_APPEND, + {}, + InputAudioBufferAppendEvent, + ContentException, + id="input_audio_buffer_append_missing_audio", + ), + param( + SendEvents.INPUT_AUDIO_BUFFER_COMMIT, + {}, + InputAudioBufferCommitEvent, + None, + id="input_audio_buffer_commit", + ), + param( + SendEvents.INPUT_AUDIO_BUFFER_CLEAR, + {}, + InputAudioBufferClearEvent, + None, + id="input_audio_buffer_Clear", + ), + param( + SendEvents.CONVERSATION_ITEM_CREATE, + { + "event_id": "event_id", + "previous_item_id": "previous_item_id", + "item": {"id": "item_id"}, + }, + ConversationItemCreateEvent, + None, + id="conversation_item_create_event", + ), + param( + SendEvents.CONVERSATION_ITEM_CREATE, + {}, + ConversationItemCreateEvent, + ContentException, + id="conversation_item_create_event_no_item", + ), + param( + SendEvents.CONVERSATION_ITEM_TRUNCATE, + {"audio_end_ms": 1000, "item_id": "item_id"}, + ConversationItemTruncateEvent, + None, + id="conversation_item_truncate", + ), + param( + SendEvents.CONVERSATION_ITEM_DELETE, + {"item_id": "item_id"}, + ConversationItemDeleteEvent, + None, + id="conversation_item_delete", + ), + param( + SendEvents.CONVERSATION_ITEM_DELETE, + {}, + ConversationItemDeleteEvent, + ContentException, + id="conversation_item_delete_fail", + ), + param( + SendEvents.RESPONSE_CREATE, + {"response": {"instructions": "instructions"}}, + ResponseCreateEvent, + None, + id="response_create", + ), + param( + SendEvents.RESPONSE_CANCEL, + {}, + ResponseCancelEvent, + None, + id="response_cancel", + ), + ], +) +def test_create_openai_realtime_event( + event_type: SendEvents, event_kwargs: dict[str, Any], expected_event: Any, expected_exception: Exception | None +): + if expected_exception: + with raises(expected_exception): + _create_openai_realtime_client_event(event_type, **event_kwargs) + else: + event = _create_openai_realtime_client_event(event_type, **event_kwargs) + assert isinstance(event, expected_event) + + +@mark.parametrize( + ["event", "expected_type"], + [ + param( + ResponseAudioTranscriptDeltaEvent( + content_index=0, + delta="text", + item_id="item_id", + event_id="event_id", + output_index=0, + response_id="response_id", + type="response.audio_transcript.delta", + ), + [RealtimeTextEvent], + id="response_audio_transcript_delta", + ), + param( + ResponseOutputItemAddedEvent( + item=ConversationItem(id="item_id"), + event_id="event_id", + output_index=0, + response_id="response_id", + type="response.output_item.added", + ), + [RealtimeEvent], + id="response_output_item_added", + ), + param( + ResponseOutputItemAddedEvent( + item=ConversationItem(id="item_id", type="function_call", call_id="call_id", name="function_to_call"), + event_id="event_id", + output_index=0, + response_id="response_id", + type="response.output_item.added", + ), + [RealtimeEvent], + id="response_output_item_added_function_call", + ), + param( + ResponseFunctionCallArgumentsDeltaEvent( + call_id="call_id", + delta="argument delta", + event_id="event_id", + output_index=0, + item_id="item_id", + response_id="response_id", + type="response.function_call_arguments.delta", + ), + [RealtimeFunctionCallEvent], + id="response_function_call_arguments_delta", + ), + param( + 
ResponseFunctionCallArgumentsDoneEvent( + call_id="call_id", + arguments="argument delta", + event_id="event_id", + output_index=0, + item_id="item_id", + response_id="response_id", + type="response.function_call_arguments.done", + ), + [RealtimeEvent], + id="response_function_call_arguments_done_no_kernel", + ), + param( + ErrorEvent( + error={"code": "error_code", "message": "error_message", "type": "invalid_request_error"}, + event_id="event_id", + type="error", + ), + [RealtimeEvent], + id="error", + ), + param( + SessionCreatedEvent( + session=Session(id="session_id"), + event_id="event_id", + type="session.created", + ), + [RealtimeEvent], + id="session_created", + ), + param( + SessionUpdatedEvent( + session=Session(id="session_id"), + event_id="event_id", + type="session.updated", + ), + [RealtimeEvent], + id="session_updated", + ), + param( + InputAudioBufferSpeechStartedEvent( + audio_start_ms=0, + event_id="event_id", + item_id="item_id", + type="input_audio_buffer.speech_started", + ), + [RealtimeEvent], + id="other", + ), + ], +) +async def test_parse_event(OpenAIWebsocket, event: RealtimeServerEvent, expected_type: list[type]): + iter = 0 + async for result in OpenAIWebsocket._parse_event(event): + assert isinstance(result, expected_type[iter]) + iter += 1 + + +async def test_update_session(OpenAIWebsocket, kernel): + chat_history = ChatHistory( + messages=[ + ChatMessageContent(role="user", content="Hello"), + ChatMessageContent( + role="assistant", + items=[ + FunctionCallContent( + function_name="function_name", plugin_name="plugin", arguments={"arg1": "value"}, id="1" + ) + ], + ), + ChatMessageContent( + role="tool", + items=[ + FunctionResultContent(function_name="function_name", plugin_name="plugin", result="result", id="1") + ], + ), + ChatMessageContent( + role="user", + items=[ + TextContent(text="Hello again"), + ImageContent(uri="https://example.com/image.png"), + ], + ), + ] + ) + settings = OpenAIRealtimeExecutionSettings(instructions="instructions", ai_model_id="gpt-4o-realtime-preview") + with patch.object(OpenAIWebsocket, "_send") as mock_send: + await OpenAIWebsocket.update_session( + chat_history=chat_history, settings=settings, create_response=True, kernel=kernel + ) + mock_send.assert_awaited() + # session update, 4 conversation item create events, response create + # images are not supported, so ignored + assert len(mock_send.await_args_list) == 6 + assert OpenAIWebsocket._current_settings == settings + assert OpenAIWebsocket.kernel == kernel + + +async def test_parse_function_call_arguments_done(OpenAIWebsocket, kernel): + func_result = "result" + event = ResponseFunctionCallArgumentsDoneEvent( + call_id="call_id", + arguments='{"x": "' + func_result + '"}', + event_id="event_id", + output_index=0, + item_id="item_id", + response_id="response_id", + type="response.function_call_arguments.done", + ) + response_events = [RealtimeFunctionCallEvent, RealtimeFunctionResultEvent] + OpenAIWebsocket._current_settings = OpenAIRealtimeExecutionSettings( + instructions="instructions", ai_model_id="gpt-4o-realtime-preview" + ) + OpenAIWebsocket._current_settings.function_choice_behavior = FunctionChoiceBehavior.Auto() + OpenAIWebsocket._call_id_to_function_map["call_id"] = "plugin_name-function_name" + func = kernel_function(name="function_name", description="function_description")(lambda x: x) + kernel.add_function(plugin_name="plugin_name", function_name="function_name", function=func) + OpenAIWebsocket.kernel = kernel + iter = 0 + with 
patch.object(OpenAIWebsocket, "_send") as mock_send: + async for event in OpenAIWebsocket._parse_function_call_arguments_done(event): + assert isinstance(event, response_events[iter]) + iter += 1 + mock_send.assert_awaited() + assert len(mock_send.await_args_list) == 2 + mock_send.assert_any_await( + ConversationItemCreateEvent( + type="conversation.item.create", + item=ConversationItem( + type="function_call_output", + output=func_result, + call_id="call_id", + ), + ) + ) + + +async def test_parse_function_call_arguments_done_fail(OpenAIWebsocket, kernel): + func_result = "result" + event = ResponseFunctionCallArgumentsDoneEvent( + call_id="call_id", + arguments='{"x": "' + func_result + '"}', + event_id="event_id", + output_index=0, + item_id="item_id", + response_id="response_id", + type="response.function_call_arguments.done", + ) + response_events = [RealtimeEvent] + OpenAIWebsocket._current_settings = OpenAIRealtimeExecutionSettings( + instructions="instructions", ai_model_id="gpt-4o-realtime-preview" + ) + OpenAIWebsocket._current_settings.function_choice_behavior = FunctionChoiceBehavior.Auto() + # This function name is invalid + OpenAIWebsocket._call_id_to_function_map["call_id"] = "function_name" + func = kernel_function(name="function_name", description="function_description")(lambda x: x) + kernel.add_function(plugin_name="plugin_name", function_name="function_name", function=func) + OpenAIWebsocket.kernel = kernel + iter = 0 + async for event in OpenAIWebsocket._parse_function_call_arguments_done(event): + assert isinstance(event, response_events[iter]) + iter += 1 + + +async def test_send_audio(OpenAIWebsocket): + audio_event = RealtimeAudioEvent( + audio=AudioContent(data=b"audio data", mime_type="audio/wav"), + ) + with patch.object(OpenAIWebsocket, "_send") as mock_send: + await OpenAIWebsocket.send(audio_event) + mock_send.assert_awaited() + assert len(mock_send.await_args_list) == 1 + mock_send.assert_any_await( + InputAudioBufferAppendEvent( + audio="audio data", + type="input_audio_buffer.append", + ) + ) + + +@mark.parametrize("client", ["OpenAIWebRTC", "OpenAIWebsocket"]) +async def test_send_session_update(client, OpenAIWebRTC, OpenAIWebsocket): + openai_client = OpenAIWebRTC if client == "OpenAIWebRTC" else OpenAIWebsocket + settings = PromptExecutionSettings(ai_model_id="gpt-4o-realtime-preview") + session_event = RealtimeEvent( + service_type=SendEvents.SESSION_UPDATE, + service_event={"settings": settings}, + ) + with patch.object(openai_client, "_send") as mock_send: + await openai_client.send(event=session_event) + mock_send.assert_awaited() + assert len(mock_send.await_args_list) == 1 + mock_send.assert_any_await( + SessionUpdateEvent( + session={"model": "gpt-4o-realtime-preview"}, + type="session.update", + ) + ) + + +@mark.parametrize("client", ["OpenAIWebRTC", "OpenAIWebsocket"]) +async def test_send_conversation_item_create(client, OpenAIWebRTC, OpenAIWebsocket): + openai_client = OpenAIWebRTC if client == "OpenAIWebRTC" else OpenAIWebsocket + event = RealtimeEvent( + service_type=SendEvents.CONVERSATION_ITEM_CREATE, + service_event={ + "item": ChatMessageContent( + role="user", + items=[ + TextContent(text="Hello"), + FunctionCallContent( + function_name="function_name", + plugin_name="plugin", + arguments={"arg1": "value"}, + id="1", + metadata={"call_id": "call_id"}, + ), + FunctionResultContent( + function_name="function_name", + plugin_name="plugin", + result="result", + id="1", + metadata={"call_id": "call_id"}, + ), + ], + ) + }, + ) + + with 
patch.object(openai_client, "_send") as mock_send: + await openai_client.send(event=event) + mock_send.assert_awaited() + assert len(mock_send.await_args_list) == 3 + mock_send.assert_any_await( + ConversationItemCreateEvent( + item=ConversationItem( + content=[ConversationItemContent(text="Hello", type="input_text")], + role="user", + type="message", + ), + type="conversation.item.create", + ) + ) + mock_send.assert_any_await( + ConversationItemCreateEvent( + item=ConversationItem( + arguments='{"arg1": "value"}', + call_id="call_id", + name="plugin-function_name", + type="function_call", + ), + type="conversation.item.create", + ) + ) + mock_send.assert_any_await( + ConversationItemCreateEvent( + item=ConversationItem( + call_id="call_id", + output="result", + type="function_call_output", + ), + type="conversation.item.create", + ) + ) + + +async def test_receive_websocket(OpenAIWebsocket): + connection_mock = AsyncMock(spec=AsyncRealtimeConnection) + connection_mock.recv = websocket_stream + + manager = AsyncMock(spec=AsyncRealtimeConnectionManager) + manager.enter.return_value = connection_mock + + with patch("openai.resources.beta.realtime.realtime.AsyncRealtime.connect") as mock_connect: + mock_connect.return_value = manager + async with OpenAIWebsocket(): + async for msg in OpenAIWebsocket.receive(): + assert isinstance(msg, RealtimeEvent) + + +async def test_receive_webrtc(OpenAIWebRTC): + counter = len(events) + with patch.object(OpenAIRealtimeWebRTC, "create_session"): + recv_task = asyncio.create_task(_stream_to_webrtc(OpenAIWebRTC)) + async with OpenAIWebRTC(): + async for msg in OpenAIWebRTC.receive(): + assert isinstance(msg, RealtimeEvent) + counter -= 1 + if counter == 0: + break + recv_task.cancel() + + +async def _stream_to_webrtc(client: OpenAIRealtimeWebRTC): + async for msg in websocket_stream(): + async for parsed_msg in client._parse_event(msg): + await client._receive_buffer.put(parsed_msg) + await asyncio.sleep(0) diff --git a/python/uv.lock b/python/uv.lock index 4dafbf89c550..696e0d1cf6ef 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -1,19 +1,18 @@ version = 1 -revision = 1 requires-python = ">=3.10" resolution-markers = [ - "python_full_version < '3.11' and sys_platform == 'darwin'", - "python_full_version == '3.11.*' and sys_platform == 'darwin'", - "python_full_version == '3.12.*' and sys_platform == 'darwin'", "python_full_version >= '3.13' and sys_platform == 'darwin'", - "python_full_version < '3.11' and sys_platform == 'linux'", - "python_full_version == '3.11.*' and sys_platform == 'linux'", - "python_full_version == '3.12.*' and sys_platform == 'linux'", + "python_full_version == '3.12.*' and sys_platform == 'darwin'", + "python_full_version == '3.11.*' and sys_platform == 'darwin'", + "python_full_version < '3.11' and sys_platform == 'darwin'", "python_full_version >= '3.13' and sys_platform == 'linux'", - "python_full_version < '3.11' and sys_platform == 'win32'", - "python_full_version == '3.11.*' and sys_platform == 'win32'", - "python_full_version == '3.12.*' and sys_platform == 'win32'", + "python_full_version == '3.12.*' and sys_platform == 'linux'", + "python_full_version == '3.11.*' and sys_platform == 'linux'", + "python_full_version < '3.11' and sys_platform == 'linux'", "python_full_version >= '3.13' and sys_platform == 'win32'", + "python_full_version == '3.12.*' and sys_platform == 'win32'", + "python_full_version == '3.11.*' and sys_platform == 'win32'", + "python_full_version < '3.11' and sys_platform == 'win32'", ] 
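One pattern worth noting from the WebRTC tests above: `_stream_to_webrtc` feeds parsed events into the client's internal `_receive_buffer`, the same `asyncio.Queue` that `receive()` drains. A standalone sketch of that producer/consumer shape; all names here are illustrative stand-ins:

```python
# Standalone sketch of the queue decoupling used by the WebRTC client: aiortc
# callbacks act as producers, receive() as the consumer. Names are illustrative.
import asyncio


async def main() -> None:
    receive_buffer: asyncio.Queue[str] = asyncio.Queue()

    async def on_data(raw: str) -> None:  # stands in for _on_data / _on_track
        await receive_buffer.put(f"parsed:{raw}")

    async def receive(n: int) -> None:  # stands in for the receive() generator
        for _ in range(n):
            print(await receive_buffer.get())

    await on_data("event-1")
    await on_data("event-2")
    await receive(2)


asyncio.run(main())
```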
supported-markers = [ "sys_platform == 'darwin'", @@ -130,6 +129,44 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9c/54/ebb815bc0fe057d8e7a11c086c479e972e827082f39aeebc6019dd4f0862/aiohttp-3.11.13-cp313-cp313-win_amd64.whl", hash = "sha256:5ceb81a4db2decdfa087381b5fc5847aa448244f973e5da232610304e199e7b2", size = 436452 }, ] +[[package]] +name = "aioice" +version = "0.9.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "dnspython", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "ifaddr", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/33/b6/e2b0e48ccb5b04fe29265e93f14a0915f416e359c897ae87d570566c430b/aioice-0.9.0.tar.gz", hash = "sha256:fc2401b1c4b6e19372eaaeaa28fd1bd9cbf6b0e412e48625297c53b495eebd1e", size = 40324 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b6/35/d21e48d3ba25d32aba5d142d54c4491376c659dd74d052a30dd25198007b/aioice-0.9.0-py3-none-any.whl", hash = "sha256:b609597a3a5a611e0004ff04772e16aceb881d51c25c0afc4ceac05d5e50024e", size = 24177 }, +] + +[[package]] +name = "aiortc" +version = "1.10.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aioice", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "av", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "cffi", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "cryptography", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "google-crc32c", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "pyee", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "pylibsrtp", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "pyopenssl", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8a/f8/408e092748521889c9d33dddcef920afd9891cf6db4615ba6b6bfe114ff8/aiortc-1.10.1.tar.gz", hash = "sha256:64926ad86bde20c1a4dacb7c3a164e57b522606b70febe261fada4acf79641b5", size = 1179406 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0a/6b/74547a30d1ddcc81f905ef4ff7fcc2c89b7482cb2045688f2aaa4fa918aa/aiortc-1.10.1-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:3bef536f38394b518aefae9dbf9cdd08f39e4c425f316f9692f0d8dc724810bd", size = 1218457 }, + { url = "https://files.pythonhosted.org/packages/46/92/b4ccf39cd18e366ace2a11dc7d98ed55967b4b325707386b5788149db15e/aiortc-1.10.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:8842c02e38513d9432ef22982572833487bb015f23348fa10a690616dbf55143", size = 898855 }, + { url = "https://files.pythonhosted.org/packages/a4/e9/2676de48b493787d8b03129713e6bb2dfbacca2a565090f2a89cbad71f96/aiortc-1.10.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:954a420de01c0bf6b07a0c58b662029b1c4204ddbd8f5c4162bbdebd43f882b1", size = 1750403 }, + { url = "https://files.pythonhosted.org/packages/c3/9d/ab6d09183cdaf5df060923d9bd5c9ed5fb1802661d9401dba35f3c85a57b/aiortc-1.10.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:e7c0d46fb30307a9d7deb4b7d66f0b0e73b77a7221b063fb6dc78821a5d2aa1e", size = 1867886 }, + { url = "https://files.pythonhosted.org/packages/c2/71/0b5666e6b965dbd9a7f331aa827a6c3ab3eb4d582fefb686a7f4227b7954/aiortc-1.10.1-cp39-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:89582f6923046f79f15d9045f432bc78191eacc95f6bed18714e86ec935188d9", size = 1893709 }, + { url = "https://files.pythonhosted.org/packages/9d/0a/8c0c78fad79ef595a0ed6e2ab413900e6bd0eac65fc5c31c9d8736bff909/aiortc-1.10.1-cp39-abi3-win32.whl", hash = "sha256:d1cbe87f740b33ffaa8e905f21092773e74916be338b64b81c8b79af4c3847eb", size = 923265 }, + { url = "https://files.pythonhosted.org/packages/73/12/a27dd588a4988021da88cb4d338d8ee65ac097afc14e9193ab0be4a48790/aiortc-1.10.1-cp39-abi3-win_amd64.whl", hash = "sha256:c9a5a0b23f8a77540068faec8837fa0a65b0396c20f09116bdb874b75e0b6abe", size = 1009488 }, +] + [[package]] name = "aiosignal" version = "1.3.2" @@ -265,6 +302,44 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e2/27/198414c4c24e886770a03e0bed349582c40e3bfc2ec327034cc5d22c185f/autogen_agentchat-0.2.40-py3-none-any.whl", hash = "sha256:03f11ab89442a3b2408e7e46aa4a66d0be44e6f4447467efbb3ef4e35940176e", size = 382317 }, ] +[[package]] +name = "av" +version = "13.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0c/9d/486d31e76784cc0ad943f420c5e05867263b32b37e2f4b0f7f22fdc1ca3a/av-13.1.0.tar.gz", hash = "sha256:d3da736c55847d8596eb8c26c60e036f193001db3bc5c10da8665622d906c17e", size = 3957908 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/db/d6/1c4a8056a88e006681ac6a3d5ac6082f0a48e52bd565bfd350bfc7c6a37d/av-13.1.0-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:a2af44fae6d16c3a40dd1c85bda41b449be08a2c172d8f44fb63395ccf6e6fb4", size = 24260057 }, + { url = "https://files.pythonhosted.org/packages/23/be/cf89545117172d75a0c48066e6f368403237df623b2e3e93590fdeaef8bf/av-13.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0fea71fe06fd0dfe90a089200eb6468034797f860a321fa2d62e07d619c74749", size = 19475039 }, + { url = "https://files.pythonhosted.org/packages/4b/d0/8e261547f7763f320a4f5f68e139fea5f31814fddfe5503c8372123ebb8b/av-13.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:756997810dcca11811b598d209561cabd2071e5b472b867c295bb3e7022eecde", size = 31289005 }, + { url = "https://files.pythonhosted.org/packages/82/a3/00cacfe80ebbe0664876dd26558fb23b65d034ffd2ce0ddb12f1c746e7cb/av-13.1.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7f432102edaec4ee90087a675acf486bff0c81b47d98b85eb3218afe84575b60", size = 30705668 }, + { url = "https://files.pythonhosted.org/packages/d7/37/faa98dca1a8f6c2e3f4ad3a935037872aff49a679b76918c5258cf5a1c70/av-13.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d568c4d7a36df52c0774d52e6d730148775ead16daed81c10dafc2569b5a38d", size = 33122108 }, + { url = "https://files.pythonhosted.org/packages/25/81/c3a842477b558e23c7249f81cf723764c193636b6523267c2c02321da6b0/av-13.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:aa6f76e7c5e77bc5f99a27ada29f78c64fd4e0d42da2c4d203badc650bc0a686", size = 25775920 }, + { url = "https://files.pythonhosted.org/packages/39/54/c4227080c9700384db90072ace70d89b6a288b3748bd2ec0e32580a49e7f/av-13.1.0-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:867385e6701464a5c95903e24d2e0df1c7e0dbf211ed91d0ce639cd687373e10", size = 24255112 }, + { 
url = "https://files.pythonhosted.org/packages/32/4a/eb9348231655ca99b200b380f4edbceff7358c927a285badcc84b18fb1c9/av-13.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cb7a3f319401a46b0017771268ff4928501e77cf00b1a2aa0721e20b2fd1146e", size = 19467930 }, + { url = "https://files.pythonhosted.org/packages/14/c7/48c80252bdbc3a75a54dd205a7fab8f613914009b9e5416202757208e040/av-13.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ad904f860147bceaca65b0d3174a8153f35c570d465161d210f1879970b15559", size = 32207671 }, + { url = "https://files.pythonhosted.org/packages/f9/66/3332c7fa8c43b65680a94f279ea3e832b5500de3a1392bac6112881e984b/av-13.1.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a906e017b29d0eb80d9ccf7a98d19268122da792dbb68eb741cfebba156e6aed", size = 31520911 }, + { url = "https://files.pythonhosted.org/packages/e5/bb/2e03acb9b27591d97f700a3a6c27cfd1bc53fa148177747eda8a70cca1e9/av-13.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ce894d7847897da7be63277a0875bd93c51327134ac226c67978de014c7979f", size = 34048399 }, + { url = "https://files.pythonhosted.org/packages/85/44/527aa3b65947d42cfe829326026edf0cd1a8c459390076034be275616c36/av-13.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:384bcdb5fc3238a263a5a25cc9efc690859fa4148cc4b07e00fae927178db22a", size = 25779569 }, + { url = "https://files.pythonhosted.org/packages/9b/aa/4bdd8ce59173574fc6e0c282c71ee6f96fca82643d97bf172bc4cb5a5674/av-13.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:261dbc3f4b55f4f8f3375b10b2258fca7f2ab7a6365c01bc65e77a0d5327a195", size = 24268674 }, + { url = "https://files.pythonhosted.org/packages/17/b4/b267dd5bad99eed49ec6731827c6bcb5ab03864bf732a7ebb81e3df79911/av-13.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:83d259ef86b9054eb914bc7c6a7f6092a6d75cb939295e70ee979cfd92a67b99", size = 19475617 }, + { url = "https://files.pythonhosted.org/packages/68/32/4209e51f54d7b54a1feb576d309c671ed1ff437b54fcc4ec68c239199e0a/av-13.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3b4d3ca159eceab97e3c0fb08fe756520fb95508417f76e48198fda2a5b0806", size = 32468873 }, + { url = "https://files.pythonhosted.org/packages/b6/d8/c174da5f06b24f3c9e36f91fd02a7411c39da9ce792c17964260d4be675e/av-13.1.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40e8f757e373b73a2dc4640852a00cce4a4a92ef19b2e642a96d6994cd1fffbf", size = 31818484 }, + { url = "https://files.pythonhosted.org/packages/7f/22/0dd8d1d5cad415772bb707d16aea8b81cf75d340d11d3668eea43468c730/av-13.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d8aaec2c0bfd024359db3821d679009d4e637e1bee0321d20f61c54ed6b20f41", size = 34398652 }, + { url = "https://files.pythonhosted.org/packages/7b/ff/48fa68888b8d5bae36d915556ff18f9e5fdc6b5ff5ae23dc4904c9713168/av-13.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:5ea0deab0e6a739cb742fba2a3983d8102f7516a3cdf3c46669f3cac0ed1f351", size = 25781343 }, + { url = "https://files.pythonhosted.org/packages/82/6e/cdce12e534570df37d3fdcb3a74851d39e9ab79d388f3174dea9785a011a/av-13.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:47642ebaebfe20519b2391bd5b7c38b596efcd052bfd09c8d33058f94ddd0fd6", size = 24229340 }, + { url = "https://files.pythonhosted.org/packages/7c/88/5359aeada9ea509426f2db63b6531833824a1b02470667b103479ddea7ae/av-13.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = 
"sha256:2f079c2daa3ae06557b3f6e9bed4fb9c876e8012175bec645ccd007199a302db", size = 19436445 }, + { url = "https://files.pythonhosted.org/packages/b4/d4/64995e5b800476c86dae4ea1444a0eac44e2c4985fac6401b08401e2df11/av-13.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f0de8252deeeb1887637e88d4d9d18514e5cfe276bdb9e6ca8e9eef89d1667a", size = 32120549 }, + { url = "https://files.pythonhosted.org/packages/68/76/9910694cf87d2d308d851f5b2b5c5b20f7f55411f596e2c158fb13bf84a3/av-13.1.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9ad0024f4def11b0cedfeee478fa6c6fd7ed3955e13387e0f27261fdda6121b4", size = 31495305 }, + { url = "https://files.pythonhosted.org/packages/6a/a8/cd92de947b9595a0eb2c64e6f7ba295aac2687972050ae092173c2f6ea0c/av-13.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb88e2590eaed45233eb117f1dfab1a43ed9a997b2c46da9f08468dd00f14895", size = 34065325 }, + { url = "https://files.pythonhosted.org/packages/9d/d0/9869fcbd66422df2033d4b78a663e3c64aa6fe7eb9189c811d60f69d9871/av-13.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:c927e4fa4f6aeed4340b3e3b16b237d7cb743e5c1a55b92307407590ca4112aa", size = 25754728 }, + { url = "https://files.pythonhosted.org/packages/63/62/09859d91bc2309918d548ac4585973c53e7db27010c432d050f02206f9bd/av-13.1.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:fc5118f78ee712b2c396f345e4c51e60e61e28f1f606adbd4060c4dc44b0b652", size = 23861117 }, + { url = "https://files.pythonhosted.org/packages/c7/43/f186435a0acad3a2bdf271ce51d3af97ac3153a410e54a623529d39a1818/av-13.1.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:81bcbd3545e523e7a350613be1866b515a5ee3fafa1d9d257d7ed02531fc2636", size = 19115008 }, + { url = "https://files.pythonhosted.org/packages/31/eb/a1b4af95a615ba73dfc3cfcb9387e40826c92d7d6d383a1b68685a7ef920/av-13.1.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83b2bc641e8e16bbf058de35f1ba79ebed358ac6fe3cb5a665366294774fdb18", size = 22852637 }, + { url = "https://files.pythonhosted.org/packages/0b/a6/94a34aa672af7fef2939e4a5d6c4c6c28e33da0c623aaa9485d977eeaa95/av-13.1.0-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d956ae3e68fabdc45eb2b986c2e842a31df084d8cfc90336509f07a727a9df62", size = 22703888 }, + { url = "https://files.pythonhosted.org/packages/b9/69/08a72ceed2c8a6e689dea2ef8e941df9469cbe144a600b83d45f821477fc/av-13.1.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ef076fcdf172aafcb21ea3ef7bd68cc9151b050016a8ace13b3dae3d08a4427", size = 24657784 }, + { url = "https://files.pythonhosted.org/packages/b7/8c/c20894580a4341a76c7c74b59c43e26e6652b0fc60f7248f2c1bc5fdbb5e/av-13.1.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:bbf90397b7a466ff2879bd0944d55f796ad76c073fce50304315b83ad00113bd", size = 25562492 }, +] + [[package]] name = "azure-ai-inference" version = "1.0.0b9" @@ -1866,6 +1941,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 }, ] +[[package]] +name = "ifaddr" +version = "0.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e8/ac/fb4c578f4a3256561548cd825646680edcadb9440f3f68add95ade1eb791/ifaddr-0.2.0.tar.gz", hash = 
"sha256:cc0cbfcaabf765d44595825fb96a99bb12c79716b73b44330ea38ee2b0c4aed4", size = 10485 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9c/1f/19ebc343cc71a7ffa78f17018535adc5cbdd87afb31d7c34874680148b32/ifaddr-0.2.0-py3-none-any.whl", hash = "sha256:085e0305cfe6f16ab12d72e2024030f5d52674afad6911bb1eee207177b8a748", size = 12314 }, +] + [[package]] name = "importlib-metadata" version = "8.5.0" @@ -3936,6 +4020,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c1/a9/3b9642025174bbe67e900785fb99c9bfe91ea584b0b7126ff99945c24a0e/pydantic_settings-2.8.0-py3-none-any.whl", hash = "sha256:c782c7dc3fb40e97b238e713c25d26f64314aece2e91abcff592fcac15f71820", size = 30746 }, ] +[[package]] +name = "pyee" +version = "12.1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0a/37/8fb6e653597b2b67ef552ed49b438d5398ba3b85a9453f8ada0fd77d455c/pyee-12.1.1.tar.gz", hash = "sha256:bbc33c09e2ff827f74191e3e5bbc6be7da02f627b7ec30d86f5ce1a6fb2424a3", size = 30915 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/25/68/7e150cba9eeffdeb3c5cecdb6896d70c8edd46ce41c0491e12fb2b2256ff/pyee-12.1.1-py3-none-any.whl", hash = "sha256:18a19c650556bb6b32b406d7f017c8f513aceed1ef7ca618fb65de7bd2d347ef", size = 15527 }, +] + [[package]] name = "pygments" version = "2.19.1" @@ -3959,6 +4055,24 @@ crypto = [ { name = "cryptography", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] +[[package]] +name = "pylibsrtp" +version = "0.11.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cffi", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2e/49/1c5101ecfeda540699e0754dddfc91c401fbf736ebe99d66e59fe3dad2ba/pylibsrtp-0.11.0.tar.gz", hash = "sha256:5a8d19b1448baebde5ae3cedfa51f10e8ada3d9d99f43046ced0ecf1c105b8ec", size = 10786 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b5/95/65650bf56e1080beb5f7c963a0bb11a6ee7599bfd89b33ff4525d2b5824b/pylibsrtp-0.11.0-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:36c6b33347d47c889b7dd465c6ae1f44d7705d00436ca613fd2a8f5dd401b104", size = 1727506 }, + { url = "https://files.pythonhosted.org/packages/4e/b0/f12c489ea8716e74343559abc5d0dfb94d66bcfe1924d64d58424a50f496/pylibsrtp-0.11.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:cf18b80f9513484a70e55136ece6ec80e7d21c03cc69abbb428e4f2745ca3cee", size = 2058008 }, + { url = "https://files.pythonhosted.org/packages/e1/2e/6040cd6da6f82f3aa1763c8c45f7fcfdfe08db5560c73f5e1deb4c36c2bb/pylibsrtp-0.11.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:81bbe0cd777979f7fc45c85f0c619c9cbe709faffbf91675d9dcce560734b353", size = 2566705 }, + { url = "https://files.pythonhosted.org/packages/2b/c9/fd313ac3a23e9c45493131d9fa3463770289e59bb8422c6c6877ab3add40/pylibsrtp-0.11.0-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:78fcdfe63925ea9a5017884c31fe9687b9b8b9f7d9beb7e25e3be47aa6ece495", size = 2168163 }, + { url = "https://files.pythonhosted.org/packages/f9/b3/ae0bac50cc0cca4b8c14de8063ba410ed3edd82c71a2315f284c9be7d679/pylibsrtp-0.11.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:1909f7e781a7675d5c92cbad9e7ed3642e626e2bea5834243e423976e5420ac3", size = 2224343 }, + { url = "https://files.pythonhosted.org/packages/51/c4/650c2cecd5810f84adc89f3a94a28ea02d7ac8eaf3ee718a629c6f8ebf09/pylibsrtp-0.11.0-cp39-abi3-win32.whl", hash = "sha256:15123cecd377248747c95de9305ac314f3bcccdae46022bb4b9d60a552a26a10", size = 1156330 }, + { url = "https://files.pythonhosted.org/packages/fe/78/724307095b95c937e54c48133be3e85779cebea770f7536be555217b31f2/pylibsrtp-0.11.0-cp39-abi3-win_amd64.whl", hash = "sha256:bea2fb98029d19de516538b13c4827b6474d6f85d9ea50fae349e9671b946f7a", size = 1486448 }, +] + [[package]] name = "pymeta3" version = "0.5.1" @@ -4039,6 +4153,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/95/7b/8d0767251e687966cf19a4ad032d597ab135d26af5ecebbdb8895ea92cf0/pymongo-4.11.1-cp313-cp313t-win_amd64.whl", hash = "sha256:3854db4be39cb9e0c34add1fd7e515deab0b4ee30f3cc3978e057746d119ac12", size = 987871 }, ] +[[package]] +name = "pyopenssl" +version = "25.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cryptography", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "typing-extensions", marker = "(python_full_version < '3.13' and sys_platform == 'darwin') or (python_full_version < '3.13' and sys_platform == 'linux') or (python_full_version < '3.13' and sys_platform == 'win32')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9f/26/e25b4a374b4639e0c235527bbe31c0524f26eda701d79456a7e1877f4cc5/pyopenssl-25.0.0.tar.gz", hash = "sha256:cd2cef799efa3936bb08e8ccb9433a575722b9dd986023f1cabc4ae64e9dac16", size = 179573 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ca/d7/eb76863d2060dcbe7c7e6cccfd95ac02ea0b9acc37745a0d99ff6457aefb/pyOpenSSL-25.0.0-py3-none-any.whl", hash = "sha256:424c247065e46e76a37411b9ab1782541c23bb658bf003772c3405fbaa128e90", size = 56453 }, +] + [[package]] name = "pyparsing" version = "3.2.1" @@ -4941,6 +5068,10 @@ qdrant = [ { name = "qdrant-client", version = "1.12.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.13' and sys_platform == 'darwin') or (python_full_version >= '3.13' and sys_platform == 'linux') or (python_full_version >= '3.13' and sys_platform == 'win32')" }, { name = "qdrant-client", version = "1.13.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.13' and sys_platform == 'darwin') or (python_full_version < '3.13' and sys_platform == 'linux') or (python_full_version < '3.13' and sys_platform == 'win32')" }, ] +realtime = [ + { name = "aiortc", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "websockets", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, +] redis = [ { name = "redis", extra = ["hiredis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "redisvl", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, @@ -4973,6 +5104,7 @@ dev = [ [package.metadata] requires-dist = [ { name = "aiohttp", specifier = "~=3.8" }, + { name = "aiortc", marker = "extra == 'realtime'", specifier = ">=1.9.0" }, { name = "anthropic", marker = "extra == 'anthropic'", specifier = "~=0.32" }, { name = "autogen-agentchat", marker = "extra == 'autogen'", specifier = ">=0.2,<0.4" }, { name = "azure-ai-inference", marker = "extra == 'azure'", 
specifier = ">=1.0.0b6" }, @@ -5025,8 +5157,8 @@ requires-dist = [ { name = "types-redis", marker = "extra == 'redis'", specifier = "~=4.6.0.20240425" }, { name = "usearch", marker = "extra == 'usearch'", specifier = "~=2.16" }, { name = "weaviate-client", marker = "extra == 'weaviate'", specifier = ">=4.10,<5.0" }, + { name = "websockets", marker = "extra == 'realtime'", specifier = ">=13,<15" }, ] -provides-extras = ["anthropic", "autogen", "aws", "azure", "chroma", "dapr", "google", "hugging-face", "milvus", "mistralai", "mongo", "notebooks", "ollama", "onnx", "pandas", "pinecone", "postgres", "qdrant", "redis", "usearch", "weaviate"] [package.metadata.requires-dev] dev = [