
Commit cf18596

WIP
1 parent d451023 commit cf18596

File tree: 10 files changed, +171 −80 lines


python/samples/concepts/audio/04-chat_with_realtime_api_simple.py

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ async def main() -> None:
     # create the realtime client and optionally add the audio output function, this is optional
     # you can define the protocol to use, either "websocket" or "webrtc"
     # they will behave the same way, even though the underlying protocol is quite different
-    realtime_client = OpenAIRealtime(protocol="webrtc")
+    realtime_client = OpenAIRealtime("webrtc")
     # Create the settings for the session
     settings = OpenAIRealtimeExecutionSettings(
         instructions="""
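
The simplified sample now passes the protocol as the first positional argument. As a minimal usage sketch built from the lines above (the import path and settings fields are assumptions, not shown in this diff):

import asyncio

from semantic_kernel.connectors.ai.open_ai import (  # assumed import path
    OpenAIRealtime,
    OpenAIRealtimeExecutionSettings,
)


async def demo() -> None:
    # protocol is now positional: "websocket" or "webrtc"
    realtime_client = OpenAIRealtime("webrtc")
    settings = OpenAIRealtimeExecutionSettings(instructions="You are a helpful voice assistant.")
    async with realtime_client:
        await realtime_client.update_session(settings=settings, create_response=True)
        async for event in realtime_client.receive():
            ...  # handle audio/text/function events here


if __name__ == "__main__":
    asyncio.run(demo())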

python/samples/concepts/audio/05-chat_with_realtime_api_complex.py

Lines changed: 14 additions & 8 deletions
@@ -52,35 +52,41 @@ def get_weather(location: str) -> str:
     """Get the weather for a location."""
     weather_conditions = ("sunny", "hot", "cloudy", "raining", "freezing", "snowing")
     weather = weather_conditions[randint(0, len(weather_conditions) - 1)]  # nosec
-    logger.info(f"Getting weather for {location}: {weather}")
+    logger.info(f"@ Getting weather for {location}: {weather}")
     return f"The weather in {location} is {weather}."


 @kernel_function
 def get_date_time() -> str:
     """Get the current date and time."""
-    logger.info("Getting current datetime")
+    logger.info("@ Getting current datetime")
     return f"The current date and time is {datetime.now().isoformat()}."


+@kernel_function
+def goodbye():
+    """When the user is done, say goodbye and then call this function."""
+    logger.info("@ Goodbye has been called!")
+    raise KeyboardInterrupt
+
+
 async def main() -> None:
     print_transcript = True
     # create the Kernel and add a simple function for function calling.
     kernel = Kernel()
-    kernel.add_function(plugin_name="weather", function_name="get_weather", function=get_weather)
-    kernel.add_function(plugin_name="time", function_name="get_date_time", function=get_date_time)
+    kernel.add_functions(plugin_name="helpers", functions=[goodbye, get_weather, get_date_time])

     # create the audio player and audio track
     # both take a device_id parameter, which is the index of the device to use, if None the default device is used
-    audio_player = SKAudioPlayer()
+    audio_player = SKAudioPlayer(sample_rate=24000, frame_duration=100, channels=1)
     audio_track = SKAudioTrack()
     # create the realtime client and optionally add the audio output function, this is optional
     # you can define the protocol to use, either "websocket" or "webrtc"
     # they will behave the same way, even though the underlying protocol is quite different
     realtime_client = OpenAIRealtime(
-        protocol="webrtc",
+        protocol="websocket",
         audio_output_callback=audio_player.client_callback,
-        audio_track=audio_track,
+        # audio_track=audio_track,
     )

     # Create the settings for the session
@@ -110,7 +116,7 @@ async def main() -> None:
     chat_history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.")

     # the context manager calls the create_session method on the client and start listening to the audio stream
-    async with realtime_client, audio_player:
+    async with realtime_client, audio_player, audio_track.stream_to_realtime_client(realtime_client):
         await realtime_client.update_session(
             settings=settings, chat_history=chat_history, kernel=kernel, create_response=True
         )
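
The new goodbye helper shows the pattern this sample uses to end a session: a kernel function that raises KeyboardInterrupt, which unwinds the async with block so the client, player, and track all clean up. A condensed sketch of just that pattern (only the kernel registration mirrors the diff; the realtime wiring is omitted):

import asyncio

from semantic_kernel import Kernel
from semantic_kernel.functions import kernel_function


@kernel_function
def goodbye():
    """When the user is done, say goodbye and then call this function."""
    # Raising KeyboardInterrupt breaks out of the surrounding `async with` /
    # receive loop, so the session context managers close cleanly.
    raise KeyboardInterrupt


kernel = Kernel()
kernel.add_functions(plugin_name="helpers", functions=[goodbye])


async def main() -> None:
    ...  # create the realtime client, update the session with kernel=kernel, and receive events


if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("Session ended.")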

python/semantic_kernel/connectors/ai/open_ai/services/open_ai_realtime.py

Lines changed: 72 additions & 12 deletions
@@ -1,6 +1,6 @@
 # Copyright (c) Microsoft. All rights reserved.

-from collections.abc import Callable, Coroutine, Mapping
+from collections.abc import AsyncGenerator, Callable, Coroutine, Mapping
 from typing import TYPE_CHECKING, Any, ClassVar, Literal, TypeVar

 from numpy import ndarray
@@ -15,42 +15,82 @@
     OpenAIRealtimeWebsocketBase,
 )
 from semantic_kernel.connectors.ai.open_ai.settings.open_ai_settings import OpenAISettings
+from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
+from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase
+from semantic_kernel.contents.chat_history import ChatHistory
+from semantic_kernel.contents.events.realtime_event import RealtimeEvent
 from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError

 if TYPE_CHECKING:
     from aiortc.mediastreams import MediaStreamTrack

+    from semantic_kernel.connectors.ai import PromptExecutionSettings
+    from semantic_kernel.contents import ChatHistory
+
 _T = TypeVar("_T", bound="OpenAIRealtime")


-class OpenAIRealtime(OpenAIConfigBase, OpenAIRealtimeBase):
+__all__ = ["OpenAIRealtime"]
+
+
+class RealtimeClientStub(RealtimeClientBase):
+    """This class makes sure that IDE's don't complain about missing methods in the below superclass."""
+
+    async def send(self, event: Any) -> None:
+        pass
+
+    async def create_session(
+        self,
+        settings: "PromptExecutionSettings | None" = None,
+        chat_history: "ChatHistory | None" = None,
+        **kwargs: Any,
+    ) -> None:
+        pass
+
+    def receive(self, **kwargs: Any) -> AsyncGenerator[RealtimeEvent, None]:
+        pass
+
+    async def update_session(
+        self,
+        settings: "PromptExecutionSettings | None" = None,
+        chat_history: "ChatHistory | None" = None,
+        **kwargs: Any,
+    ) -> None:
+        pass
+
+    async def close_session(self) -> None:
+        pass
+
+
+class OpenAIRealtime(OpenAIRealtimeBase, RealtimeClientStub):
     """OpenAI Realtime service."""

-    def __new__(cls: type["_T"], *args: Any, **kwargs: Any) -> "_T":
+    def __new__(cls: type["_T"], protocol: str, *args: Any, **kwargs: Any) -> "_T":
         """Pick the right subclass, based on protocol."""
         subclass_map = {subcl.protocol: subcl for subcl in cls.__subclasses__()}
-        subclass = subclass_map[kwargs.pop("protocol", "websocket")]
+        subclass = subclass_map[protocol]
         return super(OpenAIRealtime, subclass).__new__(subclass)

     def __init__(
         self,
-        protocol: Literal["websocket", "webrtc"] = "websocket",
+        protocol: Literal["websocket", "webrtc"],
+        *,
         audio_output_callback: Callable[[ndarray], Coroutine[Any, Any, None]] | None = None,
         audio_track: "MediaStreamTrack | None" = None,
         ai_model_id: str | None = None,
         api_key: str | None = None,
         org_id: str | None = None,
         service_id: str | None = None,
         default_headers: Mapping[str, str] | None = None,
-        async_client: AsyncOpenAI | None = None,
+        client: AsyncOpenAI | None = None,
         env_file_path: str | None = None,
         env_file_encoding: str | None = None,
         **kwargs: Any,
     ) -> None:
         """Initialize an OpenAIRealtime service.

         Args:
-            protocol: The protocol to use, can be either "websocket" or "webrtc".
+            protocol: The protocol to use, must be either "websocket" or "webrtc".
             audio_output_callback: The audio output callback, optional.
                 This should be a coroutine, that takes a ndarray with audio as input.
                 The goal of this function is to allow you to play the audio with the
@@ -70,7 +110,7 @@ def __init__(
                 the env vars or .env file value.
             default_headers: The default headers mapping of string keys to
                 string values for HTTP requests. (Optional)
-            async_client (Optional[AsyncOpenAI]): An existing client to use. (Optional)
+            client (Optional[AsyncOpenAI]): An existing client to use. (Optional)
             env_file_path (str | None): Use the environment settings file as a fallback to
                 environment variables. (Optional)
             env_file_encoding (str | None): The encoding of the environment settings file. (Optional)
@@ -88,7 +128,6 @@ def __init__(
             raise ServiceInitializationError("Failed to create OpenAI settings.", ex) from ex
         if not openai_settings.realtime_model_id:
             raise ServiceInitializationError("The OpenAI text model ID is required.")
-        kwargs = {"audio_track": audio_track} if protocol == "webrtc" and audio_track else {}
         super().__init__(
             protocol=protocol,
             audio_output_callback=audio_output_callback,
@@ -98,12 +137,12 @@ def __init__(
             org_id=openai_settings.org_id,
             ai_model_type=OpenAIModelTypes.REALTIME,
             default_headers=default_headers,
-            client=async_client,
+            client=client,
             **kwargs,
         )


-class OpenAIRealtimeWebRTC(OpenAIRealtime, OpenAIRealtimeWebRTCBase):
+class OpenAIRealtimeWebRTC(OpenAIRealtime, OpenAIRealtimeWebRTCBase, OpenAIConfigBase):
     """OpenAI Realtime service using WebRTC protocol.

     This should not be used directly, use OpenAIRealtime instead.
@@ -112,12 +151,33 @@ class OpenAIRealtimeWebRTC(OpenAIRealtime, OpenAIRealtimeWebRTCBase):
     protocol: ClassVar[Literal["webrtc"]] = "webrtc"

+    def __init__(
+        self,
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
+        """Initialize an OpenAIRealtime service using WebRTC protocol."""
+        super().__init__(
+            *args,
+            **kwargs,
+        )
+

-class OpenAIRealtimeWebSocket(OpenAIRealtime, OpenAIRealtimeWebsocketBase):
+class OpenAIRealtimeWebSocket(OpenAIRealtime, OpenAIRealtimeWebsocketBase, OpenAIConfigBase):
     """OpenAI Realtime service using WebSocket protocol.

     This should not be used directly, use OpenAIRealtime instead.
     Set protocol="websocket" to use this class.
     """

     protocol: ClassVar[Literal["websocket"]] = "websocket"
+
+    def __init__(
+        self,
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(
+            *args,
+            **kwargs,
+        )
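
The __new__ override is what lets OpenAIRealtime("webrtc") hand back an OpenAIRealtimeWebRTC instance: it maps each subclass by its protocol class attribute and instantiates the match. A self-contained sketch of the same dispatch pattern, using placeholder classes rather than the Semantic Kernel ones:

from typing import Any, ClassVar, Literal


class Transport:
    """Base class whose __new__ picks the concrete subclass from the protocol argument."""

    protocol: ClassVar[str] = ""

    def __new__(cls, protocol: str, *args: Any, **kwargs: Any) -> "Transport":
        # Map each registered subclass by its `protocol` class attribute.
        subclass_map = {sub.protocol: sub for sub in cls.__subclasses__()}
        subclass = subclass_map[protocol]
        return super().__new__(subclass)

    def __init__(self, protocol: str, *, endpoint: str | None = None) -> None:
        self.endpoint = endpoint


class WebSocketTransport(Transport):
    protocol: ClassVar[Literal["websocket"]] = "websocket"


class WebRTCTransport(Transport):
    protocol: ClassVar[Literal["webrtc"]] = "webrtc"


client = Transport("webrtc")
assert isinstance(client, WebRTCTransport)
print(type(client).__name__)

Making protocol positional (and required) means the dispatch never has to guess a default, which is the behavior change reflected in the updated samples.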

python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_base.py

Lines changed: 26 additions & 23 deletions
@@ -1,10 +1,8 @@
 # Copyright (c) Microsoft. All rights reserved.

-import base64
 import json
 import logging
 import sys
-from abc import abstractmethod
 from collections.abc import AsyncGenerator, Callable, Coroutine
 from typing import TYPE_CHECKING, Any, ClassVar, Literal
@@ -146,11 +144,24 @@ async def update_session(
             )
         if chat_history and len(chat_history) > 0:
             for msg in chat_history.messages:
-                await self.send(
-                    ServiceEvent(event_type="service", service_type=SendEvents.CONVERSATION_ITEM_CREATE, event=msg)
-                )
+                for item in msg.items:
+                    match item:
+                        case TextContent():
+                            await self.send(TextEvent(service_type=SendEvents.CONVERSATION_ITEM_CREATE, text=item))
+                        case FunctionCallContent():
+                            await self.send(
+                                FunctionCallEvent(service_type=SendEvents.CONVERSATION_ITEM_CREATE, function_call=item)
+                            )
+                        case FunctionResultContent():
+                            await self.send(
+                                FunctionResultEvent(
+                                    service_type=SendEvents.CONVERSATION_ITEM_CREATE, function_result=item
+                                )
+                            )
+                        case _:
+                            logger.error("Unsupported item type: %s", item)
         if create_response:
-            await self.send(ServiceEvent(event_type="service", service_type=SendEvents.RESPONSE_CREATE))
+            await self.send(ServiceEvent(service_type=SendEvents.RESPONSE_CREATE))

     @override
     def get_prompt_execution_settings_class(self) -> type["PromptExecutionSettings"]:
@@ -191,24 +202,21 @@ async def _parse_function_call_arguments_done(
             index=event.output_index,
             metadata={"call_id": event.call_id},
         )
-        yield FunctionCallEvent(
-            event_type="function_call",
-            service_type=ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE,
-            function_call=item,
-        )
+        yield FunctionCallEvent(service_type=ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE, function_call=item)
         chat_history = ChatHistory()
         await self.kernel.invoke_function_call(item, chat_history)
         created_output: FunctionResultContent = chat_history.messages[-1].items[0]  # type: ignore
         # This returns the output to the service
-        await self.send(
-            ServiceEvent(event_type="service", service_type=SendEvents.CONVERSATION_ITEM_CREATE, event=created_output)
+        result = FunctionResultEvent(
+            service_type=SendEvents.CONVERSATION_ITEM_CREATE,
+            function_result=created_output,
         )
+        await self.send(result)
         # The model doesn't start responding to the tool call automatically, so triggering it here.
-        await self.send(ServiceEvent(event_type="service", service_type=SendEvents.RESPONSE_CREATE))
+        await self.send(ServiceEvent(service_type=SendEvents.RESPONSE_CREATE))
         # This allows a user to have a full conversation in his code
-        yield FunctionResultEvent(event_type="function_result", function_result=created_output)
+        yield result

-    @abstractmethod
     async def _send(self, event: RealtimeClientEvent) -> None:
         """Send an event to the service."""
         raise NotImplementedError
@@ -217,14 +225,9 @@ async def _send(self, event: RealtimeClientEvent) -> None:
     async def send(self, event: RealtimeEvent, **kwargs: Any) -> None:
         match event.event_type:
             case "audio":
-                if isinstance(event.audio.data, ndarray):
-                    audio_data = base64.b64encode(event.audio.data.tobytes()).decode("utf-8")
-                else:
-                    audio_data = event.audio.data.decode("utf-8")
                 await self._send(
                     _create_realtime_client_event(
-                        event_type=SendEvents.INPUT_AUDIO_BUFFER_APPEND,
-                        audio=audio_data,
+                        event_type=SendEvents.INPUT_AUDIO_BUFFER_APPEND, audio=event.audio.to_base64_bytestring()
                     )
                 )
             case "text":
@@ -286,7 +289,7 @@ async def send(self, event: RealtimeEvent, **kwargs: Any) -> None:
                 await self._send(
                     _create_realtime_client_event(
                         event_type=event.service_type,
-                        **settings.prepare_settings_dict(),
+                        session=settings.prepare_settings_dict(),
                     )
                 )
             case SendEvents.INPUT_AUDIO_BUFFER_APPEND:
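
The reworked update_session replays chat history item by item and uses structural pattern matching to pick the event type per content class. A standalone sketch of that match/case dispatch, with placeholder dataclasses standing in for the real TextContent, FunctionCallContent, and FunctionResultContent classes from semantic_kernel.contents:

from dataclasses import dataclass


@dataclass
class TextContent:
    text: str


@dataclass
class FunctionCallContent:
    name: str
    arguments: str


@dataclass
class FunctionResultContent:
    result: str


def describe(item: object) -> str:
    # Class patterns match on the runtime type; `case _` is the fallback.
    match item:
        case TextContent():
            return f"text: {item.text}"
        case FunctionCallContent():
            return f"function call: {item.name}({item.arguments})"
        case FunctionResultContent():
            return f"function result: {item.result}"
        case _:
            return f"unsupported item type: {type(item).__name__}"


print(describe(TextContent(text="hello")))
print(describe(FunctionCallContent(name="get_weather", arguments='{"location": "Paris"}')))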

python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_webrtc.py

Lines changed: 1 addition & 2 deletions
@@ -161,9 +161,8 @@ async def _on_track(self, track: "MediaStreamTrack") -> None:
            try:
                await self._receive_buffer.put(
                    AudioEvent(
-                        event_type="audio",
                        service_type=ListenEvents.RESPONSE_AUDIO_DELTA,
-                        audio=AudioContent(data=frame.to_ndarray(), data_format="np.int16", inner_content=frame),  # type: ignore
+                        audio=AudioContent(data=frame.to_ndarray(), data_format="np.int16", inner_content=frame),
                    ),
                )
            except Exception as e:

python/semantic_kernel/connectors/ai/open_ai/services/realtime/open_ai_realtime_websocket.py

Lines changed: 6 additions & 15 deletions
@@ -20,9 +20,7 @@
 from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents
 from semantic_kernel.connectors.ai.open_ai.services.realtime.open_ai_realtime_base import OpenAIRealtimeBase
 from semantic_kernel.contents.audio_content import AudioContent
-from semantic_kernel.contents.events.realtime_event import RealtimeEvent
-from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent
-from semantic_kernel.contents.utils.author_role import AuthorRole
+from semantic_kernel.contents.events.realtime_event import AudioEvent, RealtimeEvent
 from semantic_kernel.utils.experimental_decorator import experimental_class

 if TYPE_CHECKING:
@@ -54,18 +52,11 @@ async def receive(
                if self.audio_output_callback:
                    await self.audio_output_callback(np.frombuffer(base64.b64decode(event.delta), dtype=np.int16))
                try:
-                    yield (
-                        event.type,
-                        StreamingChatMessageContent(
-                            role=AuthorRole.ASSISTANT,
-                            items=[
-                                AudioContent(
-                                    data=base64.b64decode(event.delta),
-                                    data_format="base64",
-                                    inner_content=event,
-                                )
-                            ],  # type: ignore
-                            choice_index=event.content_index,
+                    yield AudioEvent(
+                        audio=AudioContent(
+                            data=base64.b64decode(event.delta),
+                            data_format="base64",
+                            inner_content=event,
                        ),
                    )
                except Exception as e:
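
The receive loop above turns each response.audio.delta payload from base64 into 16-bit PCM samples before invoking the audio output callback and yielding an AudioEvent. A minimal round-trip sketch of that decode step with synthetic data, no service connection required:

import base64

import numpy as np

# Pretend this arrived as a `response.audio.delta` payload: base64-encoded PCM16.
pcm = np.array([0, 1000, -1000, 32767, -32768], dtype=np.int16)
delta = base64.b64encode(pcm.tobytes()).decode("ascii")

# The decode used by the websocket receive loop: base64 -> bytes -> int16 samples.
samples = np.frombuffer(base64.b64decode(delta), dtype=np.int16)
assert np.array_equal(samples, pcm)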

python/semantic_kernel/connectors/ai/open_ai/services/realtime/utils.py

Lines changed: 2 additions & 3 deletions
@@ -68,11 +68,10 @@ def kernel_function_metadata_to_function_call_format(
 def _create_realtime_client_event(event_type: SendEvents, **kwargs: Any) -> RealtimeClientEvent:
     match event_type:
         case SendEvents.SESSION_UPDATE:
-            event_kwargs = {"event_id": kwargs.pop("event_id")} if "event_id" in kwargs else {}
             return SessionUpdateEvent(
                 type=event_type,
-                session=Session.model_validate(kwargs),
-                **event_kwargs,
+                session=Session.model_validate(kwargs.pop("session")),
+                **kwargs,
             )
         case SendEvents.INPUT_AUDIO_BUFFER_APPEND:
             return InputAudioBufferAppendEvent(
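
The session-update fix validates the prepared settings dict explicitly under a session key and forwards any remaining kwargs (such as event_id) to the outer event. A small pydantic sketch of that validate-then-wrap pattern, using hypothetical Session and SessionUpdateEvent models in place of the SDK types:

from typing import Any

from pydantic import BaseModel


class Session(BaseModel):
    """Hypothetical stand-in for the SDK's Session model."""

    instructions: str | None = None
    voice: str | None = None


class SessionUpdateEvent(BaseModel):
    """Hypothetical stand-in for the SDK's SessionUpdateEvent model."""

    type: str
    session: Session
    event_id: str | None = None


def create_session_update(event_type: str, **kwargs: Any) -> SessionUpdateEvent:
    # Validate the nested dict explicitly, then forward any remaining kwargs
    # (such as event_id) to the outer event.
    return SessionUpdateEvent(
        type=event_type,
        session=Session.model_validate(kwargs.pop("session")),
        **kwargs,
    )


event = create_session_update(
    "session.update",
    session={"instructions": "Be brief.", "voice": "alloy"},
    event_id="evt_1",
)
print(event.model_dump_json())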
