Python: Introducing Realtime Clients for OpenAI and Azure OpenAI #10127

Merged: 51 commits merged into main from realtime on Mar 4, 2025
Changes from all commits (51 commits)
798d082
draft initial implementation of Realtime API
eavanvalkenburg Jan 8, 2025
7308bcb
major update
eavanvalkenburg Jan 9, 2025
d9ce937
updated note
eavanvalkenburg Jan 9, 2025
20ea3dc
reverted some changes
eavanvalkenburg Jan 9, 2025
eff4765
WIP ADR
eavanvalkenburg Jan 10, 2025
7cde9da
small updates
eavanvalkenburg Jan 10, 2025
fe1be54
webrtc WIP
eavanvalkenburg Jan 14, 2025
6faf93f
updated ADR
eavanvalkenburg Jan 16, 2025
43bc2f3
webrtc working!
eavanvalkenburg Jan 17, 2025
8132882
added dependency
eavanvalkenburg Jan 17, 2025
b5c5443
added dep
eavanvalkenburg Jan 17, 2025
6120ba1
added nd
eavanvalkenburg Jan 17, 2025
ecdb16a
renamed
eavanvalkenburg Jan 17, 2025
8a2a525
changed import
eavanvalkenburg Jan 17, 2025
4bef21a
restructured
eavanvalkenburg Jan 20, 2025
a6d317d
fix import
eavanvalkenburg Jan 20, 2025
b8ff264
small optimization in code
eavanvalkenburg Jan 21, 2025
89f1988
updates to the ADR
eavanvalkenburg Jan 22, 2025
ee1ce02
import improvements
eavanvalkenburg Jan 23, 2025
da370c3
updated code and ADR
eavanvalkenburg Jan 28, 2025
9fb0eb7
wip on redoing the api
eavanvalkenburg Jan 29, 2025
f02e5d8
WIP
eavanvalkenburg Jan 30, 2025
7434c70
removed built-in audio players, split for websocket and rtc
eavanvalkenburg Jan 31, 2025
0911c04
add image event import
eavanvalkenburg Jan 31, 2025
d9e5fe6
naming updates and added call
eavanvalkenburg Feb 12, 2025
43e5fb1
redid realtimeevents
eavanvalkenburg Feb 13, 2025
b4d5482
WIP azure
eavanvalkenburg Feb 13, 2025
ca80839
working azure realtime websockets
eavanvalkenburg Feb 14, 2025
1643196
added call automation sample
eavanvalkenburg Feb 14, 2025
363f9db
added function calling sample with azure
eavanvalkenburg Feb 17, 2025
ba7e312
much improvement to the call automation sample
eavanvalkenburg Feb 17, 2025
b0334f2
remove computed field
eavanvalkenburg Feb 17, 2025
b35661a
cleanup
eavanvalkenburg Feb 17, 2025
acc7e20
small fix in sample
eavanvalkenburg Feb 17, 2025
35ce793
fix for binary content
eavanvalkenburg Feb 17, 2025
7c6ad49
additional experimental markers
eavanvalkenburg Feb 17, 2025
ac856d4
fixed mypy
eavanvalkenburg Feb 18, 2025
6314756
binary content fix
eavanvalkenburg Feb 18, 2025
9d26bfa
addressed comments
eavanvalkenburg Feb 20, 2025
7e4c88f
moved events into a file
eavanvalkenburg Feb 21, 2025
7903a13
updated lock
eavanvalkenburg Feb 21, 2025
f5e24ec
fix typo
eavanvalkenburg Feb 21, 2025
9249877
restructured realtime
eavanvalkenburg Feb 24, 2025
aed7380
first set of tests
eavanvalkenburg Feb 25, 2025
d398695
added more tests
eavanvalkenburg Feb 25, 2025
39739b7
added audio callback to receive
eavanvalkenburg Feb 26, 2025
db01504
added tests and improved samples
eavanvalkenburg Feb 28, 2025
f238ee8
updated names of the samples and added readme
eavanvalkenburg Mar 3, 2025
83dbe68
typo
eavanvalkenburg Mar 3, 2025
ec90ca7
updated sample instructions
eavanvalkenburg Mar 3, 2025
1b4f3ef
Merge branch 'main' into realtime
moonbox3 Mar 4, 2025
3 changes: 2 additions & 1 deletion python/.cspell.json
@@ -47,6 +47,7 @@
"logprobs",
"mistralai",
"mongocluster",
"nd",
"ndarray",
"nopep",
"NOSQL",
@@ -73,4 +74,4 @@
"vertexai",
"Weaviate"
]
}
}
2 changes: 1 addition & 1 deletion python/.vscode/launch.json
@@ -10,7 +10,7 @@
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": true
"justMyCode": false
},
{
"name": "Python FastAPI app with Dapr",
Expand Down
6 changes: 4 additions & 2 deletions python/pyproject.toml
@@ -128,6 +128,10 @@ dapr = [
"dapr-ext-fastapi>=1.14.0",
"flask-dapr>=1.14.0"
]
realtime = [
"websockets >= 13, < 15",
"aiortc>=1.9.0",
]

[tool.uv]
prerelease = "if-necessary-or-explicit"
@@ -225,5 +229,3 @@ name = "semantic_kernel"
[build-system]
requires = ["flit-core >= 3.9,<4.0"]
build-backend = "flit_core.buildapi"
-
-
50 changes: 50 additions & 0 deletions python/samples/concepts/realtime/README.md
@@ -0,0 +1,50 @@
# Realtime Multi-modal API Samples

These samples are more complex than most because of the nature of these APIs. They are designed to run in real time and require a microphone and speaker connected to your computer.

To run these samples, you will need the following setup:

- Environment variables for OpenAI (websocket or WebRTC), with your key and OPENAI_REALTIME_MODEL_ID set.
- Environment variables for Azure (websocket only), with your endpoint, optionally a key, and AZURE_OPENAI_REALTIME_DEPLOYMENT_NAME set. The API version needs to be at least `2024-10-01-preview`. A sample `.env` sketch follows this list.
- To run the samples with a simple class that handles the incoming and outgoing sound, install the following packages in your environment:
  - semantic-kernel[realtime]
  - pyaudio
  - sounddevice
  - pydub

  e.g. `pip install pyaudio sounddevice pydub semantic_kernel[realtime]`
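
For reference, a minimal `.env` sketch. The key and endpoint variable names are assumptions based on the usual Semantic Kernel settings, and the model ID is only an example:

```
OPENAI_API_KEY=sk-...
OPENAI_REALTIME_MODEL_ID=gpt-4o-realtime-preview

AZURE_OPENAI_ENDPOINT=https://<your-resource>.openai.azure.com
AZURE_OPENAI_API_KEY=...   # optional when using keyless auth
AZURE_OPENAI_REALTIME_DEPLOYMENT_NAME=<your-deployment>
AZURE_OPENAI_API_VERSION=2024-10-01-preview
```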

The samples all run as Python scripts that can be started directly or through your IDE.

All demos have similar output: the instructions are printed, and each new *response item* from the API is placed on a new `Mosscap (transcript):` line. The nature of these APIs is such that the transcript arrives before the spoken audio, so if you interrupt the audio, the transcript will not match the audio.

The realtime APIs work by the server sending events to you and you sending events back to the server; this is fully asynchronous. The samples show how you can listen to the events sent by the server; some are handled by the code in the samples, others are not. For instance, one could add a clause to the match statement in the receive loop that logs the usage data included in the `response.done` event, as sketched below.
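
A minimal sketch of such a clause, assuming `ListenEvents` also exposes a `RESPONSE_DONE` member for the `response.done` event (the samples only handle `RESPONSE_CREATED` and `ERROR`) and that the usage data sits on the raw service event:

```python
async for event in realtime_client.receive(audio_output_callback=audio_player.client_callback):
    match event.service_type:
        case ListenEvents.RESPONSE_DONE:
            # response.done carries the usage statistics for the completed response
            usage = getattr(event.service_event.response, "usage", None)
            logger.info(f"Usage: {usage}")
        case ListenEvents.ERROR:
            logger.error(event.service_event)
```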

For more info on the events, see our documentation, as well as the documentation of [OpenAI](https://platform.openai.com/docs/guides/realtime) and [Azure](https://learn.microsoft.com/en-us/azure/ai-services/openai/realtime-audio-quickstart?tabs=keyless%2Cmacos&pivots=programming-language-python).

## Simple chat samples

### [Simple chat with realtime websocket](./simple_realtime_chat_websocket.py)

This sample uses the websocket API with Azure OpenAI to run a simple voice-based interaction. If you want to use this sample with OpenAI, just change `AzureRealtimeWebsocket` into `OpenAIRealtimeWebsocket`, as shown below.
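
A minimal sketch of that swap, assuming `OpenAIRealtimeWebsocket` is exported alongside `AzureRealtimeWebsocket`:

```python
from semantic_kernel.connectors.ai.open_ai import OpenAIRealtimeWebsocket

# instead of: realtime_client = AzureRealtimeWebsocket()
realtime_client = OpenAIRealtimeWebsocket()  # uses the OpenAI key and OPENAI_REALTIME_MODEL_ID
```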

### [Simple chat with realtime WebRTC](./simple_realtime_chat_webrtc.py)

This sample uses the WebRTC API with OpenAI to run a simple voice-based interaction. Because of the way the WebRTC protocol works, it needs a different player and recorder than the websocket version.

## Function calling samples

The following two samples use function calling with these functions:

- get_weather: This function returns the weather for a given city; it is randomly generated and not based on any real data.
- get_date_time: This function returns the current date and time.
- goodbye: This function ends the conversation.

A line is logged whenever one of these functions is called.

### [Chat with function calling Websocket](./realtime_chat_with_function_calling_websocket.py)

This sample uses the websocket API with Azure OpenAI to run the interaction with the voice model, now with function calling.

### [Chat with function calling WebRTC](./realtime_chat_with_function_calling_webrtc.py)

This sample uses the WebRTC API with OpenAI to run the interaction with the voice model, now with function calling.
143 changes: 143 additions & 0 deletions python/samples/concepts/realtime/realtime_chat_with_function_calling_webrtc.py
@@ -0,0 +1,143 @@
# Copyright (c) Microsoft. All rights reserved.

import asyncio
import logging
from datetime import datetime
from random import randint

from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC, check_audio_devices
from semantic_kernel import Kernel
from semantic_kernel.connectors.ai import FunctionChoiceBehavior
from semantic_kernel.connectors.ai.open_ai import (
    ListenEvents,
    OpenAIRealtimeExecutionSettings,
    OpenAIRealtimeWebRTC,
    TurnDetection,
)
from semantic_kernel.contents import ChatHistory
from semantic_kernel.contents.realtime_events import RealtimeTextEvent
from semantic_kernel.functions import kernel_function

logging.basicConfig(level=logging.WARNING)
utils_log = logging.getLogger("samples.concepts.realtime.utils")
utils_log.setLevel(logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

"""
This simple sample demonstrates how to use the OpenAI Realtime API to create
a chat bot that can listen and respond directly through audio.
It requires installing:
- semantic-kernel[realtime]
- pyaudio
- sounddevice
- pydub
e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime]

For more details of the exact setup, see the README.md in the realtime folder.
"""

# The characteristics of your speaker and microphone are a big factor in a smooth conversation,
# so you may need to try out different devices for each.
# You can also play around with the turn_detection settings to get the best results.
# Device IDs are set on the AudioRecorderWebRTC and AudioPlayerWebRTC classes,
# so you may need to adjust these for your system.
# You can disable the check for available devices by commenting out the line below.
check_audio_devices()


@kernel_function
def get_weather(location: str) -> str:
    """Get the weather for a location."""
    weather_conditions = ("sunny", "hot", "cloudy", "raining", "freezing", "snowing")
    weather = weather_conditions[randint(0, len(weather_conditions) - 1)]  # nosec
    logger.info(f"@ Getting weather for {location}: {weather}")
    return f"The weather in {location} is {weather}."


@kernel_function
def get_date_time() -> str:
    """Get the current date and time."""
    logger.info("@ Getting current datetime")
    return f"The current date and time is {datetime.now().isoformat()}."


@kernel_function
def goodbye():
    """When the user is done, say goodbye and then call this function."""
    logger.info("@ Goodbye has been called!")
    raise KeyboardInterrupt


async def main() -> None:
    print_transcript = True
    # create the Kernel and add some simple functions for function calling
    kernel = Kernel()
    kernel.add_functions(plugin_name="helpers", functions=[goodbye, get_weather, get_date_time])

    # create the audio player and audio track
    # both take a device_id parameter, which is the index of the device to use; if None, the default device is used
    audio_player = AudioPlayerWebRTC()
    # create the realtime client with the audio track;
    # the audio output callback can optionally be added here as well, or passed to the receive method
    realtime_client = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC())

    # Create the settings for the session.
    # The realtime API does not use a system message, but takes instructions as a parameter for a session.
    # Another important setting to tune is the server_vad turn detection:
    # if this is turned off (by setting turn_detection=None), you will have to send
    # the "input_audio_buffer.commit" and "response.create" events to the realtime API
    # to signal the end of the user's turn and start the response.
    # Manual VAD is not part of this sample.
    # for more info: https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-turn_detection
    settings = OpenAIRealtimeExecutionSettings(
        instructions="""
    You are a chat bot. Your name is Mosscap and
    you have one goal: figure out what people need.
    Your full name, should you need to know it, is
    Splendid Speckled Mosscap. You communicate
    effectively, but you tend to answer with long
    flowery prose.
    """,
        voice="alloy",
        turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8),
        function_choice_behavior=FunctionChoiceBehavior.Auto(),
    )
    # and we can add a chat history to the conversation after starting it
    chat_history = ChatHistory()
    chat_history.add_user_message("Hi there, who are you?")
    chat_history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.")

    # the context manager calls the create_session method on the client and starts listening to the audio stream
    async with (
        audio_player,
        realtime_client(
            settings=settings,
            chat_history=chat_history,
            kernel=kernel,
            create_response=True,
        ),
    ):
        async for event in realtime_client.receive(audio_output_callback=audio_player.client_callback):
            match event:
                case RealtimeTextEvent():
                    if print_transcript:
                        print(event.text.text, end="")
                case _:
                    # OpenAI-specific events
                    match event.service_type:
                        case ListenEvents.RESPONSE_CREATED:
                            if print_transcript:
                                print("\nMosscap (transcript): ", end="")
                        case ListenEvents.ERROR:
                            logger.error(event.service_event)


if __name__ == "__main__":
    print(
        "Instructions: The model will start speaking immediately; "
        "this can be turned off by removing `create_response=True` above. "
        "The model will detect when you stop and automatically generate a response. "
        "Press ctrl + c to stop the program."
    )
    asyncio.run(main())
141 changes: 141 additions & 0 deletions python/samples/concepts/realtime/realtime_chat_with_function_calling_websocket.py
@@ -0,0 +1,141 @@
# Copyright (c) Microsoft. All rights reserved.

import asyncio
import logging
from datetime import datetime
from random import randint

from samples.concepts.realtime.utils import AudioPlayerWebsocket, AudioRecorderWebsocket
from semantic_kernel import Kernel
from semantic_kernel.connectors.ai import FunctionChoiceBehavior
from semantic_kernel.connectors.ai.open_ai import (
    AzureRealtimeExecutionSettings,
    AzureRealtimeWebsocket,
    ListenEvents,
    TurnDetection,
)
from semantic_kernel.contents import ChatHistory
from semantic_kernel.contents.realtime_events import RealtimeTextEvent
from semantic_kernel.functions import kernel_function

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

"""
This simple sample demonstrates how to use the OpenAI Realtime API to create
a chat bot that can listen and respond directly through audio.
It requires installing:
- semantic-kernel[realtime]
- pyaudio
- sounddevice
- pydub
e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime]

For more details of the exact setup, see the README.md in the realtime folder.
"""


@kernel_function
def get_weather(location: str) -> str:
    """Get the weather for a location."""
    weather_conditions = ("sunny", "hot", "cloudy", "raining", "freezing", "snowing")
    weather = weather_conditions[randint(0, len(weather_conditions) - 1)]  # nosec
    logger.info(f"@ Getting weather for {location}: {weather}")
    return f"The weather in {location} is {weather}."


@kernel_function
def get_date_time() -> str:
    """Get the current date and time."""
    logger.info("@ Getting current datetime")
    return f"The current date and time is {datetime.now().isoformat()}."


@kernel_function
def goodbye():
    """When the user is done, say goodbye and then call this function."""
    logger.info("@ Goodbye has been called!")
    raise KeyboardInterrupt


async def main() -> None:
    print_transcript = True
    # create the Kernel and add some simple functions for function calling
    kernel = Kernel()
    kernel.add_functions(plugin_name="helpers", functions=[goodbye, get_weather, get_date_time])

    # create the realtime client, in this case the Azure websocket client;
    # there are also OpenAI websocket and WebRTC clients
    # (see realtime_chat_with_function_calling_webrtc.py for an example of the WebRTC client)
    realtime_client = AzureRealtimeWebsocket()
    # create the audio player and audio recorder
    # both take a device_id parameter, which is the index of the device to use; if None, the default device is used
    audio_player = AudioPlayerWebsocket()
    audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client)

    # Create the settings for the session.
    # The realtime API does not use a system message, but takes instructions as a parameter for a session.
    # Another important setting to tune is the server_vad turn detection:
    # if this is turned off (by setting turn_detection=None), you will have to send
    # the "input_audio_buffer.commit" and "response.create" events to the realtime API
    # to signal the end of the user's turn and start the response.
    # Manual VAD is not part of this sample.
    # for more info: https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-turn_detection
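    # NOTE: a hypothetical sketch of manual turn handling, not exercised by this sample;
    # the exact send API and event class are assumptions. With turn_detection=None you
    # would, after the user stops speaking, send something like:
    #   await realtime_client.send(RealtimeEvent(service_type="input_audio_buffer.commit"))
    #   await realtime_client.send(RealtimeEvent(service_type="response.create"))
    # to close the user's turn and ask the model to respond.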
    settings = AzureRealtimeExecutionSettings(
        instructions="""
    You are a chat bot. Your name is Mosscap and
    you have one goal: figure out what people need.
    Your full name, should you need to know it, is
    Splendid Speckled Mosscap. You communicate
    effectively, but you tend to answer with long
    flowery prose.
    """,
        # see https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-voice for the full list of voices  # noqa: E501
        voice="alloy",
        turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8),
        function_choice_behavior=FunctionChoiceBehavior.Auto(),
    )
    # and we can add a chat history to seed the conversation
    chat_history = ChatHistory()
    chat_history.add_user_message("Hi there, I'm based in Amsterdam.")
    chat_history.add_assistant_message(
        "I am Mosscap, a chat bot. I'm trying to figure out what people need, "
        "I can tell you what the weather is or the time."
    )

    # the context manager calls the create_session method on the client and starts listening to the audio stream
    async with (
        audio_player,
        audio_recorder,
        realtime_client(
            settings=settings,
            chat_history=chat_history,
            kernel=kernel,
            create_response=True,
        ),
    ):
        # the audio_output_callback can be added here or in the client constructor;
        # using it here gives the smoothest experience
        async for event in realtime_client.receive(audio_output_callback=audio_player.client_callback):
            match event:
                case RealtimeTextEvent():
                    if print_transcript:
                        print(event.text.text, end="")
                case _:
                    # OpenAI-specific events
                    match event.service_type:
                        case ListenEvents.RESPONSE_CREATED:
                            if print_transcript:
                                print("\nMosscap (transcript): ", end="")
                        case ListenEvents.ERROR:
                            print(event.service_event)
                            logger.error(event.service_event)


if __name__ == "__main__":
    print(
        "Instructions: The model will start speaking immediately; "
        "this can be turned off by removing `create_response=True` above. "
        "The model will detect when you stop and automatically generate a response. "
        "Press ctrl + c to stop the program."
    )
    asyncio.run(main())