addressed comments
eavanvalkenburg committed Feb 20, 2025
1 parent eb5b8f6 commit b9c2b54
Showing 17 changed files with 189 additions and 218 deletions.
@@ -5,12 +5,11 @@

from samples.concepts.realtime.utils import AudioPlayerWebsocket, AudioRecorderWebsocket, check_audio_devices
from semantic_kernel.connectors.ai.open_ai import (
AzureRealtime,
AzureRealtimeWebsocket,
ListenEvents,
OpenAIRealtimeExecutionSettings,
TurnDetection,
)
from semantic_kernel.contents.events import RealtimeAudioEvent, RealtimeTextEvent
from semantic_kernel.contents.realtime_events import RealtimeAudioEvent, RealtimeTextEvent

logging.basicConfig(level=logging.WARNING)
utils_log = logging.getLogger("samples.concepts.realtime.utils")
@@ -32,15 +31,15 @@
# you can also play around with the turn_detection settings to get the best results.
# It has device id's set in the AudioRecorderStream and AudioPlayerAsync classes,
# so you may need to adjust these for your system.
# you can check the available devices by uncommenting line below the function
# you can disable the check for available devices by commenting the line below
check_audio_devices()


async def main() -> None:
# create the realtime client and optionally add the audio output function, this is optional
# you can define the protocol to use, either "websocket" or "webrtc"
# they will behave the same way, even though the underlying protocol is quite different
realtime_client = AzureRealtime("websocket")
realtime_client = AzureRealtimeWebsocket()
audio_player = AudioPlayerWebsocket()
audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client)
# Create the settings for the session
@@ -53,10 +52,12 @@ async def main() -> None:
effectively, but you tend to answer with long
flowery prose.
""",
# there are different voices to choose from, since that list is bound to change, it is not checked beforehand,
# see https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-voice
# for more details.
voice="shimmer",
turn_detection=TurnDetection(create_response=True, silence_duration_ms=800, threshold=0.8),
)
# the context manager calls the create_session method on the client and start listening to the audio stream
# the context manager calls the create_session method on the client and starts listening to the audio stream
async with audio_player, audio_recorder, realtime_client(settings=settings, create_response=True):
async for event in realtime_client.receive():
match event:
@@ -65,6 +66,7 @@ async def main() -> None:
case RealtimeAudioEvent():
await audio_player.add_audio(event.audio)
case RealtimeTextEvent():
# the model returns both audio and transcript of the audio, which we will print
print(event.text.text, end="")
case _:
# OpenAI Specific events
@@ -76,7 +78,7 @@ async def main() -> None:

if __name__ == "__main__":
print(
"Instruction: start speaking, when you stop the API should detect you finished and start responding. "
"Instructions: Begin speaking. The API will detect when you stop and automatically generate a response. "
"Press ctrl + c to stop the program."
)
asyncio.run(main())
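
Taken together, the changes to this first sample replace the protocol-string constructor (AzureRealtime("websocket")) with a dedicated AzureRealtimeWebsocket class and move the event types to semantic_kernel.contents.realtime_events. A minimal sketch of the resulting flow, assembled only from lines visible in this diff (the Azure endpoint/deployment/key configuration the client reads from the environment is assumed, not shown here):

import asyncio

from samples.concepts.realtime.utils import AudioPlayerWebsocket, AudioRecorderWebsocket
from semantic_kernel.connectors.ai.open_ai import AzureRealtimeWebsocket, OpenAIRealtimeExecutionSettings
from semantic_kernel.contents.realtime_events import RealtimeAudioEvent, RealtimeTextEvent


async def run() -> None:
    # the protocol is now fixed by the class; no "websocket" argument needed
    client = AzureRealtimeWebsocket()
    player = AudioPlayerWebsocket()
    recorder = AudioRecorderWebsocket(realtime_client=client)
    settings = OpenAIRealtimeExecutionSettings(instructions="You are a helpful assistant.", voice="shimmer")
    # entering the client context creates the session and starts listening
    async with player, recorder, client(settings=settings, create_response=True):
        async for event in client.receive():
            match event:
                case RealtimeAudioEvent():
                    await player.add_audio(event.audio)
                case RealtimeTextEvent():
                    # the model streams both audio and a transcript of it
                    print(event.text.text, end="")


if __name__ == "__main__":
    asyncio.run(run())
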
23 changes: 10 additions & 13 deletions python/samples/concepts/realtime/01b-chat_with_realtime_webrtc.py
@@ -6,18 +6,13 @@
from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC, check_audio_devices
from semantic_kernel.connectors.ai.open_ai import (
ListenEvents,
OpenAIRealtime,
OpenAIRealtimeExecutionSettings,
TurnDetection,
OpenAIRealtimeWebRTC,
)

logging.basicConfig(level=logging.WARNING)
utils_log = logging.getLogger("samples.concepts.realtime.utils")
utils_log.setLevel(logging.INFO)
aiortc_log = logging.getLogger("aiortc")
aiortc_log.setLevel(logging.WARNING)
aioice_log = logging.getLogger("aioice")
aioice_log.setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

@@ -35,7 +30,7 @@
# you can also play around with the turn_detection settings to get the best results.
# It has device id's set in the AudioRecorderStream and AudioPlayerAsync classes,
# so you may need to adjust these for your system.
# you can check the available devices by uncommenting line below the function
# you can disable the check for available devices by commenting the line below
check_audio_devices()


@@ -44,10 +39,9 @@ async def main() -> None:
# you can define the protocol to use, either "websocket" or "webrtc"
# they will behave the same way, even though the underlying protocol is quite different
audio_player = AudioPlayerWebRTC()
realtime_client = OpenAIRealtime(
"webrtc",
audio_output_callback=audio_player.client_callback,
realtime_client = OpenAIRealtimeWebRTC(
audio_track=AudioRecorderWebRTC(),
audio_output_callback=audio_player.client_callback,
)
# Create the settings for the session
settings = OpenAIRealtimeExecutionSettings(
@@ -59,14 +53,17 @@ async def main() -> None:
effectively, but you tend to answer with long
flowery prose.
""",
# there are different voices to choose from, since that list is bound to change, it is not checked beforehand,
# see https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-voice
# for more details.
voice="alloy",
turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8),
)
# the context manager calls the create_session method on the client and start listening to the audio stream
# the context manager calls the create_session method on the client and starts listening to the audio stream
async with audio_player, realtime_client(settings=settings, create_response=True):
async for event in realtime_client.receive():
match event.event_type:
case "text":
# the model returns both audio and transcript of the audio, which we will print
print(event.text.text, end="")
case "service":
# OpenAI Specific events
@@ -78,7 +75,7 @@ async def main() -> None:

if __name__ == "__main__":
print(
"Instruction: start speaking, when you stop the API should detect you finished and start responding. "
"Instructions: Begin speaking. The API will detect when you stop and automatically generate a response. "
"Press ctrl + c to stop the program."
)
asyncio.run(main())
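
The WebRTC variant mirrors this, but the microphone is handed to the client as a WebRTC audio track at construction time, and received audio is pushed through a callback rather than surfaced as audio events; this sample also matches on event.event_type strings instead of event classes. A condensed sketch under the same assumptions as above:

import asyncio

from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC
from semantic_kernel.connectors.ai.open_ai import OpenAIRealtimeExecutionSettings, OpenAIRealtimeWebRTC


async def run() -> None:
    player = AudioPlayerWebRTC()
    client = OpenAIRealtimeWebRTC(
        audio_track=AudioRecorderWebRTC(),  # microphone, supplied up front as the WebRTC track
        audio_output_callback=player.client_callback,  # playback bypasses the event stream
    )
    settings = OpenAIRealtimeExecutionSettings(instructions="You are a helpful assistant.", voice="alloy")
    async with player, client(settings=settings, create_response=True):
        async for event in client.receive():
            if event.event_type == "text":
                print(event.text.text, end="")


if __name__ == "__main__":
    asyncio.run(run())
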
@@ -9,13 +9,13 @@
from semantic_kernel import Kernel
from semantic_kernel.connectors.ai import FunctionChoiceBehavior
from semantic_kernel.connectors.ai.open_ai import (
AzureRealtime,
AzureRealtimeWebsocket,
ListenEvents,
OpenAIRealtimeExecutionSettings,
TurnDetection,
)
from semantic_kernel.contents import ChatHistory
from semantic_kernel.contents.events import RealtimeTextEvent
from semantic_kernel.contents.realtime_events import RealtimeTextEvent
from semantic_kernel.functions import kernel_function

logger = logging.getLogger(__name__)
@@ -67,8 +67,7 @@ async def main() -> None:
# you can define the protocol to use, either "websocket" or "webrtc"
# (at this time Azure only support websockets)
# they will behave the same way, even though the underlying protocol is quite different
realtime_client = AzureRealtime(
protocol="websocket",
realtime_client = AzureRealtimeWebsocket(
audio_output_callback=audio_player.client_callback,
)
audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client)
@@ -88,6 +87,7 @@
# the "input_audio_buffer.commit" and "response.create" event to the realtime api
# to signal the end of the user's turn and start the response.
# manual VAD is not part of this sample
# for more info: https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-turn_detection
settings = OpenAIRealtimeExecutionSettings(
instructions=instructions,
voice="alloy",
Expand All @@ -99,7 +99,7 @@ async def main() -> None:
chat_history.add_user_message("Hi there, who are you?")
chat_history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.")

# the context manager calls the create_session method on the client and start listening to the audio stream
# the context manager calls the create_session method on the client and starts listening to the audio stream
async with (
audio_player,
audio_recorder,
@@ -128,7 +128,7 @@ async def main() -> None:

if __name__ == "__main__":
print(
"Instruction: start speaking, when you stop the API should detect you finished and start responding. "
"Instructions: Begin speaking. The API will detect when you stop and automatically generate a response. "
"Press ctrl + c to stop the program."
)
asyncio.run(main())
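
This third sample layers function calling on top of the websocket client: a kernel with a kernel_function-decorated plugin and a chat history are passed into the session. The keyword arguments of the realtime_client(...) call are partly collapsed in this view, so the sketch below is an assumption about the full call shape; the Helpers plugin and its get_weather function are invented for illustration:

import asyncio

from semantic_kernel import Kernel
from semantic_kernel.connectors.ai import FunctionChoiceBehavior
from semantic_kernel.connectors.ai.open_ai import AzureRealtimeWebsocket, OpenAIRealtimeExecutionSettings
from semantic_kernel.contents import ChatHistory
from semantic_kernel.functions import kernel_function


class Helpers:
    @kernel_function
    def get_weather(self, city: str) -> str:
        """Illustrative plugin; a real one would call a weather service."""
        return f"The weather in {city} is sunny."


async def run() -> None:
    kernel = Kernel()
    kernel.add_plugin(Helpers(), plugin_name="helpers")

    client = AzureRealtimeWebsocket()
    settings = OpenAIRealtimeExecutionSettings(
        instructions="You are Mosscap, a helpful assistant.",
        voice="alloy",
        function_choice_behavior=FunctionChoiceBehavior.Auto(),
    )
    chat_history = ChatHistory()
    chat_history.add_user_message("Hi there, who are you?")

    # assumed call shape: settings, chat history and kernel all go to the session
    async with client(settings=settings, chat_history=chat_history, kernel=kernel, create_response=True):
        async for event in client.receive():
            ...  # handle RealtimeTextEvent and service events as in the samples above


if __name__ == "__main__":
    asyncio.run(run())
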
@@ -10,21 +10,17 @@
from semantic_kernel.connectors.ai import FunctionChoiceBehavior
from semantic_kernel.connectors.ai.open_ai import (
ListenEvents,
OpenAIRealtime,
OpenAIRealtimeExecutionSettings,
OpenAIRealtimeWebRTC,
TurnDetection,
)
from semantic_kernel.contents import ChatHistory
from semantic_kernel.contents.events import RealtimeTextEvent
from semantic_kernel.contents.realtime_events import RealtimeTextEvent
from semantic_kernel.functions import kernel_function

logging.basicConfig(level=logging.WARNING)
utils_log = logging.getLogger("samples.concepts.realtime.utils")
utils_log.setLevel(logging.INFO)
aiortc_log = logging.getLogger("aiortc")
aiortc_log.setLevel(logging.WARNING)
aioice_log = logging.getLogger("aioice")
aioice_log.setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

@@ -42,9 +38,7 @@
# you can also play around with the turn_detection settings to get the best results.
# It has device id's set in the AudioRecorderStream and AudioPlayerAsync classes,
# so you may need to adjust these for your system.
# you can check the available devices by uncommenting line below the function


# you can disable the check for available devices by commenting the line below
check_audio_devices()


@@ -84,8 +78,7 @@ async def main() -> None:
# create the realtime client and optionally add the audio output function, this is optional
# you can define the protocol to use, either "websocket" or "webrtc"
# they will behave the same way, even though the underlying protocol is quite different
realtime_client = OpenAIRealtime(
protocol="webrtc",
realtime_client = OpenAIRealtimeWebRTC(
audio_output_callback=audio_player.client_callback,
audio_track=audio_track,
)
@@ -105,6 +98,7 @@ async def main() -> None:
# the "input_audio_buffer.commit" and "response.create" event to the realtime api
# to signal the end of the user's turn and start the response.
# manual VAD is not part of this sample
# for more info: https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-turn_detection
settings = OpenAIRealtimeExecutionSettings(
instructions=instructions,
voice="alloy",
@@ -116,7 +110,7 @@ async def main() -> None:
chat_history.add_user_message("Hi there, who are you?")
chat_history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.")

# the context manager calls the create_session method on the client and start listening to the audio stream
# the context manager calls the create_session method on the client and starts listening to the audio stream
async with (
audio_player,
realtime_client(
@@ -143,7 +137,7 @@ async def main() -> None:

if __name__ == "__main__":
print(
"Instruction: start speaking, when you stop the API should detect you finished and start responding. "
"Instructions: Begin speaking. The API will detect when you stop and automatically generate a response. "
"Press ctrl + c to stop the program."
)
asyncio.run(main())
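
Both function-calling samples carry a comment explaining the trade-off around server-side voice activity detection (VAD): leave turn_detection set and the service decides when the user's turn ends, or disable it and commit the audio buffer yourself. A small illustration of the two configurations, using only TurnDetection fields that appear in these diffs; passing None to disable detection is an assumption, since manual VAD is explicitly out of scope for these samples:

from semantic_kernel.connectors.ai.open_ai import OpenAIRealtimeExecutionSettings, TurnDetection

# server-side VAD: the service auto-creates a response after 800 ms of silence
server_vad = OpenAIRealtimeExecutionSettings(
    voice="alloy",
    turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8),
)

# manual VAD (not part of these samples): detection disabled, so you send the
# "input_audio_buffer.commit" and "response.create" events yourself to end a turn
manual_vad = OpenAIRealtimeExecutionSettings(voice="alloy", turn_detection=None)
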
37 changes: 27 additions & 10 deletions python/samples/concepts/realtime/utils.py
@@ -12,13 +12,12 @@
from aiortc.mediastreams import MediaStreamError, MediaStreamTrack
from av.audio.frame import AudioFrame
from av.frame import Frame
from pydantic import PrivateAttr
from pydantic import BaseModel, ConfigDict, PrivateAttr
from sounddevice import InputStream, OutputStream

from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase
from semantic_kernel.contents.audio_content import AudioContent
from semantic_kernel.contents.events.realtime_event import RealtimeAudioEvent
from semantic_kernel.kernel_pydantic import KernelBaseModel
from semantic_kernel.contents import AudioContent
from semantic_kernel.contents.realtime_events import RealtimeAudioEvent

logger = logging.getLogger(__name__)

@@ -40,8 +39,13 @@ def check_audio_devices():
# region: Recorders


class AudioRecorderWebRTC(KernelBaseModel, MediaStreamTrack):
"""A simple class that implements the WebRTC MediaStreamTrack for audio from sounddevice."""
class AudioRecorderWebRTC(BaseModel, MediaStreamTrack):
"""A simple class that implements the WebRTC MediaStreamTrack for audio from sounddevice.
This class is meant as a demo sample and is not meant for production use.
"""

model_config = ConfigDict(populate_by_name=True, arbitrary_types_allowed=True, validate_assignment=True)

kind: ClassVar[str] = "audio"
device: str | int | None = None
@@ -156,8 +160,13 @@ async def start_recording(self):
self._is_recording = False


class AudioRecorderWebsocket(KernelBaseModel):
"""A simple class that implements a sounddevice for use with websockets."""
class AudioRecorderWebsocket(BaseModel):
"""A simple class that implements a sounddevice for use with websockets.
This class is meant as a demo sample and is not meant for production use.
"""

model_config = ConfigDict(populate_by_name=True, arbitrary_types_allowed=True, validate_assignment=True)

realtime_client: RealtimeClientBase
device: str | int | None = None
@@ -247,9 +256,11 @@ async def __aexit__(self, exc_type, exc, tb):
# region: Players


class AudioPlayerWebRTC(KernelBaseModel):
class AudioPlayerWebRTC(BaseModel):
"""Simple class that plays audio using sounddevice.
This class is meant as a demo sample and is not meant for production use.
Make sure the device_id is set to the correct device for your system.
The sample rate, channels and frame duration
@@ -265,6 +276,8 @@ class AudioPlayerWebRTC(KernelBaseModel):
"""

model_config = ConfigDict(populate_by_name=True, arbitrary_types_allowed=True, validate_assignment=True)

device: int | None = None
sample_rate: int = SAMPLE_RATE_WEBRTC
channels: int = PLAYER_CHANNELS_WEBRTC
@@ -356,9 +369,11 @@ async def add_audio(self, audio_content: AudioContent) -> None:
logger.error(f"Unknown audio content: {audio_content}")


class AudioPlayerWebsocket(KernelBaseModel):
class AudioPlayerWebsocket(BaseModel):
"""Simple class that plays audio using sounddevice.
This class is meant as a demo sample and is not meant for production use.
Make sure the device_id is set to the correct device for your system.
The sample rate, channels and frame duration
@@ -374,6 +389,8 @@ class AudioPlayerWebsocket(KernelBaseModel):
"""

model_config = ConfigDict(populate_by_name=True, arbitrary_types_allowed=True, validate_assignment=True)

device: int | None = None
sample_rate: int = SAMPLE_RATE
channels: int = PLAYER_CHANNELS
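
The utils.py changes swap the internal KernelBaseModel base class for plain pydantic BaseModel plus an explicit model_config on each helper; arbitrary_types_allowed is the setting that lets pydantic fields hold non-pydantic types such as sounddevice streams. A standalone illustration of that pattern (the DemoPlayer class and its fields are invented for this example):

from pydantic import BaseModel, ConfigDict
from sounddevice import OutputStream


class DemoPlayer(BaseModel):
    # without arbitrary_types_allowed, pydantic rejects OutputStream because
    # it is not a type pydantic knows how to validate
    model_config = ConfigDict(populate_by_name=True, arbitrary_types_allowed=True, validate_assignment=True)

    device: int | None = None
    stream: OutputStream | None = None
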