addressed comments
eavanvalkenburg committed Feb 20, 2025
1 parent eb5b8f6 commit b9c2b54
Showing 17 changed files with 189 additions and 218 deletions.
@@ -5,12 +5,11 @@

from samples.concepts.realtime.utils import AudioPlayerWebsocket, AudioRecorderWebsocket, check_audio_devices
from semantic_kernel.connectors.ai.open_ai import (
AzureRealtime,
AzureRealtimeWebsocket,
ListenEvents,
OpenAIRealtimeExecutionSettings,
TurnDetection,
)
from semantic_kernel.contents.events import RealtimeAudioEvent, RealtimeTextEvent
from semantic_kernel.contents.realtime_events import RealtimeAudioEvent, RealtimeTextEvent

logging.basicConfig(level=logging.WARNING)
utils_log = logging.getLogger("samples.concepts.realtime.utils")
@@ -32,15 +31,15 @@
# you can also play around with the turn_detection settings to get the best results.
# It has device id's set in the AudioRecorderStream and AudioPlayerAsync classes,
# so you may need to adjust these for your system.
# you can check the available devices by uncommenting line below the function
# you can disable the check for available devices by commenting the line below
check_audio_devices()


async def main() -> None:
# create the realtime client and optionally add the audio output function, this is optional
# you can define the protocol to use, either "websocket" or "webrtc"
# they will behave the same way, even though the underlying protocol is quite different
realtime_client = AzureRealtime("websocket")
realtime_client = AzureRealtimeWebsocket()
audio_player = AudioPlayerWebsocket()
audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client)
# Create the settings for the session
@@ -53,10 +52,12 @@ async def main() -> None:
effectively, but you tend to answer with long
flowery prose.
""",
# there are different voices to choose from, since that list is bound to change, it is not checked beforehand,
# see https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-voice
# for more details.
voice="shimmer",
turn_detection=TurnDetection(create_response=True, silence_duration_ms=800, threshold=0.8),
)
# the context manager calls the create_session method on the client and start listening to the audio stream
# the context manager calls the create_session method on the client and starts listening to the audio stream
async with audio_player, audio_recorder, realtime_client(settings=settings, create_response=True):
async for event in realtime_client.receive():
match event:
@@ -65,6 +66,7 @@ async def main() -> None:
case RealtimeAudioEvent():
await audio_player.add_audio(event.audio)
case RealtimeTextEvent():
# the model returns both audio and transcript of the audio, which we will print
print(event.text.text, end="")
case _:
# OpenAI Specific events
@@ -76,7 +78,7 @@ async def main() -> None:

if __name__ == "__main__":
print(
"Instruction: start speaking, when you stop the API should detect you finished and start responding. "
"Instructions: Begin speaking. The API will detect when you stop and automatically generate a response. "
"Press ctrl + c to stop the program."
)
asyncio.run(main())
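
Taken together, the changes to this first sample replace the protocol-string constructor (AzureRealtime("websocket")) with a dedicated AzureRealtimeWebsocket class and move the event types to semantic_kernel.contents.realtime_events. A minimal sketch of the resulting flow, assembled only from lines visible in this diff (the Azure endpoint/deployment/key configuration the client reads from the environment is assumed, not shown here):

import asyncio

from samples.concepts.realtime.utils import AudioPlayerWebsocket, AudioRecorderWebsocket
from semantic_kernel.connectors.ai.open_ai import AzureRealtimeWebsocket, OpenAIRealtimeExecutionSettings
from semantic_kernel.contents.realtime_events import RealtimeAudioEvent, RealtimeTextEvent


async def run() -> None:
    # the protocol is now fixed by the class; no "websocket" argument needed
    client = AzureRealtimeWebsocket()
    player = AudioPlayerWebsocket()
    recorder = AudioRecorderWebsocket(realtime_client=client)
    settings = OpenAIRealtimeExecutionSettings(instructions="You are a helpful assistant.", voice="shimmer")
    # entering the client context creates the session and starts listening
    async with player, recorder, client(settings=settings, create_response=True):
        async for event in client.receive():
            match event:
                case RealtimeAudioEvent():
                    await player.add_audio(event.audio)
                case RealtimeTextEvent():
                    # the model streams both audio and a transcript of it
                    print(event.text.text, end="")


if __name__ == "__main__":
    asyncio.run(run())
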
23 changes: 10 additions & 13 deletions python/samples/concepts/realtime/01b-chat_with_realtime_webrtc.py
@@ -6,18 +6,13 @@
from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC, check_audio_devices
from semantic_kernel.connectors.ai.open_ai import (
ListenEvents,
OpenAIRealtime,
OpenAIRealtimeExecutionSettings,
TurnDetection,
OpenAIRealtimeWebRTC,
)

logging.basicConfig(level=logging.WARNING)
utils_log = logging.getLogger("samples.concepts.realtime.utils")
utils_log.setLevel(logging.INFO)
aiortc_log = logging.getLogger("aiortc")
aiortc_log.setLevel(logging.WARNING)
aioice_log = logging.getLogger("aioice")
aioice_log.setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

@@ -35,7 +30,7 @@
# you can also play around with the turn_detection settings to get the best results.
# It has device id's set in the AudioRecorderStream and AudioPlayerAsync classes,
# so you may need to adjust these for your system.
# you can check the available devices by uncommenting line below the function
# you can disable the check for available devices by commenting the line below
check_audio_devices()


@@ -44,10 +39,9 @@ async def main() -> None:
# you can define the protocol to use, either "websocket" or "webrtc"
# they will behave the same way, even though the underlying protocol is quite different
audio_player = AudioPlayerWebRTC()
realtime_client = OpenAIRealtime(
"webrtc",
audio_output_callback=audio_player.client_callback,
realtime_client = OpenAIRealtimeWebRTC(
audio_track=AudioRecorderWebRTC(),
audio_output_callback=audio_player.client_callback,
)
# Create the settings for the session
settings = OpenAIRealtimeExecutionSettings(
@@ -59,14 +53,17 @@ async def main() -> None:
effectively, but you tend to answer with long
flowery prose.
""",
# there are different voices to choose from, since that list is bound to change, it is not checked beforehand,
# see https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-voice
# for more details.
voice="alloy",
turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8),
)
# the context manager calls the create_session method on the client and start listening to the audio stream
# the context manager calls the create_session method on the client and starts listening to the audio stream
async with audio_player, realtime_client(settings=settings, create_response=True):
async for event in realtime_client.receive():
match event.event_type:
case "text":
# the model returns both audio and transcript of the audio, which we will print
print(event.text.text, end="")
case "service":
# OpenAI Specific events
@@ -78,7 +75,7 @@ async def main() -> None:

if __name__ == "__main__":
print(
"Instruction: start speaking, when you stop the API should detect you finished and start responding. "
"Instructions: Begin speaking. The API will detect when you stop and automatically generate a response. "
"Press ctrl + c to stop the program."
)
asyncio.run(main())
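
The WebRTC variant mirrors this, but the microphone is handed to the client as a WebRTC audio track at construction time, and received audio is pushed through a callback rather than surfaced as audio events; this sample also matches on event.event_type strings instead of event classes. A condensed sketch under the same assumptions as above:

import asyncio

from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC
from semantic_kernel.connectors.ai.open_ai import OpenAIRealtimeExecutionSettings, OpenAIRealtimeWebRTC


async def run() -> None:
    player = AudioPlayerWebRTC()
    client = OpenAIRealtimeWebRTC(
        audio_track=AudioRecorderWebRTC(),  # microphone, supplied up front as the WebRTC track
        audio_output_callback=player.client_callback,  # playback bypasses the event stream
    )
    settings = OpenAIRealtimeExecutionSettings(instructions="You are a helpful assistant.", voice="alloy")
    async with player, client(settings=settings, create_response=True):
        async for event in client.receive():
            if event.event_type == "text":
                print(event.text.text, end="")


if __name__ == "__main__":
    asyncio.run(run())
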
@@ -9,13 +9,13 @@
from semantic_kernel import Kernel
from semantic_kernel.connectors.ai import FunctionChoiceBehavior
from semantic_kernel.connectors.ai.open_ai import (
AzureRealtime,
AzureRealtimeWebsocket,
ListenEvents,
OpenAIRealtimeExecutionSettings,
TurnDetection,
)
from semantic_kernel.contents import ChatHistory
from semantic_kernel.contents.events import RealtimeTextEvent
from semantic_kernel.contents.realtime_events import RealtimeTextEvent
from semantic_kernel.functions import kernel_function

logger = logging.getLogger(__name__)
@@ -67,8 +67,7 @@ async def main() -> None:
# you can define the protocol to use, either "websocket" or "webrtc"
# (at this time Azure only support websockets)
# they will behave the same way, even though the underlying protocol is quite different
realtime_client = AzureRealtime(
protocol="websocket",
realtime_client = AzureRealtimeWebsocket(
audio_output_callback=audio_player.client_callback,
)
audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client)
@@ -88,6 +87,7 @@
# the "input_audio_buffer.commit" and "response.create" event to the realtime api
# to signal the end of the user's turn and start the response.
# manual VAD is not part of this sample
# for more info: https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-turn_detection
settings = OpenAIRealtimeExecutionSettings(
instructions=instructions,
voice="alloy",
Expand All @@ -99,7 +99,7 @@ async def main() -> None:
chat_history.add_user_message("Hi there, who are you?")
chat_history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.")

# the context manager calls the create_session method on the client and start listening to the audio stream
# the context manager calls the create_session method on the client and starts listening to the audio stream
async with (
audio_player,
audio_recorder,
@@ -128,7 +128,7 @@ async def main() -> None:

if __name__ == "__main__":
print(
"Instruction: start speaking, when you stop the API should detect you finished and start responding. "
"Instructions: Begin speaking. The API will detect when you stop and automatically generate a response. "
"Press ctrl + c to stop the program."
)
asyncio.run(main())
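
This third sample layers function calling on top of the websocket client: a kernel with a kernel_function-decorated plugin and a chat history are passed into the session. The keyword arguments of the realtime_client(...) call are partly collapsed in this view, so the sketch below is an assumption about the full call shape; the Helpers plugin and its get_weather function are invented for illustration:

import asyncio

from semantic_kernel import Kernel
from semantic_kernel.connectors.ai import FunctionChoiceBehavior
from semantic_kernel.connectors.ai.open_ai import AzureRealtimeWebsocket, OpenAIRealtimeExecutionSettings
from semantic_kernel.contents import ChatHistory
from semantic_kernel.functions import kernel_function


class Helpers:
    @kernel_function
    def get_weather(self, city: str) -> str:
        """Illustrative plugin; a real one would call a weather service."""
        return f"The weather in {city} is sunny."


async def run() -> None:
    kernel = Kernel()
    kernel.add_plugin(Helpers(), plugin_name="helpers")

    client = AzureRealtimeWebsocket()
    settings = OpenAIRealtimeExecutionSettings(
        instructions="You are Mosscap, a helpful assistant.",
        voice="alloy",
        function_choice_behavior=FunctionChoiceBehavior.Auto(),
    )
    chat_history = ChatHistory()
    chat_history.add_user_message("Hi there, who are you?")

    # assumed call shape: settings, chat history and kernel all go to the session
    async with client(settings=settings, chat_history=chat_history, kernel=kernel, create_response=True):
        async for event in client.receive():
            ...  # handle RealtimeTextEvent and service events as in the samples above


if __name__ == "__main__":
    asyncio.run(run())
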
@@ -10,21 +10,17 @@
from semantic_kernel.connectors.ai import FunctionChoiceBehavior
from semantic_kernel.connectors.ai.open_ai import (
ListenEvents,
OpenAIRealtime,
OpenAIRealtimeExecutionSettings,
OpenAIRealtimeWebRTC,
TurnDetection,
)
from semantic_kernel.contents import ChatHistory
from semantic_kernel.contents.events import RealtimeTextEvent
from semantic_kernel.contents.realtime_events import RealtimeTextEvent
from semantic_kernel.functions import kernel_function

logging.basicConfig(level=logging.WARNING)
utils_log = logging.getLogger("samples.concepts.realtime.utils")
utils_log.setLevel(logging.INFO)
aiortc_log = logging.getLogger("aiortc")
aiortc_log.setLevel(logging.WARNING)
aioice_log = logging.getLogger("aioice")
aioice_log.setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

@@ -42,9 +38,7 @@
# you can also play around with the turn_detection settings to get the best results.
# It has device id's set in the AudioRecorderStream and AudioPlayerAsync classes,
# so you may need to adjust these for your system.
# you can check the available devices by uncommenting line below the function


# you can disable the check for available devices by commenting the line below
check_audio_devices()


@@ -84,8 +78,7 @@ async def main() -> None:
# create the realtime client and optionally add the audio output function, this is optional
# you can define the protocol to use, either "websocket" or "webrtc"
# they will behave the same way, even though the underlying protocol is quite different
realtime_client = OpenAIRealtime(
protocol="webrtc",
realtime_client = OpenAIRealtimeWebRTC(
audio_output_callback=audio_player.client_callback,
audio_track=audio_track,
)
@@ -105,6 +98,7 @@ async def main() -> None:
# the "input_audio_buffer.commit" and "response.create" event to the realtime api
# to signal the end of the user's turn and start the response.
# manual VAD is not part of this sample
# for more info: https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-turn_detection
settings = OpenAIRealtimeExecutionSettings(
instructions=instructions,
voice="alloy",
@@ -116,7 +110,7 @@ async def main() -> None:
chat_history.add_user_message("Hi there, who are you?")
chat_history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.")

# the context manager calls the create_session method on the client and start listening to the audio stream
# the context manager calls the create_session method on the client and starts listening to the audio stream
async with (
audio_player,
realtime_client(
@@ -143,7 +137,7 @@ async def main() -> None:

if __name__ == "__main__":
print(
"Instruction: start speaking, when you stop the API should detect you finished and start responding. "
"Instructions: Begin speaking. The API will detect when you stop and automatically generate a response. "
"Press ctrl + c to stop the program."
)
asyncio.run(main())
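
Both function-calling samples carry a comment explaining the trade-off around server-side voice activity detection (VAD): leave turn_detection set and the service decides when the user's turn ends, or disable it and commit the audio buffer yourself. A small illustration of the two configurations, using only TurnDetection fields that appear in these diffs; passing None to disable detection is an assumption, since manual VAD is explicitly out of scope for these samples:

from semantic_kernel.connectors.ai.open_ai import OpenAIRealtimeExecutionSettings, TurnDetection

# server-side VAD: the service auto-creates a response after 800 ms of silence
server_vad = OpenAIRealtimeExecutionSettings(
    voice="alloy",
    turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8),
)

# manual VAD (not part of these samples): detection disabled, so you send the
# "input_audio_buffer.commit" and "response.create" events yourself to end a turn
manual_vad = OpenAIRealtimeExecutionSettings(voice="alloy", turn_detection=None)
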
37 changes: 27 additions & 10 deletions python/samples/concepts/realtime/utils.py
@@ -12,13 +12,12 @@
from aiortc.mediastreams import MediaStreamError, MediaStreamTrack
from av.audio.frame import AudioFrame
from av.frame import Frame
from pydantic import PrivateAttr
from pydantic import BaseModel, ConfigDict, PrivateAttr
from sounddevice import InputStream, OutputStream

from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase
from semantic_kernel.contents.audio_content import AudioContent
from semantic_kernel.contents.events.realtime_event import RealtimeAudioEvent
from semantic_kernel.kernel_pydantic import KernelBaseModel
from semantic_kernel.contents import AudioContent
from semantic_kernel.contents.realtime_events import RealtimeAudioEvent

logger = logging.getLogger(__name__)

@@ -40,8 +39,13 @@ def check_audio_devices():
# region: Recorders


class AudioRecorderWebRTC(KernelBaseModel, MediaStreamTrack):
"""A simple class that implements the WebRTC MediaStreamTrack for audio from sounddevice."""
class AudioRecorderWebRTC(BaseModel, MediaStreamTrack):
"""A simple class that implements the WebRTC MediaStreamTrack for audio from sounddevice.
This class is meant as a demo sample and is not meant for production use.
"""

model_config = ConfigDict(populate_by_name=True, arbitrary_types_allowed=True, validate_assignment=True)

kind: ClassVar[str] = "audio"
device: str | int | None = None
@@ -156,8 +160,13 @@ async def start_recording(self):
self._is_recording = False


class AudioRecorderWebsocket(KernelBaseModel):
"""A simple class that implements a sounddevice for use with websockets."""
class AudioRecorderWebsocket(BaseModel):
"""A simple class that implements a sounddevice for use with websockets.
This class is meant as a demo sample and is not meant for production use.
"""

model_config = ConfigDict(populate_by_name=True, arbitrary_types_allowed=True, validate_assignment=True)

realtime_client: RealtimeClientBase
device: str | int | None = None
@@ -247,9 +256,11 @@ async def __aexit__(self, exc_type, exc, tb):
# region: Players


class AudioPlayerWebRTC(KernelBaseModel):
class AudioPlayerWebRTC(BaseModel):
"""Simple class that plays audio using sounddevice.
This class is meant as a demo sample and is not meant for production use.
Make sure the device_id is set to the correct device for your system.
The sample rate, channels and frame duration
@@ -265,6 +276,8 @@ class AudioPlayerWebRTC(KernelBaseModel):
"""

model_config = ConfigDict(populate_by_name=True, arbitrary_types_allowed=True, validate_assignment=True)

device: int | None = None
sample_rate: int = SAMPLE_RATE_WEBRTC
channels: int = PLAYER_CHANNELS_WEBRTC
@@ -356,9 +369,11 @@ async def add_audio(self, audio_content: AudioContent) -> None:
logger.error(f"Unknown audio content: {audio_content}")


class AudioPlayerWebsocket(KernelBaseModel):
class AudioPlayerWebsocket(BaseModel):
"""Simple class that plays audio using sounddevice.
This class is meant as a demo sample and is not meant for production use.
Make sure the device_id is set to the correct device for your system.
The sample rate, channels and frame duration
@@ -374,6 +389,8 @@ class AudioPlayerWebsocket(KernelBaseModel):
"""

model_config = ConfigDict(populate_by_name=True, arbitrary_types_allowed=True, validate_assignment=True)

device: int | None = None
sample_rate: int = SAMPLE_RATE
channels: int = PLAYER_CHANNELS
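
The utils.py changes swap the internal KernelBaseModel base class for plain pydantic BaseModel plus an explicit model_config on each helper; arbitrary_types_allowed is the setting that lets pydantic fields hold non-pydantic types such as sounddevice streams. A standalone illustration of that pattern (the DemoPlayer class and its fields are invented for this example):

from pydantic import BaseModel, ConfigDict
from sounddevice import OutputStream


class DemoPlayer(BaseModel):
    # without arbitrary_types_allowed, pydantic rejects OutputStream because
    # it is not a type pydantic knows how to validate
    model_config = ConfigDict(populate_by_name=True, arbitrary_types_allowed=True, validate_assignment=True)

    device: int | None = None
    stream: OutputStream | None = None
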