Skip to content

Commit 9268d10

Browse files
removed built-in audio players, split for websocket and rtc
1 parent 6d117be commit 9268d10

File tree

15 files changed

+689
-743
lines changed

15 files changed

+689
-743
lines changed

docs/decisions/00XX-realtime-api-clients.md

Lines changed: 0 additions & 346 deletions
This file was deleted.

python/samples/concepts/audio/utils.py

Lines changed: 0 additions & 11 deletions
This file was deleted.

python/samples/concepts/audio/04-chat_with_realtime_api_simple.py renamed to python/samples/concepts/realtime/01-chat_with_realtime_webrtc.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,17 @@
33
import asyncio
44
import logging
55

6-
from samples.concepts.audio.utils import check_audio_devices
6+
from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC, check_audio_devices
77
from semantic_kernel.connectors.ai.open_ai import (
88
ListenEvents,
99
OpenAIRealtime,
1010
OpenAIRealtimeExecutionSettings,
1111
TurnDetection,
1212
)
13-
from semantic_kernel.connectors.ai.utils import SKAudioPlayer
1413

1514
logging.basicConfig(level=logging.WARNING)
15+
utils_log = logging.getLogger("samples.concepts.realtime.utils")
16+
utils_log.setLevel(logging.INFO)
1617
aiortc_log = logging.getLogger("aiortc")
1718
aiortc_log.setLevel(logging.WARNING)
1819
aioice_log = logging.getLogger("aioice")
@@ -43,7 +44,12 @@ async def main() -> None:
4344
# create the realtime client; passing an audio output callback is optional
4445
# you can define the protocol to use, either "websocket" or "webrtc"
4546
# they will behave the same way, even though the underlying protocol is quite different
46-
realtime_client = OpenAIRealtime("webrtc")
47+
audio_player = AudioPlayerWebRTC()
48+
realtime_client = OpenAIRealtime(
49+
"webrtc",
50+
audio_output_callback=audio_player.client_callback,
51+
audio_track=AudioRecorderWebRTC(),
52+
)
4753
# Create the settings for the session
4854
settings = OpenAIRealtimeExecutionSettings(
4955
instructions="""
@@ -58,15 +64,15 @@ async def main() -> None:
5864
turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8),
5965
)
6066
# the context manager calls the create_session method on the client and starts listening to the audio stream
61-
audio_player = SKAudioPlayer()
67+
6268
print("Mosscap (transcript): ", end="")
6369
async with realtime_client, audio_player:
6470
await realtime_client.update_session(settings=settings, create_response=True)
6571

6672
async for event in realtime_client.receive():
6773
match event.event_type:
68-
case "audio":
69-
await audio_player.add_audio(event.audio)
74+
# case "audio":
75+
# await audio_player.add_audio(event.audio)
7076
case "text":
7177
print(event.text.text, end="")
7278
case "service":
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
# Copyright (c) Microsoft. All rights reserved.
2+
3+
import asyncio
4+
import logging
5+
6+
from samples.concepts.realtime.utils import AudioPlayerWebsocket, AudioRecorderWebsocket, check_audio_devices
7+
from semantic_kernel.connectors.ai.open_ai import (
8+
ListenEvents,
9+
OpenAIRealtime,
10+
OpenAIRealtimeExecutionSettings,
11+
TurnDetection,
12+
)
13+
14+
logging.basicConfig(level=logging.WARNING)
15+
utils_log = logging.getLogger("samples.concepts.realtime.utils")
16+
utils_log.setLevel(logging.INFO)
17+
aiortc_log = logging.getLogger("aiortc")
18+
aiortc_log.setLevel(logging.WARNING)
19+
aioice_log = logging.getLogger("aioice")
20+
aioice_log.setLevel(logging.WARNING)
21+
logger = logging.getLogger(__name__)
22+
logger.setLevel(logging.INFO)
23+
24+
# This simple sample demonstrates how to use the OpenAI Realtime API to create
25+
# a chat bot that can listen and respond directly through audio.
26+
# It requires installing:
27+
# - semantic-kernel[openai_realtime]
28+
# - pyaudio
29+
# - sounddevice
30+
# - pydub
31+
# - aiortc
32+
# e.g. pip install pyaudio sounddevice pydub
33+
34+
# The characteristics of your speaker and microphone are a big factor in a smooth conversation
35+
# so you may need to try out different devices for each.
36+
# you can also play around with the turn_detection settings to get the best results.
37+
# Device IDs are set in the AudioRecorderWebsocket and AudioPlayerWebsocket classes,
38+
# so you may need to adjust these for your system.
39+
# you can check the available devices with the call below; comment it out once you know which devices to use
40+
check_audio_devices()
41+
42+
43+
async def main() -> None:
44+
# create the realtime client; passing an audio output callback is optional
45+
# you can define the protocol to use, either "websocket" or "webrtc"
46+
# they will behave the same way, even though the underlying protocol is quite different
47+
audio_player = AudioPlayerWebsocket()
48+
realtime_client = OpenAIRealtime(
49+
"websocket",
50+
audio_output_callback=audio_player.client_callback,
51+
)
52+
audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client)
53+
# Create the settings for the session
54+
settings = OpenAIRealtimeExecutionSettings(
55+
instructions="""
56+
You are a chat bot. Your name is Mosscap and
57+
you have one goal: figure out what people need.
58+
Your full name, should you need to know it, is
59+
Splendid Speckled Mosscap. You communicate
60+
effectively, but you tend to answer with long
61+
flowery prose.
62+
""",
63+
voice="shimmer",
64+
turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8),
65+
)
66+
# the context manager calls the create_session method on the client and starts listening to the audio stream
67+
print("Mosscap (transcript): ", end="")
68+
69+
async with realtime_client, audio_player, audio_recorder:
70+
await realtime_client.update_session(settings=settings, create_response=True)
71+
72+
async for event in realtime_client.receive():
73+
match event.event_type:
74+
# this can be used as an alternative to the callback function used above,
75+
# the callback is faster and smoother
76+
# case "audio":
77+
# await audio_player.add_audio(event.audio)
78+
case "text":
79+
print(event.text.text, end="")
80+
case "service":
81+
# OpenAI Specific events
82+
if event.service_type == ListenEvents.SESSION_UPDATED:
83+
print("Session updated")
84+
if event.service_type == ListenEvents.RESPONSE_CREATED:
85+
print("")
86+
if event.service_type == ListenEvents.ERROR:
87+
logger.error(event.event)
88+
89+
90+
if __name__ == "__main__":
91+
print(
92+
"Instruction: start speaking, when you stop the API should detect you finished and start responding. "
93+
"Press ctrl + c to stop the program."
94+
)
95+
asyncio.run(main())

python/samples/concepts/audio/05-chat_with_realtime_api_complex.py renamed to python/samples/concepts/realtime/02-chat_with_function_calling.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from datetime import datetime
66
from random import randint
77

8-
from samples.concepts.audio.utils import check_audio_devices
8+
from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC, check_audio_devices
99
from semantic_kernel import Kernel
1010
from semantic_kernel.connectors.ai import FunctionChoiceBehavior
1111
from semantic_kernel.connectors.ai.open_ai import (
@@ -14,11 +14,12 @@
1414
OpenAIRealtimeExecutionSettings,
1515
TurnDetection,
1616
)
17-
from semantic_kernel.connectors.ai.utils import SKAudioPlayer, SKAudioTrack
1817
from semantic_kernel.contents import ChatHistory
1918
from semantic_kernel.functions import kernel_function
2019

2120
logging.basicConfig(level=logging.WARNING)
21+
utils_log = logging.getLogger("samples.concepts.realtime.utils")
22+
utils_log.setLevel(logging.INFO)
2223
aiortc_log = logging.getLogger("aiortc")
2324
aiortc_log.setLevel(logging.WARNING)
2425
aioice_log = logging.getLogger("aioice")
@@ -78,15 +79,15 @@ async def main() -> None:
7879

7980
# create the audio player and audio track
8081
# both take a device_id parameter, which is the index of the device to use, if None the default device is used
81-
audio_player = SKAudioPlayer(sample_rate=24000, frame_duration=100, channels=1)
82-
audio_track = SKAudioTrack()
82+
audio_player = AudioPlayerWebRTC()
83+
audio_track = AudioRecorderWebRTC()
8384
# create the realtime client; passing an audio output callback is optional
8485
# you can define the protocol to use, either "websocket" or "webrtc"
8586
# they will behave the same way, even though the underlying protocol is quite different
8687
realtime_client = OpenAIRealtime(
87-
protocol="websocket",
88+
protocol="webrtc",
8889
audio_output_callback=audio_player.client_callback,
89-
# audio_track=audio_track,
90+
audio_track=audio_track,
9091
)
9192

9293
# Create the settings for the session
@@ -116,7 +117,7 @@ async def main() -> None:
116117
chat_history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.")
117118

118119
# the context manager calls the create_session method on the client and starts listening to the audio stream
119-
async with realtime_client, audio_player, audio_track.stream_to_realtime_client(realtime_client):
120+
async with realtime_client, audio_player:
120121
await realtime_client.update_session(
121122
settings=settings, chat_history=chat_history, kernel=kernel, create_response=True
122123
)

0 commit comments

Comments
 (0)