# Copyright (c) Microsoft. All rights reserved.

import asyncio
import logging
from datetime import datetime
from random import randint

from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC, check_audio_devices
from semantic_kernel import Kernel
from semantic_kernel.connectors.ai import FunctionChoiceBehavior
from semantic_kernel.connectors.ai.open_ai import (
    ListenEvents,
    OpenAIRealtimeExecutionSettings,
    OpenAIRealtimeWebRTC,
    TurnDetection,
)
from semantic_kernel.contents import ChatHistory
from semantic_kernel.contents.realtime_events import RealtimeTextEvent
from semantic_kernel.functions import kernel_function

logging.basicConfig(level=logging.WARNING)
utils_log = logging.getLogger("samples.concepts.realtime.utils")
utils_log.setLevel(logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
"""
This simple sample demonstrates how to use the OpenAI Realtime API to create
a chat bot that can listen and respond directly through audio.
It requires installing:
- semantic-kernel[realtime]
- pyaudio
- sounddevice
- pydub
e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime]
For more details of the exact setup, see the README.md in the realtime folder.
"""

# The characteristics of your speaker and microphone are a big factor in a smooth conversation,
# so you may need to try out different devices for each.
# You can also play around with the turn_detection settings to get the best results.
# The device ids are set in the AudioRecorderWebRTC and AudioPlayerWebRTC classes,
# so you may need to adjust these for your system.
# You can disable the check for available devices by commenting out the line below.
check_audio_devices()
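
# Hedged sketch (not part of the original sample): both utility classes accept a
# device_id parameter (the device index; None means the default device), so explicit
# devices could be selected roughly like this. The index values are placeholders for
# whatever check_audio_devices() reports on your system:
#
#     player = AudioPlayerWebRTC(device_id=1)      # hypothetical output device index
#     recorder = AudioRecorderWebRTC(device_id=2)  # hypothetical input device index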


@kernel_function
def get_weather(location: str) -> str:
    """Get the weather for a location."""
    weather_conditions = ("sunny", "hot", "cloudy", "raining", "freezing", "snowing")
    weather = weather_conditions[randint(0, len(weather_conditions) - 1)]  # nosec
    logger.info(f"@ Getting weather for {location}: {weather}")
    return f"The weather in {location} is {weather}."


@kernel_function
def get_date_time() -> str:
    """Get the current date and time."""
    logger.info("@ Getting current datetime")
    return f"The current date and time is {datetime.now().isoformat()}."


@kernel_function
def goodbye():
    """When the user is done, say goodbye and then call this function."""
    logger.info("@ Goodbye has been called!")
    raise KeyboardInterrupt


async def main() -> None:
    print_transcript = True
    # Create the Kernel and add some simple functions for function calling.
    kernel = Kernel()
    kernel.add_functions(plugin_name="helpers", functions=[goodbye, get_weather, get_date_time])

    # Create the audio player and audio track.
    # Both take a device_id parameter, which is the index of the device to use;
    # if None, the default device is used.
    audio_player = AudioPlayerWebRTC()
    # Create the realtime client; the audio output callback is optional here
    # and can also be passed to the receive method, as done below.
    realtime_client = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC())

    # Create the settings for the session.
    # The realtime api does not use a system message, but takes instructions as a parameter for a session.
    # Another important setting is the server_vad turn detection:
    # if this is turned off (by setting turn_detection=None), you have to send
    # the "input_audio_buffer.commit" and "response.create" events to the realtime api
    # to signal the end of the user's turn and start the response.
    # Manual VAD is not part of this sample; a hedged sketch of the raw events follows the settings below.
    # For more info: https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-turn_detection
    settings = OpenAIRealtimeExecutionSettings(
        instructions="""
    You are a chat bot. Your name is Mosscap and
    you have one goal: figure out what people need.
    Your full name, should you need to know it, is
    Splendid Speckled Mosscap. You communicate
    effectively, but you tend to answer with long
    flowery prose.
    """,
        voice="alloy",
        turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8),
        function_choice_behavior=FunctionChoiceBehavior.Auto(),
    )
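
    # Hedged sketch (assumption, not from the original sample): with turn_detection=None,
    # the client itself must mark turn boundaries by sending the two raw OpenAI realtime
    # events named above; on the wire they look roughly like:
    #     {"type": "input_audio_buffer.commit"}  # end of the user's audio turn
    #     {"type": "response.create"}            # ask the model to start responding
    # How exactly these are sent through the Semantic Kernel client is not shown here.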

    # We can also seed the session with an existing chat history.
    chat_history = ChatHistory()
    chat_history.add_user_message("Hi there, who are you?")
    chat_history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.")

    # The context manager calls the create_session method on the client and starts listening to the audio stream.
    async with (
        audio_player,
        realtime_client(
            settings=settings,
            chat_history=chat_history,
            kernel=kernel,
            create_response=True,
        ),
    ):
        async for event in realtime_client.receive(audio_output_callback=audio_player.client_callback):
            match event:
                case RealtimeTextEvent():
                    if print_transcript:
                        print(event.text.text, end="")
                case _:
                    # OpenAI-specific events
                    match event.service_type:
                        case ListenEvents.RESPONSE_CREATED:
                            if print_transcript:
                                print("\nMosscap (transcript): ", end="")
                        case ListenEvents.ERROR:
                            logger.error(event.service_event)


if __name__ == "__main__":
    print(
        "Instructions: The model will start speaking immediately; "
        "this can be turned off by removing `create_response=True` above. "
        "The model will detect when you stop and automatically generate a response. "
        "Press ctrl + c to stop the program."
    )
    asyncio.run(main())
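
# Usage note (assumption, not from the original sample): the OpenAI connector reads its
# credentials from the environment, typically OPENAI_API_KEY, so running the sample
# looks roughly like:
#     export OPENAI_API_KEY="sk-..."
#     python samples/concepts/realtime/realtime_chat_with_function_calling_webrtc.py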