Skip to content

Commit 6c30fed

Browse files
committed
improving azure speech to text function to add support for top 4 languages
1 parent d330d87 commit 6c30fed

File tree

1 file changed

+64
-5
lines changed

1 file changed

+64
-5
lines changed

common/audio_utils.py

+64-5
Original file line numberDiff line numberDiff line change
@@ -82,12 +82,71 @@ def recognize_whisper_api_from_file(file_name: str, whisper_model: str):
8282
transcript = recognize_whisper_api(audio_file, whisper_model)
8383
return transcript
8484

85+
8586
def recognize_azure_speech_to_text_from_file(file_path: str, key: str, region: str) -> str:
    """
    Transcribe speech from an audio file using Azure Speech to Text.

    The spoken language is auto-detected among four candidate locales
    (en-US, zh-CN, hi-IN, es-ES). Recognition is performed with a single
    ``recognize_once_async`` call, so only one recognition result is
    returned for the file.

    Args:
        file_path (str): Path to the audio file.
        key (str): Azure Speech Service subscription key.
        region (str): Azure service region.

    Returns:
        str: Transcribed text.

    Raises:
        RuntimeError: If the SDK call fails, no speech is recognized,
            or the recognition is canceled by the service.
    """
    # Candidate locales for automatic language detection. The list is kept
    # at four entries because the SDK only accepts four candidates here.
    languages = ["en-US", "zh-CN", "hi-IN", "es-ES"]

    # Only the SDK interaction is guarded: failures raised by the SDK are
    # normalised to RuntimeError (with the original exception chained),
    # while the RuntimeErrors raised below for NoMatch/Canceled are not
    # re-wrapped with a second, redundant prefix.
    try:
        # Create a speech configuration with the subscription key and region
        speech_config = speechsdk.SpeechConfig(subscription=key, region=region)

        # Create an audio configuration pointing to the audio file
        audio_config = speechsdk.AudioConfig(filename=file_path)

        # Configure auto language detection with the candidate locales
        auto_detect_source_language_config = (
            speechsdk.languageconfig.AutoDetectSourceLanguageConfig(languages=languages)
        )

        # Create a speech recognizer with the auto language detection configuration
        speech_recognizer = speechsdk.SpeechRecognizer(
            speech_config=speech_config,
            audio_config=audio_config,
            auto_detect_source_language_config=auto_detect_source_language_config,
        )

        # Perform speech recognition (blocks until the async result is ready)
        result = speech_recognizer.recognize_once_async().get()
    except Exception as e:
        raise RuntimeError(f"An error occurred during speech recognition: {e}") from e

    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        # Retrieve the detected language for diagnostics only
        detected_language = result.properties.get(
            speechsdk.PropertyId.SpeechServiceConnection_AutoDetectSourceLanguageResult,
            "Unknown",
        )
        # No exc_info here: there is no active exception on the success path
        logging.debug("Detected Language %s", detected_language)
        return result.text

    if result.reason == speechsdk.ResultReason.NoMatch:
        raise RuntimeError("No speech could be recognized from the audio.")

    if result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        raise RuntimeError(
            f"Speech Recognition canceled: {cancellation_details.reason}. "
            f"Error details: {cancellation_details.error_details}"
        )

    raise RuntimeError("Unknown error occurred during speech recognition.")
149+
91150

92151
def speech_to_text_from_file(file_path: str):
93152
"""

0 commit comments

Comments
 (0)