diff --git a/build/modules.conf.in b/build/modules.conf.in
index 8851ded303c..c2e0d7c08b4 100755
--- a/build/modules.conf.in
+++ b/build/modules.conf.in
@@ -47,6 +47,8 @@ applications/mod_voicemail
 #asr_tts/mod_flite
 #asr_tts/mod_pocketsphinx
 #asr_tts/mod_tts_commandline
+#asr_tts/mod_google_asr
+#asr_tts/mod_openai_asr
 codecs/mod_amr
 #codecs/mod_amrwb
 codecs/mod_b64
diff --git a/configure.ac b/configure.ac
index 2141e5e3c0f..c9c807f8d4b 100755
--- a/configure.ac
+++ b/configure.ac
@@ -2016,6 +2016,8 @@ AC_CONFIG_FILES([Makefile
 		src/mod/asr_tts/mod_flite/Makefile
 		src/mod/asr_tts/mod_pocketsphinx/Makefile
 		src/mod/asr_tts/mod_tts_commandline/Makefile
+		src/mod/asr_tts/mod_google_asr/Makefile
+		src/mod/asr_tts/mod_openai_asr/Makefile
 		src/mod/codecs/mod_amr/Makefile
 		src/mod/codecs/mod_amrwb/Makefile
 		src/mod/codecs/mod_b64/Makefile
diff --git a/src/mod/asr_tts/mod_google_asr/Makefile.am b/src/mod/asr_tts/mod_google_asr/Makefile.am
new file mode 100644
index 00000000000..a829a855c9c
--- /dev/null
+++ b/src/mod/asr_tts/mod_google_asr/Makefile.am
@@ -0,0 +1,12 @@
+
+include $(top_srcdir)/build/modmake.rulesam
+
+MODNAME=mod_google_asr
+mod_LTLIBRARIES = mod_google_asr.la
+mod_google_asr_la_SOURCES = mod_google_asr.c utils.c curl.c
+mod_google_asr_la_CFLAGS = $(AM_CFLAGS) -I. -Wno-pointer-arith
+mod_google_asr_la_LIBADD = $(switch_builddir)/libfreeswitch.la
+mod_google_asr_la_LDFLAGS = -avoid-version -module -no-undefined -shared
+
+$(am_mod_google_asr_la_OBJECTS): mod_google_asr.h
+
diff --git a/src/mod/asr_tts/mod_google_asr/conf/autoload_configs/google_asr.conf.xml b/src/mod/asr_tts/mod_google_asr/conf/autoload_configs/google_asr.conf.xml
new file mode 100644
index 00000000000..76772fa7776
--- /dev/null
+++ b/src/mod/asr_tts/mod_google_asr/conf/autoload_configs/google_asr.conf.xml
@@ -0,0 +1,42 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/mod/asr_tts/mod_google_asr/conf/dialplan/dialplan.xml b/src/mod/asr_tts/mod_google_asr/conf/dialplan/dialplan.xml
new file mode 100644
index 00000000000..121cc2b13d4
--- /dev/null
+++ b/src/mod/asr_tts/mod_google_asr/conf/dialplan/dialplan.xml
@@ -0,0 +1,11 @@
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/mod/asr_tts/mod_google_asr/curl.c b/src/mod/asr_tts/mod_google_asr/curl.c
new file mode 100644
index 00000000000..636a49c59c6
--- /dev/null
+++ b/src/mod/asr_tts/mod_google_asr/curl.c
@@ -0,0 +1,150 @@
+/*
+ * FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application
+ * Copyright (C) 2005-2014, Anthony Minessale II
+ *
+ * Version: MPL 1.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * Module Contributor(s):
+ * Konstantin Alexandrin
+ *
+ *
+ */
+#include "mod_google_asr.h"
+
+/**
+ * libcurl write callback: appends the received bytes to the response buffer
+ * attached to the context (curl_recv_buffer_ref), if any.
+ * Always reports the whole chunk as consumed.
+ */
+static size_t curl_io_write_callback(char *buffer, size_t size, size_t nitems, void *user_data) {
+    asr_ctx_t *asr_ctx = (asr_ctx_t *)user_data;
+    size_t len = (size * nitems);
+
+    if(len > 0 && asr_ctx->curl_recv_buffer_ref) {
+        switch_buffer_write(asr_ctx->curl_recv_buffer_ref, buffer, len);
+    }
+
+    return len;
+}
+
+/**
+ * libcurl read callback: hands out the next part of the request body and
+ * advances curl_send_buffer_ref / curl_send_buffer_len accordingly.
+ * Returns 0 once the body has been sent completely.
+ */
+static size_t curl_io_read_callback(char *buffer, size_t size, size_t nitems, void *user_data) {
+    asr_ctx_t *asr_ctx = (asr_ctx_t *)user_data;
+    size_t nmax = (size * nitems);
+    size_t ncur = (asr_ctx->curl_send_buffer_len > nmax) ? nmax : asr_ctx->curl_send_buffer_len;
+
+    if(ncur > 0) {
+        memmove(buffer, asr_ctx->curl_send_buffer_ref, ncur);
+        asr_ctx->curl_send_buffer_ref += ncur;
+        asr_ctx->curl_send_buffer_len -= ncur;
+    }
+
+    return ncur;
+}
+
+/**
+ * Posts the request body referenced by asr_ctx->curl_send_buffer_ref to
+ * globals->api_url ("${api-key}" is replaced by the key of the context) and
+ * collects the response in asr_ctx->curl_recv_buffer_ref, NUL-terminating it
+ * when it is not empty.
+ *
+ * @return SWITCH_STATUS_SUCCESS on HTTP 200, SWITCH_STATUS_FALSE otherwise
+ */
+switch_status_t curl_perform(asr_ctx_t *asr_ctx, globals_t *globals) {
+    switch_status_t status = SWITCH_STATUS_SUCCESS;
+    CURL *curl_handle = NULL;
+    switch_curl_slist_t *headers = NULL;
+    char *epurl = NULL;
+    switch_CURLcode curl_ret = 0;
+    long http_resp = 0;
+
+    if(asr_ctx->api_key) {
+        epurl = switch_string_replace(globals->api_url, "${api-key}", asr_ctx->api_key);
+    } else {
+        epurl = strdup(globals->api_url);
+    }
+    if(!epurl) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "Unable to build the endpoint url\n");
+        switch_goto_status(SWITCH_STATUS_FALSE, out);
+    }
+
+    if((curl_handle = switch_curl_easy_init()) == NULL) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "switch_curl_easy_init()\n");
+        switch_goto_status(SWITCH_STATUS_FALSE, out);
+    }
+    headers = switch_curl_slist_append(headers, "Content-Type: application/json; charset=utf-8");
+
+    switch_curl_easy_setopt(curl_handle, CURLOPT_HTTPHEADER, headers);
+    switch_curl_easy_setopt(curl_handle, CURLOPT_POST, 1);
+    switch_curl_easy_setopt(curl_handle, CURLOPT_NOSIGNAL, 1);
+    switch_curl_easy_setopt(curl_handle, CURLOPT_READFUNCTION, curl_io_read_callback);
+    switch_curl_easy_setopt(curl_handle, CURLOPT_READDATA, (void *)asr_ctx);
+    switch_curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, curl_io_write_callback);
+    switch_curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)asr_ctx);
+
+    if(globals->connect_timeout > 0) {
+        switch_curl_easy_setopt(curl_handle, CURLOPT_CONNECTTIMEOUT, globals->connect_timeout);
+    }
+    if(globals->request_timeout > 0) {
+        switch_curl_easy_setopt(curl_handle, CURLOPT_TIMEOUT, globals->request_timeout);
+    }
+    if(globals->user_agent) {
+        switch_curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, globals->user_agent);
+    }
+    /* TLS peer/host verification is deliberately left at the libcurl defaults (enabled):
+     * the request carries the API key and the caller's audio */
+    if(globals->proxy) {
+        if(globals->proxy_credentials != NULL) {
+            switch_curl_easy_setopt(curl_handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY);
+            switch_curl_easy_setopt(curl_handle, CURLOPT_PROXYUSERPWD, globals->proxy_credentials);
+        }
+        switch_curl_easy_setopt(curl_handle, CURLOPT_PROXY, globals->proxy);
+    }
+
+    switch_curl_easy_setopt(curl_handle, CURLOPT_URL, epurl);
+
+    curl_ret = switch_curl_easy_perform(curl_handle);
+    if(!curl_ret) {
+        switch_curl_easy_getinfo(curl_handle, CURLINFO_RESPONSE_CODE, &http_resp);
+        if(!http_resp) { switch_curl_easy_getinfo(curl_handle, CURLINFO_HTTP_CONNECTCODE, &http_resp); }
+    } else {
+        http_resp = curl_ret;
+    }
+
+    if(http_resp != 200) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "http-error=[%ld] (%s)\n", http_resp, globals->api_url);
+        status = SWITCH_STATUS_FALSE;
+    }
+
+    if(asr_ctx->curl_recv_buffer_ref) {
+        if(switch_buffer_inuse(asr_ctx->curl_recv_buffer_ref) > 0) {
+            switch_buffer_write(asr_ctx->curl_recv_buffer_ref, "\0", 1);
+        }
+    }
+
+out:
+    if(curl_handle) {
+        switch_curl_easy_cleanup(curl_handle);
+    }
+
+    if(headers) {
+        switch_curl_slist_free_all(headers);
+    }
+
+    switch_safe_free(epurl);
+    return status;
+}
diff --git a/src/mod/asr_tts/mod_google_asr/mod_google_asr.c b/src/mod/asr_tts/mod_google_asr/mod_google_asr.c
new file mode 100644
index 00000000000..736577b26be
--- /dev/null
+++ b/src/mod/asr_tts/mod_google_asr/mod_google_asr.c
@@ -0,0 +1,850 @@
+/*
+ * FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application
+ * Copyright (C) 2005-2014, Anthony Minessale II
+ *
+ * Version: MPL 1.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * Module Contributor(s):
+ * Konstantin Alexandrin
+ *
+ *
+ * Google Speech-To-Text service for the Freeswitch.
+ * https://cloud.google.com/speech-to-text/docs/reference/rest
+ *
+ * Development repository:
+ * https://github.com/akscf/mod_google_asr
+ *
+ */
+#include "mod_google_asr.h"
+
+globals_t globals;
+
+SWITCH_MODULE_LOAD_FUNCTION(mod_google_asr_load);
+SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_google_asr_shutdown);
+SWITCH_MODULE_DEFINITION(mod_google_asr, mod_google_asr_load, mod_google_asr_shutdown, NULL);
+
+
+/**
+ * Per-handle worker thread (started by asr_open): drains q_audio into a chunk
+ * buffer and, once the silence timeout expires or the chunk buffer fills up,
+ * posts the chunk to the speech service and queues the resulting text on q_text.
+ * Holds a reference on the context (asr_ctx->refs) for as long as it runs.
+ */
+static void *SWITCH_THREAD_FUNC transcribe_thread(switch_thread_t *thread, void *obj) {
+    volatile asr_ctx_t *_ref = (asr_ctx_t *)obj;
+    asr_ctx_t *asr_ctx = (asr_ctx_t *)_ref;
+    switch_status_t status = SWITCH_STATUS_FALSE;
+    switch_byte_t *base64_buffer = NULL;
+    switch_byte_t *curl_send_buffer = NULL;
+    switch_buffer_t *chunk_buffer = NULL;
+    switch_buffer_t *curl_recv_buffer = NULL;
+    switch_memory_pool_t *pool = NULL;
+    time_t speech_timeout = 0;
+    uint32_t base64_buffer_size = 0, chunk_buffer_size = 0, recv_len = 0;
+    uint32_t schunks = 0;
+    uint8_t fl_cbuff_overflow = SWITCH_FALSE;
+    const void *curl_recv_buffer_ptr = NULL;
+    void *pop = NULL;
+
+    switch_mutex_lock(asr_ctx->mutex);
+    asr_ctx->refs++;
+    switch_mutex_unlock(asr_ctx->mutex);
+
+    if(switch_core_new_memory_pool(&pool) != SWITCH_STATUS_SUCCESS) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "switch_core_new_memory_pool()\n");
+        goto out;
+    }
+    if(switch_buffer_create_dynamic(&curl_recv_buffer, 1024, 4096, 32648) != SWITCH_STATUS_SUCCESS) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "switch_buffer_create_dynamic()\n");
+        goto out;
+    }
+
+    while(SWITCH_TRUE) {
+        if(globals.fl_shutdown || asr_ctx->fl_destroyed ) {
+            break;
+        }
+
+        if(chunk_buffer_size == 0) {
+            switch_mutex_lock(asr_ctx->mutex);
+            chunk_buffer_size = asr_ctx->chunk_buffer_size;
+            switch_mutex_unlock(asr_ctx->mutex);
+
+            if(chunk_buffer_size > 0) {
+                if(switch_buffer_create(pool, &chunk_buffer, chunk_buffer_size) != SWITCH_STATUS_SUCCESS) {
+                    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "switch_buffer_create()\n");
+                    break;
+                }
+                switch_buffer_zero(chunk_buffer);
+            }
+
+            goto timer_next;
+        }
+
+        fl_cbuff_overflow = SWITCH_FALSE;
+        while(switch_queue_trypop(asr_ctx->q_audio, &pop) == SWITCH_STATUS_SUCCESS) {
+            xdata_buffer_t *audio_buffer = (xdata_buffer_t *)pop;
+            if(globals.fl_shutdown || asr_ctx->fl_destroyed ) {
+                xdata_buffer_free(&audio_buffer);
+                break;
+            }
+            if(audio_buffer && audio_buffer->len) {
+                switch_size_t used = switch_buffer_write(chunk_buffer, audio_buffer->data, audio_buffer->len);
+                /* switch_buffer_write() returns 0 when the frame does not fit; treat that like a
+                 * full buffer, otherwise the chunk is never flushed and the audio is silently lost */
+                if(used >= chunk_buffer_size || (used == 0 && switch_buffer_inuse(chunk_buffer) > 0)) {
+                    fl_cbuff_overflow = SWITCH_TRUE;
+                    /* release the frame before leaving the loop (it leaked here before) */
+                    xdata_buffer_free(&audio_buffer);
+                    break;
+                }
+                schunks++;
+            }
+            xdata_buffer_free(&audio_buffer);
+        }
+
+        if(fl_cbuff_overflow) {
+            speech_timeout = 1;
+        } else {
+            if(schunks && asr_ctx->vad_state == SWITCH_VAD_STATE_STOP_TALKING) {
+                if(!speech_timeout) {
+                    speech_timeout = asr_ctx->silence_sec + switch_epoch_time_now(NULL);
+                }
+            }
+            if(speech_timeout && (asr_ctx->vad_state == SWITCH_VAD_STATE_START_TALKING || asr_ctx->vad_state == SWITCH_VAD_STATE_TALKING)) {
+                speech_timeout = 0;
+            }
+        }
+
+        if(speech_timeout && speech_timeout <= switch_epoch_time_now(NULL)) {
+            const void *chunk_buffer_ptr = NULL;
+            uint32_t buf_len = switch_buffer_peek_zerocopy(chunk_buffer, &chunk_buffer_ptr);
+            uint32_t b64_len = BASE64_ENC_SZ(buf_len) + 1;
+            uint32_t stt_failed = 0;
+
+            if(base64_buffer_size == 0 || base64_buffer_size < b64_len) {
+                if(base64_buffer_size > 0) { switch_safe_free(base64_buffer); }
+                switch_zmalloc(base64_buffer, b64_len);
+                base64_buffer_size = b64_len;
+            } else {
+                memset(base64_buffer, 0x0, b64_len);
+            }
+
+            if(switch_b64_encode((uint8_t *)chunk_buffer_ptr, buf_len, base64_buffer, base64_buffer_size) == SWITCH_STATUS_SUCCESS) {
+                curl_send_buffer = (switch_byte_t *)switch_mprintf( "{'config':{" \
+                        "'languageCode':'%s', 'encoding':'%s', 'sampleRateHertz':'%u', 'audioChannelCount':'%u', 'maxAlternatives':'%u', " \
+                        "'profanityFilter':'%s', 'enableWordTimeOffsets':'%s', 'enableWordConfidence':'%s', 'enableAutomaticPunctuation':'%s', " \
+                        "'enableSpokenPunctuation':'%s', 'enableSpokenEmojis':'%s', 'model':'%s', 'useEnhanced':'%s', " \
+                        " 'diarizationConfig':{'enableSpeakerDiarization': '%s', 'minSpeakerCount': '%u', 'maxSpeakerCount': '%u'}, " \
+                        "'metadata':{'interactionType':'%s', 'microphoneDistance':'%s', 'recordingDeviceType':'%s'}}, 'audio':{'content':'%s'}}",
+                        asr_ctx->lang,
+                        globals.opt_encoding,
+                        asr_ctx->samplerate,
+                        asr_ctx->channels,
+                        asr_ctx->opt_max_alternatives,
+                        BOOL2STR(asr_ctx->opt_enable_profanity_filter),
+                        BOOL2STR(asr_ctx->opt_enable_word_time_offsets),
+                        BOOL2STR(asr_ctx->opt_enable_word_confidence),
+                        BOOL2STR(asr_ctx->opt_enable_automatic_punctuation),
+                        BOOL2STR(asr_ctx->opt_enable_spoken_punctuation),
+                        BOOL2STR(asr_ctx->opt_enable_spoken_emojis),
+                        asr_ctx->opt_speech_model,
+                        BOOL2STR(asr_ctx->opt_use_enhanced_model),
+                        BOOL2STR(asr_ctx->opt_enable_speaker_diarization),
+                        asr_ctx->opt_diarization_min_speaker_count,
+                        asr_ctx->opt_diarization_max_speaker_count,
+                        asr_ctx->opt_meta_interaction_type,
+                        asr_ctx->opt_meta_microphone_distance,
+                        asr_ctx->opt_meta_recording_device_type,
+                        base64_buffer
+                );
+
+                /* never carry over the result of a previous chunk */
+                status = SWITCH_STATUS_FALSE;
+
+                if(curl_send_buffer) {
+                    size_t send_len = strlen((const char *)curl_send_buffer);
+
+                    asr_ctx->curl_recv_buffer_ref = curl_recv_buffer;
+
+                    for(int rqtry = 0; rqtry < asr_ctx->retries_on_error; rqtry++) {
+                        /* the read callback consumes ref/len, so rewind them for every attempt;
+                         * otherwise a retry would post an empty body */
+                        asr_ctx->curl_send_buffer_ref = curl_send_buffer;
+                        asr_ctx->curl_send_buffer_len = send_len;
+                        switch_buffer_zero(curl_recv_buffer);
+                        status = curl_perform(asr_ctx, &globals);
+                        if(status == SWITCH_STATUS_SUCCESS || globals.fl_shutdown || asr_ctx->fl_destroyed) { break; }
+                        switch_yield(1000);
+                    }
+                } else {
+                    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "switch_mprintf()\n");
+                }
+
+                recv_len = switch_buffer_peek_zerocopy(curl_recv_buffer, &curl_recv_buffer_ptr);
+                if(status == SWITCH_STATUS_SUCCESS) {
+                    if(curl_recv_buffer_ptr && recv_len) {
+                        char *txt = parse_response((char *)curl_recv_buffer_ptr, NULL);
+#ifdef MOD_GOOGLE_ASR_DEBUG
+                        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Service response [%s]\n", (char *)curl_recv_buffer_ptr);
+                        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Text [%s]\n", txt ? txt : "null");
+#endif
+                        if(!txt) txt = strdup("");
+                        if(switch_queue_trypush(asr_ctx->q_text, txt) == SWITCH_STATUS_SUCCESS) {
+                            switch_mutex_lock(asr_ctx->mutex);
+                            asr_ctx->transcription_results++;
+                            switch_mutex_unlock(asr_ctx->mutex);
+                        } else {
+                            switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Queue is full!\n");
+                            switch_safe_free(txt);
+                        }
+                    } else {
+                        stt_failed = 1;
+                        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Empty service response!\n");
+                    }
+                } else {
+                    stt_failed = 1;
+                    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unable to perform request!\n");
+                }
+                switch_safe_free(curl_send_buffer);
+            } else {
+                stt_failed = 1;
+                switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "switch_b64_encode() failed\n");
+            }
+
+            if(stt_failed) {
+                char *txt = strdup("[transcription failed]");
+                if(switch_queue_trypush(asr_ctx->q_text, txt) == SWITCH_STATUS_SUCCESS) {
+                    switch_mutex_lock(asr_ctx->mutex);
+                    asr_ctx->transcription_results++;
+                    switch_mutex_unlock(asr_ctx->mutex);
+                } else {
+                    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Queue is full!\n");
+                    switch_safe_free(txt);
+                }
+            }
+
+            schunks = 0;
+            speech_timeout = 0;
+            switch_buffer_zero(chunk_buffer);
+        }
+
+        timer_next:
+        switch_yield(10000);
+    }
+
+out:
+    switch_safe_free(base64_buffer);
+    switch_safe_free(curl_send_buffer);
+
+    if(curl_recv_buffer) {
+        switch_buffer_destroy(&curl_recv_buffer);
+    }
+    if(chunk_buffer) {
+        switch_buffer_destroy(&chunk_buffer);
+    }
+    if(pool) {
+        switch_core_destroy_memory_pool(&pool);
+    }
+
+    switch_mutex_lock(asr_ctx->mutex);
+    if(asr_ctx->refs > 0) asr_ctx->refs--;
+    switch_mutex_unlock(asr_ctx->mutex);
+
+    switch_mutex_lock(globals.mutex);
+    if(globals.active_threads) globals.active_threads--;
+    switch_mutex_unlock(globals.mutex);
+
+    return NULL;
+}
+
+// ---------------------------------------------------------------------------------------------------------------------------------------------
+// asr interface
+// ---------------------------------------------------------------------------------------------------------------------------------------------
+/**
+ * Opens a recognition handle: allocates the per-handle context from the handle
+ * pool, seeds it from the module globals, sets up the queues and the VAD and
+ * starts the detached transcription thread. Only the "L16" codec is accepted.
+ */
+static switch_status_t asr_open(switch_asr_handle_t *ah, const char *codec, int samplerate, const char *dest, switch_asr_flag_t *flags) {
+    switch_status_t status = SWITCH_STATUS_SUCCESS;
+    switch_threadattr_t *attr = NULL;
+    switch_thread_t *thread = NULL;
+    asr_ctx_t *asr_ctx = NULL;
+
+    if(strcmp(codec, "L16") !=0) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unsupported encoding: %s\n", codec);
+        switch_goto_status(SWITCH_STATUS_FALSE, out);
+    }
+
+    if((asr_ctx = switch_core_alloc(ah->memory_pool, sizeof(asr_ctx_t))) == NULL) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "switch_core_alloc()\n");
+        switch_goto_status(SWITCH_STATUS_GENERR, out);
+    }
+
+    asr_ctx->channels = 1;
+    asr_ctx->chunk_buffer_size = 0;
+    asr_ctx->samplerate = samplerate;
+    asr_ctx->silence_sec = globals.speech_silence_sec;
+    asr_ctx->lang = (char *)globals.default_lang;
+    asr_ctx->api_key = globals.api_key;
+    asr_ctx->retries_on_error = globals.retries_on_error;
+
+    asr_ctx->opt_max_alternatives = globals.opt_max_alternatives;
+    asr_ctx->opt_enable_profanity_filter = globals.opt_enable_profanity_filter;
+    asr_ctx->opt_enable_word_time_offsets = globals.opt_enable_word_time_offsets;
+    asr_ctx->opt_enable_word_confidence = globals.opt_enable_word_confidence;
+    asr_ctx->opt_enable_automatic_punctuation = globals.opt_enable_automatic_punctuation;
+    asr_ctx->opt_enable_spoken_punctuation = globals.opt_enable_spoken_punctuation;
+    asr_ctx->opt_enable_spoken_emojis = globals.opt_enable_spoken_emojis;
+    asr_ctx->opt_meta_interaction_type = globals.opt_meta_interaction_type;
+    asr_ctx->opt_meta_microphone_distance = globals.opt_meta_microphone_distance;
+    asr_ctx->opt_meta_recording_device_type = globals.opt_meta_recording_device_type;
+    asr_ctx->opt_speech_model = globals.opt_speech_model;
+    asr_ctx->opt_use_enhanced_model = globals.opt_use_enhanced_model;
+    asr_ctx->opt_enable_speaker_diarization = SWITCH_FALSE;
+    asr_ctx->opt_diarization_min_speaker_count = 1;
+    asr_ctx->opt_diarization_max_speaker_count = 1;
+
+    if((status = switch_mutex_init(&asr_ctx->mutex, SWITCH_MUTEX_NESTED, ah->memory_pool)) != SWITCH_STATUS_SUCCESS) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "switch_mutex_init()\n");
+        switch_goto_status(SWITCH_STATUS_GENERR, out);
+    }
+
+    switch_queue_create(&asr_ctx->q_audio, QUEUE_SIZE, ah->memory_pool);
+    switch_queue_create(&asr_ctx->q_text, QUEUE_SIZE, ah->memory_pool);
+
+    asr_ctx->vad_buffer = NULL;
+    asr_ctx->frame_len = 0;
+    asr_ctx->vad_buffer_size = 0;
+    asr_ctx->vad_stored_frames = 0;
+    asr_ctx->fl_vad_first_cycle = SWITCH_TRUE;
+
+    if((asr_ctx->vad = switch_vad_init(asr_ctx->samplerate, asr_ctx->channels)) == NULL) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "switch_vad_init()\n");
+        switch_goto_status(SWITCH_STATUS_GENERR, out);
+    }
+    switch_vad_set_mode(asr_ctx->vad, -1);
+    switch_vad_set_param(asr_ctx->vad, "debug", globals.fl_vad_debug);
+    if(globals.vad_silence_ms > 0) { switch_vad_set_param(asr_ctx->vad, "silence_ms", globals.vad_silence_ms); }
+    if(globals.vad_voice_ms > 0) { switch_vad_set_param(asr_ctx->vad, "voice_ms", globals.vad_voice_ms); }
+    if(globals.vad_threshold > 0) { switch_vad_set_param(asr_ctx->vad, "thresh", globals.vad_threshold); }
+
+    ah->private_info = asr_ctx;
+
+    switch_mutex_lock(globals.mutex);
+    globals.active_threads++;
+    switch_mutex_unlock(globals.mutex);
+
+    switch_threadattr_create(&attr, ah->memory_pool);
+    switch_threadattr_detach_set(attr, 1);
+    switch_threadattr_stacksize_set(attr, SWITCH_THREAD_STACKSIZE);
+    if(switch_thread_create(&thread, attr, transcribe_thread, asr_ctx, ah->memory_pool) != SWITCH_STATUS_SUCCESS) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "switch_thread_create()\n");
+        /* no worker will ever decrement the counter, so undo the increment here */
+        switch_mutex_lock(globals.mutex);
+        if(globals.active_threads) globals.active_threads--;
+        switch_mutex_unlock(globals.mutex);
+        switch_vad_destroy(&asr_ctx->vad);
+        switch_goto_status(SWITCH_STATUS_GENERR, out);
+    }
+
+out:
+    return status;
+}
+
+/**
+ * Closes the handle: flags the context as destroyed, waits until the worker
+ * thread has dropped its reference and then releases the queues and the VAD.
+ */
+static switch_status_t asr_close(switch_asr_handle_t *ah, switch_asr_flag_t *flags) {
+    asr_ctx_t *asr_ctx = (asr_ctx_t *)ah->private_info;
+    uint8_t fl_wloop = SWITCH_TRUE;
+
+    assert(asr_ctx != NULL);
+
+    asr_ctx->fl_abort = SWITCH_TRUE;
+    asr_ctx->fl_destroyed = SWITCH_TRUE;
+
+    /* NOTE(review): the worker takes its reference only once it is running; if this can be
+     * reached before that, refs is still 0 here and the wait below is skipped - confirm */
+    switch_mutex_lock(asr_ctx->mutex);
+    fl_wloop = (asr_ctx->refs != 0);
+    switch_mutex_unlock(asr_ctx->mutex);
+
+    if(fl_wloop) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Waiting for unlock (refs=%u)...\n", asr_ctx->refs);
+        while(fl_wloop) {
+            switch_mutex_lock(asr_ctx->mutex);
+            fl_wloop = (asr_ctx->refs != 0);
+            switch_mutex_unlock(asr_ctx->mutex);
+            switch_yield(100000);
+        }
+    }
+
+    if(asr_ctx->q_audio) {
+        xdata_buffer_queue_clean(asr_ctx->q_audio);
+        switch_queue_term(asr_ctx->q_audio);
+    }
+    if(asr_ctx->q_text) {
+        text_queue_clean(asr_ctx->q_text);
+        switch_queue_term(asr_ctx->q_text);
+    }
+    if(asr_ctx->vad) {
+        switch_vad_destroy(&asr_ctx->vad);
+    }
+
+    if(asr_ctx->vad_buffer) {
+        switch_buffer_destroy(&asr_ctx->vad_buffer);
+    }
+
+    switch_set_flag(ah, SWITCH_ASR_FLAG_CLOSED);
+
+    return SWITCH_STATUS_SUCCESS;
+}
+
+/**
+ * Feeds one audio frame. The frame is run through the VAD; while speech is
+ * detected the audio is queued for the transcription thread, and at the start
+ * of speech previously stored frames are prepended to it.
+ * The first frame fixes frame_len and the VAD/chunk buffer sizes.
+ */
+static switch_status_t asr_feed(switch_asr_handle_t *ah, void *data, unsigned int data_len, switch_asr_flag_t *flags) {
+    asr_ctx_t *asr_ctx = (asr_ctx_t *)ah->private_info;
+    switch_vad_state_t vad_state = 0;
+    uint8_t fl_has_audio = SWITCH_FALSE;
+
+    assert(asr_ctx != NULL);
+
+    if(switch_test_flag(ah, SWITCH_ASR_FLAG_CLOSED)) {
+        return SWITCH_STATUS_BREAK;
+    }
+    if(asr_ctx->fl_destroyed || asr_ctx->fl_abort) {
+        return SWITCH_STATUS_BREAK;
+    }
+    if(asr_ctx->fl_pause) {
+        return SWITCH_STATUS_SUCCESS;
+    }
+    if(!data || !data_len) {
+        return SWITCH_STATUS_BREAK;
+    }
+
+    if(data_len > 0 && asr_ctx->frame_len == 0) {
+        switch_mutex_lock(asr_ctx->mutex);
+        asr_ctx->frame_len = data_len;
+        asr_ctx->vad_buffer_size = asr_ctx->frame_len * VAD_STORE_FRAMES;
+        asr_ctx->chunk_buffer_size = asr_ctx->samplerate * globals.speech_max_sec;
+        switch_mutex_unlock(asr_ctx->mutex);
+
+        if(switch_buffer_create(ah->memory_pool, &asr_ctx->vad_buffer, asr_ctx->vad_buffer_size) != SWITCH_STATUS_SUCCESS) {
+            asr_ctx->vad_buffer_size = 0;
+            switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "switch_buffer_create()\n");
+        }
+    }
+
+    if(asr_ctx->vad_buffer_size) {
+        if(asr_ctx->vad_state == SWITCH_VAD_STATE_STOP_TALKING || (asr_ctx->vad_state == vad_state && vad_state == SWITCH_VAD_STATE_NONE)) {
+            if(data_len <= asr_ctx->frame_len) {
+                if(asr_ctx->vad_stored_frames >= VAD_STORE_FRAMES) {
+                    switch_buffer_zero(asr_ctx->vad_buffer);
+                    asr_ctx->vad_stored_frames = 0;
+                    asr_ctx->fl_vad_first_cycle = SWITCH_FALSE;
+                }
+                switch_buffer_write(asr_ctx->vad_buffer, data, MIN(asr_ctx->frame_len, data_len));
+                asr_ctx->vad_stored_frames++;
+            }
+        }
+
+        vad_state = switch_vad_process(asr_ctx->vad, (int16_t *)data, (data_len / sizeof(int16_t)));
+        if(vad_state == SWITCH_VAD_STATE_START_TALKING) {
+            asr_ctx->vad_state = vad_state;
+            fl_has_audio = SWITCH_TRUE;
+        } else if (vad_state == SWITCH_VAD_STATE_STOP_TALKING) {
+            asr_ctx->vad_state = vad_state;
+            fl_has_audio = SWITCH_FALSE;
+            switch_vad_reset(asr_ctx->vad);
+        } else if (vad_state == SWITCH_VAD_STATE_TALKING) {
+            asr_ctx->vad_state = vad_state;
+            fl_has_audio = SWITCH_TRUE;
+        }
+    } else {
+        fl_has_audio = SWITCH_TRUE;
+    }
+
+    if(fl_has_audio) {
+        if(vad_state == SWITCH_VAD_STATE_START_TALKING && asr_ctx->vad_stored_frames > 0) {
+            xdata_buffer_t *tau_buf = NULL;
+            const void *ptr = NULL;
+            switch_size_t vblen = 0;
+            uint32_t rframes = 0, rlen = 0;
+            int ofs = 0;
+
+            if((vblen = switch_buffer_peek_zerocopy(asr_ctx->vad_buffer, &ptr)) && ptr && vblen > 0) {
+                rframes = (asr_ctx->vad_stored_frames >= VAD_RECOVERY_FRAMES ? VAD_RECOVERY_FRAMES : (asr_ctx->fl_vad_first_cycle ? asr_ctx->vad_stored_frames : VAD_RECOVERY_FRAMES));
+                rlen = (rframes * asr_ctx->frame_len);
+                ofs = (vblen - rlen);
+
+                if(ofs < 0) {
+                    uint32_t hdr_sz = -ofs;
+                    uint32_t hdr_ofs = (asr_ctx->vad_buffer_size - hdr_sz);
+
+                    switch_zmalloc(tau_buf, sizeof(xdata_buffer_t));
+
+                    tau_buf->len = (hdr_sz + vblen + data_len);
+                    switch_malloc(tau_buf->data, tau_buf->len);
+
+                    memcpy(tau_buf->data, (void *)(ptr + hdr_ofs), hdr_sz);
+                    memcpy(tau_buf->data + hdr_sz , (void *)(ptr + 0), vblen);
+                    memcpy(tau_buf->data + rlen, data, data_len);
+
+                    if(switch_queue_trypush(asr_ctx->q_audio, tau_buf) != SWITCH_STATUS_SUCCESS) {
+                        xdata_buffer_free(&tau_buf);
+                    }
+
+                    switch_buffer_zero(asr_ctx->vad_buffer);
+                    asr_ctx->vad_stored_frames = 0;
+                } else {
+                    switch_zmalloc(tau_buf, sizeof(xdata_buffer_t));
+
+                    tau_buf->len = (rlen + data_len);
+                    switch_malloc(tau_buf->data, tau_buf->len);
+
+                    memcpy(tau_buf->data, (void *)(ptr + ofs), rlen);
+                    memcpy(tau_buf->data + rlen, data, data_len);
+
+                    if(switch_queue_trypush(asr_ctx->q_audio, tau_buf) != SWITCH_STATUS_SUCCESS) {
+                        xdata_buffer_free(&tau_buf);
+                    }
+
+                    switch_buffer_zero(asr_ctx->vad_buffer);
+                    asr_ctx->vad_stored_frames = 0;
+                }
+            }
+        } else {
+            xdata_buffer_push(asr_ctx->q_audio, data, data_len);
+        }
+    }
+
+    return SWITCH_STATUS_SUCCESS;
+}
+
+/** Returns SWITCH_STATUS_SUCCESS when at least one transcription is pending and the handle is not paused. */
+static switch_status_t asr_check_results(switch_asr_handle_t *ah, switch_asr_flag_t *flags) {
+    asr_ctx_t *asr_ctx = (asr_ctx_t *)ah->private_info;
+
+    assert(asr_ctx != NULL);
+
+    if(asr_ctx->fl_pause) {
+        return SWITCH_STATUS_FALSE;
+    }
+
+    return (asr_ctx->transcription_results > 0 ? SWITCH_STATUS_SUCCESS : SWITCH_STATUS_FALSE);
+}
+
+/** Pops the next transcription from q_text into *xmlstr; returns SWITCH_STATUS_FALSE when none is queued. */
+static switch_status_t asr_get_results(switch_asr_handle_t *ah, char **xmlstr, switch_asr_flag_t *flags) {
+    asr_ctx_t *asr_ctx = (asr_ctx_t *)ah->private_info;
+    switch_status_t status = SWITCH_STATUS_FALSE;
+    void *pop = NULL;
+
+    assert(asr_ctx != NULL);
+
+    if(switch_queue_trypop(asr_ctx->q_text, &pop) == SWITCH_STATUS_SUCCESS) {
+        if(pop) {
+            *xmlstr = (char *)pop;
+            status = SWITCH_STATUS_SUCCESS;
+
+            switch_mutex_lock(asr_ctx->mutex);
+            if(asr_ctx->transcription_results > 0) asr_ctx->transcription_results--;
+            switch_mutex_unlock(asr_ctx->mutex);
+        }
+    }
+
+    return status;
+}
+
+/** Records that the input timers were started (sets fl_start_timers). */
+static switch_status_t asr_start_input_timers(switch_asr_handle_t *ah) {
+    asr_ctx_t *asr_ctx = (asr_ctx_t *)ah->private_info;
+
+    assert(asr_ctx != NULL);
+
+    asr_ctx->fl_start_timers = SWITCH_TRUE;
+
+    return SWITCH_STATUS_SUCCESS;
+}
+
+/** Pauses recognition: while paused asr_feed ignores audio and asr_check_results reports nothing. */
+static switch_status_t asr_pause(switch_asr_handle_t *ah) {
+    asr_ctx_t *asr_ctx = (asr_ctx_t *)ah->private_info;
+
+    assert(asr_ctx != NULL);
+
+    asr_ctx->fl_pause = SWITCH_TRUE;
+
+    return SWITCH_STATUS_SUCCESS;
+}
+
+/** Resumes recognition after asr_pause (clears fl_pause). */
+static switch_status_t asr_resume(switch_asr_handle_t *ah) {
+    asr_ctx_t *asr_ctx = (asr_ctx_t *)ah->private_info;
+
+    assert(asr_ctx != NULL);
+
+    asr_ctx->fl_pause = SWITCH_FALSE;
+
+    return SWITCH_STATUS_SUCCESS;
+}
+
+/**
+ * Applies a per-handle text parameter (case-insensitive name); unknown names
+ * and NULL values are ignored.
+ * "enable-word-confidence" and "enable-speaker-diarization" are also accepted
+ * under their historical misspellings so existing dialplans keep working.
+ */
+static void asr_text_param(switch_asr_handle_t *ah, char *param, const char *val) {
+    asr_ctx_t *asr_ctx = (asr_ctx_t *)ah->private_info;
+
+    assert(asr_ctx != NULL);
+
+    if(strcasecmp(param, "lang") == 0) {
+        if(val) asr_ctx->lang = switch_core_strdup(ah->memory_pool, gcp_get_language(val));
+    } else if(strcasecmp(param, "silence") == 0) {
+        if(val) asr_ctx->silence_sec = atoi(val);
+    } else if(strcasecmp(param, "key") == 0) {
+        if(val) asr_ctx->api_key = switch_core_strdup(ah->memory_pool, val);
+    } else if(!strcasecmp(param, "speech-model")) {
+        if(val) asr_ctx->opt_speech_model = switch_core_strdup(ah->memory_pool, val);
+    } else if(!strcasecmp(param, "use-enhanced-model")) {
+        if(val) asr_ctx->opt_use_enhanced_model = switch_true(val);
+    } else if(!strcasecmp(param, "max-alternatives")) {
+        if(val) asr_ctx->opt_max_alternatives = atoi(val);
+    } else if(!strcasecmp(param, "enable-word-time-offsets")) {
+        if(val) asr_ctx->opt_enable_word_time_offsets = switch_true(val);
+    } else if(!strcasecmp(param, "enable-word-confidence") || !strcasecmp(param, "enable-enable-word-confidence;")) {
+        if(val) asr_ctx->opt_enable_word_confidence = switch_true(val);
+    } else if(!strcasecmp(param, "enable-profanity-filter")) {
+        if(val) asr_ctx->opt_enable_profanity_filter = switch_true(val);
+    } else if(!strcasecmp(param, "enable-automatic-punctuation")) {
+        if(val) asr_ctx->opt_enable_automatic_punctuation = switch_true(val);
+    } else if(!strcasecmp(param, "enable-spoken-punctuation")) {
+        if(val) asr_ctx->opt_enable_spoken_punctuation = switch_true(val);
+    } else if(!strcasecmp(param, "enable-spoken-emojis")) {
+        if(val) asr_ctx->opt_enable_spoken_emojis = switch_true(val);
+    } else if(!strcasecmp(param, "microphone-distance")) {
+        if(val) asr_ctx->opt_meta_microphone_distance = switch_core_strdup(ah->memory_pool, gcp_get_microphone_distance(val));
+    } else if(!strcasecmp(param, "recording-device-type")) {
+        if(val) asr_ctx->opt_meta_recording_device_type = switch_core_strdup(ah->memory_pool, gcp_get_recording_device(val));
+    } else if(!strcasecmp(param, "interaction-type")) {
+        if(val) asr_ctx->opt_meta_interaction_type = switch_core_strdup(ah->memory_pool, gcp_get_interaction(val));
+    } else if(!strcasecmp(param, "enable-speaker-diarization") || !strcasecmp(param, "enable-speaker-diarizatio")) {
+        if(val) asr_ctx->opt_enable_speaker_diarization = switch_true(val);
+    } else if(!strcasecmp(param, "diarization-min-speakers")) {
+        if(val) asr_ctx->opt_diarization_min_speaker_count = atoi(val);
+    } else if(!strcasecmp(param, "diarization-max-speakers")) {
+        if(val) asr_ctx->opt_diarization_max_speaker_count = atoi(val);
+    }
+}
+
+static void asr_numeric_param(switch_asr_handle_t *ah, char *param, int val) { +} + +static void asr_float_param(switch_asr_handle_t *ah, char *param, double val) { +} + +static switch_status_t asr_load_grammar(switch_asr_handle_t *ah, const char *grammar, const char *name) { + return SWITCH_STATUS_SUCCESS; +} + +static switch_status_t asr_unload_grammar(switch_asr_handle_t *ah, const char *name) { + return SWITCH_STATUS_SUCCESS; +} + +#define CMD_SYNTAX "path_to/filename.(mp3|wav) []\n" +SWITCH_STANDARD_API(google_asr_cmd_handler) { + //switch_status_t status = 0; + char *mycmd = NULL, *argv[10] = { 0 }; int argc = 0; + + if (!zstr(cmd)) { + mycmd = strdup(cmd); + switch_assert(mycmd); + argc = switch_separate_string(mycmd, ' ', argv, (sizeof(argv) / sizeof(argv[0]))); + } + if(argc == 0) { + goto usage; + } + + // + // todo + // + + stream->write_function(stream, "-ERR: not yet implemented\n"); + goto out; +usage: + stream->write_function(stream, "-ERR:\nUsage: %s\n", CMD_SYNTAX); + +out: + + switch_safe_free(mycmd); + return SWITCH_STATUS_SUCCESS; +} + +// --------------------------------------------------------------------------------------------------------------------------------------------- +// main +// --------------------------------------------------------------------------------------------------------------------------------------------- +SWITCH_MODULE_LOAD_FUNCTION(mod_google_asr_load) { + switch_status_t status = SWITCH_STATUS_SUCCESS; + switch_xml_t cfg, xml, settings, param; + switch_api_interface_t *commands_interface; + switch_asr_interface_t *asr_interface; + + memset(&globals, 0, sizeof(globals)); + switch_mutex_init(&globals.mutex, SWITCH_MUTEX_NESTED, pool); + + if((xml = switch_xml_open_cfg(MOD_CONFIG_NAME, &cfg, NULL)) == NULL) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unable to open configuration: %s\n", MOD_CONFIG_NAME); + switch_goto_status(SWITCH_STATUS_GENERR, out); + } + + if((settings = switch_xml_child(cfg, 
"settings"))) { + for (param = switch_xml_child(settings, "param"); param; param = param->next) { + char *var = (char *) switch_xml_attr_soft(param, "name"); + char *val = (char *) switch_xml_attr_soft(param, "value"); + + if(!strcasecmp(var, "vad-silence-ms")) { + if(val) globals.vad_silence_ms = atoi (val); + } else if(!strcasecmp(var, "vad-voice-ms")) { + if(val) globals.vad_voice_ms = atoi (val); + } else if(!strcasecmp(var, "vad-threshold")) { + if(val) globals.vad_threshold = atoi (val); + } else if(!strcasecmp(var, "vad-debug")) { + if(val) globals.fl_vad_debug = switch_true(val); + } else if(!strcasecmp(var, "api-key")) { + if(val) globals.api_key = switch_core_strdup(pool, val); + } else if(!strcasecmp(var, "api-url")) { + if(val) globals.api_url = switch_core_strdup(pool, val); + } else if(!strcasecmp(var, "user-agent")) { + if(val) globals.user_agent = switch_core_strdup(pool, val); + } else if(!strcasecmp(var, "proxy")) { + if(val) globals.proxy = switch_core_strdup(pool, val); + } else if(!strcasecmp(var, "proxy-credentials")) { + if(val) globals.proxy_credentials = switch_core_strdup(pool, val); + } else if(!strcasecmp(var, "default-language")) { + if(val) globals.default_lang = switch_core_strdup(pool, gcp_get_language(val)); + } else if(!strcasecmp(var, "encoding")) { + if(val) globals.opt_encoding = switch_core_strdup(pool, gcp_get_encoding(val)); + } else if(!strcasecmp(var, "speech-max-sec")) { + if(val) globals.speech_max_sec = atoi(val); + } else if(!strcasecmp(var, "speech-silence-sec")) { + if(val) globals.speech_silence_sec = atoi(val); + } else if(!strcasecmp(var, "request-timeout")) { + if(val) globals.request_timeout = atoi(val); + } else if(!strcasecmp(var, "connect-timeout")) { + if(val) globals.connect_timeout = atoi(val); + } else if(!strcasecmp(var, "retries-on-error")) { + if(val) globals.retries_on_error = atoi(val); + } else if(!strcasecmp(var, "speech-model")) { + if(val) globals.opt_speech_model = switch_core_strdup(pool, val); 
+ } else if(!strcasecmp(var, "use-enhanced-model")) { + if(val) globals.opt_use_enhanced_model = switch_true(val); + } else if(!strcasecmp(var, "max-alternatives")) { + if(val) globals.opt_max_alternatives = atoi(val); + } else if(!strcasecmp(var, "enable-word-time-offsets")) { + if(val) globals.opt_enable_word_time_offsets = switch_true(val); + } else if(!strcasecmp(var, "enable-word-confidence")) { + if(val) globals.opt_enable_word_confidence = switch_true(val); + } else if(!strcasecmp(var, "enable-profanity-filter")) { + if(val) globals.opt_enable_profanity_filter = switch_true(val); + } else if(!strcasecmp(var, "enable-automatic-punctuation")) { + if(val) globals.opt_enable_automatic_punctuation = switch_true(val); + } else if(!strcasecmp(var, "enable-spoken-punctuation")) { + if(val) globals.opt_enable_spoken_punctuation = switch_true(val); + } else if(!strcasecmp(var, "enable-spoken-emojis")) { + if(val) globals.opt_enable_spoken_emojis = switch_true(val); + } else if(!strcasecmp(var, "microphone-distance")) { + if(val) globals.opt_meta_microphone_distance = switch_core_strdup(pool, gcp_get_microphone_distance(val)); + } else if(!strcasecmp(var, "recording-device-type")) { + if(val) globals.opt_meta_recording_device_type = switch_core_strdup(pool, gcp_get_recording_device(val)); + } else if(!strcasecmp(var, "interaction-type")) { + if(val) globals.opt_meta_interaction_type = switch_core_strdup(pool, gcp_get_interaction(val)); + } + } + } + + if(!globals.api_url) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Missing required parameter: api-url\n"); + switch_goto_status(SWITCH_STATUS_GENERR, out); + } + + globals.speech_max_sec = !globals.speech_max_sec ? 35 : globals.speech_max_sec; + globals.speech_silence_sec = !globals.speech_silence_sec ? 3 : globals.speech_silence_sec; + globals.opt_encoding = globals.opt_encoding ? globals.opt_encoding : gcp_get_encoding("l16"); + globals.opt_speech_model = globals.opt_speech_model ? 
globals.opt_speech_model : "phone_call"; + globals.opt_max_alternatives = globals.opt_max_alternatives > 0 ? globals.opt_max_alternatives : 1; + globals.opt_meta_microphone_distance = globals.opt_meta_microphone_distance ? globals.opt_meta_microphone_distance : gcp_get_microphone_distance("unspecified"); + globals.opt_meta_recording_device_type = globals.opt_meta_recording_device_type ? globals.opt_meta_recording_device_type : gcp_get_recording_device("unspecified"); + globals.opt_meta_interaction_type = globals.opt_meta_interaction_type ? globals.opt_meta_interaction_type : gcp_get_interaction("unspecified"); + globals.retries_on_error = !globals.retries_on_error ? 1 : globals.retries_on_error; + + globals.tmp_path = switch_core_sprintf(pool, "%s%sgoogle-asr-cache", SWITCH_GLOBAL_dirs.temp_dir, SWITCH_PATH_SEPARATOR); + if(switch_directory_exists(globals.tmp_path, NULL) != SWITCH_STATUS_SUCCESS) { + switch_dir_make(globals.tmp_path, SWITCH_FPROT_OS_DEFAULT, NULL); + } + + *module_interface = switch_loadable_module_create_module_interface(pool, modname); + SWITCH_ADD_API(commands_interface, "google_asr_transcript", "Google speech-to-text", google_asr_cmd_handler, CMD_SYNTAX); + + asr_interface = switch_loadable_module_create_interface(*module_interface, SWITCH_ASR_INTERFACE); + asr_interface->interface_name = "google"; + asr_interface->asr_open = asr_open; + asr_interface->asr_close = asr_close; + asr_interface->asr_feed = asr_feed; + asr_interface->asr_pause = asr_pause; + asr_interface->asr_resume = asr_resume; + asr_interface->asr_check_results = asr_check_results; + asr_interface->asr_get_results = asr_get_results; + asr_interface->asr_start_input_timers = asr_start_input_timers; + asr_interface->asr_text_param = asr_text_param; + asr_interface->asr_numeric_param = asr_numeric_param; + asr_interface->asr_float_param = asr_float_param; + asr_interface->asr_load_grammar = asr_load_grammar; + asr_interface->asr_unload_grammar = asr_unload_grammar; + + 
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "Google-ASR (%s)\n", MOD_VERSION);
+out:
+    if(xml) {
+        switch_xml_free(xml);
+    }
+    return status;
+}
+
+/**
+ * Module shutdown hook.
+ *
+ * Raises globals.fl_shutdown and then polls globals.active_threads (under
+ * globals.mutex) every 100 ms until it reaches zero.
+ *
+ * @return always SWITCH_STATUS_SUCCESS
+ */
+SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_google_asr_shutdown) {
+    uint32_t active = 0;
+
+    globals.fl_shutdown = SWITCH_TRUE;
+
+    /* take a snapshot under the lock so the log line and the loop condition use the same value */
+    switch_mutex_lock(globals.mutex);
+    active = globals.active_threads;
+    switch_mutex_unlock(globals.mutex);
+
+    if(active > 0) {
+        /* active_threads is uint32_t, hence %u */
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Waiting for termination (%u) threads...\n", active);
+        while(active > 0) {
+            switch_yield(100000);
+            switch_mutex_lock(globals.mutex);
+            active = globals.active_threads;
+            switch_mutex_unlock(globals.mutex);
+        }
+    }
+
+    return SWITCH_STATUS_SUCCESS;
+}
diff --git a/src/mod/asr_tts/mod_google_asr/mod_google_asr.h b/src/mod/asr_tts/mod_google_asr/mod_google_asr.h
new file mode 100644
index 00000000000..ec56f244876
--- /dev/null
+++ b/src/mod/asr_tts/mod_google_asr/mod_google_asr.h
@@ -0,0 +1,149 @@
+/*
+ * FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application
+ * Copyright (C) 2005-2014, Anthony Minessale II
+ *
+ * Version: MPL 1.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * Module Contributor(s):
+ * Konstantin Alexandrin
+ *
+ *
+ *
+ */
+#ifndef MOD_GOOGLE_ASR_H
+#define MOD_GOOGLE_ASR_H
+
+/* NOTE(review): the targets of the three includes below are missing in this copy of the patch; restore them from the original commit. */
+#include
+#include
+#include
+
+/* MIN/MAX are often already provided by system headers; only define them when absent. */
+#ifndef MIN
+#define MIN(a,b) (((a)<(b))?(a):(b))
+#endif
+#ifndef MAX
+#define MAX(a,b) (((a)>(b))?(a):(b))
+#endif
+
+#define MOD_CONFIG_NAME "google_asr.conf"
+#define MOD_VERSION "1.0.4"
+#define QUEUE_SIZE 128
+#define VAD_STORE_FRAMES 64
+#define VAD_RECOVERY_FRAMES 20
+/* 4 output characters for every started group of 3 input bytes (no room for a terminating NUL) */
+#define BASE64_ENC_SZ(n) (4*(((n)+2)/3))
+/* argument is parenthesised so expressions such as BOOL2STR(a || b) expand correctly */
+#define BOOL2STR(v) ((v) ? "true" : "false")
+
+//#define MOD_GOOGLE_ASR_DEBUG
+
+/* Module-wide settings and counters; the option fields are filled from the configuration params by the load function. */
+typedef struct {
+    switch_mutex_t *mutex;                      // guards active_threads
+    uint32_t active_threads;                    // the shutdown hook waits until this reaches 0
+    uint32_t speech_max_sec;                    // "speech-max-sec", defaults to 35
+    uint32_t speech_silence_sec;                // "speech-silence-sec", defaults to 3
+    uint32_t vad_silence_ms;
+    uint32_t vad_voice_ms;
+    uint32_t vad_threshold;
+    uint32_t request_timeout;                   // seconds
+    uint32_t connect_timeout;                   // seconds
+    uint32_t retries_on_error;                  // defaults to 1
+    uint8_t fl_vad_debug;
+    uint8_t fl_shutdown;                        // raised by the shutdown hook
+    char *tmp_path;                             // <temp_dir>/google-asr-cache, created at load time
+    char *api_key;
+    char *api_url;                              // required: loading fails without it
+    char *user_agent;
+    char *default_lang;
+    char *proxy;
+    char *proxy_credentials;
+    char *opt_encoding;
+    char *opt_speech_model;
+    char *opt_meta_microphone_distance;
+    char *opt_meta_recording_device_type;
+    char *opt_meta_interaction_type;
+    uint32_t opt_max_alternatives;
+    uint32_t opt_use_enhanced_model;
+    uint32_t opt_enable_word_time_offsets;
+    uint32_t opt_enable_word_confidence;
+    uint32_t opt_enable_profanity_filter;
+    uint32_t opt_enable_automatic_punctuation;
+    uint32_t opt_enable_spoken_punctuation;
+    uint32_t opt_enable_spoken_emojis;
+} globals_t;
+
+
+/* Recognition context passed to curl_perform(); presumably one instance per ASR handle - confirm in mod_google_asr.c. */
+typedef struct {
+    switch_memory_pool_t *pool;
+    switch_vad_t *vad;
+    switch_buffer_t *vad_buffer;
+    switch_mutex_t *mutex;
+    switch_queue_t *q_audio;
+    switch_queue_t *q_text;
+    switch_buffer_t *curl_recv_buffer_ref;
+    switch_byte_t *curl_send_buffer_ref;
+    char *api_key;
+    char *lang;
+    switch_vad_state_t vad_state;
+    uint32_t retries_on_error;
+    uint32_t curl_send_buffer_len;
+    uint32_t transcription_results;
+    uint32_t vad_buffer_size;
+    uint32_t vad_stored_frames;
+    uint32_t chunk_buffer_size;
+    uint32_t refs;
+    uint32_t samplerate;
+    uint32_t channels;
+    uint32_t frame_len;
+    uint32_t silence_sec;
+    uint8_t fl_start_timers;
+    uint8_t fl_pause;
+    uint8_t fl_vad_first_cycle;
+    uint8_t fl_destroyed;
+    uint8_t fl_abort;
+    // per-context copies of the recognition options
+    char *opt_speech_model;
+    char *opt_meta_microphone_distance;
+    char *opt_meta_recording_device_type;
+    char *opt_meta_interaction_type;
+    uint32_t opt_max_alternatives;
+    uint32_t opt_use_enhanced_model;
+    uint32_t opt_enable_word_time_offsets;
+    uint32_t opt_enable_word_confidence;
+    uint32_t opt_enable_profanity_filter;
+    uint32_t opt_enable_automatic_punctuation;
+    uint32_t opt_enable_spoken_punctuation;
+    uint32_t opt_enable_spoken_emojis;
+    uint32_t opt_enable_speaker_diarization;
+    uint32_t opt_diarization_min_speaker_count;
+    uint32_t opt_diarization_max_speaker_count;
+} asr_ctx_t;
+
+/* Heap-allocated byte blob that travels through the audio queue. */
+typedef struct {
+    uint32_t len;
+    switch_byte_t *data;
+} xdata_buffer_t;
+
+
+/* curl.c */
+switch_status_t curl_perform(asr_ctx_t *asr_ctx, globals_t *globals);
+
+/* utils.c */
+switch_status_t xdata_buffer_push(switch_queue_t *queue, switch_byte_t *data, uint32_t data_len);
+switch_status_t xdata_buffer_alloc(xdata_buffer_t **out, switch_byte_t *data, uint32_t data_len);
+void xdata_buffer_free(xdata_buffer_t **buf);
+void xdata_buffer_queue_clean(switch_queue_t *queue);
+void text_queue_clean(switch_queue_t *queue);
+char *parse_response(char *data, switch_stream_handle_t *stream);
+
+char *gcp_get_language(const char *val);
+char *gcp_get_encoding(const char *val);
+char *gcp_get_microphone_distance(const char *val);
+char *gcp_get_recording_device(const char *val);
+char *gcp_get_interaction(const char *val);
+
+
+#endif
diff --git a/src/mod/asr_tts/mod_google_asr/utils.c b/src/mod/asr_tts/mod_google_asr/utils.c
new file mode 100644
index 00000000000..9842da557d3
--- /dev/null
+++ b/src/mod/asr_tts/mod_google_asr/utils.c
@@ -0,0 +1,171 @@
+/*
+ * FreeSWITCH Modular Media Switching Software Library / Soft-Switch 
Application
+ * Copyright (C) 2005-2014, Anthony Minessale II
+ *
+ * Version: MPL 1.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * Module Contributor(s):
+ * Konstantin Alexandrin
+ *
+ *
+ */
+#include "mod_google_asr.h"
+
+extern globals_t globals;
+
+/**
+ ** https://cloud.google.com/speech-to-text/docs/reference/rest/v1/RecognitionConfig
+ ** https://cloud.google.com/speech-to-text/docs/speech-to-text-supported-languages
+ **
+ ** The gcp_get_*() helpers translate the short, case-insensitive names used in the
+ ** module configuration into the identifiers sent to the service. An unknown value
+ ** is returned unchanged (same pointer), NULL yields NULL. The result is either a
+ ** string literal or the argument itself, so it must be neither modified nor freed.
+ **/
+char *gcp_get_language(const char *val) {
+    if(!val) { return NULL; }
+    if(strcasecmp(val, "en") == 0) { return "en-US"; }
+    if(strcasecmp(val, "de") == 0) { return "de-DE"; }
+    if(strcasecmp(val, "es") == 0) { return "es-US"; }
+    if(strcasecmp(val, "it") == 0) { return "it-IT"; }
+    if(strcasecmp(val, "ru") == 0) { return "ru-RU"; }
+    return (char *)val;
+}
+
+/* Maps an audio encoding alias (l16, flac, ulaw, amr, unspecified) to its service identifier. */
+char *gcp_get_encoding(const char *val) {
+    if(!val) { return NULL; }
+    if(strcasecmp(val, "unspecified") == 0) { return "ENCODING_UNSPECIFIED"; }
+    if(strcasecmp(val, "l16") == 0) { return "LINEAR16"; }
+    if(strcasecmp(val, "flac") == 0) { return "FLAC"; }
+    if(strcasecmp(val, "ulaw") == 0) { return "MULAW"; }
+    if(strcasecmp(val, "amr") == 0) { return "AMR"; }
+    return (char *)val;
+}
+
+/* Maps a microphone distance alias to its service identifier. */
+char *gcp_get_microphone_distance(const char *val) {
+    if(!val) { return NULL; }
+    if(strcasecmp(val, "unspecified") == 0) { return "MICROPHONE_DISTANCE_UNSPECIFIED"; }
+    if(strcasecmp(val, "nearfield") == 0) { return "NEARFIELD"; }
+    if(strcasecmp(val, "midfield") == 0) { return "MIDFIELD"; }
+    if(strcasecmp(val, "farfield") == 0) { return "FARFIELD"; }
+    return (char *)val;
+}
+
+/* Maps a recording device alias to its service identifier. */
+char *gcp_get_recording_device(const char *val) {
+    if(!val) { return NULL; }
+    if(strcasecmp(val, "unspecified") == 0) { return "RECORDING_DEVICE_TYPE_UNSPECIFIED"; }
+    if(strcasecmp(val, "smartphone") == 0) { return "SMARTPHONE"; }
+    if(strcasecmp(val, "pc") == 0) { return "PC"; }
+    if(strcasecmp(val, "phone_line") == 0) { return "PHONE_LINE"; }
+    if(strcasecmp(val, "vehicle") == 0) { return "VEHICLE"; }
+    if(strcasecmp(val, "other_outdoor_device") == 0) { return "OTHER_OUTDOOR_DEVICE"; }
+    if(strcasecmp(val, "other_indoor_device") == 0) { return "OTHER_INDOOR_DEVICE"; }
+    return (char *)val;
+}
+
+/* Maps an interaction type alias to its service identifier. */
+char *gcp_get_interaction(const char *val) {
+    if(!val) { return NULL; }
+    if(strcasecmp(val, "unspecified") == 0) { return "INTERACTION_TYPE_UNSPECIFIED"; }
+    if(strcasecmp(val, "discussion") == 0) { return "DISCUSSION"; }
+    if(strcasecmp(val, "presentation") == 0) { return "PRESENTATION"; }
+    if(strcasecmp(val, "phone_call") == 0) { return "PHONE_CALL"; }
+    if(strcasecmp(val, "voicemail") == 0) { return "VOICEMAIL"; }
+    /* the alias used to be matched only under this misspelling; kept so existing configs still work */
+    if(strcasecmp(val, "voicemal") == 0) { return "VOICEMAIL"; }
+    if(strcasecmp(val, "professionally_produced") == 0) { return "PROFESSIONALLY_PRODUCED"; }
+    if(strcasecmp(val, "voice_search") == 0) { return "VOICE_SEARCH"; }
+    if(strcasecmp(val, "voice_command") == 0) { return "VOICE_COMMAND"; }
+    if(strcasecmp(val, "dictation") == 0) { return "DICTATION"; }
+    return (char *)val;
+}
+
+/**
+ * Allocates an xdata_buffer_t on the heap and copies data_len bytes of data into it.
+ * With no payload (data == NULL or data_len == 0) an empty buffer (len 0, data NULL) is produced.
+ *
+ * @param out       receives the new buffer; release it with xdata_buffer_free()
+ * @return SWITCH_STATUS_SUCCESS, or SWITCH_STATUS_FALSE when out is NULL
+ */
+switch_status_t xdata_buffer_alloc(xdata_buffer_t **out, switch_byte_t *data, uint32_t data_len) {
+    xdata_buffer_t *buf = NULL;
+
+    if(!out) {
+        return SWITCH_STATUS_FALSE;
+    }
+
+    switch_zmalloc(buf, sizeof(xdata_buffer_t));
+
+    /* never memcpy() from a NULL source, even if the caller passed a length */
+    if(data && data_len) {
+        switch_malloc(buf->data, data_len);
+        switch_assert(buf->data);
+
+        buf->len = data_len;
+        memcpy(buf->data, data, data_len);
+    }
+
+    *out = buf;
+    return SWITCH_STATUS_SUCCESS;
+}
+
+/* Releases a buffer and its payload and clears the caller's pointer so it cannot be freed twice. */
+void xdata_buffer_free(xdata_buffer_t **buf) {
+    if(buf && *buf) {
+        switch_safe_free((*buf)->data);
+        free(*buf);
+        *buf = NULL;
+    }
+}
+
+/* Pops and frees every xdata_buffer_t that is still waiting in the queue. */
+void xdata_buffer_queue_clean(switch_queue_t *queue) {
+    xdata_buffer_t *data = NULL;
+
+    if(!queue || !switch_queue_size(queue)) { return; }
+
+    while(switch_queue_trypop(queue, (void *) &data) == SWITCH_STATUS_SUCCESS) {
+        if(data) { xdata_buffer_free(&data); }
+    }
+}
+
+/**
+ * Copies data into a fresh xdata_buffer_t and appends it to the queue without blocking.
+ *
+ * @return SWITCH_STATUS_SUCCESS when queued; SWITCH_STATUS_FALSE otherwise (the copy is released again)
+ */
+switch_status_t xdata_buffer_push(switch_queue_t *queue, switch_byte_t *data, uint32_t data_len) {
+    xdata_buffer_t *buff = NULL;
+
+    if(xdata_buffer_alloc(&buff, data, data_len) == SWITCH_STATUS_SUCCESS) {
+        if(switch_queue_trypush(queue, buff) == SWITCH_STATUS_SUCCESS) {
+            return SWITCH_STATUS_SUCCESS;
+        }
+        xdata_buffer_free(&buff);
+    }
+    return SWITCH_STATUS_FALSE;
+}
+
+/* Pops and free()s every entry that is still waiting in the text queue. */
+void text_queue_clean(switch_queue_t *queue) {
+    void *data = NULL;
+
+    if(!queue || !switch_queue_size(queue)) {
+        return;
+    }
+
+    while(switch_queue_trypop(queue, (void *)&data) == SWITCH_STATUS_SUCCESS) {
+        switch_safe_free(data);
+    }
+}
+
+/**
+ * Extracts results[0].alternatives[0].transcript from a JSON response.
+ *
+ * @param data      NUL-terminated JSON text, may be NULL
+ * @param stream    not used
+ * @return a strdup()'ed copy of the transcript that the caller must free(),
+ *         or NULL when the input is NULL, is not valid JSON or has no such element
+ */
+char *parse_response(char *data, switch_stream_handle_t *stream) {
+    char *result = NULL;
+    cJSON *json = NULL;
+
+    if(!data) {
+        return NULL;
+    }
+
+    if((json = cJSON_Parse(data)) != NULL) {
+        cJSON *jres = cJSON_GetObjectItem(json, "results");
+        if(jres && cJSON_GetArraySize(jres) > 0) {
+            cJSON *jelem = cJSON_GetArrayItem(jres, 0);
+            if(jelem) {
+                jres = cJSON_GetObjectItem(jelem, "alternatives");
+                if(jres && cJSON_GetArraySize(jres) > 0) {
+                    jelem = cJSON_GetArrayItem(jres, 0);
+                    if(jelem) {
+                        cJSON *jt = cJSON_GetObjectItem(jelem, "transcript");
+                        if(jt && jt->valuestring) {
+                            result = strdup(jt->valuestring);
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    if(json) {
+        cJSON_Delete(json);
+    }
+
+    return result;
+}
diff --git a/src/mod/asr_tts/mod_openai_asr/Makefile.am b/src/mod/asr_tts/mod_openai_asr/Makefile.am
new file mode 100644
index 00000000000..2213c833904
--- /dev/null
+++ b/src/mod/asr_tts/mod_openai_asr/Makefile.am
@@ -0,0 +1,12 @@
+
+include $(top_srcdir)/build/modmake.rulesam
+
+MODNAME=mod_openai_asr
+mod_LTLIBRARIES = mod_openai_asr.la
+mod_openai_asr_la_SOURCES = mod_openai_asr.c utils.c curl.c
+mod_openai_asr_la_CFLAGS = $(AM_CFLAGS) -I. 
-Wno-pointer-arith +mod_openai_asr_la_LIBADD = $(switch_builddir)/libfreeswitch.la +mod_openai_asr_la_LDFLAGS = -avoid-version -module -no-undefined -shared + +$(am_mod_openai_asr_la_OBJECTS): mod_openai_asr.h + diff --git a/src/mod/asr_tts/mod_openai_asr/conf/autoload_configs/openai_asr.conf.xml b/src/mod/asr_tts/mod_openai_asr/conf/autoload_configs/openai_asr.conf.xml new file mode 100644 index 00000000000..99b35e99dfd --- /dev/null +++ b/src/mod/asr_tts/mod_openai_asr/conf/autoload_configs/openai_asr.conf.xml @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/mod/asr_tts/mod_openai_asr/conf/dialplan/dialplan.xml b/src/mod/asr_tts/mod_openai_asr/conf/dialplan/dialplan.xml new file mode 100644 index 00000000000..e72bd6f5927 --- /dev/null +++ b/src/mod/asr_tts/mod_openai_asr/conf/dialplan/dialplan.xml @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/src/mod/asr_tts/mod_openai_asr/curl.c b/src/mod/asr_tts/mod_openai_asr/curl.c new file mode 100644 index 00000000000..622aafe9334 --- /dev/null +++ b/src/mod/asr_tts/mod_openai_asr/curl.c @@ -0,0 +1,127 @@ +/* + * FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application + * Copyright (C) 2005-2014, Anthony Minessale II + * + * Version: MPL 1.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. 
+ *
+ * Module Contributor(s):
+ * Konstantin Alexandrin
+ *
+ *
+ */
+#include "mod_openai_asr.h"
+
+/* libcurl write callback: appends the received bytes to the switch_buffer_t passed as user data. */
+static size_t curl_io_write_callback(char *buffer, size_t size, size_t nitems, void *user_data) {
+    switch_buffer_t *recv_buffer = (switch_buffer_t *)user_data;
+    size_t len = (size * nitems);
+
+    if(len > 0 && recv_buffer) {
+        switch_buffer_write(recv_buffer, buffer, len);
+    }
+
+    /* always report the full length so libcurl does not abort the transfer */
+    return len;
+}
+
+/**
+ * Uploads an audio file to globals->api_url as a multipart form ("model" + "file")
+ * and collects the response body in recv_buffer.
+ *
+ * @param recv_buffer   receives the response body; a NUL byte is appended when anything was received
+ * @param api_key       bearer token, may be NULL (no Authorization is configured then)
+ * @param model_name    value of the "model" form field
+ * @param filename      path of the file sent as the "file" form field
+ * @param globals       module settings (url, timeouts, proxy, user agent)
+ * @return SWITCH_STATUS_SUCCESS on HTTP 200, SWITCH_STATUS_FALSE otherwise
+ */
+switch_status_t curl_perform(switch_buffer_t *recv_buffer, char *api_key, char *model_name, char *filename, globals_t *globals) {
+    switch_status_t status = SWITCH_STATUS_SUCCESS;
+    CURL *curl_handle = NULL;
+    curl_mime *form = NULL;
+    curl_mimepart *field1=NULL, *field2=NULL;
+    switch_curl_slist_t *headers = NULL;
+    switch_CURLcode curl_ret = 0;
+    long http_resp = 0;
+
+    if(!globals || !globals->api_url || !filename) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Invalid arguments (api-url / filename)\n");
+        return SWITCH_STATUS_FALSE;
+    }
+
+    if((curl_handle = switch_curl_easy_init()) == NULL) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "switch_curl_easy_init()\n");
+        return SWITCH_STATUS_FALSE;
+    }
+
+    /* build the complete header list first, then register it once */
+    headers = switch_curl_slist_append(headers, "Content-Type: multipart/form-data");
+    headers = switch_curl_slist_append(headers, "Expect:");
+
+    /* numeric options are read by libcurl as 'long', so they are passed as 'long' explicitly */
+    switch_curl_easy_setopt(curl_handle, CURLOPT_HTTPHEADER, headers);
+    switch_curl_easy_setopt(curl_handle, CURLOPT_POST, 1L);
+    switch_curl_easy_setopt(curl_handle, CURLOPT_NOSIGNAL, 1L);
+    switch_curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, curl_io_write_callback);
+    switch_curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *) recv_buffer);
+
+    if(globals->connect_timeout > 0) {
+        switch_curl_easy_setopt(curl_handle, CURLOPT_CONNECTTIMEOUT, (long) globals->connect_timeout);
+    }
+    if(globals->request_timeout > 0) {
+        switch_curl_easy_setopt(curl_handle, CURLOPT_TIMEOUT, (long) globals->request_timeout);
+    }
+    if(globals->user_agent) {
+        switch_curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, globals->user_agent);
+    }
+    if(strncasecmp(globals->api_url, "https", 5) == 0) {
+        /* NOTE(security): peer and host verification are switched off, so the bearer token can be
+         * captured by a man-in-the-middle. Left unchanged to keep existing deployments working;
+         * consider making this configurable. */
+        switch_curl_easy_setopt(curl_handle, CURLOPT_SSL_VERIFYPEER, 0L);
+        switch_curl_easy_setopt(curl_handle, CURLOPT_SSL_VERIFYHOST, 0L);
+    }
+    if(globals->proxy) {
+        if(globals->proxy_credentials != NULL) {
+            switch_curl_easy_setopt(curl_handle, CURLOPT_PROXYAUTH, (long) CURLAUTH_ANY);
+            switch_curl_easy_setopt(curl_handle, CURLOPT_PROXYUSERPWD, globals->proxy_credentials);
+        }
+        if(strncasecmp(globals->proxy, "https", 5) == 0) {
+            switch_curl_easy_setopt(curl_handle, CURLOPT_PROXY_SSL_VERIFYPEER, 0L);
+        }
+        switch_curl_easy_setopt(curl_handle, CURLOPT_PROXY, globals->proxy);
+    }
+
+    if(api_key) {
+        curl_easy_setopt(curl_handle, CURLOPT_XOAUTH2_BEARER, api_key);
+        curl_easy_setopt(curl_handle, CURLOPT_HTTPAUTH, (long) CURLAUTH_BEARER);
+    }
+
+    if((form = curl_mime_init(curl_handle)) == NULL) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "curl_mime_init()\n");
+        status = SWITCH_STATUS_FALSE;
+        goto out;
+    }
+    if((field1 = curl_mime_addpart(form))) {
+        curl_mime_name(field1, "model");
+        curl_mime_data(field1, model_name, CURL_ZERO_TERMINATED);
+    }
+    if((field2 = curl_mime_addpart(form))) {
+        curl_mime_name(field2, "file");
+        /* do not send a request without the audio when the file cannot be attached */
+        if(curl_mime_filedata(field2, filename) != CURLE_OK) {
+            switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unable to attach file (%s)\n", filename);
+            status = SWITCH_STATUS_FALSE;
+            goto out;
+        }
+    }
+    switch_curl_easy_setopt(curl_handle, CURLOPT_MIMEPOST, form);
+
+    switch_curl_easy_setopt(curl_handle, CURLOPT_URL, globals->api_url);
+
+    curl_ret = switch_curl_easy_perform(curl_handle);
+    if(!curl_ret) {
+        switch_curl_easy_getinfo(curl_handle, CURLINFO_RESPONSE_CODE, &http_resp);
+        if(!http_resp) { switch_curl_easy_getinfo(curl_handle, CURLINFO_HTTP_CONNECTCODE, &http_resp); }
+    } else {
+        /* transport failure: the libcurl error code is reported in place of an HTTP status */
+        http_resp = curl_ret;
+    }
+
+    if(http_resp != 200) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "http-error=[%ld] (%s)\n", http_resp, globals->api_url);
+        status = SWITCH_STATUS_FALSE;
+    }
+
+    if(recv_buffer) {
+        /* terminate the body so the caller can treat it as a C string */
+        if(switch_buffer_inuse(recv_buffer) > 0) {
+            switch_buffer_write(recv_buffer, "\0", 1);
+        }
+    }
+
+out:
+    /* the easy handle is released before the mime structure and the header list it refers to */
+    if(curl_handle) {
+        switch_curl_easy_cleanup(curl_handle);
+    }
+    if(form) {
+        curl_mime_free(form);
+    }
+    if(headers) {
+        switch_curl_slist_free_all(headers);
+    }
+
+    return status;
+}
diff --git a/src/mod/asr_tts/mod_openai_asr/mod_openai_asr.c b/src/mod/asr_tts/mod_openai_asr/mod_openai_asr.c
new file mode 100644
index 00000000000..88923db07a0
--- /dev/null
+++ b/src/mod/asr_tts/mod_openai_asr/mod_openai_asr.c
@@ -0,0 +1,729 @@
+/*
+ * FreeSWITCH Modular Media Switching Software 
Library / Soft-Switch Application
+ * Copyright (C) 2005-2014, Anthony Minessale II
+ *
+ * Version: MPL 1.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * Module Contributor(s):
+ * Konstantin Alexandrin
+ *
+ *
+ * OpenAI Speech-To-Text service for the Freeswitch.
+ * https://platform.openai.com/docs/guides/speech-to-text
+ *
+ * Development repository:
+ * https://github.com/akscf/mod_openai_asr
+ *
+ */
+#include "mod_openai_asr.h"
+
+globals_t globals;
+
+SWITCH_MODULE_LOAD_FUNCTION(mod_openai_asr_load);
+SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_openai_asr_shutdown);
+SWITCH_MODULE_DEFINITION(mod_openai_asr, mod_openai_asr_load, mod_openai_asr_shutdown, NULL);
+
+/* Hands a heap-allocated text to the result queue and counts it; the text is freed when the queue rejects it. */
+static void push_transcription(asr_ctx_t *asr_ctx, char *txt) {
+    if(!txt) {
+        return;
+    }
+    if(switch_queue_trypush(asr_ctx->q_text, txt) == SWITCH_STATUS_SUCCESS) {
+        switch_mutex_lock(asr_ctx->mutex);
+        asr_ctx->transcription_results++;
+        switch_mutex_unlock(asr_ctx->mutex);
+    } else {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Queue is full!\n");
+        switch_safe_free(txt);
+    }
+}
+
+/**
+ * Worker thread, one per ASR context (obj is the asr_ctx_t created by asr_open).
+ *
+ * Collects the audio queued by asr_feed into a chunk buffer. Once the buffer is full,
+ * or silence_sec seconds have passed since the VAD reported the end of speech, the chunk
+ * is written to a file, sent through curl_perform() and the resulting text is queued.
+ * The loop ends when globals.fl_shutdown or asr_ctx->fl_destroyed is raised; on exit the
+ * thread drops its reference on the context and decrements globals.active_threads.
+ */
+static void *SWITCH_THREAD_FUNC transcribe_thread(switch_thread_t *thread, void *obj) {
+    volatile asr_ctx_t *_ref = (asr_ctx_t *)obj;
+    asr_ctx_t *asr_ctx = (asr_ctx_t *)_ref;
+    switch_status_t status = SWITCH_STATUS_FALSE;
+    switch_buffer_t *chunk_buffer = NULL;
+    switch_buffer_t *curl_recv_buffer = NULL;
+    switch_memory_pool_t *pool = NULL;
+    time_t sentence_timeout = 0;
+    uint32_t schunks = 0;
+    uint32_t chunk_buffer_size = 0;
+    uint8_t fl_cbuff_overflow = SWITCH_FALSE;
+    void *pop = NULL;
+
+    /* asr_close waits until refs is back to zero before it tears the context down */
+    switch_mutex_lock(asr_ctx->mutex);
+    asr_ctx->refs++;
+    switch_mutex_unlock(asr_ctx->mutex);
+
+    if(switch_core_new_memory_pool(&pool) != SWITCH_STATUS_SUCCESS) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "switch_core_new_memory_pool()\n");
+        goto out;
+    }
+    if(switch_buffer_create_dynamic(&curl_recv_buffer, 1024, 2048, 8192) != SWITCH_STATUS_SUCCESS) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "switch_buffer_create_dynamic()\n");
+        goto out;
+    }
+
+    while(SWITCH_TRUE) {
+        if(globals.fl_shutdown || asr_ctx->fl_destroyed) {
+            break;
+        }
+        /* the chunk size is only known after asr_feed has seen the first frame */
+        if(chunk_buffer_size == 0) {
+            switch_mutex_lock(asr_ctx->mutex);
+            chunk_buffer_size = asr_ctx->chunk_buffer_size;
+            switch_mutex_unlock(asr_ctx->mutex);
+
+            if(chunk_buffer_size > 0) {
+                if(switch_buffer_create(pool, &chunk_buffer, chunk_buffer_size) != SWITCH_STATUS_SUCCESS) {
+                    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "switch_buffer_create()\n");
+                    break;
+                }
+                switch_buffer_zero(chunk_buffer);
+            }
+            goto timer_next;
+        }
+
+        fl_cbuff_overflow = SWITCH_FALSE;
+        while(switch_queue_trypop(asr_ctx->q_audio, &pop) == SWITCH_STATUS_SUCCESS) {
+            xdata_buffer_t *audio_buffer = (xdata_buffer_t *)pop;
+            if(globals.fl_shutdown || asr_ctx->fl_destroyed ) {
+                xdata_buffer_free(&audio_buffer);
+                break;
+            }
+            if(audio_buffer && audio_buffer->len) {
+                switch_size_t used = switch_buffer_write(chunk_buffer, audio_buffer->data, audio_buffer->len);
+                /* 0 means the frame did not fit any more; both cases force a flush of the chunk */
+                if(used == 0 || used >= chunk_buffer_size) {
+                    fl_cbuff_overflow = SWITCH_TRUE;
+                    /* the popped buffer is ours: release it before leaving the loop */
+                    xdata_buffer_free(&audio_buffer);
+                    break;
+                }
+                schunks++;
+            }
+            xdata_buffer_free(&audio_buffer);
+        }
+
+        if(fl_cbuff_overflow) {
+            /* any non-zero value in the past makes the chunk due immediately */
+            sentence_timeout = 1;
+        } else {
+            if(schunks && asr_ctx->vad_state == SWITCH_VAD_STATE_STOP_TALKING) {
+                if(!sentence_timeout) {
+                    sentence_timeout = asr_ctx->silence_sec + switch_epoch_time_now(NULL);
+                }
+            }
+            /* speech resumed before the silence period ran out: keep collecting */
+            if(sentence_timeout && (asr_ctx->vad_state == SWITCH_VAD_STATE_START_TALKING || asr_ctx->vad_state == SWITCH_VAD_STATE_TALKING)) {
+                sentence_timeout = 0;
+            }
+        }
+
+        if(sentence_timeout && sentence_timeout <= switch_epoch_time_now(NULL)) {
+            const void *chunk_buffer_ptr = NULL;
+            const void *http_response_ptr = NULL;
+            uint32_t buf_len = 0, http_recv_len = 0, stt_failed = 0;
+            char *chunk_fname = NULL;
+
+            if((buf_len = switch_buffer_peek_zerocopy(chunk_buffer, &chunk_buffer_ptr)) > 0 && chunk_buffer_ptr) {
+                chunk_fname = chunk_write((switch_byte_t *)chunk_buffer_ptr, buf_len, asr_ctx->channels, asr_ctx->samplerate, globals.opt_encoding);
+            }
+            if(chunk_fname) {
+                /* start from a failure state so that zero configured attempts cannot reuse an old result */
+                status = SWITCH_STATUS_FALSE;
+                for(uint32_t rqtry = 0; rqtry < asr_ctx->retries_on_error; rqtry++) {
+                    switch_buffer_zero(curl_recv_buffer);
+                    status = curl_perform(curl_recv_buffer, asr_ctx->opt_api_key, asr_ctx->opt_model, chunk_fname, &globals);
+                    if(status == SWITCH_STATUS_SUCCESS || globals.fl_shutdown || asr_ctx->fl_destroyed) { break; }
+                    switch_yield(1000);
+                }
+
+                http_recv_len = switch_buffer_peek_zerocopy(curl_recv_buffer, &http_response_ptr);
+                if(status == SWITCH_STATUS_SUCCESS) {
+                    if(http_response_ptr && http_recv_len) {
+                        char *txt = parse_response((char *)http_response_ptr, NULL);
+#ifdef MOD_OPENAI_ASR_DEBUG
+                        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Service response [%s]\n", (char *)http_response_ptr);
+                        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Text [%s]\n", txt ? txt : "null");
+#endif
+                        /* an unparsable response is reported as an empty transcription */
+                        if(!txt) txt = strdup("");
+                        push_transcription(asr_ctx, txt);
+                    } else {
+                        stt_failed = 1;
+                        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Empty service response!\n");
+                    }
+                } else {
+                    stt_failed = 1;
+                    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unable to perform request!\n");
+                }
+
+                if(stt_failed) {
+                    push_transcription(asr_ctx, strdup("[transcription failed]"));
+                }
+
+                unlink(chunk_fname);
+                switch_safe_free(chunk_fname);
+            } else if(buf_len > 0) {
+                switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unable to store audio chunk!\n");
+            }
+
+            /* reset even when no file was produced, otherwise a full buffer would block all further audio */
+            schunks = 0;
+            sentence_timeout = 0;
+            switch_buffer_zero(chunk_buffer);
+        }
+
+        timer_next:
+        switch_yield(10000);
+    }
+
+out:
+    if(curl_recv_buffer) {
+        switch_buffer_destroy(&curl_recv_buffer);
+    }
+    if(chunk_buffer) {
+        switch_buffer_destroy(&chunk_buffer);
+    }
+    if(pool) {
+        switch_core_destroy_memory_pool(&pool);
+    }
+
+    switch_mutex_lock(asr_ctx->mutex);
+    if(asr_ctx->refs > 0) asr_ctx->refs--;
+    switch_mutex_unlock(asr_ctx->mutex);
+
+    switch_mutex_lock(globals.mutex);
+    if(globals.active_threads) globals.active_threads--;
+    switch_mutex_unlock(globals.mutex);
+
+    return NULL;
+}
+
+// ---------------------------------------------------------------------------------------------------------------------------------------------
+static switch_status_t asr_open(switch_asr_handle_t *ah, const char *codec, int samplerate, const char *dest, switch_asr_flag_t *flags) {
+    switch_status_t status = SWITCH_STATUS_SUCCESS;
+    switch_threadattr_t *attr = NULL;
+    switch_thread_t *thread = NULL;
+    asr_ctx_t *asr_ctx = NULL;
+
+    if(strcmp(codec, "L16") !=0) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unsupported encoding (%s)\n", codec);
+        switch_goto_status(SWITCH_STATUS_FALSE, out);
+    }
+
+    if((asr_ctx = switch_core_alloc(ah->memory_pool, sizeof(asr_ctx_t))) == NULL) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "switch_core_alloc()\n");
+        switch_goto_status(SWITCH_STATUS_GENERR, out);
+    }
+
+    asr_ctx->channels = 1;
+    asr_ctx->chunk_buffer_size = 0;
+    asr_ctx->samplerate = samplerate;
+    asr_ctx->silence_sec = globals.speech_silence_sec;
+    asr_ctx->retries_on_error = globals.retries_on_error;
+
+    asr_ctx->opt_model = globals.opt_model;
+    asr_ctx->opt_api_key = globals.api_key;
+
+    if((status = switch_mutex_init(&asr_ctx->mutex, SWITCH_MUTEX_NESTED, ah->memory_pool)) != SWITCH_STATUS_SUCCESS) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "switch_mutex_init()\n");
+
switch_goto_status(SWITCH_STATUS_GENERR, out);
+    }
+
+    switch_queue_create(&asr_ctx->q_audio, QUEUE_SIZE, ah->memory_pool);
+    switch_queue_create(&asr_ctx->q_text, QUEUE_SIZE, ah->memory_pool);
+
+    asr_ctx->vad_buffer = NULL;
+    asr_ctx->frame_len = 0;
+    asr_ctx->vad_buffer_size = 0;
+    asr_ctx->vad_stored_frames = 0;
+    asr_ctx->fl_vad_first_cycle = SWITCH_TRUE;
+
+    if((asr_ctx->vad = switch_vad_init(asr_ctx->samplerate, asr_ctx->channels)) == NULL) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "switch_vad_init()\n");
+        switch_goto_status(SWITCH_STATUS_GENERR, out);
+    }
+    switch_vad_set_mode(asr_ctx->vad, -1);
+    switch_vad_set_param(asr_ctx->vad, "debug", globals.fl_vad_debug);
+    if(globals.vad_silence_ms > 0) { switch_vad_set_param(asr_ctx->vad, "silence_ms", globals.vad_silence_ms); }
+    if(globals.vad_voice_ms > 0) { switch_vad_set_param(asr_ctx->vad, "voice_ms", globals.vad_voice_ms); }
+    if(globals.vad_threshold > 0) { switch_vad_set_param(asr_ctx->vad, "thresh", globals.vad_threshold); }
+
+    ah->private_info = asr_ctx;
+
+    /* counted before the thread starts; the worker decrements it when it exits */
+    switch_mutex_lock(globals.mutex);
+    globals.active_threads++;
+    switch_mutex_unlock(globals.mutex);
+
+    switch_threadattr_create(&attr, ah->memory_pool);
+    switch_threadattr_detach_set(attr, 1);
+    switch_threadattr_stacksize_set(attr, SWITCH_THREAD_STACKSIZE);
+    if(switch_thread_create(&thread, attr, transcribe_thread, asr_ctx, ah->memory_pool) != SWITCH_STATUS_SUCCESS) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "switch_thread_create()\n");
+
+        /* no worker exists to decrement the counter, so undo it here or shutdown would wait forever */
+        switch_mutex_lock(globals.mutex);
+        if(globals.active_threads) globals.active_threads--;
+        switch_mutex_unlock(globals.mutex);
+
+        switch_vad_destroy(&asr_ctx->vad);
+        switch_goto_status(SWITCH_STATUS_GENERR, out);
+    }
+
+out:
+    return status;
+}
+
+/**
+ * Closes the ASR handle: raises the abort/destroyed flags, waits until the worker
+ * thread has dropped its reference, then releases the queues, the VAD and the VAD buffer.
+ *
+ * NOTE(review): refs is incremented by the worker itself once it is running; if close
+ * is called before that happens the wait below is skipped - confirm this cannot occur.
+ */
+static switch_status_t asr_close(switch_asr_handle_t *ah, switch_asr_flag_t *flags) {
+    asr_ctx_t *asr_ctx = (asr_ctx_t *)ah->private_info;
+    uint32_t refs = 0;
+
+    assert(asr_ctx != NULL);
+
+    asr_ctx->fl_abort = SWITCH_TRUE;
+    asr_ctx->fl_destroyed = SWITCH_TRUE;
+
+    /* snapshot under the lock so the value that is logged is the one that was tested */
+    switch_mutex_lock(asr_ctx->mutex);
+    refs = asr_ctx->refs;
+    switch_mutex_unlock(asr_ctx->mutex);
+
+    if(refs != 0) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Waiting for unlock (refs=%u)...\n", refs);
+        while(refs != 0) {
+            switch_yield(100000);
+            switch_mutex_lock(asr_ctx->mutex);
+            refs = asr_ctx->refs;
+            switch_mutex_unlock(asr_ctx->mutex);
+        }
+    }
+
+    if(asr_ctx->q_audio) {
+        xdata_buffer_queue_clean(asr_ctx->q_audio);
+        switch_queue_term(asr_ctx->q_audio);
+    }
+    if(asr_ctx->q_text) {
+        text_queue_clean(asr_ctx->q_text);
+        switch_queue_term(asr_ctx->q_text);
+    }
+    if(asr_ctx->vad) {
+        switch_vad_destroy(&asr_ctx->vad);
+    }
+    if(asr_ctx->vad_buffer) {
+        switch_buffer_destroy(&asr_ctx->vad_buffer);
+    }
+
+    switch_set_flag(ah, SWITCH_ASR_FLAG_CLOSED);
+
+    return SWITCH_STATUS_SUCCESS;
+}
+
+/**
+ * Receives one audio frame, runs it through the VAD and queues voiced audio for the worker.
+ *
+ * While nobody is talking the frames are kept in a small history buffer; when speech starts
+ * the last VAD_RECOVERY_FRAMES frames of that history are put in front of the current frame,
+ * so the beginning of the utterance is not lost.
+ *
+ * @return SWITCH_STATUS_BREAK when the handle is closed/aborted or the frame is empty,
+ *         SWITCH_STATUS_SUCCESS otherwise (including while paused, when the frame is ignored)
+ */
+static switch_status_t asr_feed(switch_asr_handle_t *ah, void *data, unsigned int data_len, switch_asr_flag_t *flags) {
+    asr_ctx_t *asr_ctx = (asr_ctx_t *) ah->private_info;
+    switch_vad_state_t vad_state = 0;
+    uint8_t fl_has_audio = SWITCH_FALSE;
+
+    assert(asr_ctx != NULL);
+
+    if(switch_test_flag(ah, SWITCH_ASR_FLAG_CLOSED)) {
+        return SWITCH_STATUS_BREAK;
+    }
+    if(asr_ctx->fl_destroyed || asr_ctx->fl_abort) {
+        return SWITCH_STATUS_BREAK;
+    }
+    if(asr_ctx->fl_pause) {
+        return SWITCH_STATUS_SUCCESS;
+    }
+    if(!data || !data_len) {
+        return SWITCH_STATUS_BREAK;
+    }
+
+    /* first frame: derive the buffer sizes from its length */
+    if(data_len > 0 && asr_ctx->frame_len == 0) {
+        switch_mutex_lock(asr_ctx->mutex);
+        asr_ctx->frame_len = data_len;
+        asr_ctx->vad_buffer_size = asr_ctx->frame_len * VAD_STORE_FRAMES;
+        /* NOTE(review): samplerate * seconds is a sample count while the buffer is filled with bytes;
+         * for 16-bit audio it holds about half of speech_max_sec - confirm this is intended */
+        asr_ctx->chunk_buffer_size = asr_ctx->samplerate * globals.speech_max_sec;
+        switch_mutex_unlock(asr_ctx->mutex);
+
+        if(switch_buffer_create(ah->memory_pool, &asr_ctx->vad_buffer, asr_ctx->vad_buffer_size) != SWITCH_STATUS_SUCCESS) {
+            /* without a history buffer the VAD branch below is skipped and every frame is queued */
+            asr_ctx->vad_buffer_size = 0;
+            switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "switch_buffer_create()\n");
+        }
+    }
+
+    if(asr_ctx->vad_buffer_size) {
+        /* nobody is talking (vad_state is still 0 here): remember the frame in the history buffer */
+        if(asr_ctx->vad_state == SWITCH_VAD_STATE_STOP_TALKING || (asr_ctx->vad_state == vad_state && vad_state == SWITCH_VAD_STATE_NONE)) {
+            if(data_len <= asr_ctx->frame_len) {
+                /* history full: start writing from the beginning again (old bytes stay in memory) */
+                if(asr_ctx->vad_stored_frames >= VAD_STORE_FRAMES) {
+                    switch_buffer_zero(asr_ctx->vad_buffer);
+                    asr_ctx->vad_stored_frames = 0;
+                    asr_ctx->fl_vad_first_cycle = SWITCH_FALSE;
+                }
+                switch_buffer_write(asr_ctx->vad_buffer, data, MIN(asr_ctx->frame_len, data_len));
+                asr_ctx->vad_stored_frames++;
+            }
+        }
+
+        vad_state = switch_vad_process(asr_ctx->vad, (int16_t *)data, (data_len / sizeof(int16_t)));
+        if(vad_state == SWITCH_VAD_STATE_START_TALKING) {
+            asr_ctx->vad_state = vad_state;
+            fl_has_audio = SWITCH_TRUE;
+        } else if (vad_state == SWITCH_VAD_STATE_STOP_TALKING) {
+            asr_ctx->vad_state = vad_state;
+            fl_has_audio = SWITCH_FALSE;
+            switch_vad_reset(asr_ctx->vad);
+        } else if (vad_state == SWITCH_VAD_STATE_TALKING) {
+            asr_ctx->vad_state = vad_state;
+            fl_has_audio = SWITCH_TRUE;
+        }
+    } else {
+        fl_has_audio = SWITCH_TRUE;
+    }
+
+    if(fl_has_audio) {
+        if(vad_state == SWITCH_VAD_STATE_START_TALKING && asr_ctx->vad_stored_frames > 0) {
+            xdata_buffer_t *tau_buf = NULL;
+            const void *ptr = NULL;
+            switch_size_t vblen = 0;
+            uint32_t rframes = 0, rlen = 0;
+            int ofs = 0;
+
+            if((vblen = switch_buffer_peek_zerocopy(asr_ctx->vad_buffer, &ptr)) && ptr && vblen > 0) {
+                /* number of history frames to put in front of the current frame */
+                rframes = (asr_ctx->vad_stored_frames >= VAD_RECOVERY_FRAMES ? VAD_RECOVERY_FRAMES : (asr_ctx->fl_vad_first_cycle ? asr_ctx->vad_stored_frames : VAD_RECOVERY_FRAMES));
+                rlen = (rframes * asr_ctx->frame_len);
+                ofs = (vblen - rlen);
+
+                if(ofs < 0) {
+                    /* fewer bytes stored than wanted: take the missing part from the end of the
+                     * buffer memory, which still holds frames from before the last wrap */
+                    uint32_t hdr_sz = -ofs;
+                    uint32_t hdr_ofs = (asr_ctx->vad_buffer_size - hdr_sz);
+
+                    switch_zmalloc(tau_buf, sizeof(xdata_buffer_t));
+
+                    tau_buf->len = (hdr_sz + vblen + data_len);
+                    switch_malloc(tau_buf->data, tau_buf->len);
+
+                    /* hdr_sz + vblen == rlen, so the current frame lands right behind the history */
+                    memcpy(tau_buf->data, (void *)(ptr + hdr_ofs), hdr_sz);
+                    memcpy(tau_buf->data + hdr_sz , (void *)(ptr + 0), vblen);
+                    memcpy(tau_buf->data + rlen, data, data_len);
+
+                    if(switch_queue_trypush(asr_ctx->q_audio, tau_buf) != SWITCH_STATUS_SUCCESS) {
+                        xdata_buffer_free(&tau_buf);
+                    }
+
+                    switch_buffer_zero(asr_ctx->vad_buffer);
+                    asr_ctx->vad_stored_frames = 0;
+                } else {
+                    /* enough history available: copy its last rlen bytes followed by the current frame */
+                    switch_zmalloc(tau_buf, sizeof(xdata_buffer_t));
+
+                    tau_buf->len = (rlen + data_len);
+                    switch_malloc(tau_buf->data, tau_buf->len);
+
+                    memcpy(tau_buf->data, (void *)(ptr + ofs), rlen);
+                    memcpy(tau_buf->data + rlen, data, data_len);
+
+                    if(switch_queue_trypush(asr_ctx->q_audio, tau_buf) != SWITCH_STATUS_SUCCESS) {
+                        xdata_buffer_free(&tau_buf);
+                    }
+
+                    switch_buffer_zero(asr_ctx->vad_buffer);
+                    asr_ctx->vad_stored_frames = 0;
+                }
+            }
+        } else {
+            xdata_buffer_push(asr_ctx->q_audio, data, data_len);
+        }
+    }
+
+    return SWITCH_STATUS_SUCCESS;
+}
+
+static switch_status_t asr_check_results(switch_asr_handle_t *ah, switch_asr_flag_t *flags) {
+    asr_ctx_t *asr_ctx = (asr_ctx_t *)ah->private_info;
+
+    assert(asr_ctx != NULL);
+
+    if(asr_ctx->fl_pause) {
+        return SWITCH_STATUS_FALSE;
+    }
+
+    return (asr_ctx->transcription_results > 0 ? 
SWITCH_STATUS_SUCCESS : SWITCH_STATUS_FALSE); +} + +static switch_status_t asr_get_results(switch_asr_handle_t *ah, char **xmlstr, switch_asr_flag_t *flags) { + asr_ctx_t *asr_ctx = (asr_ctx_t *)ah->private_info; + switch_status_t status = SWITCH_STATUS_FALSE; + void *pop = NULL; + + assert(asr_ctx != NULL); + + if(switch_queue_trypop(asr_ctx->q_text, &pop) == SWITCH_STATUS_SUCCESS) { + if(pop) { + *xmlstr = (char *)pop; + status = SWITCH_STATUS_SUCCESS; + + switch_mutex_lock(asr_ctx->mutex); + if(asr_ctx->transcription_results > 0) asr_ctx->transcription_results--; + switch_mutex_unlock(asr_ctx->mutex); + } + } + + return status; +} + +static switch_status_t asr_start_input_timers(switch_asr_handle_t *ah) { + asr_ctx_t *asr_ctx = (asr_ctx_t *)ah->private_info; + + assert(asr_ctx != NULL); + + asr_ctx->fl_start_timers = SWITCH_TRUE; + + return SWITCH_STATUS_SUCCESS; +} + +static switch_status_t asr_pause(switch_asr_handle_t *ah) { + asr_ctx_t *asr_ctx = (asr_ctx_t *)ah->private_info; + + assert(asr_ctx != NULL); + + asr_ctx->fl_pause = SWITCH_TRUE; + + return SWITCH_STATUS_SUCCESS; +} + +static switch_status_t asr_resume(switch_asr_handle_t *ah) { + asr_ctx_t *asr_ctx = (asr_ctx_t *)ah->private_info; + + assert(asr_ctx != NULL); + + asr_ctx->fl_pause = SWITCH_FALSE; + + return SWITCH_STATUS_SUCCESS; +} + +static void asr_text_param(switch_asr_handle_t *ah, char *param, const char *val) { + asr_ctx_t *asr_ctx = (asr_ctx_t *)ah->private_info; + + assert(asr_ctx != NULL); + + if(strcasecmp(param, "lang") == 0) { + if(val) asr_ctx->opt_lang = switch_core_strdup(ah->memory_pool, val); + } else if(strcasecmp(param, "model") == 0) { + if(val) asr_ctx->opt_model = switch_core_strdup(ah->memory_pool, val); + } else if(strcasecmp(param, "key") == 0) { + if(val) asr_ctx->opt_api_key = switch_core_strdup(ah->memory_pool, val); + } else if(strcasecmp(param, "silence") == 0) { + if(val) asr_ctx->silence_sec = atoi(val); + } +} + +static void asr_numeric_param(switch_asr_handle_t 
*ah, char *param, int val) { +} + +static void asr_float_param(switch_asr_handle_t *ah, char *param, double val) { +} + +static switch_status_t asr_load_grammar(switch_asr_handle_t *ah, const char *grammar, const char *name) { + return SWITCH_STATUS_SUCCESS; +} + +static switch_status_t asr_unload_grammar(switch_asr_handle_t *ah, const char *name) { + return SWITCH_STATUS_SUCCESS; +} + +// --------------------------------------------------------------------------------------------------------------------------------------------- +#define CMD_SYNTAX "path_to/filename.(mp3|wav) [key=altkey model=altModel]\n" +SWITCH_STANDARD_API(openai_asr_cmd_handler) { + switch_status_t status = 0; + char *mycmd = NULL, *argv[10] = { 0 }; int argc = 0; + switch_buffer_t *recv_buf = NULL; + const void *response_ptr = NULL; + char *opt_api_key = globals.api_key; + char *opt_model = globals.opt_model; + char *file_name = NULL, *file_ext = NULL; + uint32_t recv_len = 0; + + if (!zstr(cmd)) { + mycmd = strdup(cmd); + switch_assert(mycmd); + argc = switch_separate_string(mycmd, ' ', argv, (sizeof(argv) / sizeof(argv[0]))); + } + if(argc == 0) { + goto usage; + } + + file_name = argv[0]; + if(switch_file_exists(file_name, NULL) != SWITCH_STATUS_SUCCESS) { + stream->write_function(stream, "-ERR: file not found (%s)\n", file_name); + goto out; + } + + file_ext = strrchr(file_name, '.'); + if(!file_ext) { + stream->write_function(stream, "-ERR: unsupported file encoding (null)\n"); + goto out; + } + + file_ext++; + if(strcasecmp("mp3", file_ext) && strcasecmp("wav", file_ext)) { + stream->write_function(stream, "-ERR: unsupported file encoding (%s)\n", file_ext); + goto out; + } + + if(switch_buffer_create_dynamic(&recv_buf, 1024, 2048, 8192) != SWITCH_STATUS_SUCCESS) { + stream->write_function(stream, "-ERR: switch_buffer_create_dynamic()\n"); + goto out; + } + + if(argc > 1) { + for(int i = 1; i < argc; i++) { + char *kvp[2] = { 0 }; + if(switch_separate_string(argv[i], '=', kvp, 2) >= 2) 
{ + if(strcasecmp(kvp[0], "key") == 0) { + if(kvp[1]) opt_api_key = kvp[1]; + } else if(strcasecmp(kvp[0], "model") == 0) { + if(kvp[1]) opt_model = kvp[1]; + } + } + } + } + + status = curl_perform(recv_buf, opt_api_key, opt_model, file_name, &globals); + + recv_len = switch_buffer_peek_zerocopy(recv_buf, &response_ptr); + if(status == SWITCH_STATUS_SUCCESS && response_ptr && recv_len) { + char *txt = parse_response((char *)response_ptr, stream); + if(txt) { + stream->write_function(stream, "+OK: %s\n", txt); + } + switch_safe_free(txt); + } else { + stream->write_function(stream, "-ERR: unable to perform request\n"); + } + + goto out; +usage: + stream->write_function(stream, "-ERR:\nUsage: %s\n", CMD_SYNTAX); + +out: + if(recv_buf) { + switch_buffer_destroy(&recv_buf); + } + + switch_safe_free(mycmd); + return SWITCH_STATUS_SUCCESS; +} + +// --------------------------------------------------------------------------------------------------------------------------------------------- +// main +// --------------------------------------------------------------------------------------------------------------------------------------------- +SWITCH_MODULE_LOAD_FUNCTION(mod_openai_asr_load) { + switch_status_t status = SWITCH_STATUS_SUCCESS; + switch_xml_t cfg, xml, settings, param; + switch_api_interface_t *commands_interface; + switch_asr_interface_t *asr_interface; + + memset(&globals, 0, sizeof(globals)); + switch_mutex_init(&globals.mutex, SWITCH_MUTEX_NESTED, pool); + + if((xml = switch_xml_open_cfg(MOD_CONFIG_NAME, &cfg, NULL)) == NULL) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unable to open configuration: %s\n", MOD_CONFIG_NAME); + switch_goto_status(SWITCH_STATUS_GENERR, out); + } + + if((settings = switch_xml_child(cfg, "settings"))) { + for (param = switch_xml_child(settings, "param"); param; param = param->next) { + char *var = (char *) switch_xml_attr_soft(param, "name"); + char *val = (char *) switch_xml_attr_soft(param, "value"); + + 
if(!strcasecmp(var, "vad-silence-ms")) { + if(val) globals.vad_silence_ms = atoi (val); + } else if(!strcasecmp(var, "vad-voice-ms")) { + if(val) globals.vad_voice_ms = atoi (val); + } else if(!strcasecmp(var, "vad-threshold")) { + if(val) globals.vad_threshold = atoi (val); + } else if(!strcasecmp(var, "vad-debug")) { + if(val) globals.fl_vad_debug = switch_true(val); + } else if(!strcasecmp(var, "api-key")) { + if(val) globals.api_key = switch_core_strdup(pool, val); + } else if(!strcasecmp(var, "api-url")) { + if(val) globals.api_url = switch_core_strdup(pool, val); + } else if(!strcasecmp(var, "user-agent")) { + if(val) globals.user_agent = switch_core_strdup(pool, val); + } else if(!strcasecmp(var, "proxy")) { + if(val) globals.proxy = switch_core_strdup(pool, val); + } else if(!strcasecmp(var, "proxy-credentials")) { + if(val) globals.proxy_credentials = switch_core_strdup(pool, val); + } else if(!strcasecmp(var, "encoding")) { + if(val) globals.opt_encoding = switch_core_strdup(pool, val); + } else if(!strcasecmp(var, "model")) { + if(val) globals.opt_model= switch_core_strdup(pool, val); + } else if(!strcasecmp(var, "speech-max-sec")) { + if(val) globals.speech_max_sec = atoi(val); + } else if(!strcasecmp(var, "speech-silence-sec")) { + if(val) globals.speech_silence_sec = atoi(val); + } else if(!strcasecmp(var, "request-timeout")) { + if(val) globals.request_timeout = atoi(val); + } else if(!strcasecmp(var, "connect-timeout")) { + if(val) globals.connect_timeout = atoi(val); + } else if(!strcasecmp(var, "log-http-errors")) { + if(val) globals.fl_log_http_errors = switch_true(val); + } else if(!strcasecmp(var, "retries-on-error")) { + if(val) globals.retries_on_error = atoi(val); + } + } + } + + if(!globals.api_url) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Missing required parameter: api-url\n"); + switch_goto_status(SWITCH_STATUS_GENERR, out); + } + + globals.opt_encoding = globals.opt_encoding ? 
globals.opt_encoding : "wav"; + globals.speech_max_sec = !globals.speech_max_sec ? 35 : globals.speech_max_sec; + globals.speech_silence_sec = !globals.speech_silence_sec ? 3 : globals.speech_silence_sec; + globals.retries_on_error = !globals.retries_on_error ? 1 : globals.retries_on_error; + + globals.tmp_path = switch_core_sprintf(pool, "%s%sopenai-asr-cache", SWITCH_GLOBAL_dirs.temp_dir, SWITCH_PATH_SEPARATOR); + if(switch_directory_exists(globals.tmp_path, NULL) != SWITCH_STATUS_SUCCESS) { + switch_dir_make(globals.tmp_path, SWITCH_FPROT_OS_DEFAULT, NULL); + } + + *module_interface = switch_loadable_module_create_module_interface(pool, modname); + SWITCH_ADD_API(commands_interface, "openai_asr_transcript", "OpenAI speech-to-text", openai_asr_cmd_handler, CMD_SYNTAX); + + asr_interface = switch_loadable_module_create_interface(*module_interface, SWITCH_ASR_INTERFACE); + asr_interface->interface_name = "openai"; + asr_interface->asr_open = asr_open; + asr_interface->asr_close = asr_close; + asr_interface->asr_feed = asr_feed; + asr_interface->asr_pause = asr_pause; + asr_interface->asr_resume = asr_resume; + asr_interface->asr_check_results = asr_check_results; + asr_interface->asr_get_results = asr_get_results; + asr_interface->asr_start_input_timers = asr_start_input_timers; + asr_interface->asr_text_param = asr_text_param; + asr_interface->asr_numeric_param = asr_numeric_param; + asr_interface->asr_float_param = asr_float_param; + asr_interface->asr_load_grammar = asr_load_grammar; + asr_interface->asr_unload_grammar = asr_unload_grammar; + + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "OpenAI-ASR (%s)\n", MOD_VERSION); +out: + if(xml) { + switch_xml_free(xml); + } + return status; +} + +SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_openai_asr_shutdown) { + uint8_t fl_wloop = SWITCH_TRUE; + + globals.fl_shutdown = SWITCH_TRUE; + + switch_mutex_lock(globals.mutex); + fl_wloop = (globals.active_threads > 0); + switch_mutex_unlock(globals.mutex); + + 
if(fl_wloop) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Waiting for termination (%d) threads...\n", globals.active_threads); + while(fl_wloop) { + switch_mutex_lock(globals.mutex); + fl_wloop = (globals.active_threads > 0); + switch_mutex_unlock(globals.mutex); + switch_yield(100000); + } + } + + return SWITCH_STATUS_SUCCESS; +} diff --git a/src/mod/asr_tts/mod_openai_asr/mod_openai_asr.h b/src/mod/asr_tts/mod_openai_asr/mod_openai_asr.h new file mode 100644 index 00000000000..230cc68212c --- /dev/null +++ b/src/mod/asr_tts/mod_openai_asr/mod_openai_asr.h @@ -0,0 +1,110 @@ +/* + * FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application + * Copyright (C) 2005-2014, Anthony Minessale II + * + * Version: MPL 1.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. 
+ * + * Module Contributor(s): + * Konstantin Alexandrin + * + * + */ +#ifndef MOD_OPENAI_ASR_H +#define MOD_OPENAI_ASR_H + +#include +#include +#include + +#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MAX(a,b) (((a)>(b))?(a):(b)) + +#define MOD_CONFIG_NAME "openai_asr.conf" +#define MOD_VERSION "1.0.4" +#define QUEUE_SIZE 128 +#define VAD_STORE_FRAMES 64 +#define VAD_RECOVERY_FRAMES 20 + +//#define MOD_OPENAI_ASR_DEBUG + +typedef struct { + switch_mutex_t *mutex; + uint32_t active_threads; + uint32_t speech_max_sec; + uint32_t speech_silence_sec; + uint32_t vad_silence_ms; + uint32_t vad_voice_ms; + uint32_t vad_threshold; + uint32_t request_timeout; // secondss + uint32_t connect_timeout; // seconds + uint32_t retries_on_error; + uint8_t fl_vad_debug; + uint8_t fl_shutdown; + uint8_t fl_log_http_errors; + char *tmp_path; + char *api_key; + char *api_url; + char *user_agent; + char *proxy; + char *proxy_credentials; + char *opt_encoding; + char *opt_model; +} globals_t; + +typedef struct { + switch_memory_pool_t *pool; + switch_vad_t *vad; + switch_buffer_t *vad_buffer; + switch_mutex_t *mutex; + switch_queue_t *q_audio; + switch_queue_t *q_text; + switch_buffer_t *curl_recv_buffer_ref; + switch_vad_state_t vad_state; + char *opt_lang; + char *opt_model; + char *opt_api_key; + int32_t transcription_results; + uint32_t retries_on_error; + uint32_t vad_buffer_size; + uint32_t vad_stored_frames; + uint32_t chunk_buffer_size; + uint32_t refs; + uint32_t samplerate; + uint32_t channels; + uint32_t frame_len; + uint32_t silence_sec; + uint8_t fl_start_timers; + uint8_t fl_pause; + uint8_t fl_vad_first_cycle; + uint8_t fl_destroyed; + uint8_t fl_abort; +} asr_ctx_t; + +typedef struct { + uint32_t len; + switch_byte_t *data; +} xdata_buffer_t; + +/* curl.c */ +switch_status_t curl_perform(switch_buffer_t *recv_buffer, char *api_key, char *model_name, char *filename, globals_t *globals); + +/* utils.c */ +char *chunk_write(switch_byte_t *buf, uint32_t buf_len, uint32_t 
channels, uint32_t samplerate, const char *file_ext); +switch_status_t xdata_buffer_push(switch_queue_t *queue, switch_byte_t *data, uint32_t data_len); +switch_status_t xdata_buffer_alloc(xdata_buffer_t **out, switch_byte_t *data, uint32_t data_len); +void xdata_buffer_free(xdata_buffer_t **buf); +void xdata_buffer_queue_clean(switch_queue_t *queue); +void text_queue_clean(switch_queue_t *queue); +char *parse_response(char *data, switch_stream_handle_t *stream); + +#endif diff --git a/src/mod/asr_tts/mod_openai_asr/utils.c b/src/mod/asr_tts/mod_openai_asr/utils.c new file mode 100644 index 00000000000..ed9a620dc2c --- /dev/null +++ b/src/mod/asr_tts/mod_openai_asr/utils.c @@ -0,0 +1,149 @@ +/* + * FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application + * Copyright (C) 2005-2014, Anthony Minessale II + * + * Version: MPL 1.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. 
+ * + * Module Contributor(s): + * Konstantin Alexandrin + * + * + */ +#include "mod_openai_asr.h" + +extern globals_t globals; + +switch_status_t xdata_buffer_alloc(xdata_buffer_t **out, switch_byte_t *data, uint32_t data_len) { + xdata_buffer_t *buf = NULL; + + switch_zmalloc(buf, sizeof(xdata_buffer_t)); + + if(data_len) { + switch_malloc(buf->data, data_len); + switch_assert(buf->data); + + buf->len = data_len; + memcpy(buf->data, data, data_len); + } + + *out = buf; + return SWITCH_STATUS_SUCCESS; +} + +void xdata_buffer_free(xdata_buffer_t **buf) { + if(buf && *buf) { + switch_safe_free((*buf)->data); + free(*buf); + } +} + +void xdata_buffer_queue_clean(switch_queue_t *queue) { + xdata_buffer_t *data = NULL; + + if(!queue || !switch_queue_size(queue)) { + return; + } + + while(switch_queue_trypop(queue, (void *) &data) == SWITCH_STATUS_SUCCESS) { + if(data) { xdata_buffer_free(&data); } + } +} + +switch_status_t xdata_buffer_push(switch_queue_t *queue, switch_byte_t *data, uint32_t data_len) { + xdata_buffer_t *buff = NULL; + + if(xdata_buffer_alloc(&buff, data, data_len) == SWITCH_STATUS_SUCCESS) { + if(switch_queue_trypush(queue, buff) == SWITCH_STATUS_SUCCESS) { + return SWITCH_STATUS_SUCCESS; + } + xdata_buffer_free(&buff); + } + return SWITCH_STATUS_FALSE; +} + +char *chunk_write(switch_byte_t *buf, uint32_t buf_len, uint32_t channels, uint32_t samplerate, const char *file_ext) { + switch_status_t status = SWITCH_STATUS_FALSE; + switch_size_t len = (buf_len / sizeof(int16_t)); + switch_file_handle_t fh = { 0 }; + char *file_name = NULL; + char name_uuid[SWITCH_UUID_FORMATTED_LENGTH + 1] = { 0 }; + int flags = (SWITCH_FILE_FLAG_WRITE | SWITCH_FILE_DATA_SHORT); + + switch_uuid_str((char *)name_uuid, sizeof(name_uuid)); + file_name = switch_mprintf("%s%s%s.%s", globals.tmp_path, SWITCH_PATH_SEPARATOR, name_uuid, (file_ext == NULL ? 
"wav" : file_ext) ); + + if((status = switch_core_file_open(&fh, file_name, channels, samplerate, flags, NULL)) != SWITCH_STATUS_SUCCESS) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unable to open file (%s)\n", file_name); + goto out; + } + + if((status = switch_core_file_write(&fh, buf, &len)) != SWITCH_STATUS_SUCCESS) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unable to write (%s)\n", file_name); + goto out; + } + + switch_core_file_close(&fh); +out: + if(status != SWITCH_STATUS_SUCCESS) { + if(file_name) { + unlink(file_name); + switch_safe_free(file_name); + } + return NULL; + } + + return file_name; +} + +void text_queue_clean(switch_queue_t *queue) { + void *data = NULL; + + if(!queue || !switch_queue_size(queue)) { + return; + } + + while(switch_queue_trypop(queue, (void *)&data) == SWITCH_STATUS_SUCCESS) { + switch_safe_free(data); + } +} + +char *parse_response(char *data, switch_stream_handle_t *stream) { + char *result = NULL; + cJSON *json = NULL; + + if(!data) { + return NULL; + } + + if(!(json = cJSON_Parse(data))) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unable to parse json (%s)\n", data); + if(stream) stream->write_function(stream, "-ERR: Unable to parse json (see log)\n"); + } else { + cJSON *jres = cJSON_GetObjectItem(json, "error"); + if(jres) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Service returns error (%s)\n", data); + if(stream) stream->write_function(stream, "-ERR: Service returns error (see log)\n"); + } else { + cJSON *jres = cJSON_GetObjectItem(json, "text"); + if(jres) { + result = strdup(jres->valuestring); + } + } + } + + if(json) { + cJSON_Delete(json); + } + + return result; +} diff --git a/src/switch_hashtable.c b/src/switch_hashtable.c index 66669acb63d..107d539c4f4 100644 --- a/src/switch_hashtable.c +++ b/src/switch_hashtable.c @@ -126,7 +126,7 @@ hashtable_expand(switch_hashtable_t *h) realloc(h->table, newsize * sizeof(struct entry *)); if (NULL == 
newtable) { (h->primeindex)--; return 0; } h->table = newtable; - memset(newtable[h->tablelength], 0, newsize - h->tablelength); + memset(&newtable[h->tablelength], 0, (newsize - h->tablelength) * sizeof(struct entry*)); for (i = 0; i < h->tablelength; i++) { for (pE = &(newtable[i]), e = *pE; e != NULL; e = *pE) { index = indexFor(newsize,e->h);