From 35be7897a9afbd23781dfb686e37de6c0e09a91a Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Sat, 5 Jul 2025 18:16:05 -0700 Subject: [PATCH 01/14] Memory optimizations. Allocate minimal placeholder only for h264 (CONFIG_MAX_NALU_SIZE); Move LVGL graphics working memory out of internal memory to PSRAM. --- esp32-s3-box-3/CMakeLists.txt | 5 +++++ esp32-s3-box-3/sdkconfig.defaults | 5 +++++ esp32-s3-box-3/src/main.cpp | 14 +++++++------- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/esp32-s3-box-3/CMakeLists.txt b/esp32-s3-box-3/CMakeLists.txt index fbba559..6195c59 100644 --- a/esp32-s3-box-3/CMakeLists.txt +++ b/esp32-s3-box-3/CMakeLists.txt @@ -7,6 +7,11 @@ if(NOT IDF_TARGET STREQUAL linux) add_compile_definitions(WIFI_SSID="$ENV{WIFI_SSID}") add_compile_definitions(WIFI_PASSWORD="$ENV{WIFI_PASSWORD}") + + # We are not using video in this app, so set the max NALU size to very small + if(NOT DEFINED ENV{CONFIG_MAX_NALU_SIZE}) + add_compile_definitions(CONFIG_MAX_NALU_SIZE=32) + endif() endif() if(NOT DEFINED ENV{PIPECAT_SMALLWEBRTC_URL}) diff --git a/esp32-s3-box-3/sdkconfig.defaults b/esp32-s3-box-3/sdkconfig.defaults index 236be55..99300dd 100644 --- a/esp32-s3-box-3/sdkconfig.defaults +++ b/esp32-s3-box-3/sdkconfig.defaults @@ -33,3 +33,8 @@ CONFIG_COMPILER_OPTIMIZATION_PERF=y CONFIG_COMPILER_OPTIMIZATION_ASSERTIONS_DISABLE=y CONFIG_CODEC_I2C_BACKWARD_COMPATIBLE=n + +# Use C library memory allocator for LVGL to save 64KB of BSS +# This removes the 64KB work_mem_int buffer from BSS +CONFIG_LV_USE_BUILTIN_MALLOC=n +CONFIG_LV_USE_CLIB_MALLOC=y diff --git a/esp32-s3-box-3/src/main.cpp b/esp32-s3-box-3/src/main.cpp index 8bc08f1..4ac44ea 100644 --- a/esp32-s3-box-3/src/main.cpp +++ b/esp32-s3-box-3/src/main.cpp @@ -1,5 +1,6 @@ #include "main.h" +#include #include #include #include @@ -8,15 +9,14 @@ #include "nvs_flash.h" extern "C" void app_main(void) { - esp_err_t ret = nvs_flash_init(); - if (ret == ESP_ERR_NVS_NO_FREE_PAGES || - ret == ESP_ERR_NVS_NEW_VERSION_FOUND) { - ESP_ERROR_CHECK(nvs_flash_erase()); - ret = nvs_flash_init(); - } - ESP_ERROR_CHECK(ret); + ESP_LOGI(LOG_TAG, "[APP] Startup.."); + ESP_LOGI(LOG_TAG, "[APP] IDF version: %s", esp_get_idf_version()); + heap_caps_print_heap_info(MALLOC_CAP_INTERNAL); + + ESP_ERROR_CHECK(nvs_flash_init()); ESP_ERROR_CHECK(esp_event_loop_create_default()); + pipecat_init_screen(); peer_init(); pipecat_init_audio_capture(); From 36ab58359b91872cfbed7688691cb80ac0dd855f Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Sat, 5 Jul 2025 20:49:24 -0700 Subject: [PATCH 02/14] temp commit; moving tasks around --- esp32-s3-box-3/src/main.cpp | 10 ++++++++- esp32-s3-box-3/src/main.h | 2 ++ esp32-s3-box-3/src/webrtc.cpp | 42 +++++++++++++++++++++++++++++------ 3 files changed, 46 insertions(+), 8 deletions(-) diff --git a/esp32-s3-box-3/src/main.cpp b/esp32-s3-box-3/src/main.cpp index 4ac44ea..fa69c17 100644 --- a/esp32-s3-box-3/src/main.cpp +++ b/esp32-s3-box-3/src/main.cpp @@ -8,6 +8,9 @@ #ifndef LINUX_BUILD #include "nvs_flash.h" +// All operations that modify the webrtc state should use this semaphore to ensure thread safety +SemaphoreHandle_t webrtcSemaphore = NULL; + extern "C" void app_main(void) { ESP_LOGI(LOG_TAG, "[APP] Startup.."); @@ -17,6 +20,8 @@ extern "C" void app_main(void) { ESP_ERROR_CHECK(nvs_flash_init()); ESP_ERROR_CHECK(esp_event_loop_create_default()); + webrtcSemaphore = xSemaphoreCreateMutex(); + pipecat_init_screen(); peer_init(); pipecat_init_audio_capture(); @@ -26,8 +31,10 @@ extern "C" void app_main(void) { pipecat_screen_system_log("Pipecat ESP32 client initialized\n"); + pipecat_webrtc_run_task(); + while (1) { - pipecat_webrtc_loop(); + // Add some stats printout to the screen here vTaskDelay(pdMS_TO_TICKS(TICK_INTERVAL)); } } @@ -38,6 +45,7 @@ int main(void) { pipecat_webrtc(); while (1) { + // Todo: test/fix linux build pipecat_webrtc_loop(); vTaskDelay(pdMS_TO_TICKS(TICK_INTERVAL)); } diff --git a/esp32-s3-box-3/src/main.h b/esp32-s3-box-3/src/main.h index 389d7ee..a5e6478 100644 --- a/esp32-s3-box-3/src/main.h +++ b/esp32-s3-box-3/src/main.h @@ -18,12 +18,14 @@ extern void pipecat_audio_decode(uint8_t *data, size_t size); // WebRTC / Signalling extern void pipecat_init_webrtc(); extern void pipecat_webrtc_loop(); +extern void pipecat_webrtc_run_task(); extern void pipecat_http_request(char *offer, char *answer); // RTVI typedef struct { void (*on_bot_started_speaking)(); void (*on_bot_stopped_speaking)(); + void (*on_bot_tts_text)(const char *text); } rtvi_callbacks_t; diff --git a/esp32-s3-box-3/src/webrtc.cpp b/esp32-s3-box-3/src/webrtc.cpp index b777b81..8602309 100644 --- a/esp32-s3-box-3/src/webrtc.cpp +++ b/esp32-s3-box-3/src/webrtc.cpp @@ -9,10 +9,15 @@ #include "main.h" +extern SemaphoreHandle_t webrtcSemaphore; + static PeerConnection *peer_connection = NULL; #ifndef LINUX_BUILD -StaticTask_t task_buffer; + +static TaskHandle_t xPcTaskHandle = NULL; +static TaskHandle_t xSendAudioTaskHandle = NULL; + void pipecat_send_audio_task(void *user_data) { pipecat_init_audio_encoder(); @@ -51,13 +56,10 @@ static void pipecat_onconnectionstatechange_task(PeerConnectionState state, #ifndef LINUX_BUILD esp_restart(); #endif - } else if (state == PEER_CONNECTION_CONNECTED) { + } else if (state == PEER_CONNECTION_COMPLETED) { #ifndef LINUX_BUILD - StackType_t *stack_memory = (StackType_t *)heap_caps_malloc( - 30000 * sizeof(StackType_t), MALLOC_CAP_SPIRAM); - xTaskCreateStaticPinnedToCore(pipecat_send_audio_task, "audio_publisher", - 30000, NULL, 7, stack_memory, &task_buffer, - 0); + // Todo: move this to a separate init block + xTaskCreatePinnedToCore(pipecat_send_audio_task, "audio_publisher", 24576, NULL, 7, &xSendAudioTaskHandle, 0); pipecat_init_rtvi(peer_connection, &pipecat_rtvi_callbacks); #endif } @@ -109,3 +111,29 @@ void pipecat_init_webrtc() { void pipecat_webrtc_loop() { peer_connection_loop(peer_connection); } + + +void peer_connection_task(void* arg) { + ESP_LOGI(LOG_TAG, "peer_connection_task started"); + + while (peer_connection == NULL) { + ESP_LOGI(LOG_TAG, "peer_connection_task waiting for peer_connection"); + vTaskDelay(pdMS_TO_TICKS(1000)); + } + + + for (;;) { + if (xSemaphoreTake(webrtcSemaphore, portMAX_DELAY)) { + peer_connection_loop(peer_connection); + xSemaphoreGive(webrtcSemaphore); + } + + vTaskDelay(pdMS_TO_TICKS(1)); + } +} + +void pipecat_webrtc_run_task() { + ESP_LOGI(LOG_TAG, "peer_connection_task starting ..."); + + xTaskCreatePinnedToCore(peer_connection_task, "peer_connection", 16384, NULL, 5, &xPcTaskHandle, 1); +} From 7af51e9b20ddfb9f181f5c8ff4b234c7742473ee Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Sun, 6 Jul 2025 13:01:07 -0700 Subject: [PATCH 03/14] more work on memory fragmentation --- esp32-s3-box-3/sdkconfig.defaults | 2 ++ esp32-s3-box-3/src/main.cpp | 13 +++++++++++++ esp32-s3-box-3/src/screen.cpp | 1 + esp32-s3-box-3/src/webrtc.cpp | 6 +++++- 4 files changed, 21 insertions(+), 1 deletion(-) diff --git a/esp32-s3-box-3/sdkconfig.defaults b/esp32-s3-box-3/sdkconfig.defaults index 99300dd..1a5c07d 100644 --- a/esp32-s3-box-3/sdkconfig.defaults +++ b/esp32-s3-box-3/sdkconfig.defaults @@ -38,3 +38,5 @@ CONFIG_CODEC_I2C_BACKWARD_COMPATIBLE=n # This removes the 64KB work_mem_int buffer from BSS CONFIG_LV_USE_BUILTIN_MALLOC=n CONFIG_LV_USE_CLIB_MALLOC=y +CONFIG_BSP_LCD_DRAW_BUF_HEIGHT=10 +CONFIG_BSP_LCD_DRAW_BUF_DOUBLE=n diff --git a/esp32-s3-box-3/src/main.cpp b/esp32-s3-box-3/src/main.cpp index fa69c17..55ae6e7 100644 --- a/esp32-s3-box-3/src/main.cpp +++ b/esp32-s3-box-3/src/main.cpp @@ -23,14 +23,27 @@ extern "C" void app_main(void) { webrtcSemaphore = xSemaphoreCreateMutex(); pipecat_init_screen(); + heap_caps_print_heap_info(MALLOC_CAP_INTERNAL); + peer_init(); + heap_caps_print_heap_info(MALLOC_CAP_INTERNAL); + pipecat_init_audio_capture(); + heap_caps_print_heap_info(MALLOC_CAP_INTERNAL); + pipecat_init_audio_decoder(); + heap_caps_print_heap_info(MALLOC_CAP_INTERNAL); + pipecat_init_wifi(); + heap_caps_print_heap_info(MALLOC_CAP_INTERNAL); + pipecat_init_webrtc(); + heap_caps_print_heap_info(MALLOC_CAP_INTERNAL); + ESP_LOGI(LOG_TAG, "Pipecat ESP32 client initialized"); pipecat_screen_system_log("Pipecat ESP32 client initialized\n"); + ESP_LOGI(LOG_TAG, "Starting webrtc task"); pipecat_webrtc_run_task(); while (1) { diff --git a/esp32-s3-box-3/src/screen.cpp b/esp32-s3-box-3/src/screen.cpp index bbd1d9b..b4b7f47 100644 --- a/esp32-s3-box-3/src/screen.cpp +++ b/esp32-s3-box-3/src/screen.cpp @@ -77,6 +77,7 @@ static void screen_task(void *pvParameter) { } void pipecat_init_screen() { + bsp_display_start(); bsp_display_backlight_on(); diff --git a/esp32-s3-box-3/src/webrtc.cpp b/esp32-s3-box-3/src/webrtc.cpp index 8602309..937e166 100644 --- a/esp32-s3-box-3/src/webrtc.cpp +++ b/esp32-s3-box-3/src/webrtc.cpp @@ -58,9 +58,13 @@ static void pipecat_onconnectionstatechange_task(PeerConnectionState state, #endif } else if (state == PEER_CONNECTION_COMPLETED) { #ifndef LINUX_BUILD + + ESP_LOGI(LOG_TAG, "Creating send audio task"); // Todo: move this to a separate init block xTaskCreatePinnedToCore(pipecat_send_audio_task, "audio_publisher", 24576, NULL, 7, &xSendAudioTaskHandle, 0); - pipecat_init_rtvi(peer_connection, &pipecat_rtvi_callbacks); + heap_caps_print_heap_info(MALLOC_CAP_INTERNAL); + + // pipecat_init_rtvi(peer_connection, &pipecat_rtvi_callbacks); #endif } } From 10b52519a788c095a34adab1a639628e2cf9b580 Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Sun, 6 Jul 2025 16:24:30 -0700 Subject: [PATCH 04/14] aec working with dummy reference data --- .gitmodules | 3 + esp32-s3-box-3/CMakeLists.txt | 2 +- esp32-s3-box-3/dependencies.lock | 66 ++++++++-- esp32-s3-box-3/esp-adf | 1 + esp32-s3-box-3/partitions.csv | 2 +- esp32-s3-box-3/sdkconfig.defaults | 3 + esp32-s3-box-3/src/CMakeLists.txt | 2 +- esp32-s3-box-3/src/main.cpp | 3 + esp32-s3-box-3/src/main.h | 1 + esp32-s3-box-3/src/media.cpp | 201 +++++++++++++++++++++++++++++- esp32-s3-box-3/src/webrtc.cpp | 9 +- 11 files changed, 273 insertions(+), 20 deletions(-) create mode 160000 esp32-s3-box-3/esp-adf diff --git a/.gitmodules b/.gitmodules index c6602c8..65f0d07 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,3 +10,6 @@ [submodule "esp32-s3-box-3/components/esp-protocols"] path = esp32-s3-box-3/components/esp-protocols url = https://github.com/espressif/esp-protocols.git +[submodule "esp32-s3-box-3/esp-adf"] + path = esp32-s3-box-3/esp-adf + url = https://github.com/espressif/esp-adf.git diff --git a/esp32-s3-box-3/CMakeLists.txt b/esp32-s3-box-3/CMakeLists.txt index 6195c59..ae931c7 100644 --- a/esp32-s3-box-3/CMakeLists.txt +++ b/esp32-s3-box-3/CMakeLists.txt @@ -25,7 +25,7 @@ endif() add_compile_definitions(PIPECAT_SMALLWEBRTC_URL="$ENV{PIPECAT_SMALLWEBRTC_URL}") set(COMPONENTS src) -set(EXTRA_COMPONENT_DIRS "src" "components/srtp" "components/peer" "components/esp-libopus") +set(EXTRA_COMPONENT_DIRS "src" "components/srtp" "components/peer" "components/esp-libopus" "esp-adf/components") if(IDF_TARGET STREQUAL linux) add_compile_definitions(LINUX_BUILD=1) diff --git a/esp32-s3-box-3/dependencies.lock b/esp32-s3-box-3/dependencies.lock index e6c6da2..d533402 100644 --- a/esp32-s3-box-3/dependencies.lock +++ b/esp32-s3-box-3/dependencies.lock @@ -1,4 +1,10 @@ dependencies: + esp_codec_dev: + dependencies: [] + source: + path: /home/khkramer/src/pipecat-esp32/esp32-s3-box-3/esp-adf/components/esp_codec_dev + type: local + version: 1.3.5 espressif/button: component_hash: f53face2ab21fa0ffaf4cf0f6e513d393f56df6586bb2ad1146120f03f19ee05 dependencies: @@ -63,16 +69,16 @@ dependencies: targets: - esp32s3 version: 3.0.0~1 - espressif/esp_codec_dev: - component_hash: c71e2d13dad6fc41561590dd88dbc45c79e3f4ef48d5ee3575c60e8b6c8e79d5 + espressif/esp-dsp: + component_hash: 619639efc18cfa361a9e423739b9b0ffc14991effc6c027f955c2f2c3bf1754b dependencies: - name: idf require: private - version: '>=4.0' + version: '>=4.2' source: - registry_url: https://components.espressif.com + registry_url: https://components.espressif.com/ type: service - version: 1.3.5 + version: 1.6.0 espressif/esp_lcd_ili9341: component_hash: 31f1b793aa2110dd2ae071c21ccbff0a4eb20d9a4ee40b6294c0dc0ad9552c4e dependencies: @@ -84,7 +90,7 @@ dependencies: require: private version: 0.* source: - registry_url: https://components.espressif.com + registry_url: https://components.espressif.com/ type: service version: 1.2.0 espressif/esp_lcd_touch: @@ -139,6 +145,16 @@ dependencies: registry_url: https://components.espressif.com type: service version: 2.6.0 + espressif/esp_websocket_client: + component_hash: f77326f0e1c38da4e9c97e17c5d649b0dd13027f2645e720e48db269638fd622 + dependencies: + - name: idf + require: private + version: '>=5.0' + source: + registry_url: https://components.espressif.com/ + type: service + version: 1.4.0 espressif/icm42670: component_hash: 28b56e174f75c70037f5208aaed6c3789f0d243500d975519584bf9dc8c0836c dependencies: @@ -149,6 +165,36 @@ dependencies: registry_url: https://components.espressif.com type: service version: 2.0.2 + espressif/jsmn: + component_hash: d80350c41bbaa827c98a25b6072df00884e72f54885996fab4a4f0aebce6b6c3 + dependencies: + - name: idf + require: private + version: '>=4.3' + source: + registry_url: https://components.espressif.com/ + type: service + version: 1.1.0 + espressif/nghttp: + component_hash: 32e513409ddce0000990ec74b0711f535462726236d72578cd70498e6bd66dc3 + dependencies: + - name: idf + require: private + version: '>=5.0' + source: + registry_url: https://components.espressif.com/ + type: service + version: 1.65.0 + espressif/zlib: + component_hash: d901723af51f13fc8e5824f39f32239c847956e8fd951a05266588dc5cfbb9ae + dependencies: + - name: idf + require: private + version: '>=4.4' + source: + registry_url: https://components.espressif.com/ + type: service + version: 1.3.1 idf: source: type: idf @@ -162,7 +208,13 @@ dependencies: version: 9.3.0 direct_dependencies: - espressif/esp-box-3 +- espressif/esp-dsp +- espressif/esp_lcd_ili9341 +- espressif/esp_websocket_client +- espressif/jsmn +- espressif/nghttp +- espressif/zlib - idf -manifest_hash: 55f6a702498de8e87201da504a38c5c958e2094ec832c4f8d6d368369d5d0625 +manifest_hash: 1891dc87058c6eec7aa77f286a1967ebe7c293709db8da5b2f1e79d2e45f57ef target: esp32s3 version: 2.0.0 diff --git a/esp32-s3-box-3/esp-adf b/esp32-s3-box-3/esp-adf new file mode 160000 index 0000000..1e839af --- /dev/null +++ b/esp32-s3-box-3/esp-adf @@ -0,0 +1 @@ +Subproject commit 1e839afb5a5166aad6028ac2796272f32454dced diff --git a/esp32-s3-box-3/partitions.csv b/esp32-s3-box-3/partitions.csv index 03d5d13..6bfa8e4 100644 --- a/esp32-s3-box-3/partitions.csv +++ b/esp32-s3-box-3/partitions.csv @@ -2,5 +2,5 @@ # Name, Type, SubType, Offset, Size, Flags nvs, data, nvs, 0x9000, 0x6000, phy_init, data, phy, 0xf000, 0x1000, -factory, app, factory, 0x10000, 0x180000, +factory, app, factory, 0x10000, 0x200000, diff --git a/esp32-s3-box-3/sdkconfig.defaults b/esp32-s3-box-3/sdkconfig.defaults index 1a5c07d..9abc65a 100644 --- a/esp32-s3-box-3/sdkconfig.defaults +++ b/esp32-s3-box-3/sdkconfig.defaults @@ -18,6 +18,9 @@ CONFIG_ESP_MAIN_TASK_STACK_SIZE=16384 # Defaults to partitions.csv CONFIG_PARTITION_TABLE_CUSTOM=y +# Set flash size to 16MB for ESP32-S3-Box-3 +CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y + # Set highest CPU Freq CONFIG_ESP_DEFAULT_CPU_FREQ_MHZ_240=y diff --git a/esp32-s3-box-3/src/CMakeLists.txt b/esp32-s3-box-3/src/CMakeLists.txt index da4e1e7..cc4ab61 100644 --- a/esp32-s3-box-3/src/CMakeLists.txt +++ b/esp32-s3-box-3/src/CMakeLists.txt @@ -7,7 +7,7 @@ if(IDF_TARGET STREQUAL linux) else() idf_component_register( SRCS ${COMMON_SRC} "wifi.cpp" "media.cpp" "rtvi.cpp" "rtvi_callbacks.cpp" "screen.cpp" - REQUIRES driver esp_wifi nvs_flash peer esp_psram esp-libopus esp_http_client json lvgl) + REQUIRES driver esp_wifi nvs_flash peer esp_psram esp-libopus esp_http_client json lvgl esp-sr) endif() idf_component_get_property(lib peer COMPONENT_LIB) diff --git a/esp32-s3-box-3/src/main.cpp b/esp32-s3-box-3/src/main.cpp index 55ae6e7..2d18090 100644 --- a/esp32-s3-box-3/src/main.cpp +++ b/esp32-s3-box-3/src/main.cpp @@ -34,6 +34,9 @@ extern "C" void app_main(void) { pipecat_init_audio_decoder(); heap_caps_print_heap_info(MALLOC_CAP_INTERNAL); + pipecat_init_aec(); + heap_caps_print_heap_info(MALLOC_CAP_INTERNAL); + pipecat_init_wifi(); heap_caps_print_heap_info(MALLOC_CAP_INTERNAL); diff --git a/esp32-s3-box-3/src/main.h b/esp32-s3-box-3/src/main.h index a5e6478..5797b52 100644 --- a/esp32-s3-box-3/src/main.h +++ b/esp32-s3-box-3/src/main.h @@ -14,6 +14,7 @@ extern void pipecat_init_audio_decoder(); extern void pipecat_init_audio_encoder(); extern void pipecat_send_audio(PeerConnection *peer_connection); extern void pipecat_audio_decode(uint8_t *data, size_t size); +extern void pipecat_init_aec(); // WebRTC / Signalling extern void pipecat_init_webrtc(); diff --git a/esp32-s3-box-3/src/media.cpp b/esp32-s3-box-3/src/media.cpp index ba95185..f628845 100644 --- a/esp32-s3-box-3/src/media.cpp +++ b/esp32-s3-box-3/src/media.cpp @@ -3,18 +3,29 @@ #include #include +#include #include "esp_log.h" #include "main.h" +#include "esp_afe_aec.h" + + #define SAMPLE_RATE (16000) #define OPUS_BUFFER_SIZE 1276 // 1276 bytes is recommended by opus_encode -#define PCM_BUFFER_SIZE 640 +#define AEC_CHUNK_SIZE 256 // AEC processes 256 samples at a time for 16kHz +#define OPUS_FRAME_SIZE 320 // Opus frame size in samples +#define COMMON_BUFFER_SAMPLES 1280 // LCM(256, 320) = 1280 samples +#define AEC_CHUNKS_TO_BATCH (COMMON_BUFFER_SAMPLES / AEC_CHUNK_SIZE) // 5 chunks +#define OPUS_FRAMES_TO_BATCH (COMMON_BUFFER_SAMPLES / OPUS_FRAME_SIZE) // 4 frames +#define PCM_BUFFER_SIZE (COMMON_BUFFER_SAMPLES * sizeof(int16_t)) // 2560 bytes #define OPUS_ENCODER_BITRATE 30000 #define OPUS_ENCODER_COMPLEXITY 0 + + std::atomic is_playing = false; void set_is_playing(int16_t *in_buf, size_t in_samples) { bool any_set = false; @@ -78,6 +89,16 @@ OpusEncoder *opus_encoder = NULL; uint8_t *encoder_output_buffer = NULL; uint8_t *read_buffer = NULL; +// Timing statistics +static uint64_t total_read_time = 0; +static uint64_t total_aec_time = 0; +static uint64_t total_encode_time = 0; +static uint64_t total_send_time = 0; +static uint32_t cycle_count = 0; + +// AEC processed buffer +static int16_t *processed_buffer = NULL; + void pipecat_init_audio_encoder() { int encoder_error; opus_encoder = opus_encoder_create(SAMPLE_RATE, 1, OPUS_APPLICATION_VOIP, @@ -100,22 +121,190 @@ void pipecat_init_audio_encoder() { read_buffer = (uint8_t *)heap_caps_malloc(PCM_BUFFER_SIZE, MALLOC_CAP_DEFAULT); encoder_output_buffer = (uint8_t *)malloc(OPUS_BUFFER_SIZE); + processed_buffer = (int16_t *)heap_caps_malloc(PCM_BUFFER_SIZE, MALLOC_CAP_DEFAULT); } + +typedef struct { + int task_stack; /*!< Task stack size */ + int task_prio; /*!< Task peroid */ + int task_core; /*!< The core that task to be created */ + bool debug_aec; /*!< debug AEC input data */ + bool stack_in_ext; /*!< Try to allocate stack in external memory */ + afe_type_t type; /*!< The type of afe, AFE_TYPE_SR , AFE_TYPE_VC , AFE_TYPE_VC_8K */ + afe_mode_t mode; /*!< The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF */ + int filter_length; /*!< The filter length of aec */ + char *input_format; /*!< The input format is as follows: For example, 'MR', 'RMNM'. + 'M' means data from microphone, 'R' means data from reference data, 'N' means no data.. */ +} aec_stream_cfg_t; + +static int16_t *aec_in_buffer = NULL; +static int16_t *aec_out_buffer = NULL; +static afe_aec_handle_t *aec_handle = NULL; + +void pipecat_init_aec() { + // Try a simpler approach - create AEC without the task configuration struct + ESP_LOGI(LOG_TAG, "Initializing AEC with simple configuration"); + + // Create AEC with basic parameters (filter_length=4 recommended for ESP32S3) + aec_handle = afe_aec_create("MR", 4, AFE_TYPE_VC, AFE_MODE_LOW_COST); + if (aec_handle == NULL) { + ESP_LOGE(LOG_TAG, "Failed to create AEC"); + return; + } + + ESP_LOGI(LOG_TAG, "AEC created successfully"); + + int frame_size = afe_aec_get_chunksize(aec_handle); + int input_size = frame_size * aec_handle->pcm_config.total_ch_num * sizeof(int16_t); + int output_size = frame_size * sizeof(int16_t); + + // Allocate buffers in PSRAM instead of internal memory + aec_in_buffer = (int16_t *)heap_caps_aligned_calloc(16, 1, input_size, MALLOC_CAP_SPIRAM); + aec_out_buffer = (int16_t *)heap_caps_aligned_calloc(16, 1, output_size, MALLOC_CAP_SPIRAM); + + if (aec_in_buffer == NULL || aec_out_buffer == NULL) { + ESP_LOGE(LOG_TAG, "Failed to allocate AEC buffers"); + // Try internal memory as fallback + if (aec_in_buffer) heap_caps_free(aec_in_buffer); + if (aec_out_buffer) heap_caps_free(aec_out_buffer); + + aec_in_buffer = (int16_t *)heap_caps_aligned_calloc(16, 1, input_size, MALLOC_CAP_INTERNAL); + aec_out_buffer = (int16_t *)heap_caps_aligned_calloc(16, 1, output_size, MALLOC_CAP_INTERNAL); + + if (aec_in_buffer == NULL || aec_out_buffer == NULL) { + ESP_LOGE(LOG_TAG, "Failed to allocate AEC buffers even in internal memory"); + return; + } + ESP_LOGW(LOG_TAG, "AEC buffers allocated in internal memory"); + } + + ESP_LOGI(LOG_TAG, "AEC frame size: %d, input size: %d, output size: %d", frame_size, input_size, output_size); + ESP_LOGI(LOG_TAG, "AEC buffers allocated: in=%p, out=%p", aec_in_buffer, aec_out_buffer); +} + + void pipecat_send_audio(PeerConnection *peer_connection) { + uint64_t start_time, end_time; + + // Check if AEC is initialized + if (aec_handle == NULL) { + // AEC disabled, bypass processing + // ESP_LOGW(LOG_TAG, "AEC not initialized, bypassing"); + } + + // Measure esp_codec_dev_read time + start_time = esp_timer_get_time(); if (esp_codec_dev_read(mic_codec_dev, read_buffer, PCM_BUFFER_SIZE) != ESP_OK) { printf("esp_codec_dev_read failed"); return; } + end_time = esp_timer_get_time(); + total_read_time += (end_time - start_time); if (is_playing) { memset(read_buffer, 0, PCM_BUFFER_SIZE); } - auto encoded_size = opus_encode(opus_encoder, (const opus_int16 *)read_buffer, - PCM_BUFFER_SIZE / sizeof(uint16_t), - encoder_output_buffer, OPUS_BUFFER_SIZE); - peer_connection_send_audio(peer_connection, encoder_output_buffer, - encoded_size); + // AEC processing (skip if disabled) + if (aec_handle != NULL) { + // Get AEC chunk size (should be 256 samples for 16kHz) + int aec_chunk_size = afe_aec_get_chunksize(aec_handle); + int total_samples = PCM_BUFFER_SIZE / sizeof(uint16_t); // 640 samples + int num_chunks = total_samples / aec_chunk_size; // Should be 2.5, so we'll process 2 chunks + + // Debug logging + static bool first_run = true; + if (first_run) { + ESP_LOGI(LOG_TAG, "AEC debug: chunk_size=%d, total_samples=%d, num_chunks=%d", + aec_chunk_size, total_samples, num_chunks); + ESP_LOGI(LOG_TAG, "AEC debug: total_ch_num=%d, sample_rate=%d", + aec_handle->pcm_config.total_ch_num, aec_handle->pcm_config.sample_rate); + ESP_LOGI(LOG_TAG, "AEC debug: input format=%s, mic_num=%d, ref_num=%d", + "MR", aec_handle->pcm_config.mic_num, aec_handle->pcm_config.ref_num); + first_run = false; + } + + // Process all chunks + if (num_chunks != AEC_CHUNKS_TO_BATCH) { + ESP_LOGW(LOG_TAG, "Unexpected number of AEC chunks: %d (expected %d)", num_chunks, AEC_CHUNKS_TO_BATCH); + } + + // Process AEC in chunks + for (int chunk = 0; chunk < num_chunks; chunk++) { + // Prepare input: interleave mic data with zeros (reference channel) + int16_t *mic_ptr = (int16_t *)read_buffer + (chunk * aec_chunk_size); + + // Format: MRMRMR... (interleaved - mic and ref alternating) + for (int i = 0; i < aec_chunk_size; i++) { + aec_in_buffer[i * 2] = mic_ptr[i]; // Microphone channel + aec_in_buffer[i * 2 + 1] = 0; // Reference channel (zeros for now) + } + + // Run AEC process for this chunk + start_time = esp_timer_get_time(); + int ret = afe_aec_process(aec_handle, aec_in_buffer, aec_out_buffer); + end_time = esp_timer_get_time(); + total_aec_time += (end_time - start_time); + + if (ret < 0) { + ESP_LOGE(LOG_TAG, "afe_aec_process failed: return value %d (0x%x) at chunk %d", + ret, ret, chunk); + // Fall back to bypass on error + memcpy(processed_buffer, read_buffer, PCM_BUFFER_SIZE); + return; + } + // Log success on first chunk of first run + if (first_run && chunk == 0) { + ESP_LOGI(LOG_TAG, "AEC process successful, returned %d bytes", ret); + } + + // Copy AEC output to processed buffer + memcpy(processed_buffer + (chunk * aec_chunk_size), aec_out_buffer, aec_chunk_size * sizeof(int16_t)); + } + + // All samples have been processed through AEC + } else { + // AEC disabled, just copy input to processed buffer + memcpy(processed_buffer, read_buffer, PCM_BUFFER_SIZE); + } + + // Process Opus frames - we have exactly COMMON_BUFFER_SAMPLES (1280) samples + // This encodes to exactly OPUS_FRAMES_TO_BATCH (4) frames + for (int frame = 0; frame < OPUS_FRAMES_TO_BATCH; frame++) { + // Measure opus_encode time + start_time = esp_timer_get_time(); + auto encoded_size = opus_encode(opus_encoder, + processed_buffer + (frame * OPUS_FRAME_SIZE), + OPUS_FRAME_SIZE, + encoder_output_buffer, OPUS_BUFFER_SIZE); + end_time = esp_timer_get_time(); + total_encode_time += (end_time - start_time); + + // Measure peer_connection_send_audio time + start_time = esp_timer_get_time(); + peer_connection_send_audio(peer_connection, encoder_output_buffer, + encoded_size); + end_time = esp_timer_get_time(); + total_send_time += (end_time - start_time); + } + + // Update cycle count and print statistics every 50 cycles + cycle_count++; + if (cycle_count % 50 == 0) { + uint64_t avg_read_time = total_read_time / 50; + uint64_t avg_aec_time = total_aec_time / 50; + uint64_t avg_encode_time = total_encode_time / 50; + uint64_t avg_send_time = total_send_time / 50; + + ESP_LOGI(LOG_TAG, "Audio timing stats (avg over 50 cycles): read=%llu us, aec=%llu us, encode=%llu us, send=%llu us", + avg_read_time, avg_aec_time, avg_encode_time, avg_send_time); + + // Reset counters for next 50 cycles + total_read_time = 0; + total_aec_time = 0; + total_encode_time = 0; + total_send_time = 0; + } } diff --git a/esp32-s3-box-3/src/webrtc.cpp b/esp32-s3-box-3/src/webrtc.cpp index 937e166..8172098 100644 --- a/esp32-s3-box-3/src/webrtc.cpp +++ b/esp32-s3-box-3/src/webrtc.cpp @@ -23,7 +23,7 @@ void pipecat_send_audio_task(void *user_data) { while (1) { pipecat_send_audio(peer_connection); - vTaskDelay(pdMS_TO_TICKS(TICK_INTERVAL)); + vTaskDelay(pdMS_TO_TICKS(1)); } } #endif @@ -61,10 +61,11 @@ static void pipecat_onconnectionstatechange_task(PeerConnectionState state, ESP_LOGI(LOG_TAG, "Creating send audio task"); // Todo: move this to a separate init block - xTaskCreatePinnedToCore(pipecat_send_audio_task, "audio_publisher", 24576, NULL, 7, &xSendAudioTaskHandle, 0); + // Why is this stack size so large? Opus encoder? + xTaskCreatePinnedToCore(pipecat_send_audio_task, "audio_publisher", 24576, NULL, 7, &xSendAudioTaskHandle, 1); heap_caps_print_heap_info(MALLOC_CAP_INTERNAL); - // pipecat_init_rtvi(peer_connection, &pipecat_rtvi_callbacks); + pipecat_init_rtvi(peer_connection, &pipecat_rtvi_callbacks); #endif } } @@ -139,5 +140,5 @@ void peer_connection_task(void* arg) { void pipecat_webrtc_run_task() { ESP_LOGI(LOG_TAG, "peer_connection_task starting ..."); - xTaskCreatePinnedToCore(peer_connection_task, "peer_connection", 16384, NULL, 5, &xPcTaskHandle, 1); + xTaskCreatePinnedToCore(peer_connection_task, "peer_connection", 16384, NULL, 5, &xPcTaskHandle, 0); } From f0b36941cad4e2c66e799cfa741aca1ea6b38b1a Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Sun, 6 Jul 2025 16:40:41 -0700 Subject: [PATCH 05/14] working aec --- esp32-s3-box-3/src/media.cpp | 92 +++++++++++++++++++++++++++++++++--- 1 file changed, 85 insertions(+), 7 deletions(-) diff --git a/esp32-s3-box-3/src/media.cpp b/esp32-s3-box-3/src/media.cpp index f628845..27332a8 100644 --- a/esp32-s3-box-3/src/media.cpp +++ b/esp32-s3-box-3/src/media.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include "esp_log.h" #include "main.h" @@ -59,6 +60,10 @@ void pipecat_init_audio_capture() { opus_int16 *decoder_buffer = NULL; OpusDecoder *opus_decoder = NULL; +// Ring buffer for speaker reference data (declare before use) +static RingbufHandle_t speaker_ringbuf = NULL; +static const size_t SPEAKER_RINGBUF_SIZE = 8192; // ~3.2 read cycles at 1280 samples/read + void pipecat_init_audio_decoder() { int decoder_error = 0; opus_decoder = opus_decoder_create(SAMPLE_RATE, 1, &decoder_error); @@ -77,11 +82,35 @@ void pipecat_audio_decode(uint8_t *data, size_t size) { if (decoded_size > 0) { set_is_playing(decoder_buffer, decoded_size); + + // Send to speaker if ((ret = esp_codec_dev_write(spk_codec_dev, decoder_buffer, decoded_size * sizeof(uint16_t))) != ESP_OK) { ESP_LOGE(LOG_TAG, "esp_codec_dev_write failed: %s", esp_err_to_name(ret)); } + + // Also send to ring buffer for AEC reference + if (speaker_ringbuf != NULL) { + BaseType_t ret = xRingbufferSend(speaker_ringbuf, decoder_buffer, + decoded_size * sizeof(int16_t), 0); + if (ret != pdTRUE) { + // Ring buffer full, this is expected occasionally + static int overflow_count = 0; + if (++overflow_count % 100 == 0) { + ESP_LOGW(LOG_TAG, "Speaker ring buffer overflow count: %d", overflow_count); + } + } else { + // Log successful writes occasionally + static int write_count = 0; + if (++write_count % 50 == 0) { + ESP_LOGI(LOG_TAG, "Speaker ring buffer writes: %d, size: %d bytes", + write_count, decoded_size * sizeof(int16_t)); + } + } + } else { + ESP_LOGW(LOG_TAG, "Speaker ring buffer is NULL!"); + } } } @@ -146,6 +175,14 @@ void pipecat_init_aec() { // Try a simpler approach - create AEC without the task configuration struct ESP_LOGI(LOG_TAG, "Initializing AEC with simple configuration"); + // Create ring buffer for speaker reference data + speaker_ringbuf = xRingbufferCreate(SPEAKER_RINGBUF_SIZE, RINGBUF_TYPE_BYTEBUF); + if (speaker_ringbuf == NULL) { + ESP_LOGE(LOG_TAG, "Failed to create speaker ring buffer"); + return; + } + ESP_LOGI(LOG_TAG, "Speaker ring buffer created, size=%d", SPEAKER_RINGBUF_SIZE); + // Create AEC with basic parameters (filter_length=4 recommended for ESP32S3) aec_handle = afe_aec_create("MR", 4, AFE_TYPE_VC, AFE_MODE_LOW_COST); if (aec_handle == NULL) { @@ -203,9 +240,10 @@ void pipecat_send_audio(PeerConnection *peer_connection) { end_time = esp_timer_get_time(); total_read_time += (end_time - start_time); - if (is_playing) { - memset(read_buffer, 0, PCM_BUFFER_SIZE); - } + // Don't zero out microphone input - let AEC handle echo cancellation + // if (is_playing) { + // memset(read_buffer, 0, PCM_BUFFER_SIZE); + // } // AEC processing (skip if disabled) if (aec_handle != NULL) { @@ -236,10 +274,50 @@ void pipecat_send_audio(PeerConnection *peer_connection) { // Prepare input: interleave mic data with zeros (reference channel) int16_t *mic_ptr = (int16_t *)read_buffer + (chunk * aec_chunk_size); - // Format: MRMRMR... (interleaved - mic and ref alternating) - for (int i = 0; i < aec_chunk_size; i++) { - aec_in_buffer[i * 2] = mic_ptr[i]; // Microphone channel - aec_in_buffer[i * 2 + 1] = 0; // Reference channel (zeros for now) + // Get reference data from speaker ring buffer + size_t ref_bytes_needed = aec_chunk_size * sizeof(int16_t); + size_t ref_bytes_available = 0; + int16_t *ref_data = NULL; + + if (speaker_ringbuf != NULL) { + ref_data = (int16_t *)xRingbufferReceive(speaker_ringbuf, &ref_bytes_available, 0); + } else { + static bool logged_null = false; + if (!logged_null) { + ESP_LOGW(LOG_TAG, "Speaker ring buffer is NULL during read!"); + logged_null = true; + } + } + + if (ref_data != NULL && ref_bytes_available >= ref_bytes_needed) { + // Use actual speaker reference data + for (int i = 0; i < aec_chunk_size; i++) { + aec_in_buffer[i * 2] = mic_ptr[i]; // Microphone channel + aec_in_buffer[i * 2 + 1] = ref_data[i]; // Reference channel (actual speaker data) + } + // Return item to ring buffer + vRingbufferReturnItem(speaker_ringbuf, ref_data); + + // Log successful reads occasionally + static int ref_success_count = 0; + if (++ref_success_count % 50 == 0) { + ESP_LOGI(LOG_TAG, "Using speaker reference data, count: %d, bytes: %d", + ref_success_count, ref_bytes_available); + } + } else { + // No reference data available, use zeros + for (int i = 0; i < aec_chunk_size; i++) { + aec_in_buffer[i * 2] = mic_ptr[i]; // Microphone channel + aec_in_buffer[i * 2 + 1] = 0; // Reference channel (zeros as fallback) + } + if (ref_data != NULL) { + vRingbufferReturnItem(speaker_ringbuf, ref_data); + } + // Log when we don't have reference data (only occasionally to avoid spam) + static int no_ref_count = 0; + if (++no_ref_count % 100 == 0) { + ESP_LOGW(LOG_TAG, "No speaker reference data available count: %d", no_ref_count); + } } // Run AEC process for this chunk From 3dad580ae9f5775c84495cd0b7a332a9d1cd8fa1 Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Sun, 6 Jul 2025 17:15:31 -0700 Subject: [PATCH 06/14] circular buffer --- esp32-s3-box-3/src/media.cpp | 200 ++++++++++++++++++++++++++++------- 1 file changed, 162 insertions(+), 38 deletions(-) diff --git a/esp32-s3-box-3/src/media.cpp b/esp32-s3-box-3/src/media.cpp index 27332a8..6fb8545 100644 --- a/esp32-s3-box-3/src/media.cpp +++ b/esp32-s3-box-3/src/media.cpp @@ -25,6 +25,9 @@ #define OPUS_ENCODER_BITRATE 30000 #define OPUS_ENCODER_COMPLEXITY 0 +// Forward declare for use in set_is_playing +static bool ref_buffer_primed = false; + std::atomic is_playing = false; @@ -35,7 +38,14 @@ void set_is_playing(int16_t *in_buf, size_t in_samples) { any_set = true; } } + bool was_playing = is_playing.load(); is_playing = any_set; + + // Reset buffer priming when playback stops + if (was_playing && !any_set) { + ref_buffer_primed = false; + ESP_LOGD(LOG_TAG, "Playback stopped, reset reference buffer priming"); + } } esp_codec_dev_handle_t mic_codec_dev = NULL; @@ -60,9 +70,15 @@ void pipecat_init_audio_capture() { opus_int16 *decoder_buffer = NULL; OpusDecoder *opus_decoder = NULL; -// Ring buffer for speaker reference data (declare before use) -static RingbufHandle_t speaker_ringbuf = NULL; -static const size_t SPEAKER_RINGBUF_SIZE = 8192; // ~3.2 read cycles at 1280 samples/read +// Circular buffer for speaker reference data (not using FreeRTOS ring buffer) +static int16_t* speaker_circular_buffer = NULL; +static const size_t SPEAKER_BUFFER_SAMPLES = 4096; // 4096 samples = 8192 bytes +static const size_t MIN_REF_BUFFER_SAMPLES = 2560; // 2 read cycles worth to prevent underruns +static volatile size_t speaker_write_idx = 0; +static volatile size_t speaker_read_idx = 0; +static volatile size_t speaker_samples_available = 0; +static portMUX_TYPE speaker_buffer_mutex = portMUX_INITIALIZER_UNLOCKED; +// ref_buffer_primed already declared above for use in set_is_playing void pipecat_init_audio_decoder() { int decoder_error = 0; @@ -76,6 +92,18 @@ void pipecat_init_audio_decoder() { } void pipecat_audio_decode(uint8_t *data, size_t size) { + // Log packet arrival timing + static uint64_t last_decode_time = 0; + uint64_t now = esp_timer_get_time(); + if (last_decode_time > 0) { + uint64_t delta_ms = (now - last_decode_time) / 1000; + static int packet_count = 0; + if (++packet_count % 10 == 0) { + ESP_LOGI(LOG_TAG, "Audio packet %d: size=%d, delta=%llu ms", packet_count, size, delta_ms); + } + } + last_decode_time = now; + esp_err_t ret; int decoded_size = opus_decode(opus_decoder, data, size, decoder_buffer, PCM_BUFFER_SIZE, 0); @@ -90,26 +118,55 @@ void pipecat_audio_decode(uint8_t *data, size_t size) { ESP_LOGE(LOG_TAG, "esp_codec_dev_write failed: %s", esp_err_to_name(ret)); } - // Also send to ring buffer for AEC reference - if (speaker_ringbuf != NULL) { - BaseType_t ret = xRingbufferSend(speaker_ringbuf, decoder_buffer, - decoded_size * sizeof(int16_t), 0); - if (ret != pdTRUE) { - // Ring buffer full, this is expected occasionally - static int overflow_count = 0; - if (++overflow_count % 100 == 0) { - ESP_LOGW(LOG_TAG, "Speaker ring buffer overflow count: %d", overflow_count); + // Also send to circular buffer for AEC reference + if (speaker_circular_buffer != NULL) { + // Variables for logging outside critical section + bool log_write = false; + bool log_overflow = false; + int write_count_log = 0; + int overflow_count_log = 0; + size_t samples_available_log = 0; + + portENTER_CRITICAL(&speaker_buffer_mutex); + + // Check if we have space + size_t space_available = SPEAKER_BUFFER_SAMPLES - speaker_samples_available; + if (space_available >= decoded_size) { + // Copy samples to circular buffer + for (int i = 0; i < decoded_size; i++) { + speaker_circular_buffer[speaker_write_idx] = decoder_buffer[i]; + speaker_write_idx = (speaker_write_idx + 1) % SPEAKER_BUFFER_SAMPLES; } - } else { - // Log successful writes occasionally + speaker_samples_available += decoded_size; + + // Prepare logging info static int write_count = 0; if (++write_count % 50 == 0) { - ESP_LOGI(LOG_TAG, "Speaker ring buffer writes: %d, size: %d bytes", - write_count, decoded_size * sizeof(int16_t)); + log_write = true; + write_count_log = write_count; + samples_available_log = speaker_samples_available; + } + } else { + // Buffer overflow + static int overflow_count = 0; + if (++overflow_count % 100 == 0) { + log_overflow = true; + overflow_count_log = overflow_count; } } + + portEXIT_CRITICAL(&speaker_buffer_mutex); + + // Do logging outside critical section + if (log_write) { + ESP_LOGI(LOG_TAG, "Speaker buffer writes: %d, samples: %d, available: %d", + write_count_log, decoded_size, samples_available_log); + } + if (log_overflow) { + ESP_LOGW(LOG_TAG, "Speaker buffer overflow count: %d", overflow_count_log); + } } else { - ESP_LOGW(LOG_TAG, "Speaker ring buffer is NULL!"); + ESP_LOGW(LOG_TAG, "Speaker circular buffer is NULL!"); } } } @@ -175,13 +232,13 @@ void pipecat_init_aec() { // Try a simpler approach - create AEC without the task configuration struct ESP_LOGI(LOG_TAG, "Initializing AEC with simple configuration"); - // Create ring buffer for speaker reference data - speaker_ringbuf = xRingbufferCreate(SPEAKER_RINGBUF_SIZE, RINGBUF_TYPE_BYTEBUF); - if (speaker_ringbuf == NULL) { - ESP_LOGE(LOG_TAG, "Failed to create speaker ring buffer"); + // Create circular buffer for speaker reference data + speaker_circular_buffer = (int16_t *)heap_caps_malloc(SPEAKER_BUFFER_SAMPLES * sizeof(int16_t), MALLOC_CAP_SPIRAM); + if (speaker_circular_buffer == NULL) { + ESP_LOGE(LOG_TAG, "Failed to allocate speaker circular buffer"); return; } - ESP_LOGI(LOG_TAG, "Speaker ring buffer created, size=%d", SPEAKER_RINGBUF_SIZE); + ESP_LOGI(LOG_TAG, "Speaker circular buffer created, size=%d samples", SPEAKER_BUFFER_SAMPLES); // Create AEC with basic parameters (filter_length=4 recommended for ESP32S3) aec_handle = afe_aec_create("MR", 4, AFE_TYPE_VC, AFE_MODE_LOW_COST); @@ -274,35 +331,105 @@ void pipecat_send_audio(PeerConnection *peer_connection) { // Prepare input: interleave mic data with zeros (reference channel) int16_t *mic_ptr = (int16_t *)read_buffer + (chunk * aec_chunk_size); - // Get reference data from speaker ring buffer - size_t ref_bytes_needed = aec_chunk_size * sizeof(int16_t); - size_t ref_bytes_available = 0; - int16_t *ref_data = NULL; + // Get reference data from speaker circular buffer + bool have_ref_data = false; + int16_t ref_chunk[256]; // Temporary buffer for reference data - if (speaker_ringbuf != NULL) { - ref_data = (int16_t *)xRingbufferReceive(speaker_ringbuf, &ref_bytes_available, 0); + if (speaker_circular_buffer != NULL) { + // Variables for logging outside critical section + bool log_buffer_state = false; + bool log_primed = false; + bool log_underrun = false; + size_t samples_for_log = 0; + int underrun_type = 0; // 0=none, 1=remaining chunks, 2=current chunk + size_t needed_for_log = 0; + + portENTER_CRITICAL(&speaker_buffer_mutex); + + // Debug log buffer state occasionally + static int buffer_check_count = 0; + if (++buffer_check_count % 100 == 0) { + log_buffer_state = true; + samples_for_log = speaker_samples_available; + } + + // Only start using reference data if we have enough buffered + if (!ref_buffer_primed && speaker_samples_available >= MIN_REF_BUFFER_SAMPLES) { + ref_buffer_primed = true; + log_primed = true; + samples_for_log = speaker_samples_available; + } + + // Only try to get reference data if buffer is primed + if (ref_buffer_primed) { + // Check if we have enough samples for this chunk + if (speaker_samples_available >= aec_chunk_size) { + // Read samples from circular buffer + for (int i = 0; i < aec_chunk_size; i++) { + ref_chunk[i] = speaker_circular_buffer[speaker_read_idx]; + speaker_read_idx = (speaker_read_idx + 1) % SPEAKER_BUFFER_SAMPLES; + } + speaker_samples_available -= aec_chunk_size; + have_ref_data = true; + + // Check if we're running too low for remaining chunks + int remaining_chunks = num_chunks - chunk - 1; + size_t remaining_samples_needed = remaining_chunks * aec_chunk_size; + if (speaker_samples_available < remaining_samples_needed) { + ref_buffer_primed = false; + log_underrun = true; + underrun_type = 1; + samples_for_log = speaker_samples_available; + needed_for_log = remaining_samples_needed; + } + } else { + // Not enough samples, unprime + ref_buffer_primed = false; + log_underrun = true; + underrun_type = 2; + samples_for_log = speaker_samples_available; + needed_for_log = aec_chunk_size; + } + } + + portEXIT_CRITICAL(&speaker_buffer_mutex); + + // Do logging outside critical section + if (log_buffer_state) { + ESP_LOGI(LOG_TAG, "Circular buffer state: %d samples available, chunk %d/%d", + samples_for_log, chunk, num_chunks); + } + if (log_primed) { + ESP_LOGI(LOG_TAG, "Reference buffer primed with %d samples", samples_for_log); + } + if (log_underrun) { + if (underrun_type == 1) { + ESP_LOGW(LOG_TAG, "Reference buffer underrun (chunk %d/%d, need %d samples for remaining, have %d)", + chunk, num_chunks, needed_for_log, samples_for_log); + } else { + ESP_LOGW(LOG_TAG, "Reference buffer underrun (chunk %d/%d, need %d samples for current, have %d)", + chunk, num_chunks, needed_for_log, samples_for_log); + } + } } else { static bool logged_null = false; if (!logged_null) { - ESP_LOGW(LOG_TAG, "Speaker ring buffer is NULL during read!"); + ESP_LOGW(LOG_TAG, "Speaker circular buffer is NULL during read!"); logged_null = true; } } - if (ref_data != NULL && ref_bytes_available >= ref_bytes_needed) { + if (have_ref_data) { // Use actual speaker reference data for (int i = 0; i < aec_chunk_size; i++) { aec_in_buffer[i * 2] = mic_ptr[i]; // Microphone channel - aec_in_buffer[i * 2 + 1] = ref_data[i]; // Reference channel (actual speaker data) + aec_in_buffer[i * 2 + 1] = ref_chunk[i]; // Reference channel (actual speaker data) } - // Return item to ring buffer - vRingbufferReturnItem(speaker_ringbuf, ref_data); // Log successful reads occasionally static int ref_success_count = 0; if (++ref_success_count % 50 == 0) { - ESP_LOGI(LOG_TAG, "Using speaker reference data, count: %d, bytes: %d", - ref_success_count, ref_bytes_available); + ESP_LOGI(LOG_TAG, "Using speaker reference data, count: %d", ref_success_count); } } else { // No reference data available, use zeros @@ -310,9 +437,6 @@ void pipecat_send_audio(PeerConnection *peer_connection) { aec_in_buffer[i * 2] = mic_ptr[i]; // Microphone channel aec_in_buffer[i * 2 + 1] = 0; // Reference channel (zeros as fallback) } - if (ref_data != NULL) { - vRingbufferReturnItem(speaker_ringbuf, ref_data); - } // Log when we don't have reference data (only occasionally to avoid spam) static int no_ref_count = 0; if (++no_ref_count % 100 == 0) { From 17537741485e2723d4027e8a233c832af7251c08 Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Sun, 6 Jul 2025 17:25:04 -0700 Subject: [PATCH 07/14] elapsed time timing for audio producer task --- esp32-s3-box-3/src/webrtc.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/esp32-s3-box-3/src/webrtc.cpp b/esp32-s3-box-3/src/webrtc.cpp index 8172098..2fb1c78 100644 --- a/esp32-s3-box-3/src/webrtc.cpp +++ b/esp32-s3-box-3/src/webrtc.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include "main.h" @@ -22,8 +23,19 @@ void pipecat_send_audio_task(void *user_data) { pipecat_init_audio_encoder(); while (1) { + uint64_t start_time = esp_timer_get_time(); pipecat_send_audio(peer_connection); - vTaskDelay(pdMS_TO_TICKS(1)); + uint64_t end_time = esp_timer_get_time(); + uint64_t elapsed_ms = (end_time - start_time) / 1000; + + // We process 1280 samples at 16kHz = 80ms of audio + // Delay for the remainder of the 80ms period + if (elapsed_ms < 80) { + vTaskDelay(pdMS_TO_TICKS(80 - elapsed_ms)); + } else { + // If processing took longer than 80ms, yield to other tasks + vTaskDelay(pdMS_TO_TICKS(1)); + } } } #endif From b35fba0d30d9eba073481502b0454fdbf04cc4a2 Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Sun, 6 Jul 2025 17:51:53 -0700 Subject: [PATCH 08/14] introduce reference sample delay --- esp32-s3-box-3/src/media.cpp | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/esp32-s3-box-3/src/media.cpp b/esp32-s3-box-3/src/media.cpp index 6fb8545..5bc66d3 100644 --- a/esp32-s3-box-3/src/media.cpp +++ b/esp32-s3-box-3/src/media.cpp @@ -78,6 +78,11 @@ static volatile size_t speaker_write_idx = 0; static volatile size_t speaker_read_idx = 0; static volatile size_t speaker_samples_available = 0; static portMUX_TYPE speaker_buffer_mutex = portMUX_INITIALIZER_UNLOCKED; + +// AEC reference delay compensation (in samples) +// This accounts for acoustic path + hardware delays +// Typical value: 320 samples = 20ms at 16kHz +static const size_t AEC_REF_DELAY_SAMPLES = 320; // ref_buffer_primed already declared above for use in set_is_playing void pipecat_init_audio_decoder() { @@ -354,7 +359,9 @@ void pipecat_send_audio(PeerConnection *peer_connection) { } // Only start using reference data if we have enough buffered - if (!ref_buffer_primed && speaker_samples_available >= MIN_REF_BUFFER_SAMPLES) { + // We need MIN_REF_BUFFER_SAMPLES + delay compensation + size_t required_samples = MIN_REF_BUFFER_SAMPLES + AEC_REF_DELAY_SAMPLES; + if (!ref_buffer_primed && speaker_samples_available >= required_samples) { ref_buffer_primed = true; log_primed = true; samples_for_log = speaker_samples_available; @@ -362,9 +369,11 @@ void pipecat_send_audio(PeerConnection *peer_connection) { // Only try to get reference data if buffer is primed if (ref_buffer_primed) { - // Check if we have enough samples for this chunk - if (speaker_samples_available >= aec_chunk_size) { + // Check if we have enough samples for this chunk plus delay + if (speaker_samples_available >= aec_chunk_size + AEC_REF_DELAY_SAMPLES) { // Read samples from circular buffer + // The delay is maintained by keeping a fixed distance between write and read pointers + // speaker_samples_available should always be >= AEC_REF_DELAY_SAMPLES + needed samples for (int i = 0; i < aec_chunk_size; i++) { ref_chunk[i] = speaker_circular_buffer[speaker_read_idx]; speaker_read_idx = (speaker_read_idx + 1) % SPEAKER_BUFFER_SAMPLES; @@ -373,8 +382,9 @@ void pipecat_send_audio(PeerConnection *peer_connection) { have_ref_data = true; // Check if we're running too low for remaining chunks + // We need to maintain AEC_REF_DELAY_SAMPLES delay plus samples for remaining chunks int remaining_chunks = num_chunks - chunk - 1; - size_t remaining_samples_needed = remaining_chunks * aec_chunk_size; + size_t remaining_samples_needed = remaining_chunks * aec_chunk_size + AEC_REF_DELAY_SAMPLES; if (speaker_samples_available < remaining_samples_needed) { ref_buffer_primed = false; log_underrun = true; From 7471ee48ab2122f7553d942d019add7acee6d39b Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Mon, 7 Jul 2025 12:31:39 -0700 Subject: [PATCH 09/14] seemingly stable, but not amazing, aec --- esp32-s3-box-3/src/media.cpp | 110 +++++++++++++++++++++------------- esp32-s3-box-3/src/webrtc.cpp | 13 +--- 2 files changed, 70 insertions(+), 53 deletions(-) diff --git a/esp32-s3-box-3/src/media.cpp b/esp32-s3-box-3/src/media.cpp index 5bc66d3..81bcab6 100644 --- a/esp32-s3-box-3/src/media.cpp +++ b/esp32-s3-box-3/src/media.cpp @@ -72,17 +72,15 @@ OpusDecoder *opus_decoder = NULL; // Circular buffer for speaker reference data (not using FreeRTOS ring buffer) static int16_t* speaker_circular_buffer = NULL; -static const size_t SPEAKER_BUFFER_SAMPLES = 4096; // 4096 samples = 8192 bytes -static const size_t MIN_REF_BUFFER_SAMPLES = 2560; // 2 read cycles worth to prevent underruns +static const size_t SPEAKER_BUFFER_SAMPLES = 4096; // 4096 samples = 256ms @ 16kHz +static const size_t MIN_REF_BUFFER_SAMPLES = 2560; // 2 read cycles worth (160ms) +static const size_t MIN_REF_BUFFER_RESUME = 1920; // 1.5 cycles for recovery (120ms) static volatile size_t speaker_write_idx = 0; static volatile size_t speaker_read_idx = 0; static volatile size_t speaker_samples_available = 0; static portMUX_TYPE speaker_buffer_mutex = portMUX_INITIALIZER_UNLOCKED; -// AEC reference delay compensation (in samples) -// This accounts for acoustic path + hardware delays -// Typical value: 320 samples = 20ms at 16kHz -static const size_t AEC_REF_DELAY_SAMPLES = 320; +// AEC reference delay compensation removed - assuming AEC implementation handles delay internally // ref_buffer_primed already declared above for use in set_is_playing void pipecat_init_audio_decoder() { @@ -215,27 +213,13 @@ void pipecat_init_audio_encoder() { processed_buffer = (int16_t *)heap_caps_malloc(PCM_BUFFER_SIZE, MALLOC_CAP_DEFAULT); } - -typedef struct { - int task_stack; /*!< Task stack size */ - int task_prio; /*!< Task peroid */ - int task_core; /*!< The core that task to be created */ - bool debug_aec; /*!< debug AEC input data */ - bool stack_in_ext; /*!< Try to allocate stack in external memory */ - afe_type_t type; /*!< The type of afe, AFE_TYPE_SR , AFE_TYPE_VC , AFE_TYPE_VC_8K */ - afe_mode_t mode; /*!< The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF */ - int filter_length; /*!< The filter length of aec */ - char *input_format; /*!< The input format is as follows: For example, 'MR', 'RMNM'. - 'M' means data from microphone, 'R' means data from reference data, 'N' means no data.. */ -} aec_stream_cfg_t; - static int16_t *aec_in_buffer = NULL; static int16_t *aec_out_buffer = NULL; static afe_aec_handle_t *aec_handle = NULL; void pipecat_init_aec() { // Try a simpler approach - create AEC without the task configuration struct - ESP_LOGI(LOG_TAG, "Initializing AEC with simple configuration"); + ESP_LOGI(LOG_TAG, "Initializing AEC"); // Create circular buffer for speaker reference data speaker_circular_buffer = (int16_t *)heap_caps_malloc(SPEAKER_BUFFER_SAMPLES * sizeof(int16_t), MALLOC_CAP_SPIRAM); @@ -308,7 +292,7 @@ void pipecat_send_audio(PeerConnection *peer_connection) { // } // AEC processing (skip if disabled) - if (aec_handle != NULL) { + if (aec_handle != NULL && aec_enabled) { // Get AEC chunk size (should be 256 samples for 16kHz) int aec_chunk_size = afe_aec_get_chunksize(aec_handle); int total_samples = PCM_BUFFER_SIZE / sizeof(uint16_t); // 640 samples @@ -358,22 +342,47 @@ void pipecat_send_audio(PeerConnection *peer_connection) { samples_for_log = speaker_samples_available; } + // Warn if we're getting close to the unprime threshold + static int low_buffer_count = 0; + if (speaker_samples_available < 2200 && speaker_samples_available >= 1920) { + if (++low_buffer_count % 10 == 0) { + ESP_LOGW(LOG_TAG, "Buffer running low: %d samples (unprime at 1920)", speaker_samples_available); + } + } else { + low_buffer_count = 0; + } + // Only start using reference data if we have enough buffered - // We need MIN_REF_BUFFER_SAMPLES + delay compensation - size_t required_samples = MIN_REF_BUFFER_SAMPLES + AEC_REF_DELAY_SAMPLES; - if (!ref_buffer_primed && speaker_samples_available >= required_samples) { - ref_buffer_primed = true; - log_primed = true; - samples_for_log = speaker_samples_available; + // Use hysteresis to prevent oscillation + if (!ref_buffer_primed) { + size_t required_samples = in_recovery_mode ? MIN_REF_BUFFER_RESUME : MIN_REF_BUFFER_SAMPLES; + if (speaker_samples_available >= required_samples) { + ref_buffer_primed = true; + log_primed = true; + samples_for_log = speaker_samples_available; + } + } + + // Recovery check: if we've been unprimed for too long, reduce requirements + static int unprime_count = 0; + static bool in_recovery_mode = false; + + if (!ref_buffer_primed) { + unprime_count++; + // After 25 attempts (~2 seconds), enter recovery mode + if (unprime_count > 25 && !in_recovery_mode) { + in_recovery_mode = true; + ESP_LOGW(LOG_TAG, "Entering recovery mode after %d attempts", unprime_count); + } + } else { + unprime_count = 0; } // Only try to get reference data if buffer is primed if (ref_buffer_primed) { - // Check if we have enough samples for this chunk plus delay - if (speaker_samples_available >= aec_chunk_size + AEC_REF_DELAY_SAMPLES) { + // Check if we have enough samples for this chunk + if (speaker_samples_available >= aec_chunk_size) { // Read samples from circular buffer - // The delay is maintained by keeping a fixed distance between write and read pointers - // speaker_samples_available should always be >= AEC_REF_DELAY_SAMPLES + needed samples for (int i = 0; i < aec_chunk_size; i++) { ref_chunk[i] = speaker_circular_buffer[speaker_read_idx]; speaker_read_idx = (speaker_read_idx + 1) % SPEAKER_BUFFER_SAMPLES; @@ -382,15 +391,33 @@ void pipecat_send_audio(PeerConnection *peer_connection) { have_ref_data = true; // Check if we're running too low for remaining chunks - // We need to maintain AEC_REF_DELAY_SAMPLES delay plus samples for remaining chunks int remaining_chunks = num_chunks - chunk - 1; - size_t remaining_samples_needed = remaining_chunks * aec_chunk_size + AEC_REF_DELAY_SAMPLES; - if (speaker_samples_available < remaining_samples_needed) { - ref_buffer_primed = false; - log_underrun = true; - underrun_type = 1; - samples_for_log = speaker_samples_available; - needed_for_log = remaining_samples_needed; + size_t remaining_samples_needed = remaining_chunks * aec_chunk_size; + + // In recovery mode, be more lenient - only unprime if we can't process ANY more chunks + if (in_recovery_mode) { + if (speaker_samples_available < aec_chunk_size) { + ref_buffer_primed = false; + log_underrun = true; + underrun_type = 1; + samples_for_log = speaker_samples_available; + needed_for_log = aec_chunk_size; + } else if (speaker_samples_available >= MIN_REF_BUFFER_SAMPLES) { + // Exit recovery mode once buffer is fully healthy + in_recovery_mode = false; + ESP_LOGI(LOG_TAG, "Exiting recovery mode, buffer healthy with %d samples", speaker_samples_available); + } + } else { + // Normal mode - only unprime if we drop below a lower threshold + // This provides hysteresis to prevent oscillation + if (speaker_samples_available < 1920) { // 1.5 cycles = 120ms + ref_buffer_primed = false; + log_underrun = true; + underrun_type = 1; + samples_for_log = speaker_samples_available; + needed_for_log = remaining_samples_needed; + ESP_LOGW(LOG_TAG, "Unpriming: dropped below 1920 samples (have %d)", speaker_samples_available); + } } } else { // Not enough samples, unprime @@ -439,7 +466,8 @@ void pipecat_send_audio(PeerConnection *peer_connection) { // Log successful reads occasionally static int ref_success_count = 0; if (++ref_success_count % 50 == 0) { - ESP_LOGI(LOG_TAG, "Using speaker reference data, count: %d", ref_success_count); + ESP_LOGI(LOG_TAG, "Using speaker reference data, count: %d, buffer: %d samples", + ref_success_count, speaker_samples_available); } } else { // No reference data available, use zeros diff --git a/esp32-s3-box-3/src/webrtc.cpp b/esp32-s3-box-3/src/webrtc.cpp index 2fb1c78..d26d36a 100644 --- a/esp32-s3-box-3/src/webrtc.cpp +++ b/esp32-s3-box-3/src/webrtc.cpp @@ -23,19 +23,8 @@ void pipecat_send_audio_task(void *user_data) { pipecat_init_audio_encoder(); while (1) { - uint64_t start_time = esp_timer_get_time(); pipecat_send_audio(peer_connection); - uint64_t end_time = esp_timer_get_time(); - uint64_t elapsed_ms = (end_time - start_time) / 1000; - - // We process 1280 samples at 16kHz = 80ms of audio - // Delay for the remainder of the 80ms period - if (elapsed_ms < 80) { - vTaskDelay(pdMS_TO_TICKS(80 - elapsed_ms)); - } else { - // If processing took longer than 80ms, yield to other tasks - vTaskDelay(pdMS_TO_TICKS(1)); - } + vTaskDelay(pdMS_TO_TICKS(1)); } } #endif From 2dbd6a88990003da1a1c485199a0247d51410ad6 Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Mon, 7 Jul 2025 18:49:26 -0700 Subject: [PATCH 10/14] temp commit --- esp32-s3-box-3/src/media.cpp | 495 ++++++++++++++--------------------- 1 file changed, 193 insertions(+), 302 deletions(-) diff --git a/esp32-s3-box-3/src/media.cpp b/esp32-s3-box-3/src/media.cpp index 81bcab6..6851678 100644 --- a/esp32-s3-box-3/src/media.cpp +++ b/esp32-s3-box-3/src/media.cpp @@ -25,28 +25,10 @@ #define OPUS_ENCODER_BITRATE 30000 #define OPUS_ENCODER_COMPLEXITY 0 -// Forward declare for use in set_is_playing -static bool ref_buffer_primed = false; - - +// AEC control +static bool aec_enabled = true; // Can be set to false to bypass AEC std::atomic is_playing = false; -void set_is_playing(int16_t *in_buf, size_t in_samples) { - bool any_set = false; - for (size_t i = 0; i < in_samples; i++) { - if (in_buf[i] != -1 && in_buf[i] != 0 && in_buf[i] != 1) { - any_set = true; - } - } - bool was_playing = is_playing.load(); - is_playing = any_set; - - // Reset buffer priming when playback stops - if (was_playing && !any_set) { - ref_buffer_primed = false; - ESP_LOGD(LOG_TAG, "Playback stopped, reset reference buffer priming"); - } -} esp_codec_dev_handle_t mic_codec_dev = NULL; esp_codec_dev_handle_t spk_codec_dev = NULL; @@ -71,17 +53,33 @@ opus_int16 *decoder_buffer = NULL; OpusDecoder *opus_decoder = NULL; // Circular buffer for speaker reference data (not using FreeRTOS ring buffer) -static int16_t* speaker_circular_buffer = NULL; -static const size_t SPEAKER_BUFFER_SAMPLES = 4096; // 4096 samples = 256ms @ 16kHz -static const size_t MIN_REF_BUFFER_SAMPLES = 2560; // 2 read cycles worth (160ms) -static const size_t MIN_REF_BUFFER_RESUME = 1920; // 1.5 cycles for recovery (120ms) +// This is a sliding window buffer, not a queue - we read recent data for AEC +static int16_t speaker_circular_buffer[640]; // Exactly 2 Opus frames (40ms @ 16kHz) static volatile size_t speaker_write_idx = 0; static volatile size_t speaker_read_idx = 0; -static volatile size_t speaker_samples_available = 0; +static volatile bool speaker_buffer_active = false; // True when we're receiving audio static portMUX_TYPE speaker_buffer_mutex = portMUX_INITIALIZER_UNLOCKED; -// AEC reference delay compensation removed - assuming AEC implementation handles delay internally -// ref_buffer_primed already declared above for use in set_is_playing +// Function to update playback state and reset buffer when playback stops +void set_is_playing(int16_t *in_buf, size_t in_samples) { + bool any_set = false; + for (size_t i = 0; i < in_samples; i++) { + if (in_buf[i] != -1 && in_buf[i] != 0 && in_buf[i] != 1) { + any_set = true; + } + } + bool was_playing = is_playing.load(); + is_playing = any_set; + + // Reset buffer state when playback stops + if (was_playing && !any_set) { + portENTER_CRITICAL(&speaker_buffer_mutex); + speaker_buffer_active = false; + speaker_read_idx = speaker_write_idx; // Reset read to match write + portEXIT_CRITICAL(&speaker_buffer_mutex); + ESP_LOGD(LOG_TAG, "Playback stopped, reset speaker buffer"); + } +} void pipecat_init_audio_decoder() { int decoder_error = 0; @@ -122,55 +120,30 @@ void pipecat_audio_decode(uint8_t *data, size_t size) { } // Also send to circular buffer for AEC reference - if (speaker_circular_buffer != NULL) { - // Variables for logging outside critical section - bool log_write = false; - bool log_overflow = false; - int write_count_log = 0; - int overflow_count_log = 0; - size_t samples_available_log = 0; + portENTER_CRITICAL(&speaker_buffer_mutex); - portENTER_CRITICAL(&speaker_buffer_mutex); + // Write to circular buffer, always keeping just the freshest 2 frames + for (int i = 0; i < decoded_size; i++) { + speaker_circular_buffer[speaker_write_idx] = decoder_buffer[i]; + speaker_write_idx = (speaker_write_idx + 1) % 640; + } + speaker_buffer_active = true; - // Check if we have space - size_t space_available = SPEAKER_BUFFER_SAMPLES - speaker_samples_available; - if (space_available >= decoded_size) { - // Copy samples to circular buffer - for (int i = 0; i < decoded_size; i++) { - speaker_circular_buffer[speaker_write_idx] = decoder_buffer[i]; - speaker_write_idx = (speaker_write_idx + 1) % SPEAKER_BUFFER_SAMPLES; - } - speaker_samples_available += decoded_size; - - // Prepare logging info - static int write_count = 0; - if (++write_count % 50 == 0) { - log_write = true; - write_count_log = write_count; - samples_available_log = speaker_samples_available; - } - } else { - // Buffer overflow - static int overflow_count = 0; - if (++overflow_count % 100 == 0) { - log_overflow = true; - overflow_count_log = overflow_count; - } + // If read pointer is too far behind, advance it to stay within 2 frames + size_t distance = (speaker_write_idx - speaker_read_idx + 640) % 640; + if (distance > 640 - 320) { + // Read pointer is about to be overrun, advance it + speaker_read_idx = (speaker_write_idx + 640 - 320) % 640; } portEXIT_CRITICAL(&speaker_buffer_mutex); - // Do logging outside critical section - if (log_write) { - ESP_LOGI(LOG_TAG, "Speaker buffer writes: %d, samples: %d, available: %d", - write_count_log, decoded_size, samples_available_log); + // Occasional logging + static int write_count = 0; + if (++write_count % 50 == 0) { + ESP_LOGI(LOG_TAG, "Speaker buffer: write_idx=%d, read_idx=%d", + speaker_write_idx, speaker_read_idx); } - if (log_overflow) { - ESP_LOGW(LOG_TAG, "Speaker buffer overflow count: %d", overflow_count_log); - } - } else { - ESP_LOGW(LOG_TAG, "Speaker circular buffer is NULL!"); - } } } @@ -183,7 +156,10 @@ static uint64_t total_read_time = 0; static uint64_t total_aec_time = 0; static uint64_t total_encode_time = 0; static uint64_t total_send_time = 0; +static uint64_t total_wait_time = 0; static uint32_t cycle_count = 0; +static uint32_t aec_skip_count = 0; +static uint64_t last_frame_send_time = 0; // AEC processed buffer static int16_t *processed_buffer = NULL; @@ -221,13 +197,8 @@ void pipecat_init_aec() { // Try a simpler approach - create AEC without the task configuration struct ESP_LOGI(LOG_TAG, "Initializing AEC"); - // Create circular buffer for speaker reference data - speaker_circular_buffer = (int16_t *)heap_caps_malloc(SPEAKER_BUFFER_SAMPLES * sizeof(int16_t), MALLOC_CAP_SPIRAM); - if (speaker_circular_buffer == NULL) { - ESP_LOGE(LOG_TAG, "Failed to allocate speaker circular buffer"); - return; - } - ESP_LOGI(LOG_TAG, "Speaker circular buffer created, size=%d samples", SPEAKER_BUFFER_SAMPLES); + // Speaker circular buffer is now statically allocated + ESP_LOGI(LOG_TAG, "Speaker circular buffer ready, size=640 samples"); // Create AEC with basic parameters (filter_length=4 recommended for ESP32S3) aec_handle = afe_aec_create("MR", 4, AFE_TYPE_VC, AFE_MODE_LOW_COST); @@ -270,278 +241,198 @@ void pipecat_init_aec() { void pipecat_send_audio(PeerConnection *peer_connection) { uint64_t start_time, end_time; - // Check if AEC is initialized - if (aec_handle == NULL) { - // AEC disabled, bypass processing - // ESP_LOGW(LOG_TAG, "AEC not initialized, bypassing"); - } - - // Measure esp_codec_dev_read time - start_time = esp_timer_get_time(); - if (esp_codec_dev_read(mic_codec_dev, read_buffer, PCM_BUFFER_SIZE) != - ESP_OK) { - printf("esp_codec_dev_read failed"); - return; - } - end_time = esp_timer_get_time(); - total_read_time += (end_time - start_time); - - // Don't zero out microphone input - let AEC handle echo cancellation - // if (is_playing) { - // memset(read_buffer, 0, PCM_BUFFER_SIZE); - // } - - // AEC processing (skip if disabled) - if (aec_handle != NULL && aec_enabled) { - // Get AEC chunk size (should be 256 samples for 16kHz) - int aec_chunk_size = afe_aec_get_chunksize(aec_handle); - int total_samples = PCM_BUFFER_SIZE / sizeof(uint16_t); // 640 samples - int num_chunks = total_samples / aec_chunk_size; // Should be 2.5, so we'll process 2 chunks + // Wait for at least one Opus frame (320 samples) of speaker data + uint64_t wait_start = esp_timer_get_time(); + while (true) { + portENTER_CRITICAL(&speaker_buffer_mutex); + int samples_available = 0; + if (speaker_buffer_active) { + samples_available = (speaker_write_idx - speaker_read_idx + 640) % 640; + } + portEXIT_CRITICAL(&speaker_buffer_mutex); - // Debug logging - static bool first_run = true; - if (first_run) { - ESP_LOGI(LOG_TAG, "AEC debug: chunk_size=%d, total_samples=%d, num_chunks=%d", - aec_chunk_size, total_samples, num_chunks); - ESP_LOGI(LOG_TAG, "AEC debug: total_ch_num=%d, sample_rate=%d", - aec_handle->pcm_config.total_ch_num, aec_handle->pcm_config.sample_rate); - ESP_LOGI(LOG_TAG, "AEC debug: input format=%s, mic_num=%d, ref_num=%d", - "MR", aec_handle->pcm_config.mic_num, aec_handle->pcm_config.ref_num); - first_run = false; + if (samples_available >= 320) { + break; // We have enough data } - // Process all chunks - if (num_chunks != AEC_CHUNKS_TO_BATCH) { - ESP_LOGW(LOG_TAG, "Unexpected number of AEC chunks: %d (expected %d)", num_chunks, AEC_CHUNKS_TO_BATCH); + // Brief delay before checking again + vTaskDelay(1 / portTICK_PERIOD_MS); + } + uint64_t wait_end = esp_timer_get_time(); + total_wait_time += (wait_end - wait_start); + + // Buffers for interleaved processing + static int16_t opus_accumulator[640]; // Accumulate AEC output for Opus encoding + static int opus_accumulator_pos = 0; + int16_t mic_chunk_buffer[256]; + int aec_chunk_size = 256; // AEC processes 256 samples at a time + + // Track timing for AEC skip decision + bool skip_last_aec = false; + int frame_count = 0; + + // Single loop processing 5 chunks of 256 samples each + for (int chunk = 0; chunk < 5; chunk++) { + // 1. Read 256 samples from microphone + start_time = esp_timer_get_time(); + if (esp_codec_dev_read(mic_codec_dev, (uint8_t*)mic_chunk_buffer, 256 * sizeof(int16_t)) != ESP_OK) { + ESP_LOGE(LOG_TAG, "esp_codec_dev_read failed"); + return; } + end_time = esp_timer_get_time(); + total_read_time += (end_time - start_time); - // Process AEC in chunks - for (int chunk = 0; chunk < num_chunks; chunk++) { - // Prepare input: interleave mic data with zeros (reference channel) - int16_t *mic_ptr = (int16_t *)read_buffer + (chunk * aec_chunk_size); - + // 2. Process with AEC or bypass + if (aec_handle != NULL && aec_enabled && !(chunk == 4 && skip_last_aec)) { // Get reference data from speaker circular buffer - bool have_ref_data = false; - int16_t ref_chunk[256]; // Temporary buffer for reference data + int16_t ref_chunk[256]; + int samples_available = 0; + bool log_underrun = false; + static int underrun_count = 0; + static int ref_success_count = 0; - if (speaker_circular_buffer != NULL) { - // Variables for logging outside critical section - bool log_buffer_state = false; - bool log_primed = false; - bool log_underrun = false; - size_t samples_for_log = 0; - int underrun_type = 0; // 0=none, 1=remaining chunks, 2=current chunk - size_t needed_for_log = 0; - - portENTER_CRITICAL(&speaker_buffer_mutex); - - // Debug log buffer state occasionally - static int buffer_check_count = 0; - if (++buffer_check_count % 100 == 0) { - log_buffer_state = true; - samples_for_log = speaker_samples_available; - } - - // Warn if we're getting close to the unprime threshold - static int low_buffer_count = 0; - if (speaker_samples_available < 2200 && speaker_samples_available >= 1920) { - if (++low_buffer_count % 10 == 0) { - ESP_LOGW(LOG_TAG, "Buffer running low: %d samples (unprime at 1920)", speaker_samples_available); - } - } else { - low_buffer_count = 0; - } - - // Only start using reference data if we have enough buffered - // Use hysteresis to prevent oscillation - if (!ref_buffer_primed) { - size_t required_samples = in_recovery_mode ? MIN_REF_BUFFER_RESUME : MIN_REF_BUFFER_SAMPLES; - if (speaker_samples_available >= required_samples) { - ref_buffer_primed = true; - log_primed = true; - samples_for_log = speaker_samples_available; - } - } - - // Recovery check: if we've been unprimed for too long, reduce requirements - static int unprime_count = 0; - static bool in_recovery_mode = false; - - if (!ref_buffer_primed) { - unprime_count++; - // After 25 attempts (~2 seconds), enter recovery mode - if (unprime_count > 25 && !in_recovery_mode) { - in_recovery_mode = true; - ESP_LOGW(LOG_TAG, "Entering recovery mode after %d attempts", unprime_count); - } - } else { - unprime_count = 0; - } + portENTER_CRITICAL(&speaker_buffer_mutex); - // Only try to get reference data if buffer is primed - if (ref_buffer_primed) { - // Check if we have enough samples for this chunk - if (speaker_samples_available >= aec_chunk_size) { - // Read samples from circular buffer + if (speaker_buffer_active) { + samples_available = (speaker_write_idx - speaker_read_idx + 640) % 640; + + if (samples_available >= aec_chunk_size) { + // We have enough data, read it for (int i = 0; i < aec_chunk_size; i++) { ref_chunk[i] = speaker_circular_buffer[speaker_read_idx]; - speaker_read_idx = (speaker_read_idx + 1) % SPEAKER_BUFFER_SAMPLES; + speaker_read_idx = (speaker_read_idx + 1) % 640; + } + } else if (samples_available > 0) { + // Not enough data, read what we have and pad with zeros + int i; + for (i = 0; i < samples_available; i++) { + ref_chunk[i] = speaker_circular_buffer[speaker_read_idx]; + speaker_read_idx = (speaker_read_idx + 1) % 640; + } + for (; i < aec_chunk_size; i++) { + ref_chunk[i] = 0; } - speaker_samples_available -= aec_chunk_size; - have_ref_data = true; - - // Check if we're running too low for remaining chunks - int remaining_chunks = num_chunks - chunk - 1; - size_t remaining_samples_needed = remaining_chunks * aec_chunk_size; - // In recovery mode, be more lenient - only unprime if we can't process ANY more chunks - if (in_recovery_mode) { - if (speaker_samples_available < aec_chunk_size) { - ref_buffer_primed = false; - log_underrun = true; - underrun_type = 1; - samples_for_log = speaker_samples_available; - needed_for_log = aec_chunk_size; - } else if (speaker_samples_available >= MIN_REF_BUFFER_SAMPLES) { - // Exit recovery mode once buffer is fully healthy - in_recovery_mode = false; - ESP_LOGI(LOG_TAG, "Exiting recovery mode, buffer healthy with %d samples", speaker_samples_available); - } - } else { - // Normal mode - only unprime if we drop below a lower threshold - // This provides hysteresis to prevent oscillation - if (speaker_samples_available < 1920) { // 1.5 cycles = 120ms - ref_buffer_primed = false; - log_underrun = true; - underrun_type = 1; - samples_for_log = speaker_samples_available; - needed_for_log = remaining_samples_needed; - ESP_LOGW(LOG_TAG, "Unpriming: dropped below 1920 samples (have %d)", speaker_samples_available); - } + if (++underrun_count % 10 == 0) { + log_underrun = true; } } else { - // Not enough samples, unprime - ref_buffer_primed = false; - log_underrun = true; - underrun_type = 2; - samples_for_log = speaker_samples_available; - needed_for_log = aec_chunk_size; + // No data at all - don't advance read pointer, just use zeros + for (int i = 0; i < aec_chunk_size; i++) { + ref_chunk[i] = 0; + } + + if (++underrun_count % 10 == 0) { + log_underrun = true; + } + } + } else { + // Buffer not active, use zeros + for (int i = 0; i < aec_chunk_size; i++) { + ref_chunk[i] = 0; } } portEXIT_CRITICAL(&speaker_buffer_mutex); - // Do logging outside critical section - if (log_buffer_state) { - ESP_LOGI(LOG_TAG, "Circular buffer state: %d samples available, chunk %d/%d", - samples_for_log, chunk, num_chunks); - } - if (log_primed) { - ESP_LOGI(LOG_TAG, "Reference buffer primed with %d samples", samples_for_log); - } if (log_underrun) { - if (underrun_type == 1) { - ESP_LOGW(LOG_TAG, "Reference buffer underrun (chunk %d/%d, need %d samples for remaining, have %d)", - chunk, num_chunks, needed_for_log, samples_for_log); - } else { - ESP_LOGW(LOG_TAG, "Reference buffer underrun (chunk %d/%d, need %d samples for current, have %d)", - chunk, num_chunks, needed_for_log, samples_for_log); - } + ESP_LOGW(LOG_TAG, "Speaker buffer underrun: only %d/%d samples available", + samples_available, aec_chunk_size); } - } else { - static bool logged_null = false; - if (!logged_null) { - ESP_LOGW(LOG_TAG, "Speaker circular buffer is NULL during read!"); - logged_null = true; - } - } - if (have_ref_data) { - // Use actual speaker reference data - for (int i = 0; i < aec_chunk_size; i++) { - aec_in_buffer[i * 2] = mic_ptr[i]; // Microphone channel - aec_in_buffer[i * 2 + 1] = ref_chunk[i]; // Reference channel (actual speaker data) - } - - // Log successful reads occasionally - static int ref_success_count = 0; - if (++ref_success_count % 50 == 0) { - ESP_LOGI(LOG_TAG, "Using speaker reference data, count: %d, buffer: %d samples", - ref_success_count, speaker_samples_available); - } - } else { - // No reference data available, use zeros - for (int i = 0; i < aec_chunk_size; i++) { - aec_in_buffer[i * 2] = mic_ptr[i]; // Microphone channel - aec_in_buffer[i * 2 + 1] = 0; // Reference channel (zeros as fallback) - } - // Log when we don't have reference data (only occasionally to avoid spam) - static int no_ref_count = 0; - if (++no_ref_count % 100 == 0) { - ESP_LOGW(LOG_TAG, "No speaker reference data available count: %d", no_ref_count); - } + // Create interleaved AEC input + for (int i = 0; i < aec_chunk_size; i++) { + aec_in_buffer[i * 2] = mic_chunk_buffer[i]; // Mic + aec_in_buffer[i * 2 + 1] = ref_chunk[i]; // Reference } - // Run AEC process for this chunk + // Run AEC start_time = esp_timer_get_time(); int ret = afe_aec_process(aec_handle, aec_in_buffer, aec_out_buffer); end_time = esp_timer_get_time(); total_aec_time += (end_time - start_time); if (ret < 0) { - ESP_LOGE(LOG_TAG, "afe_aec_process failed: return value %d (0x%x) at chunk %d", - ret, ret, chunk); - // Fall back to bypass on error - memcpy(processed_buffer, read_buffer, PCM_BUFFER_SIZE); - return; + ESP_LOGE(LOG_TAG, "afe_aec_process failed: return value %d", ret); + // Fall back - copy mic input directly + memcpy(opus_accumulator + opus_accumulator_pos, mic_chunk_buffer, 256 * sizeof(int16_t)); + } else { + // Copy AEC output to accumulator + memcpy(opus_accumulator + opus_accumulator_pos, aec_out_buffer, 256 * sizeof(int16_t)); } - // Log success on first chunk of first run - if (first_run && chunk == 0) { - ESP_LOGI(LOG_TAG, "AEC process successful, returned %d bytes", ret); + + // Log successful AEC processing occasionally + if (++ref_success_count % 50 == 0) { + ESP_LOGI(LOG_TAG, "Using speaker reference data, count: %d", ref_success_count); } + } else { + // AEC disabled or skipped - copy mic input directly + memcpy(opus_accumulator + opus_accumulator_pos, mic_chunk_buffer, 256 * sizeof(int16_t)); - // Copy AEC output to processed buffer - memcpy(processed_buffer + (chunk * aec_chunk_size), aec_out_buffer, aec_chunk_size * sizeof(int16_t)); + if (chunk == 4 && skip_last_aec) { + static int skip_log_count = 0; + if (++skip_log_count % 10 == 0) { + ESP_LOGI(LOG_TAG, "Skipped AEC on 5th chunk to catch up, count: %d", skip_log_count); + } + } } - // All samples have been processed through AEC - } else { - // AEC disabled, just copy input to processed buffer - memcpy(processed_buffer, read_buffer, PCM_BUFFER_SIZE); - } - - // Process Opus frames - we have exactly COMMON_BUFFER_SAMPLES (1280) samples - // This encodes to exactly OPUS_FRAMES_TO_BATCH (4) frames - for (int frame = 0; frame < OPUS_FRAMES_TO_BATCH; frame++) { - // Measure opus_encode time - start_time = esp_timer_get_time(); - auto encoded_size = opus_encode(opus_encoder, - processed_buffer + (frame * OPUS_FRAME_SIZE), - OPUS_FRAME_SIZE, - encoder_output_buffer, OPUS_BUFFER_SIZE); - end_time = esp_timer_get_time(); - total_encode_time += (end_time - start_time); + opus_accumulator_pos += 256; - // Measure peer_connection_send_audio time - start_time = esp_timer_get_time(); - peer_connection_send_audio(peer_connection, encoder_output_buffer, - encoded_size); - end_time = esp_timer_get_time(); - total_send_time += (end_time - start_time); + // 3. Encode and send Opus frames when we have enough samples + while (opus_accumulator_pos >= 320) { + start_time = esp_timer_get_time(); + int encoded_size = opus_encode(opus_encoder, opus_accumulator, 320, + encoder_output_buffer, OPUS_BUFFER_SIZE); + end_time = esp_timer_get_time(); + total_encode_time += (end_time - start_time); + + start_time = esp_timer_get_time(); + peer_connection_send_audio(peer_connection, encoder_output_buffer, encoded_size); + end_time = esp_timer_get_time(); + total_send_time += (end_time - start_time); + + frame_count++; + + // After sending the 3rd frame, check timing for next call + if (frame_count == 3) { + uint64_t third_frame_sent_time = end_time; + if (last_frame_send_time > 0) { + uint64_t elapsed = (third_frame_sent_time - last_frame_send_time) / 1000; // ms + if (elapsed > 62) { + skip_last_aec = true; + aec_skip_count++; + } + } + } + + // Remember when we sent the 4th frame + if (frame_count == 4) { + last_frame_send_time = end_time; + } + + // Shift remaining samples + opus_accumulator_pos -= 320; + if (opus_accumulator_pos > 0) { + memmove(opus_accumulator, opus_accumulator + 320, opus_accumulator_pos * sizeof(int16_t)); + } + } } - // Update cycle count and print statistics every 50 cycles + // Update statistics cycle_count++; if (cycle_count % 50 == 0) { + uint64_t avg_wait_time = total_wait_time / 50; uint64_t avg_read_time = total_read_time / 50; uint64_t avg_aec_time = total_aec_time / 50; uint64_t avg_encode_time = total_encode_time / 50; uint64_t avg_send_time = total_send_time / 50; - ESP_LOGI(LOG_TAG, "Audio timing stats (avg over 50 cycles): read=%llu us, aec=%llu us, encode=%llu us, send=%llu us", - avg_read_time, avg_aec_time, avg_encode_time, avg_send_time); + ESP_LOGI(LOG_TAG, "Audio timing stats (avg over 50 cycles): wait=%llu us, read=%llu us, aec=%llu us, encode=%llu us, send=%llu us, aec_skips=%lu", + avg_wait_time, avg_read_time, avg_aec_time, avg_encode_time, avg_send_time, aec_skip_count); - // Reset counters for next 50 cycles + // Reset counters + total_wait_time = 0; total_read_time = 0; total_aec_time = 0; total_encode_time = 0; From 125e422bffc54be3c63ac29444f101d1cf9e5ac6 Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Mon, 7 Jul 2025 21:09:54 -0700 Subject: [PATCH 11/14] temp commit --- esp32-s3-box-3/src/media.cpp | 226 +++++++++++++++++++++++++++-------- 1 file changed, 175 insertions(+), 51 deletions(-) diff --git a/esp32-s3-box-3/src/media.cpp b/esp32-s3-box-3/src/media.cpp index 6851678..a35932d 100644 --- a/esp32-s3-box-3/src/media.cpp +++ b/esp32-s3-box-3/src/media.cpp @@ -57,8 +57,10 @@ OpusDecoder *opus_decoder = NULL; static int16_t speaker_circular_buffer[640]; // Exactly 2 Opus frames (40ms @ 16kHz) static volatile size_t speaker_write_idx = 0; static volatile size_t speaker_read_idx = 0; +static volatile size_t speaker_buffer_count = 0; // Number of samples in buffer static volatile bool speaker_buffer_active = false; // True when we're receiving audio static portMUX_TYPE speaker_buffer_mutex = portMUX_INITIALIZER_UNLOCKED; +static volatile uint64_t last_buffer_read_time = 0; // Track when buffer was last read // Function to update playback state and reset buffer when playback stops void set_is_playing(int16_t *in_buf, size_t in_samples) { @@ -76,6 +78,8 @@ void set_is_playing(int16_t *in_buf, size_t in_samples) { portENTER_CRITICAL(&speaker_buffer_mutex); speaker_buffer_active = false; speaker_read_idx = speaker_write_idx; // Reset read to match write + speaker_buffer_count = 0; + last_buffer_read_time = 0; // Reset read time portEXIT_CRITICAL(&speaker_buffer_mutex); ESP_LOGD(LOG_TAG, "Playback stopped, reset speaker buffer"); } @@ -99,51 +103,83 @@ void pipecat_audio_decode(uint8_t *data, size_t size) { if (last_decode_time > 0) { uint64_t delta_ms = (now - last_decode_time) / 1000; static int packet_count = 0; - if (++packet_count % 10 == 0) { - ESP_LOGI(LOG_TAG, "Audio packet %d: size=%d, delta=%llu ms", packet_count, size, delta_ms); + packet_count++; + + // Log every packet for first 20, then every 10th + if (packet_count <= 20 || packet_count % 10 == 0) { + // Get current buffer state for correlation + size_t current_write_idx, current_read_idx, current_count; + portENTER_CRITICAL(&speaker_buffer_mutex); + current_write_idx = speaker_write_idx; + current_read_idx = speaker_read_idx; + current_count = speaker_buffer_count; + portEXIT_CRITICAL(&speaker_buffer_mutex); + + ESP_LOGI(LOG_TAG, "[ARRIVAL] pkt#%d: size=%d, delta=%llums, buffer=%d samples, w=%d r=%d", + packet_count, size, delta_ms, current_count, current_write_idx, current_read_idx); } } last_decode_time = now; - esp_err_t ret; - int decoded_size = - opus_decode(opus_decoder, data, size, decoder_buffer, PCM_BUFFER_SIZE, 0); + // Decode Opus frame to get the size (or use fixed size if predictable) + int decoded_size = opus_decode(opus_decoder, data, size, decoder_buffer, PCM_BUFFER_SIZE, 0); if (decoded_size > 0) { - set_is_playing(decoder_buffer, decoded_size); - - // Send to speaker - if ((ret = esp_codec_dev_write(spk_codec_dev, decoder_buffer, - decoded_size * sizeof(uint16_t))) != - ESP_OK) { - ESP_LOGE(LOG_TAG, "esp_codec_dev_write failed: %s", esp_err_to_name(ret)); - } - - // Also send to circular buffer for AEC reference + // FIRST: Write to circular buffer for AEC reference (before any other processing) portENTER_CRITICAL(&speaker_buffer_mutex); - // Write to circular buffer, always keeping just the freshest 2 frames - for (int i = 0; i < decoded_size; i++) { + // Write to circular buffer + size_t space_available = 640 - speaker_buffer_count; + size_t samples_to_write = decoded_size; + + // If we don't have enough space, drop oldest samples + if (samples_to_write > space_available) { + size_t samples_to_drop = samples_to_write - space_available; + speaker_read_idx = (speaker_read_idx + samples_to_drop) % 640; + speaker_buffer_count -= samples_to_drop; + } + + // Write new samples + for (int i = 0; i < samples_to_write; i++) { speaker_circular_buffer[speaker_write_idx] = decoder_buffer[i]; speaker_write_idx = (speaker_write_idx + 1) % 640; } + speaker_buffer_count += samples_to_write; speaker_buffer_active = true; - // If read pointer is too far behind, advance it to stay within 2 frames - size_t distance = (speaker_write_idx - speaker_read_idx + 640) % 640; - if (distance > 640 - 320) { - // Read pointer is about to be overrun, advance it - speaker_read_idx = (speaker_write_idx + 640 - 320) % 640; - } + // Get samples in buffer after write (while still in critical section) + size_t samples_in_buffer = speaker_buffer_count; portEXIT_CRITICAL(&speaker_buffer_mutex); - // Occasional logging + // Log timing relative to last read from this buffer + uint64_t time_since_last_read = 0; + if (last_buffer_read_time > 0 && now > last_buffer_read_time) { + time_since_last_read = (now - last_buffer_read_time) / 1000; + } else { + time_since_last_read = 0; // First write or invalid timestamp + } + + // More frequent logging for debugging static int write_count = 0; - if (++write_count % 50 == 0) { - ESP_LOGI(LOG_TAG, "Speaker buffer: write_idx=%d, read_idx=%d", + write_count++; + if (write_count <= 20 || write_count % 10 == 0) { + // Log the actual count, not calculated value + ESP_LOGI(LOG_TAG, "[WRITE] cnt=%d: +%d samples, buffer=%d, since_read=%llums (w=%d r=%d)", + write_count, decoded_size, samples_in_buffer, time_since_last_read, speaker_write_idx, speaker_read_idx); } + + // THEN: Process for playback + set_is_playing(decoder_buffer, decoded_size); + + // Send to speaker + esp_err_t ret; + if ((ret = esp_codec_dev_write(spk_codec_dev, decoder_buffer, + decoded_size * sizeof(uint16_t))) != + ESP_OK) { + ESP_LOGE(LOG_TAG, "esp_codec_dev_write failed: %s", esp_err_to_name(ret)); + } } } @@ -241,18 +277,25 @@ void pipecat_init_aec() { void pipecat_send_audio(PeerConnection *peer_connection) { uint64_t start_time, end_time; - // Wait for at least one Opus frame (320 samples) of speaker data + // Wait for (640 samples = 2 Opus frames) in our speaker data reference buffer to handle codec burst pattern uint64_t wait_start = esp_timer_get_time(); + int initial_samples = 0; + int wait_iterations = 0; while (true) { portENTER_CRITICAL(&speaker_buffer_mutex); int samples_available = 0; if (speaker_buffer_active) { - samples_available = (speaker_write_idx - speaker_read_idx + 640) % 640; + samples_available = speaker_buffer_count; } portEXIT_CRITICAL(&speaker_buffer_mutex); - if (samples_available >= 320) { - break; // We have enough data + wait_iterations++; + if (initial_samples == 0) { + initial_samples = samples_available; + } + + if (samples_available >= 640) { + break; // Buffer is full - ready to handle burst consumption } // Brief delay before checking again @@ -261,6 +304,21 @@ void pipecat_send_audio(PeerConnection *peer_connection) { uint64_t wait_end = esp_timer_get_time(); total_wait_time += (wait_end - wait_start); + // Log initial buffer state + static int call_count = 0; + call_count++; + // Get consistent snapshot of buffer state + size_t log_write_idx, log_read_idx, log_count; + portENTER_CRITICAL(&speaker_buffer_mutex); + log_write_idx = speaker_write_idx; + log_read_idx = speaker_read_idx; + log_count = speaker_buffer_count; + portEXIT_CRITICAL(&speaker_buffer_mutex); + + ESP_LOGI(LOG_TAG, "[SEND_START] call#%d: initial=%d, wait=%lluus, iters=%d, count=%d, w=%d r=%d", + call_count, initial_samples, wait_end - wait_start, wait_iterations, + log_count, log_write_idx, log_read_idx); + // Buffers for interleaved processing static int16_t opus_accumulator[640]; // Accumulate AEC output for Opus encoding static int opus_accumulator_pos = 0; @@ -271,9 +329,26 @@ void pipecat_send_audio(PeerConnection *peer_connection) { bool skip_last_aec = false; int frame_count = 0; + // Track timing - each read blocks for ~16ms (256 samples @ 16kHz) + uint64_t loop_start_time = esp_timer_get_time(); + // Single loop processing 5 chunks of 256 samples each for (int chunk = 0; chunk < 5; chunk++) { + uint64_t chunk_start_time = esp_timer_get_time(); // 1. Read 256 samples from microphone + // Log buffer state before read (for chunk 0 and 1) + if (chunk <= 1) { + size_t pre_read_write_idx, pre_read_read_idx, pre_read_count; + portENTER_CRITICAL(&speaker_buffer_mutex); + pre_read_write_idx = speaker_write_idx; + pre_read_read_idx = speaker_read_idx; + pre_read_count = speaker_buffer_count; + portEXIT_CRITICAL(&speaker_buffer_mutex); + + ESP_LOGI(LOG_TAG, "[PRE_READ] chunk=%d: buffer has %d samples (w=%d r=%d)", + chunk, pre_read_count, pre_read_write_idx, pre_read_read_idx); + } + start_time = esp_timer_get_time(); if (esp_codec_dev_read(mic_codec_dev, (uint8_t*)mic_chunk_buffer, 256 * sizeof(int16_t)) != ESP_OK) { ESP_LOGE(LOG_TAG, "esp_codec_dev_read failed"); @@ -282,6 +357,12 @@ void pipecat_send_audio(PeerConnection *peer_connection) { end_time = esp_timer_get_time(); total_read_time += (end_time - start_time); + // Log if a long read happened (indicating actual blocking) + uint64_t read_duration = (end_time - start_time) / 1000; + if (read_duration > 10) { // More than 10ms suggests actual blocking + ESP_LOGI(LOG_TAG, "[BLOCKED_READ] chunk=%d: read took %llums", chunk, read_duration); + } + // 2. Process with AEC or bypass if (aec_handle != NULL && aec_enabled && !(chunk == 4 && skip_last_aec)) { // Get reference data from speaker circular buffer @@ -294,7 +375,7 @@ void pipecat_send_audio(PeerConnection *peer_connection) { portENTER_CRITICAL(&speaker_buffer_mutex); if (speaker_buffer_active) { - samples_available = (speaker_write_idx - speaker_read_idx + 640) % 640; + samples_available = speaker_buffer_count; if (samples_available >= aec_chunk_size) { // We have enough data, read it @@ -302,6 +383,7 @@ void pipecat_send_audio(PeerConnection *peer_connection) { ref_chunk[i] = speaker_circular_buffer[speaker_read_idx]; speaker_read_idx = (speaker_read_idx + 1) % 640; } + speaker_buffer_count -= aec_chunk_size; } else if (samples_available > 0) { // Not enough data, read what we have and pad with zeros int i; @@ -309,11 +391,14 @@ void pipecat_send_audio(PeerConnection *peer_connection) { ref_chunk[i] = speaker_circular_buffer[speaker_read_idx]; speaker_read_idx = (speaker_read_idx + 1) % 640; } + speaker_buffer_count = 0; // We consumed all available samples for (; i < aec_chunk_size; i++) { ref_chunk[i] = 0; } - if (++underrun_count % 10 == 0) { + underrun_count++; + // Log every underrun for first 20, then every 10th + if (underrun_count <= 20 || underrun_count % 10 == 0) { log_underrun = true; } } else { @@ -322,7 +407,9 @@ void pipecat_send_audio(PeerConnection *peer_connection) { ref_chunk[i] = 0; } - if (++underrun_count % 10 == 0) { + underrun_count++; + // Log every underrun for first 20, then every 10th + if (underrun_count <= 20 || underrun_count % 10 == 0) { log_underrun = true; } } @@ -335,9 +422,28 @@ void pipecat_send_audio(PeerConnection *peer_connection) { portEXIT_CRITICAL(&speaker_buffer_mutex); + // Update last read time if we read any data + // Update last read time if we successfully read data + if (speaker_buffer_active && samples_available >= aec_chunk_size) { + last_buffer_read_time = esp_timer_get_time(); + } else if (speaker_buffer_active && samples_available > 0) { + // Even partial reads update the time + last_buffer_read_time = esp_timer_get_time(); + } + if (log_underrun) { - ESP_LOGW(LOG_TAG, "Speaker buffer underrun: only %d/%d samples available", - samples_available, aec_chunk_size); + uint64_t time_since_loop_start = (esp_timer_get_time() - loop_start_time) / 1000; + // Get consistent state outside critical section + size_t log_w_idx, log_r_idx, log_cnt; + portENTER_CRITICAL(&speaker_buffer_mutex); + log_w_idx = speaker_write_idx; + log_r_idx = speaker_read_idx; + log_cnt = speaker_buffer_count; + portEXIT_CRITICAL(&speaker_buffer_mutex); + + ESP_LOGW(LOG_TAG, "[UNDERRUN] #%d: chunk=%d, only %d/%d samples, %llums since loop start, count=%d, w=%d r=%d", + underrun_count, chunk, samples_available, aec_chunk_size, + time_since_loop_start, log_cnt, log_w_idx, log_r_idx); } // Create interleaved AEC input @@ -346,24 +452,32 @@ void pipecat_send_audio(PeerConnection *peer_connection) { aec_in_buffer[i * 2 + 1] = ref_chunk[i]; // Reference } - // Run AEC - start_time = esp_timer_get_time(); - int ret = afe_aec_process(aec_handle, aec_in_buffer, aec_out_buffer); - end_time = esp_timer_get_time(); - total_aec_time += (end_time - start_time); + // Temporarily disabled AEC + // ---- + // // Run AEC + // start_time = esp_timer_get_time(); + // int ret = afe_aec_process(aec_handle, aec_in_buffer, aec_out_buffer); + // end_time = esp_timer_get_time(); + // total_aec_time += (end_time - start_time); - if (ret < 0) { - ESP_LOGE(LOG_TAG, "afe_aec_process failed: return value %d", ret); - // Fall back - copy mic input directly - memcpy(opus_accumulator + opus_accumulator_pos, mic_chunk_buffer, 256 * sizeof(int16_t)); - } else { - // Copy AEC output to accumulator - memcpy(opus_accumulator + opus_accumulator_pos, aec_out_buffer, 256 * sizeof(int16_t)); - } + // if (ret < 0) { + // ESP_LOGE(LOG_TAG, "afe_aec_process failed: return value %d", ret); + // // Fall back - copy mic input directly + // memcpy(opus_accumulator + opus_accumulator_pos, mic_chunk_buffer, 256 * sizeof(int16_t)); + // } else { + // // Copy AEC output to accumulator + // memcpy(opus_accumulator + opus_accumulator_pos, aec_out_buffer, 256 * sizeof(int16_t)); + // } + // ---- + // Just copy mic data + memcpy(opus_accumulator + opus_accumulator_pos, mic_chunk_buffer, 256 * sizeof(int16_t)); - // Log successful AEC processing occasionally - if (++ref_success_count % 50 == 0) { - ESP_LOGI(LOG_TAG, "Using speaker reference data, count: %d", ref_success_count); + // Log successful reads more frequently for debugging + ref_success_count++; + if (ref_success_count <= 10 || ref_success_count % 20 == 0) { + uint64_t chunk_duration = (esp_timer_get_time() - chunk_start_time) / 1000; + ESP_LOGI(LOG_TAG, "[READ_OK] chunk=%d, got %d samples, chunk took %llums", + chunk, aec_chunk_size, chunk_duration); } } else { // AEC disabled or skipped - copy mic input directly @@ -379,6 +493,8 @@ void pipecat_send_audio(PeerConnection *peer_connection) { opus_accumulator_pos += 256; + // Note: esp_codec_dev_read blocks, providing natural pacing + // 3. Encode and send Opus frames when we have enough samples while (opus_accumulator_pos >= 320) { start_time = esp_timer_get_time(); @@ -420,6 +536,14 @@ void pipecat_send_audio(PeerConnection *peer_connection) { } // Update statistics + // Log loop timing + uint64_t loop_duration = (esp_timer_get_time() - loop_start_time) / 1000; + static int timing_log_count = 0; + if (++timing_log_count <= 10 || timing_log_count % 20 == 0) { + ESP_LOGI(LOG_TAG, "[LOOP_TIME] call#%d: full loop took %llums (expect ~80ms for 5x16ms reads)", + call_count, loop_duration); + } + cycle_count++; if (cycle_count % 50 == 0) { uint64_t avg_wait_time = total_wait_time / 50; From 153b8ebee3385bafa31a11b431bc30929ecb0f7b Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Mon, 7 Jul 2025 22:12:43 -0700 Subject: [PATCH 12/14] temp commit --- esp32-s3-box-3/src/media.cpp | 114 +++++++++++++++++++++++++---------- 1 file changed, 83 insertions(+), 31 deletions(-) diff --git a/esp32-s3-box-3/src/media.cpp b/esp32-s3-box-3/src/media.cpp index a35932d..5625d65 100644 --- a/esp32-s3-box-3/src/media.cpp +++ b/esp32-s3-box-3/src/media.cpp @@ -54,7 +54,7 @@ OpusDecoder *opus_decoder = NULL; // Circular buffer for speaker reference data (not using FreeRTOS ring buffer) // This is a sliding window buffer, not a queue - we read recent data for AEC -static int16_t speaker_circular_buffer[640]; // Exactly 2 Opus frames (40ms @ 16kHz) +static int16_t speaker_circular_buffer[960]; // 3 Opus frames (60ms @ 16kHz) static volatile size_t speaker_write_idx = 0; static volatile size_t speaker_read_idx = 0; static volatile size_t speaker_buffer_count = 0; // Number of samples in buffer @@ -129,20 +129,20 @@ void pipecat_audio_decode(uint8_t *data, size_t size) { portENTER_CRITICAL(&speaker_buffer_mutex); // Write to circular buffer - size_t space_available = 640 - speaker_buffer_count; + size_t space_available = 960 - speaker_buffer_count; size_t samples_to_write = decoded_size; // If we don't have enough space, drop oldest samples if (samples_to_write > space_available) { size_t samples_to_drop = samples_to_write - space_available; - speaker_read_idx = (speaker_read_idx + samples_to_drop) % 640; + speaker_read_idx = (speaker_read_idx + samples_to_drop) % 960; speaker_buffer_count -= samples_to_drop; } // Write new samples for (int i = 0; i < samples_to_write; i++) { speaker_circular_buffer[speaker_write_idx] = decoder_buffer[i]; - speaker_write_idx = (speaker_write_idx + 1) % 640; + speaker_write_idx = (speaker_write_idx + 1) % 960; } speaker_buffer_count += samples_to_write; speaker_buffer_active = true; @@ -197,6 +197,14 @@ static uint32_t cycle_count = 0; static uint32_t aec_skip_count = 0; static uint64_t last_frame_send_time = 0; +// Frame rate monitoring +static uint64_t first_frame_time = 0; +static uint32_t frames_sent = 0; +static float current_frame_rate = 0.0; +static const uint32_t MIN_FRAMES_FOR_MEASUREMENT = 50; // Measure over 50 frames (~1 second) +static const float TARGET_FRAME_RATE = 50.0; // 50 fps (20ms per frame) +static const float MIN_ACCEPTABLE_FRAME_RATE = 48.0; // Allow 4% tolerance + // AEC processed buffer static int16_t *processed_buffer = NULL; @@ -234,7 +242,7 @@ void pipecat_init_aec() { ESP_LOGI(LOG_TAG, "Initializing AEC"); // Speaker circular buffer is now statically allocated - ESP_LOGI(LOG_TAG, "Speaker circular buffer ready, size=640 samples"); + ESP_LOGI(LOG_TAG, "Speaker circular buffer ready, size=960 samples"); // Create AEC with basic parameters (filter_length=4 recommended for ESP32S3) aec_handle = afe_aec_create("MR", 4, AFE_TYPE_VC, AFE_MODE_LOW_COST); @@ -276,8 +284,21 @@ void pipecat_init_aec() { void pipecat_send_audio(PeerConnection *peer_connection) { uint64_t start_time, end_time; + static uint64_t last_call_end_time = 0; + uint64_t function_start_time = esp_timer_get_time(); - // Wait for (640 samples = 2 Opus frames) in our speaker data reference buffer to handle codec burst pattern + // Log time since last call + if (last_call_end_time > 0) { + uint64_t time_between_calls = (function_start_time - last_call_end_time) / 1000; // ms + static int between_calls_log_count = 0; + between_calls_log_count++; + if (between_calls_log_count <= 20 || between_calls_log_count % 10 == 0) { + ESP_LOGI(LOG_TAG, "[CALL_GAP] Time between calls: %llums (call #%d)", + time_between_calls, between_calls_log_count); + } + } + + // Wait for (960 samples = 3 Opus frames) in our speaker data reference buffer to handle codec burst pattern uint64_t wait_start = esp_timer_get_time(); int initial_samples = 0; int wait_iterations = 0; @@ -294,8 +315,8 @@ void pipecat_send_audio(PeerConnection *peer_connection) { initial_samples = samples_available; } - if (samples_available >= 640) { - break; // Buffer is full - ready to handle burst consumption + if (samples_available >= 960) { + break; // Buffer has 3 Opus frames - ready to handle burst consumption } // Brief delay before checking again @@ -381,7 +402,7 @@ void pipecat_send_audio(PeerConnection *peer_connection) { // We have enough data, read it for (int i = 0; i < aec_chunk_size; i++) { ref_chunk[i] = speaker_circular_buffer[speaker_read_idx]; - speaker_read_idx = (speaker_read_idx + 1) % 640; + speaker_read_idx = (speaker_read_idx + 1) % 960; } speaker_buffer_count -= aec_chunk_size; } else if (samples_available > 0) { @@ -389,7 +410,7 @@ void pipecat_send_audio(PeerConnection *peer_connection) { int i; for (i = 0; i < samples_available; i++) { ref_chunk[i] = speaker_circular_buffer[speaker_read_idx]; - speaker_read_idx = (speaker_read_idx + 1) % 640; + speaker_read_idx = (speaker_read_idx + 1) % 960; } speaker_buffer_count = 0; // We consumed all available samples for (; i < aec_chunk_size; i++) { @@ -452,25 +473,23 @@ void pipecat_send_audio(PeerConnection *peer_connection) { aec_in_buffer[i * 2 + 1] = ref_chunk[i]; // Reference } - // Temporarily disabled AEC - // ---- - // // Run AEC - // start_time = esp_timer_get_time(); - // int ret = afe_aec_process(aec_handle, aec_in_buffer, aec_out_buffer); - // end_time = esp_timer_get_time(); - // total_aec_time += (end_time - start_time); + // Run AEC + start_time = esp_timer_get_time(); + int ret = afe_aec_process(aec_handle, aec_in_buffer, aec_out_buffer); + end_time = esp_timer_get_time(); + total_aec_time += (end_time - start_time); - // if (ret < 0) { - // ESP_LOGE(LOG_TAG, "afe_aec_process failed: return value %d", ret); - // // Fall back - copy mic input directly - // memcpy(opus_accumulator + opus_accumulator_pos, mic_chunk_buffer, 256 * sizeof(int16_t)); - // } else { - // // Copy AEC output to accumulator - // memcpy(opus_accumulator + opus_accumulator_pos, aec_out_buffer, 256 * sizeof(int16_t)); - // } + if (ret < 0) { + ESP_LOGE(LOG_TAG, "afe_aec_process failed: return value %d", ret); + // Fall back - copy mic input directly + memcpy(opus_accumulator + opus_accumulator_pos, mic_chunk_buffer, 256 * sizeof(int16_t)); + } else { + // Copy AEC output to accumulator + memcpy(opus_accumulator + opus_accumulator_pos, aec_out_buffer, 256 * sizeof(int16_t)); + } // ---- - // Just copy mic data - memcpy(opus_accumulator + opus_accumulator_pos, mic_chunk_buffer, 256 * sizeof(int16_t)); + // DEBUG: Just copy mic data + // memcpy(opus_accumulator + opus_accumulator_pos, mic_chunk_buffer, 256 * sizeof(int16_t)); // Log successful reads more frequently for debugging ref_success_count++; @@ -510,7 +529,37 @@ void pipecat_send_audio(PeerConnection *peer_connection) { frame_count++; - // After sending the 3rd frame, check timing for next call + // Frame rate tracking + uint64_t now = esp_timer_get_time(); + if (first_frame_time == 0) { + first_frame_time = now; + frames_sent = 0; + } + + frames_sent++; + + // Check frame rate after minimum frames + if (frames_sent >= MIN_FRAMES_FOR_MEASUREMENT) { + uint64_t elapsed_us = now - first_frame_time; + float elapsed_seconds = elapsed_us / 1000000.0f; + current_frame_rate = frames_sent / elapsed_seconds; + + if (current_frame_rate < MIN_ACCEPTABLE_FRAME_RATE) { + ESP_LOGW(LOG_TAG, "Frame rate too low: %.1f fps (target: %.1f fps). Falling behind!", + current_frame_rate, TARGET_FRAME_RATE); + + // Set flag to skip AEC on next cycle + // (We'll use this instead of the old timing method) + } + + // Reset counters for next measurement period + first_frame_time = now; + frames_sent = 0; + } + + // Old timing check - replaced by frame rate monitoring + // Keeping the structure in case we want to re-enable with new logic + /* if (frame_count == 3) { uint64_t third_frame_sent_time = end_time; if (last_frame_send_time > 0) { @@ -522,10 +571,10 @@ void pipecat_send_audio(PeerConnection *peer_connection) { } } - // Remember when we sent the 4th frame if (frame_count == 4) { last_frame_send_time = end_time; } + */ // Shift remaining samples opus_accumulator_pos -= 320; @@ -552,8 +601,8 @@ void pipecat_send_audio(PeerConnection *peer_connection) { uint64_t avg_encode_time = total_encode_time / 50; uint64_t avg_send_time = total_send_time / 50; - ESP_LOGI(LOG_TAG, "Audio timing stats (avg over 50 cycles): wait=%llu us, read=%llu us, aec=%llu us, encode=%llu us, send=%llu us, aec_skips=%lu", - avg_wait_time, avg_read_time, avg_aec_time, avg_encode_time, avg_send_time, aec_skip_count); + ESP_LOGI(LOG_TAG, "Audio timing stats (avg over 50 cycles): wait=%llu us, read=%llu us, aec=%llu us, encode=%llu us, send=%llu us, aec_skips=%lu, frame_rate=%.1f fps", + avg_wait_time, avg_read_time, avg_aec_time, avg_encode_time, avg_send_time, aec_skip_count, current_frame_rate); // Reset counters total_wait_time = 0; @@ -562,4 +611,7 @@ void pipecat_send_audio(PeerConnection *peer_connection) { total_encode_time = 0; total_send_time = 0; } + + // Record end time for next call's gap measurement + last_call_end_time = esp_timer_get_time(); } From 931e7a20ac7b01814228116bcf8bc09265f24666 Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Mon, 7 Jul 2025 22:29:26 -0700 Subject: [PATCH 13/14] AEC is taking too long and we're not able to send 50 audio frames per second. We also have to buffer a couple of speaker data frames, and that may be too much delay for the AEC to do well with, anyway. --- esp32-s3-box-3/src/media.cpp | 123 +---------------------------------- 1 file changed, 3 insertions(+), 120 deletions(-) diff --git a/esp32-s3-box-3/src/media.cpp b/esp32-s3-box-3/src/media.cpp index 5625d65..15d19f3 100644 --- a/esp32-s3-box-3/src/media.cpp +++ b/esp32-s3-box-3/src/media.cpp @@ -97,29 +97,7 @@ void pipecat_init_audio_decoder() { } void pipecat_audio_decode(uint8_t *data, size_t size) { - // Log packet arrival timing - static uint64_t last_decode_time = 0; uint64_t now = esp_timer_get_time(); - if (last_decode_time > 0) { - uint64_t delta_ms = (now - last_decode_time) / 1000; - static int packet_count = 0; - packet_count++; - - // Log every packet for first 20, then every 10th - if (packet_count <= 20 || packet_count % 10 == 0) { - // Get current buffer state for correlation - size_t current_write_idx, current_read_idx, current_count; - portENTER_CRITICAL(&speaker_buffer_mutex); - current_write_idx = speaker_write_idx; - current_read_idx = speaker_read_idx; - current_count = speaker_buffer_count; - portEXIT_CRITICAL(&speaker_buffer_mutex); - - ESP_LOGI(LOG_TAG, "[ARRIVAL] pkt#%d: size=%d, delta=%llums, buffer=%d samples, w=%d r=%d", - packet_count, size, delta_ms, current_count, current_write_idx, current_read_idx); - } - } - last_decode_time = now; // Decode Opus frame to get the size (or use fixed size if predictable) int decoded_size = opus_decode(opus_decoder, data, size, decoder_buffer, PCM_BUFFER_SIZE, 0); @@ -160,15 +138,6 @@ void pipecat_audio_decode(uint8_t *data, size_t size) { time_since_last_read = 0; // First write or invalid timestamp } - // More frequent logging for debugging - static int write_count = 0; - write_count++; - if (write_count <= 20 || write_count % 10 == 0) { - // Log the actual count, not calculated value - ESP_LOGI(LOG_TAG, "[WRITE] cnt=%d: +%d samples, buffer=%d, since_read=%llums (w=%d r=%d)", - write_count, decoded_size, samples_in_buffer, time_since_last_read, - speaker_write_idx, speaker_read_idx); - } // THEN: Process for playback set_is_playing(decoder_buffer, decoded_size); @@ -287,16 +256,6 @@ void pipecat_send_audio(PeerConnection *peer_connection) { static uint64_t last_call_end_time = 0; uint64_t function_start_time = esp_timer_get_time(); - // Log time since last call - if (last_call_end_time > 0) { - uint64_t time_between_calls = (function_start_time - last_call_end_time) / 1000; // ms - static int between_calls_log_count = 0; - between_calls_log_count++; - if (between_calls_log_count <= 20 || between_calls_log_count % 10 == 0) { - ESP_LOGI(LOG_TAG, "[CALL_GAP] Time between calls: %llums (call #%d)", - time_between_calls, between_calls_log_count); - } - } // Wait for (960 samples = 3 Opus frames) in our speaker data reference buffer to handle codec burst pattern uint64_t wait_start = esp_timer_get_time(); @@ -325,20 +284,6 @@ void pipecat_send_audio(PeerConnection *peer_connection) { uint64_t wait_end = esp_timer_get_time(); total_wait_time += (wait_end - wait_start); - // Log initial buffer state - static int call_count = 0; - call_count++; - // Get consistent snapshot of buffer state - size_t log_write_idx, log_read_idx, log_count; - portENTER_CRITICAL(&speaker_buffer_mutex); - log_write_idx = speaker_write_idx; - log_read_idx = speaker_read_idx; - log_count = speaker_buffer_count; - portEXIT_CRITICAL(&speaker_buffer_mutex); - - ESP_LOGI(LOG_TAG, "[SEND_START] call#%d: initial=%d, wait=%lluus, iters=%d, count=%d, w=%d r=%d", - call_count, initial_samples, wait_end - wait_start, wait_iterations, - log_count, log_write_idx, log_read_idx); // Buffers for interleaved processing static int16_t opus_accumulator[640]; // Accumulate AEC output for Opus encoding @@ -357,18 +302,6 @@ void pipecat_send_audio(PeerConnection *peer_connection) { for (int chunk = 0; chunk < 5; chunk++) { uint64_t chunk_start_time = esp_timer_get_time(); // 1. Read 256 samples from microphone - // Log buffer state before read (for chunk 0 and 1) - if (chunk <= 1) { - size_t pre_read_write_idx, pre_read_read_idx, pre_read_count; - portENTER_CRITICAL(&speaker_buffer_mutex); - pre_read_write_idx = speaker_write_idx; - pre_read_read_idx = speaker_read_idx; - pre_read_count = speaker_buffer_count; - portEXIT_CRITICAL(&speaker_buffer_mutex); - - ESP_LOGI(LOG_TAG, "[PRE_READ] chunk=%d: buffer has %d samples (w=%d r=%d)", - chunk, pre_read_count, pre_read_write_idx, pre_read_read_idx); - } start_time = esp_timer_get_time(); if (esp_codec_dev_read(mic_codec_dev, (uint8_t*)mic_chunk_buffer, 256 * sizeof(int16_t)) != ESP_OK) { @@ -378,11 +311,6 @@ void pipecat_send_audio(PeerConnection *peer_connection) { end_time = esp_timer_get_time(); total_read_time += (end_time - start_time); - // Log if a long read happened (indicating actual blocking) - uint64_t read_duration = (end_time - start_time) / 1000; - if (read_duration > 10) { // More than 10ms suggests actual blocking - ESP_LOGI(LOG_TAG, "[BLOCKED_READ] chunk=%d: read took %llums", chunk, read_duration); - } // 2. Process with AEC or bypass if (aec_handle != NULL && aec_enabled && !(chunk == 4 && skip_last_aec)) { @@ -491,23 +419,10 @@ void pipecat_send_audio(PeerConnection *peer_connection) { // DEBUG: Just copy mic data // memcpy(opus_accumulator + opus_accumulator_pos, mic_chunk_buffer, 256 * sizeof(int16_t)); - // Log successful reads more frequently for debugging - ref_success_count++; - if (ref_success_count <= 10 || ref_success_count % 20 == 0) { - uint64_t chunk_duration = (esp_timer_get_time() - chunk_start_time) / 1000; - ESP_LOGI(LOG_TAG, "[READ_OK] chunk=%d, got %d samples, chunk took %llums", - chunk, aec_chunk_size, chunk_duration); - } } else { // AEC disabled or skipped - copy mic input directly memcpy(opus_accumulator + opus_accumulator_pos, mic_chunk_buffer, 256 * sizeof(int16_t)); - if (chunk == 4 && skip_last_aec) { - static int skip_log_count = 0; - if (++skip_log_count % 10 == 0) { - ESP_LOGI(LOG_TAG, "Skipped AEC on 5th chunk to catch up, count: %d", skip_log_count); - } - } } opus_accumulator_pos += 256; @@ -544,38 +459,15 @@ void pipecat_send_audio(PeerConnection *peer_connection) { float elapsed_seconds = elapsed_us / 1000000.0f; current_frame_rate = frames_sent / elapsed_seconds; - if (current_frame_rate < MIN_ACCEPTABLE_FRAME_RATE) { - ESP_LOGW(LOG_TAG, "Frame rate too low: %.1f fps (target: %.1f fps). Falling behind!", - current_frame_rate, TARGET_FRAME_RATE); - - // Set flag to skip AEC on next cycle - // (We'll use this instead of the old timing method) + if (current_frame_rate < MIN_ACCEPTABLE_FRAME_RATE) { + // We could try to catch up here, somehow. But not sure what makes sense. } // Reset counters for next measurement period first_frame_time = now; frames_sent = 0; } - - // Old timing check - replaced by frame rate monitoring - // Keeping the structure in case we want to re-enable with new logic - /* - if (frame_count == 3) { - uint64_t third_frame_sent_time = end_time; - if (last_frame_send_time > 0) { - uint64_t elapsed = (third_frame_sent_time - last_frame_send_time) / 1000; // ms - if (elapsed > 62) { - skip_last_aec = true; - aec_skip_count++; - } - } - } - - if (frame_count == 4) { - last_frame_send_time = end_time; - } - */ - + // Shift remaining samples opus_accumulator_pos -= 320; if (opus_accumulator_pos > 0) { @@ -585,13 +477,6 @@ void pipecat_send_audio(PeerConnection *peer_connection) { } // Update statistics - // Log loop timing - uint64_t loop_duration = (esp_timer_get_time() - loop_start_time) / 1000; - static int timing_log_count = 0; - if (++timing_log_count <= 10 || timing_log_count % 20 == 0) { - ESP_LOGI(LOG_TAG, "[LOOP_TIME] call#%d: full loop took %llums (expect ~80ms for 5x16ms reads)", - call_count, loop_duration); - } cycle_count++; if (cycle_count % 50 == 0) { @@ -612,6 +497,4 @@ void pipecat_send_audio(PeerConnection *peer_connection) { total_send_time = 0; } - // Record end time for next call's gap measurement - last_call_end_time = esp_timer_get_time(); } From 4c533cb3a92f816f8d6e45095322c70983054fe8 Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Tue, 8 Jul 2025 15:15:49 -0700 Subject: [PATCH 14/14] change filter length --- esp32-s3-box-3/src/media.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/esp32-s3-box-3/src/media.cpp b/esp32-s3-box-3/src/media.cpp index 15d19f3..989d2af 100644 --- a/esp32-s3-box-3/src/media.cpp +++ b/esp32-s3-box-3/src/media.cpp @@ -213,8 +213,10 @@ void pipecat_init_aec() { // Speaker circular buffer is now statically allocated ESP_LOGI(LOG_TAG, "Speaker circular buffer ready, size=960 samples"); - // Create AEC with basic parameters (filter_length=4 recommended for ESP32S3) - aec_handle = afe_aec_create("MR", 4, AFE_TYPE_VC, AFE_MODE_LOW_COST); + // Create AEC with basic parameters. + // Filter_length=4 recommended for ESP32S3, but filter_length=2 and AFE_MODE_LOW_COST seem to be a local + // maximum in terms of quality with the current code we have here. + aec_handle = afe_aec_create("MR", 2, AFE_TYPE_VC, AFE_MODE_LOW_COST); if (aec_handle == NULL) { ESP_LOGE(LOG_TAG, "Failed to create AEC"); return;