From 35be7897a9afbd23781dfb686e37de6c0e09a91a Mon Sep 17 00:00:00 2001
From: Kwindla Hultman Kramer <kwindla@gmail.com>
Date: Sat, 5 Jul 2025 18:16:05 -0700
Subject: [PATCH 01/14] Memory optimizations. Allocate minimal placeholder only
 for h264 (CONFIG_MAX_NALU_SIZE); Move LVGL graphics working memory out of
 internal memory to PSRAM.

---
 esp32-s3-box-3/CMakeLists.txt     |  5 +++++
 esp32-s3-box-3/sdkconfig.defaults |  5 +++++
 esp32-s3-box-3/src/main.cpp       | 14 +++++++-------
 3 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/esp32-s3-box-3/CMakeLists.txt b/esp32-s3-box-3/CMakeLists.txt
index fbba559..6195c59 100644
--- a/esp32-s3-box-3/CMakeLists.txt
+++ b/esp32-s3-box-3/CMakeLists.txt
@@ -7,6 +7,11 @@ if(NOT IDF_TARGET STREQUAL linux)
 
   add_compile_definitions(WIFI_SSID="$ENV{WIFI_SSID}")
   add_compile_definitions(WIFI_PASSWORD="$ENV{WIFI_PASSWORD}")
+
+  # We are not using video in this app, so set the max NALU size to very small
+  if(NOT DEFINED ENV{CONFIG_MAX_NALU_SIZE})
+    add_compile_definitions(CONFIG_MAX_NALU_SIZE=32)
+  endif()
 endif()
 
 if(NOT DEFINED ENV{PIPECAT_SMALLWEBRTC_URL})
diff --git a/esp32-s3-box-3/sdkconfig.defaults b/esp32-s3-box-3/sdkconfig.defaults
index 236be55..99300dd 100644
--- a/esp32-s3-box-3/sdkconfig.defaults
+++ b/esp32-s3-box-3/sdkconfig.defaults
@@ -33,3 +33,8 @@ CONFIG_COMPILER_OPTIMIZATION_PERF=y
 CONFIG_COMPILER_OPTIMIZATION_ASSERTIONS_DISABLE=y
 
 CONFIG_CODEC_I2C_BACKWARD_COMPATIBLE=n
+
+# Use C library memory allocator for LVGL to save 64KB of BSS
+# This removes the 64KB work_mem_int buffer from BSS
+CONFIG_LV_USE_BUILTIN_MALLOC=n
+CONFIG_LV_USE_CLIB_MALLOC=y
diff --git a/esp32-s3-box-3/src/main.cpp b/esp32-s3-box-3/src/main.cpp
index 8bc08f1..4ac44ea 100644
--- a/esp32-s3-box-3/src/main.cpp
+++ b/esp32-s3-box-3/src/main.cpp
@@ -1,5 +1,6 @@
 #include "main.h"
 
+#include <inttypes.h>
 #include <esp_event.h>
 #include <esp_log.h>
 #include <peer.h>
@@ -8,15 +9,14 @@
 #include "nvs_flash.h"
 
 extern "C" void app_main(void) {
-  esp_err_t ret = nvs_flash_init();
-  if (ret == ESP_ERR_NVS_NO_FREE_PAGES ||
-      ret == ESP_ERR_NVS_NEW_VERSION_FOUND) {
-    ESP_ERROR_CHECK(nvs_flash_erase());
-    ret = nvs_flash_init();
-  }
-  ESP_ERROR_CHECK(ret);
 
+  ESP_LOGI(LOG_TAG, "[APP] Startup..");
+  ESP_LOGI(LOG_TAG, "[APP] IDF version: %s", esp_get_idf_version());
+  heap_caps_print_heap_info(MALLOC_CAP_INTERNAL);
+
+  ESP_ERROR_CHECK(nvs_flash_init());
   ESP_ERROR_CHECK(esp_event_loop_create_default());
+
   pipecat_init_screen();
   peer_init();
   pipecat_init_audio_capture();

From 36ab58359b91872cfbed7688691cb80ac0dd855f Mon Sep 17 00:00:00 2001
From: Kwindla Hultman Kramer <kwindla@gmail.com>
Date: Sat, 5 Jul 2025 20:49:24 -0700
Subject: [PATCH 02/14] temp commit; moving tasks around

---
 esp32-s3-box-3/src/main.cpp   | 10 ++++++++-
 esp32-s3-box-3/src/main.h     |  2 ++
 esp32-s3-box-3/src/webrtc.cpp | 42 +++++++++++++++++++++++++++++------
 3 files changed, 46 insertions(+), 8 deletions(-)

diff --git a/esp32-s3-box-3/src/main.cpp b/esp32-s3-box-3/src/main.cpp
index 4ac44ea..fa69c17 100644
--- a/esp32-s3-box-3/src/main.cpp
+++ b/esp32-s3-box-3/src/main.cpp
@@ -8,6 +8,9 @@
 #ifndef LINUX_BUILD
 #include "nvs_flash.h"
 
+// All operations that modify the webrtc state should use this semaphore to ensure thread safety
+SemaphoreHandle_t webrtcSemaphore = NULL;
+
 extern "C" void app_main(void) {
 
   ESP_LOGI(LOG_TAG, "[APP] Startup..");
@@ -17,6 +20,8 @@ extern "C" void app_main(void) {
   ESP_ERROR_CHECK(nvs_flash_init());
   ESP_ERROR_CHECK(esp_event_loop_create_default());
 
+  webrtcSemaphore = xSemaphoreCreateMutex();
+
   pipecat_init_screen();
   peer_init();
   pipecat_init_audio_capture();
@@ -26,8 +31,10 @@ extern "C" void app_main(void) {
 
   pipecat_screen_system_log("Pipecat ESP32 client initialized\n");
 
+  pipecat_webrtc_run_task();
+
   while (1) {
-    pipecat_webrtc_loop();
+    // Add some stats printout to the screen here
     vTaskDelay(pdMS_TO_TICKS(TICK_INTERVAL));
   }
 }
@@ -38,6 +45,7 @@ int main(void) {
   pipecat_webrtc();
 
   while (1) {
+    // Todo: test/fix linux build
     pipecat_webrtc_loop();
     vTaskDelay(pdMS_TO_TICKS(TICK_INTERVAL));
   }
diff --git a/esp32-s3-box-3/src/main.h b/esp32-s3-box-3/src/main.h
index 389d7ee..a5e6478 100644
--- a/esp32-s3-box-3/src/main.h
+++ b/esp32-s3-box-3/src/main.h
@@ -18,12 +18,14 @@ extern void pipecat_audio_decode(uint8_t *data, size_t size);
 // WebRTC / Signalling
 extern void pipecat_init_webrtc();
 extern void pipecat_webrtc_loop();
+extern void pipecat_webrtc_run_task();
 extern void pipecat_http_request(char *offer, char *answer);
 
 // RTVI
 typedef struct {
   void (*on_bot_started_speaking)();
   void (*on_bot_stopped_speaking)();
+
   void (*on_bot_tts_text)(const char *text);
 } rtvi_callbacks_t;
 
diff --git a/esp32-s3-box-3/src/webrtc.cpp b/esp32-s3-box-3/src/webrtc.cpp
index b777b81..8602309 100644
--- a/esp32-s3-box-3/src/webrtc.cpp
+++ b/esp32-s3-box-3/src/webrtc.cpp
@@ -9,10 +9,15 @@
 
 #include "main.h"
 
+extern SemaphoreHandle_t webrtcSemaphore;
+
 static PeerConnection *peer_connection = NULL;
 
 #ifndef LINUX_BUILD
-StaticTask_t task_buffer;
+
+static TaskHandle_t xPcTaskHandle = NULL;
+static TaskHandle_t xSendAudioTaskHandle = NULL;
+
 void pipecat_send_audio_task(void *user_data) {
   pipecat_init_audio_encoder();
 
@@ -51,13 +56,10 @@ static void pipecat_onconnectionstatechange_task(PeerConnectionState state,
 #ifndef LINUX_BUILD
     esp_restart();
 #endif
-  } else if (state == PEER_CONNECTION_CONNECTED) {
+  } else if (state == PEER_CONNECTION_COMPLETED) {
 #ifndef LINUX_BUILD
-    StackType_t *stack_memory = (StackType_t *)heap_caps_malloc(
-        30000 * sizeof(StackType_t), MALLOC_CAP_SPIRAM);
-    xTaskCreateStaticPinnedToCore(pipecat_send_audio_task, "audio_publisher",
-                                  30000, NULL, 7, stack_memory, &task_buffer,
-                                  0);
+    // Todo: move this to a separate init block
+    xTaskCreatePinnedToCore(pipecat_send_audio_task, "audio_publisher", 24576, NULL, 7, &xSendAudioTaskHandle, 0);
     pipecat_init_rtvi(peer_connection, &pipecat_rtvi_callbacks);
 #endif
   }
@@ -109,3 +111,29 @@ void pipecat_init_webrtc() {
 void pipecat_webrtc_loop() {
   peer_connection_loop(peer_connection);
 }
+
+
+void peer_connection_task(void* arg) {
+  ESP_LOGI(LOG_TAG, "peer_connection_task started");
+
+  while (peer_connection == NULL) {
+    ESP_LOGI(LOG_TAG, "peer_connection_task waiting for peer_connection");
+    vTaskDelay(pdMS_TO_TICKS(1000));
+  }
+
+
+  for (;;) {
+    if (xSemaphoreTake(webrtcSemaphore, portMAX_DELAY)) {
+      peer_connection_loop(peer_connection);
+      xSemaphoreGive(webrtcSemaphore);
+    }
+
+    vTaskDelay(pdMS_TO_TICKS(1));
+  }
+}
+
+void pipecat_webrtc_run_task() {
+  ESP_LOGI(LOG_TAG, "peer_connection_task starting ...");
+
+  xTaskCreatePinnedToCore(peer_connection_task, "peer_connection", 16384, NULL, 5, &xPcTaskHandle, 1);
+}

From 7af51e9b20ddfb9f181f5c8ff4b234c7742473ee Mon Sep 17 00:00:00 2001
From: Kwindla Hultman Kramer <kwindla@gmail.com>
Date: Sun, 6 Jul 2025 13:01:07 -0700
Subject: [PATCH 03/14] more work on memory fragmentation

---
 esp32-s3-box-3/sdkconfig.defaults |  2 ++
 esp32-s3-box-3/src/main.cpp       | 13 +++++++++++++
 esp32-s3-box-3/src/screen.cpp     |  1 +
 esp32-s3-box-3/src/webrtc.cpp     |  6 +++++-
 4 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/esp32-s3-box-3/sdkconfig.defaults b/esp32-s3-box-3/sdkconfig.defaults
index 99300dd..1a5c07d 100644
--- a/esp32-s3-box-3/sdkconfig.defaults
+++ b/esp32-s3-box-3/sdkconfig.defaults
@@ -38,3 +38,5 @@ CONFIG_CODEC_I2C_BACKWARD_COMPATIBLE=n
 # This removes the 64KB work_mem_int buffer from BSS
 CONFIG_LV_USE_BUILTIN_MALLOC=n
 CONFIG_LV_USE_CLIB_MALLOC=y
+CONFIG_BSP_LCD_DRAW_BUF_HEIGHT=10
+CONFIG_BSP_LCD_DRAW_BUF_DOUBLE=n
diff --git a/esp32-s3-box-3/src/main.cpp b/esp32-s3-box-3/src/main.cpp
index fa69c17..55ae6e7 100644
--- a/esp32-s3-box-3/src/main.cpp
+++ b/esp32-s3-box-3/src/main.cpp
@@ -23,14 +23,27 @@ extern "C" void app_main(void) {
   webrtcSemaphore = xSemaphoreCreateMutex();
 
   pipecat_init_screen();
+  heap_caps_print_heap_info(MALLOC_CAP_INTERNAL);
+
   peer_init();
+  heap_caps_print_heap_info(MALLOC_CAP_INTERNAL);
+
   pipecat_init_audio_capture();
+  heap_caps_print_heap_info(MALLOC_CAP_INTERNAL);
+
   pipecat_init_audio_decoder();
+  heap_caps_print_heap_info(MALLOC_CAP_INTERNAL);
+
   pipecat_init_wifi();
+  heap_caps_print_heap_info(MALLOC_CAP_INTERNAL);
+
   pipecat_init_webrtc();
+  heap_caps_print_heap_info(MALLOC_CAP_INTERNAL);
 
+  ESP_LOGI(LOG_TAG, "Pipecat ESP32 client initialized");
   pipecat_screen_system_log("Pipecat ESP32 client initialized\n");
 
+  ESP_LOGI(LOG_TAG, "Starting webrtc task");
   pipecat_webrtc_run_task();
 
   while (1) {
diff --git a/esp32-s3-box-3/src/screen.cpp b/esp32-s3-box-3/src/screen.cpp
index bbd1d9b..b4b7f47 100644
--- a/esp32-s3-box-3/src/screen.cpp
+++ b/esp32-s3-box-3/src/screen.cpp
@@ -77,6 +77,7 @@ static void screen_task(void *pvParameter) {
 }
 
 void pipecat_init_screen() {
+
   bsp_display_start();
 
   bsp_display_backlight_on();
diff --git a/esp32-s3-box-3/src/webrtc.cpp b/esp32-s3-box-3/src/webrtc.cpp
index 8602309..937e166 100644
--- a/esp32-s3-box-3/src/webrtc.cpp
+++ b/esp32-s3-box-3/src/webrtc.cpp
@@ -58,9 +58,13 @@ static void pipecat_onconnectionstatechange_task(PeerConnectionState state,
 #endif
   } else if (state == PEER_CONNECTION_COMPLETED) {
 #ifndef LINUX_BUILD
+
+    ESP_LOGI(LOG_TAG, "Creating send audio task");
     // Todo: move this to a separate init block
     xTaskCreatePinnedToCore(pipecat_send_audio_task, "audio_publisher", 24576, NULL, 7, &xSendAudioTaskHandle, 0);
-    pipecat_init_rtvi(peer_connection, &pipecat_rtvi_callbacks);
+    heap_caps_print_heap_info(MALLOC_CAP_INTERNAL);
+
+    // pipecat_init_rtvi(peer_connection, &pipecat_rtvi_callbacks);
 #endif
   }
 }

From 10b52519a788c095a34adab1a639628e2cf9b580 Mon Sep 17 00:00:00 2001
From: Kwindla Hultman Kramer <kwindla@gmail.com>
Date: Sun, 6 Jul 2025 16:24:30 -0700
Subject: [PATCH 04/14] aec working with dummy reference data

---
 .gitmodules                       |   3 +
 esp32-s3-box-3/CMakeLists.txt     |   2 +-
 esp32-s3-box-3/dependencies.lock  |  66 ++++++++--
 esp32-s3-box-3/esp-adf            |   1 +
 esp32-s3-box-3/partitions.csv     |   2 +-
 esp32-s3-box-3/sdkconfig.defaults |   3 +
 esp32-s3-box-3/src/CMakeLists.txt |   2 +-
 esp32-s3-box-3/src/main.cpp       |   3 +
 esp32-s3-box-3/src/main.h         |   1 +
 esp32-s3-box-3/src/media.cpp      | 201 +++++++++++++++++++++++++++++-
 esp32-s3-box-3/src/webrtc.cpp     |   9 +-
 11 files changed, 273 insertions(+), 20 deletions(-)
 create mode 160000 esp32-s3-box-3/esp-adf

diff --git a/.gitmodules b/.gitmodules
index c6602c8..65f0d07 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -10,3 +10,6 @@
 [submodule "esp32-s3-box-3/components/esp-protocols"]
 	path = esp32-s3-box-3/components/esp-protocols
 	url = https://github.com/espressif/esp-protocols.git
+[submodule "esp32-s3-box-3/esp-adf"]
+	path = esp32-s3-box-3/esp-adf
+	url = https://github.com/espressif/esp-adf.git
diff --git a/esp32-s3-box-3/CMakeLists.txt b/esp32-s3-box-3/CMakeLists.txt
index 6195c59..ae931c7 100644
--- a/esp32-s3-box-3/CMakeLists.txt
+++ b/esp32-s3-box-3/CMakeLists.txt
@@ -25,7 +25,7 @@ endif()
 add_compile_definitions(PIPECAT_SMALLWEBRTC_URL="$ENV{PIPECAT_SMALLWEBRTC_URL}")
 
 set(COMPONENTS src)
-set(EXTRA_COMPONENT_DIRS "src" "components/srtp" "components/peer" "components/esp-libopus")
+set(EXTRA_COMPONENT_DIRS "src" "components/srtp" "components/peer" "components/esp-libopus" "esp-adf/components")
 
 if(IDF_TARGET STREQUAL linux)
   add_compile_definitions(LINUX_BUILD=1)
diff --git a/esp32-s3-box-3/dependencies.lock b/esp32-s3-box-3/dependencies.lock
index e6c6da2..d533402 100644
--- a/esp32-s3-box-3/dependencies.lock
+++ b/esp32-s3-box-3/dependencies.lock
@@ -1,4 +1,10 @@
 dependencies:
+  esp_codec_dev:
+    dependencies: []
+    source:
+      path: /home/khkramer/src/pipecat-esp32/esp32-s3-box-3/esp-adf/components/esp_codec_dev
+      type: local
+    version: 1.3.5
   espressif/button:
     component_hash: f53face2ab21fa0ffaf4cf0f6e513d393f56df6586bb2ad1146120f03f19ee05
     dependencies:
@@ -63,16 +69,16 @@ dependencies:
     targets:
     - esp32s3
     version: 3.0.0~1
-  espressif/esp_codec_dev:
-    component_hash: c71e2d13dad6fc41561590dd88dbc45c79e3f4ef48d5ee3575c60e8b6c8e79d5
+  espressif/esp-dsp:
+    component_hash: 619639efc18cfa361a9e423739b9b0ffc14991effc6c027f955c2f2c3bf1754b
     dependencies:
     - name: idf
       require: private
-      version: '>=4.0'
+      version: '>=4.2'
     source:
-      registry_url: https://components.espressif.com
+      registry_url: https://components.espressif.com/
       type: service
-    version: 1.3.5
+    version: 1.6.0
   espressif/esp_lcd_ili9341:
     component_hash: 31f1b793aa2110dd2ae071c21ccbff0a4eb20d9a4ee40b6294c0dc0ad9552c4e
     dependencies:
@@ -84,7 +90,7 @@ dependencies:
       require: private
       version: 0.*
     source:
-      registry_url: https://components.espressif.com
+      registry_url: https://components.espressif.com/
       type: service
     version: 1.2.0
   espressif/esp_lcd_touch:
@@ -139,6 +145,16 @@ dependencies:
       registry_url: https://components.espressif.com
       type: service
     version: 2.6.0
+  espressif/esp_websocket_client:
+    component_hash: f77326f0e1c38da4e9c97e17c5d649b0dd13027f2645e720e48db269638fd622
+    dependencies:
+    - name: idf
+      require: private
+      version: '>=5.0'
+    source:
+      registry_url: https://components.espressif.com/
+      type: service
+    version: 1.4.0
   espressif/icm42670:
     component_hash: 28b56e174f75c70037f5208aaed6c3789f0d243500d975519584bf9dc8c0836c
     dependencies:
@@ -149,6 +165,36 @@ dependencies:
       registry_url: https://components.espressif.com
       type: service
     version: 2.0.2
+  espressif/jsmn:
+    component_hash: d80350c41bbaa827c98a25b6072df00884e72f54885996fab4a4f0aebce6b6c3
+    dependencies:
+    - name: idf
+      require: private
+      version: '>=4.3'
+    source:
+      registry_url: https://components.espressif.com/
+      type: service
+    version: 1.1.0
+  espressif/nghttp:
+    component_hash: 32e513409ddce0000990ec74b0711f535462726236d72578cd70498e6bd66dc3
+    dependencies:
+    - name: idf
+      require: private
+      version: '>=5.0'
+    source:
+      registry_url: https://components.espressif.com/
+      type: service
+    version: 1.65.0
+  espressif/zlib:
+    component_hash: d901723af51f13fc8e5824f39f32239c847956e8fd951a05266588dc5cfbb9ae
+    dependencies:
+    - name: idf
+      require: private
+      version: '>=4.4'
+    source:
+      registry_url: https://components.espressif.com/
+      type: service
+    version: 1.3.1
   idf:
     source:
       type: idf
@@ -162,7 +208,13 @@ dependencies:
     version: 9.3.0
 direct_dependencies:
 - espressif/esp-box-3
+- espressif/esp-dsp
+- espressif/esp_lcd_ili9341
+- espressif/esp_websocket_client
+- espressif/jsmn
+- espressif/nghttp
+- espressif/zlib
 - idf
-manifest_hash: 55f6a702498de8e87201da504a38c5c958e2094ec832c4f8d6d368369d5d0625
+manifest_hash: 1891dc87058c6eec7aa77f286a1967ebe7c293709db8da5b2f1e79d2e45f57ef
 target: esp32s3
 version: 2.0.0
diff --git a/esp32-s3-box-3/esp-adf b/esp32-s3-box-3/esp-adf
new file mode 160000
index 0000000..1e839af
--- /dev/null
+++ b/esp32-s3-box-3/esp-adf
@@ -0,0 +1 @@
+Subproject commit 1e839afb5a5166aad6028ac2796272f32454dced
diff --git a/esp32-s3-box-3/partitions.csv b/esp32-s3-box-3/partitions.csv
index 03d5d13..6bfa8e4 100644
--- a/esp32-s3-box-3/partitions.csv
+++ b/esp32-s3-box-3/partitions.csv
@@ -2,5 +2,5 @@
 # Name,   Type, SubType, Offset,  Size, Flags
 nvs,      data, nvs,     0x9000,  0x6000,
 phy_init, data, phy,     0xf000,  0x1000,
-factory,  app,  factory, 0x10000, 0x180000,
+factory,  app,  factory, 0x10000, 0x200000,
 
diff --git a/esp32-s3-box-3/sdkconfig.defaults b/esp32-s3-box-3/sdkconfig.defaults
index 1a5c07d..9abc65a 100644
--- a/esp32-s3-box-3/sdkconfig.defaults
+++ b/esp32-s3-box-3/sdkconfig.defaults
@@ -18,6 +18,9 @@ CONFIG_ESP_MAIN_TASK_STACK_SIZE=16384
 # Defaults to partitions.csv
 CONFIG_PARTITION_TABLE_CUSTOM=y
 
+# Set flash size to 16MB for ESP32-S3-Box-3
+CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y
+
 # Set highest CPU Freq
 CONFIG_ESP_DEFAULT_CPU_FREQ_MHZ_240=y
 
diff --git a/esp32-s3-box-3/src/CMakeLists.txt b/esp32-s3-box-3/src/CMakeLists.txt
index da4e1e7..cc4ab61 100644
--- a/esp32-s3-box-3/src/CMakeLists.txt
+++ b/esp32-s3-box-3/src/CMakeLists.txt
@@ -7,7 +7,7 @@ if(IDF_TARGET STREQUAL linux)
 else()
 	idf_component_register(
 		SRCS ${COMMON_SRC} "wifi.cpp" "media.cpp" "rtvi.cpp" "rtvi_callbacks.cpp" "screen.cpp"
-		REQUIRES driver esp_wifi nvs_flash peer esp_psram esp-libopus esp_http_client json lvgl)
+		REQUIRES driver esp_wifi nvs_flash peer esp_psram esp-libopus esp_http_client json lvgl esp-sr)
 endif()
 
 idf_component_get_property(lib peer COMPONENT_LIB)
diff --git a/esp32-s3-box-3/src/main.cpp b/esp32-s3-box-3/src/main.cpp
index 55ae6e7..2d18090 100644
--- a/esp32-s3-box-3/src/main.cpp
+++ b/esp32-s3-box-3/src/main.cpp
@@ -34,6 +34,9 @@ extern "C" void app_main(void) {
   pipecat_init_audio_decoder();
   heap_caps_print_heap_info(MALLOC_CAP_INTERNAL);
 
+  pipecat_init_aec();
+  heap_caps_print_heap_info(MALLOC_CAP_INTERNAL);
+
   pipecat_init_wifi();
   heap_caps_print_heap_info(MALLOC_CAP_INTERNAL);
 
diff --git a/esp32-s3-box-3/src/main.h b/esp32-s3-box-3/src/main.h
index a5e6478..5797b52 100644
--- a/esp32-s3-box-3/src/main.h
+++ b/esp32-s3-box-3/src/main.h
@@ -14,6 +14,7 @@ extern void pipecat_init_audio_decoder();
 extern void pipecat_init_audio_encoder();
 extern void pipecat_send_audio(PeerConnection *peer_connection);
 extern void pipecat_audio_decode(uint8_t *data, size_t size);
+extern void pipecat_init_aec();
 
 // WebRTC / Signalling
 extern void pipecat_init_webrtc();
diff --git a/esp32-s3-box-3/src/media.cpp b/esp32-s3-box-3/src/media.cpp
index ba95185..f628845 100644
--- a/esp32-s3-box-3/src/media.cpp
+++ b/esp32-s3-box-3/src/media.cpp
@@ -3,18 +3,29 @@
 
 #include <atomic>
 #include <cstring>
+#include <esp_timer.h>
 
 #include "esp_log.h"
 #include "main.h"
 
+#include "esp_afe_aec.h"
+
+
 #define SAMPLE_RATE (16000)
 
 #define OPUS_BUFFER_SIZE 1276  // 1276 bytes is recommended by opus_encode
-#define PCM_BUFFER_SIZE 640
+#define AEC_CHUNK_SIZE 256  // AEC processes 256 samples at a time for 16kHz
+#define OPUS_FRAME_SIZE 320  // Opus frame size in samples
+#define COMMON_BUFFER_SAMPLES 1280  // LCM(256, 320) = 1280 samples
+#define AEC_CHUNKS_TO_BATCH (COMMON_BUFFER_SAMPLES / AEC_CHUNK_SIZE)  // 5 chunks
+#define OPUS_FRAMES_TO_BATCH (COMMON_BUFFER_SAMPLES / OPUS_FRAME_SIZE)  // 4 frames
+#define PCM_BUFFER_SIZE (COMMON_BUFFER_SAMPLES * sizeof(int16_t))  // 2560 bytes
 
 #define OPUS_ENCODER_BITRATE 30000
 #define OPUS_ENCODER_COMPLEXITY 0
 
+
+
 std::atomic<bool> is_playing = false;
 void set_is_playing(int16_t *in_buf, size_t in_samples) {
   bool any_set = false;
@@ -78,6 +89,16 @@ OpusEncoder *opus_encoder = NULL;
 uint8_t *encoder_output_buffer = NULL;
 uint8_t *read_buffer = NULL;
 
+// Timing statistics
+static uint64_t total_read_time = 0;
+static uint64_t total_aec_time = 0;
+static uint64_t total_encode_time = 0;
+static uint64_t total_send_time = 0;
+static uint32_t cycle_count = 0;
+
+// AEC processed buffer
+static int16_t *processed_buffer = NULL;
+
 void pipecat_init_audio_encoder() {
   int encoder_error;
   opus_encoder = opus_encoder_create(SAMPLE_RATE, 1, OPUS_APPLICATION_VOIP,
@@ -100,22 +121,190 @@ void pipecat_init_audio_encoder() {
   read_buffer =
       (uint8_t *)heap_caps_malloc(PCM_BUFFER_SIZE, MALLOC_CAP_DEFAULT);
   encoder_output_buffer = (uint8_t *)malloc(OPUS_BUFFER_SIZE);
+  processed_buffer = (int16_t *)heap_caps_malloc(PCM_BUFFER_SIZE, MALLOC_CAP_DEFAULT);
 }
 
+
+typedef struct {
+  int        task_stack;     /*!< Task stack size */
+  int        task_prio;      /*!< Task peroid */
+  int        task_core;      /*!< The core that task to be created */
+  bool       debug_aec;      /*!< debug AEC input data */
+  bool       stack_in_ext;   /*!< Try to allocate stack in external memory */
+  afe_type_t type;           /*!< The type of afe， AFE_TYPE_SR , AFE_TYPE_VC , AFE_TYPE_VC_8K */
+  afe_mode_t mode;           /*!< The mode of afe， AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF */
+  int        filter_length;  /*!< The filter length of aec */
+  char      *input_format;   /*!< The input format is as follows: For example, 'MR', 'RMNM'.
+                                  'M' means data from microphone, 'R' means data from reference data, 'N' means no data.. */
+} aec_stream_cfg_t;
+
+static int16_t *aec_in_buffer = NULL;
+static int16_t *aec_out_buffer = NULL;
+static afe_aec_handle_t *aec_handle = NULL;
+
+void pipecat_init_aec() {
+  // Try a simpler approach - create AEC without the task configuration struct
+  ESP_LOGI(LOG_TAG, "Initializing AEC with simple configuration");
+  
+  // Create AEC with basic parameters (filter_length=4 recommended for ESP32S3)
+  aec_handle = afe_aec_create("MR", 4, AFE_TYPE_VC, AFE_MODE_LOW_COST);
+  if (aec_handle == NULL) {
+    ESP_LOGE(LOG_TAG, "Failed to create AEC");
+    return;
+  }
+  
+  ESP_LOGI(LOG_TAG, "AEC created successfully");
+  
+  int frame_size = afe_aec_get_chunksize(aec_handle);
+  int input_size = frame_size * aec_handle->pcm_config.total_ch_num * sizeof(int16_t);
+  int output_size = frame_size * sizeof(int16_t);
+
+  // Allocate buffers in PSRAM instead of internal memory
+  aec_in_buffer = (int16_t *)heap_caps_aligned_calloc(16, 1, input_size, MALLOC_CAP_SPIRAM);
+  aec_out_buffer = (int16_t *)heap_caps_aligned_calloc(16, 1, output_size, MALLOC_CAP_SPIRAM);
+  
+  if (aec_in_buffer == NULL || aec_out_buffer == NULL) {
+    ESP_LOGE(LOG_TAG, "Failed to allocate AEC buffers");
+    // Try internal memory as fallback
+    if (aec_in_buffer) heap_caps_free(aec_in_buffer);
+    if (aec_out_buffer) heap_caps_free(aec_out_buffer);
+    
+    aec_in_buffer = (int16_t *)heap_caps_aligned_calloc(16, 1, input_size, MALLOC_CAP_INTERNAL);
+    aec_out_buffer = (int16_t *)heap_caps_aligned_calloc(16, 1, output_size, MALLOC_CAP_INTERNAL);
+    
+    if (aec_in_buffer == NULL || aec_out_buffer == NULL) {
+      ESP_LOGE(LOG_TAG, "Failed to allocate AEC buffers even in internal memory");
+      return;
+    }
+    ESP_LOGW(LOG_TAG, "AEC buffers allocated in internal memory");
+  }
+  
+  ESP_LOGI(LOG_TAG, "AEC frame size: %d, input size: %d, output size: %d", frame_size, input_size, output_size);
+  ESP_LOGI(LOG_TAG, "AEC buffers allocated: in=%p, out=%p", aec_in_buffer, aec_out_buffer);
+}
+
+
 void pipecat_send_audio(PeerConnection *peer_connection) {
+  uint64_t start_time, end_time;
+  
+  // Check if AEC is initialized
+  if (aec_handle == NULL) {
+    // AEC disabled, bypass processing
+    // ESP_LOGW(LOG_TAG, "AEC not initialized, bypassing");
+  }
+  
+  // Measure esp_codec_dev_read time
+  start_time = esp_timer_get_time();
   if (esp_codec_dev_read(mic_codec_dev, read_buffer, PCM_BUFFER_SIZE) !=
       ESP_OK) {
     printf("esp_codec_dev_read failed");
     return;
   }
+  end_time = esp_timer_get_time();
+  total_read_time += (end_time - start_time);
 
   if (is_playing) {
     memset(read_buffer, 0, PCM_BUFFER_SIZE);
   }
 
-  auto encoded_size = opus_encode(opus_encoder, (const opus_int16 *)read_buffer,
-                                  PCM_BUFFER_SIZE / sizeof(uint16_t),
-                                  encoder_output_buffer, OPUS_BUFFER_SIZE);
-  peer_connection_send_audio(peer_connection, encoder_output_buffer,
-                             encoded_size);
+  // AEC processing (skip if disabled)
+  if (aec_handle != NULL) {
+    // Get AEC chunk size (should be 256 samples for 16kHz)
+    int aec_chunk_size = afe_aec_get_chunksize(aec_handle);
+    int total_samples = PCM_BUFFER_SIZE / sizeof(uint16_t);  // 640 samples
+    int num_chunks = total_samples / aec_chunk_size;  // Should be 2.5, so we'll process 2 chunks
+    
+    // Debug logging
+    static bool first_run = true;
+    if (first_run) {
+      ESP_LOGI(LOG_TAG, "AEC debug: chunk_size=%d, total_samples=%d, num_chunks=%d", 
+               aec_chunk_size, total_samples, num_chunks);
+      ESP_LOGI(LOG_TAG, "AEC debug: total_ch_num=%d, sample_rate=%d", 
+               aec_handle->pcm_config.total_ch_num, aec_handle->pcm_config.sample_rate);
+      ESP_LOGI(LOG_TAG, "AEC debug: input format=%s, mic_num=%d, ref_num=%d",
+               "MR", aec_handle->pcm_config.mic_num, aec_handle->pcm_config.ref_num);
+      first_run = false;
+    }
+    
+    // Process all chunks
+    if (num_chunks != AEC_CHUNKS_TO_BATCH) {
+      ESP_LOGW(LOG_TAG, "Unexpected number of AEC chunks: %d (expected %d)", num_chunks, AEC_CHUNKS_TO_BATCH);
+    }
+    
+    // Process AEC in chunks
+    for (int chunk = 0; chunk < num_chunks; chunk++) {
+      // Prepare input: interleave mic data with zeros (reference channel)
+      int16_t *mic_ptr = (int16_t *)read_buffer + (chunk * aec_chunk_size);
+      
+      // Format: MRMRMR... (interleaved - mic and ref alternating)
+      for (int i = 0; i < aec_chunk_size; i++) {
+        aec_in_buffer[i * 2] = mic_ptr[i];      // Microphone channel
+        aec_in_buffer[i * 2 + 1] = 0;           // Reference channel (zeros for now)
+      }
+      
+      // Run AEC process for this chunk
+      start_time = esp_timer_get_time();
+      int ret = afe_aec_process(aec_handle, aec_in_buffer, aec_out_buffer);
+      end_time = esp_timer_get_time();
+      total_aec_time += (end_time - start_time);
+      
+      if (ret < 0) {
+        ESP_LOGE(LOG_TAG, "afe_aec_process failed: return value %d (0x%x) at chunk %d", 
+                 ret, ret, chunk);
+        // Fall back to bypass on error
+        memcpy(processed_buffer, read_buffer, PCM_BUFFER_SIZE);
+        return;
+      }
+      // Log success on first chunk of first run
+      if (first_run && chunk == 0) {
+        ESP_LOGI(LOG_TAG, "AEC process successful, returned %d bytes", ret);
+      }
+      
+      // Copy AEC output to processed buffer
+      memcpy(processed_buffer + (chunk * aec_chunk_size), aec_out_buffer, aec_chunk_size * sizeof(int16_t));
+    }
+    
+    // All samples have been processed through AEC
+  } else {
+    // AEC disabled, just copy input to processed buffer
+    memcpy(processed_buffer, read_buffer, PCM_BUFFER_SIZE);
+  }
+
+  // Process Opus frames - we have exactly COMMON_BUFFER_SAMPLES (1280) samples
+  // This encodes to exactly OPUS_FRAMES_TO_BATCH (4) frames
+  for (int frame = 0; frame < OPUS_FRAMES_TO_BATCH; frame++) {
+    // Measure opus_encode time
+    start_time = esp_timer_get_time();
+    auto encoded_size = opus_encode(opus_encoder, 
+                                    processed_buffer + (frame * OPUS_FRAME_SIZE),
+                                    OPUS_FRAME_SIZE,
+                                    encoder_output_buffer, OPUS_BUFFER_SIZE);
+    end_time = esp_timer_get_time();
+    total_encode_time += (end_time - start_time);
+    
+    // Measure peer_connection_send_audio time
+    start_time = esp_timer_get_time();
+    peer_connection_send_audio(peer_connection, encoder_output_buffer,
+                               encoded_size);
+    end_time = esp_timer_get_time();
+    total_send_time += (end_time - start_time);
+  }
+  
+  // Update cycle count and print statistics every 50 cycles
+  cycle_count++;
+  if (cycle_count % 50 == 0) {
+    uint64_t avg_read_time = total_read_time / 50;
+    uint64_t avg_aec_time = total_aec_time / 50;
+    uint64_t avg_encode_time = total_encode_time / 50;
+    uint64_t avg_send_time = total_send_time / 50;
+    
+    ESP_LOGI(LOG_TAG, "Audio timing stats (avg over 50 cycles): read=%llu us, aec=%llu us, encode=%llu us, send=%llu us",
+             avg_read_time, avg_aec_time, avg_encode_time, avg_send_time);
+    
+    // Reset counters for next 50 cycles
+    total_read_time = 0;
+    total_aec_time = 0;
+    total_encode_time = 0;
+    total_send_time = 0;
+  }
 }
diff --git a/esp32-s3-box-3/src/webrtc.cpp b/esp32-s3-box-3/src/webrtc.cpp
index 937e166..8172098 100644
--- a/esp32-s3-box-3/src/webrtc.cpp
+++ b/esp32-s3-box-3/src/webrtc.cpp
@@ -23,7 +23,7 @@ void pipecat_send_audio_task(void *user_data) {
 
   while (1) {
     pipecat_send_audio(peer_connection);
-    vTaskDelay(pdMS_TO_TICKS(TICK_INTERVAL));
+    vTaskDelay(pdMS_TO_TICKS(1));
   }
 }
 #endif
@@ -61,10 +61,11 @@ static void pipecat_onconnectionstatechange_task(PeerConnectionState state,
 
     ESP_LOGI(LOG_TAG, "Creating send audio task");
     // Todo: move this to a separate init block
-    xTaskCreatePinnedToCore(pipecat_send_audio_task, "audio_publisher", 24576, NULL, 7, &xSendAudioTaskHandle, 0);
+    // Why is this stack size so large? Opus encoder?
+    xTaskCreatePinnedToCore(pipecat_send_audio_task, "audio_publisher", 24576, NULL, 7, &xSendAudioTaskHandle, 1);
     heap_caps_print_heap_info(MALLOC_CAP_INTERNAL);
 
-    // pipecat_init_rtvi(peer_connection, &pipecat_rtvi_callbacks);
+    pipecat_init_rtvi(peer_connection, &pipecat_rtvi_callbacks);
 #endif
   }
 }
@@ -139,5 +140,5 @@ void peer_connection_task(void* arg) {
 void pipecat_webrtc_run_task() {
   ESP_LOGI(LOG_TAG, "peer_connection_task starting ...");
 
-  xTaskCreatePinnedToCore(peer_connection_task, "peer_connection", 16384, NULL, 5, &xPcTaskHandle, 1);
+  xTaskCreatePinnedToCore(peer_connection_task, "peer_connection", 16384, NULL, 5, &xPcTaskHandle, 0);
 }

From f0b36941cad4e2c66e799cfa741aca1ea6b38b1a Mon Sep 17 00:00:00 2001
From: Kwindla Hultman Kramer <kwindla@gmail.com>
Date: Sun, 6 Jul 2025 16:40:41 -0700
Subject: [PATCH 05/14] working aec

---
 esp32-s3-box-3/src/media.cpp | 92 +++++++++++++++++++++++++++++++++---
 1 file changed, 85 insertions(+), 7 deletions(-)

diff --git a/esp32-s3-box-3/src/media.cpp b/esp32-s3-box-3/src/media.cpp
index f628845..27332a8 100644
--- a/esp32-s3-box-3/src/media.cpp
+++ b/esp32-s3-box-3/src/media.cpp
@@ -4,6 +4,7 @@
 #include <atomic>
 #include <cstring>
 #include <esp_timer.h>
+#include <freertos/ringbuf.h>
 
 #include "esp_log.h"
 #include "main.h"
@@ -59,6 +60,10 @@ void pipecat_init_audio_capture() {
 opus_int16 *decoder_buffer = NULL;
 OpusDecoder *opus_decoder = NULL;
 
+// Ring buffer for speaker reference data (declare before use)
+static RingbufHandle_t speaker_ringbuf = NULL;
+static const size_t SPEAKER_RINGBUF_SIZE = 8192;  // ~3.2 read cycles at 1280 samples/read
+
 void pipecat_init_audio_decoder() {
   int decoder_error = 0;
   opus_decoder = opus_decoder_create(SAMPLE_RATE, 1, &decoder_error);
@@ -77,11 +82,35 @@ void pipecat_audio_decode(uint8_t *data, size_t size) {
 
   if (decoded_size > 0) {
     set_is_playing(decoder_buffer, decoded_size);
+    
+    // Send to speaker
     if ((ret = esp_codec_dev_write(spk_codec_dev, decoder_buffer,
                                    decoded_size * sizeof(uint16_t))) !=
         ESP_OK) {
       ESP_LOGE(LOG_TAG, "esp_codec_dev_write failed: %s", esp_err_to_name(ret));
     }
+    
+    // Also send to ring buffer for AEC reference
+    if (speaker_ringbuf != NULL) {
+      BaseType_t ret = xRingbufferSend(speaker_ringbuf, decoder_buffer, 
+                                       decoded_size * sizeof(int16_t), 0);
+      if (ret != pdTRUE) {
+        // Ring buffer full, this is expected occasionally
+        static int overflow_count = 0;
+        if (++overflow_count % 100 == 0) {
+          ESP_LOGW(LOG_TAG, "Speaker ring buffer overflow count: %d", overflow_count);
+        }
+      } else {
+        // Log successful writes occasionally
+        static int write_count = 0;
+        if (++write_count % 50 == 0) {
+          ESP_LOGI(LOG_TAG, "Speaker ring buffer writes: %d, size: %d bytes", 
+                   write_count, decoded_size * sizeof(int16_t));
+        }
+      }
+    } else {
+      ESP_LOGW(LOG_TAG, "Speaker ring buffer is NULL!");
+    }
   }
 }
 
@@ -146,6 +175,14 @@ void pipecat_init_aec() {
   // Try a simpler approach - create AEC without the task configuration struct
   ESP_LOGI(LOG_TAG, "Initializing AEC with simple configuration");
   
+  // Create ring buffer for speaker reference data
+  speaker_ringbuf = xRingbufferCreate(SPEAKER_RINGBUF_SIZE, RINGBUF_TYPE_BYTEBUF);
+  if (speaker_ringbuf == NULL) {
+    ESP_LOGE(LOG_TAG, "Failed to create speaker ring buffer");
+    return;
+  }
+  ESP_LOGI(LOG_TAG, "Speaker ring buffer created, size=%d", SPEAKER_RINGBUF_SIZE);
+  
   // Create AEC with basic parameters (filter_length=4 recommended for ESP32S3)
   aec_handle = afe_aec_create("MR", 4, AFE_TYPE_VC, AFE_MODE_LOW_COST);
   if (aec_handle == NULL) {
@@ -203,9 +240,10 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
   end_time = esp_timer_get_time();
   total_read_time += (end_time - start_time);
 
-  if (is_playing) {
-    memset(read_buffer, 0, PCM_BUFFER_SIZE);
-  }
+  // Don't zero out microphone input - let AEC handle echo cancellation
+  // if (is_playing) {
+  //   memset(read_buffer, 0, PCM_BUFFER_SIZE);
+  // }
 
   // AEC processing (skip if disabled)
   if (aec_handle != NULL) {
@@ -236,10 +274,50 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
       // Prepare input: interleave mic data with zeros (reference channel)
       int16_t *mic_ptr = (int16_t *)read_buffer + (chunk * aec_chunk_size);
       
-      // Format: MRMRMR... (interleaved - mic and ref alternating)
-      for (int i = 0; i < aec_chunk_size; i++) {
-        aec_in_buffer[i * 2] = mic_ptr[i];      // Microphone channel
-        aec_in_buffer[i * 2 + 1] = 0;           // Reference channel (zeros for now)
+      // Get reference data from speaker ring buffer
+      size_t ref_bytes_needed = aec_chunk_size * sizeof(int16_t);
+      size_t ref_bytes_available = 0;
+      int16_t *ref_data = NULL;
+      
+      if (speaker_ringbuf != NULL) {
+        ref_data = (int16_t *)xRingbufferReceive(speaker_ringbuf, &ref_bytes_available, 0);
+      } else {
+        static bool logged_null = false;
+        if (!logged_null) {
+          ESP_LOGW(LOG_TAG, "Speaker ring buffer is NULL during read!");
+          logged_null = true;
+        }
+      }
+      
+      if (ref_data != NULL && ref_bytes_available >= ref_bytes_needed) {
+        // Use actual speaker reference data
+        for (int i = 0; i < aec_chunk_size; i++) {
+          aec_in_buffer[i * 2] = mic_ptr[i];      // Microphone channel
+          aec_in_buffer[i * 2 + 1] = ref_data[i]; // Reference channel (actual speaker data)
+        }
+        // Return item to ring buffer
+        vRingbufferReturnItem(speaker_ringbuf, ref_data);
+        
+        // Log successful reads occasionally
+        static int ref_success_count = 0;
+        if (++ref_success_count % 50 == 0) {
+          ESP_LOGI(LOG_TAG, "Using speaker reference data, count: %d, bytes: %d", 
+                   ref_success_count, ref_bytes_available);
+        }
+      } else {
+        // No reference data available, use zeros
+        for (int i = 0; i < aec_chunk_size; i++) {
+          aec_in_buffer[i * 2] = mic_ptr[i];      // Microphone channel  
+          aec_in_buffer[i * 2 + 1] = 0;           // Reference channel (zeros as fallback)
+        }
+        if (ref_data != NULL) {
+          vRingbufferReturnItem(speaker_ringbuf, ref_data);
+        }
+        // Log when we don't have reference data (only occasionally to avoid spam)
+        static int no_ref_count = 0;
+        if (++no_ref_count % 100 == 0) {
+          ESP_LOGW(LOG_TAG, "No speaker reference data available count: %d", no_ref_count);
+        }
       }
       
       // Run AEC process for this chunk

From 3dad580ae9f5775c84495cd0b7a332a9d1cd8fa1 Mon Sep 17 00:00:00 2001
From: Kwindla Hultman Kramer <kwindla@gmail.com>
Date: Sun, 6 Jul 2025 17:15:31 -0700
Subject: [PATCH 06/14] circular buffer

---
 esp32-s3-box-3/src/media.cpp | 200 ++++++++++++++++++++++++++++-------
 1 file changed, 162 insertions(+), 38 deletions(-)

diff --git a/esp32-s3-box-3/src/media.cpp b/esp32-s3-box-3/src/media.cpp
index 27332a8..6fb8545 100644
--- a/esp32-s3-box-3/src/media.cpp
+++ b/esp32-s3-box-3/src/media.cpp
@@ -25,6 +25,9 @@
 #define OPUS_ENCODER_BITRATE 30000
 #define OPUS_ENCODER_COMPLEXITY 0
 
+// Forward declare for use in set_is_playing
+static bool ref_buffer_primed = false;
+
 
 
 std::atomic<bool> is_playing = false;
@@ -35,7 +38,14 @@ void set_is_playing(int16_t *in_buf, size_t in_samples) {
       any_set = true;
     }
   }
+  bool was_playing = is_playing.load();
   is_playing = any_set;
+  
+  // Reset buffer priming when playback stops
+  if (was_playing && !any_set) {
+    ref_buffer_primed = false;
+    ESP_LOGD(LOG_TAG, "Playback stopped, reset reference buffer priming");
+  }
 }
 
 esp_codec_dev_handle_t mic_codec_dev = NULL;
@@ -60,9 +70,15 @@ void pipecat_init_audio_capture() {
 opus_int16 *decoder_buffer = NULL;
 OpusDecoder *opus_decoder = NULL;
 
-// Ring buffer for speaker reference data (declare before use)
-static RingbufHandle_t speaker_ringbuf = NULL;
-static const size_t SPEAKER_RINGBUF_SIZE = 8192;  // ~3.2 read cycles at 1280 samples/read
+// Circular buffer for speaker reference data (not using FreeRTOS ring buffer)
+static int16_t* speaker_circular_buffer = NULL;
+static const size_t SPEAKER_BUFFER_SAMPLES = 4096;  // 4096 samples = 8192 bytes
+static const size_t MIN_REF_BUFFER_SAMPLES = 2560;  // 2 read cycles worth to prevent underruns
+static volatile size_t speaker_write_idx = 0;
+static volatile size_t speaker_read_idx = 0;
+static volatile size_t speaker_samples_available = 0;
+static portMUX_TYPE speaker_buffer_mutex = portMUX_INITIALIZER_UNLOCKED;
+// ref_buffer_primed already declared above for use in set_is_playing
 
 void pipecat_init_audio_decoder() {
   int decoder_error = 0;
@@ -76,6 +92,18 @@ void pipecat_init_audio_decoder() {
 }
 
 void pipecat_audio_decode(uint8_t *data, size_t size) {
+  // Log packet arrival timing
+  static uint64_t last_decode_time = 0;
+  uint64_t now = esp_timer_get_time();
+  if (last_decode_time > 0) {
+    uint64_t delta_ms = (now - last_decode_time) / 1000;
+    static int packet_count = 0;
+    if (++packet_count % 10 == 0) {
+      ESP_LOGI(LOG_TAG, "Audio packet %d: size=%d, delta=%llu ms", packet_count, size, delta_ms);
+    }
+  }
+  last_decode_time = now;
+  
   esp_err_t ret;
   int decoded_size =
       opus_decode(opus_decoder, data, size, decoder_buffer, PCM_BUFFER_SIZE, 0);
@@ -90,26 +118,55 @@ void pipecat_audio_decode(uint8_t *data, size_t size) {
       ESP_LOGE(LOG_TAG, "esp_codec_dev_write failed: %s", esp_err_to_name(ret));
     }
     
-    // Also send to ring buffer for AEC reference
-    if (speaker_ringbuf != NULL) {
-      BaseType_t ret = xRingbufferSend(speaker_ringbuf, decoder_buffer, 
-                                       decoded_size * sizeof(int16_t), 0);
-      if (ret != pdTRUE) {
-        // Ring buffer full, this is expected occasionally
-        static int overflow_count = 0;
-        if (++overflow_count % 100 == 0) {
-          ESP_LOGW(LOG_TAG, "Speaker ring buffer overflow count: %d", overflow_count);
+    // Also send to circular buffer for AEC reference
+    if (speaker_circular_buffer != NULL) {
+      // Variables for logging outside critical section
+      bool log_write = false;
+      bool log_overflow = false;
+      int write_count_log = 0;
+      int overflow_count_log = 0;
+      size_t samples_available_log = 0;
+      
+      portENTER_CRITICAL(&speaker_buffer_mutex);
+      
+      // Check if we have space
+      size_t space_available = SPEAKER_BUFFER_SAMPLES - speaker_samples_available;
+      if (space_available >= decoded_size) {
+        // Copy samples to circular buffer
+        for (int i = 0; i < decoded_size; i++) {
+          speaker_circular_buffer[speaker_write_idx] = decoder_buffer[i];
+          speaker_write_idx = (speaker_write_idx + 1) % SPEAKER_BUFFER_SAMPLES;
         }
-      } else {
-        // Log successful writes occasionally
+        speaker_samples_available += decoded_size;
+        
+        // Prepare logging info
         static int write_count = 0;
         if (++write_count % 50 == 0) {
-          ESP_LOGI(LOG_TAG, "Speaker ring buffer writes: %d, size: %d bytes", 
-                   write_count, decoded_size * sizeof(int16_t));
+          log_write = true;
+          write_count_log = write_count;
+          samples_available_log = speaker_samples_available;
+        }
+      } else {
+        // Buffer overflow
+        static int overflow_count = 0;
+        if (++overflow_count % 100 == 0) {
+          log_overflow = true;
+          overflow_count_log = overflow_count;
         }
       }
+      
+      portEXIT_CRITICAL(&speaker_buffer_mutex);
+      
+      // Do logging outside critical section
+      if (log_write) {
+        ESP_LOGI(LOG_TAG, "Speaker buffer writes: %d, samples: %d, available: %d", 
+                 write_count_log, decoded_size, samples_available_log);
+      }
+      if (log_overflow) {
+        ESP_LOGW(LOG_TAG, "Speaker buffer overflow count: %d", overflow_count_log);
+      }
     } else {
-      ESP_LOGW(LOG_TAG, "Speaker ring buffer is NULL!");
+      ESP_LOGW(LOG_TAG, "Speaker circular buffer is NULL!");
     }
   }
 }
@@ -175,13 +232,13 @@ void pipecat_init_aec() {
   // Try a simpler approach - create AEC without the task configuration struct
   ESP_LOGI(LOG_TAG, "Initializing AEC with simple configuration");
   
-  // Create ring buffer for speaker reference data
-  speaker_ringbuf = xRingbufferCreate(SPEAKER_RINGBUF_SIZE, RINGBUF_TYPE_BYTEBUF);
-  if (speaker_ringbuf == NULL) {
-    ESP_LOGE(LOG_TAG, "Failed to create speaker ring buffer");
+  // Create circular buffer for speaker reference data
+  speaker_circular_buffer = (int16_t *)heap_caps_malloc(SPEAKER_BUFFER_SAMPLES * sizeof(int16_t), MALLOC_CAP_SPIRAM);
+  if (speaker_circular_buffer == NULL) {
+    ESP_LOGE(LOG_TAG, "Failed to allocate speaker circular buffer");
     return;
   }
-  ESP_LOGI(LOG_TAG, "Speaker ring buffer created, size=%d", SPEAKER_RINGBUF_SIZE);
+  ESP_LOGI(LOG_TAG, "Speaker circular buffer created, size=%d samples", SPEAKER_BUFFER_SAMPLES);
   
   // Create AEC with basic parameters (filter_length=4 recommended for ESP32S3)
   aec_handle = afe_aec_create("MR", 4, AFE_TYPE_VC, AFE_MODE_LOW_COST);
@@ -274,35 +331,105 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
       // Prepare input: interleave mic data with zeros (reference channel)
       int16_t *mic_ptr = (int16_t *)read_buffer + (chunk * aec_chunk_size);
       
-      // Get reference data from speaker ring buffer
-      size_t ref_bytes_needed = aec_chunk_size * sizeof(int16_t);
-      size_t ref_bytes_available = 0;
-      int16_t *ref_data = NULL;
+      // Get reference data from speaker circular buffer
+      bool have_ref_data = false;
+      int16_t ref_chunk[256];  // Temporary buffer for reference data
       
-      if (speaker_ringbuf != NULL) {
-        ref_data = (int16_t *)xRingbufferReceive(speaker_ringbuf, &ref_bytes_available, 0);
+      if (speaker_circular_buffer != NULL) {
+        // Variables for logging outside critical section
+        bool log_buffer_state = false;
+        bool log_primed = false;
+        bool log_underrun = false;
+        size_t samples_for_log = 0;
+        int underrun_type = 0;  // 0=none, 1=remaining chunks, 2=current chunk
+        size_t needed_for_log = 0;
+        
+        portENTER_CRITICAL(&speaker_buffer_mutex);
+        
+        // Debug log buffer state occasionally
+        static int buffer_check_count = 0;
+        if (++buffer_check_count % 100 == 0) {
+          log_buffer_state = true;
+          samples_for_log = speaker_samples_available;
+        }
+        
+        // Only start using reference data if we have enough buffered
+        if (!ref_buffer_primed && speaker_samples_available >= MIN_REF_BUFFER_SAMPLES) {
+          ref_buffer_primed = true;
+          log_primed = true;
+          samples_for_log = speaker_samples_available;
+        }
+        
+        // Only try to get reference data if buffer is primed
+        if (ref_buffer_primed) {
+          // Check if we have enough samples for this chunk
+          if (speaker_samples_available >= aec_chunk_size) {
+            // Read samples from circular buffer
+            for (int i = 0; i < aec_chunk_size; i++) {
+              ref_chunk[i] = speaker_circular_buffer[speaker_read_idx];
+              speaker_read_idx = (speaker_read_idx + 1) % SPEAKER_BUFFER_SAMPLES;
+            }
+            speaker_samples_available -= aec_chunk_size;
+            have_ref_data = true;
+            
+            // Check if we're running too low for remaining chunks
+            int remaining_chunks = num_chunks - chunk - 1;
+            size_t remaining_samples_needed = remaining_chunks * aec_chunk_size;
+            if (speaker_samples_available < remaining_samples_needed) {
+              ref_buffer_primed = false;
+              log_underrun = true;
+              underrun_type = 1;
+              samples_for_log = speaker_samples_available;
+              needed_for_log = remaining_samples_needed;
+            }
+          } else {
+            // Not enough samples, unprime
+            ref_buffer_primed = false;
+            log_underrun = true;
+            underrun_type = 2;
+            samples_for_log = speaker_samples_available;
+            needed_for_log = aec_chunk_size;
+          }
+        }
+        
+        portEXIT_CRITICAL(&speaker_buffer_mutex);
+        
+        // Do logging outside critical section
+        if (log_buffer_state) {
+          ESP_LOGI(LOG_TAG, "Circular buffer state: %d samples available, chunk %d/%d", 
+                   samples_for_log, chunk, num_chunks);
+        }
+        if (log_primed) {
+          ESP_LOGI(LOG_TAG, "Reference buffer primed with %d samples", samples_for_log);
+        }
+        if (log_underrun) {
+          if (underrun_type == 1) {
+            ESP_LOGW(LOG_TAG, "Reference buffer underrun (chunk %d/%d, need %d samples for remaining, have %d)", 
+                     chunk, num_chunks, needed_for_log, samples_for_log);
+          } else {
+            ESP_LOGW(LOG_TAG, "Reference buffer underrun (chunk %d/%d, need %d samples for current, have %d)", 
+                     chunk, num_chunks, needed_for_log, samples_for_log);
+          }
+        }
       } else {
         static bool logged_null = false;
         if (!logged_null) {
-          ESP_LOGW(LOG_TAG, "Speaker ring buffer is NULL during read!");
+          ESP_LOGW(LOG_TAG, "Speaker circular buffer is NULL during read!");
           logged_null = true;
         }
       }
       
-      if (ref_data != NULL && ref_bytes_available >= ref_bytes_needed) {
+      if (have_ref_data) {
         // Use actual speaker reference data
         for (int i = 0; i < aec_chunk_size; i++) {
           aec_in_buffer[i * 2] = mic_ptr[i];      // Microphone channel
-          aec_in_buffer[i * 2 + 1] = ref_data[i]; // Reference channel (actual speaker data)
+          aec_in_buffer[i * 2 + 1] = ref_chunk[i]; // Reference channel (actual speaker data)
         }
-        // Return item to ring buffer
-        vRingbufferReturnItem(speaker_ringbuf, ref_data);
         
         // Log successful reads occasionally
         static int ref_success_count = 0;
         if (++ref_success_count % 50 == 0) {
-          ESP_LOGI(LOG_TAG, "Using speaker reference data, count: %d, bytes: %d", 
-                   ref_success_count, ref_bytes_available);
+          ESP_LOGI(LOG_TAG, "Using speaker reference data, count: %d", ref_success_count);
         }
       } else {
         // No reference data available, use zeros
@@ -310,9 +437,6 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
           aec_in_buffer[i * 2] = mic_ptr[i];      // Microphone channel  
           aec_in_buffer[i * 2 + 1] = 0;           // Reference channel (zeros as fallback)
         }
-        if (ref_data != NULL) {
-          vRingbufferReturnItem(speaker_ringbuf, ref_data);
-        }
         // Log when we don't have reference data (only occasionally to avoid spam)
         static int no_ref_count = 0;
         if (++no_ref_count % 100 == 0) {

From 17537741485e2723d4027e8a233c832af7251c08 Mon Sep 17 00:00:00 2001
From: Kwindla Hultman Kramer <kwindla@gmail.com>
Date: Sun, 6 Jul 2025 17:25:04 -0700
Subject: [PATCH 07/14] elapsed time timing for audio producer task

---
 esp32-s3-box-3/src/webrtc.cpp | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/esp32-s3-box-3/src/webrtc.cpp b/esp32-s3-box-3/src/webrtc.cpp
index 8172098..2fb1c78 100644
--- a/esp32-s3-box-3/src/webrtc.cpp
+++ b/esp32-s3-box-3/src/webrtc.cpp
@@ -5,6 +5,7 @@
 
 #include <esp_event.h>
 #include <esp_log.h>
+#include <esp_timer.h>
 #include <string.h>
 
 #include "main.h"
@@ -22,8 +23,19 @@ void pipecat_send_audio_task(void *user_data) {
   pipecat_init_audio_encoder();
 
   while (1) {
+    uint64_t start_time = esp_timer_get_time();
     pipecat_send_audio(peer_connection);
-    vTaskDelay(pdMS_TO_TICKS(1));
+    uint64_t end_time = esp_timer_get_time();
+    uint64_t elapsed_ms = (end_time - start_time) / 1000;
+    
+    // We process 1280 samples at 16kHz = 80ms of audio
+    // Delay for the remainder of the 80ms period
+    if (elapsed_ms < 80) {
+      vTaskDelay(pdMS_TO_TICKS(80 - elapsed_ms));
+    } else {
+      // If processing took longer than 80ms, yield to other tasks
+      vTaskDelay(pdMS_TO_TICKS(1));
+    }
   }
 }
 #endif

From b35fba0d30d9eba073481502b0454fdbf04cc4a2 Mon Sep 17 00:00:00 2001
From: Kwindla Hultman Kramer <kwindla@gmail.com>
Date: Sun, 6 Jul 2025 17:51:53 -0700
Subject: [PATCH 08/14] introduce reference sample delay

---
 esp32-s3-box-3/src/media.cpp | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/esp32-s3-box-3/src/media.cpp b/esp32-s3-box-3/src/media.cpp
index 6fb8545..5bc66d3 100644
--- a/esp32-s3-box-3/src/media.cpp
+++ b/esp32-s3-box-3/src/media.cpp
@@ -78,6 +78,11 @@ static volatile size_t speaker_write_idx = 0;
 static volatile size_t speaker_read_idx = 0;
 static volatile size_t speaker_samples_available = 0;
 static portMUX_TYPE speaker_buffer_mutex = portMUX_INITIALIZER_UNLOCKED;
+
+// AEC reference delay compensation (in samples)
+// This accounts for acoustic path + hardware delays
+// Typical value: 320 samples = 20ms at 16kHz
+static const size_t AEC_REF_DELAY_SAMPLES = 320;
 // ref_buffer_primed already declared above for use in set_is_playing
 
 void pipecat_init_audio_decoder() {
@@ -354,7 +359,9 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
         }
         
         // Only start using reference data if we have enough buffered
-        if (!ref_buffer_primed && speaker_samples_available >= MIN_REF_BUFFER_SAMPLES) {
+        // We need MIN_REF_BUFFER_SAMPLES + delay compensation
+        size_t required_samples = MIN_REF_BUFFER_SAMPLES + AEC_REF_DELAY_SAMPLES;
+        if (!ref_buffer_primed && speaker_samples_available >= required_samples) {
           ref_buffer_primed = true;
           log_primed = true;
           samples_for_log = speaker_samples_available;
@@ -362,9 +369,11 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
         
         // Only try to get reference data if buffer is primed
         if (ref_buffer_primed) {
-          // Check if we have enough samples for this chunk
-          if (speaker_samples_available >= aec_chunk_size) {
+          // Check if we have enough samples for this chunk plus delay
+          if (speaker_samples_available >= aec_chunk_size + AEC_REF_DELAY_SAMPLES) {
             // Read samples from circular buffer
+            // The delay is maintained by keeping a fixed distance between write and read pointers
+            // speaker_samples_available should always be >= AEC_REF_DELAY_SAMPLES + needed samples
             for (int i = 0; i < aec_chunk_size; i++) {
               ref_chunk[i] = speaker_circular_buffer[speaker_read_idx];
               speaker_read_idx = (speaker_read_idx + 1) % SPEAKER_BUFFER_SAMPLES;
@@ -373,8 +382,9 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
             have_ref_data = true;
             
             // Check if we're running too low for remaining chunks
+            // We need to maintain AEC_REF_DELAY_SAMPLES delay plus samples for remaining chunks
             int remaining_chunks = num_chunks - chunk - 1;
-            size_t remaining_samples_needed = remaining_chunks * aec_chunk_size;
+            size_t remaining_samples_needed = remaining_chunks * aec_chunk_size + AEC_REF_DELAY_SAMPLES;
             if (speaker_samples_available < remaining_samples_needed) {
               ref_buffer_primed = false;
               log_underrun = true;

From 7471ee48ab2122f7553d942d019add7acee6d39b Mon Sep 17 00:00:00 2001
From: Kwindla Hultman Kramer <kwindla@gmail.com>
Date: Mon, 7 Jul 2025 12:31:39 -0700
Subject: [PATCH 09/14] seemingly stable, but not amazing, aec

---
 esp32-s3-box-3/src/media.cpp  | 110 +++++++++++++++++++++-------------
 esp32-s3-box-3/src/webrtc.cpp |  13 +---
 2 files changed, 70 insertions(+), 53 deletions(-)

diff --git a/esp32-s3-box-3/src/media.cpp b/esp32-s3-box-3/src/media.cpp
index 5bc66d3..81bcab6 100644
--- a/esp32-s3-box-3/src/media.cpp
+++ b/esp32-s3-box-3/src/media.cpp
@@ -72,17 +72,15 @@ OpusDecoder *opus_decoder = NULL;
 
 // Circular buffer for speaker reference data (not using FreeRTOS ring buffer)
 static int16_t* speaker_circular_buffer = NULL;
-static const size_t SPEAKER_BUFFER_SAMPLES = 4096;  // 4096 samples = 8192 bytes
-static const size_t MIN_REF_BUFFER_SAMPLES = 2560;  // 2 read cycles worth to prevent underruns
+static const size_t SPEAKER_BUFFER_SAMPLES = 4096;  // 4096 samples = 256ms @ 16kHz
+static const size_t MIN_REF_BUFFER_SAMPLES = 2560;  // 2 read cycles worth (160ms)
+static const size_t MIN_REF_BUFFER_RESUME = 1920;   // 1.5 cycles for recovery (120ms)
 static volatile size_t speaker_write_idx = 0;
 static volatile size_t speaker_read_idx = 0;
 static volatile size_t speaker_samples_available = 0;
 static portMUX_TYPE speaker_buffer_mutex = portMUX_INITIALIZER_UNLOCKED;
 
-// AEC reference delay compensation (in samples)
-// This accounts for acoustic path + hardware delays
-// Typical value: 320 samples = 20ms at 16kHz
-static const size_t AEC_REF_DELAY_SAMPLES = 320;
+// AEC reference delay compensation removed - assuming AEC implementation handles delay internally
 // ref_buffer_primed already declared above for use in set_is_playing
 
 void pipecat_init_audio_decoder() {
@@ -215,27 +213,13 @@ void pipecat_init_audio_encoder() {
   processed_buffer = (int16_t *)heap_caps_malloc(PCM_BUFFER_SIZE, MALLOC_CAP_DEFAULT);
 }
 
-
-typedef struct {
-  int        task_stack;     /*!< Task stack size */
-  int        task_prio;      /*!< Task peroid */
-  int        task_core;      /*!< The core that task to be created */
-  bool       debug_aec;      /*!< debug AEC input data */
-  bool       stack_in_ext;   /*!< Try to allocate stack in external memory */
-  afe_type_t type;           /*!< The type of afe， AFE_TYPE_SR , AFE_TYPE_VC , AFE_TYPE_VC_8K */
-  afe_mode_t mode;           /*!< The mode of afe， AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF */
-  int        filter_length;  /*!< The filter length of aec */
-  char      *input_format;   /*!< The input format is as follows: For example, 'MR', 'RMNM'.
-                                  'M' means data from microphone, 'R' means data from reference data, 'N' means no data.. */
-} aec_stream_cfg_t;
-
 static int16_t *aec_in_buffer = NULL;
 static int16_t *aec_out_buffer = NULL;
 static afe_aec_handle_t *aec_handle = NULL;
 
 void pipecat_init_aec() {
   // Try a simpler approach - create AEC without the task configuration struct
-  ESP_LOGI(LOG_TAG, "Initializing AEC with simple configuration");
+  ESP_LOGI(LOG_TAG, "Initializing AEC");
   
   // Create circular buffer for speaker reference data
   speaker_circular_buffer = (int16_t *)heap_caps_malloc(SPEAKER_BUFFER_SAMPLES * sizeof(int16_t), MALLOC_CAP_SPIRAM);
@@ -308,7 +292,7 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
   // }
 
   // AEC processing (skip if disabled)
-  if (aec_handle != NULL) {
+  if (aec_handle != NULL && aec_enabled) {
     // Get AEC chunk size (should be 256 samples for 16kHz)
     int aec_chunk_size = afe_aec_get_chunksize(aec_handle);
     int total_samples = PCM_BUFFER_SIZE / sizeof(uint16_t);  // 640 samples
@@ -358,22 +342,47 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
           samples_for_log = speaker_samples_available;
         }
         
+        // Warn if we're getting close to the unprime threshold
+        static int low_buffer_count = 0;
+        if (speaker_samples_available < 2200 && speaker_samples_available >= 1920) {
+          if (++low_buffer_count % 10 == 0) {
+            ESP_LOGW(LOG_TAG, "Buffer running low: %d samples (unprime at 1920)", speaker_samples_available);
+          }
+        } else {
+          low_buffer_count = 0;
+        }
+        
         // Only start using reference data if we have enough buffered
-        // We need MIN_REF_BUFFER_SAMPLES + delay compensation
-        size_t required_samples = MIN_REF_BUFFER_SAMPLES + AEC_REF_DELAY_SAMPLES;
-        if (!ref_buffer_primed && speaker_samples_available >= required_samples) {
-          ref_buffer_primed = true;
-          log_primed = true;
-          samples_for_log = speaker_samples_available;
+        // Use hysteresis to prevent oscillation
+        if (!ref_buffer_primed) {
+          size_t required_samples = in_recovery_mode ? MIN_REF_BUFFER_RESUME : MIN_REF_BUFFER_SAMPLES;
+          if (speaker_samples_available >= required_samples) {
+            ref_buffer_primed = true;
+            log_primed = true;
+            samples_for_log = speaker_samples_available;
+          }
+        }
+        
+        // Recovery check: if we've been unprimed for too long, reduce requirements
+        static int unprime_count = 0;
+        static bool in_recovery_mode = false;
+        
+        if (!ref_buffer_primed) {
+          unprime_count++;
+          // After 25 attempts (~2 seconds), enter recovery mode
+          if (unprime_count > 25 && !in_recovery_mode) {
+            in_recovery_mode = true;
+            ESP_LOGW(LOG_TAG, "Entering recovery mode after %d attempts", unprime_count);
+          }
+        } else {
+          unprime_count = 0;
         }
         
         // Only try to get reference data if buffer is primed
         if (ref_buffer_primed) {
-          // Check if we have enough samples for this chunk plus delay
-          if (speaker_samples_available >= aec_chunk_size + AEC_REF_DELAY_SAMPLES) {
+          // Check if we have enough samples for this chunk
+          if (speaker_samples_available >= aec_chunk_size) {
             // Read samples from circular buffer
-            // The delay is maintained by keeping a fixed distance between write and read pointers
-            // speaker_samples_available should always be >= AEC_REF_DELAY_SAMPLES + needed samples
             for (int i = 0; i < aec_chunk_size; i++) {
               ref_chunk[i] = speaker_circular_buffer[speaker_read_idx];
               speaker_read_idx = (speaker_read_idx + 1) % SPEAKER_BUFFER_SAMPLES;
@@ -382,15 +391,33 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
             have_ref_data = true;
             
             // Check if we're running too low for remaining chunks
-            // We need to maintain AEC_REF_DELAY_SAMPLES delay plus samples for remaining chunks
             int remaining_chunks = num_chunks - chunk - 1;
-            size_t remaining_samples_needed = remaining_chunks * aec_chunk_size + AEC_REF_DELAY_SAMPLES;
-            if (speaker_samples_available < remaining_samples_needed) {
-              ref_buffer_primed = false;
-              log_underrun = true;
-              underrun_type = 1;
-              samples_for_log = speaker_samples_available;
-              needed_for_log = remaining_samples_needed;
+            size_t remaining_samples_needed = remaining_chunks * aec_chunk_size;
+            
+            // In recovery mode, be more lenient - only unprime if we can't process ANY more chunks
+            if (in_recovery_mode) {
+              if (speaker_samples_available < aec_chunk_size) {
+                ref_buffer_primed = false;
+                log_underrun = true;
+                underrun_type = 1;
+                samples_for_log = speaker_samples_available;
+                needed_for_log = aec_chunk_size;
+              } else if (speaker_samples_available >= MIN_REF_BUFFER_SAMPLES) {
+                // Exit recovery mode once buffer is fully healthy
+                in_recovery_mode = false;
+                ESP_LOGI(LOG_TAG, "Exiting recovery mode, buffer healthy with %d samples", speaker_samples_available);
+              }
+            } else {
+              // Normal mode - only unprime if we drop below a lower threshold
+              // This provides hysteresis to prevent oscillation
+              if (speaker_samples_available < 1920) {  // 1.5 cycles = 120ms
+                ref_buffer_primed = false;
+                log_underrun = true;
+                underrun_type = 1;
+                samples_for_log = speaker_samples_available;
+                needed_for_log = remaining_samples_needed;
+                ESP_LOGW(LOG_TAG, "Unpriming: dropped below 1920 samples (have %d)", speaker_samples_available);
+              }
             }
           } else {
             // Not enough samples, unprime
@@ -439,7 +466,8 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
         // Log successful reads occasionally
         static int ref_success_count = 0;
         if (++ref_success_count % 50 == 0) {
-          ESP_LOGI(LOG_TAG, "Using speaker reference data, count: %d", ref_success_count);
+          ESP_LOGI(LOG_TAG, "Using speaker reference data, count: %d, buffer: %d samples", 
+                   ref_success_count, speaker_samples_available);
         }
       } else {
         // No reference data available, use zeros
diff --git a/esp32-s3-box-3/src/webrtc.cpp b/esp32-s3-box-3/src/webrtc.cpp
index 2fb1c78..d26d36a 100644
--- a/esp32-s3-box-3/src/webrtc.cpp
+++ b/esp32-s3-box-3/src/webrtc.cpp
@@ -23,19 +23,8 @@ void pipecat_send_audio_task(void *user_data) {
   pipecat_init_audio_encoder();
 
   while (1) {
-    uint64_t start_time = esp_timer_get_time();
     pipecat_send_audio(peer_connection);
-    uint64_t end_time = esp_timer_get_time();
-    uint64_t elapsed_ms = (end_time - start_time) / 1000;
-    
-    // We process 1280 samples at 16kHz = 80ms of audio
-    // Delay for the remainder of the 80ms period
-    if (elapsed_ms < 80) {
-      vTaskDelay(pdMS_TO_TICKS(80 - elapsed_ms));
-    } else {
-      // If processing took longer than 80ms, yield to other tasks
-      vTaskDelay(pdMS_TO_TICKS(1));
-    }
+    vTaskDelay(pdMS_TO_TICKS(1));
   }
 }
 #endif

From 2dbd6a88990003da1a1c485199a0247d51410ad6 Mon Sep 17 00:00:00 2001
From: Kwindla Hultman Kramer <kwindla@gmail.com>
Date: Mon, 7 Jul 2025 18:49:26 -0700
Subject: [PATCH 10/14] temp commit

---
 esp32-s3-box-3/src/media.cpp | 495 ++++++++++++++---------------------
 1 file changed, 193 insertions(+), 302 deletions(-)

diff --git a/esp32-s3-box-3/src/media.cpp b/esp32-s3-box-3/src/media.cpp
index 81bcab6..6851678 100644
--- a/esp32-s3-box-3/src/media.cpp
+++ b/esp32-s3-box-3/src/media.cpp
@@ -25,28 +25,10 @@
 #define OPUS_ENCODER_BITRATE 30000
 #define OPUS_ENCODER_COMPLEXITY 0
 
-// Forward declare for use in set_is_playing
-static bool ref_buffer_primed = false;
-
-
+// AEC control
+static bool aec_enabled = true;  // Can be set to false to bypass AEC
 
 std::atomic<bool> is_playing = false;
-void set_is_playing(int16_t *in_buf, size_t in_samples) {
-  bool any_set = false;
-  for (size_t i = 0; i < in_samples; i++) {
-    if (in_buf[i] != -1 && in_buf[i] != 0 && in_buf[i] != 1) {
-      any_set = true;
-    }
-  }
-  bool was_playing = is_playing.load();
-  is_playing = any_set;
-  
-  // Reset buffer priming when playback stops
-  if (was_playing && !any_set) {
-    ref_buffer_primed = false;
-    ESP_LOGD(LOG_TAG, "Playback stopped, reset reference buffer priming");
-  }
-}
 
 esp_codec_dev_handle_t mic_codec_dev = NULL;
 esp_codec_dev_handle_t spk_codec_dev = NULL;
@@ -71,17 +53,33 @@ opus_int16 *decoder_buffer = NULL;
 OpusDecoder *opus_decoder = NULL;
 
 // Circular buffer for speaker reference data (not using FreeRTOS ring buffer)
-static int16_t* speaker_circular_buffer = NULL;
-static const size_t SPEAKER_BUFFER_SAMPLES = 4096;  // 4096 samples = 256ms @ 16kHz
-static const size_t MIN_REF_BUFFER_SAMPLES = 2560;  // 2 read cycles worth (160ms)
-static const size_t MIN_REF_BUFFER_RESUME = 1920;   // 1.5 cycles for recovery (120ms)
+// This is a sliding window buffer, not a queue - we read recent data for AEC
+static int16_t speaker_circular_buffer[640];   // Exactly 2 Opus frames (40ms @ 16kHz)
 static volatile size_t speaker_write_idx = 0;
 static volatile size_t speaker_read_idx = 0;
-static volatile size_t speaker_samples_available = 0;
+static volatile bool speaker_buffer_active = false; // True when we're receiving audio
 static portMUX_TYPE speaker_buffer_mutex = portMUX_INITIALIZER_UNLOCKED;
 
-// AEC reference delay compensation removed - assuming AEC implementation handles delay internally
-// ref_buffer_primed already declared above for use in set_is_playing
+// Function to update playback state and reset buffer when playback stops
+void set_is_playing(int16_t *in_buf, size_t in_samples) {
+  bool any_set = false;
+  for (size_t i = 0; i < in_samples; i++) {
+    if (in_buf[i] != -1 && in_buf[i] != 0 && in_buf[i] != 1) {
+      any_set = true;
+    }
+  }
+  bool was_playing = is_playing.load();
+  is_playing = any_set;
+  
+  // Reset buffer state when playback stops
+  if (was_playing && !any_set) {
+    portENTER_CRITICAL(&speaker_buffer_mutex);
+    speaker_buffer_active = false;
+    speaker_read_idx = speaker_write_idx; // Reset read to match write
+    portEXIT_CRITICAL(&speaker_buffer_mutex);
+    ESP_LOGD(LOG_TAG, "Playback stopped, reset speaker buffer");
+  }
+}
 
 void pipecat_init_audio_decoder() {
   int decoder_error = 0;
@@ -122,55 +120,30 @@ void pipecat_audio_decode(uint8_t *data, size_t size) {
     }
     
     // Also send to circular buffer for AEC reference
-    if (speaker_circular_buffer != NULL) {
-      // Variables for logging outside critical section
-      bool log_write = false;
-      bool log_overflow = false;
-      int write_count_log = 0;
-      int overflow_count_log = 0;
-      size_t samples_available_log = 0;
+    portENTER_CRITICAL(&speaker_buffer_mutex);
       
-      portENTER_CRITICAL(&speaker_buffer_mutex);
+      // Write to circular buffer, always keeping just the freshest 2 frames
+      for (int i = 0; i < decoded_size; i++) {
+        speaker_circular_buffer[speaker_write_idx] = decoder_buffer[i];
+        speaker_write_idx = (speaker_write_idx + 1) % 640;
+      }
+      speaker_buffer_active = true;
       
-      // Check if we have space
-      size_t space_available = SPEAKER_BUFFER_SAMPLES - speaker_samples_available;
-      if (space_available >= decoded_size) {
-        // Copy samples to circular buffer
-        for (int i = 0; i < decoded_size; i++) {
-          speaker_circular_buffer[speaker_write_idx] = decoder_buffer[i];
-          speaker_write_idx = (speaker_write_idx + 1) % SPEAKER_BUFFER_SAMPLES;
-        }
-        speaker_samples_available += decoded_size;
-        
-        // Prepare logging info
-        static int write_count = 0;
-        if (++write_count % 50 == 0) {
-          log_write = true;
-          write_count_log = write_count;
-          samples_available_log = speaker_samples_available;
-        }
-      } else {
-        // Buffer overflow
-        static int overflow_count = 0;
-        if (++overflow_count % 100 == 0) {
-          log_overflow = true;
-          overflow_count_log = overflow_count;
-        }
+      // If read pointer is too far behind, advance it to stay within 2 frames
+      size_t distance = (speaker_write_idx - speaker_read_idx + 640) % 640;
+      if (distance > 640 - 320) {
+        // Read pointer is about to be overrun, advance it
+        speaker_read_idx = (speaker_write_idx + 640 - 320) % 640;
       }
       
       portEXIT_CRITICAL(&speaker_buffer_mutex);
       
-      // Do logging outside critical section
-      if (log_write) {
-        ESP_LOGI(LOG_TAG, "Speaker buffer writes: %d, samples: %d, available: %d", 
-                 write_count_log, decoded_size, samples_available_log);
+      // Occasional logging
+      static int write_count = 0;
+      if (++write_count % 50 == 0) {
+        ESP_LOGI(LOG_TAG, "Speaker buffer: write_idx=%d, read_idx=%d", 
+                 speaker_write_idx, speaker_read_idx);
       }
-      if (log_overflow) {
-        ESP_LOGW(LOG_TAG, "Speaker buffer overflow count: %d", overflow_count_log);
-      }
-    } else {
-      ESP_LOGW(LOG_TAG, "Speaker circular buffer is NULL!");
-    }
   }
 }
 
@@ -183,7 +156,10 @@ static uint64_t total_read_time = 0;
 static uint64_t total_aec_time = 0;
 static uint64_t total_encode_time = 0;
 static uint64_t total_send_time = 0;
+static uint64_t total_wait_time = 0;
 static uint32_t cycle_count = 0;
+static uint32_t aec_skip_count = 0;
+static uint64_t last_frame_send_time = 0;
 
 // AEC processed buffer
 static int16_t *processed_buffer = NULL;
@@ -221,13 +197,8 @@ void pipecat_init_aec() {
   // Try a simpler approach - create AEC without the task configuration struct
   ESP_LOGI(LOG_TAG, "Initializing AEC");
   
-  // Create circular buffer for speaker reference data
-  speaker_circular_buffer = (int16_t *)heap_caps_malloc(SPEAKER_BUFFER_SAMPLES * sizeof(int16_t), MALLOC_CAP_SPIRAM);
-  if (speaker_circular_buffer == NULL) {
-    ESP_LOGE(LOG_TAG, "Failed to allocate speaker circular buffer");
-    return;
-  }
-  ESP_LOGI(LOG_TAG, "Speaker circular buffer created, size=%d samples", SPEAKER_BUFFER_SAMPLES);
+  // Speaker circular buffer is now statically allocated
+  ESP_LOGI(LOG_TAG, "Speaker circular buffer ready, size=640 samples");
   
   // Create AEC with basic parameters (filter_length=4 recommended for ESP32S3)
   aec_handle = afe_aec_create("MR", 4, AFE_TYPE_VC, AFE_MODE_LOW_COST);
@@ -270,278 +241,198 @@ void pipecat_init_aec() {
 void pipecat_send_audio(PeerConnection *peer_connection) {
   uint64_t start_time, end_time;
   
-  // Check if AEC is initialized
-  if (aec_handle == NULL) {
-    // AEC disabled, bypass processing
-    // ESP_LOGW(LOG_TAG, "AEC not initialized, bypassing");
-  }
-  
-  // Measure esp_codec_dev_read time
-  start_time = esp_timer_get_time();
-  if (esp_codec_dev_read(mic_codec_dev, read_buffer, PCM_BUFFER_SIZE) !=
-      ESP_OK) {
-    printf("esp_codec_dev_read failed");
-    return;
-  }
-  end_time = esp_timer_get_time();
-  total_read_time += (end_time - start_time);
-
-  // Don't zero out microphone input - let AEC handle echo cancellation
-  // if (is_playing) {
-  //   memset(read_buffer, 0, PCM_BUFFER_SIZE);
-  // }
-
-  // AEC processing (skip if disabled)
-  if (aec_handle != NULL && aec_enabled) {
-    // Get AEC chunk size (should be 256 samples for 16kHz)
-    int aec_chunk_size = afe_aec_get_chunksize(aec_handle);
-    int total_samples = PCM_BUFFER_SIZE / sizeof(uint16_t);  // 640 samples
-    int num_chunks = total_samples / aec_chunk_size;  // Should be 2.5, so we'll process 2 chunks
+  // Wait for at least one Opus frame (320 samples) of speaker data
+  uint64_t wait_start = esp_timer_get_time();
+  while (true) {
+    portENTER_CRITICAL(&speaker_buffer_mutex);
+    int samples_available = 0;
+    if (speaker_buffer_active) {
+      samples_available = (speaker_write_idx - speaker_read_idx + 640) % 640;
+    }
+    portEXIT_CRITICAL(&speaker_buffer_mutex);
     
-    // Debug logging
-    static bool first_run = true;
-    if (first_run) {
-      ESP_LOGI(LOG_TAG, "AEC debug: chunk_size=%d, total_samples=%d, num_chunks=%d", 
-               aec_chunk_size, total_samples, num_chunks);
-      ESP_LOGI(LOG_TAG, "AEC debug: total_ch_num=%d, sample_rate=%d", 
-               aec_handle->pcm_config.total_ch_num, aec_handle->pcm_config.sample_rate);
-      ESP_LOGI(LOG_TAG, "AEC debug: input format=%s, mic_num=%d, ref_num=%d",
-               "MR", aec_handle->pcm_config.mic_num, aec_handle->pcm_config.ref_num);
-      first_run = false;
+    if (samples_available >= 320) {
+      break;  // We have enough data
     }
     
-    // Process all chunks
-    if (num_chunks != AEC_CHUNKS_TO_BATCH) {
-      ESP_LOGW(LOG_TAG, "Unexpected number of AEC chunks: %d (expected %d)", num_chunks, AEC_CHUNKS_TO_BATCH);
+    // Brief delay before checking again
+    vTaskDelay(1 / portTICK_PERIOD_MS);
+  }
+  uint64_t wait_end = esp_timer_get_time();
+  total_wait_time += (wait_end - wait_start);
+  
+  // Buffers for interleaved processing
+  static int16_t opus_accumulator[640];  // Accumulate AEC output for Opus encoding
+  static int opus_accumulator_pos = 0;
+  int16_t mic_chunk_buffer[256];
+  int aec_chunk_size = 256;  // AEC processes 256 samples at a time
+  
+  // Track timing for AEC skip decision
+  bool skip_last_aec = false;
+  int frame_count = 0;
+  
+  // Single loop processing 5 chunks of 256 samples each
+  for (int chunk = 0; chunk < 5; chunk++) {
+    // 1. Read 256 samples from microphone
+    start_time = esp_timer_get_time();
+    if (esp_codec_dev_read(mic_codec_dev, (uint8_t*)mic_chunk_buffer, 256 * sizeof(int16_t)) != ESP_OK) {
+      ESP_LOGE(LOG_TAG, "esp_codec_dev_read failed");
+      return;
     }
+    end_time = esp_timer_get_time();
+    total_read_time += (end_time - start_time);
     
-    // Process AEC in chunks
-    for (int chunk = 0; chunk < num_chunks; chunk++) {
-      // Prepare input: interleave mic data with zeros (reference channel)
-      int16_t *mic_ptr = (int16_t *)read_buffer + (chunk * aec_chunk_size);
-      
+    // 2. Process with AEC or bypass
+    if (aec_handle != NULL && aec_enabled && !(chunk == 4 && skip_last_aec)) {
       // Get reference data from speaker circular buffer
-      bool have_ref_data = false;
-      int16_t ref_chunk[256];  // Temporary buffer for reference data
+      int16_t ref_chunk[256];
+      int samples_available = 0;
+      bool log_underrun = false;
+      static int underrun_count = 0;
+      static int ref_success_count = 0;
       
-      if (speaker_circular_buffer != NULL) {
-        // Variables for logging outside critical section
-        bool log_buffer_state = false;
-        bool log_primed = false;
-        bool log_underrun = false;
-        size_t samples_for_log = 0;
-        int underrun_type = 0;  // 0=none, 1=remaining chunks, 2=current chunk
-        size_t needed_for_log = 0;
-        
-        portENTER_CRITICAL(&speaker_buffer_mutex);
-        
-        // Debug log buffer state occasionally
-        static int buffer_check_count = 0;
-        if (++buffer_check_count % 100 == 0) {
-          log_buffer_state = true;
-          samples_for_log = speaker_samples_available;
-        }
-        
-        // Warn if we're getting close to the unprime threshold
-        static int low_buffer_count = 0;
-        if (speaker_samples_available < 2200 && speaker_samples_available >= 1920) {
-          if (++low_buffer_count % 10 == 0) {
-            ESP_LOGW(LOG_TAG, "Buffer running low: %d samples (unprime at 1920)", speaker_samples_available);
-          }
-        } else {
-          low_buffer_count = 0;
-        }
-        
-        // Only start using reference data if we have enough buffered
-        // Use hysteresis to prevent oscillation
-        if (!ref_buffer_primed) {
-          size_t required_samples = in_recovery_mode ? MIN_REF_BUFFER_RESUME : MIN_REF_BUFFER_SAMPLES;
-          if (speaker_samples_available >= required_samples) {
-            ref_buffer_primed = true;
-            log_primed = true;
-            samples_for_log = speaker_samples_available;
-          }
-        }
-        
-        // Recovery check: if we've been unprimed for too long, reduce requirements
-        static int unprime_count = 0;
-        static bool in_recovery_mode = false;
-        
-        if (!ref_buffer_primed) {
-          unprime_count++;
-          // After 25 attempts (~2 seconds), enter recovery mode
-          if (unprime_count > 25 && !in_recovery_mode) {
-            in_recovery_mode = true;
-            ESP_LOGW(LOG_TAG, "Entering recovery mode after %d attempts", unprime_count);
-          }
-        } else {
-          unprime_count = 0;
-        }
+      portENTER_CRITICAL(&speaker_buffer_mutex);
         
-        // Only try to get reference data if buffer is primed
-        if (ref_buffer_primed) {
-          // Check if we have enough samples for this chunk
-          if (speaker_samples_available >= aec_chunk_size) {
-            // Read samples from circular buffer
+        if (speaker_buffer_active) {
+          samples_available = (speaker_write_idx - speaker_read_idx + 640) % 640;
+          
+          if (samples_available >= aec_chunk_size) {
+            // We have enough data, read it
             for (int i = 0; i < aec_chunk_size; i++) {
               ref_chunk[i] = speaker_circular_buffer[speaker_read_idx];
-              speaker_read_idx = (speaker_read_idx + 1) % SPEAKER_BUFFER_SAMPLES;
+              speaker_read_idx = (speaker_read_idx + 1) % 640;
+            }
+          } else if (samples_available > 0) {
+            // Not enough data, read what we have and pad with zeros
+            int i;
+            for (i = 0; i < samples_available; i++) {
+              ref_chunk[i] = speaker_circular_buffer[speaker_read_idx];
+              speaker_read_idx = (speaker_read_idx + 1) % 640;
+            }
+            for (; i < aec_chunk_size; i++) {
+              ref_chunk[i] = 0;
             }
-            speaker_samples_available -= aec_chunk_size;
-            have_ref_data = true;
-            
-            // Check if we're running too low for remaining chunks
-            int remaining_chunks = num_chunks - chunk - 1;
-            size_t remaining_samples_needed = remaining_chunks * aec_chunk_size;
             
-            // In recovery mode, be more lenient - only unprime if we can't process ANY more chunks
-            if (in_recovery_mode) {
-              if (speaker_samples_available < aec_chunk_size) {
-                ref_buffer_primed = false;
-                log_underrun = true;
-                underrun_type = 1;
-                samples_for_log = speaker_samples_available;
-                needed_for_log = aec_chunk_size;
-              } else if (speaker_samples_available >= MIN_REF_BUFFER_SAMPLES) {
-                // Exit recovery mode once buffer is fully healthy
-                in_recovery_mode = false;
-                ESP_LOGI(LOG_TAG, "Exiting recovery mode, buffer healthy with %d samples", speaker_samples_available);
-              }
-            } else {
-              // Normal mode - only unprime if we drop below a lower threshold
-              // This provides hysteresis to prevent oscillation
-              if (speaker_samples_available < 1920) {  // 1.5 cycles = 120ms
-                ref_buffer_primed = false;
-                log_underrun = true;
-                underrun_type = 1;
-                samples_for_log = speaker_samples_available;
-                needed_for_log = remaining_samples_needed;
-                ESP_LOGW(LOG_TAG, "Unpriming: dropped below 1920 samples (have %d)", speaker_samples_available);
-              }
+            if (++underrun_count % 10 == 0) {
+              log_underrun = true;
             }
           } else {
-            // Not enough samples, unprime
-            ref_buffer_primed = false;
-            log_underrun = true;
-            underrun_type = 2;
-            samples_for_log = speaker_samples_available;
-            needed_for_log = aec_chunk_size;
+            // No data at all - don't advance read pointer, just use zeros
+            for (int i = 0; i < aec_chunk_size; i++) {
+              ref_chunk[i] = 0;
+            }
+            
+            if (++underrun_count % 10 == 0) {
+              log_underrun = true;
+            }
+          }
+        } else {
+          // Buffer not active, use zeros
+          for (int i = 0; i < aec_chunk_size; i++) {
+            ref_chunk[i] = 0;
           }
         }
         
         portEXIT_CRITICAL(&speaker_buffer_mutex);
         
-        // Do logging outside critical section
-        if (log_buffer_state) {
-          ESP_LOGI(LOG_TAG, "Circular buffer state: %d samples available, chunk %d/%d", 
-                   samples_for_log, chunk, num_chunks);
-        }
-        if (log_primed) {
-          ESP_LOGI(LOG_TAG, "Reference buffer primed with %d samples", samples_for_log);
-        }
         if (log_underrun) {
-          if (underrun_type == 1) {
-            ESP_LOGW(LOG_TAG, "Reference buffer underrun (chunk %d/%d, need %d samples for remaining, have %d)", 
-                     chunk, num_chunks, needed_for_log, samples_for_log);
-          } else {
-            ESP_LOGW(LOG_TAG, "Reference buffer underrun (chunk %d/%d, need %d samples for current, have %d)", 
-                     chunk, num_chunks, needed_for_log, samples_for_log);
-          }
+          ESP_LOGW(LOG_TAG, "Speaker buffer underrun: only %d/%d samples available", 
+                   samples_available, aec_chunk_size);
         }
-      } else {
-        static bool logged_null = false;
-        if (!logged_null) {
-          ESP_LOGW(LOG_TAG, "Speaker circular buffer is NULL during read!");
-          logged_null = true;
-        }
-      }
       
-      if (have_ref_data) {
-        // Use actual speaker reference data
-        for (int i = 0; i < aec_chunk_size; i++) {
-          aec_in_buffer[i * 2] = mic_ptr[i];      // Microphone channel
-          aec_in_buffer[i * 2 + 1] = ref_chunk[i]; // Reference channel (actual speaker data)
-        }
-        
-        // Log successful reads occasionally
-        static int ref_success_count = 0;
-        if (++ref_success_count % 50 == 0) {
-          ESP_LOGI(LOG_TAG, "Using speaker reference data, count: %d, buffer: %d samples", 
-                   ref_success_count, speaker_samples_available);
-        }
-      } else {
-        // No reference data available, use zeros
-        for (int i = 0; i < aec_chunk_size; i++) {
-          aec_in_buffer[i * 2] = mic_ptr[i];      // Microphone channel  
-          aec_in_buffer[i * 2 + 1] = 0;           // Reference channel (zeros as fallback)
-        }
-        // Log when we don't have reference data (only occasionally to avoid spam)
-        static int no_ref_count = 0;
-        if (++no_ref_count % 100 == 0) {
-          ESP_LOGW(LOG_TAG, "No speaker reference data available count: %d", no_ref_count);
-        }
+      // Create interleaved AEC input
+      for (int i = 0; i < aec_chunk_size; i++) {
+        aec_in_buffer[i * 2] = mic_chunk_buffer[i];     // Mic
+        aec_in_buffer[i * 2 + 1] = ref_chunk[i];        // Reference
       }
       
-      // Run AEC process for this chunk
+      // Run AEC
       start_time = esp_timer_get_time();
       int ret = afe_aec_process(aec_handle, aec_in_buffer, aec_out_buffer);
       end_time = esp_timer_get_time();
       total_aec_time += (end_time - start_time);
       
       if (ret < 0) {
-        ESP_LOGE(LOG_TAG, "afe_aec_process failed: return value %d (0x%x) at chunk %d", 
-                 ret, ret, chunk);
-        // Fall back to bypass on error
-        memcpy(processed_buffer, read_buffer, PCM_BUFFER_SIZE);
-        return;
+        ESP_LOGE(LOG_TAG, "afe_aec_process failed: return value %d", ret);
+        // Fall back - copy mic input directly
+        memcpy(opus_accumulator + opus_accumulator_pos, mic_chunk_buffer, 256 * sizeof(int16_t));
+      } else {
+        // Copy AEC output to accumulator
+        memcpy(opus_accumulator + opus_accumulator_pos, aec_out_buffer, 256 * sizeof(int16_t));
       }
-      // Log success on first chunk of first run
-      if (first_run && chunk == 0) {
-        ESP_LOGI(LOG_TAG, "AEC process successful, returned %d bytes", ret);
+      
+      // Log successful AEC processing occasionally
+      if (++ref_success_count % 50 == 0) {
+        ESP_LOGI(LOG_TAG, "Using speaker reference data, count: %d", ref_success_count);
       }
+    } else {
+      // AEC disabled or skipped - copy mic input directly
+      memcpy(opus_accumulator + opus_accumulator_pos, mic_chunk_buffer, 256 * sizeof(int16_t));
       
-      // Copy AEC output to processed buffer
-      memcpy(processed_buffer + (chunk * aec_chunk_size), aec_out_buffer, aec_chunk_size * sizeof(int16_t));
+      if (chunk == 4 && skip_last_aec) {
+        static int skip_log_count = 0;
+        if (++skip_log_count % 10 == 0) {
+          ESP_LOGI(LOG_TAG, "Skipped AEC on 5th chunk to catch up, count: %d", skip_log_count);
+        }
+      }
     }
     
-    // All samples have been processed through AEC
-  } else {
-    // AEC disabled, just copy input to processed buffer
-    memcpy(processed_buffer, read_buffer, PCM_BUFFER_SIZE);
-  }
-
-  // Process Opus frames - we have exactly COMMON_BUFFER_SAMPLES (1280) samples
-  // This encodes to exactly OPUS_FRAMES_TO_BATCH (4) frames
-  for (int frame = 0; frame < OPUS_FRAMES_TO_BATCH; frame++) {
-    // Measure opus_encode time
-    start_time = esp_timer_get_time();
-    auto encoded_size = opus_encode(opus_encoder, 
-                                    processed_buffer + (frame * OPUS_FRAME_SIZE),
-                                    OPUS_FRAME_SIZE,
-                                    encoder_output_buffer, OPUS_BUFFER_SIZE);
-    end_time = esp_timer_get_time();
-    total_encode_time += (end_time - start_time);
+    opus_accumulator_pos += 256;
     
-    // Measure peer_connection_send_audio time
-    start_time = esp_timer_get_time();
-    peer_connection_send_audio(peer_connection, encoder_output_buffer,
-                               encoded_size);
-    end_time = esp_timer_get_time();
-    total_send_time += (end_time - start_time);
+    // 3. Encode and send Opus frames when we have enough samples
+    while (opus_accumulator_pos >= 320) {
+      start_time = esp_timer_get_time();
+      int encoded_size = opus_encode(opus_encoder, opus_accumulator, 320,
+                                    encoder_output_buffer, OPUS_BUFFER_SIZE);
+      end_time = esp_timer_get_time();
+      total_encode_time += (end_time - start_time);
+      
+      start_time = esp_timer_get_time();
+      peer_connection_send_audio(peer_connection, encoder_output_buffer, encoded_size);
+      end_time = esp_timer_get_time();
+      total_send_time += (end_time - start_time);
+      
+      frame_count++;
+      
+      // After sending the 3rd frame, check timing for next call
+      if (frame_count == 3) {
+        uint64_t third_frame_sent_time = end_time;
+        if (last_frame_send_time > 0) {
+          uint64_t elapsed = (third_frame_sent_time - last_frame_send_time) / 1000; // ms
+          if (elapsed > 62) {
+            skip_last_aec = true;
+            aec_skip_count++;
+          }
+        }
+      }
+      
+      // Remember when we sent the 4th frame
+      if (frame_count == 4) {
+        last_frame_send_time = end_time;
+      }
+      
+      // Shift remaining samples
+      opus_accumulator_pos -= 320;
+      if (opus_accumulator_pos > 0) {
+        memmove(opus_accumulator, opus_accumulator + 320, opus_accumulator_pos * sizeof(int16_t));
+      }
+    }
   }
   
-  // Update cycle count and print statistics every 50 cycles
+  // Update statistics
   cycle_count++;
   if (cycle_count % 50 == 0) {
+    uint64_t avg_wait_time = total_wait_time / 50;
     uint64_t avg_read_time = total_read_time / 50;
     uint64_t avg_aec_time = total_aec_time / 50;
     uint64_t avg_encode_time = total_encode_time / 50;
     uint64_t avg_send_time = total_send_time / 50;
     
-    ESP_LOGI(LOG_TAG, "Audio timing stats (avg over 50 cycles): read=%llu us, aec=%llu us, encode=%llu us, send=%llu us",
-             avg_read_time, avg_aec_time, avg_encode_time, avg_send_time);
+    ESP_LOGI(LOG_TAG, "Audio timing stats (avg over 50 cycles): wait=%llu us, read=%llu us, aec=%llu us, encode=%llu us, send=%llu us, aec_skips=%lu",
+             avg_wait_time, avg_read_time, avg_aec_time, avg_encode_time, avg_send_time, aec_skip_count);
     
-    // Reset counters for next 50 cycles
+    // Reset counters
+    total_wait_time = 0;
     total_read_time = 0;
     total_aec_time = 0;
     total_encode_time = 0;

From 125e422bffc54be3c63ac29444f101d1cf9e5ac6 Mon Sep 17 00:00:00 2001
From: Kwindla Hultman Kramer <kwindla@gmail.com>
Date: Mon, 7 Jul 2025 21:09:54 -0700
Subject: [PATCH 11/14] temp commit

---
 esp32-s3-box-3/src/media.cpp | 226 +++++++++++++++++++++++++++--------
 1 file changed, 175 insertions(+), 51 deletions(-)

diff --git a/esp32-s3-box-3/src/media.cpp b/esp32-s3-box-3/src/media.cpp
index 6851678..a35932d 100644
--- a/esp32-s3-box-3/src/media.cpp
+++ b/esp32-s3-box-3/src/media.cpp
@@ -57,8 +57,10 @@ OpusDecoder *opus_decoder = NULL;
 static int16_t speaker_circular_buffer[640];   // Exactly 2 Opus frames (40ms @ 16kHz)
 static volatile size_t speaker_write_idx = 0;
 static volatile size_t speaker_read_idx = 0;
+static volatile size_t speaker_buffer_count = 0;  // Number of samples in buffer
 static volatile bool speaker_buffer_active = false; // True when we're receiving audio
 static portMUX_TYPE speaker_buffer_mutex = portMUX_INITIALIZER_UNLOCKED;
+static volatile uint64_t last_buffer_read_time = 0;  // Track when buffer was last read
 
 // Function to update playback state and reset buffer when playback stops
 void set_is_playing(int16_t *in_buf, size_t in_samples) {
@@ -76,6 +78,8 @@ void set_is_playing(int16_t *in_buf, size_t in_samples) {
     portENTER_CRITICAL(&speaker_buffer_mutex);
     speaker_buffer_active = false;
     speaker_read_idx = speaker_write_idx; // Reset read to match write
+    speaker_buffer_count = 0;
+    last_buffer_read_time = 0;  // Reset read time
     portEXIT_CRITICAL(&speaker_buffer_mutex);
     ESP_LOGD(LOG_TAG, "Playback stopped, reset speaker buffer");
   }
@@ -99,51 +103,83 @@ void pipecat_audio_decode(uint8_t *data, size_t size) {
   if (last_decode_time > 0) {
     uint64_t delta_ms = (now - last_decode_time) / 1000;
     static int packet_count = 0;
-    if (++packet_count % 10 == 0) {
-      ESP_LOGI(LOG_TAG, "Audio packet %d: size=%d, delta=%llu ms", packet_count, size, delta_ms);
+    packet_count++;
+    
+    // Log every packet for first 20, then every 10th
+    if (packet_count <= 20 || packet_count % 10 == 0) {
+      // Get current buffer state for correlation
+      size_t current_write_idx, current_read_idx, current_count;
+      portENTER_CRITICAL(&speaker_buffer_mutex);
+      current_write_idx = speaker_write_idx;
+      current_read_idx = speaker_read_idx;
+      current_count = speaker_buffer_count;
+      portEXIT_CRITICAL(&speaker_buffer_mutex);
+      
+      ESP_LOGI(LOG_TAG, "[ARRIVAL] pkt#%d: size=%d, delta=%llums, buffer=%d samples, w=%d r=%d", 
+               packet_count, size, delta_ms, current_count, current_write_idx, current_read_idx);
     }
   }
   last_decode_time = now;
   
-  esp_err_t ret;
-  int decoded_size =
-      opus_decode(opus_decoder, data, size, decoder_buffer, PCM_BUFFER_SIZE, 0);
+  // Decode Opus frame to get the size (or use fixed size if predictable)
+  int decoded_size = opus_decode(opus_decoder, data, size, decoder_buffer, PCM_BUFFER_SIZE, 0);
 
   if (decoded_size > 0) {
-    set_is_playing(decoder_buffer, decoded_size);
-    
-    // Send to speaker
-    if ((ret = esp_codec_dev_write(spk_codec_dev, decoder_buffer,
-                                   decoded_size * sizeof(uint16_t))) !=
-        ESP_OK) {
-      ESP_LOGE(LOG_TAG, "esp_codec_dev_write failed: %s", esp_err_to_name(ret));
-    }
-    
-    // Also send to circular buffer for AEC reference
+    // FIRST: Write to circular buffer for AEC reference (before any other processing)
     portENTER_CRITICAL(&speaker_buffer_mutex);
       
-      // Write to circular buffer, always keeping just the freshest 2 frames
-      for (int i = 0; i < decoded_size; i++) {
+      // Write to circular buffer
+      size_t space_available = 640 - speaker_buffer_count;
+      size_t samples_to_write = decoded_size;
+      
+      // If we don't have enough space, drop oldest samples
+      if (samples_to_write > space_available) {
+        size_t samples_to_drop = samples_to_write - space_available;
+        speaker_read_idx = (speaker_read_idx + samples_to_drop) % 640;
+        speaker_buffer_count -= samples_to_drop;
+      }
+      
+      // Write new samples
+      for (int i = 0; i < samples_to_write; i++) {
         speaker_circular_buffer[speaker_write_idx] = decoder_buffer[i];
         speaker_write_idx = (speaker_write_idx + 1) % 640;
       }
+      speaker_buffer_count += samples_to_write;
       speaker_buffer_active = true;
       
-      // If read pointer is too far behind, advance it to stay within 2 frames
-      size_t distance = (speaker_write_idx - speaker_read_idx + 640) % 640;
-      if (distance > 640 - 320) {
-        // Read pointer is about to be overrun, advance it
-        speaker_read_idx = (speaker_write_idx + 640 - 320) % 640;
-      }
+      // Get samples in buffer after write (while still in critical section)
+      size_t samples_in_buffer = speaker_buffer_count;
       
       portEXIT_CRITICAL(&speaker_buffer_mutex);
       
-      // Occasional logging
+      // Log timing relative to last read from this buffer
+      uint64_t time_since_last_read = 0;
+      if (last_buffer_read_time > 0 && now > last_buffer_read_time) {
+        time_since_last_read = (now - last_buffer_read_time) / 1000;
+      } else {
+        time_since_last_read = 0;  // First write or invalid timestamp
+      }
+      
+      // More frequent logging for debugging
       static int write_count = 0;
-      if (++write_count % 50 == 0) {
-        ESP_LOGI(LOG_TAG, "Speaker buffer: write_idx=%d, read_idx=%d", 
+      write_count++;
+      if (write_count <= 20 || write_count % 10 == 0) {
+        // Log the actual count, not calculated value
+        ESP_LOGI(LOG_TAG, "[WRITE] cnt=%d: +%d samples, buffer=%d, since_read=%llums (w=%d r=%d)", 
+                 write_count, decoded_size, samples_in_buffer, time_since_last_read,
                  speaker_write_idx, speaker_read_idx);
       }
+    
+    // THEN: Process for playback
+    set_is_playing(decoder_buffer, decoded_size);
+    
+    // Send to speaker
+    esp_err_t ret;
+    if ((ret = esp_codec_dev_write(spk_codec_dev, decoder_buffer,
+                                   decoded_size * sizeof(uint16_t))) !=
+        ESP_OK) {
+      ESP_LOGE(LOG_TAG, "esp_codec_dev_write failed: %s", esp_err_to_name(ret));
+    }
   }
 }
 
@@ -241,18 +277,25 @@ void pipecat_init_aec() {
 void pipecat_send_audio(PeerConnection *peer_connection) {
   uint64_t start_time, end_time;
   
-  // Wait for at least one Opus frame (320 samples) of speaker data
+  // Wait for (640 samples = 2 Opus frames) in our speaker data reference buffer to handle codec burst pattern
   uint64_t wait_start = esp_timer_get_time();
+  int initial_samples = 0;
+  int wait_iterations = 0;
   while (true) {
     portENTER_CRITICAL(&speaker_buffer_mutex);
     int samples_available = 0;
     if (speaker_buffer_active) {
-      samples_available = (speaker_write_idx - speaker_read_idx + 640) % 640;
+      samples_available = speaker_buffer_count;
     }
     portEXIT_CRITICAL(&speaker_buffer_mutex);
     
-    if (samples_available >= 320) {
-      break;  // We have enough data
+    wait_iterations++;
+    if (initial_samples == 0) {
+      initial_samples = samples_available;
+    }
+    
+    if (samples_available >= 640) {
+      break;  // Buffer is full - ready to handle burst consumption
     }
     
     // Brief delay before checking again
@@ -261,6 +304,21 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
   uint64_t wait_end = esp_timer_get_time();
   total_wait_time += (wait_end - wait_start);
   
+  // Log initial buffer state
+  static int call_count = 0;
+  call_count++;
+  // Get consistent snapshot of buffer state
+  size_t log_write_idx, log_read_idx, log_count;
+  portENTER_CRITICAL(&speaker_buffer_mutex);
+  log_write_idx = speaker_write_idx;
+  log_read_idx = speaker_read_idx;
+  log_count = speaker_buffer_count;
+  portEXIT_CRITICAL(&speaker_buffer_mutex);
+  
+  ESP_LOGI(LOG_TAG, "[SEND_START] call#%d: initial=%d, wait=%lluus, iters=%d, count=%d, w=%d r=%d", 
+           call_count, initial_samples, wait_end - wait_start, wait_iterations,
+           log_count, log_write_idx, log_read_idx);
+  
   // Buffers for interleaved processing
   static int16_t opus_accumulator[640];  // Accumulate AEC output for Opus encoding
   static int opus_accumulator_pos = 0;
@@ -271,9 +329,26 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
   bool skip_last_aec = false;
   int frame_count = 0;
   
+  // Track timing - each read blocks for ~16ms (256 samples @ 16kHz)
+  uint64_t loop_start_time = esp_timer_get_time();
+  
   // Single loop processing 5 chunks of 256 samples each
   for (int chunk = 0; chunk < 5; chunk++) {
+    uint64_t chunk_start_time = esp_timer_get_time();
     // 1. Read 256 samples from microphone
+    // Log buffer state before read (for chunk 0 and 1)
+    if (chunk <= 1) {
+      size_t pre_read_write_idx, pre_read_read_idx, pre_read_count;
+      portENTER_CRITICAL(&speaker_buffer_mutex);
+      pre_read_write_idx = speaker_write_idx;
+      pre_read_read_idx = speaker_read_idx;
+      pre_read_count = speaker_buffer_count;
+      portEXIT_CRITICAL(&speaker_buffer_mutex);
+      
+      ESP_LOGI(LOG_TAG, "[PRE_READ] chunk=%d: buffer has %d samples (w=%d r=%d)", 
+               chunk, pre_read_count, pre_read_write_idx, pre_read_read_idx);
+    }
+    
     start_time = esp_timer_get_time();
     if (esp_codec_dev_read(mic_codec_dev, (uint8_t*)mic_chunk_buffer, 256 * sizeof(int16_t)) != ESP_OK) {
       ESP_LOGE(LOG_TAG, "esp_codec_dev_read failed");
@@ -282,6 +357,12 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
     end_time = esp_timer_get_time();
     total_read_time += (end_time - start_time);
     
+    // Log if a long read happened (indicating actual blocking)
+    uint64_t read_duration = (end_time - start_time) / 1000;
+    if (read_duration > 10) {  // More than 10ms suggests actual blocking
+      ESP_LOGI(LOG_TAG, "[BLOCKED_READ] chunk=%d: read took %llums", chunk, read_duration);
+    }
+    
     // 2. Process with AEC or bypass
     if (aec_handle != NULL && aec_enabled && !(chunk == 4 && skip_last_aec)) {
       // Get reference data from speaker circular buffer
@@ -294,7 +375,7 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
       portENTER_CRITICAL(&speaker_buffer_mutex);
         
         if (speaker_buffer_active) {
-          samples_available = (speaker_write_idx - speaker_read_idx + 640) % 640;
+          samples_available = speaker_buffer_count;
           
           if (samples_available >= aec_chunk_size) {
             // We have enough data, read it
@@ -302,6 +383,7 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
               ref_chunk[i] = speaker_circular_buffer[speaker_read_idx];
               speaker_read_idx = (speaker_read_idx + 1) % 640;
             }
+            speaker_buffer_count -= aec_chunk_size;
           } else if (samples_available > 0) {
             // Not enough data, read what we have and pad with zeros
             int i;
@@ -309,11 +391,14 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
               ref_chunk[i] = speaker_circular_buffer[speaker_read_idx];
               speaker_read_idx = (speaker_read_idx + 1) % 640;
             }
+            speaker_buffer_count = 0;  // We consumed all available samples
             for (; i < aec_chunk_size; i++) {
               ref_chunk[i] = 0;
             }
             
-            if (++underrun_count % 10 == 0) {
+            underrun_count++;
+            // Log every underrun for first 20, then every 10th
+            if (underrun_count <= 20 || underrun_count % 10 == 0) {
               log_underrun = true;
             }
           } else {
@@ -322,7 +407,9 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
               ref_chunk[i] = 0;
             }
             
-            if (++underrun_count % 10 == 0) {
+            underrun_count++;
+            // Log every underrun for first 20, then every 10th
+            if (underrun_count <= 20 || underrun_count % 10 == 0) {
               log_underrun = true;
             }
           }
@@ -335,9 +422,28 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
         
         portEXIT_CRITICAL(&speaker_buffer_mutex);
         
+        // Update last read time if we read any data
+        // Update last read time if we successfully read data
+        if (speaker_buffer_active && samples_available >= aec_chunk_size) {
+          last_buffer_read_time = esp_timer_get_time();
+        } else if (speaker_buffer_active && samples_available > 0) {
+          // Even partial reads update the time
+          last_buffer_read_time = esp_timer_get_time();
+        }
+        
         if (log_underrun) {
-          ESP_LOGW(LOG_TAG, "Speaker buffer underrun: only %d/%d samples available", 
-                   samples_available, aec_chunk_size);
+          uint64_t time_since_loop_start = (esp_timer_get_time() - loop_start_time) / 1000;
+          // Get consistent state outside critical section
+          size_t log_w_idx, log_r_idx, log_cnt;
+          portENTER_CRITICAL(&speaker_buffer_mutex);
+          log_w_idx = speaker_write_idx;
+          log_r_idx = speaker_read_idx;
+          log_cnt = speaker_buffer_count;
+          portEXIT_CRITICAL(&speaker_buffer_mutex);
+          
+          ESP_LOGW(LOG_TAG, "[UNDERRUN] #%d: chunk=%d, only %d/%d samples, %llums since loop start, count=%d, w=%d r=%d", 
+                   underrun_count, chunk, samples_available, aec_chunk_size, 
+                   time_since_loop_start, log_cnt, log_w_idx, log_r_idx);
         }
       
       // Create interleaved AEC input
@@ -346,24 +452,32 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
         aec_in_buffer[i * 2 + 1] = ref_chunk[i];        // Reference
       }
       
-      // Run AEC
-      start_time = esp_timer_get_time();
-      int ret = afe_aec_process(aec_handle, aec_in_buffer, aec_out_buffer);
-      end_time = esp_timer_get_time();
-      total_aec_time += (end_time - start_time);
+      // Temporarily disabled AEC
+      // ----
+      // // Run AEC
+      // start_time = esp_timer_get_time();
+      // int ret = afe_aec_process(aec_handle, aec_in_buffer, aec_out_buffer);
+      // end_time = esp_timer_get_time();
+      // total_aec_time += (end_time - start_time);
       
-      if (ret < 0) {
-        ESP_LOGE(LOG_TAG, "afe_aec_process failed: return value %d", ret);
-        // Fall back - copy mic input directly
-        memcpy(opus_accumulator + opus_accumulator_pos, mic_chunk_buffer, 256 * sizeof(int16_t));
-      } else {
-        // Copy AEC output to accumulator
-        memcpy(opus_accumulator + opus_accumulator_pos, aec_out_buffer, 256 * sizeof(int16_t));
-      }
+      // if (ret < 0) {
+      //   ESP_LOGE(LOG_TAG, "afe_aec_process failed: return value %d", ret);
+      //   // Fall back - copy mic input directly
+      //   memcpy(opus_accumulator + opus_accumulator_pos, mic_chunk_buffer, 256 * sizeof(int16_t));
+      // } else {
+      //   // Copy AEC output to accumulator
+      //   memcpy(opus_accumulator + opus_accumulator_pos, aec_out_buffer, 256 * sizeof(int16_t));
+      // }
+      // ---- 
+      // Just copy mic data
+      memcpy(opus_accumulator + opus_accumulator_pos, mic_chunk_buffer, 256 * sizeof(int16_t));
       
-      // Log successful AEC processing occasionally
-      if (++ref_success_count % 50 == 0) {
-        ESP_LOGI(LOG_TAG, "Using speaker reference data, count: %d", ref_success_count);
+      // Log successful reads more frequently for debugging
+      ref_success_count++;
+      if (ref_success_count <= 10 || ref_success_count % 20 == 0) {
+        uint64_t chunk_duration = (esp_timer_get_time() - chunk_start_time) / 1000;
+        ESP_LOGI(LOG_TAG, "[READ_OK] chunk=%d, got %d samples, chunk took %llums", 
+                 chunk, aec_chunk_size, chunk_duration);
       }
     } else {
       // AEC disabled or skipped - copy mic input directly
@@ -379,6 +493,8 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
     
     opus_accumulator_pos += 256;
     
+    // Note: esp_codec_dev_read blocks, providing natural pacing
+    
     // 3. Encode and send Opus frames when we have enough samples
     while (opus_accumulator_pos >= 320) {
       start_time = esp_timer_get_time();
@@ -420,6 +536,14 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
   }
   
   // Update statistics
+  // Log loop timing
+  uint64_t loop_duration = (esp_timer_get_time() - loop_start_time) / 1000;
+  static int timing_log_count = 0;
+  if (++timing_log_count <= 10 || timing_log_count % 20 == 0) {
+    ESP_LOGI(LOG_TAG, "[LOOP_TIME] call#%d: full loop took %llums (expect ~80ms for 5x16ms reads)", 
+             call_count, loop_duration);
+  }
+  
   cycle_count++;
   if (cycle_count % 50 == 0) {
     uint64_t avg_wait_time = total_wait_time / 50;

From 153b8ebee3385bafa31a11b431bc30929ecb0f7b Mon Sep 17 00:00:00 2001
From: Kwindla Hultman Kramer <kwindla@gmail.com>
Date: Mon, 7 Jul 2025 22:12:43 -0700
Subject: [PATCH 12/14] temp commit

---
 esp32-s3-box-3/src/media.cpp | 114 +++++++++++++++++++++++++----------
 1 file changed, 83 insertions(+), 31 deletions(-)

diff --git a/esp32-s3-box-3/src/media.cpp b/esp32-s3-box-3/src/media.cpp
index a35932d..5625d65 100644
--- a/esp32-s3-box-3/src/media.cpp
+++ b/esp32-s3-box-3/src/media.cpp
@@ -54,7 +54,7 @@ OpusDecoder *opus_decoder = NULL;
 
 // Circular buffer for speaker reference data (not using FreeRTOS ring buffer)
 // This is a sliding window buffer, not a queue - we read recent data for AEC
-static int16_t speaker_circular_buffer[640];   // Exactly 2 Opus frames (40ms @ 16kHz)
+static int16_t speaker_circular_buffer[960];   // 3 Opus frames (60ms @ 16kHz)
 static volatile size_t speaker_write_idx = 0;
 static volatile size_t speaker_read_idx = 0;
 static volatile size_t speaker_buffer_count = 0;  // Number of samples in buffer
@@ -129,20 +129,20 @@ void pipecat_audio_decode(uint8_t *data, size_t size) {
     portENTER_CRITICAL(&speaker_buffer_mutex);
       
       // Write to circular buffer
-      size_t space_available = 640 - speaker_buffer_count;
+      size_t space_available = 960 - speaker_buffer_count;
       size_t samples_to_write = decoded_size;
       
       // If we don't have enough space, drop oldest samples
       if (samples_to_write > space_available) {
         size_t samples_to_drop = samples_to_write - space_available;
-        speaker_read_idx = (speaker_read_idx + samples_to_drop) % 640;
+        speaker_read_idx = (speaker_read_idx + samples_to_drop) % 960;
         speaker_buffer_count -= samples_to_drop;
       }
       
       // Write new samples
       for (int i = 0; i < samples_to_write; i++) {
         speaker_circular_buffer[speaker_write_idx] = decoder_buffer[i];
-        speaker_write_idx = (speaker_write_idx + 1) % 640;
+        speaker_write_idx = (speaker_write_idx + 1) % 960;
       }
       speaker_buffer_count += samples_to_write;
       speaker_buffer_active = true;
@@ -197,6 +197,14 @@ static uint32_t cycle_count = 0;
 static uint32_t aec_skip_count = 0;
 static uint64_t last_frame_send_time = 0;
 
+// Frame rate monitoring
+static uint64_t first_frame_time = 0;
+static uint32_t frames_sent = 0;
+static float current_frame_rate = 0.0;
+static const uint32_t MIN_FRAMES_FOR_MEASUREMENT = 50;  // Measure over 50 frames (~1 second)
+static const float TARGET_FRAME_RATE = 50.0;  // 50 fps (20ms per frame)
+static const float MIN_ACCEPTABLE_FRAME_RATE = 48.0;  // Allow 4% tolerance
+
 // AEC processed buffer
 static int16_t *processed_buffer = NULL;
 
@@ -234,7 +242,7 @@ void pipecat_init_aec() {
   ESP_LOGI(LOG_TAG, "Initializing AEC");
   
   // Speaker circular buffer is now statically allocated
-  ESP_LOGI(LOG_TAG, "Speaker circular buffer ready, size=640 samples");
+  ESP_LOGI(LOG_TAG, "Speaker circular buffer ready, size=960 samples");
   
   // Create AEC with basic parameters (filter_length=4 recommended for ESP32S3)
   aec_handle = afe_aec_create("MR", 4, AFE_TYPE_VC, AFE_MODE_LOW_COST);
@@ -276,8 +284,21 @@ void pipecat_init_aec() {
 
 void pipecat_send_audio(PeerConnection *peer_connection) {
   uint64_t start_time, end_time;
+  static uint64_t last_call_end_time = 0;
+  uint64_t function_start_time = esp_timer_get_time();
   
-  // Wait for (640 samples = 2 Opus frames) in our speaker data reference buffer to handle codec burst pattern
+  // Log time since last call
+  if (last_call_end_time > 0) {
+    uint64_t time_between_calls = (function_start_time - last_call_end_time) / 1000;  // ms
+    static int between_calls_log_count = 0;
+    between_calls_log_count++;
+    if (between_calls_log_count <= 20 || between_calls_log_count % 10 == 0) {
+      ESP_LOGI(LOG_TAG, "[CALL_GAP] Time between calls: %llums (call #%d)", 
+               time_between_calls, between_calls_log_count);
+    }
+  }
+  
+  // Wait for (960 samples = 3 Opus frames) in our speaker data reference buffer to handle codec burst pattern
   uint64_t wait_start = esp_timer_get_time();
   int initial_samples = 0;
   int wait_iterations = 0;
@@ -294,8 +315,8 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
       initial_samples = samples_available;
     }
     
-    if (samples_available >= 640) {
-      break;  // Buffer is full - ready to handle burst consumption
+    if (samples_available >= 960) {
+      break;  // Buffer has 3 Opus frames - ready to handle burst consumption
     }
     
     // Brief delay before checking again
@@ -381,7 +402,7 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
             // We have enough data, read it
             for (int i = 0; i < aec_chunk_size; i++) {
               ref_chunk[i] = speaker_circular_buffer[speaker_read_idx];
-              speaker_read_idx = (speaker_read_idx + 1) % 640;
+              speaker_read_idx = (speaker_read_idx + 1) % 960;
             }
             speaker_buffer_count -= aec_chunk_size;
           } else if (samples_available > 0) {
@@ -389,7 +410,7 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
             int i;
             for (i = 0; i < samples_available; i++) {
               ref_chunk[i] = speaker_circular_buffer[speaker_read_idx];
-              speaker_read_idx = (speaker_read_idx + 1) % 640;
+              speaker_read_idx = (speaker_read_idx + 1) % 960;
             }
             speaker_buffer_count = 0;  // We consumed all available samples
             for (; i < aec_chunk_size; i++) {
@@ -452,25 +473,23 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
         aec_in_buffer[i * 2 + 1] = ref_chunk[i];        // Reference
       }
       
-      // Temporarily disabled AEC
-      // ----
-      // // Run AEC
-      // start_time = esp_timer_get_time();
-      // int ret = afe_aec_process(aec_handle, aec_in_buffer, aec_out_buffer);
-      // end_time = esp_timer_get_time();
-      // total_aec_time += (end_time - start_time);
+      // Run AEC
+      start_time = esp_timer_get_time();
+      int ret = afe_aec_process(aec_handle, aec_in_buffer, aec_out_buffer);
+      end_time = esp_timer_get_time();
+      total_aec_time += (end_time - start_time);
       
-      // if (ret < 0) {
-      //   ESP_LOGE(LOG_TAG, "afe_aec_process failed: return value %d", ret);
-      //   // Fall back - copy mic input directly
-      //   memcpy(opus_accumulator + opus_accumulator_pos, mic_chunk_buffer, 256 * sizeof(int16_t));
-      // } else {
-      //   // Copy AEC output to accumulator
-      //   memcpy(opus_accumulator + opus_accumulator_pos, aec_out_buffer, 256 * sizeof(int16_t));
-      // }
+      if (ret < 0) {
+        ESP_LOGE(LOG_TAG, "afe_aec_process failed: return value %d", ret);
+        // Fall back - copy mic input directly
+        memcpy(opus_accumulator + opus_accumulator_pos, mic_chunk_buffer, 256 * sizeof(int16_t));
+      } else {
+        // Copy AEC output to accumulator
+        memcpy(opus_accumulator + opus_accumulator_pos, aec_out_buffer, 256 * sizeof(int16_t));
+      }
       // ---- 
-      // Just copy mic data
-      memcpy(opus_accumulator + opus_accumulator_pos, mic_chunk_buffer, 256 * sizeof(int16_t));
+      // DEBUG: Just copy mic data
+      // memcpy(opus_accumulator + opus_accumulator_pos, mic_chunk_buffer, 256 * sizeof(int16_t));
       
       // Log successful reads more frequently for debugging
       ref_success_count++;
@@ -510,7 +529,37 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
       
       frame_count++;
       
-      // After sending the 3rd frame, check timing for next call
+      // Frame rate tracking
+      uint64_t now = esp_timer_get_time();
+      if (first_frame_time == 0) {
+        first_frame_time = now;
+        frames_sent = 0;
+      }
+      
+      frames_sent++;
+      
+      // Check frame rate after minimum frames
+      if (frames_sent >= MIN_FRAMES_FOR_MEASUREMENT) {
+        uint64_t elapsed_us = now - first_frame_time;
+        float elapsed_seconds = elapsed_us / 1000000.0f;
+        current_frame_rate = frames_sent / elapsed_seconds;
+        
+        if (current_frame_rate < MIN_ACCEPTABLE_FRAME_RATE) {
+          ESP_LOGW(LOG_TAG, "Frame rate too low: %.1f fps (target: %.1f fps). Falling behind!", 
+                   current_frame_rate, TARGET_FRAME_RATE);
+          
+          // Set flag to skip AEC on next cycle
+          // (We'll use this instead of the old timing method)
+        }
+        
+        // Reset counters for next measurement period
+        first_frame_time = now;
+        frames_sent = 0;
+      }
+      
+      // Old timing check - replaced by frame rate monitoring
+      // Keeping the structure in case we want to re-enable with new logic
+      /*
       if (frame_count == 3) {
         uint64_t third_frame_sent_time = end_time;
         if (last_frame_send_time > 0) {
@@ -522,10 +571,10 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
         }
       }
       
-      // Remember when we sent the 4th frame
       if (frame_count == 4) {
         last_frame_send_time = end_time;
       }
+      */
       
       // Shift remaining samples
       opus_accumulator_pos -= 320;
@@ -552,8 +601,8 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
     uint64_t avg_encode_time = total_encode_time / 50;
     uint64_t avg_send_time = total_send_time / 50;
     
-    ESP_LOGI(LOG_TAG, "Audio timing stats (avg over 50 cycles): wait=%llu us, read=%llu us, aec=%llu us, encode=%llu us, send=%llu us, aec_skips=%lu",
-             avg_wait_time, avg_read_time, avg_aec_time, avg_encode_time, avg_send_time, aec_skip_count);
+    ESP_LOGI(LOG_TAG, "Audio timing stats (avg over 50 cycles): wait=%llu us, read=%llu us, aec=%llu us, encode=%llu us, send=%llu us, aec_skips=%lu, frame_rate=%.1f fps",
+             avg_wait_time, avg_read_time, avg_aec_time, avg_encode_time, avg_send_time, aec_skip_count, current_frame_rate);
     
     // Reset counters
     total_wait_time = 0;
@@ -562,4 +611,7 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
     total_encode_time = 0;
     total_send_time = 0;
   }
+  
+  // Record end time for next call's gap measurement
+  last_call_end_time = esp_timer_get_time();
 }

From 931e7a20ac7b01814228116bcf8bc09265f24666 Mon Sep 17 00:00:00 2001
From: Kwindla Hultman Kramer <kwindla@gmail.com>
Date: Mon, 7 Jul 2025 22:29:26 -0700
Subject: [PATCH 13/14] AEC is taking too long and we're not able to send 50
 audio frames per second. We also have to buffer a couple of speaker data
 frames, and that may be too much delay for the AEC to do well with, anyway.

---
 esp32-s3-box-3/src/media.cpp | 123 +----------------------------------
 1 file changed, 3 insertions(+), 120 deletions(-)

diff --git a/esp32-s3-box-3/src/media.cpp b/esp32-s3-box-3/src/media.cpp
index 5625d65..15d19f3 100644
--- a/esp32-s3-box-3/src/media.cpp
+++ b/esp32-s3-box-3/src/media.cpp
@@ -97,29 +97,7 @@ void pipecat_init_audio_decoder() {
 }
 
 void pipecat_audio_decode(uint8_t *data, size_t size) {
-  // Log packet arrival timing
-  static uint64_t last_decode_time = 0;
   uint64_t now = esp_timer_get_time();
-  if (last_decode_time > 0) {
-    uint64_t delta_ms = (now - last_decode_time) / 1000;
-    static int packet_count = 0;
-    packet_count++;
-    
-    // Log every packet for first 20, then every 10th
-    if (packet_count <= 20 || packet_count % 10 == 0) {
-      // Get current buffer state for correlation
-      size_t current_write_idx, current_read_idx, current_count;
-      portENTER_CRITICAL(&speaker_buffer_mutex);
-      current_write_idx = speaker_write_idx;
-      current_read_idx = speaker_read_idx;
-      current_count = speaker_buffer_count;
-      portEXIT_CRITICAL(&speaker_buffer_mutex);
-      
-      ESP_LOGI(LOG_TAG, "[ARRIVAL] pkt#%d: size=%d, delta=%llums, buffer=%d samples, w=%d r=%d", 
-               packet_count, size, delta_ms, current_count, current_write_idx, current_read_idx);
-    }
-  }
-  last_decode_time = now;
   
   // Decode Opus frame to get the size (or use fixed size if predictable)
   int decoded_size = opus_decode(opus_decoder, data, size, decoder_buffer, PCM_BUFFER_SIZE, 0);
@@ -160,15 +138,6 @@ void pipecat_audio_decode(uint8_t *data, size_t size) {
         time_since_last_read = 0;  // First write or invalid timestamp
       }
       
-      // More frequent logging for debugging
-      static int write_count = 0;
-      write_count++;
-      if (write_count <= 20 || write_count % 10 == 0) {
-        // Log the actual count, not calculated value
-        ESP_LOGI(LOG_TAG, "[WRITE] cnt=%d: +%d samples, buffer=%d, since_read=%llums (w=%d r=%d)", 
-                 write_count, decoded_size, samples_in_buffer, time_since_last_read,
-                 speaker_write_idx, speaker_read_idx);
-      }
     
     // THEN: Process for playback
     set_is_playing(decoder_buffer, decoded_size);
@@ -287,16 +256,6 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
   static uint64_t last_call_end_time = 0;
   uint64_t function_start_time = esp_timer_get_time();
   
-  // Log time since last call
-  if (last_call_end_time > 0) {
-    uint64_t time_between_calls = (function_start_time - last_call_end_time) / 1000;  // ms
-    static int between_calls_log_count = 0;
-    between_calls_log_count++;
-    if (between_calls_log_count <= 20 || between_calls_log_count % 10 == 0) {
-      ESP_LOGI(LOG_TAG, "[CALL_GAP] Time between calls: %llums (call #%d)", 
-               time_between_calls, between_calls_log_count);
-    }
-  }
   
   // Wait for (960 samples = 3 Opus frames) in our speaker data reference buffer to handle codec burst pattern
   uint64_t wait_start = esp_timer_get_time();
@@ -325,20 +284,6 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
   uint64_t wait_end = esp_timer_get_time();
   total_wait_time += (wait_end - wait_start);
   
-  // Log initial buffer state
-  static int call_count = 0;
-  call_count++;
-  // Get consistent snapshot of buffer state
-  size_t log_write_idx, log_read_idx, log_count;
-  portENTER_CRITICAL(&speaker_buffer_mutex);
-  log_write_idx = speaker_write_idx;
-  log_read_idx = speaker_read_idx;
-  log_count = speaker_buffer_count;
-  portEXIT_CRITICAL(&speaker_buffer_mutex);
-  
-  ESP_LOGI(LOG_TAG, "[SEND_START] call#%d: initial=%d, wait=%lluus, iters=%d, count=%d, w=%d r=%d", 
-           call_count, initial_samples, wait_end - wait_start, wait_iterations,
-           log_count, log_write_idx, log_read_idx);
   
   // Buffers for interleaved processing
   static int16_t opus_accumulator[640];  // Accumulate AEC output for Opus encoding
@@ -357,18 +302,6 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
   for (int chunk = 0; chunk < 5; chunk++) {
     uint64_t chunk_start_time = esp_timer_get_time();
     // 1. Read 256 samples from microphone
-    // Log buffer state before read (for chunk 0 and 1)
-    if (chunk <= 1) {
-      size_t pre_read_write_idx, pre_read_read_idx, pre_read_count;
-      portENTER_CRITICAL(&speaker_buffer_mutex);
-      pre_read_write_idx = speaker_write_idx;
-      pre_read_read_idx = speaker_read_idx;
-      pre_read_count = speaker_buffer_count;
-      portEXIT_CRITICAL(&speaker_buffer_mutex);
-      
-      ESP_LOGI(LOG_TAG, "[PRE_READ] chunk=%d: buffer has %d samples (w=%d r=%d)", 
-               chunk, pre_read_count, pre_read_write_idx, pre_read_read_idx);
-    }
     
     start_time = esp_timer_get_time();
     if (esp_codec_dev_read(mic_codec_dev, (uint8_t*)mic_chunk_buffer, 256 * sizeof(int16_t)) != ESP_OK) {
@@ -378,11 +311,6 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
     end_time = esp_timer_get_time();
     total_read_time += (end_time - start_time);
     
-    // Log if a long read happened (indicating actual blocking)
-    uint64_t read_duration = (end_time - start_time) / 1000;
-    if (read_duration > 10) {  // More than 10ms suggests actual blocking
-      ESP_LOGI(LOG_TAG, "[BLOCKED_READ] chunk=%d: read took %llums", chunk, read_duration);
-    }
     
     // 2. Process with AEC or bypass
     if (aec_handle != NULL && aec_enabled && !(chunk == 4 && skip_last_aec)) {
@@ -491,23 +419,10 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
       // DEBUG: Just copy mic data
       // memcpy(opus_accumulator + opus_accumulator_pos, mic_chunk_buffer, 256 * sizeof(int16_t));
       
-      // Log successful reads more frequently for debugging
-      ref_success_count++;
-      if (ref_success_count <= 10 || ref_success_count % 20 == 0) {
-        uint64_t chunk_duration = (esp_timer_get_time() - chunk_start_time) / 1000;
-        ESP_LOGI(LOG_TAG, "[READ_OK] chunk=%d, got %d samples, chunk took %llums", 
-                 chunk, aec_chunk_size, chunk_duration);
-      }
     } else {
       // AEC disabled or skipped - copy mic input directly
       memcpy(opus_accumulator + opus_accumulator_pos, mic_chunk_buffer, 256 * sizeof(int16_t));
       
-      if (chunk == 4 && skip_last_aec) {
-        static int skip_log_count = 0;
-        if (++skip_log_count % 10 == 0) {
-          ESP_LOGI(LOG_TAG, "Skipped AEC on 5th chunk to catch up, count: %d", skip_log_count);
-        }
-      }
     }
     
     opus_accumulator_pos += 256;
@@ -544,38 +459,15 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
         float elapsed_seconds = elapsed_us / 1000000.0f;
         current_frame_rate = frames_sent / elapsed_seconds;
         
-        if (current_frame_rate < MIN_ACCEPTABLE_FRAME_RATE) {
-          ESP_LOGW(LOG_TAG, "Frame rate too low: %.1f fps (target: %.1f fps). Falling behind!", 
-                   current_frame_rate, TARGET_FRAME_RATE);
-          
-          // Set flag to skip AEC on next cycle
-          // (We'll use this instead of the old timing method)
+        if (current_frame_rate < MIN_ACCEPTABLE_FRAME_RATE) {          
+          // We could try to catch up here, somehow. But not sure what makes sense.
         }
         
         // Reset counters for next measurement period
         first_frame_time = now;
         frames_sent = 0;
       }
-      
-      // Old timing check - replaced by frame rate monitoring
-      // Keeping the structure in case we want to re-enable with new logic
-      /*
-      if (frame_count == 3) {
-        uint64_t third_frame_sent_time = end_time;
-        if (last_frame_send_time > 0) {
-          uint64_t elapsed = (third_frame_sent_time - last_frame_send_time) / 1000; // ms
-          if (elapsed > 62) {
-            skip_last_aec = true;
-            aec_skip_count++;
-          }
-        }
-      }
-      
-      if (frame_count == 4) {
-        last_frame_send_time = end_time;
-      }
-      */
-      
+            
       // Shift remaining samples
       opus_accumulator_pos -= 320;
       if (opus_accumulator_pos > 0) {
@@ -585,13 +477,6 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
   }
   
   // Update statistics
-  // Log loop timing
-  uint64_t loop_duration = (esp_timer_get_time() - loop_start_time) / 1000;
-  static int timing_log_count = 0;
-  if (++timing_log_count <= 10 || timing_log_count % 20 == 0) {
-    ESP_LOGI(LOG_TAG, "[LOOP_TIME] call#%d: full loop took %llums (expect ~80ms for 5x16ms reads)", 
-             call_count, loop_duration);
-  }
   
   cycle_count++;
   if (cycle_count % 50 == 0) {
@@ -612,6 +497,4 @@ void pipecat_send_audio(PeerConnection *peer_connection) {
     total_send_time = 0;
   }
   
-  // Record end time for next call's gap measurement
-  last_call_end_time = esp_timer_get_time();
 }

From 4c533cb3a92f816f8d6e45095322c70983054fe8 Mon Sep 17 00:00:00 2001
From: Kwindla Hultman Kramer <kwindla@gmail.com>
Date: Tue, 8 Jul 2025 15:15:49 -0700
Subject: [PATCH 14/14] change filter length

---
 esp32-s3-box-3/src/media.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/esp32-s3-box-3/src/media.cpp b/esp32-s3-box-3/src/media.cpp
index 15d19f3..989d2af 100644
--- a/esp32-s3-box-3/src/media.cpp
+++ b/esp32-s3-box-3/src/media.cpp
@@ -213,8 +213,10 @@ void pipecat_init_aec() {
   // Speaker circular buffer is now statically allocated
   ESP_LOGI(LOG_TAG, "Speaker circular buffer ready, size=960 samples");
   
-  // Create AEC with basic parameters (filter_length=4 recommended for ESP32S3)
-  aec_handle = afe_aec_create("MR", 4, AFE_TYPE_VC, AFE_MODE_LOW_COST);
+  // Create AEC with basic parameters.
+  // Filter_length=4 recommended for ESP32S3, but filter_length=2 and AFE_MODE_LOW_COST seem to be a local
+  // maximum in terms of quality with the current code we have here.
+  aec_handle = afe_aec_create("MR", 2, AFE_TYPE_VC, AFE_MODE_LOW_COST);
   if (aec_handle == NULL) {
     ESP_LOGE(LOG_TAG, "Failed to create AEC");
     return;