From ac4bec3066bfb829193c2a047c6fdc2dd53d4660 Mon Sep 17 00:00:00 2001 From: ddavis-2015 Date: Tue, 8 Jul 2025 11:03:03 -0700 Subject: [PATCH 1/8] Support for DECODE operator @tensorflow/micro Add initial support for DECODE operator. Add reference implementation. Add LUT decompression support. Update op resolvers. Update Makefiles and Bazel BUILD files. Add kernel unit test. bug=fixes #3131 --- python/tflite_micro/python_ops_resolver.cc | 3 +- tensorflow/lite/micro/kernels/BUILD | 20 + tensorflow/lite/micro/kernels/Makefile.inc | 1 + tensorflow/lite/micro/kernels/decode.cc | 148 ++++ tensorflow/lite/micro/kernels/decode_state.cc | 36 + tensorflow/lite/micro/kernels/decode_state.h | 87 +++ .../lite/micro/kernels/decode_state_lut.cc | 630 ++++++++++++++++++ .../lite/micro/kernels/decode_state_lut.h | 92 +++ tensorflow/lite/micro/kernels/decode_test.cc | 333 +++++++++ tensorflow/lite/micro/kernels/micro_ops.h | 1 + .../lite/micro/micro_mutable_op_resolver.h | 7 +- .../micro/tools/benchmarking/op_resolver.h | 3 +- tensorflow/lite/micro/tools/make/Makefile | 3 + 13 files changed, 1361 insertions(+), 3 deletions(-) create mode 100644 tensorflow/lite/micro/kernels/decode.cc create mode 100644 tensorflow/lite/micro/kernels/decode_state.cc create mode 100644 tensorflow/lite/micro/kernels/decode_state.h create mode 100644 tensorflow/lite/micro/kernels/decode_state_lut.cc create mode 100644 tensorflow/lite/micro/kernels/decode_state_lut.h create mode 100644 tensorflow/lite/micro/kernels/decode_test.cc diff --git a/python/tflite_micro/python_ops_resolver.cc b/python/tflite_micro/python_ops_resolver.cc index f5d6e636c16..34fc82956bc 100644 --- a/python/tflite_micro/python_ops_resolver.cc +++ b/python/tflite_micro/python_ops_resolver.cc @@ -1,4 +1,4 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
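[Editor's note: usage sketch, not part of the patch. The resolver hunks in this series expose DECODE through AddDecode(), which registers it as the custom op "TFLM_DECODE". A minimal application-side setup, assuming a model that already contains the op, might look like the following; the resolver capacity (2) and the second op are illustrative only.

    // Sketch: register DECODE alongside whatever other ops the model needs.
    tflite::MicroMutableOpResolver<2> op_resolver;
    TF_LITE_ENSURE_STATUS(op_resolver.AddDecode());  // custom op "TFLM_DECODE"
    TF_LITE_ENSURE_STATUS(op_resolver.AddFullyConnected());
    tflite::MicroInterpreter interpreter(model, op_resolver, arena, arena_size);
]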
@@ -40,6 +40,7 @@ PythonOpsResolver::PythonOpsResolver() { AddConv2D(); AddCos(); AddCumSum(); + AddDecode(); AddDelay(); AddDepthToSpace(); AddDepthwiseConv2D(); diff --git a/tensorflow/lite/micro/kernels/BUILD b/tensorflow/lite/micro/kernels/BUILD index 7b5ddc7b306..71cb5cd3fb0 100644 --- a/tensorflow/lite/micro/kernels/BUILD +++ b/tensorflow/lite/micro/kernels/BUILD @@ -236,6 +236,9 @@ tflm_kernel_cc_library( "conv.cc", "conv_common.cc", "cumsum.cc", + "decode.cc", + "decode_state.cc", + "decode_state_lut.cc", "depth_to_space.cc", "depthwise_conv.cc", "depthwise_conv_common.cc", @@ -326,6 +329,8 @@ tflm_kernel_cc_library( "batch_matmul.h", "circular_buffer.h", "conv.h", + "decode_state.h", + "decode_state_lut.h", "depthwise_conv.h", "dequantize.h", "ethosu.h", @@ -642,6 +647,21 @@ tflm_cc_test( ], ) +tflm_cc_test( + name = "decode_test", + srcs = [ + "decode_test.cc", + ], + deps = [ + ":kernel_runner", + "//tensorflow/lite/c:common", + "//tensorflow/lite/micro:debug_log", + "//tensorflow/lite/micro:op_resolvers", + "//tensorflow/lite/micro:test_helpers", + "//tensorflow/lite/micro/testing:micro_test", + ], +) + tflm_cc_test( name = "decompress_test", srcs = [ diff --git a/tensorflow/lite/micro/kernels/Makefile.inc b/tensorflow/lite/micro/kernels/Makefile.inc index f4456242fef..49c033b84e4 100644 --- a/tensorflow/lite/micro/kernels/Makefile.inc +++ b/tensorflow/lite/micro/kernels/Makefile.inc @@ -123,6 +123,7 @@ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/ceil_test.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/comparisons_test.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/concatenation_test.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/cumsum_test.cc \ +$(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/decode_test.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/depth_to_space_test.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/depthwise_conv_test.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/dequantize_test.cc \ diff --git a/tensorflow/lite/micro/kernels/decode.cc b/tensorflow/lite/micro/kernels/decode.cc new file mode 100644 index 00000000000..6c1478bb7f7 --- /dev/null +++ b/tensorflow/lite/micro/kernels/decode.cc @@ -0,0 +1,148 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/decode_state.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/micro_context.h"
+#include "tensorflow/lite/micro/micro_log.h"
+
+namespace tflite {
+namespace {
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  const size_t num_inputs = NumInputs(node);
+  const size_t num_outputs = NumOutputs(node);
+  TF_LITE_ENSURE(context, num_outputs > 0);
+  TF_LITE_ENSURE_EQ(context, num_inputs, num_outputs * 2);
+
+  MicroContext* const micro_context = GetMicroContext(context);
+
+  node->user_data = micro_context->AllocatePersistentBuffer(
+      num_outputs * sizeof(DecodeState*));
+  TF_LITE_ENSURE(context, node->user_data != nullptr);
+  DecodeState** const dsp_arr =
+      reinterpret_cast<DecodeState**>(node->user_data);
+
+  TfLiteTensor* input = nullptr;
+  TfLiteTensor* ancillary = nullptr;
+  TfLiteTensor* output = nullptr;
+  TfLiteStatus status = kTfLiteOk;
+
+  for (size_t i = 0; i < num_inputs; i += 2) {
+    input = micro_context->AllocateTempInputTensor(node, i);
+    if (input == nullptr) {
+      MicroPrintf("failed to allocate input tensor %u", i);
+      status = kTfLiteError;
+      break;
+    }
+    ancillary = micro_context->AllocateTempInputTensor(node, i + 1);
+    if (ancillary == nullptr) {
+      MicroPrintf("failed to allocate ancillary tensor %u", i + 1);
+      status = kTfLiteError;
+      break;
+    }
+    output = micro_context->AllocateTempOutputTensor(node, i / 2);
+    if (output == nullptr) {
+      MicroPrintf("failed to allocate output tensor %u", i / 2);
+      status = kTfLiteError;
+      break;
+    }
+
+    if (DecodeState::Version(*ancillary) != 1) {
+      MicroPrintf("version %u != 1", DecodeState::Version(*ancillary));
+      status = kTfLiteError;
+      break;
+    }
+
+    DecodeState* dsp = nullptr;
+    switch (DecodeState::Type(*ancillary)) {
+      case DecodeState::kDcmTypeLUT:
+        dsp = DecodeState::CreateDecodeStateLUT(
+            context, micro_context->GetAlternateProfiler());
+        break;
+      case DecodeState::kDcmTypeCustom:
+        MicroPrintf("Custom decode type not yet supported");
+        break;
+      default:
+        MicroPrintf("unsupported decode type %u",
+                    DecodeState::Type(*ancillary));
+        break;
+    }
+
+    if (dsp != nullptr) {
+      status = dsp->Setup(*input, *ancillary, *output);
+      if (status != kTfLiteOk) {
+        break;
+      }
+      dsp_arr[i / 2] = dsp;
+    } else {
+      MicroPrintf("failed to allocate DecodeState[%u]", i / 2);
+      status = kTfLiteError;
+      break;
+    }
+
+    micro_context->DeallocateTempTfLiteTensor(input);
+    micro_context->DeallocateTempTfLiteTensor(ancillary);
+    micro_context->DeallocateTempTfLiteTensor(output);
+    input = nullptr;
+    ancillary = nullptr;
+    output = nullptr;
+  }
+
+  if (input != nullptr) {
+    micro_context->DeallocateTempTfLiteTensor(input);
+  }
+  if (ancillary != nullptr) {
+    micro_context->DeallocateTempTfLiteTensor(ancillary);
+  }
+  if (output != nullptr) {
+    micro_context->DeallocateTempTfLiteTensor(output);
+  }
+
+  return status;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const size_t num_inputs = NumInputs(node);
+  DecodeState** const dsp_arr =
+      reinterpret_cast<DecodeState**>(node->user_data);
+
+  for (size_t i = 0; i < num_inputs; i += 2) {
+    const TfLiteEvalTensor* input =
+        tflite::micro::GetEvalInput(context, node, i);
+    TF_LITE_ENSURE(context, input != nullptr);
+    const TfLiteEvalTensor* ancillary =
+        tflite::micro::GetEvalInput(context, node, i + 1);
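+    // Note: the inputs to DECODE are (encoded, ancillary) pairs. The
+    // ancillary tensor holds the 16-byte Decode Common Metadata (DCM)
+    // header followed by codec-specific data (for LUT, the value table);
+    // output i / 2 receives the decoded result of input pair (i, i + 1).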
TF_LITE_ENSURE(context, ancillary != nullptr); + const TfLiteEvalTensor* output = + tflite::micro::GetEvalOutput(context, node, i / 2); + TF_LITE_ENSURE(context, output != nullptr); + + TfLiteStatus status = dsp_arr[i / 2]->Decode(*input, *ancillary, *output); + TF_LITE_ENSURE(context, status == kTfLiteOk); + } + + return kTfLiteOk; +} + +} // namespace + +TFLMRegistration Register_DECODE() { + return tflite::micro::RegisterOp(nullptr, Prepare, Eval); +} + +} // namespace tflite diff --git a/tensorflow/lite/micro/kernels/decode_state.cc b/tensorflow/lite/micro/kernels/decode_state.cc new file mode 100644 index 00000000000..87bb6a506d3 --- /dev/null +++ b/tensorflow/lite/micro/kernels/decode_state.cc @@ -0,0 +1,36 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/micro/kernels/decode_state.h" + +#include "tensorflow/lite/micro/kernels/decode_state_lut.h" +#include "tensorflow/lite/micro/micro_context.h" + +namespace tflite { + +DecodeState* DecodeState::CreateDecodeStateLUT( + const TfLiteContext* context, MicroProfilerInterface* profiler) { + MicroContext* const micro_context = GetMicroContext(context); + void* buffer = + micro_context->AllocatePersistentBuffer(sizeof(DecodeStateLUT)); + if (buffer == nullptr) { + return nullptr; + } + DecodeState* dsp = new (buffer) DecodeStateLUT(context, profiler); + + return dsp; +} + +} // namespace tflite diff --git a/tensorflow/lite/micro/kernels/decode_state.h b/tensorflow/lite/micro/kernels/decode_state.h new file mode 100644 index 00000000000..80594fd2c26 --- /dev/null +++ b/tensorflow/lite/micro/kernels/decode_state.h @@ -0,0 +1,87 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_MICRO_MICRO_KERNELS_DECODE_STATE_H_
+#define TENSORFLOW_LITE_MICRO_MICRO_KERNELS_DECODE_STATE_H_
+
+#include <cstdint>
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/core/c/c_api_types.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/compatibility.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/micro_profiler_interface.h"
+
+namespace tflite {
+
+struct DecodeState {
+  DecodeState() = delete;
+
+  DecodeState(const TfLiteContext* context, MicroProfilerInterface* profiler)
+      : context_(context), micro_profiler_(profiler) {}
+
+  virtual TfLiteStatus Setup(const TfLiteTensor& input,
+                             const TfLiteTensor& ancillary,
+                             const TfLiteTensor& output) = 0;
+  virtual TfLiteStatus Decode(const TfLiteEvalTensor& input,
+                              const TfLiteEvalTensor& ancillary,
+                              const TfLiteEvalTensor& output) = 0;
+
+  static DecodeState* CreateDecodeStateLUT(const TfLiteContext* context,
+                                           MicroProfilerInterface* profiler);
+
+  static uint8_t Type(const TfLiteTensor& ancillary) {
+    return GetTensorData<uint8_t>(&ancillary)[kDcmDecodeTypeOffset];
+  }
+
+  static uint8_t Type(const TfLiteEvalTensor& ancillary) {
+    return micro::GetTensorData<uint8_t>(&ancillary)[kDcmDecodeTypeOffset];
+  }
+
+  static uint8_t Version(const TfLiteTensor& ancillary) {
+    return GetTensorData<uint8_t>(&ancillary)[kDcmVersionOffset];
+  }
+
+  static uint8_t Version(const TfLiteEvalTensor& ancillary) {
+    return micro::GetTensorData<uint8_t>(&ancillary)[kDcmVersionOffset];
+  }
+
+ protected:
+  virtual ~DecodeState() = default;
+
+  // Decode Common Metadata constants
+ public:
+  static constexpr uint8_t kDcmTypeLUT = 0;
+  static constexpr uint8_t kDcmTypeCustom = 127;
+
+  static constexpr size_t kDcmSizeInBytes = 16;
+
+ private:
+  static constexpr size_t kDcmDecodeTypeOffset = 0;
+  static constexpr size_t kDcmVersionOffset = 1;
+
+  // DecodeState vars
+ protected:
+  const TfLiteContext* context_;
+  MicroProfilerInterface* micro_profiler_;
+
+ private:
+  TF_LITE_REMOVE_VIRTUAL_DELETE
+};
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_MICRO_MICRO_KERNELS_DECODE_STATE_H_
diff --git a/tensorflow/lite/micro/kernels/decode_state_lut.cc b/tensorflow/lite/micro/kernels/decode_state_lut.cc
new file mode 100644
index 00000000000..477c21d80a7
--- /dev/null
+++ b/tensorflow/lite/micro/kernels/decode_state_lut.cc
@@ -0,0 +1,630 @@
+/* Copyright 2025 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ + +#include "tensorflow/lite/micro/kernels/decode_state_lut.h" + +#include +#include + +#include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/micro_log.h" +#include "tensorflow/lite/micro/micro_profiler.h" + +namespace tflite { + +TfLiteStatus DecodeStateLUT::Setup(const TfLiteTensor& input, + const TfLiteTensor& ancillary, + const TfLiteTensor& output) { + const uint8_t* const ancillary_data = GetTensorData(&ancillary); + if (ancillary_data[kDcmVersionOffset] != 1) { + MicroPrintf("unsupported version %u", ancillary_data[kDcmVersionOffset]); + return kTfLiteError; + } + + // resolve num_channels_ and use_alternate_axis_ + if (output.quantization.type == kTfLiteAffineQuantization && + output.quantization.params != nullptr) { + const TfLiteAffineQuantization* quantization = + reinterpret_cast(output.quantization.params); + num_channels_ = quantization->scale->size; + if ((quantization->quantized_dimension == output.dims->size - 1) && + num_channels_ > 1) { + use_alternate_axis_ = true; + } else if (quantization->quantized_dimension != 0) { + MicroPrintf("unsupported quantization axis %u", + quantization->quantized_dimension); + return kTfLiteError; + } + } + + compressed_indices_ = GetTensorData(&input); + count_indices_ = NumElements(&output); + elements_per_channel_ = + use_alternate_axis_ ? 1 : count_indices_ / num_channels_; + value_table_ = &ancillary_data[kDcmSizeInBytes]; + value_table_channel_stride_ = ancillary_data[kDcmValueTableStrideOffset]; + compressed_bit_width_ = + ancillary_data[kDcmParamsOffset] & kDcmParamsBitWidthMask; + + return kTfLiteOk; +} + +TfLiteStatus DecodeStateLUT::Decode(const TfLiteEvalTensor& input, + const TfLiteEvalTensor& ancillary, + const TfLiteEvalTensor& output) { + void* const buffer = const_cast(micro::GetTensorData(&output)); + TFLITE_DCHECK(buffer != nullptr); + + switch (output.type) { + case kTfLiteBool: + DecompressToBuffer(buffer); + break; + case kTfLiteFloat32: + DecompressToBuffer(buffer); + break; + case kTfLiteInt8: + DecompressToBuffer(buffer); + break; + case kTfLiteInt16: + DecompressToBuffer(buffer); + break; + case kTfLiteInt32: + DecompressToBuffer(buffer); + break; + case kTfLiteInt64: + DecompressToBuffer(buffer); + break; + default: + MicroPrintf("unsupported tensor type %s", TfLiteTypeGetName(output.type)); + return kTfLiteError; + } + + return kTfLiteOk; +} + +template +T* DecodeStateLUT::DecompressToBuffer(void* buffer) { + TFLITE_DCHECK(compressed_bit_width_ <= kMaxBitWidth); + TFLITE_DCHECK(compressed_bit_width_ > 0); + + if (std::is_same::value && compressed_bit_width_ == 4 && + !use_alternate_axis_) { + DecompressToBufferWidth4_16(static_cast(buffer)); + } else if (std::is_same::value && compressed_bit_width_ == 3 && + !use_alternate_axis_) { + DecompressToBufferWidth3_32(static_cast(buffer)); + } else if (std::is_same::value && compressed_bit_width_ == 2 && + !use_alternate_axis_) { + DecompressToBufferWidth2_16(static_cast(buffer)); + } else { + DecompressToBufferWidthAny(static_cast(buffer)); + } + + return static_cast(buffer); +} + +template bool* DecodeStateLUT::DecompressToBuffer(void*); +template float* DecodeStateLUT::DecompressToBuffer(void*); +template int8_t* DecodeStateLUT::DecompressToBuffer(void*); +template int16_t* DecodeStateLUT::DecompressToBuffer(void*); +template int32_t* 
DecodeStateLUT::DecompressToBuffer(void*); +template int64_t* DecodeStateLUT::DecompressToBuffer(void*); + +void DecodeStateLUT::DecompressToBufferWidth4_16(int8_t* buffer) { + ScopedMicroProfiler scoped_profiler(__func__, micro_profiler_); + + const size_t stride = value_table_channel_stride_; + const uint8_t* value_table = static_cast(value_table_); + const size_t max_count = elements_per_channel_; + size_t current_offset = 0; + + for (size_t channel = 0; channel < num_channels_; channel++) { + size_t count = max_count; + + // process elements at start of channel up to next uint64_t alignment of + // compressed_indices_ + while (count > 0 && (current_offset & 0x0F)) { + const size_t index = GetNextTableIndexWidth4(current_offset++); + *buffer++ = value_table[index]; + count -= 1; + } + + // process elements in current channel in groups of 16 + if (count >= 16) { + const uint64_t* indices = reinterpret_cast( + &compressed_indices_[current_offset >> 1]); + + while (count >= 16) { + count -= 16; + uint64_t index = *indices++; + uint64_t value, value2; + + value = static_cast(value_table[(index >> 4) & 0x0F]); + value |= static_cast(value_table[index & 0x0F]) << 8; + value |= static_cast(value_table[(index >> 12) & 0x0F]) << 16; + value |= static_cast(value_table[(index >> 8) & 0x0F]) << 24; + value |= static_cast(value_table[(index >> 20) & 0x0F]) << 32; + value |= static_cast(value_table[(index >> 16) & 0x0F]) << 40; + value |= static_cast(value_table[(index >> 28) & 0x0F]) << 48; + value |= static_cast(value_table[(index >> 24) & 0x0F]) << 56; + + *reinterpret_cast(buffer) = value; + + value2 = static_cast(value_table[(index >> 36) & 0x0F]); + value2 |= static_cast(value_table[(index >> 32) & 0x0F]) << 8; + value2 |= static_cast(value_table[(index >> 44) & 0x0F]) + << 16; + value2 |= static_cast(value_table[(index >> 40) & 0x0F]) + << 24; + value2 |= static_cast(value_table[(index >> 52) & 0x0F]) + << 32; + value2 |= static_cast(value_table[(index >> 48) & 0x0F]) + << 40; + value2 |= static_cast(value_table[(index >> 60) & 0x0F]) + << 48; + value2 |= static_cast(value_table[(index >> 56) & 0x0F]) + << 56; + + *reinterpret_cast(buffer + 8) = value2; + + buffer += 16; + } + + current_offset = + (reinterpret_cast(indices) - compressed_indices_) + << 1; + } + + // process remaining elements in current channel + while (count > 0) { + count -= 1; + const size_t index = GetNextTableIndexWidth4(current_offset++); + *buffer++ = value_table[index]; + } + + value_table += stride; + } +} + +void DecodeStateLUT::DecompressToBufferWidth2_16(int8_t* buffer) { + ScopedMicroProfiler scoped_profiler(__func__, micro_profiler_); + + const size_t stride = value_table_channel_stride_; + const uint8_t* value_table = static_cast(value_table_); + const size_t max_count = elements_per_channel_; + size_t current_offset = 0; + + for (size_t channel = 0; channel < num_channels_; channel++) { + size_t count = max_count; + + // process elements at start of channel up to next uint32_t alignment of + // compressed_indices_ + while (count > 0 && (current_offset & 0x0F)) { + const size_t index = GetNextTableIndexWidth2(current_offset++); + *buffer++ = value_table[index]; + count -= 1; + } + + // process elements in current channel in groups of 16 + if (count >= 16) { + const uint32_t* indices = reinterpret_cast( + &compressed_indices_[current_offset >> 2]); + + while (count >= 16) { + count -= 16; + uint32_t index = *indices++; + uint64_t value, value2; + + value = static_cast(value_table[(index >> 6) & 0x03]); + value |= 
static_cast(value_table[(index >> 4) & 0x03]) << 8; + value |= static_cast(value_table[(index >> 2) & 0x03]) << 16; + value |= static_cast(value_table[index & 0x03]) << 24; + value |= static_cast(value_table[(index >> 14) & 0x03]) << 32; + value |= static_cast(value_table[(index >> 12) & 0x03]) << 40; + value |= static_cast(value_table[(index >> 10) & 0x03]) << 48; + value |= static_cast(value_table[(index >> 8) & 0x03]) << 56; + + *reinterpret_cast(buffer) = value; + + value2 = static_cast(value_table[(index >> 22) & 0x03]); + value2 |= static_cast(value_table[(index >> 20) & 0x03]) << 8; + value2 |= static_cast(value_table[(index >> 18) & 0x03]) + << 16; + value2 |= static_cast(value_table[(index >> 16) & 0x03]) + << 24; + value2 |= static_cast(value_table[(index >> 30) & 0x03]) + << 32; + value2 |= static_cast(value_table[(index >> 28) & 0x03]) + << 40; + value2 |= static_cast(value_table[(index >> 26) & 0x03]) + << 48; + value2 |= static_cast(value_table[(index >> 24) & 0x03]) + << 56; + + *reinterpret_cast(buffer + 8) = value2; + + buffer += 16; + } + + current_offset = + (reinterpret_cast(indices) - compressed_indices_) + << 2; + } + + // process remaining elements in current channel + while (count > 0) { + count -= 1; + const size_t index = GetNextTableIndexWidth2(current_offset++); + *buffer++ = value_table[index]; + } + + value_table += stride; + } +} + +void DecodeStateLUT::DecompressToBufferWidth3_32(int8_t* buffer) { + ScopedMicroProfiler scoped_profiler(__func__, micro_profiler_); + + const size_t stride = value_table_channel_stride_; + const uint8_t* value_table = static_cast(value_table_); + const size_t max_count = elements_per_channel_; + size_t current_offset = 0; + + for (size_t channel = 0; channel < num_channels_; channel++) { + size_t count = max_count; + + // process elements at start of channel up to next uint32_t alignment of + // compressed_indices_ + while (count > 0 && (current_offset & 0x1F)) { + const size_t index = GetNextTableIndexWidth3(current_offset++); + *buffer++ = value_table[index]; + count -= 1; + } + + // process elements in current channel in groups of 32 + if (count >= 32) { + const uint32_t* indices = reinterpret_cast( + &compressed_indices_[(current_offset >> 5) * 12]); + + while (count >= 32) { + count -= 32; + uint32_t index0 = *indices++; + uint32_t index1 = *indices++; + uint32_t index2 = *indices++; + uint64_t value, value2; + + value = static_cast(value_table[(index0 >> 5) & 0x07]); + value |= static_cast(value_table[(index0 >> 2) & 0x07]) << 8; + value |= + static_cast( + value_table[((index0 << 1) & 0b110) | ((index0 >> 15) & 0b1)]) + << 16; + value |= static_cast(value_table[(index0 >> 12) & 0x07]) + << 24; + value |= static_cast(value_table[(index0 >> 9) & 0x07]) << 32; + value |= + static_cast( + value_table[((index0 >> 6) & 0b100) | ((index0 >> 22) & 0b11)]) + << 40; + value |= static_cast(value_table[(index0 >> 19) & 0x07]) + << 48; + value |= static_cast(value_table[(index0 >> 16) & 0x07]) + << 56; + + *reinterpret_cast(buffer) = value; + + value2 = static_cast(value_table[(index0 >> 29) & 0x07]); + value2 |= static_cast(value_table[(index0 >> 26) & 0x07]) + << 8; + value2 |= + static_cast( + value_table[((index0 >> 23) & 0b110) | ((index1 >> 7) & 0b1)]) + << 16; + value2 |= static_cast(value_table[(index1 >> 4) & 0x07]) + << 24; + value2 |= static_cast(value_table[(index1 >> 1) & 0x07]) + << 32; + value2 |= + static_cast( + value_table[((index1 << 2) & 0b100) | ((index1 >> 14) & 0b11)]) + << 40; + value2 |= 
static_cast(value_table[(index1 >> 11) & 0x07]) + << 48; + value2 |= static_cast(value_table[(index1 >> 8) & 0x07]) + << 56; + + *reinterpret_cast(buffer + 8) = value2; + + value = static_cast(value_table[(index1 >> 21) & 0x07]); + value |= static_cast(value_table[(index1 >> 18) & 0x07]) << 8; + value |= + static_cast( + value_table[((index1 >> 15) & 0b110) | ((index1 >> 31) & 0b1)]) + << 16; + value |= static_cast(value_table[(index1 >> 28) & 0x07]) + << 24; + value |= static_cast(value_table[(index1 >> 25) & 0x07]) + << 32; + value |= + static_cast( + value_table[((index1 >> 22) & 0b100) | ((index2 >> 6) & 0b11)]) + << 40; + value |= static_cast(value_table[(index2 >> 3) & 0x07]) << 48; + value |= static_cast(value_table[(index2 >> 0) & 0x07]) << 56; + + *reinterpret_cast(buffer + 16) = value; + + value2 = static_cast(value_table[(index2 >> 13) & 0x07]); + value2 |= static_cast(value_table[(index2 >> 10) & 0x07]) + << 8; + value2 |= + static_cast( + value_table[((index2 >> 7) & 0b110) | ((index2 >> 23) & 0b1)]) + << 16; + value2 |= static_cast(value_table[(index2 >> 20) & 0x07]) + << 24; + value2 |= static_cast(value_table[(index2 >> 17) & 0x07]) + << 32; + value2 |= + static_cast( + value_table[((index2 >> 14) & 0b100) | ((index2 >> 30) & 0b11)]) + << 40; + value2 |= static_cast(value_table[(index2 >> 27) & 0x07]) + << 48; + value2 |= static_cast(value_table[(index2 >> 24) & 0x07]) + << 56; + + *reinterpret_cast(buffer + 24) = value2; + + buffer += 32; + current_offset += 32; + } + } + + // process remaining elements in current channel + while (count > 0) { + count -= 1; + const size_t index = GetNextTableIndexWidth3(current_offset++); + *buffer++ = value_table[index]; + } + + value_table += stride; + } +} + +// TODO(ddavis-2015): templating GetNextTableIndexWidth makes this method +// more than 2x faster, but with a large code size increase +template +void DecodeStateLUT::DecompressToBufferWidthAny(T* buffer) { + ScopedMicroProfiler scoped_profiler(__func__, micro_profiler_); + + if (use_alternate_axis_) { + const size_t stride = value_table_channel_stride_; + size_t current_offset = 0; + size_t count = count_indices_; + + while (count > 0) { + const T* value_table = static_cast(value_table_); + for (size_t channel = 0; channel < num_channels_; channel++) { + size_t index; + switch (compressed_bit_width_) { + case 1: + index = GetNextTableIndexWidth1(current_offset); + break; + case 2: + index = GetNextTableIndexWidth2(current_offset); + break; + case 3: + index = GetNextTableIndexWidth3(current_offset); + break; + case 4: + index = GetNextTableIndexWidth4(current_offset); + break; + case 5: + index = GetNextTableIndexWidth5(current_offset); + break; + case 6: + index = GetNextTableIndexWidth6(current_offset); + break; + case 7: + index = GetNextTableIndexWidth7(current_offset); + break; + } + current_offset++; + *buffer++ = value_table[index]; + value_table += stride; + } + count -= num_channels_; + } + } else { + const size_t stride = value_table_channel_stride_; + const T* value_table = static_cast(value_table_); + const size_t max_count = elements_per_channel_; + size_t current_offset = 0; + + for (size_t channel = 0; channel < num_channels_; channel++) { + size_t count = max_count; + + while (count-- > 0) { + size_t index; + switch (compressed_bit_width_) { + case 1: + index = GetNextTableIndexWidth1(current_offset); + break; + case 2: + index = GetNextTableIndexWidth2(current_offset); + break; + case 3: + index = GetNextTableIndexWidth3(current_offset); + break; + case 4: + index 
= GetNextTableIndexWidth4(current_offset); + break; + case 5: + index = GetNextTableIndexWidth5(current_offset); + break; + case 6: + index = GetNextTableIndexWidth6(current_offset); + break; + case 7: + index = GetNextTableIndexWidth7(current_offset); + break; + } + current_offset++; + *buffer++ = value_table[index]; + } + value_table += stride; + } + } +} + +template void DecodeStateLUT::DecompressToBufferWidthAny(bool*); +template void DecodeStateLUT::DecompressToBufferWidthAny(float*); +template void DecodeStateLUT::DecompressToBufferWidthAny(int8_t*); +template void DecodeStateLUT::DecompressToBufferWidthAny(int16_t*); +template void DecodeStateLUT::DecompressToBufferWidthAny(int32_t*); +template void DecodeStateLUT::DecompressToBufferWidthAny(int64_t*); + +inline size_t DecodeStateLUT::GetNextTableIndexWidth7( + const size_t current_offset) { + const size_t current_byte_index = (current_offset >> 3) * 7; + const uint8_t* indices = &compressed_indices_[current_byte_index]; + switch (current_offset & 0b111) { + case 0: + return indices[0] >> 1; + case 1: + return ((indices[0] & 0b1) << 6) | (indices[1] >> 2); + case 2: + return ((indices[1] & 0b11) << 5) | (indices[2] >> 3); + case 3: + return ((indices[2] & 0b111) << 4) | (indices[3] >> 4); + case 4: + return ((indices[3] & 0x0F) << 3) | (indices[4] >> 5); + case 5: + return ((indices[4] & 0x1F) << 2) | (indices[5] >> 6); + case 6: + return ((indices[5] & 0x3F) << 1) | (indices[6] >> 7); + case 7: + return indices[6] & 0x7F; + } + // NOTREACHED + return 0; +} + +inline size_t DecodeStateLUT::GetNextTableIndexWidth6( + const size_t current_offset) { + const size_t current_byte_index = (current_offset >> 2) * 3; + const uint8_t* indices = &compressed_indices_[current_byte_index]; + switch (current_offset & 0b11) { + case 0: + return indices[0] >> 2; + case 1: + return ((indices[0] & 0b11) << 4) | (indices[1] >> 4); + case 2: + return ((indices[1] & 0x0F) << 2) | (indices[2] >> 6); + case 3: + return indices[2] & 0x3F; + } + // NOTREACHED + return 0; +} + +inline size_t DecodeStateLUT::GetNextTableIndexWidth5( + const size_t current_offset) { + const size_t current_byte_index = (current_offset >> 3) * 5; + const uint8_t* indices = &compressed_indices_[current_byte_index]; + switch (current_offset & 0b111) { + case 0: + return indices[0] >> 3; + case 1: + return ((indices[0] & 0b111) << 2) | (indices[1] >> 6); + case 2: + return (indices[1] >> 1) & 0x1F; + case 3: + return ((indices[1] & 0b1) << 4) | (indices[2] >> 4); + case 4: + return ((indices[2] & 0x0F) << 1) | (indices[3] >> 7); + case 5: + return (indices[3] >> 2) & 0x1F; + case 6: + return ((indices[3] & 0b11) << 3) | (indices[4] >> 5); + case 7: + return indices[4] & 0x1F; + } + // NOTREACHED + return 0; +} + +inline size_t DecodeStateLUT::GetNextTableIndexWidth4( + const size_t current_offset) { + if (current_offset & 1) { + return compressed_indices_[current_offset >> 1] & 0x0F; + } else { + return compressed_indices_[current_offset >> 1] >> 4; + } +} + +inline size_t DecodeStateLUT::GetNextTableIndexWidth3( + const size_t current_offset) { + const size_t current_byte_index = (current_offset >> 3) * 3; + const uint8_t* indices = &compressed_indices_[current_byte_index]; + switch (current_offset & 0b111) { + case 0: + return indices[0] >> 5; + case 1: + return (indices[0] >> 2) & 0b111; + case 2: + return ((indices[0] & 0b11) << 1) | (indices[1] >> 7); + case 3: + return (indices[1] >> 4) & 0b111; + case 4: + return (indices[1] >> 1) & 0b111; + case 5: + return ((indices[1] & 
0b1) << 2) | (indices[2] >> 6); + case 6: + return (indices[2] >> 3) & 0b111; + case 7: + return indices[2] & 0b111; + } + // NOTREACHED + return 0; +} + +inline size_t DecodeStateLUT::GetNextTableIndexWidth2( + const size_t current_offset) { + if (current_offset & 0b10) { + if (current_offset & 1) { + return compressed_indices_[current_offset >> 2] & 0x03; + } else { + return (compressed_indices_[current_offset >> 2] >> 2) & 0x03; + } + } else { + if (current_offset & 1) { + return (compressed_indices_[current_offset >> 2] >> 4) & 0x03; + } else { + return (compressed_indices_[current_offset >> 2] >> 6) & 0x03; + } + } +} + +inline size_t DecodeStateLUT::GetNextTableIndexWidth1( + const size_t current_offset) { + const size_t shift = ~current_offset & 0b111; + return (compressed_indices_[current_offset >> 3] >> shift) & 0b1; +} + +} // namespace tflite diff --git a/tensorflow/lite/micro/kernels/decode_state_lut.h b/tensorflow/lite/micro/kernels/decode_state_lut.h new file mode 100644 index 00000000000..dbb64683960 --- /dev/null +++ b/tensorflow/lite/micro/kernels/decode_state_lut.h @@ -0,0 +1,92 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_MICRO_MICRO_KERNELS_DECODE_STATE_LUT_H_ +#define TENSORFLOW_LITE_MICRO_MICRO_KERNELS_DECODE_STATE_LUT_H_ + +#include + +#include "tensorflow/lite/micro/compatibility.h" +#include "tensorflow/lite/micro/kernels/decode_state.h" + +namespace tflite { + +struct DecodeStateLUT : public DecodeState { + DecodeStateLUT() = delete; + + DecodeStateLUT(const TfLiteContext* context, MicroProfilerInterface* profiler) + : DecodeState(context, profiler) {} + + virtual TfLiteStatus Setup(const TfLiteTensor& input, + const TfLiteTensor& ancillary, + const TfLiteTensor& output) override; + virtual TfLiteStatus Decode(const TfLiteEvalTensor& input, + const TfLiteEvalTensor& ancillary, + const TfLiteEvalTensor& output) override; + + protected: + // LUT compression constants + static constexpr size_t kMaxBitWidth = 7; + static constexpr size_t kMaxValueTableChannelStride = 128; + + private: + // LUT Decode Common Metadata constants + static constexpr size_t kDcmVersionOffset = 4; + static constexpr size_t kDcmParamsOffset = 5; + static constexpr uint8_t kDcmParamsBitWidthMask = 0x07; + static constexpr size_t kDcmValueTableStrideOffset = 6; + + protected: + virtual ~DecodeStateLUT() = default; + + template + T* DecompressToBuffer(void* buffer); + + // optimized C++ for INT8, use_alt_axis == false + void DecompressToBufferWidth4_16(int8_t* buffer); + void DecompressToBufferWidth3_32(int8_t* buffer); + void DecompressToBufferWidth2_16(int8_t* buffer); + + // generic C++ for any bit width and value table type + template + void DecompressToBufferWidthAny(T* buffer); + + // Optimized C++ table index fetch + inline size_t GetNextTableIndexWidth7(const size_t current_offset); + inline size_t GetNextTableIndexWidth6(const size_t 
current_offset); + inline size_t GetNextTableIndexWidth5(const size_t current_offset); + inline size_t GetNextTableIndexWidth4(const size_t current_offset); + inline size_t GetNextTableIndexWidth3(const size_t current_offset); + inline size_t GetNextTableIndexWidth2(const size_t current_offset); + inline size_t GetNextTableIndexWidth1(const size_t current_offset); + + protected: + const uint8_t* compressed_indices_ = nullptr; + size_t count_indices_ = 0; + size_t num_channels_ = 1; + size_t elements_per_channel_ = 0; // computed from use_alternate_axis_ + const void* value_table_ = nullptr; // Pointer into FlatBuffer values + uint8_t value_table_channel_stride_ = 0; // elements per channel + uint8_t compressed_bit_width_ = 0; // 1 to 7 bits + bool use_alternate_axis_ = false; // shape channel axis: + // false = first, true = last + + private: + TF_LITE_REMOVE_VIRTUAL_DELETE +}; + +} // namespace tflite + +#endif // TENSORFLOW_LITE_MICRO_MICRO_KERNELS_DECODE_STATE_LUT_H_ diff --git a/tensorflow/lite/micro/kernels/decode_test.cc b/tensorflow/lite/micro/kernels/decode_test.cc new file mode 100644 index 00000000000..3008736e535 --- /dev/null +++ b/tensorflow/lite/micro/kernels/decode_test.cc @@ -0,0 +1,333 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include +#include + +#include "tensorflow/lite/core/c/common.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/decode_state.h" +#include "tensorflow/lite/micro/kernels/kernel_runner.h" +#include "tensorflow/lite/micro/test_helpers.h" +#include "tensorflow/lite/micro/testing/micro_test.h" + +namespace tflite { +namespace testing { +namespace { + +struct TensorInDatum { + const void* const data; + const TfLiteIntArray& dims; +}; + +struct TensorOutDatum { + void* const data; + const TfLiteIntArray& dims; + const TfLiteType type; + const TfLiteFloatArray& scales; + const TfLiteIntArray& zero_points; + const int quantized_dimension; + + // initialized by CreatePerChannelQuantizedTensor + const TfLiteAffineQuantization affine_quantization; +}; + +template +struct AncillaryLUT { + AncillaryLUT(const uint8_t (&dcm)[tflite::DecodeState::kDcmSizeInBytes], + const T (&values)[N]) { + std::copy(std::begin(dcm), std::end(dcm), std::begin(dcm_)); + std::copy(std::begin(values), std::end(values), std::begin(value_table_)); + } + + private: + uint8_t dcm_[tflite::DecodeState::kDcmSizeInBytes]; + T value_table_[N > 0 ? 
N : 1]; // assure not zero length +}; + +constexpr int kBitWidthLUT = 2; + +constexpr int8_t kAncillaryDataLUT0[] = {1, 2, 3, 4}; +constexpr int16_t kAncillaryDataLUT1[] = {5, 6, 7, 8}; + +constexpr uint8_t kDcmLUT0[tflite::DecodeState::kDcmSizeInBytes] = { + tflite::DecodeState::kDcmTypeLUT, // type: LUT + 1, // DCM version: 1 + 0, // reserved + 0, // reserved + 1, // LUT version: 1 + kBitWidthLUT, // Parameters: bit-width 2 + std::size(kAncillaryDataLUT0), // channel stride +}; + +constexpr uint8_t kDcmLUT1[tflite::DecodeState::kDcmSizeInBytes] = { + tflite::DecodeState::kDcmTypeLUT, // type: LUT + 1, // DCM version: 1 + 0, // reserved + 0, // reserved + 1, // LUT version: 1 + kBitWidthLUT, // Parameters: bit-width 2 + std::size(kAncillaryDataLUT1), // channel stride +}; + +// Align the tensor data the same as a Buffer in the TfLite schema +alignas(16) const + AncillaryLUT kAncillaryLUT0 = { + {kDcmLUT0}, {kAncillaryDataLUT0}}; +alignas(16) const + AncillaryLUT kAncillaryLUT1 = { + {kDcmLUT1}, {kAncillaryDataLUT1}}; +alignas(16) const uint8_t kEncodedLUT[] = {0x1B, 0xE4}; + +// Tensor shapes as TfLiteIntArray +constexpr int kOutputShapeLUT[] = {3, 1, 2, 4}; +constexpr int kEncodedShapeLUT[] = {1, sizeof(kEncodedLUT)}; +constexpr int kAncillaryShapeLUT0[] = {1, sizeof(kAncillaryLUT0)}; +constexpr int kAncillaryShapeLUT1[] = {1, sizeof(kAncillaryLUT1)}; + +constexpr int8_t kExpectLUT0[] = {1, 2, 3, 4, 4, 3, 2, 1}; +constexpr int16_t kExpectLUT1[] = {5, 6, 7, 8, 8, 7, 6, 5}; + +template +TfLiteStatus CheckOutput(const TfLiteTensor& output, + const void* const expected) { + const T* const expected_data = reinterpret_cast(expected); + const T* const output_data = tflite::GetTensorData(&output); + + constexpr float kTolerance = 1e-5; + const size_t kOutputCount = tflite::NumElements(&output); + for (size_t i = 0; i < kOutputCount; i++) { + TF_LITE_MICRO_EXPECT_NEAR(expected_data[i], output_data[i], kTolerance); + TF_LITE_MICRO_CHECK_FAIL(); + } + + return kTfLiteOk; +} + +template +TfLiteStatus ExecuteDecodeTest( + TfLiteTensor* tensors, const TFLMRegistration& registration, + const std::initializer_list& expected) { + int kInputArrayData[kNumInputs + 1] = {kNumInputs}; + for (size_t i = 0; i < kNumInputs; i++) { + kInputArrayData[i + 1] = i; + } + TfLiteIntArray* inputs_array = IntArrayFromInts(kInputArrayData); + + int kOutputArrayData[kNumOutputs + 1] = {kNumOutputs}; + for (size_t i = 0; i < kNumOutputs; i++) { + kOutputArrayData[i + 1] = i + kNumInputs; + } + TfLiteIntArray* outputs_array = IntArrayFromInts(kOutputArrayData); + + micro::KernelRunner runner(registration, tensors, kNumInputs + kNumOutputs, + inputs_array, outputs_array, nullptr); + + if (runner.InitAndPrepare() != kTfLiteOk || runner.Invoke() != kTfLiteOk) { + return kTfLiteError; + } + + const TfLiteTensor* const output_tensors = &tensors[kNumInputs]; + TfLiteStatus status = kTfLiteError; + for (size_t i = 0; i < kNumOutputs; i++) { + switch (output_tensors[i].type) { + case kTfLiteInt8: + status = CheckOutput(output_tensors[i], expected.begin()[i]); + break; + case kTfLiteInt16: + status = CheckOutput(output_tensors[i], expected.begin()[i]); + break; + default: + TF_LITE_MICRO_FAIL("unsupported tensor type in test"); + break; + } + } + + return status; +} + +template +void TestDecode(const std::initializer_list& encodes, + const std::initializer_list& ancillaries, + const std::initializer_list& outputs, + const std::initializer_list& expected, + const TFLMRegistration& registration, + const TfLiteStatus expected_status = 
kTfLiteOk) { + TfLiteTensor tensors[kNumInputs + kNumOutputs] = {}; + + for (size_t i = 0; i < kNumInputs; i += 2) { + const TensorInDatum& tid_encode = *encodes.begin()[i / 2]; + tensors[i] = CreateTensor(tid_encode.data, + const_cast(&tid_encode.dims), + false, kTfLiteUInt8); + const TensorInDatum& tid_ancillary = *ancillaries.begin()[i / 2]; + tensors[i + 1] = CreateTensor( + tid_ancillary.data, const_cast(&tid_ancillary.dims), + false, kTfLiteUInt8); + } + for (size_t i = 0; i < kNumOutputs; i++) { + const TensorOutDatum& tod = *outputs.begin()[i]; + if (tod.scales.size == 0) { + tensors[i + kNumInputs] = CreateTensor( + tod.data, const_cast(&tod.dims), false, tod.type); + } else { + tensors[i + kNumInputs] = CreatePerChannelQuantizedTensor( + tod.data, const_cast(&tod.dims), + const_cast(&tod.scales), + const_cast(&tod.zero_points), + const_cast(&tod.affine_quantization), + tod.quantized_dimension, false, tod.type); + } + } + + TfLiteStatus s = ExecuteDecodeTest( + tensors, registration, expected); + TF_LITE_MICRO_EXPECT_EQ(s, expected_status); +} + +} // namespace +} // namespace testing +} // namespace tflite + +TF_LITE_MICRO_TESTS_BEGIN + +TF_LITE_MICRO_TEST(DecodeSingleTensor) { + // Align the tensor data the same as a Buffer in the TfLite schema + alignas(16) int8_t output_data[std::size(tflite::testing::kExpectLUT0)] = {}; + + const TfLiteIntArray* const encoded_dims = + tflite::testing::IntArrayFromInts(tflite::testing::kEncodedShapeLUT); + static const tflite::testing::TensorInDatum tid_encode = { + tflite::testing::kEncodedLUT, + *encoded_dims, + }; + static constexpr std::initializer_list + encodes = { + &tid_encode, + }; + + const TfLiteIntArray* const ancillary_dims = + tflite::testing::IntArrayFromInts(tflite::testing::kAncillaryShapeLUT0); + static const tflite::testing::TensorInDatum tid_ancillary = { + &tflite::testing::kAncillaryLUT0, + *ancillary_dims, + }; + static constexpr std::initializer_list + ancillaries = {&tid_ancillary}; + + const TfLiteIntArray* const output_dims = + tflite::testing::IntArrayFromInts(tflite::testing::kOutputShapeLUT); + constexpr float output_scales_data[] = {0}; + const TfLiteFloatArray* const output_scales = + tflite::testing::FloatArrayFromFloats(output_scales_data); + constexpr int output_zero_points_data[] = {0}; + const TfLiteIntArray* const output_zero_points = + tflite::testing::IntArrayFromInts(output_zero_points_data); + static const tflite::testing::TensorOutDatum tod = { + output_data, + *output_dims, + kTfLiteInt8, + *output_scales, + *output_zero_points, + 0, + {}, + }; + static constexpr std::initializer_list + outputs = {&tod}; + + const std::initializer_list expected = { + tflite::testing::kExpectLUT0, + }; + + tflite::testing::TestDecode( + encodes, ancillaries, outputs, expected, tflite::Register_DECODE()); +} + +TF_LITE_MICRO_TEST(DecodeTwoTensors) { + // Align the tensor data the same as a Buffer in the TfLite schema + alignas(16) int8_t output_data0[std::size(tflite::testing::kExpectLUT0)] = {}; + alignas(16) + int16_t output_data1[std::size(tflite::testing::kExpectLUT1)] = {}; + + const TfLiteIntArray* const encoded_dims = + tflite::testing::IntArrayFromInts(tflite::testing::kEncodedShapeLUT); + static const tflite::testing::TensorInDatum tid_encode0 = { + tflite::testing::kEncodedLUT, + *encoded_dims, + }; + static const tflite::testing::TensorInDatum tid_encode1 = { + tflite::testing::kEncodedLUT, + *encoded_dims, + }; + static constexpr std::initializer_list + encodes = {&tid_encode0, &tid_encode1}; + + const 
TfLiteIntArray* const ancillary_dims0 = + tflite::testing::IntArrayFromInts(tflite::testing::kAncillaryShapeLUT0); + static const tflite::testing::TensorInDatum tid_ancillary0 = { + &tflite::testing::kAncillaryLUT0, + *ancillary_dims0, + }; + const TfLiteIntArray* const ancillary_dims1 = + tflite::testing::IntArrayFromInts(tflite::testing::kAncillaryShapeLUT1); + static const tflite::testing::TensorInDatum tid_ancillary1 = { + &tflite::testing::kAncillaryLUT1, + *ancillary_dims1, + }; + static constexpr std::initializer_list + ancillaries = {&tid_ancillary0, &tid_ancillary1}; + + const TfLiteIntArray* const output_dims = + tflite::testing::IntArrayFromInts(tflite::testing::kOutputShapeLUT); + constexpr float output_scales_data[] = {1, 1.0f}; + const TfLiteFloatArray* const output_scales = + tflite::testing::FloatArrayFromFloats(output_scales_data); + constexpr int output_zero_points_data[] = {1, 0}; + const TfLiteIntArray* const output_zero_points = + tflite::testing::IntArrayFromInts(output_zero_points_data); + static const tflite::testing::TensorOutDatum tod0 = { + output_data0, + *output_dims, + kTfLiteInt8, + *output_scales, + *output_zero_points, + 0, + {}, + }; + static const tflite::testing::TensorOutDatum tod1 = { + output_data1, + *output_dims, + kTfLiteInt16, + *output_scales, + *output_zero_points, + 0, + {}, + }; + static constexpr std::initializer_list + outputs = {&tod0, &tod1}; + + const std::initializer_list expected = { + tflite::testing::kExpectLUT0, + tflite::testing::kExpectLUT1, + }; + + tflite::testing::TestDecode( + encodes, ancillaries, outputs, expected, tflite::Register_DECODE()); +} + +TF_LITE_MICRO_TESTS_END diff --git a/tensorflow/lite/micro/kernels/micro_ops.h b/tensorflow/lite/micro/kernels/micro_ops.h index 2e33a6730bd..8b76ca2cc17 100644 --- a/tensorflow/lite/micro/kernels/micro_ops.h +++ b/tensorflow/lite/micro/kernels/micro_ops.h @@ -53,6 +53,7 @@ TFLMRegistration Register_CONCATENATION(); TFLMRegistration Register_CONV_2D(); TFLMRegistration Register_COS(); TFLMRegistration Register_CUMSUM(); +TFLMRegistration Register_DECODE(); TFLMRegistration Register_DEPTH_TO_SPACE(); TFLMRegistration Register_DEPTHWISE_CONV_2D(); TFLMRegistration Register_DEQUANTIZE(); diff --git a/tensorflow/lite/micro/micro_mutable_op_resolver.h b/tensorflow/lite/micro/micro_mutable_op_resolver.h index f3f2080f0aa..6a638d93b97 100644 --- a/tensorflow/lite/micro/micro_mutable_op_resolver.h +++ b/tensorflow/lite/micro/micro_mutable_op_resolver.h @@ -1,4 +1,4 @@ -/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -206,6 +206,11 @@ class MicroMutableOpResolver : public MicroOpResolver { ParseCumsum); } + TfLiteStatus AddDecode() { + const TFLMRegistration& registration = tflite::Register_DECODE(); + return AddCustom("TFLM_DECODE", ®istration); + } + TfLiteStatus AddDelay() { // TODO(b/286250473): change back name to "Delay" and remove namespace return AddCustom("SignalDelay", tflite::tflm_signal::Register_DELAY()); diff --git a/tensorflow/lite/micro/tools/benchmarking/op_resolver.h b/tensorflow/lite/micro/tools/benchmarking/op_resolver.h index 9b98849c472..651429b76ec 100644 --- a/tensorflow/lite/micro/tools/benchmarking/op_resolver.h +++ b/tensorflow/lite/micro/tools/benchmarking/op_resolver.h @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
+/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -45,6 +45,7 @@ inline TfLiteStatus CreateOpResolver(TflmOpResolver& op_resolver) { TF_LITE_ENSURE_STATUS(op_resolver.AddConv2D()); TF_LITE_ENSURE_STATUS(op_resolver.AddCos()); TF_LITE_ENSURE_STATUS(op_resolver.AddCumSum()); + TF_LITE_ENSURE_STATUS(op_resolver.AddDecode()); TF_LITE_ENSURE_STATUS(op_resolver.AddDelay()); TF_LITE_ENSURE_STATUS(op_resolver.AddDepthToSpace()); TF_LITE_ENSURE_STATUS(op_resolver.AddDepthwiseConv2D()); diff --git a/tensorflow/lite/micro/tools/make/Makefile b/tensorflow/lite/micro/tools/make/Makefile index 0bf5532badf..a43abf7f7f7 100644 --- a/tensorflow/lite/micro/tools/make/Makefile +++ b/tensorflow/lite/micro/tools/make/Makefile @@ -386,6 +386,9 @@ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/concatenation.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/conv.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/conv_common.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/cumsum.cc \ +$(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/decode.cc \ +$(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/decode_state.cc \ +$(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/decode_state_lut.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/decompress.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/decompress_common.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/depth_to_space.cc \ From 15ac156290878ced26340859f19b94ecec0f1885 Mon Sep 17 00:00:00 2001 From: ddavis-2015 Date: Tue, 8 Jul 2025 12:01:19 -0700 Subject: [PATCH 2/8] update copyright --- tensorflow/lite/micro/kernels/micro_ops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/micro/kernels/micro_ops.h b/tensorflow/lite/micro/kernels/micro_ops.h index 8b76ca2cc17..b715c735017 100644 --- a/tensorflow/lite/micro/kernels/micro_ops.h +++ b/tensorflow/lite/micro/kernels/micro_ops.h @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. From 6f96b2983a150c728ea00837e9b9953c9479634f Mon Sep 17 00:00:00 2001 From: ddavis-2015 Date: Tue, 8 Jul 2025 17:05:24 -0700 Subject: [PATCH 3/8] Don't use constructors with global objects (bluepill will not call them). Cleanup unit test. 
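[Editor's note: the pattern this patch applies, sketched here for clarity and not part of the diff. On bare-metal targets such as the bluepill, static constructors for namespace-scope objects may never run, so any object with a non-trivial constructor must be built inside the test body instead. The sketch assumes the AncillaryLUT helper and constants defined in decode_test.cc.

    // Avoided: namespace-scope object whose constructor may never execute.
    //   const AncillaryLUT<int8_t, 4> kAncillary = {{kDcm}, {kValues}};
    //
    // Preferred: construct inside the test, after main() has started.
    TF_LITE_MICRO_TEST(Example) {
      alignas(16) const AncillaryLUT<int8_t, std::size(kAncillaryDataLUT0)>
          kAncillaryLUT = {{kDcmLUT0}, {kAncillaryDataLUT0}};
      // ... build tensors that point at kAncillaryLUT ...
    }
]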
--- tensorflow/lite/micro/kernels/decode_test.cc | 121 ++++++++++--------- 1 file changed, 66 insertions(+), 55 deletions(-) diff --git a/tensorflow/lite/micro/kernels/decode_test.cc b/tensorflow/lite/micro/kernels/decode_test.cc index 3008736e535..69ee7f61a5f 100644 --- a/tensorflow/lite/micro/kernels/decode_test.cc +++ b/tensorflow/lite/micro/kernels/decode_test.cc @@ -47,6 +47,7 @@ struct TensorOutDatum { template struct AncillaryLUT { + AncillaryLUT() = delete; AncillaryLUT(const uint8_t (&dcm)[tflite::DecodeState::kDcmSizeInBytes], const T (&values)[N]) { std::copy(std::begin(dcm), std::end(dcm), std::begin(dcm_)); @@ -84,19 +85,11 @@ constexpr uint8_t kDcmLUT1[tflite::DecodeState::kDcmSizeInBytes] = { }; // Align the tensor data the same as a Buffer in the TfLite schema -alignas(16) const - AncillaryLUT kAncillaryLUT0 = { - {kDcmLUT0}, {kAncillaryDataLUT0}}; -alignas(16) const - AncillaryLUT kAncillaryLUT1 = { - {kDcmLUT1}, {kAncillaryDataLUT1}}; alignas(16) const uint8_t kEncodedLUT[] = {0x1B, 0xE4}; // Tensor shapes as TfLiteIntArray constexpr int kOutputShapeLUT[] = {3, 1, 2, 4}; constexpr int kEncodedShapeLUT[] = {1, sizeof(kEncodedLUT)}; -constexpr int kAncillaryShapeLUT0[] = {1, sizeof(kAncillaryLUT0)}; -constexpr int kAncillaryShapeLUT1[] = {1, sizeof(kAncillaryLUT1)}; constexpr int8_t kExpectLUT0[] = {1, 2, 3, 4, 4, 3, 2, 1}; constexpr int16_t kExpectLUT1[] = {5, 6, 7, 8, 8, 7, 6, 5}; @@ -204,39 +197,55 @@ void TestDecode(const std::initializer_list& encodes, TF_LITE_MICRO_TESTS_BEGIN +using tflite::testing::AncillaryLUT; +using tflite::testing::kAncillaryDataLUT0; +using tflite::testing::kAncillaryDataLUT1; +using tflite::testing::kDcmLUT0; +using tflite::testing::kDcmLUT1; +using tflite::testing::kEncodedLUT; +using tflite::testing::kEncodedShapeLUT; +using tflite::testing::kExpectLUT0; +using tflite::testing::kExpectLUT1; +using tflite::testing::kOutputShapeLUT; +using tflite::testing::TensorInDatum; +using tflite::testing::TensorOutDatum; + TF_LITE_MICRO_TEST(DecodeSingleTensor) { // Align the tensor data the same as a Buffer in the TfLite schema - alignas(16) int8_t output_data[std::size(tflite::testing::kExpectLUT0)] = {}; + alignas(16) int8_t output_data[std::size(kExpectLUT0)] = {}; + alignas(16) const AncillaryLUT + kAncillaryLUT = {{kDcmLUT0}, {kAncillaryDataLUT0}}; + + constexpr int kAncillaryShapeLUT[] = {1, sizeof(kAncillaryLUT)}; const TfLiteIntArray* const encoded_dims = - tflite::testing::IntArrayFromInts(tflite::testing::kEncodedShapeLUT); - static const tflite::testing::TensorInDatum tid_encode = { - tflite::testing::kEncodedLUT, + tflite::testing::IntArrayFromInts(kEncodedShapeLUT); + static const TensorInDatum tid_encode = { + kEncodedLUT, *encoded_dims, }; - static constexpr std::initializer_list - encodes = { - &tid_encode, - }; + static constexpr std::initializer_list encodes = { + &tid_encode, + }; const TfLiteIntArray* const ancillary_dims = - tflite::testing::IntArrayFromInts(tflite::testing::kAncillaryShapeLUT0); - static const tflite::testing::TensorInDatum tid_ancillary = { - &tflite::testing::kAncillaryLUT0, + tflite::testing::IntArrayFromInts(kAncillaryShapeLUT); + static const TensorInDatum tid_ancillary = { + &kAncillaryLUT, *ancillary_dims, }; - static constexpr std::initializer_list - ancillaries = {&tid_ancillary}; + static constexpr std::initializer_list ancillaries = { + &tid_ancillary}; const TfLiteIntArray* const output_dims = - tflite::testing::IntArrayFromInts(tflite::testing::kOutputShapeLUT); + 
tflite::testing::IntArrayFromInts(kOutputShapeLUT); constexpr float output_scales_data[] = {0}; const TfLiteFloatArray* const output_scales = tflite::testing::FloatArrayFromFloats(output_scales_data); constexpr int output_zero_points_data[] = {0}; const TfLiteIntArray* const output_zero_points = tflite::testing::IntArrayFromInts(output_zero_points_data); - static const tflite::testing::TensorOutDatum tod = { + static const TensorOutDatum tod = { output_data, *output_dims, kTfLiteInt8, @@ -245,12 +254,10 @@ TF_LITE_MICRO_TEST(DecodeSingleTensor) { 0, {}, }; - static constexpr std::initializer_list - outputs = {&tod}; + static constexpr std::initializer_list outputs = { + &tod}; - const std::initializer_list expected = { - tflite::testing::kExpectLUT0, - }; + const std::initializer_list expected = {kExpectLUT0}; tflite::testing::TestDecode( @@ -259,47 +266,53 @@ TF_LITE_MICRO_TEST(DecodeSingleTensor) { TF_LITE_MICRO_TEST(DecodeTwoTensors) { // Align the tensor data the same as a Buffer in the TfLite schema - alignas(16) int8_t output_data0[std::size(tflite::testing::kExpectLUT0)] = {}; - alignas(16) - int16_t output_data1[std::size(tflite::testing::kExpectLUT1)] = {}; + alignas(16) int8_t output_data0[std::size(kExpectLUT0)] = {}; + alignas(16) int16_t output_data1[std::size(kExpectLUT1)] = {}; + alignas(16) const AncillaryLUT + kAncillaryLUT0 = {{kDcmLUT0}, {kAncillaryDataLUT0}}; + alignas(16) const AncillaryLUT + kAncillaryLUT1 = {{kDcmLUT1}, {kAncillaryDataLUT1}}; + + constexpr int kAncillaryShapeLUT0[] = {1, sizeof(kAncillaryLUT0)}; + constexpr int kAncillaryShapeLUT1[] = {1, sizeof(kAncillaryLUT1)}; const TfLiteIntArray* const encoded_dims = - tflite::testing::IntArrayFromInts(tflite::testing::kEncodedShapeLUT); - static const tflite::testing::TensorInDatum tid_encode0 = { - tflite::testing::kEncodedLUT, + tflite::testing::IntArrayFromInts(kEncodedShapeLUT); + static const TensorInDatum tid_encode0 = { + kEncodedLUT, *encoded_dims, }; - static const tflite::testing::TensorInDatum tid_encode1 = { - tflite::testing::kEncodedLUT, + static const TensorInDatum tid_encode1 = { + kEncodedLUT, *encoded_dims, }; - static constexpr std::initializer_list - encodes = {&tid_encode0, &tid_encode1}; + static constexpr std::initializer_list encodes = { + &tid_encode0, &tid_encode1}; const TfLiteIntArray* const ancillary_dims0 = - tflite::testing::IntArrayFromInts(tflite::testing::kAncillaryShapeLUT0); - static const tflite::testing::TensorInDatum tid_ancillary0 = { - &tflite::testing::kAncillaryLUT0, + tflite::testing::IntArrayFromInts(kAncillaryShapeLUT0); + static const TensorInDatum tid_ancillary0 = { + &kAncillaryLUT0, *ancillary_dims0, }; const TfLiteIntArray* const ancillary_dims1 = - tflite::testing::IntArrayFromInts(tflite::testing::kAncillaryShapeLUT1); - static const tflite::testing::TensorInDatum tid_ancillary1 = { - &tflite::testing::kAncillaryLUT1, + tflite::testing::IntArrayFromInts(kAncillaryShapeLUT1); + static const TensorInDatum tid_ancillary1 = { + &kAncillaryLUT1, *ancillary_dims1, }; - static constexpr std::initializer_list - ancillaries = {&tid_ancillary0, &tid_ancillary1}; + static constexpr std::initializer_list ancillaries = { + &tid_ancillary0, &tid_ancillary1}; const TfLiteIntArray* const output_dims = - tflite::testing::IntArrayFromInts(tflite::testing::kOutputShapeLUT); + tflite::testing::IntArrayFromInts(kOutputShapeLUT); constexpr float output_scales_data[] = {1, 1.0f}; const TfLiteFloatArray* const output_scales = 
tflite::testing::FloatArrayFromFloats(output_scales_data); constexpr int output_zero_points_data[] = {1, 0}; const TfLiteIntArray* const output_zero_points = tflite::testing::IntArrayFromInts(output_zero_points_data); - static const tflite::testing::TensorOutDatum tod0 = { + static const TensorOutDatum tod0 = { output_data0, *output_dims, kTfLiteInt8, @@ -308,7 +321,7 @@ TF_LITE_MICRO_TEST(DecodeTwoTensors) { 0, {}, }; - static const tflite::testing::TensorOutDatum tod1 = { + static const TensorOutDatum tod1 = { output_data1, *output_dims, kTfLiteInt16, @@ -317,13 +330,11 @@ TF_LITE_MICRO_TEST(DecodeTwoTensors) { 0, {}, }; - static constexpr std::initializer_list - outputs = {&tod0, &tod1}; + static constexpr std::initializer_list outputs = { + &tod0, &tod1}; - const std::initializer_list expected = { - tflite::testing::kExpectLUT0, - tflite::testing::kExpectLUT1, - }; + const std::initializer_list expected = {kExpectLUT0, + kExpectLUT1}; tflite::testing::TestDecode( From 7d1463a72791690b94f2603d962770fb9e2b57a0 Mon Sep 17 00:00:00 2001 From: ddavis-2015 Date: Sun, 20 Jul 2025 11:58:58 -0700 Subject: [PATCH 4/8] Support for DECODE operator @tensorflow/micro Additional support for DECODE operator. Add Xtensa optimizations for LUT decompression. Move all Xtensa kernel source references to the Xtensa target makefile. bug=fixes #3150 --- .../lite/micro/kernels/xtensa/decode_state.cc | 48 ++ .../kernels/xtensa/xtensa_decode_state_lut.cc | 609 ++++++++++++++++++ .../kernels/xtensa/xtensa_decode_state_lut.h | 57 ++ .../lite/micro/tools/make/ext_libs/xtensa.inc | 28 - .../tools/make/targets/xtensa_makefile.inc | 61 +- 5 files changed, 764 insertions(+), 39 deletions(-) create mode 100644 tensorflow/lite/micro/kernels/xtensa/decode_state.cc create mode 100644 tensorflow/lite/micro/kernels/xtensa/xtensa_decode_state_lut.cc create mode 100644 tensorflow/lite/micro/kernels/xtensa/xtensa_decode_state_lut.h diff --git a/tensorflow/lite/micro/kernels/xtensa/decode_state.cc b/tensorflow/lite/micro/kernels/xtensa/decode_state.cc new file mode 100644 index 00000000000..fcc8f39137b --- /dev/null +++ b/tensorflow/lite/micro/kernels/xtensa/decode_state.cc @@ -0,0 +1,48 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
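Editor's note on the decode_test.cc refactor above: the angle-bracketed template arguments in this hunk were lost in extraction. A minimal sketch of the intended shapes, reconstructed from the surrounding code (the exact parameter names are assumptions):

    // Sketch only: template signature reconstructed from usage in this hunk.
    template <typename T, size_t N>
    struct AncillaryLUT {
      AncillaryLUT() = delete;  // always built from a DCM plus a value table
      AncillaryLUT(const uint8_t (&dcm)[tflite::DecodeState::kDcmSizeInBytes],
                   const T (&values)[N]) {
        std::copy(std::begin(dcm), std::end(dcm), std::begin(dcm_));
        std::copy(std::begin(values), std::end(values),
                  std::begin(value_table_));
      }

     private:
      uint8_t dcm_[tflite::DecodeState::kDcmSizeInBytes];
      T value_table_[N];
    };

    // The test-local instance then reads (template arguments again assumed):
    alignas(16) const AncillaryLUT<int8_t, std::size(kAncillaryDataLUT0)>
        kAncillaryLUT = {{kDcmLUT0}, {kAncillaryDataLUT0}};

Moving the ancillary objects from file scope into each test scopes the 16-byte-aligned tables to the test that consumes them, and the deleted default constructor rules out an accidentally empty DCM/value-table pair. As a sanity check on the fixture: with 2-bit indices read most-significant field first, kEncodedLUT = {0x1B, 0xE4} unpacks to indices 0,1,2,3,3,2,1,0, which against a value table of evidently {1, 2, 3, 4} reproduces kExpectLUT0 = {1, 2, 3, 4, 4, 3, 2, 1}.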
+==============================================================================*/ + +#include "tensorflow/lite/micro/kernels/decode_state.h" + +#include "tensorflow/lite/micro/kernels/decode_state_lut.h" +#include "tensorflow/lite/micro/micro_context.h" + +#ifdef HIFI5 +#include "tensorflow/lite/micro/kernels/xtensa/xtensa_decode_state_lut.h" +#endif // HIFI5 + +namespace tflite { + +DecodeState* DecodeState::CreateDecodeStateLUT( + const TfLiteContext* context, MicroProfilerInterface* profiler) { + MicroContext* const micro_context = GetMicroContext(context); +#ifdef HIFI5 + constexpr size_t kBufferSize = sizeof(XtensaDecodeStateLUT); +#else + constexpr size_t kBufferSize = sizeof(DecodeStateLUT); +#endif // HIFI5 + void* buffer = micro_context->AllocatePersistentBuffer(kBufferSize); + if (buffer == nullptr) { + return nullptr; + } +#ifdef HIFI5 + DecodeState* dsp = new (buffer) XtensaDecodeStateLUT(context, profiler); +#else + DecodeState* dsp = new (buffer) DecodeStateLUT(context, profiler); +#endif // HIFI5 + + return dsp; +} + +} // namespace tflite diff --git a/tensorflow/lite/micro/kernels/xtensa/xtensa_decode_state_lut.cc b/tensorflow/lite/micro/kernels/xtensa/xtensa_decode_state_lut.cc new file mode 100644 index 00000000000..de5435f4b00 --- /dev/null +++ b/tensorflow/lite/micro/kernels/xtensa/xtensa_decode_state_lut.cc @@ -0,0 +1,609 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
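The factory above picks the LUT implementation at compile time and constructs it with placement new inside arena-persistent memory, so nothing is heap-allocated and nothing is ever destroyed. One possible tidy-up, sketched here and not part of the patch, is to collapse the three #ifdef HIFI5 blocks into a single type alias:

    // Hypothetical alternative to the repeated #ifdef HIFI5 blocks above.
    #ifdef HIFI5
    using DecodeStateLUTImpl = XtensaDecodeStateLUT;
    #else
    using DecodeStateLUTImpl = DecodeStateLUT;
    #endif  // HIFI5

    DecodeState* DecodeState::CreateDecodeStateLUT(
        const TfLiteContext* context, MicroProfilerInterface* profiler) {
      MicroContext* const micro_context = GetMicroContext(context);
      void* buffer =
          micro_context->AllocatePersistentBuffer(sizeof(DecodeStateLUTImpl));
      if (buffer == nullptr) {
        return nullptr;
      }
      return new (buffer) DecodeStateLUTImpl(context, profiler);
    }

Behavior is identical; the alias keeps the buffer-size computation and the constructor call from drifting apart.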
+==============================================================================*/ + +#include "tensorflow/lite/micro/kernels/xtensa/xtensa_decode_state_lut.h" + +#include +#include + +#include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/xtensa/xtensa.h" +#include "tensorflow/lite/micro/micro_log.h" +#include "tensorflow/lite/micro/micro_profiler.h" + +namespace tflite { + +void XtensaDecodeStateLUT::DecompressToBufferWidth4_Xtensa(int8_t* buffer) { + ScopedMicroProfiler scoped_profiler(__func__, micro_profiler_); + + ae_int8x8 d_shuffle_t = AE_MOVINT8X8_FROMINT64(0xFB73EA62D951C840LL); + ae_int8x8 d_shuffle_value_t = AE_MOVINT8X8_FROMINT64(0x08192A3B4C5D6E7FLL); + int elements_per_channel_t_by_4 = elements_per_channel_ >> 4; + int elements_per_channel_t_rem = elements_per_channel_ & 0xF; + int j; + + ae_int8x8 d_out1, d_out2; + ae_int8x8 d_value_0_t, d_value_1_t; + ae_int8x8 d_value_0, d_value_1; + ae_int8x8 d_index, d_dummy; + + ae_int8x8* __restrict pIn_tmp = (ae_int8x8*)compressed_indices_; + ae_int8* __restrict p_out_tmp = (ae_int8*)buffer; + + const size_t stride = value_table_channel_stride_; + const uint8_t* __restrict value_table = + static_cast(value_table_); + + const uint8_t* __restrict value_table_t = value_table; + + ae_valignx2 align_store = AE_ZALIGN128(); + + for (size_t i = 0; i < num_channels_; i++) { + value_table_t = value_table; + ae_valignx2 align_vtab = AE_LA128_PP(value_table_t); + AE_LA8X8X2_IP(d_value_0_t, d_value_1_t, align_vtab, + (ae_int8x16*)value_table_t); + AE_DSEL8X8(d_value_0, d_value_1, d_value_0_t, d_value_1_t, + d_shuffle_value_t); + + ae_valign align_load = AE_LA64_PP(pIn_tmp); + + for (j = 0; j < elements_per_channel_t_by_4; j++) { + AE_LA8X8_IP(d_index, align_load, pIn_tmp); + AE_DSEL8X8(d_out1, d_out2, d_value_0, d_value_1, d_index); + AE_DSEL8X8(d_out1, d_out2, d_out1, d_out2, d_shuffle_t); + AE_SA8X8X2_IP(d_out1, d_out2, align_store, (ae_int8x16*)p_out_tmp); + } + + value_table += stride; + if (elements_per_channel_t_rem) { + ae_valignx2 align_index = AE_LA128_PP(pIn_tmp); + AE_LAV8X8X2_XP(d_index, d_dummy, align_index, (ae_int8x16*)pIn_tmp, + (elements_per_channel_t_rem >> + 1)); /* Loading 48 bits for decoding 16 weight values */ + AE_DSEL8X8(d_out1, d_out2, d_value_0, d_value_1, d_index); + AE_DSEL8X8(d_out1, d_out2, d_out1, d_out2, d_shuffle_t); + AE_SAV8X8X2_XP(d_out1, d_out2, align_store, (ae_int8x16*)p_out_tmp, + elements_per_channel_t_rem); + } + } + AE_SA128POS_FP(align_store, (ae_int8x16*)p_out_tmp); +} + +void XtensaDecodeStateLUT::DecompressToBufferWidth3_Xtensa(int8_t* buffer) { + ScopedMicroProfiler scoped_profiler(__func__, micro_profiler_); + + int i, j; + ae_int8* __restrict p_out_tmp = (ae_int8*)buffer; + ae_int8x8* pIn_tmp = (ae_int8x8*)compressed_indices_; + const uint8_t* __restrict value_table = + static_cast(value_table_); + + const uint8_t* __restrict value_table_t = value_table; + + int num_channels_t = num_channels_; + const size_t stride = value_table_channel_stride_; + + int elements_per_channel_t_by_4 = elements_per_channel_ >> 4; + int elements_per_channel_t_rem = elements_per_channel_ & 0xF; + + ae_int8x8 d_index, d_dummy; + ae_int8x8 d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11; + ae_int8x8 d_out1, d_out2; + + ae_valignx2 align_index = AE_LA128_PP(pIn_tmp); + + ae_int8x8 d_shuffle_value_t = AE_MOVINT8X8_FROMINT64(0x08192A3B4C5D6E7FLL); + ae_int8x8 d_shuffle_t1 
= AE_MOVINT8X8_FROMINT64(0x0F00050C00020000LL); + ae_int8x8 d_shuffle_t2 = AE_MOVINT8X8_FROMINT64(0x000E00040B000100LL); + ae_int8x8 d_shuffle_t3 = AE_MOVINT8X8_FROMINT64(0x0F060D040C030A01LL); + ae_int8x8 d_shuffle_t = AE_MOVINT8X8_FROMINT64(0xFB73EA62D951C840LL); + + ae_valignx2 align_store = AE_ZALIGN128(); + + for (i = 0; i < num_channels_t; i++) { + ae_int8x8 d_value_0 = AE_MOVINT8X8_FROMINT64(AE_ZERO()); + ae_int8x8 d_value_1 = AE_MOVINT8X8_FROMINT64(AE_ZERO()); + + value_table_t = value_table; + + ae_valign align_vtab = AE_LA64_PP(value_table_t); + AE_LA8X8_IP(d_value_0, align_vtab, (ae_int8x8*)value_table_t); + AE_DSEL8X8(d_value_0, d_value_1, d_value_0, d_value_1, d_shuffle_value_t); + + for (j = 0; j < elements_per_channel_t_by_4; j++) { + AE_LAV8X8X2_XP(d_index, d_dummy, align_index, (ae_int8x16*)pIn_tmp, + 6); /* Loading 48 bits for decoding 16 weight values */ + + d1 = + AE_MOVINT8X8_FROMINT64(AE_SRLI64(AE_MOVINT64_FROMINT8X8(d_index), 1)); + d2 = + AE_MOVINT8X8_FROMINT64(AE_SRLI64(AE_MOVINT64_FROMINT8X8(d_index), 2)); + d3 = + AE_MOVINT8X8_FROMINT64(AE_SRLI64(AE_MOVINT64_FROMINT8X8(d_index), 3)); + d4 = + AE_MOVINT8X8_FROMINT64(AE_SRLI64(AE_MOVINT64_FROMINT8X8(d_index), 4)); + + d1 = AE_MOVINT8X8_FROMINT64( + AE_AND64(AE_MOVINT64_FROMINT8X8(d1), 0x7007007007000000LL)); + d2 = AE_MOVINT8X8_FROMINT64( + AE_AND64(AE_MOVINT64_FROMINT8X8(d2), 0x0700700700700000LL)); + d3 = AE_MOVINT8X8_FROMINT64( + AE_AND64(AE_MOVINT64_FROMINT8X8(d3), 0x0070070070070000LL)); + d4 = AE_MOVINT8X8_FROMINT64( + AE_AND64(AE_MOVINT64_FROMINT8X8(d4), 0x0007007007007000LL)); + + d5 = d1 | d2; + d6 = d3 | d4; + + d7 = AE_MOVINT8X8_FROMINT64(AE_SRLI64(AE_MOVINT64_FROMINT8X8(d5), 4)); + d8 = AE_MOVINT8X8_FROMINT64(AE_SRLI64(AE_MOVINT64_FROMINT8X8(d6), 4)); + + d9 = AE_SEL8X8(d5, d7, d_shuffle_t1); + d10 = AE_SEL8X8(d6, d8, d_shuffle_t2); + d11 = AE_SEL8X8(d9, d10, d_shuffle_t3); + + AE_DSEL8X8(d_out1, d_out2, d_value_0, d_value_1, d11); + AE_DSEL8X8(d_out1, d_out2, d_out1, d_out2, d_shuffle_t); + + AE_SA8X8X2_IP(d_out1, d_out2, align_store, (ae_int8x16*)p_out_tmp); + } + if (elements_per_channel_t_rem) { + AE_LAV8X8X2_XP(d_index, d_dummy, align_index, (ae_int8x16*)pIn_tmp, + 3); /* Loading 48 bits for decoding 16 weight values */ + + d1 = + AE_MOVINT8X8_FROMINT64(AE_SRLI64(AE_MOVINT64_FROMINT8X8(d_index), 1)); + d2 = + AE_MOVINT8X8_FROMINT64(AE_SRLI64(AE_MOVINT64_FROMINT8X8(d_index), 2)); + d3 = + AE_MOVINT8X8_FROMINT64(AE_SRLI64(AE_MOVINT64_FROMINT8X8(d_index), 3)); + d4 = + AE_MOVINT8X8_FROMINT64(AE_SRLI64(AE_MOVINT64_FROMINT8X8(d_index), 4)); + + d1 = AE_MOVINT8X8_FROMINT64( + AE_AND64(AE_MOVINT64_FROMINT8X8(d1), 0x7007007007000000LL)); + d2 = AE_MOVINT8X8_FROMINT64( + AE_AND64(AE_MOVINT64_FROMINT8X8(d2), 0x0700700700700000LL)); + d3 = AE_MOVINT8X8_FROMINT64( + AE_AND64(AE_MOVINT64_FROMINT8X8(d3), 0x0070070070070000LL)); + d4 = AE_MOVINT8X8_FROMINT64( + AE_AND64(AE_MOVINT64_FROMINT8X8(d4), 0x0007007007007000LL)); + + d5 = d1 | d2; + d6 = d3 | d4; + + d7 = AE_MOVINT8X8_FROMINT64(AE_SRLI64(AE_MOVINT64_FROMINT8X8(d5), 4)); + d8 = AE_MOVINT8X8_FROMINT64(AE_SRLI64(AE_MOVINT64_FROMINT8X8(d6), 4)); + + d9 = AE_SEL8X8(d5, d7, d_shuffle_t1); + d10 = AE_SEL8X8(d6, d8, d_shuffle_t2); + d11 = AE_SEL8X8(d9, d10, d_shuffle_t3); + + AE_DSEL8X8(d_out1, d_out2, d_value_0, d_value_1, d11); + AE_DSEL8X8(d_out1, d_out2, d_out1, d_out2, d_shuffle_t); + + AE_SAV8X8X2_XP(d_out1, d_out2, align_store, (ae_int8x16*)p_out_tmp, + elements_per_channel_t_rem); + } + + value_table = value_table + stride; + } + 
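The width-4 and width-3 kernels above are hand-scheduled HiFi5 renderings of one simple contract: walk a most-significant-bit-first stream of fixed-width indices and expand each index through a per-channel value table. A portable scalar sketch of that contract, with semantics inferred from the intrinsic sequences and from the generic paths later in this file:

    #include <cstddef>
    #include <cstdint>

    // Scalar reference for sub-byte LUT decompression (a sketch, not the
    // patch's code). 'indices' is a packed MSB-first bitstream of
    // bit_width-bit values; each channel has its own slice of the table.
    void DecompressLutRef(const uint8_t* indices, const int8_t* value_table,
                          size_t num_channels, size_t elements_per_channel,
                          size_t value_table_channel_stride, size_t bit_width,
                          int8_t* out) {
      size_t bit_pos = 0;
      for (size_t c = 0; c < num_channels; ++c) {
        const int8_t* lut = value_table + c * value_table_channel_stride;
        for (size_t e = 0; e < elements_per_channel; ++e) {
          uint32_t index = 0;
          for (size_t b = 0; b < bit_width; ++b, ++bit_pos) {
            // Pull one bit, most significant bit of each byte first.
            index = (index << 1) |
                    ((indices[bit_pos >> 3] >> (7 - (bit_pos & 7))) & 1);
          }
          *out++ = lut[index];
        }
      }
    }

The SIMD variants specialize this per width: width 4 shuffles two preloaded halves of the value table with AE_DSEL8X8, width 3 reassembles the fields with the shift/mask ladder shown above, and channel rows that are not byte-aligned fall back to the generic bit-reader path.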
AE_SA128POS_FP(align_store, (ae_int8x16*)p_out_tmp); +} + +void XtensaDecodeStateLUT::DecompressToBufferWidth2_Xtensa(int8_t* buffer) { + ScopedMicroProfiler scoped_profiler(__func__, micro_profiler_); + + int i, j; + ae_int8* __restrict p_out_tmp = (ae_int8*)buffer; + ae_int8x8* pIn_tmp = (ae_int8x8*)compressed_indices_; + const uint8_t* __restrict value_table = + static_cast(value_table_); + + const uint8_t* __restrict value_table_t = value_table; + + int num_channels_t = num_channels_; + const size_t stride = value_table_channel_stride_; + + int elements_per_channel_t_by_5 = elements_per_channel_ >> 5; + int elements_per_channel_t_rem = elements_per_channel_ & 0x1F; + int elements_per_channel_t_rem_minus_16 = 0; + if (elements_per_channel_t_rem > 16) { + elements_per_channel_t_rem_minus_16 = elements_per_channel_t_rem - 16; + } + + ae_int8x8 d_index, d_dummy; + ae_int8x8 d0, d1, d2, d3, d4, d5; + ae_int8x8 q0, q1, q2, q3; + ae_int8x8 d_out1, d_out2; + + ae_valignx2 align_index = AE_LA128_PP(pIn_tmp); + + ae_int8x8 d_shuffle_value_t = AE_MOVINT8X8_FROMINT64(0x08192A3B4C5D6E7FLL); + ae_int8x8 d_shuffle_t1 = AE_MOVINT8X8_FROMINT64(0xFB73EA62D951C840LL); + ae_int8x8 d_shuffle_t2 = AE_MOVINT8X8_FROMINT64(0xFBEA7362D9C85140LL); + + ae_valignx2 align_store = AE_ZALIGN128(); + + for (i = 0; i < num_channels_t; i++) { + ae_int8x8 d_value_0 = AE_MOVINT8X8_FROMINT64(AE_ZERO()); + ae_int8x8 d_value_1 = AE_MOVINT8X8_FROMINT64(AE_ZERO()); + + value_table_t = value_table; + + ae_valign align_vtab = AE_LA64_PP(value_table_t); + AE_LA8X8_IP(d_value_0, align_vtab, (ae_int8x8*)value_table_t); + AE_DSEL8X8(d_value_0, d_value_1, d_value_0, d_value_1, d_shuffle_value_t); + + for (j = 0; j < elements_per_channel_t_by_5; j++) { + // AE_LA8X8_IP( d_index, align_index, pIn_tmp ); /* Loading 64 bits + // for decoding 32 weight values */ + + AE_LAV8X8X2_XP(d_index, d_dummy, align_index, (ae_int8x16*)pIn_tmp, + 8); /* Loading 64 bits for decoding 32 weight values */ + d0 = d_index; + d1 = + AE_MOVINT8X8_FROMINT64(AE_SRLI64(AE_MOVINT64_FROMINT8X8(d_index), 2)); + + d2 = AE_MOVINT8X8_FROMINT64( + AE_AND64(AE_MOVINT64_FROMINT8X8(d0), + 0x3333333333333333LL)); // i1,i3,i5, .... + d3 = AE_MOVINT8X8_FROMINT64( + AE_AND64(AE_MOVINT64_FROMINT8X8(d1), + 0x3333333333333333LL)); // i0,i2,i4, .... + + AE_DSEL8X8(d4, d5, d3, d2, + d_shuffle_t1); // d4 = i0,i2,i1,i3,i4,i6,... d5 = + // i16,i18, i17,i19, .... + + AE_DSEL8X8(q0, q1, d_value_0, d_value_1, + d4); // q0 = 0,1,4,5,8,9,12,13 q1 = 2,3,6,7,10,11,14,15 + AE_DSEL8X8( + q2, q3, d_value_0, d_value_1, + d5); // q2 = 16,17,20,21,24,25,28,29 q3 = 18,19,22,23,26,27,30,31 + + AE_DSEL8X8(d_out1, d_out2, q0, q1, d_shuffle_t2); + AE_SA8X8X2_IP(d_out1, d_out2, align_store, (ae_int8x16*)p_out_tmp); + + AE_DSEL8X8(d_out1, d_out2, q2, q3, d_shuffle_t2); + AE_SA8X8X2_IP(d_out1, d_out2, align_store, (ae_int8x16*)p_out_tmp); + } + if (elements_per_channel_t_rem) { + AE_LAV8X8X2_XP(d_index, d_dummy, align_index, (ae_int8x16*)pIn_tmp, + (elements_per_channel_t_rem >> + 2)); /* Loading 48 bits for decoding 16 weight values */ + d0 = d_index; + d1 = + AE_MOVINT8X8_FROMINT64(AE_SRLI64(AE_MOVINT64_FROMINT8X8(d_index), 2)); + d2 = AE_MOVINT8X8_FROMINT64( + AE_AND64(AE_MOVINT64_FROMINT8X8(d0), + 0x3333333333333333LL)); // i1,i3,i5, .... + d3 = AE_MOVINT8X8_FROMINT64( + AE_AND64(AE_MOVINT64_FROMINT8X8(d1), + 0x3333333333333333LL)); // i0,i2,i4, .... + + AE_DSEL8X8(d4, d5, d3, d2, + d_shuffle_t1); // d4 = i0,i2,i1,i3,i4,i6,... d5 = + // i16,i18, i17,i19, .... 
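    // Editor's note (worked example; MSB-first packing assumed, as in the
    // reference path): the 0x3333... masks above split each packed byte into
    // its even and odd 2-bit fields before the table shuffle. For the byte
    // 0xE4 = 0b11'10'01'00, i.e. fields i0..i3 = 3,2,1,0:
    //   (0xE4 >> 2) & 0x33 = 0x31 -> fields 11 and 01 = i0, i2 (even)
    //    0xE4       & 0x33 = 0x20 -> fields 10 and 00 = i1, i3 (odd)
    // AE_DSEL8X8 with d_shuffle_t1 then re-interleaves the two halves back
    // into stream order before the value-table lookup.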
+ + AE_DSEL8X8(q0, q1, d_value_0, d_value_1, + d4); // q0 = 0,1,4,5,8,9,12,13 q1 = 2,3,6,7,10,11,14,15 + AE_DSEL8X8( + q2, q3, d_value_0, d_value_1, + d5); // q2 = 16,17,20,21,24,25,28,29 q3 = 18,19,22,23,26,27,30,31 + + AE_DSEL8X8(d_out1, d_out2, q0, q1, d_shuffle_t2); + + AE_SAV8X8X2_XP(d_out1, d_out2, align_store, (ae_int8x16*)p_out_tmp, + elements_per_channel_t_rem); + + AE_DSEL8X8(d_out1, d_out2, q2, q3, d_shuffle_t2); + + AE_SAV8X8X2_XP(d_out1, d_out2, align_store, (ae_int8x16*)p_out_tmp, + elements_per_channel_t_rem_minus_16); + } + + value_table = value_table + stride; + } + AE_SA128POS_FP(align_store, (ae_int8x16*)p_out_tmp); +} + +void XtensaDecodeStateLUT::DecompressToBufferWidthAnyInt8_Xtensa( + int8_t* buffer) { + ScopedMicroProfiler scoped_profiler(__func__, micro_profiler_); + + const int stride = value_table_channel_stride_; + const uint8_t* __restrict value_table = + static_cast(value_table_); + + int num_channels_t = num_channels_; + short* __restrict p_stream = (short*)compressed_indices_; + uint32_t index; + ae_int8* __restrict p_out_tmp = (ae_int8*)buffer; + const size_t bw = compressed_bit_width_; + + WUR_AE_BITPTR(0); + WUR_AE_BITHEAD(0); + + AE_DBI_IP((const unsigned short*)p_stream, 16); + AE_DBI_IP((const unsigned short*)p_stream, 16); + + if (use_alternate_axis_) { + int count = count_indices_; + const uint8_t* __restrict value_table_t = value_table; + + while (count > 0) { + value_table = value_table_t; + + for (int channel = 0; channel < num_channels_t; channel++) { + AE_LB_DB_IP((unsigned short*)p_stream, index, bw); + ae_int8x8 d_tmp = AE_L8_X((const ae_int8*)value_table, index); + AE_S8_0_IP(d_tmp, p_out_tmp, 1); + value_table += stride; + } + + count -= num_channels_t; + } + } else { + int elements_per_channel_t = elements_per_channel_; + uint32_t index_1, index_2; + uint32_t mask_bits = (1 << compressed_bit_width_) - 1; + + for (int i = 0; i < num_channels_t; i++) { + elements_per_channel_t = elements_per_channel_; + /* if output pointer is not 2 byte aligned */ + if ((unsigned int)p_out_tmp & 0x1) { + AE_LB_DB_IP((unsigned short*)p_stream, index, bw); + ae_int8x8 d_tmp = AE_L8_X((const ae_int8*)value_table, index); + AE_S8_0_IP(d_tmp, p_out_tmp, 1); + elements_per_channel_t = elements_per_channel_t - 1; + } + for (int j = 0; j < (elements_per_channel_t >> 1); j++) { + AE_LB_DB_IP((unsigned short*)p_stream, index, 2 * bw); + index_1 = (index >> compressed_bit_width_) & mask_bits; + index_2 = (index)&mask_bits; + ae_int8x8 d_tmp1 = AE_L8_X((const ae_int8*)value_table, index_1); + ae_int8x8 d_tmp2 = AE_L8_X((const ae_int8*)value_table, index_2); + ae_int16x4 d_tmp = + AE_MOVINT16X4_FROMINT8X8(AE_SEL8X8I(d_tmp2, d_tmp1, 21)); + AE_S16_0_IP(d_tmp, (ae_int16*)p_out_tmp, 2); + } + if (elements_per_channel_t & 0x1) { + AE_LB_DB_IP((unsigned short*)p_stream, index, bw); + ae_int8x8 d_tmp = AE_L8_X((const ae_int8*)value_table, index); + AE_S8_0_IP(d_tmp, p_out_tmp, 1); + } + value_table += stride; + } + } +} + +void XtensaDecodeStateLUT::DecompressToBufferWidthAnyInt16_Xtensa( + int16_t* buffer) { + ScopedMicroProfiler scoped_profiler(__func__, micro_profiler_); + + const int stride = value_table_channel_stride_; + const uint16_t* __restrict value_table = + static_cast(value_table_); + + int num_channels_t = num_channels_; + short* __restrict p_stream = (short*)compressed_indices_; + uint32_t index; + ae_int16* __restrict p_out_tmp = (ae_int16*)buffer; + const size_t bw = compressed_bit_width_; + + WUR_AE_BITPTR(0); + WUR_AE_BITHEAD(0); + + AE_DBI_IP((const 
unsigned short*)p_stream, 16); + AE_DBI_IP((const unsigned short*)p_stream, 16); + + if (use_alternate_axis_) { + int count = count_indices_; + const uint16_t* __restrict value_table_t = value_table; + + while (count > 0) { + value_table = value_table_t; + + for (int channel = 0; channel < num_channels_t; channel++) { + AE_LB_DB_IP((unsigned short*)p_stream, index, bw); + ae_int16x4 d_tmp = AE_L16_X((const ae_int16*)value_table, index << 1); + AE_S16_0_IP(d_tmp, p_out_tmp, 2); + value_table += stride; + } + + count -= num_channels_t; + } + } else { + int elements_per_channel_t = elements_per_channel_; + + for (int i = 0; i < num_channels_t; i++) { + for (int j = 0; j < elements_per_channel_t; j++) { + AE_LB_DB_IP((unsigned short*)p_stream, index, bw); + ae_int16x4 d_tmp = AE_L16_X((const ae_int16*)value_table, index << 1); + AE_S16_0_IP(d_tmp, p_out_tmp, 2); + } + + value_table += stride; + } + } +} + +void XtensaDecodeStateLUT::DecompressToBufferWidthAnyInt32_Xtensa( + int32_t* buffer) { + ScopedMicroProfiler scoped_profiler(__func__, micro_profiler_); + + const int stride = value_table_channel_stride_; + const uint32_t* __restrict value_table = + static_cast(value_table_); + + int num_channels_t = num_channels_; + short* __restrict p_stream = (short*)compressed_indices_; + uint32_t index; + ae_int32* __restrict p_out_tmp = (ae_int32*)buffer; + const size_t bw = compressed_bit_width_; + + WUR_AE_BITPTR(0); + WUR_AE_BITHEAD(0); + + AE_DBI_IP((const unsigned short*)p_stream, 16); + AE_DBI_IP((const unsigned short*)p_stream, 16); + + if (use_alternate_axis_) { + int count = count_indices_; + const uint32_t* __restrict value_table_t = value_table; + + while (count > 0) { + value_table = value_table_t; + + for (int channel = 0; channel < num_channels_t; channel++) { + AE_LB_DB_IP((unsigned short*)p_stream, index, bw); + ae_int32x2 d_tmp = AE_L32_X((const ae_int32*)value_table, index << 2); + AE_S32_L_IP(d_tmp, p_out_tmp, 4); + value_table += stride; + } + + count -= num_channels_t; + } + } else { + int elements_per_channel_t = elements_per_channel_; + + for (int i = 0; i < num_channels_t; i++) { + for (int j = 0; j < elements_per_channel_t; j++) { + AE_LB_DB_IP((unsigned short*)p_stream, index, bw); + ae_int32x2 d_tmp = AE_L32_X((const ae_int32*)value_table, index << 2); + AE_S32_L_IP(d_tmp, p_out_tmp, 4); + } + + value_table += stride; + } + } +} + +void XtensaDecodeStateLUT::DecompressToBufferWidthAnyInt64_Xtensa( + int64_t* buffer) { + ScopedMicroProfiler scoped_profiler(__func__, micro_profiler_); + + const int stride = value_table_channel_stride_; + const uint64_t* __restrict value_table = + static_cast(value_table_); + + int num_channels_t = num_channels_; + short* __restrict p_stream = (short*)compressed_indices_; + uint32_t index; + ae_int64* __restrict p_out_tmp = (ae_int64*)buffer; + const size_t bw = compressed_bit_width_; + + WUR_AE_BITPTR(0); + WUR_AE_BITHEAD(0); + + AE_DBI_IP((const unsigned short*)p_stream, 16); + AE_DBI_IP((const unsigned short*)p_stream, 16); + + if (use_alternate_axis_) { + int count = count_indices_; + const uint64_t* __restrict value_table_t = value_table; + + while (count > 0) { + value_table = value_table_t; + + for (int channel = 0; channel < num_channels_t; channel++) { + AE_LB_DB_IP((unsigned short*)p_stream, index, bw); + ae_int64 d_tmp = AE_L64_X((const ae_int64*)value_table, index << 3); + AE_S64_IP(d_tmp, p_out_tmp, 8); + value_table += stride; + } + + count -= num_channels_t; + } + } else { + int elements_per_channel_t = elements_per_channel_; + + 
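    // Editor's note: the four Any-width routines are element-size clones of a
    // single pattern -- reset the bit reader (WUR_AE_BITPTR / WUR_AE_BITHEAD),
    // prime it with two AE_DBI_IP reads, then let AE_LB_DB_IP deliver
    // compressed_bit_width_ bits per index while the value table is addressed
    // at index << log2(sizeof(element)). Sketched as a template (hypothetical;
    // the patch keeps four concrete copies, presumably for the DSP compiler):
    //
    //   template <typename T>
    //   void DecompressAnyWidthRef(BitReaderMsbFirst& bits, const T* lut,
    //                              size_t count, size_t bit_width, T* out) {
    //     for (size_t i = 0; i < count; ++i) {
    //       *out++ = lut[bits.Read(bit_width)];
    //     }
    //   }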
for (int i = 0; i < num_channels_t; i++) { + for (int j = 0; j < elements_per_channel_t; j++) { + AE_LB_DB_IP((unsigned short*)p_stream, index, bw); + ae_int64 d_tmp = AE_L64_X((const ae_int64*)value_table, index << 3); + AE_S64_IP(d_tmp, p_out_tmp, 8); + } + + value_table += stride; + } + } +} + +void XtensaDecodeStateLUT::DecompressToBuffer(int8_t* buffer) { + if (compressed_bit_width_ == 4 && !use_alternate_axis_) { + if (!(elements_per_channel_ & 0x01)) { + DecompressToBufferWidth4_Xtensa(buffer); + } else { + DecompressToBufferWidthAnyInt8_Xtensa(buffer); + } + } else if (compressed_bit_width_ == 3 && !use_alternate_axis_) { + if (!(elements_per_channel_ & 0x07)) { + DecompressToBufferWidth3_Xtensa(buffer); + } else { + DecompressToBufferWidthAnyInt8_Xtensa(buffer); + } + } else if (compressed_bit_width_ == 2 && !use_alternate_axis_) { + if (!(elements_per_channel_ & 0x03)) { + DecompressToBufferWidth2_Xtensa(buffer); + } else { + DecompressToBufferWidthAnyInt8_Xtensa(buffer); + } + } else { + DecompressToBufferWidthAnyInt8_Xtensa(buffer); + } +} + +TfLiteStatus XtensaDecodeStateLUT::Decode(const TfLiteEvalTensor& input, + const TfLiteEvalTensor& ancillary, + const TfLiteEvalTensor& output) { + TFLITE_DCHECK(compressed_bit_width_ <= kMaxBitWidth); + TFLITE_DCHECK(compressed_bit_width_ > 0); + + void* const buffer = const_cast(micro::GetTensorData(&output)); + TFLITE_DCHECK(buffer != nullptr); + + switch (output.type) { + case kTfLiteBool: + DecompressToBuffer(static_cast(buffer)); + break; + case kTfLiteFloat32: + DecompressToBufferWidthAnyInt32_Xtensa(static_cast(buffer)); + break; + case kTfLiteInt8: + DecompressToBuffer(static_cast(buffer)); + break; + case kTfLiteInt16: + DecompressToBufferWidthAnyInt16_Xtensa(static_cast(buffer)); + break; + case kTfLiteInt32: + DecompressToBufferWidthAnyInt32_Xtensa(static_cast(buffer)); + break; + case kTfLiteInt64: + DecompressToBufferWidthAnyInt64_Xtensa(static_cast(buffer)); + break; + default: + MicroPrintf("unsupported tensor type %s", TfLiteTypeGetName(output.type)); + return kTfLiteError; + } + + return kTfLiteOk; +} + +} // namespace tflite diff --git a/tensorflow/lite/micro/kernels/xtensa/xtensa_decode_state_lut.h b/tensorflow/lite/micro/kernels/xtensa/xtensa_decode_state_lut.h new file mode 100644 index 00000000000..b614887a4cc --- /dev/null +++ b/tensorflow/lite/micro/kernels/xtensa/xtensa_decode_state_lut.h @@ -0,0 +1,57 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
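DecompressToBuffer above gates each specialized kernel on elements_per_channel_: even counts for width 4, multiples of 8 for width 3, multiples of 4 for width 2. All three conditions state the same invariant, namely that every channel's packed index row occupies a whole number of bytes, so the wide aligned loads never straddle a channel boundary. Sketched as a single predicate (hypothetical helper, not in the patch):

    // True when a channel's packed indices fill a whole number of bytes.
    constexpr bool ChannelRowIsByteAligned(size_t bit_width,
                                           size_t elements_per_channel) {
      return (bit_width * elements_per_channel) % 8 == 0;
    }
    // ChannelRowIsByteAligned(4, n) == (n % 2 == 0)
    // ChannelRowIsByteAligned(3, n) == (n % 8 == 0)
    // ChannelRowIsByteAligned(2, n) == (n % 4 == 0)

Note also how the Decode switch routes types by size alone: kTfLiteBool shares the int8 path and kTfLiteFloat32 the int32 path, since decompression only moves bit patterns.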
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_MICRO_MICRO_KERNELS_XTENSA_DECODE_STATE_LUT_H_
+#define TENSORFLOW_LITE_MICRO_MICRO_KERNELS_XTENSA_DECODE_STATE_LUT_H_
+
+#include <cstdint>
+
+#include "tensorflow/lite/micro/compatibility.h"
+#include "tensorflow/lite/micro/kernels/decode_state_lut.h"
+
+namespace tflite {
+
+struct XtensaDecodeStateLUT : public DecodeStateLUT {
+  XtensaDecodeStateLUT() = delete;
+
+  XtensaDecodeStateLUT(const TfLiteContext* context,
+                       MicroProfilerInterface* profiler)
+      : DecodeStateLUT(context, profiler) {}
+
+  virtual TfLiteStatus Decode(const TfLiteEvalTensor& input,
+                              const TfLiteEvalTensor& ancillary,
+                              const TfLiteEvalTensor& output) override;
+
+ protected:
+  virtual ~XtensaDecodeStateLUT() = default;
+
+  void DecompressToBuffer(int8_t* buffer);
+
+  void DecompressToBufferWidth4_Xtensa(int8_t* buffer);
+  void DecompressToBufferWidth3_Xtensa(int8_t* buffer);
+  void DecompressToBufferWidth2_Xtensa(int8_t* buffer);
+
+  void DecompressToBufferWidthAnyInt8_Xtensa(int8_t* buffer);
+  void DecompressToBufferWidthAnyInt16_Xtensa(int16_t* buffer);
+  void DecompressToBufferWidthAnyInt32_Xtensa(int32_t* buffer);
+  void DecompressToBufferWidthAnyInt64_Xtensa(int64_t* buffer);
+
+ private:
+  TF_LITE_REMOVE_VIRTUAL_DELETE
+};
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_MICRO_MICRO_KERNELS_XTENSA_DECODE_STATE_LUT_H_
diff --git a/tensorflow/lite/micro/tools/make/ext_libs/xtensa.inc b/tensorflow/lite/micro/tools/make/ext_libs/xtensa.inc
index 70e1880c800..38a959d5fe5 100644
--- a/tensorflow/lite/micro/tools/make/ext_libs/xtensa.inc
+++ b/tensorflow/lite/micro/tools/make/ext_libs/xtensa.inc
@@ -1,33 +1,5 @@
-# Explicitly add kernel sources specific to the Xtensa optimized
-# implementations.
-MICROLITE_CC_KERNEL_SRCS += \
-  $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/add_vision.cc \
-  $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_common_xtensa.cc \
-  $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_hifi.cc \
-  $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_int16_reference.cc \
-  $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_int8_int16.cc \
-  $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_int8_reference.cc \
-  $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_vision.cc \
-  $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/depthwise_conv_hifi.cc \
-  $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/depthwise_conv_vision.cc \
-  $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/fully_connected_common_xtensa.cc \
-  $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/fully_connected_int8.cc \
-  $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/fully_connected_vision.cc \
-  $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/pad_vision.cc \
-  $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/pooling_int8.cc \
-  $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/pooling_vision.cc \
-  $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/reduce_vision.cc \
-  $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/reshape_vision.cc \
-  $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/softmax_int8_int16.cc \
-  $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/softmax_vision.cc
-
 ifeq ($(TARGET_ARCH), hifimini)
-  # hifimini optimizations are implemented in the TFLM repository itself.
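On the class shape in xtensa_decode_state_lut.h above: the protected virtual destructor plus TF_LITE_REMOVE_VIRTUAL_DELETE is the usual TFLM pattern for objects that live out their lives in the persistent arena. In static-memory builds the macro stubs out the class-specific operator delete, roughly as below (an approximation; see tensorflow/lite/micro/compatibility.h for the real definition), so the compiler-generated deleting destructor never pulls the global ::operator delete into the link:

    // Approximate sketch of the macro's effect; not the verbatim definition.
    #define TF_LITE_REMOVE_VIRTUAL_DELETE \
      void operator delete(void* p) {}

Together with the deleted default constructor, the only way to obtain an XtensaDecodeStateLUT is through DecodeState::CreateDecodeStateLUT's placement new.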
- THIRD_PARTY_KERNEL_CC_SRCS += \ - $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/hifimini/svdf.cc \ - $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/hifimini/fully_connected.cc - FFT_PATH := $(MAKEFILE_DIR)/downloads/hifi_fft INCLUDES += -I$(FFT_PATH)/ diff --git a/tensorflow/lite/micro/tools/make/targets/xtensa_makefile.inc b/tensorflow/lite/micro/tools/make/targets/xtensa_makefile.inc index e9f940392b1..8b3529e6f28 100644 --- a/tensorflow/lite/micro/tools/make/targets/xtensa_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/xtensa_makefile.inc @@ -91,20 +91,59 @@ EXCLUDED_EXAMPLE_TESTS := \ MICRO_LITE_EXAMPLE_TESTS := $(filter-out $(EXCLUDED_EXAMPLE_TESTS), $(MICRO_LITE_EXAMPLE_TESTS)) MICRO_LITE_EXAMPLE_TESTS += $(shell find $(TENSORFLOW_ROOT)third_party/xtensa/examples/ -name Makefile.inc) -# Needed for LSTM support. -MICROLITE_CC_KERNEL_SRCS := $(MICROLITE_CC_KERNEL_SRCS) \ -$(TENSORFLOW_ROOT)tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc \ -$(TENSORFLOW_ROOT)tensorflow/lite/kernels/kernel_util.cc + ifeq ($(OPTIMIZED_KERNEL_DIR), xtensa) - MICROLITE_CC_KERNEL_SRCS := $(MICROLITE_CC_KERNEL_SRCS) \ - $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/lstm_eval.cc \ - $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/lstm_eval_hifi.cc \ - $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/unidirectional_sequence_lstm.cc + # Explicitly add kernel sources specific to the Xtensa optimized + # implementations. + MICROLITE_CC_KERNEL_SRCS += \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/add_vision.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_common_xtensa.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_hifi.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_int16_reference.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_int8_int16.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_int8_reference.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_vision.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/depthwise_conv_hifi.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/depthwise_conv_vision.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/fully_connected_common_xtensa.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/fully_connected_int8.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/fully_connected_vision.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/lstm_eval_hifi.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/pad_vision.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/pooling_int8.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/pooling_vision.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/reduce_vision.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/reshape_vision.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/softmax_int8_int16.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/softmax_vision.cc + + # Needed for LSTM support. + MICROLITE_CC_KERNEL_SRCS += \ + $(TENSORFLOW_ROOT)tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/kernels/kernel_util.cc + + # There is no overlay of the reference kernels for hifimini optimizations. + # Add the optimized kernel sources here. 
+ ifeq ($(TARGET_ARCH), hifimini) + MICROLITE_CC_KERNEL_SRCS += \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/hifimini/fully_connected.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/hifimini/svdf.cc + endif + + # Additional kernel sources for DECODE operator support + ifeq ($(TARGET_ARCH), $(filter $(TARGET_ARCH), hifi5)) + MICROLITE_CC_KERNEL_SRCS += \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/xtensa_decode_state_lut.cc + endif +endif - # override KERNEL_OPTIMIZATION_LEVEL to enable higher performance - # Xtensa intrinsics. +# override KERNEL_OPTIMIZATION_LEVEL to enable higher performance +# Xtensa intrinsics. $(KERNEL_OBJDIR)$(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/decompress.o: $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/decompress.cc @mkdir -p $(dir $@) $(CXX) $(CXXFLAGS) -O3 -LNO:simd $(INCLUDES) -c $< -o $@ -endif + +$(KERNEL_OBJDIR)$(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/xtensa_decode_state_lut.o: $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/xtensa_decode_state_lut.cc + @mkdir -p $(dir $@) + $(CXX) $(CXXFLAGS) -O3 -LNO:simd $(INCLUDES) -c $< -o $@ From 562fb191a6c36d79590f1ddd6e7577d392cce311 Mon Sep 17 00:00:00 2001 From: ddavis-2015 Date: Mon, 21 Jul 2025 11:38:52 -0700 Subject: [PATCH 5/8] Updates to Xtensa makefiles @tensorflow/micro Reorganize Xtensa makefiles such that all references to optimized kernel sources are moved to the Xtensa target makefile. Move hifimini kernel sources to the parent directory, and rename them so they do not interfere with the target overlay mechanism of the root makefile. bug=fixes #3153 --- ...nt_utils.h => fixedpoint_utils_hifimini.h} | 0 ...nnected.cc => fully_connected_hifimini.cc} | 4 +- .../{hifimini/svdf.cc => svdf_hifimini.cc} | 4 +- .../lite/micro/tools/make/ext_libs/xtensa.inc | 28 ----------- .../tools/make/targets/xtensa_makefile.inc | 47 ++++++++++++++----- 5 files changed, 39 insertions(+), 44 deletions(-) rename tensorflow/lite/micro/kernels/xtensa/{hifimini/fixedpoint_utils.h => fixedpoint_utils_hifimini.h} (100%) rename tensorflow/lite/micro/kernels/xtensa/{hifimini/fully_connected.cc => fully_connected_hifimini.cc} (97%) rename tensorflow/lite/micro/kernels/xtensa/{hifimini/svdf.cc => svdf_hifimini.cc} (98%) diff --git a/tensorflow/lite/micro/kernels/xtensa/hifimini/fixedpoint_utils.h b/tensorflow/lite/micro/kernels/xtensa/fixedpoint_utils_hifimini.h similarity index 100% rename from tensorflow/lite/micro/kernels/xtensa/hifimini/fixedpoint_utils.h rename to tensorflow/lite/micro/kernels/xtensa/fixedpoint_utils_hifimini.h diff --git a/tensorflow/lite/micro/kernels/xtensa/hifimini/fully_connected.cc b/tensorflow/lite/micro/kernels/xtensa/fully_connected_hifimini.cc similarity index 97% rename from tensorflow/lite/micro/kernels/xtensa/hifimini/fully_connected.cc rename to tensorflow/lite/micro/kernels/xtensa/fully_connected_hifimini.cc index b63c5001a91..6698b0f7191 100644 --- a/tensorflow/lite/micro/kernels/xtensa/hifimini/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/xtensa/fully_connected_hifimini.cc @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -25,7 +25,7 @@ limitations under the License. 
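With the optimized kernel sources now listed in the target makefile, a build that exercises the hifi5 DECODE fast path would look roughly like this (variable names follow the conventions visible above; the core/tools values and the exact test target name are assumptions):

    # Hypothetical invocation; substitute your core and tools version.
    make -f tensorflow/lite/micro/tools/make/Makefile \
      TARGET=xtensa TARGET_ARCH=hifi5 OPTIMIZED_KERNEL_DIR=xtensa \
      XTENSA_CORE=<your_core> XTENSA_TOOLS_VERSION=<tools_version> \
      test_kernel_decode_test

The ifeq/filter guard above means xtensa_decode_state_lut.cc is compiled only for hifi5; every other TARGET_ARCH silently falls back to the portable DecodeStateLUT.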
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/micro/kernels/kernel_util.h" -#include "tensorflow/lite/micro/kernels/xtensa/hifimini/fixedpoint_utils.h" +#include "tensorflow/lite/micro/kernels/xtensa/fixedpoint_utils_hifimini.h" #include "tensorflow/lite/micro/kernels/xtensa/xtensa.h" namespace tflite { diff --git a/tensorflow/lite/micro/kernels/xtensa/hifimini/svdf.cc b/tensorflow/lite/micro/kernels/xtensa/svdf_hifimini.cc similarity index 98% rename from tensorflow/lite/micro/kernels/xtensa/hifimini/svdf.cc rename to tensorflow/lite/micro/kernels/xtensa/svdf_hifimini.cc index 08ef4d9bb0c..056091b8490 100644 --- a/tensorflow/lite/micro/kernels/xtensa/hifimini/svdf.cc +++ b/tensorflow/lite/micro/kernels/xtensa/svdf_hifimini.cc @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -27,7 +27,7 @@ limitations under the License. #include "tensorflow/lite/kernels/op_macros.h" #include "tensorflow/lite/micro/kernels/activation_utils.h" #include "tensorflow/lite/micro/kernels/kernel_util.h" -#include "tensorflow/lite/micro/kernels/xtensa/hifimini/fixedpoint_utils.h" +#include "tensorflow/lite/micro/kernels/xtensa/fixedpoint_utils_hifimini.h" #include "tensorflow/lite/micro/kernels/xtensa/xtensa.h" #include "tensorflow/lite/micro/kernels/xtensa/xtensa_svdf.h" diff --git a/tensorflow/lite/micro/tools/make/ext_libs/xtensa.inc b/tensorflow/lite/micro/tools/make/ext_libs/xtensa.inc index 70e1880c800..38a959d5fe5 100644 --- a/tensorflow/lite/micro/tools/make/ext_libs/xtensa.inc +++ b/tensorflow/lite/micro/tools/make/ext_libs/xtensa.inc @@ -1,33 +1,5 @@ -# Explicitly add kernel sources specific to the Xtensa optimized -# implementations. 
-MICROLITE_CC_KERNEL_SRCS += \ - $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/add_vision.cc \ - $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_common_xtensa.cc \ - $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_hifi.cc \ - $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_int16_reference.cc \ - $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_int8_int16.cc \ - $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_int8_reference.cc \ - $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_vision.cc \ - $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/depthwise_conv_hifi.cc \ - $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/depthwise_conv_vision.cc \ - $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/fully_connected_common_xtensa.cc \ - $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/fully_connected_int8.cc \ - $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/fully_connected_vision.cc \ - $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/pad_vision.cc \ - $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/pooling_int8.cc \ - $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/pooling_vision.cc \ - $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/reduce_vision.cc \ - $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/reshape_vision.cc \ - $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/softmax_int8_int16.cc \ - $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/softmax_vision.cc - ifeq ($(TARGET_ARCH), hifimini) - # hifimini optimizations are implemented in the TFLM repository itself. - THIRD_PARTY_KERNEL_CC_SRCS += \ - $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/hifimini/svdf.cc \ - $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/hifimini/fully_connected.cc - FFT_PATH := $(MAKEFILE_DIR)/downloads/hifi_fft INCLUDES += -I$(FFT_PATH)/ diff --git a/tensorflow/lite/micro/tools/make/targets/xtensa_makefile.inc b/tensorflow/lite/micro/tools/make/targets/xtensa_makefile.inc index e9f940392b1..b05a0670248 100644 --- a/tensorflow/lite/micro/tools/make/targets/xtensa_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/xtensa_makefile.inc @@ -91,20 +91,43 @@ EXCLUDED_EXAMPLE_TESTS := \ MICRO_LITE_EXAMPLE_TESTS := $(filter-out $(EXCLUDED_EXAMPLE_TESTS), $(MICRO_LITE_EXAMPLE_TESTS)) MICRO_LITE_EXAMPLE_TESTS += $(shell find $(TENSORFLOW_ROOT)third_party/xtensa/examples/ -name Makefile.inc) -# Needed for LSTM support. -MICROLITE_CC_KERNEL_SRCS := $(MICROLITE_CC_KERNEL_SRCS) \ -$(TENSORFLOW_ROOT)tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc \ -$(TENSORFLOW_ROOT)tensorflow/lite/kernels/kernel_util.cc - ifeq ($(OPTIMIZED_KERNEL_DIR), xtensa) - MICROLITE_CC_KERNEL_SRCS := $(MICROLITE_CC_KERNEL_SRCS) \ - $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/lstm_eval.cc \ - $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/lstm_eval_hifi.cc \ - $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/unidirectional_sequence_lstm.cc + # Explicitly add kernel sources specific to the Xtensa optimized + # implementations. + # + # Do not include overlays for reference kernel files. 
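The "overlay" this comment refers to is the root makefile's specialization step: for each reference kernel foo.cc it swaps in an identically named file from the optimized kernel directory when one exists. A simplified sketch of that substitution (illustrative only; the real rule lives in the root Makefile):

    # Sketch of the overlay substitution, not the actual rule.
    MICROLITE_CC_KERNEL_SRCS := $(foreach src,$(MICROLITE_CC_KERNEL_SRCS),\
      $(if $(wildcard $(dir $(src))xtensa/$(notdir $(src))),\
           $(dir $(src))xtensa/$(notdir $(src)),$(src)))

Renaming hifimini/svdf.cc and hifimini/fully_connected.cc to svdf_hifimini.cc and fully_connected_hifimini.cc keeps these hifimini-only files from shadowing the reference svdf.cc and fully_connected.cc on other architectures, which is exactly the interference the patch-5 commit message calls out.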
+ MICROLITE_CC_KERNEL_SRCS += \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/add_vision.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_common_xtensa.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_hifi.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_int16_reference.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_int8_int16.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_int8_reference.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_vision.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/depthwise_conv_hifi.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/depthwise_conv_vision.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/fully_connected_common_xtensa.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/fully_connected_hifimini.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/fully_connected_int8.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/fully_connected_vision.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/lstm_eval_hifi.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/pad_vision.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/pooling_int8.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/pooling_vision.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/reduce_vision.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/reshape_vision.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/softmax_int8_int16.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/softmax_vision.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/svdf_hifimini.cc + + # Needed for LSTM support. + MICROLITE_CC_KERNEL_SRCS += \ + $(TENSORFLOW_ROOT)tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc \ + $(TENSORFLOW_ROOT)tensorflow/lite/kernels/kernel_util.cc +endif - # override KERNEL_OPTIMIZATION_LEVEL to enable higher performance - # Xtensa intrinsics. +# override KERNEL_OPTIMIZATION_LEVEL to enable higher performance +# Xtensa intrinsics. $(KERNEL_OBJDIR)$(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/decompress.o: $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/decompress.cc @mkdir -p $(dir $@) $(CXX) $(CXXFLAGS) -O3 -LNO:simd $(INCLUDES) -c $< -o $@ -endif From ad64f56d76c4efffff5da74d596c5ad24a9866d2 Mon Sep 17 00:00:00 2001 From: ddavis-2015 Date: Mon, 21 Jul 2025 12:41:24 -0700 Subject: [PATCH 6/8] Fix incorrect include path. Fix code style errors. --- .../lite/micro/kernels/xtensa/fully_connected_hifimini.cc | 2 +- tensorflow/lite/micro/kernels/xtensa/svdf_hifimini.cc | 2 +- tensorflow/lite/micro/kernels/xtensa/xtensa.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/micro/kernels/xtensa/fully_connected_hifimini.cc b/tensorflow/lite/micro/kernels/xtensa/fully_connected_hifimini.cc index 6698b0f7191..8e36908f03b 100644 --- a/tensorflow/lite/micro/kernels/xtensa/fully_connected_hifimini.cc +++ b/tensorflow/lite/micro/kernels/xtensa/fully_connected_hifimini.cc @@ -14,7 +14,6 @@ limitations under the License. ==============================================================================*/ #if defined(HIFIMINI) -#include "tensorflow/lite/micro/kernels/fully_connected.h" #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" @@ -24,6 +23,7 @@ limitations under the License. 
#include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/fully_connected.h" #include "tensorflow/lite/micro/kernels/kernel_util.h" #include "tensorflow/lite/micro/kernels/xtensa/fixedpoint_utils_hifimini.h" #include "tensorflow/lite/micro/kernels/xtensa/xtensa.h" diff --git a/tensorflow/lite/micro/kernels/xtensa/svdf_hifimini.cc b/tensorflow/lite/micro/kernels/xtensa/svdf_hifimini.cc index 056091b8490..a3da4e68a3c 100644 --- a/tensorflow/lite/micro/kernels/xtensa/svdf_hifimini.cc +++ b/tensorflow/lite/micro/kernels/xtensa/svdf_hifimini.cc @@ -14,7 +14,6 @@ limitations under the License. ==============================================================================*/ #if defined(HIFIMINI) -#include "tensorflow/lite/micro/kernels/svdf.h" #include @@ -27,6 +26,7 @@ limitations under the License. #include "tensorflow/lite/kernels/op_macros.h" #include "tensorflow/lite/micro/kernels/activation_utils.h" #include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/svdf.h" #include "tensorflow/lite/micro/kernels/xtensa/fixedpoint_utils_hifimini.h" #include "tensorflow/lite/micro/kernels/xtensa/xtensa.h" #include "tensorflow/lite/micro/kernels/xtensa/xtensa_svdf.h" diff --git a/tensorflow/lite/micro/kernels/xtensa/xtensa.h b/tensorflow/lite/micro/kernels/xtensa/xtensa.h index 604736ddbd4..9a441f44096 100644 --- a/tensorflow/lite/micro/kernels/xtensa/xtensa.h +++ b/tensorflow/lite/micro/kernels/xtensa/xtensa.h @@ -19,7 +19,7 @@ limitations under the License. #if defined(HIFIMINI) #include -#include "tensorflow/lite/micro/kernels/xtensa/hifimini/fixedpoint_utils.h" +#include "tensorflow/lite/micro/kernels/xtensa/fixedpoint_utils_hifimini.h" #endif // defined(HIFMINI) #if defined(HIFI3) || defined(HIFI4) || defined(HIFI5) From 6622a28512a05ebdc2379dbe12b5bd4275ebf2b8 Mon Sep 17 00:00:00 2001 From: ddavis-2015 Date: Mon, 21 Jul 2025 13:36:01 -0700 Subject: [PATCH 7/8] fix copyright --- tensorflow/lite/micro/kernels/xtensa/xtensa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/micro/kernels/xtensa/xtensa.h b/tensorflow/lite/micro/kernels/xtensa/xtensa.h index 9a441f44096..0e7e51b0cb6 100644 --- a/tensorflow/lite/micro/kernels/xtensa/xtensa.h +++ b/tensorflow/lite/micro/kernels/xtensa/xtensa.h @@ -1,4 +1,4 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. From abe8d0a564cc52efca8cecc59e05577377ff3fed Mon Sep 17 00:00:00 2001 From: ddavis-2015 Date: Mon, 21 Jul 2025 17:39:57 -0700 Subject: [PATCH 8/8] update generic benchmark op resolver size --- tensorflow/lite/micro/tools/benchmarking/op_resolver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/micro/tools/benchmarking/op_resolver.h b/tensorflow/lite/micro/tools/benchmarking/op_resolver.h index 915ad843f5f..a89a2806e92 100644 --- a/tensorflow/lite/micro/tools/benchmarking/op_resolver.h +++ b/tensorflow/lite/micro/tools/benchmarking/op_resolver.h @@ -23,7 +23,7 @@ limitations under the License. 
namespace tflite { -using TflmOpResolver = MicroMutableOpResolver<113>; +using TflmOpResolver = MicroMutableOpResolver<115>; inline TfLiteStatus CreateOpResolver(TflmOpResolver& op_resolver) { TF_LITE_ENSURE_STATUS(op_resolver.AddAbs());
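The capacity bump above (113 to 115) accounts for the registrations this series adds to the benchmark's CreateOpResolver; MicroMutableOpResolver's template argument is a hard ceiling on Add...() calls, and overflowing it fails at registration time, not compile time. A minimal sketch for a model that only needs the new operator (capacity and op set are illustrative, assuming the AddDecode() registration introduced earlier in this series):

    // Sketch: a capacity-1 resolver registering only DECODE.
    tflite::MicroMutableOpResolver<1> op_resolver;
    TF_LITE_ENSURE_STATUS(op_resolver.AddDecode());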