diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 9ca7f59..9e4f615 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -34,7 +34,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ && rm -rf /var/lib/apt/lists/* ENV DEBIAN_FRONTEND=interactive -# Install Tensorrt Runtime RUN curl -fsSL -o cuda-keyring_1.1-1_all.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb \ && dpkg -i cuda-keyring_1.1-1_all.deb \ && apt-get update && apt-get install -y --no-install-recommends \ @@ -59,8 +58,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ curl \ && rm -rf /var/lib/apt/lists/* -# Install pre-commit -RUN python3 -m pip install --no-cache-dir pre-commit==3.8.0 +# Install pre-commit (Ubuntu 24.04 ships PEP 668 managed Python, so allow system install) +# RUN python3 -m pip install --no-cache-dir --break-system-packages pre-commit==3.8.0 # Set working dir (matches VSCode workspace) WORKDIR /deep_ros_ws diff --git a/DEVELOPING.md b/DEVELOPING.md index c0009f5..e290f3a 100644 --- a/DEVELOPING.md +++ b/DEVELOPING.md @@ -26,6 +26,75 @@ This project includes VS Code dev container configurations for easy ROS2 develop - **Build tools**: Includes `colcon` and `rosdep` for ROS development - **Extensions**: C++, CMake, Python, and XML support pre-installed +### Stopping Containers + +After using "Rebuild and Reopen in Container", you can stop containers using: + +**Method 1: VS Code Command (Recommended)** +- Press `Ctrl+Shift+P` (or `Cmd+Shift+P` on Mac) +- Type: `Dev Containers: Reopen Folder Locally` +- This closes the container and returns to local mode + +**Method 2: Stop Container** +- Press `Ctrl+Shift+P` +- Type: `Dev Containers: Stop Container` +- This stops the container but keeps it available for later use + +**Method 3: Using Docker Commands** +From a terminal (outside the container): + +```bash +# List running containers +docker ps + +# Stop a specific container by name +docker stop <container_name> + +# Stop all dev containers +docker ps -q --filter "name=vsc-deep_ros" | xargs -r docker stop + +# Stop and remove containers +docker stop <container_name> +docker rm <container_name> +``` + +**Method 4: Close VS Code Window** +- Simply closing the VS Code window will stop the container when you exit +- The container will remain stopped until you reopen the folder in container mode + +### Restarting Containers + +After stopping a container, you can restart it using: + +**Method 1: VS Code Command (Recommended)** +- Press `Ctrl+Shift+P` (or `Cmd+Shift+P` on Mac) +- Type: `Dev Containers: Reopen in Container` +- This will start the existing container or create a new one if needed + +**Method 2: Rebuild and Reopen** +- Press `Ctrl+Shift+P` +- Type: `Dev Containers: Rebuild and Reopen in Container` +- Use this if you want to rebuild the container from scratch (e.g., after Dockerfile changes) + +**Method 3: Using Docker Commands** +From a terminal (outside the container): + +```bash +# List all containers (including stopped) +docker ps -a + +# Start a stopped container by name +docker start <container_name> + +# Start and attach to container +docker start <container_name> +docker attach <container_name> +``` + +**Method 4: Reopen Folder in VS Code** +- If VS Code detects a devcontainer configuration, it will prompt you to "Reopen in Container" +- Click the notification or use the command palette option + ### Common Commands Inside the container, you can run ros2 commands, colcon commands, rosdep, etc.
diff --git a/deep_core/include/deep_core/plugin_interfaces/backend_memory_allocator.hpp b/deep_core/include/deep_core/plugin_interfaces/backend_memory_allocator.hpp index 2c6728c..30adf59 100644 --- a/deep_core/include/deep_core/plugin_interfaces/backend_memory_allocator.hpp +++ b/deep_core/include/deep_core/plugin_interfaces/backend_memory_allocator.hpp @@ -101,6 +101,17 @@ class BackendMemoryAllocator */ void copy_device_to_device(void * dst, const void * src, size_t bytes); + /** + * @brief Copy data from host (CPU) to device memory - alias for copy_from_host + * @param dst Destination device memory pointer + * @param src Source host memory pointer + * @param bytes Number of bytes to copy + */ + void copy_host_to_device(void * dst, const void * src, size_t bytes) + { + copy_from_host(dst, src, bytes); + } + protected: /** * @brief Implementation of copy_from_host (to be overridden by backends) diff --git a/deep_object_detection/CMakeLists.txt b/deep_object_detection/CMakeLists.txt new file mode 100644 index 0000000..4b09bcd --- /dev/null +++ b/deep_object_detection/CMakeLists.txt @@ -0,0 +1,128 @@ +# Copyright (c) 2025-present WATonomous. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +cmake_minimum_required(VERSION 3.22) +project(deep_object_detection) + +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() + +if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + add_compile_options(-Wall -Wextra -Wpedantic) +endif() + +find_package(ament_cmake REQUIRED) +find_package(rclcpp REQUIRED) +find_package(rclcpp_components REQUIRED) +find_package(rclcpp_lifecycle REQUIRED) +find_package(sensor_msgs REQUIRED) +find_package(cv_bridge REQUIRED) +find_package(vision_msgs REQUIRED) +find_package(deep_core REQUIRED) +find_package(deep_msgs REQUIRED) +find_package(pluginlib REQUIRED) +find_package(rcl_interfaces REQUIRED) +find_package(OpenCV REQUIRED COMPONENTS core imgproc) + +add_executable(deep_object_detection_node + src/main.cpp +) + +add_library(deep_object_detection_lib SHARED + src/backend_manager.cpp + src/generic_postprocessor.cpp + src/image_preprocessor.cpp + src/deep_object_detection_node.cpp +) + +target_include_directories(deep_object_detection_lib PUBLIC + $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include> + $<INSTALL_INTERFACE:include> +) + +target_link_libraries(deep_object_detection_lib + PUBLIC + ${rclcpp_TARGETS} + ${rclcpp_lifecycle_TARGETS} + ${deep_core_TARGETS} + ${pluginlib_TARGETS} + ${vision_msgs_TARGETS} + ${deep_msgs_TARGETS} + deep_core::deep_core_lib + PRIVATE + ${rclcpp_components_TARGETS} + ${sensor_msgs_TARGETS} + ${cv_bridge_TARGETS} + ${rcl_interfaces_TARGETS} + ${OpenCV_LIBRARIES} +) + +target_include_directories(deep_object_detection_lib SYSTEM PUBLIC ${OpenCV_INCLUDE_DIRS}) + +rclcpp_components_register_nodes(deep_object_detection_lib "deep_object_detection::DeepObjectDetectionNode") + +target_link_libraries(deep_object_detection_node + deep_object_detection_lib +) + +target_include_directories(deep_object_detection_node PUBLIC + $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include> + $<INSTALL_INTERFACE:include> +) + +install(TARGETS deep_object_detection_lib +
ARCHIVE DESTINATION lib + LIBRARY DESTINATION lib + RUNTIME DESTINATION bin +) + +install(TARGETS deep_object_detection_node + RUNTIME DESTINATION lib/${PROJECT_NAME} +) + +install(DIRECTORY include/ + DESTINATION include +) + +install(DIRECTORY config launch + DESTINATION share/${PROJECT_NAME} +) + +if(BUILD_TESTING) + find_package(deep_test REQUIRED) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/test/test_postprocessors.cpp) + add_deep_test(test_postprocessors test/test_postprocessors.cpp + LIBRARIES + deep_object_detection_lib + ${OpenCV_LIBRARIES} + ${cv_bridge_TARGETS} + ${sensor_msgs_TARGETS} + ${rcl_interfaces_TARGETS} + ) + endif() + + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/test/test_preprocessor.cpp) + add_deep_test(test_preprocessor test/test_preprocessor.cpp + LIBRARIES + deep_object_detection_lib + ${OpenCV_LIBRARIES} + ${cv_bridge_TARGETS} + ${sensor_msgs_TARGETS} + ${rcl_interfaces_TARGETS} + ) + endif() +endif() + +ament_package() diff --git a/deep_object_detection/README.md b/deep_object_detection/README.md new file mode 100644 index 0000000..4fc9bb3 --- /dev/null +++ b/deep_object_detection/README.md @@ -0,0 +1,296 @@ +# Deep Object Detection Node + +A deep learning object detection node for ROS 2. Simply provide any ONNX-compatible object detection model, and the node will automatically detect and adapt to its output format. + +## Overview + +The `deep_object_detection` package provides: +- Model-agnostic object detection using ONNX models +- Automatic output format detection and adaptation +- Multi-camera support via MultiImage messages with flexible batching +- Configurable preprocessing and postprocessing pipelines +- Explicit execution provider selection (TensorRT, CUDA, CPU) with fail-fast behavior +- Full ROS 2 lifecycle node support + +## Package Contents + +### Nodes + +- **`deep_object_detection_node`**: A lifecycle node that subscribes to MultiImage messages (synchronized multi-camera input), runs object detection inference, and publishes detection results. + +### Configuration + +- **`config/generic_model_params.yaml`**: Example configuration showing the parameter structure for model, preprocessing, postprocessing, and execution provider settings. + +### Launch Files + +- **`launch/deep_object_detection.launch.yaml`**: Launch file for the object detection node with lifecycle management. + +## Usage + +### Building the Package + +```bash +cd /path/to/your/ros2_workspace +colcon build --packages-select deep_object_detection +source install/setup.bash +``` + +### Running the Node + +1. **Basic usage**: + + ```bash + ros2 launch deep_object_detection deep_object_detection.launch.yaml \ + config_file:=/path/to/generic_model_params.yaml + ``` + +2. **With topic remappings**: + + ```bash + ros2 launch deep_object_detection deep_object_detection.launch.yaml \ + config_file:=/path/to/config.yaml \ + input_topic:=/my_camera/multi_image \ + output_detections_topic:=/my_detections + ``` + +3. **With a specific execution provider**: + + ```bash + ros2 launch deep_object_detection deep_object_detection.launch.yaml \ + config_file:=/path/to/config.yaml \ + preferred_provider:=tensorrt + ``` + +4. **Manual lifecycle transitions** (if not using lifecycle manager): + + ```bash + # Configure the node + ros2 lifecycle set /deep_object_detection_node configure + + # Activate the node + ros2 lifecycle set /deep_object_detection_node activate + ``` + +### Running with Camera Sync Node + +The object detection node requires the `camera_sync` node for synchronized multi-camera processing. + +1.
**Launch camera sync node**: + + ```bash + ros2 launch camera_sync multi_camera_sync.launch.yaml + ``` + +2. **Launch object detection node**: + + ```bash + ros2 launch deep_object_detection deep_object_detection.launch.yaml \ + config_file:=/path/to/config.yaml \ + input_topic:=/multi_camera_sync/multi_image_compressed + ``` + + Configure the detection node to subscribe to the MultiImage topic either via: + - **Remapping** (recommended): `input_topic:=/multi_camera_sync/multi_image_compressed` in launch command + - **Parameter**: Set `input_topic: "/multi_camera_sync/multi_image_compressed"` in config file + + Also configure batching to match the number of cameras: + + ```yaml + min_batch_size: 3 # Minimum images before processing (match the number of cameras) + max_batch_size: 3 # Maximum images per batch + ``` + +### Zero-Copy Component Container (High Performance) + +For maximum performance, run both `multi_camera_sync` and `deep_object_detection` nodes in a single component container with intra-process communication: + +```yaml +launch: + - node_container: + pkg: rclcpp_components + exec: component_container_mt + name: detection_container + param: + - name: use_intra_process_comms + value: true + composable_node: + - pkg: camera_sync + plugin: camera_sync::MultiCameraSyncNode + name: multi_camera_sync + extra_arg: + - name: use_intra_process_comms + value: "true" + - pkg: deep_object_detection + plugin: deep_object_detection::DeepObjectDetectionNode + name: deep_object_detection_node + extra_arg: + - name: use_intra_process_comms + value: "true" +``` + +## Configuration + +The node uses a nested parameter structure for configuration. Many parameters have sensible defaults. + +### Configuration Example + +```yaml +deep_object_detection_node: + ros__parameters: + # Required + model_path: "/path/to/model.onnx" # Absolute path to ONNX model file + input_topic: "/multi_camera_sync/multi_image_compressed" + + # Model configuration + model: + num_classes: 80 + bbox_format: "cxcywh" # cxcywh, xyxy, or xywh + class_names_path: "/path/to/classes.txt" # Optional: absolute path to class names file + + # Preprocessing + preprocessing: + input_width: 640 + input_height: 640 + normalization_type: "scale_0_1" # scale_0_1, imagenet, custom, none + resize_method: "letterbox" # letterbox, resize, crop, pad + color_format: "rgb" # rgb or bgr + mean: [0.0, 0.0, 0.0] # For custom normalization + std: [1.0, 1.0, 1.0] # For custom normalization + + # Postprocessing + postprocessing: + score_threshold: 0.25 + nms_iou_threshold: 0.45 + max_detections: 300 + score_activation: "sigmoid" # sigmoid, softmax, none + class_score_mode: "all_classes" # all_classes or single_confidence + use_multi_output: false + layout: + auto_detect: true + + # Execution provider + preferred_provider: "tensorrt" # tensorrt, cuda, or cpu + device_id: 0 + enable_trt_engine_cache: false + trt_engine_cache_path: "/tmp/deep_ros_ort_trt_cache" + + # Batching + min_batch_size: 1 + max_batch_size: 3 + max_batch_latency_ms: 0 # 0 = wait for min_batch_size + queue_size: 10 # 0 = unlimited + + # Output + output_detections_topic: "/detections" +``` + +### Execution Provider Selection + +The node supports explicit provider selection with fail-fast behavior. If the specified provider is unavailable or fails to initialize, the node will immediately throw an error (no silent fallbacks).
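To illustrate the fail-fast contract, here is a minimal C++ sketch (illustrative only; `createExecutorFor` is a hypothetical stand-in for the real plugin factory, not this package's API):

```cpp
#include <memory>
#include <stdexcept>
#include <string>

// Hypothetical stand-in for the backend factory: returns nullptr when the
// requested provider cannot be initialized on this machine.
std::shared_ptr<int> createExecutorFor(const std::string & provider)
{
  return provider == "cpu" ? std::make_shared<int>(0) : nullptr;
}

std::shared_ptr<int> initProvider(const std::string & preferred_provider)
{
  auto executor = createExecutorFor(preferred_provider);  // try exactly the requested provider
  if (!executor) {
    // Fail fast: surface misconfiguration at configure time instead of
    // silently falling back to a slower provider.
    throw std::runtime_error("execution provider unavailable: " + preferred_provider);
  }
  return executor;
}
```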
+ +**Available Providers:** +- `tensorrt` - TensorRT execution provider (requires CUDA and TensorRT) +- `cuda` - CUDA execution provider (requires CUDA) +- `cpu` - CPU execution provider (always available) + +## Expected Model Format + +The node automatically detects and adapts to various ONNX model output formats: + +- `[batch, detections, features]` - standard format +- `[batch, features, detections]` - transposed format +- `[batch, queries, 4+classes]` - query-based models (e.g., DETR) +- `[batch, channels, anchors]` - anchor-based models +- Any other layout - automatically detected and handled + +**Input:** The node expects MultiImage messages (deep_msgs/MultiImage) containing synchronized compressed images from multiple cameras. + +**Output:** Detection2DArray messages containing bounding boxes, scores, and class IDs for each image in the batch. + +## Parameters + +### Required Parameters +- **`model_path`** (string): Absolute path to ONNX model file (e.g., `/workspaces/deep_ros/yolov8m.onnx`). +- **`input_topic`** (string): MultiImage topic name to subscribe to. + +### Key Parameters +- **`class_names_path`** (string, optional): Absolute path to text file with class names, one per line (e.g., `/workspaces/deep_ros/deep_object_detection/config/coco_classes.txt`). If not provided, class IDs will be used in output messages. +- **`model.num_classes`** (int, default: 80): Number of detection classes. +- **`model.bbox_format`** (string, default: "cxcywh"): Bounding box format (cxcywh, xyxy, or xywh). +- **`preprocessing.input_width/input_height`** (int, default: 640): Model input image dimensions. +- **`preprocessing.normalization_type`** (string, default: "scale_0_1"): Normalization method. +- **`preprocessing.resize_method`** (string, default: "letterbox"): Image resizing method. +- **`postprocessing.score_threshold`** (float, default: 0.25): Minimum confidence score. +- **`postprocessing.nms_iou_threshold`** (float, default: 0.45): IoU threshold for NMS. +- **`preferred_provider`** (string, default: "tensorrt"): Execution provider (tensorrt, cuda, or cpu). +- **`min_batch_size`** (int, default: 1): Minimum images before processing. +- **`max_batch_size`** (int, default: 3): Maximum images per batch. + +See `config/generic_model_params.yaml` for a complete parameter reference. + +## Topics + +Topic names can be configured either via parameters in the config file or via remappings in the launch file. Remappings take precedence over parameter values. + +### Input Topics +- **`input_topic`**: MultiImage messages (deep_msgs/MultiImage) containing synchronized compressed images from multiple cameras. + +**Note:** The node only supports MultiImage messages. Individual camera topics are not supported. + +**Configuration:** +- Via parameter: Set `input_topic` in the config file +- Via remapping: Use `input_topic:=/your/topic/name` in the launch command or add remapping in launch file + +### Output Topics +- **`output_detections_topic`**: Detection2DArray messages (default: "/detections") containing bounding boxes, scores, and class IDs for each image in the batch. 
+ +**Configuration:** +- Via parameter: Set `output_detections_topic` in the config file +- Via remapping: Use `output_detections_topic:=/your/topic/name` in the launch command or add remapping in launch file + +**Example launch file with remappings:** + +```yaml +- node: + pkg: "deep_object_detection" + exec: "deep_object_detection_node" + name: "deep_object_detection_node" + remap: + - from: "/multi_camera_sync/multi_image_compressed" + to: "/my_camera/multi_image" + - from: "/detections" + to: "/my_detections" + param: + - from: "$(var config_file)" +``` + +## Batching Behavior + +The node supports flexible batching with three parameters: + +- **`min_batch_size`**: Minimum number of images required before processing (default: 1) +- **`max_batch_size`**: Maximum number of images per batch (default: 3) +- **`max_batch_latency_ms`**: Maximum wait time in milliseconds before processing even if `min_batch_size` is not met (default: 0 = wait indefinitely) + +**Use Cases:** +- **Single camera**: `min_batch_size=1, max_batch_size=1` - process each image immediately +- **Multi-camera synchronized**: `min_batch_size=N, max_batch_size=N` where N = number of cameras +- **Best-effort batching**: `min_batch_size=1, max_batch_size=N` with optional `max_batch_latency_ms` timeout + +## Limitations + +1. **MultiImage input only**: The node only supports MultiImage messages. Individual camera topics are not supported. +2. **Compressed images only**: Only compressed images (sensor_msgs/CompressedImage) are supported. Raw images are not supported. +3. **Fail-fast provider selection**: If the specified execution provider is unavailable, the node fails immediately. No automatic fallback to other providers. +4. **No dynamic reconfiguration**: Parameters cannot be changed at runtime. Node must be reconfigured to change parameters. 
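For reference, the batching rules from the *Batching Behavior* section can be summarized in a small decision function (an illustrative sketch with hypothetical names, not the node's actual code):

```cpp
#include <chrono>
#include <cstddef>

using Clock = std::chrono::steady_clock;

struct BatchPolicy
{
  std::size_t min_batch_size{1};
  std::size_t max_batch_size{3};
  std::chrono::milliseconds max_batch_latency{0};  // 0 = wait indefinitely for min_batch_size
};

// Returns true when the queued images should be dispatched as a batch.
bool shouldDispatch(const BatchPolicy & p, std::size_t queued, Clock::time_point oldest_enqueue_time)
{
  if (queued == 0) {
    return false;                                 // nothing to process
  }
  if (queued >= p.min_batch_size) {
    return true;                                  // enough images accumulated
  }
  if (p.max_batch_latency.count() == 0) {
    return false;                                 // keep waiting for min_batch_size
  }
  return Clock::now() - oldest_enqueue_time >= p.max_batch_latency;  // timeout reached
}
// The batch actually taken is the first min(queued, max_batch_size) images.
```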
+ +## Troubleshooting + +- **"No plugin loaded"**: Check that the backend plugin name is correct in the configuration +- **"No model loaded"**: Verify the model path exists and is a valid ONNX file +- **"Provider initialization failed"**: Check that the specified execution provider (tensorrt/cuda) is available and properly configured +- **Lifecycle errors**: Ensure the node is properly configured before activation +- **No detections**: Verify that the model output format matches the expected format, or enable auto-detection +- **Plugin discovery issues**: Check that `deep_ort_backend_plugin` is built and sourced +- **CUDA/TensorRT errors**: Ensure CUDA libraries and TensorRT (if using) are properly installed and accessible diff --git a/deep_object_detection/config/coco_classes.txt b/deep_object_detection/config/coco_classes.txt new file mode 100644 index 0000000..ca76c80 --- /dev/null +++ b/deep_object_detection/config/coco_classes.txt @@ -0,0 +1,80 @@ +person +bicycle +car +motorbike +aeroplane +bus +train +truck +boat +traffic light +fire hydrant +stop sign +parking meter +bench +bird +cat +dog +horse +sheep +cow +elephant +bear +zebra +giraffe +backpack +umbrella +handbag +tie +suitcase +frisbee +skis +snowboard +sports ball +kite +baseball bat +baseball glove +skateboard +surfboard +tennis racket +bottle +wine glass +cup +fork +knife +spoon +bowl +banana +apple +sandwich +orange +broccoli +carrot +hot dog +pizza +donut +cake +chair +sofa +pottedplant +bed +diningtable +toilet +tvmonitor +laptop +mouse +remote +keyboard +cell phone +microwave +oven +toaster +sink +refrigerator +book +clock +vase +scissors +teddy bear +hair drier +toothbrush diff --git a/deep_object_detection/config/generic_model_params.yaml b/deep_object_detection/config/generic_model_params.yaml new file mode 100644 index 0000000..b9d90e8 --- /dev/null +++ b/deep_object_detection/config/generic_model_params.yaml @@ -0,0 +1,46 @@ +deep_object_detection_node: + ros__parameters: + model_path: "/workspaces/deep_ros/yolov8m.onnx" + class_names_path: "/workspaces/deep_ros/deep_object_detection/config/coco_classes.txt" + + model: + num_classes: 80 + bbox_format: "cxcywh" + + preprocessing: + input_width: 640 + input_height: 640 + normalization_type: "scale_0_1" + resize_method: "letterbox" + color_format: "rgb" + + postprocessing: + score_threshold: 0.25 + nms_iou_threshold: 0.45 + max_detections: 300 + score_activation: "sigmoid" + use_multi_output: false + + class_score_mode: "all_classes" + class_score_start_idx: -1 + class_score_count: -1 + + coordinate_space: "preprocessed" + + layout: + auto_detect: true + batch_dim: 0 + detection_dim: 1 + feature_dim: 2 + bbox_start_idx: 0 + bbox_count: 4 + score_idx: 4 + class_idx: 5 + + max_batch_size: 3 + queue_size: 10 + + preferred_provider: "tensorrt" + device_id: 0 + enable_trt_engine_cache: true + trt_engine_cache_path: "/tmp/deep_ros_ort_trt_cache" diff --git a/deep_object_detection/include/deep_object_detection/backend_manager.hpp b/deep_object_detection/include/deep_object_detection/backend_manager.hpp new file mode 100644 index 0000000..b3d2ba8 --- /dev/null +++ b/deep_object_detection/include/deep_object_detection/backend_manager.hpp @@ -0,0 +1,232 @@ +// Copyright (c) 2025-present WATonomous. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * @file backend_manager.hpp + * @brief Backend plugin manager for ONNX Runtime inference + * + * This header defines the BackendManager class which: + * - Loads ONNX Runtime backend plugins (CPU, CUDA, TensorRT) + * - Initializes models with selected execution providers + * - Provides memory allocators for tensor operations + * - Executes inference on batched input tensors + * - Supports fail-fast behavior (no silent fallbacks) + */ + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "deep_object_detection/detection_types.hpp" + +namespace deep_ros +{ +class BackendInferenceExecutor; +class BackendMemoryAllocator; +} // namespace deep_ros + +namespace deep_object_detection +{ + +/** + * @brief Manages backend plugin loading and inference execution + * + * Handles loading of ONNX Runtime backend plugins (CPU, CUDA, TensorRT), + * model loading, and inference execution. Supports fail-fast behavior: + * if the requested provider is unavailable, initialization fails immediately. + * + * The manager: + * - Loads the appropriate backend plugin based on preferred_provider + * - Initializes the model with the selected execution provider + * - Provides memory allocator for tensor operations + * - Executes inference on batched input tensors + * - Warms up tensor shape cache for optimal performance + */ +class BackendManager +{ +public: + /** + * @brief Construct the backend manager + * @param node Lifecycle node reference for parameter access and logging + * @param params Detection parameters containing model path and provider settings + * + * Does not initialize the backend. Call initialize() after construction. + */ + BackendManager(rclcpp_lifecycle::LifecycleNode & node, const DetectionParams & params); + + /** + * @brief Destructor: releases plugin resources + */ + ~BackendManager(); + + /** + * @brief Initialize the backend: load plugin, model, and warmup + * + * Loads the backend plugin, initializes the model with the selected execution + * provider, and warms up tensor shape cache. Throws if provider is unavailable + * or model loading fails (fail-fast behavior). + */ + void initialize(); + + /** + * @brief Run inference on a batched input tensor + * @param input Packed input tensor (batched preprocessed images) + * @return Output tensor from model inference + * + * Converts PackedInput to deep_ros::Tensor and runs inference. + * Returns the first output tensor (for single-output models). + */ + deep_ros::Tensor infer(const PackedInput & input); + + /** + * @brief Run inference and return all output tensors + * @param input Packed input tensor (batched preprocessed images) + * @return Vector of all output tensors from model + * + * Useful for multi-output models (e.g., separate boxes, scores, classes). 
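 *
 * Illustrative calling code (hypothetical, not part of this header):
 * @code
 * std::vector<deep_ros::Tensor> outputs = backend_manager->inferAllOutputs(packed_input);
 * // e.g. for a 3-output model: outputs[0] = boxes, outputs[1] = scores, outputs[2] = classes
 * @endcode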
*/ + std::vector<deep_ros::Tensor> inferAllOutputs(const PackedInput & input); + + /** + * @brief Get the active execution provider name + * @return String name of the active provider ("tensorrt", "cuda", or "cpu") + */ + const std::string & activeProvider() const + { + return active_provider_; + } + + /** + * @brief Get the memory allocator for tensor operations + * @return Shared pointer to backend memory allocator + * + * Used by preprocessor to allocate tensors in backend-appropriate memory + * (e.g., GPU memory for CUDA/TensorRT, CPU memory for CPU provider). + */ + std::shared_ptr<deep_ros::BackendMemoryAllocator> allocator() const + { + return allocator_; + } + + /** + * @brief Check if executor is initialized + * @return True if executor is available, false otherwise + */ + bool hasExecutor() const + { + return static_cast<bool>(executor_); + } + + /** + * @brief Get output tensor shape for given input shape + * @param input_shape Input tensor shape [batch, channels, height, width] + * @return Output tensor shape + * + * Uses cached shape information from warmup. Returns shape of first output + * tensor for the given input shape. + */ + std::vector<size_t> getOutputShape(const std::vector<size_t> & input_shape) const; + +private: + /** + * @brief Initialize the backend plugin and load model + * + * Loads the plugin using pluginlib, creates executor and allocator, + * and loads the ONNX model. Throws if provider is unavailable or model + * loading fails. + */ + void initializeBackend(); + + /** + * @brief Warmup tensor shape cache for optimal performance + * + * Runs dummy inference with expected input shapes to warm up TensorRT + * engine cache and optimize memory allocation. Always enabled (not configurable). + */ + void warmupTensorShapeCache(); + + /** + * @brief Parse provider string to Provider enum + * @param provider_str Provider name ("tensorrt", "cuda", or "cpu") + * @return Provider enum value + */ + Provider parseProvider(const std::string & provider_str) const; + + /** + * @brief Check if CUDA runtime is available + * @return True if CUDA is available, false otherwise + * + * Used to validate CUDA/TensorRT provider requirements. + */ + bool isCudaRuntimeAvailable() const; + + /** + * @brief Convert Provider enum to string + * @param provider Provider enum value + * @return String representation ("tensorrt", "cuda", or "cpu") + */ + std::string providerToString(Provider provider) const; + + /** + * @brief Declare active_provider parameter for runtime visibility + * @param value Provider name to declare + * + * Makes the active provider visible in ROS parameter server for debugging. + */ + void declareActiveProviderParameter(const std::string & value); + + /** + * @brief Build input tensor from PackedInput + * @param packed Packed input data (flattened float array and shape) + * @return Tensor ready for inference + * + * Converts PackedInput (CPU memory) to deep_ros::Tensor using the + * backend allocator (may copy to GPU memory for CUDA/TensorRT).
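 *
 * Conceptually (an illustrative sketch; allocate() is a hypothetical allocator
 * call, while copy_host_to_device() is the alias introduced in
 * backend_memory_allocator.hpp):
 * @code
 * size_t bytes = packed.data.size() * sizeof(float);
 * void * device_ptr = allocator_->allocate(bytes);                        // hypothetical
 * allocator_->copy_host_to_device(device_ptr, packed.data.data(), bytes);
 * // wrap device_ptr and packed.shape in a deep_ros::Tensor
 * @endcode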
*/ + deep_ros::Tensor buildInputTensor(const PackedInput & packed) const; + + /// Reference to lifecycle node (for parameters and logging) + rclcpp_lifecycle::LifecycleNode & node_; + /// Detection parameters (model path, provider settings) + const DetectionParams & params_; + /// Selected execution provider enum + Provider provider_; + /// Active provider name (for logging and parameter server) + std::string active_provider_{"unknown"}; + /// Inference executor (runs model inference) + std::shared_ptr<deep_ros::BackendInferenceExecutor> executor_; + /// Memory allocator (for tensor allocation) + std::shared_ptr<deep_ros::BackendMemoryAllocator> allocator_; + /// Plugin loader (loads backend plugin via pluginlib) + std::unique_ptr<pluginlib::ClassLoader<deep_ros::BackendPlugin>> plugin_loader_; + /// Plugin instance (holds loaded plugin) + pluginlib::UniquePtr<deep_ros::BackendPlugin> plugin_holder_; + + /** + * @brief Map Provider enum to plugin name + * @param provider Provider enum value + * @return Plugin name for pluginlib (e.g., "deep_ort_backend_plugin::OrtBackendPlugin") + */ + std::string providerToPluginName(Provider provider) const; +}; + +} // namespace deep_object_detection diff --git a/deep_object_detection/include/deep_object_detection/deep_object_detection_node.hpp b/deep_object_detection/include/deep_object_detection/deep_object_detection_node.hpp new file mode 100644 index 0000000..b1a4107 --- /dev/null +++ b/deep_object_detection/include/deep_object_detection/deep_object_detection_node.hpp @@ -0,0 +1,277 @@ +// Copyright (c) 2025-present WATonomous. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * @file deep_object_detection_node.hpp + * @brief ROS 2 lifecycle node for object detection using ONNX models + * + * This header defines the main DeepObjectDetectionNode class which: + * - Subscribes to MultiImage messages (synchronized multi-camera input) + * - Batches images for efficient inference + * - Runs preprocessing, inference, and postprocessing + * - Publishes Detection2DArray messages with bounding boxes and scores + * - Manages lifecycle states (configure, activate, deactivate, cleanup, shutdown) + */ + +#pragma once + +#include <deque> +#include <memory> +#include <mutex> +#include <string> +#include <vector> + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "deep_object_detection/backend_manager.hpp" +#include "deep_object_detection/detection_types.hpp" +#include "deep_object_detection/generic_postprocessor.hpp" +#include "deep_object_detection/image_preprocessor.hpp" + +namespace deep_object_detection +{ + +/** + * @brief ROS2 lifecycle node for object detection using ONNX models + * + * This node performs object detection on synchronized multi-camera streams via MultiImage messages.
* It supports: + * - MultiImage input: synchronized compressed images from multiple cameras + * - Batch processing: groups images for efficient inference + * - Multiple backends: CPU, CUDA, or TensorRT execution providers + * - Configurable preprocessing: resizing, normalization, color format conversion + * - Flexible postprocessing: NMS, score thresholds, multiple output formats + * + * The node follows the ROS2 lifecycle pattern: + * - unconfigured -> configuring -> inactive -> activating -> active + * - Can be deactivated/activated without full cleanup + * - Subscriptions and publishers are only active when the node is in the active state + */ +class DeepObjectDetectionNode : public rclcpp_lifecycle::LifecycleNode +{ +public: + /** + * @brief Construct the object detection node + * @param options Node options for ROS2 configuration + */ + explicit DeepObjectDetectionNode(const rclcpp::NodeOptions & options = rclcpp::NodeOptions()); + + /** + * @brief Configure the node: load model, initialize backends, setup postprocessor + * @return SUCCESS if configuration succeeds, FAILURE otherwise + * + * Loads class names, initializes image preprocessor, backend manager, and postprocessor. + * Detects model output shape and configures layout. Does not start subscriptions. + */ + rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn on_configure( + const rclcpp_lifecycle::State &) override; + + /** + * @brief Activate the node: start subscriptions and batch processing timer + * @return SUCCESS if activation succeeds, FAILURE otherwise + * + * Creates the MultiImage subscription, starts the batch processing timer, + * and activates the detection publisher. + */ + rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn on_activate( + const rclcpp_lifecycle::State &) override; + + /** + * @brief Deactivate the node: stop subscriptions and timer + * @return SUCCESS + * + * Stops all subscriptions, cancels the batch timer, and deactivates the publisher. + * Node can be reactivated without full cleanup. + */ + rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn on_deactivate( + const rclcpp_lifecycle::State &) override; + + /** + * @brief Cleanup the node: release all resources + * @return SUCCESS + * + * Releases backend manager, preprocessor, postprocessor, and clears all data. + * Node must be reconfigured before reactivation. + */ + rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn on_cleanup( + const rclcpp_lifecycle::State &) override; + + /** + * @brief Shutdown the node: final cleanup + * @return SUCCESS + * + * Performs final cleanup. Node cannot be reactivated after shutdown. + */ + rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn on_shutdown( + const rclcpp_lifecycle::State &) override; + +private: + /** + * @brief Declare and read all ROS parameters from the parameter server + * + * Reads configuration for model, preprocessing, postprocessing, execution provider, + * batching, and topic names. Validates required parameters and sets defaults. + */ + void declareAndReadParameters(); + + /** + * @brief Setup the MultiImage subscription + * + * Creates a subscription to the input_topic_ with best_effort QoS. + * Subscription is only active when the node is in the active state.
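 *
 * Illustrative sketch (hypothetical, not the exact implementation):
 * @code
 * auto qos = rclcpp::QoS(rclcpp::KeepLast(10)).best_effort();
 * multi_image_sub_ = create_subscription<deep_msgs::msg::MultiImage>(
 *   input_topic_, qos,
 *   [this](deep_msgs::msg::MultiImage::ConstSharedPtr msg) {onMultiImage(msg);});
 * @endcode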
*/ + void setupSubscription(); + + /** + * @brief Callback for MultiImage messages + * @param msg Shared pointer to MultiImage message containing synchronized compressed images + * + * Decodes each compressed image in the MultiImage message and enqueues them + * for batch processing. Failed decodes are dropped according to decode_failure_policy. + */ + void onMultiImage(const deep_msgs::msg::MultiImage::ConstSharedPtr & msg); + + /** + * @brief Handle a single compressed image from MultiImage message + * @param msg Compressed image message to decode and enqueue + * + * Decodes the compressed image using cv::imdecode(). If decoding fails, + * the image is dropped according to decode_failure_policy. + */ + void handleCompressedImage(const sensor_msgs::msg::CompressedImage & msg); + + /** + * @brief Enqueue an image for batch processing + * @param image Decoded BGR image (OpenCV Mat) + * @param header ROS message header with timestamp and frame_id + * + * Adds the image to the processing queue. If queue is full, drops oldest + * image according to queue_overflow_policy. Thread-safe. + */ + void enqueueImage(cv::Mat image, const std_msgs::msg::Header & header); + + /** + * @brief Timer callback for batch processing + * + * Called periodically (5ms) to check if enough images have accumulated + * to form a batch. Processes batch if min_batch_size is met or max_batch_latency_ms + * timeout has elapsed. + */ + void onBatchTimer(); + + /** + * @brief Process a batch of images through the inference pipeline + * @param batch Vector of queued images to process + * + * Runs preprocessing, inference, and postprocessing for the batch. + * Publishes detection results. Runs asynchronously to avoid blocking timer callback. + */ + void processBatch(const std::vector<QueuedImage> & batch); + + /** + * @brief Publish detection results for a batch + * @param batch_detections Detections for each image in the batch + * @param headers ROS headers for each image (for timestamp and frame_id) + * @param metas Image metadata for coordinate transformations + * + * Converts SimpleDetection objects to Detection2DArray messages and publishes + * them. Each image in the batch gets its own Detection2DArray message. + */ + void publishDetections( + const std::vector<std::vector<SimpleDetection>> & batch_detections, + const std::vector<std_msgs::msg::Header> & headers, + const std::vector<ImageMeta> & metas); + + /** + * @brief Load class names from file + * + * Reads class names from class_names_path (one per line) and stores them + * in params_.class_names. If file doesn't exist or is empty, class_names + * remains empty and class IDs will be used in output messages. + */ + void loadClassNames(); + + /** + * @brief Cleanup resources if configuration partially fails + * + * Releases resources that were allocated during on_configure() if an error + * occurs partway through configuration. Ensures no resource leaks. + */ + void cleanupPartialConfiguration(); + + /** + * @brief Cleanup all node resources + * + * Releases backend_manager_, preprocessor_, postprocessor_, and clears + * all subscriptions and timers. Called during on_cleanup() and on_shutdown(). + */ + void cleanupAllResources(); + + /** + * @brief Stop all subscriptions and cancel batch timer + * + * Called during on_deactivate() to stop processing without full cleanup. + * Node can be reactivated without reconfiguration.
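 *
 * Conceptually (illustrative only):
 * @code
 * multi_image_sub_.reset();                 // stop receiving MultiImage messages
 * if (batch_timer_) {batch_timer_->cancel();}
 * detection_pub_->on_deactivate();          // lifecycle publisher stops publishing
 * @endcode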
*/ + void stopSubscriptionsAndTimer(); + + /// Configuration parameters loaded from ROS parameter server + DetectionParams params_; + + /// Subscription to MultiImage topic (synchronized multi-camera input) + rclcpp::Subscription<deep_msgs::msg::MultiImage>::SharedPtr multi_image_sub_; + /// Input topic name (can be set via parameter or remapping) + std::string input_topic_; + /// Publisher for Detection2DArray messages + rclcpp_lifecycle::LifecyclePublisher<Detection2DArrayMsg>::SharedPtr detection_pub_; + /// Timer for periodic batch processing checks (5ms period) + rclcpp::TimerBase::SharedPtr batch_timer_; + + /// Queue of images waiting to be processed (thread-safe) + std::deque<QueuedImage> image_queue_; + /// Mutex protecting image_queue_ access + std::mutex queue_mutex_; + /// Callback group for subscription (allows concurrent execution) + rclcpp::CallbackGroup::SharedPtr callback_group_; + + /// Counter for dropped images (queue overflow or decode failures) + size_t dropped_images_count_; + + /// Image preprocessor (resize, normalize, color conversion) + std::unique_ptr<ImagePreprocessor> preprocessor_; + /// Postprocessor (NMS, coordinate transformation, message formatting) + std::unique_ptr<GenericPostprocessor> postprocessor_; + /// Backend manager (handles ONNX Runtime plugin loading and inference) + std::unique_ptr<BackendManager> backend_manager_; +}; + +/** + * @brief Factory function to create a DeepObjectDetectionNode instance + * @param options ROS2 node options for configuration + * @return Shared pointer to lifecycle node instance + * + * Used by rclcpp_components for composable node loading. + * Allows the node to be loaded as a component in a component container. + */ +std::shared_ptr<rclcpp_lifecycle::LifecycleNode> createDeepObjectDetectionNode( + const rclcpp::NodeOptions & options = rclcpp::NodeOptions()); + +} // namespace deep_object_detection diff --git a/deep_object_detection/include/deep_object_detection/detection_types.hpp b/deep_object_detection/include/deep_object_detection/detection_types.hpp new file mode 100644 index 0000000..f601af9 --- /dev/null +++ b/deep_object_detection/include/deep_object_detection/detection_types.hpp @@ -0,0 +1,447 @@ +// Copyright (c) 2025-present WATonomous. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * @file detection_types.hpp + * @brief Type definitions and enums for deep object detection + * + * This header defines: + * - Enum types for configuration options (Provider, BboxFormat, NormalizationType, etc.)
* - Configuration structures (PreprocessingConfig, PostprocessingConfig, DetectionParams) + * - Data structures (ImageMeta, QueuedImage, PackedInput, SimpleDetection) + * - Helper functions for string-to-enum conversion + * - ROS message type aliases (Detection2DMsg, Detection2DArrayMsg) + */ + +#pragma once + +#include <string> +#include <vector> + +#include <opencv2/core.hpp> +#include <std_msgs/msg/header.hpp> + +#if __has_include(<deep_msgs/msg/detection2_d.hpp>) + #include <deep_msgs/msg/detection2_d.hpp> + #include <deep_msgs/msg/detection2_d_array.hpp> + +namespace deep_object_detection +{ +using Detection2DMsg = deep_msgs::msg::Detection2D; +using Detection2DArrayMsg = deep_msgs::msg::Detection2DArray; +} // namespace deep_object_detection +#else + #include <vision_msgs/msg/detection2_d.hpp> + #include <vision_msgs/msg/detection2_d_array.hpp> + #include <vision_msgs/msg/object_hypothesis_with_pose.hpp> + +namespace deep_object_detection +{ +using Detection2DMsg = vision_msgs::msg::Detection2D; +using Detection2DArrayMsg = vision_msgs::msg::Detection2DArray; +} // namespace deep_object_detection +#endif + +namespace deep_object_detection +{ + +/// Number of RGB color channels +constexpr size_t RGB_CHANNELS = 3; + +/** + * @brief Execution provider for ONNX Runtime inference + */ +enum class Provider +{ + TENSORRT, ///< TensorRT execution provider (requires CUDA and TensorRT) + CUDA, ///< CUDA execution provider (requires CUDA) + CPU ///< CPU execution provider (always available) +}; + +/** + * @brief Postprocessor type (currently only generic is supported) + */ +enum class PostprocessorType +{ + GENERIC ///< Generic postprocessor (supports various output formats) +}; + +/** + * @brief Bounding box coordinate format + */ +enum class BboxFormat +{ + CXCYWH, ///< Center x, center y, width, height (YOLO format) + XYXY, ///< Top-left x, top-left y, bottom-right x, bottom-right y + XYWH ///< Top-left x, top-left y, width, height +}; + +/** + * @brief Image normalization method + */ +enum class NormalizationType +{ + IMAGENET, ///< ImageNet normalization (mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + SCALE_0_1, ///< Scale to [0, 1] range (divide by 255.0) + CUSTOM, ///< Custom mean and std values + NONE ///< No normalization +}; + +/** + * @brief Image resizing method + */ +enum class ResizeMethod +{ + LETTERBOX, ///< Maintain aspect ratio, pad with gray (114) to fit input size + RESIZE, ///< Stretch to input size (may distort aspect ratio) + CROP, ///< Center crop to input size + PAD ///< Pad to input size +}; + +/** + * @brief Score activation function for raw model outputs + */ +enum class ScoreActivation +{ + SIGMOID, ///< Sigmoid activation: 1 / (1 + exp(-x)) + SOFTMAX, ///< Softmax activation (not fully implemented, returns raw) + NONE ///< No activation (use raw scores) +}; + +/** + * @brief How class scores are extracted from model output + */ +enum class ClassScoreMode +{ + ALL_CLASSES, ///< Extract class scores from all class logits (multi-class) + SINGLE_CONFIDENCE ///< Use a single confidence score (single-class models) +}; + +/** + * @brief Coordinate space for bounding boxes + */ +enum class CoordinateSpace +{ + PREPROCESSED, ///< Coordinates in preprocessed image space (model input size) + ORIGINAL ///< Coordinates in original image space (not currently used) +}; + +/** + * @brief Policy for handling queue overflow + */ +enum class QueueOverflowPolicy +{ + DROP_OLDEST, ///< Drop oldest image when queue is full (default) + DROP_NEWEST, ///< Drop newest image when queue is full + THROW ///< Throw exception when queue is full (not currently used) +}; + +/** + * @brief Policy for handling image decode failures + */ +enum class DecodeFailurePolicy +{ + DROP, ///< Drop failed decode and log warning (default) + THROW ///< Throw exception on decode failure
(not currently used) +}; + +/** + * @brief Image metadata for coordinate transformation + * + * Stores information about preprocessing transformations needed to + * transform bounding box coordinates from preprocessed image space + * back to original image space. + */ +struct ImageMeta +{ + int original_width = 0; ///< Original image width before preprocessing + int original_height = 0; ///< Original image height before preprocessing + float scale_x = 1.0f; ///< Horizontal scale factor (original_width / input_width) + float scale_y = 1.0f; ///< Vertical scale factor (original_height / input_height) + float pad_x = 0.0f; ///< Horizontal padding offset (for letterbox) + float pad_y = 0.0f; ///< Vertical padding offset (for letterbox) +}; + +/** + * @brief Queued image waiting for batch processing + */ +struct QueuedImage +{ + cv::Mat bgr; ///< Decoded BGR image (OpenCV Mat) + std_msgs::msg::Header header; ///< ROS message header with timestamp and frame_id +}; + +/** + * @brief Packed input tensor for model inference + * + * Flattened float array in NCHW format (batch, channels, height, width) + * ready for model input. Shape vector describes tensor dimensions. + */ +struct PackedInput +{ + std::vector<float> data; ///< Flattened float array (NCHW format) + std::vector<size_t> shape; ///< Tensor shape [batch, channels, height, width] +}; + +/** + * @brief Preprocessing configuration parameters + */ +struct PreprocessingConfig +{ + int input_width = 640; ///< Model input image width in pixels + int input_height = 640; ///< Model input image height in pixels + NormalizationType normalization_type = NormalizationType::SCALE_0_1; ///< Normalization method + std::vector<float> mean = {0.0f, 0.0f, 0.0f}; ///< Mean values for custom normalization (RGB order) + std::vector<float> std = {1.0f, 1.0f, 1.0f}; ///< Std values for custom normalization (RGB order) + ResizeMethod resize_method = ResizeMethod::LETTERBOX; ///< Image resizing method + int pad_value = 114; ///< Padding value for letterbox (gray, YOLO standard) + std::string color_format = "bgr"; ///< Input color format ("bgr" or "rgb") +}; + +/** + * @brief Model metadata configuration + */ +struct ModelMetadata +{ + int num_classes = 80; ///< Number of detection classes + std::string class_names_file; ///< Path to class names file (one per line, optional) + BboxFormat bbox_format = BboxFormat::CXCYWH; ///< Bounding box format used by model +}; + +/** + * @brief Output tensor layout configuration + * + * Describes the structure of model output tensors for manual layout specification. + * Only used when auto_detect is false.
+ */ +struct OutputLayoutConfig +{ + bool auto_detect = true; ///< True to auto-detect layout, false to use manual config + int batch_dim = 0; ///< Batch dimension index + int detection_dim = 1; ///< Detection dimension index + int feature_dim = 2; ///< Feature dimension index + int bbox_start_idx = 0; ///< Starting index for bbox coordinates in feature dimension + int bbox_count = 4; ///< Number of bbox coordinates (always 4) + int score_idx = 4; ///< Index for confidence score in feature dimension + int class_idx = 5; ///< Index for class ID in feature dimension +}; + +/** + * @brief Postprocessing configuration parameters + */ +struct PostprocessingConfig +{ + float score_threshold = 0.25f; ///< Minimum confidence score (detections below are filtered) + float nms_iou_threshold = 0.45f; ///< IoU threshold for Non-Maximum Suppression + int max_detections = 300; ///< Maximum number of detections per image (after NMS) + ScoreActivation score_activation = ScoreActivation::SIGMOID; ///< Score activation function + bool enable_nms = true; ///< Enable Non-Maximum Suppression (always true, not configurable) + bool use_multi_output = false; ///< True if model has separate outputs for boxes, scores, classes + int output_boxes_idx = 0; ///< Output index for bounding boxes (if use_multi_output) + int output_scores_idx = 1; ///< Output index for scores (if use_multi_output) + int output_classes_idx = 2; ///< Output index for class IDs (if use_multi_output) + ClassScoreMode class_score_mode = ClassScoreMode::ALL_CLASSES; ///< How class scores are extracted + int class_score_start_idx = -1; ///< Start index for class scores (-1 = use all) + int class_score_count = -1; ///< Count of class scores (-1 = use all) + CoordinateSpace coordinate_space = CoordinateSpace::PREPROCESSED; ///< Coordinate space (always PREPROCESSED) + OutputLayoutConfig layout; ///< Output layout configuration +}; + +/** + * @brief Complete detection node configuration parameters + * + * Aggregates all configuration for model, preprocessing, postprocessing, + * execution provider, batching, and topics. 
*/ +struct DetectionParams +{ + std::string model_path; ///< Absolute path to ONNX model file + ModelMetadata model_metadata; ///< Model metadata (classes, bbox format) + PreprocessingConfig preprocessing; ///< Preprocessing configuration + PostprocessingConfig postprocessing; ///< Postprocessing configuration + std::string input_qos_reliability{"best_effort"}; ///< Input topic QoS reliability (always "best_effort") + std::string output_detections_topic{"/detections"}; ///< Output topic for detections + int max_batch_size{3}; ///< Maximum images per batch + int queue_size{10}; ///< Maximum queue size (0 = unlimited) + QueueOverflowPolicy queue_overflow_policy{QueueOverflowPolicy::DROP_OLDEST}; ///< Queue overflow policy + DecodeFailurePolicy decode_failure_policy{DecodeFailurePolicy::DROP}; ///< Decode failure policy + std::string preferred_provider{"tensorrt"}; ///< Preferred execution provider ("tensorrt", "cuda", or "cpu") + int device_id{0}; ///< GPU device ID (for CUDA/TensorRT) + bool warmup_tensor_shapes{true}; ///< Warmup tensor shapes (always true, not configurable) + bool enable_trt_engine_cache{false}; ///< Enable TensorRT engine caching + std::string trt_engine_cache_path{"/tmp/deep_ros_ort_trt_cache"}; ///< TensorRT engine cache directory + std::vector<std::string> class_names; ///< Class name strings (loaded from file or empty) +}; + +/** + * @brief Convert string to PostprocessorType enum + * @param type Postprocessor type string (currently unused, always returns GENERIC) + * @return PostprocessorType::GENERIC + */ +inline PostprocessorType stringToPostprocessorType(const std::string & /*type*/) +{ + return PostprocessorType::GENERIC; +} + +/** + * @brief Convert string to BboxFormat enum + * @param format Format string ("cxcywh", "xyxy", or "xywh", case-insensitive) + * @return BboxFormat enum (defaults to CXCYWH if unknown) + */ +inline BboxFormat stringToBboxFormat(const std::string & format) +{ + if (format == "cxcywh" || format == "CXCYWH") { + return BboxFormat::CXCYWH; + } else if (format == "xyxy" || format == "XYXY") { + return BboxFormat::XYXY; + } else if (format == "xywh" || format == "XYWH") { + return BboxFormat::XYWH; + } + return BboxFormat::CXCYWH; +} + +/** + * @brief Convert string to NormalizationType enum + * @param type Normalization type string ("imagenet", "scale_0_1", "yolo", "custom", or "none", case-insensitive) + * @return NormalizationType enum (defaults to SCALE_0_1 if unknown) + */ +inline NormalizationType stringToNormalizationType(const std::string & type) +{ + if (type == "imagenet" || type == "IMAGENET") { + return NormalizationType::IMAGENET; + } else if (type == "scale_0_1" || type == "SCALE_0_1" || type == "yolo" || type == "YOLO") { + return NormalizationType::SCALE_0_1; + } else if (type == "custom" || type == "CUSTOM") { + return NormalizationType::CUSTOM; + } else if (type == "none" || type == "NONE") { + return NormalizationType::NONE; + } + return NormalizationType::SCALE_0_1; +} + +/** + * @brief Convert string to ResizeMethod enum + * @param method Resize method string ("letterbox", "resize", "crop", or "pad", case-insensitive) + * @return ResizeMethod enum (defaults to LETTERBOX if unknown) + */ +inline ResizeMethod stringToResizeMethod(const std::string & method) +{ + if (method == "letterbox" || method == "LETTERBOX") { + return ResizeMethod::LETTERBOX; + } else if (method == "resize" || method == "RESIZE") { + return ResizeMethod::RESIZE; + } else if (method == "crop" || method == "CROP") { + return ResizeMethod::CROP; + } else if
(method == "pad" || method == "PAD") { + return ResizeMethod::PAD; + } + return ResizeMethod::LETTERBOX; +} + +/** + * @brief Convert string to ScoreActivation enum + * @param activation Activation string ("sigmoid", "softmax", or "none", case-insensitive) + * @return ScoreActivation enum (defaults to SIGMOID if unknown) + */ +inline ScoreActivation stringToScoreActivation(const std::string & activation) +{ + if (activation == "sigmoid" || activation == "SIGMOID") { + return ScoreActivation::SIGMOID; + } else if (activation == "softmax" || activation == "SOFTMAX") { + return ScoreActivation::SOFTMAX; + } else if (activation == "none" || activation == "NONE") { + return ScoreActivation::NONE; + } + return ScoreActivation::SIGMOID; +} + +/** + * @brief Convert string to ClassScoreMode enum + * @param mode Mode string ("all_classes" or "single_confidence", case-insensitive) + * @return ClassScoreMode enum (defaults to ALL_CLASSES if unknown) + */ +inline ClassScoreMode stringToClassScoreMode(const std::string & mode) +{ + if (mode == "all_classes" || mode == "ALL_CLASSES") { + return ClassScoreMode::ALL_CLASSES; + } else if (mode == "single_confidence" || mode == "SINGLE_CONFIDENCE") { + return ClassScoreMode::SINGLE_CONFIDENCE; + } + return ClassScoreMode::ALL_CLASSES; +} + +/** + * @brief Convert string to CoordinateSpace enum + * @param space Space string ("preprocessed" or "original", case-insensitive) + * @return CoordinateSpace enum (defaults to PREPROCESSED if unknown) + */ +inline CoordinateSpace stringToCoordinateSpace(const std::string & space) +{ + if (space == "preprocessed" || space == "PREPROCESSED") { + return CoordinateSpace::PREPROCESSED; + } else if (space == "original" || space == "ORIGINAL") { + return CoordinateSpace::ORIGINAL; + } + return CoordinateSpace::PREPROCESSED; +} + +/** + * @brief Convert string to QueueOverflowPolicy enum + * @param policy Policy string ("drop_oldest", "drop_newest", or "throw", case-insensitive) + * @return QueueOverflowPolicy enum (defaults to DROP_OLDEST if unknown) + */ +inline QueueOverflowPolicy stringToQueueOverflowPolicy(const std::string & policy) +{ + if (policy == "drop_oldest" || policy == "DROP_OLDEST") { + return QueueOverflowPolicy::DROP_OLDEST; + } else if (policy == "drop_newest" || policy == "DROP_NEWEST") { + return QueueOverflowPolicy::DROP_NEWEST; + } else if (policy == "throw" || policy == "THROW") { + return QueueOverflowPolicy::THROW; + } + return QueueOverflowPolicy::DROP_OLDEST; +} + +/** + * @brief Convert string to DecodeFailurePolicy enum + * @param policy Policy string ("drop" or "throw", case-insensitive) + * @return DecodeFailurePolicy enum (defaults to DROP if unknown) + */ +inline DecodeFailurePolicy stringToDecodeFailurePolicy(const std::string & policy) +{ + if (policy == "drop" || policy == "DROP") { + return DecodeFailurePolicy::DROP; + } else if (policy == "throw" || policy == "THROW") { + return DecodeFailurePolicy::THROW; + } + return DecodeFailurePolicy::DROP; +} + +/** + * @brief Simple detection structure (internal representation) + * + * Stores a single detection with bounding box, score, and class ID. + * Coordinates are in original image space (x, y, width, height). 
+ */ +struct SimpleDetection +{ + float x = 0.0f; ///< Top-left x coordinate in original image space + float y = 0.0f; ///< Top-left y coordinate in original image space + float width = 0.0f; ///< Bounding box width in original image space + float height = 0.0f; ///< Bounding box height in original image space + float score = 0.0f; ///< Confidence score [0, 1] + int32_t class_id = -1; ///< Class ID (-1 if unknown) +}; + +} // namespace deep_object_detection diff --git a/deep_object_detection/include/deep_object_detection/generic_postprocessor.hpp b/deep_object_detection/include/deep_object_detection/generic_postprocessor.hpp new file mode 100644 index 0000000..888028a --- /dev/null +++ b/deep_object_detection/include/deep_object_detection/generic_postprocessor.hpp @@ -0,0 +1,277 @@ +// Copyright (c) 2025-present WATonomous. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * @file generic_postprocessor.hpp + * @brief Generic postprocessor for object detection model outputs + * + * This header defines the GenericPostprocessor class which: + * - Automatically detects output tensor layouts + * - Applies score activation and thresholding + * - Performs Non-Maximum Suppression (NMS) + * - Transforms coordinates from preprocessed to original image space + * - Formats detections into ROS Detection2DArray messages + * - Supports both single-output and multi-output models + */ + +#pragma once + +#include +#include +#include + +#include +#include + +#include "deep_object_detection/detection_types.hpp" + +namespace deep_object_detection +{ + +/** + * @brief Generic postprocessor for object detection models + * + * Handles postprocessing pipeline for various ONNX model output formats: + * - Automatic layout detection (supports [batch, detections, features], [batch, features, detections], etc.) + * - Score activation (sigmoid, softmax, or none) + * - Score thresholding + * - Non-maximum suppression (NMS) + * - Coordinate transformation from preprocessed to original image space + * - Message formatting for ROS Detection2DArray + * + * Supports both single-output models (boxes + scores + classes in one tensor) + * and multi-output models (separate tensors for boxes, scores, classes). 
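 *
 * Typical usage (an illustrative sketch; the surrounding variable names and
 * shapes are hypothetical):
 * @code
 * auto layout = GenericPostprocessor::detectLayout(output_shape);
 * GenericPostprocessor post(config, layout, BboxFormat::CXCYWH, 80, class_names, true);
 * auto per_image = post.decode(output_tensor, metas);   // one vector per batch image
 * Detection2DArrayMsg msg;
 * post.fillDetectionMessage(header, per_image[0], metas[0], msg);
 * @endcode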
+ */
+class GenericPostprocessor
+{
+public:
+  /**
+   * @brief Output tensor layout configuration
+   *
+   * Describes the structure of model output tensors:
+   * - Dimension indices for batch, detections, and features
+   * - Feature indices for bbox coordinates, scores, and class IDs
+   * - Support for transposed layouts and multi-output models
+   */
+  struct OutputLayout
+  {
+    std::vector<size_t> shape;  ///< Output tensor shape
+    size_t batch_dim = 0;  ///< Batch dimension index
+    size_t detection_dim = 1;  ///< Detection dimension index
+    size_t feature_dim = 2;  ///< Feature dimension index
+    size_t bbox_start_idx = 0;  ///< Starting index for bbox coordinates in feature dimension
+    size_t bbox_count = 4;  ///< Number of bbox coordinates (always 4)
+    size_t score_idx = 4;  ///< Index for confidence score in feature dimension
+    size_t class_idx = 5;  ///< Index for class ID in feature dimension
+    bool has_separate_class_output = false;  ///< True if class IDs are in separate output tensor
+    size_t class_output_idx = 0;  ///< Output index for separate class tensor (if applicable)
+    bool auto_detect = true;  ///< True if layout should be auto-detected
+  };
+
+  /**
+   * @brief Construct the postprocessor
+   * @param config Postprocessing configuration (thresholds, NMS, activation)
+   * @param layout Output tensor layout configuration
+   * @param bbox_format Bounding box format (cxcywh, xyxy, or xywh)
+   * @param num_classes Number of detection classes
+   * @param class_names Vector of class name strings (empty if using class IDs)
+   * @param use_letterbox True if letterbox resize was used (affects coordinate transformation)
+   */
+  GenericPostprocessor(
+    const PostprocessingConfig & config,
+    const OutputLayout & layout,
+    BboxFormat bbox_format,
+    int num_classes,
+    const std::vector<std::string> & class_names,
+    bool use_letterbox);
+
+  /**
+   * @brief Automatically detect output tensor layout from shape
+   * @param output_shape Model output tensor shape
+   * @return Detected OutputLayout
+   *
+   * Analyzes tensor shape to determine layout:
+   * - [batch, detections, features] -> standard layout
+   * - [batch, features, detections] -> transposed layout
+   * - [batch, queries, 4+classes] -> query-based (DETR-style)
+   * - Other shapes -> heuristic-based detection
+   */
+  static OutputLayout detectLayout(const std::vector<size_t> & output_shape);
+
+  /**
+   * @brief Auto-configure output layout based on config and optional output shape
+   * @param output_shape Model output shape (can be empty for deferred detection)
+   * @param layout_config Layout configuration from parameters
+   * @return Configured OutputLayout
+   *
+   * Handles both manual and auto-detection modes. If auto_detect is true and output_shape
+   * is available, automatically detects layout. Otherwise uses manual config or defers detection.
+   */
+  static OutputLayout autoConfigure(const std::vector<size_t> & output_shape, const OutputLayoutConfig & layout_config);
+
+  /**
+   * @brief Decode model output tensor to detections
+   * @param output Model output tensor (single output)
+   * @param metas Image metadata for coordinate transformation
+   * @return Vector of detections per image in batch
+   *
+   * Extracts bounding boxes, scores, and class IDs from output tensor.
+   * Applies score thresholding, NMS, and coordinate transformation.
+   * Returns one vector of detections per image in the batch.
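+   *
+   * Sketch of a typical call (names are illustrative):
+   * @code
+   * // metas: one ImageMeta per image in the batch, produced by preprocessing
+   * auto per_image = postprocessor.decode(output_tensor, metas);
+   * // per_image[i] holds the detections for image i, NMS-filtered when
+   * // NMS is enabled and mapped back to original image coordinates
+   * @endcode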
+   */
+  std::vector<std::vector<SimpleDetection>> decode(
+    const deep_ros::Tensor & output, const std::vector<ImageMeta> & metas) const;
+
+  /**
+   * @brief Decode multi-output model to detections
+   * @param outputs Vector of model output tensors (boxes, scores, classes separately)
+   * @param metas Image metadata for coordinate transformation
+   * @return Vector of detections per image in batch
+   *
+   * For models with separate outputs for boxes, scores, and classes.
+   * Uses output_boxes_idx, output_scores_idx, output_classes_idx from config.
+   */
+  std::vector<std::vector<SimpleDetection>> decodeMultiOutput(
+    const std::vector<deep_ros::Tensor> & outputs, const std::vector<ImageMeta> & metas) const;
+
+  /**
+   * @brief Get postprocessor format name
+   * @return Format name string ("generic")
+   */
+  std::string getFormatName() const
+  {
+    return "generic";
+  }
+
+  /**
+   * @brief Fill ROS Detection2DArray message with detections
+   * @param header ROS message header (timestamp and frame_id)
+   * @param detections Vector of detections for one image
+   * @param meta Image metadata
+   * @param out_msg Output message to fill
+   *
+   * Converts SimpleDetection objects to ROS Detection2D messages
+   * and populates the Detection2DArray message.
+   */
+  void fillDetectionMessage(
+    const std_msgs::msg::Header & header,
+    const std::vector<SimpleDetection> & detections,
+    const ImageMeta & meta,
+    Detection2DArrayMsg & out_msg) const;
+
+protected:
+  /**
+   * @brief Transform detection coordinates from preprocessed to original image space
+   * @param det Detection to transform (modified in-place)
+   * @param meta Image metadata (scale, padding)
+   * @param use_letterbox True if letterbox resize was used
+   *
+   * Transforms bounding box coordinates from preprocessed image space
+   * (model input size) to original image space using scale and padding
+   * information from preprocessing.
+   */
+  void adjustToOriginal(SimpleDetection & det, const ImageMeta & meta, bool use_letterbox) const;
+
+  /**
+   * @brief Apply Non-Maximum Suppression to detections
+   * @param dets Vector of detections (passed by value; the return value is the filtered set)
+   * @param iou_threshold IoU threshold for NMS
+   * @return Filtered detections after NMS
+   *
+   * Removes overlapping detections with lower scores. Detections are
+   * sorted by score (descending) and suppressed if IoU > threshold.
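+   *
+   * For reference, the IoU used by the suppression step is the standard
+   * ratio:
+   * @code
+   * IoU(a, b) = intersection(a, b) / (area(a) + area(b) - intersection(a, b))
+   * @endcode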
+   */
+  std::vector<SimpleDetection> applyNms(std::vector<SimpleDetection> dets, float iou_threshold) const;
+
+  /**
+   * @brief Calculate Intersection over Union (IoU) between two detections
+   * @param a First detection
+   * @param b Second detection
+   * @return IoU value [0, 1]
+   */
+  static float iou(const SimpleDetection & a, const SimpleDetection & b);
+
+  /**
+   * @brief Get class label string for a class ID
+   * @param class_id Class ID
+   * @param class_names Vector of class name strings
+   * @return Class label string (name if available, otherwise ID as string)
+   */
+  std::string classLabel(int class_id, const std::vector<std::string> & class_names) const;
+
+  /**
+   * @brief Apply score activation function
+   * @param raw_score Raw score from model
+   * @return Activated score
+   *
+   * Applies activation according to config:
+   * - sigmoid: 1 / (1 + exp(-x))
+   * - softmax: handled over the whole class vector in decode(), so this returns the raw score
+   * - none: return raw score
+   */
+  float applyActivation(float raw_score) const;
+
+private:
+  /**
+   * @brief Extract a single value from output tensor
+   * @param data Pointer to tensor data (flattened array)
+   * @param batch_idx Batch index
+   * @param detection_idx Detection index
+   * @param feature_idx Feature index
+   * @param shape Tensor shape for index calculation
+   * @return Extracted float value
+   *
+   * Calculates linear index from multi-dimensional indices and extracts value.
+   * Handles both standard and transposed layouts.
+   */
+  float extractValue(
+    const float * data,
+    size_t batch_idx,
+    size_t detection_idx,
+    size_t feature_idx,
+    const std::vector<size_t> & shape) const;
+
+  /**
+   * @brief Convert bbox data from tensor to SimpleDetection
+   * @param bbox_data Pointer to bbox coordinates in tensor
+   * @param batch_idx Batch index
+   * @param detection_idx Detection index
+   * @param shape Tensor shape
+   * @param det Output detection (bbox coordinates filled)
+   *
+   * Extracts 4 bbox coordinates and converts from model format (cxcywh, xyxy, or xywh)
+   * to SimpleDetection format (x, y, width, height in original image space).
+   */
+  void convertBbox(
+    const float * bbox_data,
+    size_t batch_idx,
+    size_t detection_idx,
+    const std::vector<size_t> & shape,
+    SimpleDetection & det) const;
+
+  /// Postprocessing configuration (thresholds, NMS, activation)
+  PostprocessingConfig config_;
+  /// Output tensor layout configuration
+  OutputLayout layout_;
+  /// Bounding box format (cxcywh, xyxy, or xywh)
+  BboxFormat bbox_format_;
+  /// Number of detection classes
+  int num_classes_;
+  /// Class name strings (empty if using class IDs)
+  std::vector<std::string> class_names_;
+  /// True if letterbox resize was used (affects coordinate transformation)
+  bool use_letterbox_;
+};
+
+}  // namespace deep_object_detection
diff --git a/deep_object_detection/include/deep_object_detection/image_preprocessor.hpp b/deep_object_detection/include/deep_object_detection/image_preprocessor.hpp
new file mode 100644
index 0000000..fd59651
--- /dev/null
+++ b/deep_object_detection/include/deep_object_detection/image_preprocessor.hpp
@@ -0,0 +1,162 @@
+// Copyright (c) 2025-present WATonomous. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * @file image_preprocessor.hpp
+ * @brief Image preprocessing for object detection models
+ *
+ * This header defines the ImagePreprocessor class which:
+ * - Resizes images (letterbox, resize, crop, or pad)
+ * - Converts color formats (BGR to RGB)
+ * - Applies normalization (scale_0_1, imagenet, custom, or none)
+ * - Batches multiple images into a single tensor
+ * - Tracks metadata for coordinate transformation
+ */
+
+#pragma once
+
+#include
+#include
+
+#include
+
+#include "deep_object_detection/detection_types.hpp"
+
+namespace deep_object_detection
+{
+
+/**
+ * @brief Preprocesses images for model input
+ *
+ * Handles image preprocessing pipeline:
+ * - Resizing (letterbox, resize, crop, or pad)
+ * - Color format conversion (BGR to RGB)
+ * - Normalization (scale_0_1, imagenet, custom, or none)
+ * - Batching multiple images into a single tensor
+ *
+ * The preprocessor maintains aspect ratio when using letterbox resize
+ * and tracks metadata (scale, padding) for coordinate transformation
+ * in postprocessing.
+ */
+class ImagePreprocessor
+{
+public:
+  /**
+   * @brief Construct the preprocessor with configuration
+   * @param config Preprocessing configuration (resize method, normalization, etc.)
+   */
+  explicit ImagePreprocessor(const PreprocessingConfig & config);
+
+  /**
+   * @brief Preprocess a single BGR image
+   * @param bgr Input BGR image (OpenCV Mat)
+   * @param meta Output metadata (original size, scale, padding) for coordinate transformation
+   * @return Preprocessed image ready for model input
+   *
+   * Applies resize, color conversion, and normalization according to config.
+   * Updates meta with transformation parameters needed for postprocessing.
+   */
+  cv::Mat preprocess(const cv::Mat & bgr, ImageMeta & meta) const;
+
+  /**
+   * @brief Pack a batch of preprocessed images into a single tensor
+   * @param images Vector of preprocessed images (same size, normalized)
+   * @return Reference to PackedInput containing flattened float array and shape
+   *
+   * Converts batch of OpenCV Mats to a single flattened float array
+   * in NCHW format (batch, channels, height, width). Uses internal cache
+   * to avoid reallocation. Thread-safe for read-only access.
+   */
+  const PackedInput & pack(const std::vector<cv::Mat> & images) const;
+
+  /**
+   * @brief Get preprocessing configuration
+   * @return Reference to preprocessing configuration
+   */
+  const PreprocessingConfig & config() const
+  {
+    return config_;
+  }
+
+private:
+  /**
+   * @brief Apply letterbox resize (maintain aspect ratio, pad with gray)
+   * @param bgr Input BGR image
+   * @param meta Output metadata (scale and padding info)
+   * @return Resized image with letterbox padding
+   *
+   * Maintains aspect ratio by scaling to fit within input dimensions,
+   * then pads with gray (114) to reach exact input size. Updates meta
+   * with scale factors and padding offsets.
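+   *
+   * The usual letterbox arithmetic, as a sketch (the implementation below
+   * is authoritative):
+   * @code
+   * scale = min(input_width / w, input_height / h)
+   * new_w = round(w * scale);  new_h = round(h * scale)
+   * pad_x = (input_width - new_w) / 2;  pad_y = (input_height - new_h) / 2
+   * @endcode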
+ */ + cv::Mat applyLetterbox(const cv::Mat & bgr, ImageMeta & meta) const; + + /** + * @brief Apply simple resize (stretch to input size) + * @param bgr Input BGR image + * @param meta Output metadata (scale info) + * @return Resized image (may distort aspect ratio) + * + * Stretches image to exact input dimensions. Updates meta with scale factors. + */ + cv::Mat applyResize(const cv::Mat & bgr, ImageMeta & meta) const; + + /** + * @brief Apply center crop to input size + * @param bgr Input BGR image + * @param meta Output metadata (scale info) + * @return Cropped image + * + * Crops center region to input dimensions. Updates meta with scale factors. + */ + cv::Mat applyCrop(const cv::Mat & bgr, ImageMeta & meta) const; + + /** + * @brief Apply padding to input size + * @param bgr Input BGR image + * @param meta Output metadata (padding info) + * @return Padded image + * + * Pads image with gray (114) to reach input dimensions. Updates meta with padding offsets. + */ + cv::Mat applyPad(const cv::Mat & bgr, ImageMeta & meta) const; + + /** + * @brief Apply normalization to image + * @param image Image to normalize (modified in-place) + * + * Applies normalization according to config: + * - scale_0_1: divide by 255.0 + * - imagenet: subtract mean [0.485, 0.456, 0.406], divide by std [0.229, 0.224, 0.225] + * - custom: use config mean and std + * - none: no normalization + */ + void applyNormalization(cv::Mat & image) const; + + /** + * @brief Convert BGR to RGB if needed + * @param image Image to convert (modified in-place) + * + * Converts BGR to RGB if color_format is "rgb" in config. + * No-op if color_format is "bgr". + */ + void applyColorConversion(cv::Mat & image) const; + + /// Preprocessing configuration + PreprocessingConfig config_; + /// Cached packed input (reused to avoid reallocation) + mutable PackedInput packed_input_cache_; +}; + +} // namespace deep_object_detection diff --git a/deep_object_detection/launch/deep_object_detection.launch.yaml b/deep_object_detection/launch/deep_object_detection.launch.yaml new file mode 100644 index 0000000..2f3d8ef --- /dev/null +++ b/deep_object_detection/launch/deep_object_detection.launch.yaml @@ -0,0 +1,64 @@ +# Copyright (c) 2025-present WATonomous. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
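+
+# Example invocation (the topic names below are illustrative):
+#   ros2 launch deep_object_detection deep_object_detection.launch.yaml \
+#     input_topic:=/camera/front/image_compressed \
+#     output_detections_topic:=/perception/detections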
+
+launch:
+  - arg:
+      name: "config_file"
+      default: "$(find-pkg-share deep_object_detection)/config/generic_model_params.yaml"
+      description: "Path to YAML config for object detection"
+
+  - arg:
+      name: "input_topic"
+      default: "/multi_camera_sync/multi_image_compressed"
+      description: "Input MultiImage topic (configured via remapping)"
+
+  - arg:
+      name: "output_detections_topic"
+      default: "/detections"
+      description: "Output detections topic (configured via remapping)"
+
+  - arg:
+      name: "use_lifecycle_manager"
+      default: "true"
+      description: "Whether to use nav2_lifecycle_manager for automatic lifecycle transitions"
+
+  - node:
+      pkg: "deep_object_detection"
+      exec: "deep_object_detection_node"
+      name: "deep_object_detection_node"
+      output: "screen"
+      remap:
+        - from: "/multi_camera_sync/multi_image_compressed"
+          to: "$(var input_topic)"
+        - from: "/detections"
+          to: "$(var output_detections_topic)"
+      param:
+        - from: "$(var config_file)"
+        - name: "input_topic"
+          value: "$(var input_topic)"
+        - name: "output_detections_topic"
+          value: "$(var output_detections_topic)"
+
+  - node:
+      pkg: "nav2_lifecycle_manager"
+      exec: "lifecycle_manager"
+      name: "detection_lifecycle_manager"
+      output: "screen"
+      param:
+        - name: "node_names"
+          value: ["deep_object_detection_node"]
+        - name: "autostart"
+          value: true
+        - name: "bond_timeout"
+          value: 0.0
diff --git a/deep_object_detection/package.xml b/deep_object_detection/package.xml
new file mode 100644
index 0000000..7f9ca80
--- /dev/null
+++ b/deep_object_detection/package.xml
@@ -0,0 +1,36 @@
+<?xml version="1.0"?>
+<package format="3">
+  <name>deep_object_detection</name>
+  <version>0.1.0</version>
+  <description>
+    Generic model-agnostic deep learning object detection node using ONNX Runtime with explicit provider selection (TensorRT/CUDA/CPU).
+    Works with any ONNX-compatible object detection model - automatically detects and adapts to model output format.
+    Features dynamic batching, multi-camera support, configurable preprocessing/postprocessing, and fail-fast error handling.
+  </description>
+
+  <maintainer>WATonomous</maintainer>
+  <license>Apache License 2.0</license>
+
+  <buildtool_depend>ament_cmake</buildtool_depend>
+
+  <depend>rclcpp</depend>
+  <depend>rclcpp_components</depend>
+  <depend>rclcpp_lifecycle</depend>
+  <depend>sensor_msgs</depend>
+  <depend>cv_bridge</depend>
+  <depend>vision_msgs</depend>
+  <depend>deep_core</depend>
+  <depend>deep_msgs</depend>
+  <depend>deep_ort_backend_plugin</depend>
+  <depend>deep_ort_gpu_backend_plugin</depend>
+  <depend>onnxruntime_vendor</depend>
+  <depend>onnxruntime_gpu_vendor</depend>
+  <depend>opencv2</depend>
+  <depend>rcl_interfaces</depend>
+
+  <test_depend>deep_test</test_depend>
+
+  <export>
+    <build_type>ament_cmake</build_type>
+  </export>
+</package>
diff --git a/deep_object_detection/src/backend_manager.cpp b/deep_object_detection/src/backend_manager.cpp
new file mode 100644
index 0000000..2492819
--- /dev/null
+++ b/deep_object_detection/src/backend_manager.cpp
@@ -0,0 +1,277 @@
+// Copyright (c) 2025-present WATonomous. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "deep_object_detection/backend_manager.hpp" + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace deep_object_detection +{ + +BackendManager::BackendManager(rclcpp_lifecycle::LifecycleNode & node, const DetectionParams & params) +: node_(node) +, params_(params) +, plugin_loader_( + std::make_unique>("deep_core", "deep_ros::DeepBackendPlugin")) +{} + +BackendManager::~BackendManager() +{ + plugin_holder_.reset(); + executor_.reset(); + allocator_.reset(); +} + +void BackendManager::initialize() +{ + provider_ = parseProvider(params_.preferred_provider); + initializeBackend(); +} + +deep_ros::Tensor BackendManager::infer(const PackedInput & input) +{ + if (!executor_) { + RCLCPP_ERROR(node_.get_logger(), "Cannot perform inference: no backend executor initialized"); + throw std::runtime_error("Cannot perform inference: no backend executor initialized"); + } + + auto input_tensor = buildInputTensor(input); + return executor_->run_inference(input_tensor); +} + +std::vector BackendManager::inferAllOutputs(const PackedInput & input) +{ + if (!executor_) { + RCLCPP_ERROR(node_.get_logger(), "Cannot perform inference: no backend executor initialized"); + throw std::runtime_error("Cannot perform inference: no backend executor initialized"); + } + + auto input_tensor = buildInputTensor(input); + auto output = executor_->run_inference(input_tensor); + return {output}; +} + +Provider BackendManager::parseProvider(const std::string & provider_str) const +{ + std::string normalized = provider_str; + std::transform(normalized.begin(), normalized.end(), normalized.begin(), [](unsigned char c) { + return static_cast(std::tolower(c)); + }); + + if (normalized == "cpu") { + return Provider::CPU; + } else if (normalized == "cuda") { + return Provider::CUDA; + } else if (normalized == "tensorrt") { + return Provider::TENSORRT; + } else { + throw std::runtime_error("Unknown provider: " + provider_str + ". 
Valid options: cpu, cuda, tensorrt"); + } +} + +void BackendManager::initializeBackend() +{ + if ((provider_ == Provider::TENSORRT || provider_ == Provider::CUDA) && !isCudaRuntimeAvailable()) { + std::string error = "Provider " + providerToString(provider_) + + " requires CUDA runtime libraries (libcudart/libcuda) which are not available"; + throw std::runtime_error(error); + } + + const std::string plugin_name = providerToPluginName(provider_); + if (plugin_name.empty()) { + throw std::runtime_error("No plugin name for provider: " + providerToString(provider_)); + } + + const auto provider_name = providerToString(provider_); + + if (!node_.has_parameter("Backend.execution_provider")) { + node_.declare_parameter("Backend.execution_provider", provider_name); + } else { + node_.set_parameters({rclcpp::Parameter("Backend.execution_provider", provider_name)}); + } + + if (!node_.has_parameter("Backend.device_id")) { + node_.declare_parameter("Backend.device_id", params_.device_id); + } else { + node_.set_parameters({rclcpp::Parameter("Backend.device_id", params_.device_id)}); + } + + if (!node_.has_parameter("Backend.trt_engine_cache_enable")) { + node_.declare_parameter("Backend.trt_engine_cache_enable", params_.enable_trt_engine_cache); + } else { + node_.set_parameters({rclcpp::Parameter("Backend.trt_engine_cache_enable", params_.enable_trt_engine_cache)}); + } + + if (!node_.has_parameter("Backend.trt_engine_cache_path")) { + node_.declare_parameter("Backend.trt_engine_cache_path", params_.trt_engine_cache_path); + } else { + node_.set_parameters({rclcpp::Parameter("Backend.trt_engine_cache_path", params_.trt_engine_cache_path)}); + } + + auto node_ptr = node_.shared_from_this(); + plugin_holder_ = plugin_loader_->createUniqueInstance(plugin_name); + plugin_holder_->initialize(node_ptr); + allocator_ = plugin_holder_->get_allocator(); + executor_ = plugin_holder_->get_inference_executor(); + + if (!executor_ || !allocator_) { + throw std::runtime_error("Executor or allocator is null after plugin initialization"); + } + + if (params_.model_path.empty()) { + throw std::runtime_error("Model path is not set"); + } + + if (!executor_->load_model(params_.model_path)) { + throw std::runtime_error("Failed to load model: " + params_.model_path); + } + + active_provider_ = providerToString(provider_); + declareActiveProviderParameter(active_provider_); + warmupTensorShapeCache(); + + RCLCPP_INFO( + node_.get_logger(), + "Initialized backend using provider: %s (device %d)", + active_provider_.c_str(), + params_.device_id); +} + +void BackendManager::warmupTensorShapeCache() +{ + if (!params_.warmup_tensor_shapes) { + return; + } + if (provider_ == Provider::CPU) { + return; + } + if (!executor_ || !allocator_) { + return; + } + const size_t channels = RGB_CHANNELS; + const size_t height = static_cast(params_.preprocessing.input_height); + const size_t width = static_cast(params_.preprocessing.input_width); + const size_t per_image = channels * height * width; + + const int batch = params_.max_batch_size; + RCLCPP_INFO( + node_.get_logger(), "Priming %s backend tensor shapes for batch size %d", active_provider_.c_str(), batch); + + PackedInput dummy; + dummy.shape = {static_cast(batch), channels, height, width}; + dummy.data.assign(static_cast(batch) * per_image, 0.0f); + + auto input_tensor = buildInputTensor(dummy); + (void)executor_->run_inference(input_tensor); + RCLCPP_DEBUG(node_.get_logger(), "Cached tensor shape for batch size %d", batch); +} + +bool BackendManager::isCudaRuntimeAvailable() 
const +{ + const char * libs[] = {"libcudart.so.12", "libcudart.so.11", "libcudart.so", "libcuda.so.1", "libcuda.so"}; + for (const auto * lib : libs) { + void * handle = dlopen(lib, RTLD_LAZY | RTLD_LOCAL); + if (handle) { + dlclose(handle); + return true; + } + } + return false; +} + +std::string BackendManager::providerToString(Provider provider) const +{ + switch (provider) { + case Provider::TENSORRT: + return "tensorrt"; + case Provider::CUDA: + return "cuda"; + case Provider::CPU: + return "cpu"; + default: + return "unknown"; + } +} + +void BackendManager::declareActiveProviderParameter(const std::string & value) +{ + rcl_interfaces::msg::ParameterDescriptor desc; + desc.read_only = true; + desc.description = "Current execution provider in use (read-only)"; + + if (node_.has_parameter("active_provider")) { + (void)node_.set_parameters({rclcpp::Parameter("active_provider", value)}); + } else { + node_.declare_parameter("active_provider", value, desc); + } +} + +deep_ros::Tensor BackendManager::buildInputTensor(const PackedInput & packed) const +{ + deep_ros::Tensor tensor(packed.shape, deep_ros::DataType::FLOAT32, allocator_); + const size_t bytes = packed.data.size() * sizeof(float); + allocator_->copy_from_host(tensor.data(), packed.data.data(), bytes); + return tensor; +} + +std::string BackendManager::providerToPluginName(Provider provider) const +{ + switch (provider) { + case Provider::TENSORRT: + case Provider::CUDA: + return "onnxruntime_gpu"; + case Provider::CPU: + return "onnxruntime_cpu"; + default: + return ""; + } +} + +std::vector BackendManager::getOutputShape(const std::vector & input_shape) const +{ + if (!executor_ || !allocator_) { + RCLCPP_ERROR( + node_.get_logger(), + "Cannot get output shape: backend not initialized (executor: %s, allocator: %s)", + executor_ ? "available" : "null", + allocator_ ? "available" : "null"); + throw std::runtime_error("Cannot get output shape: backend executor or allocator not available"); + } + + // NO TRY-CATCH - let exceptions propagate + PackedInput dummy; + dummy.shape = input_shape; + size_t total_elements = 1; + for (size_t dim : input_shape) { + total_elements *= dim; + } + dummy.data.assign(total_elements, 0.0f); + + auto input_tensor = buildInputTensor(dummy); + auto output_tensor = executor_->run_inference(input_tensor); + return output_tensor.shape(); +} + +} // namespace deep_object_detection diff --git a/deep_object_detection/src/deep_object_detection_node.cpp b/deep_object_detection/src/deep_object_detection_node.cpp new file mode 100644 index 0000000..84cbd91 --- /dev/null +++ b/deep_object_detection/src/deep_object_detection_node.cpp @@ -0,0 +1,599 @@ +// Copyright (c) 2025-present WATonomous. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
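+
+// Overview of the processing pipeline implemented in this file (a summary,
+// in execution order):
+//   MultiImage msg -> cv::imdecode -> bounded image queue -> batch timer
+//   -> preprocess -> pack (NCHW float batch) -> backend inference
+//   -> postprocess (decode + NMS) -> publish Detection2DArray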
+ +#include "deep_object_detection/deep_object_detection_node.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "deep_object_detection/backend_manager.hpp" +#include "deep_object_detection/detection_types.hpp" +#include "deep_object_detection/generic_postprocessor.hpp" + +namespace deep_object_detection +{ + +DeepObjectDetectionNode::DeepObjectDetectionNode(const rclcpp::NodeOptions & options) +: LifecycleNode("deep_object_detection_node", options) +, dropped_images_count_(0) +{ + declareAndReadParameters(); + RCLCPP_INFO(this->get_logger(), "Deep object detection node created, waiting for configuration"); +} + +void DeepObjectDetectionNode::declareAndReadParameters() +{ + params_.model_path = this->declare_parameter("model_path", ""); + + auto class_names_path = this->declare_parameter("class_names_path", ""); + params_.model_metadata.num_classes = this->declare_parameter("model.num_classes", 80); + params_.model_metadata.class_names_file = class_names_path; + auto bbox_format_str = this->declare_parameter("model.bbox_format", "cxcywh"); + params_.model_metadata.bbox_format = stringToBboxFormat(bbox_format_str); + + params_.preprocessing.input_width = this->declare_parameter("preprocessing.input_width", 640); + params_.preprocessing.input_height = this->declare_parameter("preprocessing.input_height", 640); + auto normalization_type_str = this->declare_parameter("preprocessing.normalization_type", "scale_0_1"); + params_.preprocessing.normalization_type = stringToNormalizationType(normalization_type_str); + auto mean_d = this->declare_parameter>("preprocessing.mean", {0.0, 0.0, 0.0}); + auto std_d = this->declare_parameter>("preprocessing.std", {1.0, 1.0, 1.0}); + params_.preprocessing.mean.clear(); + params_.preprocessing.std.clear(); + params_.preprocessing.mean.reserve(mean_d.size()); + params_.preprocessing.std.reserve(std_d.size()); + std::transform(mean_d.begin(), mean_d.end(), std::back_inserter(params_.preprocessing.mean), [](double v) { + return static_cast(v); + }); + std::transform(std_d.begin(), std_d.end(), std::back_inserter(params_.preprocessing.std), [](double v) { + return static_cast(v); + }); + auto resize_method_str = this->declare_parameter("preprocessing.resize_method", "letterbox"); + params_.preprocessing.resize_method = stringToResizeMethod(resize_method_str); + params_.preprocessing.pad_value = 114; + params_.preprocessing.color_format = this->declare_parameter("preprocessing.color_format", "rgb"); + + params_.postprocessing.score_threshold = + static_cast(this->declare_parameter("postprocessing.score_threshold", 0.25)); + params_.postprocessing.nms_iou_threshold = + static_cast(this->declare_parameter("postprocessing.nms_iou_threshold", 0.45)); + params_.postprocessing.max_detections = this->declare_parameter("postprocessing.max_detections", 300); + auto score_activation_str = this->declare_parameter("postprocessing.score_activation", "sigmoid"); + params_.postprocessing.score_activation = stringToScoreActivation(score_activation_str); + params_.postprocessing.enable_nms = true; + + auto class_score_mode_str = this->declare_parameter("postprocessing.class_score_mode", "all_classes"); + params_.postprocessing.class_score_mode = stringToClassScoreMode(class_score_mode_str); + params_.postprocessing.class_score_start_idx = -1; + params_.postprocessing.class_score_count = -1; + 
params_.postprocessing.coordinate_space = CoordinateSpace::PREPROCESSED; + + params_.postprocessing.use_multi_output = this->declare_parameter("postprocessing.use_multi_output", false); + if (params_.postprocessing.use_multi_output) { + params_.postprocessing.output_boxes_idx = this->declare_parameter("postprocessing.output_boxes_idx", 0); + params_.postprocessing.output_scores_idx = this->declare_parameter("postprocessing.output_scores_idx", 1); + params_.postprocessing.output_classes_idx = this->declare_parameter("postprocessing.output_classes_idx", 2); + } else { + params_.postprocessing.output_boxes_idx = 0; + params_.postprocessing.output_scores_idx = 1; + params_.postprocessing.output_classes_idx = 2; + } + + params_.postprocessing.layout.auto_detect = this->declare_parameter("postprocessing.layout.auto_detect", true); + if (!params_.postprocessing.layout.auto_detect) { + params_.postprocessing.layout.batch_dim = this->declare_parameter("postprocessing.layout.batch_dim", 0); + params_.postprocessing.layout.detection_dim = + this->declare_parameter("postprocessing.layout.detection_dim", 1); + params_.postprocessing.layout.feature_dim = this->declare_parameter("postprocessing.layout.feature_dim", 2); + params_.postprocessing.layout.bbox_start_idx = + this->declare_parameter("postprocessing.layout.bbox_start_idx", 0); + params_.postprocessing.layout.bbox_count = this->declare_parameter("postprocessing.layout.bbox_count", 4); + params_.postprocessing.layout.score_idx = this->declare_parameter("postprocessing.layout.score_idx", 4); + params_.postprocessing.layout.class_idx = this->declare_parameter("postprocessing.layout.class_idx", 5); + } else { + params_.postprocessing.layout.batch_dim = 0; + params_.postprocessing.layout.detection_dim = 1; + params_.postprocessing.layout.feature_dim = 2; + params_.postprocessing.layout.bbox_start_idx = 0; + params_.postprocessing.layout.bbox_count = 4; + params_.postprocessing.layout.score_idx = 4; + params_.postprocessing.layout.class_idx = 5; + } + + input_topic_ = this->declare_parameter("input_topic", ""); + params_.input_qos_reliability = "best_effort"; + params_.output_detections_topic = this->declare_parameter("output_detections_topic", "/detections"); + + params_.max_batch_size = this->declare_parameter("max_batch_size", 3); + params_.queue_size = this->declare_parameter("queue_size", 10); + if (params_.queue_size < 0) { + RCLCPP_WARN( + this->get_logger(), + "queue_size (%d) is negative, clamping to 0 (unlimited internal queue). 
" + "Note: ROS QoS depth will use minimum of 1.", + params_.queue_size); + params_.queue_size = 0; + } + params_.queue_overflow_policy = QueueOverflowPolicy::DROP_OLDEST; + params_.decode_failure_policy = DecodeFailurePolicy::DROP; + + params_.preferred_provider = this->declare_parameter("preferred_provider", "tensorrt"); + params_.device_id = this->declare_parameter("device_id", 0); + params_.warmup_tensor_shapes = true; + params_.enable_trt_engine_cache = this->declare_parameter("enable_trt_engine_cache", false); + params_.trt_engine_cache_path = + this->declare_parameter("trt_engine_cache_path", "/tmp/deep_ros_ort_trt_cache"); +} + +rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn DeepObjectDetectionNode::on_configure( + const rclcpp_lifecycle::State &) +{ + RCLCPP_INFO(this->get_logger(), "Configuring deep object detection node"); + + try { + loadClassNames(); + preprocessor_ = std::make_unique(params_.preprocessing); + backend_manager_ = std::make_unique(*this, params_); + + try { + backend_manager_->initialize(); + } catch (const std::exception & e) { + RCLCPP_ERROR(this->get_logger(), "Backend initialization failed: %s", e.what()); + cleanupPartialConfiguration(); + return rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn::FAILURE; + } + + const size_t batch = static_cast(params_.max_batch_size); + const size_t channels = RGB_CHANNELS; + const size_t height = static_cast(params_.preprocessing.input_height); + const size_t width = static_cast(params_.preprocessing.input_width); + std::vector input_shape = {batch, channels, height, width}; + std::vector output_shape = backend_manager_->getOutputShape(input_shape); + + auto formatShape = [](const std::vector & shape) { + if (shape.empty()) return std::string("auto-detect"); + std::string result; + for (size_t i = 0; i < shape.size(); ++i) { + result += std::to_string(shape[i]); + if (i + 1 < shape.size()) result += ", "; + } + return result; + }; + + if (!output_shape.empty()) { + RCLCPP_INFO(this->get_logger(), "Detected model output shape: [%s]", formatShape(output_shape).c_str()); + } + + const auto & config = params_.postprocessing; + const auto & model_meta = params_.model_metadata; + const bool use_letterbox = (params_.preprocessing.resize_method == ResizeMethod::LETTERBOX); + + GenericPostprocessor::OutputLayout layout = + GenericPostprocessor::autoConfigure(output_shape, params_.postprocessing.layout); + if (layout.auto_detect && !output_shape.empty()) { + RCLCPP_INFO( + this->get_logger(), + "Auto-detected layout: batch_dim=%zu, detection_dim=%zu, feature_dim=%zu", + layout.batch_dim, + layout.detection_dim, + layout.feature_dim); + } else if (!layout.auto_detect) { + RCLCPP_INFO( + this->get_logger(), + "Using manual layout: batch_dim=%zu, detection_dim=%zu, feature_dim=%zu", + layout.batch_dim, + layout.detection_dim, + layout.feature_dim); + } else { + RCLCPP_INFO(this->get_logger(), "Layout will be auto-detected from first inference"); + } + + postprocessor_ = std::make_unique( + config, layout, model_meta.bbox_format, model_meta.num_classes, params_.class_names, use_letterbox); + + auto pub = this->create_publisher(params_.output_detections_topic, rclcpp::SensorDataQoS{}); + detection_pub_ = std::static_pointer_cast>(pub); + + if (input_topic_.empty()) { + RCLCPP_ERROR(this->get_logger(), "input_topic is empty. 
+      cleanupPartialConfiguration();
+      return rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn::FAILURE;
+    }
+    RCLCPP_INFO(this->get_logger(), "Will subscribe to MultiImage topic: %s", input_topic_.c_str());
+
+    RCLCPP_INFO(
+      this->get_logger(),
+      "Deep object detection node configured. Model: %s, input size: %dx%d, batch_size: %d, provider: %s, "
+      "postprocessor: %s, output shape: [%s]",
+      params_.model_path.c_str(),
+      params_.preprocessing.input_width,
+      params_.preprocessing.input_height,
+      params_.max_batch_size,
+      backend_manager_->activeProvider().c_str(),
+      postprocessor_->getFormatName().c_str(),
+      formatShape(output_shape).c_str());
+
+    return rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn::SUCCESS;
+  } catch (const std::exception & e) {
+    RCLCPP_ERROR(this->get_logger(), "Failed to configure: %s", e.what());
+    cleanupPartialConfiguration();
+    return rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn::FAILURE;
+  }
+}
+
+rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn DeepObjectDetectionNode::on_activate(
+  const rclcpp_lifecycle::State &)
+{
+  RCLCPP_INFO(this->get_logger(), "Activating deep object detection node");
+
+  try {
+    callback_group_ = this->create_callback_group(rclcpp::CallbackGroupType::Reentrant);
+
+    setupSubscription();
+
+    batch_timer_ = this->create_wall_timer(
+      std::chrono::milliseconds(5), std::bind(&DeepObjectDetectionNode::onBatchTimer, this), callback_group_);
+
+    if (detection_pub_) {
+      detection_pub_->on_activate();
+    }
+
+    RCLCPP_INFO(this->get_logger(), "Deep object detection node activated and ready to process images");
+    return rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn::SUCCESS;
+  } catch (const std::exception & e) {
+    RCLCPP_ERROR(this->get_logger(), "Failed to activate: %s", e.what());
+    return rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn::FAILURE;
+  }
+}
+
+void DeepObjectDetectionNode::cleanupPartialConfiguration()
+{
+  postprocessor_.reset();
+  backend_manager_.reset();
+  preprocessor_.reset();
+  detection_pub_.reset();
+}
+
+void DeepObjectDetectionNode::cleanupAllResources()
+{
+  stopSubscriptionsAndTimer();
+  backend_manager_.reset();
+  preprocessor_.reset();
+  postprocessor_.reset();
+  params_.class_names.clear();
+  detection_pub_.reset();
+}
+
+void DeepObjectDetectionNode::stopSubscriptionsAndTimer()
+{
+  if (batch_timer_) {
+    batch_timer_->cancel();
+    batch_timer_.reset();
+  }
+
+  if (multi_image_sub_) {
+    multi_image_sub_.reset();
+  }
+
+  {
+    std::lock_guard<std::mutex> lock(queue_mutex_);
+    image_queue_.clear();
+  }
+}
+
+rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn DeepObjectDetectionNode::on_deactivate(
+  const rclcpp_lifecycle::State &)
+{
+  RCLCPP_INFO(this->get_logger(), "Deactivating deep object detection node");
+  stopSubscriptionsAndTimer();
+  if (detection_pub_) {
+    detection_pub_->on_deactivate();
+  }
+
+  RCLCPP_INFO(this->get_logger(), "Deep object detection node deactivated");
+  return rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn::SUCCESS;
+}
+
+rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn DeepObjectDetectionNode::on_cleanup(
+  const rclcpp_lifecycle::State &)
+{
+  RCLCPP_INFO(this->get_logger(), "Cleaning up deep object detection node");
+  cleanupAllResources();
+  RCLCPP_INFO(this->get_logger(), "Deep object detection node cleaned up");
+  return rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn::SUCCESS;
+}
+
+rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn DeepObjectDetectionNode::on_shutdown(
+  const rclcpp_lifecycle::State &)
+{
+  RCLCPP_INFO(this->get_logger(), "Shutting down deep object detection node");
+  cleanupAllResources();
+  RCLCPP_INFO(this->get_logger(), "Deep object detection node shut down");
+  return rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn::SUCCESS;
+}
+
+void DeepObjectDetectionNode::setupSubscription()
+{
+  const size_t qos_depth = std::max(static_cast<size_t>(1), static_cast<size_t>(params_.queue_size));
+  auto qos_profile = rclcpp::QoS(rclcpp::KeepLast(qos_depth));
+  qos_profile.best_effort();
+
+  rclcpp::SubscriptionOptions options;
+  options.callback_group = callback_group_;
+
+  multi_image_sub_ = this->create_subscription<deep_msgs::msg::MultiImage>(
+    input_topic_,
+    qos_profile,
+    [this](const deep_msgs::msg::MultiImage::ConstSharedPtr msg) {
+      try {
+        this->onMultiImage(msg);
+      } catch (const std::exception & e) {
+        RCLCPP_ERROR(this->get_logger(), "Exception in onMultiImage callback: %s", e.what());
+      }
+    },
+    options);
+  RCLCPP_INFO(this->get_logger(), "Subscribed to MultiImage topic: %s", input_topic_.c_str());
+}
+
+void DeepObjectDetectionNode::onMultiImage(const deep_msgs::msg::MultiImage::ConstSharedPtr & msg)
+{
+  RCLCPP_DEBUG(this->get_logger(), "Received MultiImage message with %zu images", msg->images.size());
+  for (size_t i = 0; i < msg->images.size(); ++i) {
+    const auto & compressed_img = msg->images[i];
+    try {
+      handleCompressedImage(compressed_img);
+    } catch (const std::exception & e) {
+      RCLCPP_ERROR(this->get_logger(), "Exception processing image %zu in MultiImage: %s", i, e.what());
+    }
+  }
+  RCLCPP_DEBUG(this->get_logger(), "Processed MultiImage message with %zu synchronized images", msg->images.size());
+}
+
+void DeepObjectDetectionNode::handleCompressedImage(const sensor_msgs::msg::CompressedImage & msg)
+{
+  cv::Mat decoded = cv::imdecode(msg.data, cv::IMREAD_COLOR);
+  if (decoded.empty()) {
+    RCLCPP_WARN_THROTTLE(
+      this->get_logger(), *this->get_clock(), 1000, "Failed to decode compressed image, dropping frame");
+    return;
+  }
+
+  const int width = decoded.cols;
+  const int height = decoded.rows;
+  enqueueImage(std::move(decoded), msg.header);
+  RCLCPP_DEBUG(
+    this->get_logger(), "Enqueued decoded image (size: %zu bytes, decoded: %dx%d)", msg.data.size(), width, height);
+}
+
+void DeepObjectDetectionNode::enqueueImage(cv::Mat image, const std_msgs::msg::Header & header)
+{
+  size_t queue_size_after = 0;
+  bool dropped = false;
+  {
+    std::lock_guard<std::mutex> lock(queue_mutex_);
+    const size_t limit = static_cast<size_t>(params_.queue_size);
+    if (limit > 0 && image_queue_.size() >= limit) {
+      if (!image_queue_.empty()) {
+        image_queue_.pop_front();
+        dropped = true;
+        dropped_images_count_++;
+      }
+    }
+
+    image_queue_.push_back({std::move(image), header});
+    queue_size_after = image_queue_.size();
+  }
+
+  if (dropped) {
+    RCLCPP_WARN(
+      this->get_logger(),
+      "Queue overflow: dropped oldest image (queue size: %zu, limit: %zu, total dropped: %zu)",
+      queue_size_after,
+      static_cast<size_t>(params_.queue_size),
+      dropped_images_count_);
+  }
+}
+
+void DeepObjectDetectionNode::onBatchTimer()
+{
+  if (this->get_current_state().id() != lifecycle_msgs::msg::State::PRIMARY_STATE_ACTIVE) {
+    return;
+  }
+
+  std::vector batch;
+  const size_t batch_size = static_cast<size_t>(params_.max_batch_size);
+
+  {
+    std::lock_guard<std::mutex> lock(queue_mutex_);
+    if (image_queue_.size() < batch_size) {
+      return;
+    }
+
+    batch.reserve(batch_size);
+    for (size_t i = 0; i < batch_size; ++i) {
+      batch.push_back(std::move(image_queue_.front()));
+      image_queue_.pop_front();
+    }
+  }
+
+  if (!batch.empty()) {
+    try {
+      processBatch(batch);
+    } catch (const std::exception & e) {
+      RCLCPP_ERROR(this->get_logger(), "Exception in batch processing: %s", e.what());
+    }
+  }
+}
+
+void DeepObjectDetectionNode::processBatch(const std::vector & batch)
+{
+  if (!backend_manager_ || !backend_manager_->hasExecutor()) {
+    RCLCPP_ERROR(
+      this->get_logger(),
+      "Cannot process batch: backend not initialized (backend_manager_: %s, hasExecutor: %s)",
+      backend_manager_ ? "exists" : "null",
+      (backend_manager_ && backend_manager_->hasExecutor()) ? "true" : "false");
+    return;
+  }
+
+  auto start_time = std::chrono::steady_clock::now();
+  std::vector<cv::Mat> processed;
+  std::vector<ImageMeta> metas;
+  std::vector<std_msgs::msg::Header> headers;
+  processed.reserve(batch.size());
+  metas.reserve(batch.size());
+  headers.reserve(batch.size());
+
+  for (const auto & item : batch) {
+    ImageMeta meta;
+    cv::Mat preprocessed = preprocessor_->preprocess(item.bgr, meta);
+    if (preprocessed.empty()) {
+      RCLCPP_WARN(this->get_logger(), "Preprocessing returned empty image, skipping");
+      continue;
+    }
+    processed.push_back(std::move(preprocessed));
+    metas.push_back(meta);
+    headers.push_back(item.header);
+  }
+
+  if (processed.empty()) {
+    RCLCPP_WARN(this->get_logger(), "No valid images after preprocessing, skipping batch");
+    return;
+  }
+
+  const auto & packed_input = preprocessor_->pack(processed);
+  if (packed_input.data.empty()) {
+    RCLCPP_ERROR(this->get_logger(), "Packed input is empty after preprocessing %zu images", processed.size());
+    return;
+  }
+
+  std::vector<std::vector<SimpleDetection>> batch_detections;
+  if (params_.postprocessing.use_multi_output) {
+    auto output_tensors = backend_manager_->inferAllOutputs(packed_input);
+    batch_detections = postprocessor_->decodeMultiOutput(output_tensors, metas);
+  } else {
+    auto output_tensor = backend_manager_->infer(packed_input);
+    batch_detections = postprocessor_->decode(output_tensor, metas);
+  }
+
+  auto end_time = std::chrono::steady_clock::now();
+  auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count();
+
+  size_t dropped_count = 0;
+  {
+    std::lock_guard<std::mutex> lock(queue_mutex_);
+    dropped_count = dropped_images_count_;
+  }
+
+  RCLCPP_INFO_THROTTLE(
+    this->get_logger(),
+    *this->get_clock(),
+    2000,
+    "Batch processing completed: %zu images in %" PRId64 " ms, total detections: %zu, dropped images: %zu",
+    batch.size(),
+    static_cast<int64_t>(elapsed_ms),
+    std::accumulate(
+      batch_detections.begin(),
+      batch_detections.end(),
+      size_t(0),
+      [](size_t sum, const auto & dets) { return sum + dets.size(); }),
+    dropped_count);
+
+  publishDetections(batch_detections, headers, metas);
+}
+
+void DeepObjectDetectionNode::publishDetections(
+  const std::vector<std::vector<SimpleDetection>> & batch_detections,
+  const std::vector<std_msgs::msg::Header> & headers,
+  const std::vector<ImageMeta> & metas)
+{
+  if (this->get_current_state().id() != lifecycle_msgs::msg::State::PRIMARY_STATE_ACTIVE) {
+    return;
+  }
+
+  if (!detection_pub_) {
+    RCLCPP_WARN_THROTTLE(
+      this->get_logger(), *this->get_clock(), 1000, "Cannot publish detections: publisher not initialized");
+    return;
+  }
+
+  size_t total_published = 0;
+  for (size_t i = 0; i < batch_detections.size() && i < headers.size() && i < metas.size(); ++i) {
+    Detection2DArrayMsg msg;
+    postprocessor_->fillDetectionMessage(headers[i], batch_detections[i], metas[i], msg);
+    detection_pub_->publish(msg);
+    total_published += batch_detections[i].size();
+  }
+  RCLCPP_INFO_THROTTLE(
+    this->get_logger(),
+    *this->get_clock(),
+    2000,
+    "Published %zu detection messages (%zu total detections)",
+    batch_detections.size(),
+    total_published);
+}
+
+void DeepObjectDetectionNode::loadClassNames()
+{
+  const auto & class_names_path = params_.model_metadata.class_names_file;
+  if (class_names_path.empty()) {
+    return;
+  }
+
+  params_.class_names.clear();
+
+  std::ifstream file(class_names_path);
+  if (!file.is_open()) {
+    throw std::runtime_error("Failed to open class_names_path: " + class_names_path);
+  }
+
+  std::string line;
+  while (std::getline(file, line)) {
+    if (!line.empty()) {
+      params_.class_names.push_back(line);
+    }
+  }
+
+  RCLCPP_INFO(
+    this->get_logger(), "Loaded %zu class names from %s", params_.class_names.size(), class_names_path.c_str());
+}
+
+std::shared_ptr<DeepObjectDetectionNode> createDeepObjectDetectionNode(const rclcpp::NodeOptions & options)
+{
+  return std::make_shared<DeepObjectDetectionNode>(options);
+}
+
+}  // namespace deep_object_detection
+
+RCLCPP_COMPONENTS_REGISTER_NODE(deep_object_detection::DeepObjectDetectionNode)
diff --git a/deep_object_detection/src/generic_postprocessor.cpp b/deep_object_detection/src/generic_postprocessor.cpp
new file mode 100644
index 0000000..54e3b04
--- /dev/null
+++ b/deep_object_detection/src/generic_postprocessor.cpp
@@ -0,0 +1,619 @@
+// Copyright (c) 2025-present WATonomous. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
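+
+// Reference for the score activations implemented below (applyActivation
+// and the softmax branch of decode):
+//   sigmoid(x)   = 1 / (1 + exp(-x))
+//   softmax(x_i) = exp(x_i - max_j x_j) / sum_k exp(x_k - max_j x_j)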
+ +#include "deep_object_detection/generic_postprocessor.hpp" + +#include +#include +#include +#include +#include +#include + +namespace deep_object_detection +{ + +GenericPostprocessor::GenericPostprocessor( + const PostprocessingConfig & config, + const OutputLayout & layout, + BboxFormat bbox_format, + int num_classes, + const std::vector & class_names, + bool use_letterbox) +: config_(config) +, layout_(layout) +, bbox_format_(bbox_format) +, num_classes_(num_classes) +, class_names_(class_names) +, use_letterbox_(use_letterbox) +{} + +GenericPostprocessor::OutputLayout GenericPostprocessor::detectLayout(const std::vector & output_shape) +{ + OutputLayout layout; + layout.shape = output_shape; + layout.auto_detect = true; + + if (output_shape.size() < 2) { + throw std::runtime_error("Output tensor must have at least 2 dimensions"); + } + + layout.batch_dim = 0; + + if (output_shape.size() == 2) { + layout.detection_dim = 0; + layout.feature_dim = 1; + layout.bbox_start_idx = 0; + layout.bbox_count = 4; + layout.score_idx = 4; + layout.class_idx = 5; + } else if (output_shape.size() == 3) { + size_t dim1 = output_shape[1]; + size_t dim2 = output_shape[2]; + + if (dim1 <= 8 && dim2 > 8) { + layout.detection_dim = 2; + layout.feature_dim = 1; + } else if (dim1 > 8 && dim2 <= 8) { + layout.detection_dim = 1; + layout.feature_dim = 2; + } else { + layout.detection_dim = 1; + layout.feature_dim = 2; + } + + layout.bbox_start_idx = 0; + layout.bbox_count = 4; + layout.score_idx = 4; + layout.class_idx = 5; + } else { + layout.detection_dim = 1; + layout.feature_dim = 2; + layout.bbox_start_idx = 0; + layout.bbox_count = 4; + layout.score_idx = 4; + layout.class_idx = 5; + } + + return layout; +} + +GenericPostprocessor::OutputLayout GenericPostprocessor::autoConfigure( + const std::vector & output_shape, const OutputLayoutConfig & layout_config) +{ + OutputLayout layout; + + if (!layout_config.auto_detect) { + layout.auto_detect = false; + layout.batch_dim = static_cast(layout_config.batch_dim); + layout.detection_dim = static_cast(layout_config.detection_dim); + layout.feature_dim = static_cast(layout_config.feature_dim); + layout.bbox_start_idx = static_cast(layout_config.bbox_start_idx); + layout.bbox_count = static_cast(layout_config.bbox_count); + layout.score_idx = static_cast(layout_config.score_idx); + layout.class_idx = (layout_config.class_idx >= 0) ? 
static_cast(layout_config.class_idx) : SIZE_MAX; + if (!output_shape.empty()) { + layout.shape = output_shape; + } + } else if (!output_shape.empty()) { + layout = detectLayout(output_shape); + } else { + layout.auto_detect = true; + } + + return layout; +} + +float GenericPostprocessor::applyActivation(float raw_score) const +{ + switch (config_.score_activation) { + case ScoreActivation::SIGMOID: + return 1.0f / (1.0f + std::exp(-raw_score)); + case ScoreActivation::SOFTMAX: + return raw_score; + case ScoreActivation::NONE: + default: + return raw_score; + } +} + +float GenericPostprocessor::extractValue( + const float * data, + size_t batch_idx, + size_t detection_idx, + size_t feature_idx, + const std::vector & shape) const +{ + if (shape.size() == 2) { + return data[batch_idx * shape[1] + feature_idx]; + } else if (shape.size() == 3) { + if (layout_.detection_dim == 1 && layout_.feature_dim == 2) { + return data[(batch_idx * shape[1] + detection_idx) * shape[2] + feature_idx]; + } else if (layout_.detection_dim == 2 && layout_.feature_dim == 1) { + return data[(batch_idx * shape[1] + feature_idx) * shape[2] + detection_idx]; + } + } + + size_t index = 0; + size_t stride = 1; + for (int i = static_cast(shape.size()) - 1; i >= 0; --i) { + size_t dim_idx = 0; + if (i == static_cast(layout_.batch_dim)) { + dim_idx = batch_idx; + } else if (i == static_cast(layout_.detection_dim)) { + dim_idx = detection_idx; + } else if (i == static_cast(layout_.feature_dim)) { + dim_idx = feature_idx; + } + index += dim_idx * stride; + stride *= shape[i]; + } + + return data[index]; +} + +void GenericPostprocessor::convertBbox( + const float * bbox_data, + size_t batch_idx, + size_t detection_idx, + const std::vector & shape, + SimpleDetection & det) const +{ + float coords[4]; + for (size_t i = 0; i < 4; ++i) { + coords[i] = extractValue(bbox_data, batch_idx, detection_idx, layout_.bbox_start_idx + i, shape); + } + + switch (bbox_format_) { + case BboxFormat::XYXY: { + det.x = (coords[0] + coords[2]) * 0.5f; + det.y = (coords[1] + coords[3]) * 0.5f; + det.width = coords[2] - coords[0]; + det.height = coords[3] - coords[1]; + break; + } + case BboxFormat::XYWH: { + det.x = coords[0] + coords[2] * 0.5f; + det.y = coords[1] + coords[3] * 0.5f; + det.width = coords[2]; + det.height = coords[3]; + break; + } + case BboxFormat::CXCYWH: + default: { + det.x = coords[0]; + det.y = coords[1]; + det.width = coords[2]; + det.height = coords[3]; + break; + } + } +} + +std::vector> GenericPostprocessor::decode( + const deep_ros::Tensor & output, const std::vector & metas) const +{ + std::vector> batch_detections; + + const auto & shape = output.shape(); + if (shape.empty()) { + throw std::runtime_error("Output tensor has empty shape"); + } + + const float * data = output.data_as(); + if (!data) { + throw std::runtime_error("Output tensor has null data"); + } + + OutputLayout layout = layout_; + if (layout.auto_detect) { + layout = detectLayout(shape); + } + + size_t batch_size = shape[layout.batch_dim]; + size_t num_detections = shape.size() > layout.detection_dim ? shape[layout.detection_dim] : 1; + size_t num_features = shape.size() > layout.feature_dim ? 
shape[layout.feature_dim] : shape.back(); + + batch_detections.reserve(std::min(batch_size, metas.size())); + + for (size_t b = 0; b < batch_size && b < metas.size(); ++b) { + std::vector dets; + dets.reserve(num_detections); + + for (size_t d = 0; d < num_detections; ++d) { + float score = 0.0f; + int32_t class_id = -1; + + size_t class_start_idx = + (config_.class_score_start_idx >= 0) ? static_cast(config_.class_score_start_idx) : layout.bbox_count; + size_t class_count = (config_.class_score_count > 0) ? static_cast(config_.class_score_count) + : static_cast(num_classes_); + + if (config_.class_score_mode == ClassScoreMode::ALL_CLASSES && num_features > class_start_idx) { + size_t class_end = std::min(num_features, class_start_idx + class_count); + std::vector class_logits; + class_logits.reserve(class_end - class_start_idx); + + for (size_t c = class_start_idx; c < class_end; ++c) { + float cls_logit = extractValue(data, b, d, c, shape); + class_logits.push_back(cls_logit); + } + + if (config_.score_activation == ScoreActivation::SOFTMAX) { + float max_logit = *std::max_element(class_logits.begin(), class_logits.end()); + float sum_exp = 0.0f; + for (float & logit : class_logits) { + logit = std::exp(logit - max_logit); + sum_exp += logit; + } + + float max_score = -std::numeric_limits::max(); + size_t best_class = 0; + for (size_t i = 0; i < class_logits.size(); ++i) { + float prob = class_logits[i] / sum_exp; + if (prob > max_score) { + max_score = prob; + best_class = i; + } + } + score = max_score; + class_id = static_cast(best_class); + } else { + float max_score = -std::numeric_limits::max(); + size_t best_class = 0; + for (size_t i = 0; i < class_logits.size(); ++i) { + float cls_score = applyActivation(class_logits[i]); + if (cls_score > max_score) { + max_score = cls_score; + best_class = i; + } + } + score = max_score; + class_id = static_cast(best_class); + } + } else if (config_.class_score_mode == ClassScoreMode::SINGLE_CONFIDENCE) { + if (layout.score_idx < num_features) { + float raw_score = extractValue(data, b, d, layout.score_idx, shape); + score = applyActivation(raw_score); + } + if (layout.class_idx < num_features && layout.class_idx != SIZE_MAX) { + class_id = static_cast(std::round(extractValue(data, b, d, layout.class_idx, shape))); + } + } else if (layout.score_idx < num_features) { + float raw_score = extractValue(data, b, d, layout.score_idx, shape); + score = applyActivation(raw_score); + if (layout.class_idx < num_features && layout.class_idx != SIZE_MAX) { + class_id = static_cast(std::round(extractValue(data, b, d, layout.class_idx, shape))); + } + } + + if (score < config_.score_threshold) { + continue; + } + + SimpleDetection det; + convertBbox(data, b, d, shape, det); + det.score = score; + det.class_id = class_id; + + if (config_.coordinate_space == CoordinateSpace::PREPROCESSED) { + adjustToOriginal(det, metas[b], use_letterbox_); + } + dets.push_back(det); + } + + if (config_.enable_nms) { + batch_detections.push_back(applyNms(dets, config_.nms_iou_threshold)); + } else { + batch_detections.push_back(dets); + } + } + + return batch_detections; +} + +std::vector> GenericPostprocessor::decodeMultiOutput( + const std::vector & outputs, const std::vector & metas) const +{ + std::vector> batch_detections; + + if (outputs.empty()) { + throw std::runtime_error("No output tensors provided"); + } + + const int boxes_idx = config_.output_boxes_idx; + const int scores_idx = config_.output_scores_idx; + const int classes_idx = config_.output_classes_idx; + + 
+  if (boxes_idx < 0 || boxes_idx >= static_cast<int>(outputs.size())) {
+    throw std::runtime_error("Invalid output_boxes_idx: " + std::to_string(boxes_idx));
+  }
+  if (scores_idx < 0 || scores_idx >= static_cast<int>(outputs.size())) {
+    throw std::runtime_error("Invalid output_scores_idx: " + std::to_string(scores_idx));
+  }
+
+  const auto & boxes_tensor = outputs[boxes_idx];
+  const auto & scores_tensor = outputs[scores_idx];
+
+  const auto & boxes_shape = boxes_tensor.shape();
+  const auto & scores_shape = scores_tensor.shape();
+
+  if (boxes_shape.empty() || scores_shape.empty()) {
+    throw std::runtime_error("Output tensors have empty shapes");
+  }
+
+  const float * boxes_data = boxes_tensor.data_as<float>();
+  const float * scores_data = scores_tensor.data_as<float>();
+
+  if (!boxes_data || !scores_data) {
+    throw std::runtime_error("Output tensors have null data");
+  }
+
+  size_t batch_size = boxes_shape[0];
+  size_t num_detections = boxes_shape.size() > 1 ? boxes_shape[1] : 1;
+  size_t bbox_dims = boxes_shape.size() > 2 ? boxes_shape[2] : boxes_shape.back();
+
+  if (bbox_dims < 4) {
+    throw std::runtime_error("Boxes tensor must have at least 4 values per detection");
+  }
+
+  batch_detections.reserve(std::min(batch_size, metas.size()));
+
+  for (size_t b = 0; b < batch_size && b < metas.size(); ++b) {
+    std::vector<SimpleDetection> dets;
+    dets.reserve(num_detections);
+
+    for (size_t d = 0; d < num_detections; ++d) {
+      size_t box_offset = (b * num_detections + d) * bbox_dims;
+      float x = boxes_data[box_offset];
+      float y = boxes_data[box_offset + 1];
+      float w = boxes_data[box_offset + 2];
+      float h = boxes_data[box_offset + 3];
+
+      float score = 0.0f;
+      int32_t class_id = -1;
+
+      size_t scores_dim1 = scores_shape.size() > 1 ? scores_shape[1] : 1;
+      size_t scores_dim2 = scores_shape.size() > 2 ? scores_shape[2] : 1;
+
+      if (scores_dim1 == static_cast<size_t>(num_classes_) || scores_dim2 == static_cast<size_t>(num_classes_)) {
+        bool detections_first = (scores_dim1 == num_detections);
+
+        if (detections_first) {
+          // [batch, num_detections, num_classes]
+          size_t score_base = (b * num_detections + d) * num_classes_;
+          float max_score = -std::numeric_limits<float>::max();
+          size_t best_class = 0;
+
+          for (size_t c = 0; c < static_cast<size_t>(num_classes_); ++c) {
+            float raw_score = scores_data[score_base + c];
+            float activated_score = applyActivation(raw_score);
+            if (activated_score > max_score) {
+              max_score = activated_score;
+              best_class = c;
+            }
+          }
+          score = max_score;
+          class_id = static_cast<int32_t>(best_class);
+        } else {
+          // [batch, num_classes, num_detections]
+          float max_score = -std::numeric_limits<float>::max();
+          size_t best_class = 0;
+
+          for (size_t c = 0; c < static_cast<size_t>(num_classes_); ++c) {
+            float raw_score = scores_data[b * num_classes_ * num_detections + c * num_detections + d];
+            float activated_score = applyActivation(raw_score);
+            if (activated_score > max_score) {
+              max_score = activated_score;
+              best_class = c;
+            }
+          }
+          score = max_score;
+          class_id = static_cast<int32_t>(best_class);
+        }
+      } else {
+        size_t score_offset = b * num_detections + d;
+        float raw_score = scores_data[score_offset];
+        score = applyActivation(raw_score);
+
+        if (classes_idx >= 0 && classes_idx < static_cast<int>(outputs.size())) {
+          const auto & classes_tensor = outputs[classes_idx];
+          const auto & classes_shape = classes_tensor.shape();
+          const float * classes_data = classes_tensor.data_as<float>();
+          if (classes_data && classes_shape.size() >= 2) {
+            size_t class_offset = b * classes_shape[1] + d;
+            class_id = static_cast<int32_t>(std::round(classes_data[class_offset]));
+          }
+        }
+      }
+
+      if (score < config_.score_threshold) {
+        continue;
+      }
+
+      SimpleDetection det;
+      if (bbox_format_ == BboxFormat::CXCYWH) {
+        det.x = x;
+        det.y = y;
+        det.width = w;
+        det.height = h;
+      } else if (bbox_format_ == BboxFormat::XYXY) {
+        // Here (x, y, w, h) actually hold (x1, y1, x2, y2); convert to the
+        // center/size convention that adjustToOriginal() expects.
+        det.x = (x + w) * 0.5f;
+        det.y = (y + h) * 0.5f;
+        det.width = w - x;
+        det.height = h - y;
+      } else if (bbox_format_ == BboxFormat::XYWH) {
+        // Top-left corner plus size; convert to center/size as in convertBbox().
+        det.x = x + w * 0.5f;
+        det.y = y + h * 0.5f;
+        det.width = w;
+        det.height = h;
+      }
+
+      det.score = score;
+      det.class_id = class_id;
+
+      if (config_.coordinate_space == CoordinateSpace::PREPROCESSED) {
+        adjustToOriginal(det, metas[b], use_letterbox_);
+      }
+      dets.push_back(det);
+    }
+
+    if (config_.enable_nms) {
+      batch_detections.push_back(applyNms(dets, config_.nms_iou_threshold));
+    } else {
+      batch_detections.push_back(dets);
+    }
+  }
+
+  return batch_detections;
+}
+
+void GenericPostprocessor::adjustToOriginal(SimpleDetection & det, const ImageMeta & meta, bool use_letterbox) const
+{
+  float cx = det.x;
+  float cy = det.y;
+  float w = det.width;
+  float h = det.height;
+
+  cx -= meta.pad_x;
+  cy -= meta.pad_y;
+
+  if (use_letterbox) {
+    // Letterbox stores the original->preprocessed scale factor, so invert it here.
+    const float inv_scale = meta.scale_x > 0.0f ? 1.0f / meta.scale_x : 1.0f;
+    cx *= inv_scale;
+    cy *= inv_scale;
+    w *= inv_scale;
+    h *= inv_scale;
+  } else {
+    // Plain resize stores the preprocessed->original ratio directly.
+    cx *= meta.scale_x;
+    cy *= meta.scale_y;
+    w *= meta.scale_x;
+    h *= meta.scale_y;
+  }
+
+  det.x = std::max(0.0f, cx - w * 0.5f);
+  det.y = std::max(0.0f, cy - h * 0.5f);
+  det.width = w;
+  det.height = h;
+
+  det.x = std::min(det.x, static_cast<float>(meta.original_width));
+  det.y = std::min(det.y, static_cast<float>(meta.original_height));
+  det.width = std::min(det.width, static_cast<float>(meta.original_width) - det.x);
+  det.height = std::min(det.height, static_cast<float>(meta.original_height) - det.y);
+}
+
+float GenericPostprocessor::iou(const SimpleDetection & a, const SimpleDetection & b)
+{
+  const float x1 = std::max(a.x, b.x);
+  const float y1 = std::max(a.y, b.y);
+  const float x2 = std::min(a.x + a.width, b.x + b.width);
+  const float y2 = std::min(a.y + a.height, b.y + b.height);
+
+  const float inter_w = std::max(0.0f, x2 - x1);
+  const float inter_h = std::max(0.0f, y2 - y1);
+  const float inter_area = inter_w * inter_h;
+  const float area_a = a.width * a.height;
+  const float area_b = b.width * b.height;
+  const float union_area = area_a + area_b - inter_area;
+
+  if (union_area <= 0.0f) {
+    return 0.0f;
+  }
+  return inter_area / union_area;
+}
+
+std::vector<SimpleDetection> GenericPostprocessor::applyNms(
+  std::vector<SimpleDetection> dets, float iou_threshold) const
+{
+  std::vector<SimpleDetection> result;
+  if (dets.empty()) {
+    return result;
+  }
+
+  std::stable_sort(
+    dets.begin(), dets.end(), [](const SimpleDetection & a, const SimpleDetection & b) { return a.score > b.score; });
+
+  std::vector<bool> suppressed(dets.size(), false);
+  for (size_t i = 0; i < dets.size(); ++i) {
+    if (suppressed[i]) {
+      continue;
+    }
+    result.push_back(dets[i]);
+    for (size_t j = i + 1; j < dets.size(); ++j) {
+      if (suppressed[j]) {
+        continue;
+      }
+      if (dets[i].class_id != dets[j].class_id) {
+        continue;
+      }
+      if (iou(dets[i], dets[j]) > iou_threshold) {
+        suppressed[j] = true;
+      }
+    }
+  }
+
+  return result;
+}
+
+std::string GenericPostprocessor::classLabel(int class_id, const std::vector<std::string> & class_names) const
+{
+  if (class_id >= 0 && static_cast<size_t>(class_id) < class_names.size()) {
+    return class_names[static_cast<size_t>(class_id)];
+  }
+  return std::to_string(class_id);
+}
+
+void GenericPostprocessor::fillDetectionMessage(
+  const std_msgs::msg::Header & header,
+  const std::vector<SimpleDetection> & detections,
+  const ImageMeta & meta,
+  Detection2DArrayMsg & out_msg) const
+{
+  out_msg.header = header;
+
+#if __has_include(<deep_msgs/msg/detection2_d_array.hpp>)
+  (void)meta;
+  out_msg.detections.clear();
+  out_msg.detections.reserve(detections.size());
+  for (const auto & det : detections) {
+    Detection2DMsg d;
+    d.x = det.x;
+    d.y = det.y;
+    d.width = det.width;
+    d.height = det.height;
+    d.score = det.score;
+    d.class_id = det.class_id;
+    d.label = classLabel(det.class_id, class_names_);
+    out_msg.detections.push_back(d);
+  }
+#else
+  (void)meta;
+  out_msg.detections.clear();
+  out_msg.detections.reserve(detections.size());
+  for (const auto & det : detections) {
+    Detection2DMsg d;
+    d.header = header;
+    d.bbox.center.position.x = det.x + det.width * 0.5;
+    d.bbox.center.position.y = det.y + det.height * 0.5;
+    d.bbox.size_x = det.width;
+    d.bbox.size_y = det.height;
+
+    vision_msgs::msg::ObjectHypothesisWithPose hyp;
+    hyp.hypothesis.class_id = classLabel(det.class_id, class_names_);
+    hyp.hypothesis.score = det.score;
+    d.results.push_back(hyp);
+    out_msg.detections.push_back(d);
+  }
+#endif
+}
+
+}  // namespace deep_object_detection
diff --git a/deep_object_detection/src/image_preprocessor.cpp b/deep_object_detection/src/image_preprocessor.cpp
new file mode 100644
index 0000000..3a963d5
--- /dev/null
+++ b/deep_object_detection/src/image_preprocessor.cpp
@@ -0,0 +1,282 @@
+// Copyright (c) 2025-present WATonomous. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "deep_object_detection/image_preprocessor.hpp"
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <cstring>
+#include <stdexcept>
+#include <string>
+
+#include <opencv2/imgproc.hpp>
+
+namespace deep_object_detection
+{
+
+ImagePreprocessor::ImagePreprocessor(const PreprocessingConfig & config)
+: config_(config)
+{}
+
+cv::Mat ImagePreprocessor::preprocess(const cv::Mat & bgr, ImageMeta & meta) const
+{
+  if (bgr.empty()) {
+    throw std::runtime_error("Input image is empty");
+  }
+
+  meta.original_width = bgr.cols;
+  meta.original_height = bgr.rows;
+
+  cv::Mat resized;
+
+  switch (config_.resize_method) {
+    case ResizeMethod::LETTERBOX:
+      resized = applyLetterbox(bgr, meta);
+      break;
+    case ResizeMethod::CROP:
+      resized = applyCrop(bgr, meta);
+      break;
+    case ResizeMethod::PAD:
+      resized = applyPad(bgr, meta);
+      break;
+    case ResizeMethod::RESIZE:
+    default:
+      resized = applyResize(bgr, meta);
+      break;
+  }
+
+  cv::Mat float_image;
+  resized.convertTo(float_image, CV_32F, 1.0 / 255.0);
+  applyNormalization(float_image);
+  applyColorConversion(float_image);
+
+  if (float_image.empty()) {
+    throw std::runtime_error("Preprocessing failed: result image is empty");
+  }
+  if (float_image.rows != config_.input_height || float_image.cols != config_.input_width) {
+    throw std::runtime_error(
+      "Preprocessing failed: result image size (" + std::to_string(float_image.cols) + "x" +
+      std::to_string(float_image.rows) + ") doesn't match expected size (" + std::to_string(config_.input_width) +
+      "x" + std::to_string(config_.input_height) + ")");
+  }
+  if (float_image.channels() != static_cast<int>(RGB_CHANNELS)) {
+    throw std::runtime_error(
+      "Preprocessing failed: result image has " + std::to_string(float_image.channels()) + " channels, expected " +
+      std::to_string(RGB_CHANNELS));
+  }
+
+  return float_image;
+}
+
+cv::Mat ImagePreprocessor::applyLetterbox(const cv::Mat & bgr, ImageMeta & meta) const
+{
+  const float scale = std::min(
+    static_cast<float>(config_.input_width) / static_cast<float>(bgr.cols),
+    static_cast<float>(config_.input_height) / static_cast<float>(bgr.rows));
+  const int new_w = std::max(1, static_cast<int>(std::round(bgr.cols * scale)));
+  const int new_h = std::max(1, static_cast<int>(std::round(bgr.rows * scale)));
+
+  cv::Mat resized;
+  cv::resize(bgr, resized, cv::Size(new_w, new_h));
+
+  const int pad_w = config_.input_width - new_w;
+  const int pad_h = config_.input_height - new_h;
+  const int pad_left = pad_w / 2;
+  const int pad_right = pad_w - pad_left;
+  const int pad_top = pad_h / 2;
+  const int pad_bottom = pad_h - pad_top;
+
+  cv::copyMakeBorder(
+    resized,
+    resized,
+    pad_top,
+    pad_bottom,
+    pad_left,
+    pad_right,
+    cv::BORDER_CONSTANT,
+    cv::Scalar(config_.pad_value, config_.pad_value, config_.pad_value));
+
+  meta.scale_x = meta.scale_y = scale;
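+  // Scale and pad offsets are recorded so adjustToOriginal() can undo the letterbox.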
+  meta.pad_x = static_cast<float>(pad_left);
+  meta.pad_y = static_cast<float>(pad_top);
+
+  return resized;
+}
+
+cv::Mat ImagePreprocessor::applyResize(const cv::Mat & bgr, ImageMeta & meta) const
+{
+  cv::Mat resized;
+  cv::resize(bgr, resized, cv::Size(config_.input_width, config_.input_height));
+  meta.scale_x = static_cast<float>(meta.original_width) / static_cast<float>(config_.input_width);
+  meta.scale_y = static_cast<float>(meta.original_height) / static_cast<float>(config_.input_height);
+  meta.pad_x = meta.pad_y = 0.0f;
+  return resized;
+}
+
+cv::Mat ImagePreprocessor::applyCrop(const cv::Mat & bgr, ImageMeta & meta) const
+{
+  const int crop_x = (bgr.cols - config_.input_width) / 2;
+  const int crop_y = (bgr.rows - config_.input_height) / 2;
+
+  cv::Mat cropped;
+  if (bgr.cols >= config_.input_width && bgr.rows >= config_.input_height) {
+    cropped = bgr(cv::Rect(crop_x, crop_y, config_.input_width, config_.input_height)).clone();
+  } else {
+    float scale =
+      std::max(static_cast<float>(config_.input_width) / bgr.cols, static_cast<float>(config_.input_height) / bgr.rows);
+    cv::Mat scaled;
+    cv::resize(bgr, scaled, cv::Size(), scale, scale);
+    const int new_crop_x = (scaled.cols - config_.input_width) / 2;
+    const int new_crop_y = (scaled.rows - config_.input_height) / 2;
+    cropped = scaled(cv::Rect(new_crop_x, new_crop_y, config_.input_width, config_.input_height)).clone();
+  }
+
+  meta.scale_x = static_cast<float>(meta.original_width) / static_cast<float>(config_.input_width);
+  meta.scale_y = static_cast<float>(meta.original_height) / static_cast<float>(config_.input_height);
+  meta.pad_x = static_cast<float>(-crop_x);
+  meta.pad_y = static_cast<float>(-crop_y);
+
+  return cropped;
+}
+
+cv::Mat ImagePreprocessor::applyPad(const cv::Mat & bgr, ImageMeta & meta) const
+{
+  const int pad_w = config_.input_width - bgr.cols;
+  const int pad_h = config_.input_height - bgr.rows;
+
+  cv::Mat padded;
+  if (pad_w >= 0 && pad_h >= 0) {
+    const int pad_left = pad_w / 2;
+    const int pad_right = pad_w - pad_left;
+    const int pad_top = pad_h / 2;
+    const int pad_bottom = pad_h - pad_top;
+
+    cv::copyMakeBorder(
+      bgr,
+      padded,
+      pad_top,
+      pad_bottom,
+      pad_left,
+      pad_right,
+      cv::BORDER_CONSTANT,
+      cv::Scalar(config_.pad_value, config_.pad_value, config_.pad_value));
+
+    meta.pad_x = static_cast<float>(pad_left);
+    meta.pad_y = static_cast<float>(pad_top);
+  } else {
+    return applyResize(bgr, meta);
+  }
+
+  meta.scale_x = meta.scale_y = 1.0f;
+  return padded;
+}
+
+void ImagePreprocessor::applyNormalization(cv::Mat & image) const
+{
+  switch (config_.normalization_type) {
+    case NormalizationType::IMAGENET: {
+      static const std::array<float, 3> imagenet_mean = {0.485f, 0.456f, 0.406f};
+      static const std::array<float, 3> imagenet_std = {0.229f, 0.224f, 0.225f};
+
+      // Channels are still in BGR order here, so the RGB constants are indexed in reverse.
+      std::vector<cv::Mat> channels;
+      cv::split(image, channels);
+      channels[0] = (channels[0] - imagenet_mean[2]) / imagenet_std[2];
+      channels[1] = (channels[1] - imagenet_mean[1]) / imagenet_std[1];
+      channels[2] = (channels[2] - imagenet_mean[0]) / imagenet_std[0];
+
+      cv::merge(channels, image);
+      break;
+    }
+    case NormalizationType::CUSTOM: {
+      if (config_.mean.size() >= RGB_CHANNELS && config_.std.size() >= RGB_CHANNELS) {
+        std::vector<cv::Mat> channels;
+        cv::split(image, channels);
+        for (size_t i = 0; i < RGB_CHANNELS; ++i) {
+          channels[i] = (channels[i] - config_.mean[i]) / config_.std[i];
+        }
+
+        cv::merge(channels, image);
+      }
+      break;
+    }
+    case NormalizationType::SCALE_0_1:
+    case NormalizationType::NONE:
+    default:
+      break;
+  }
+}
+
+void ImagePreprocessor::applyColorConversion(cv::Mat & image) const
+{
+  if (config_.color_format == "rgb" || config_.color_format == "RGB") {
+    cv::cvtColor(image, image, cv::COLOR_BGR2RGB);
+  }
+}
+
+const PackedInput & ImagePreprocessor::pack(const std::vector<cv::Mat> & images) const
+{
+  auto & packed = packed_input_cache_;
+  packed.data.clear();
+  packed.shape.clear();
+  if (images.empty()) {
+    return packed;
+  }
+
+  const size_t batch = images.size();
+  const size_t channels = RGB_CHANNELS;
+  const size_t height = images[0].rows;
+  const size_t width = images[0].cols;
+  const size_t image_size = channels * height * width;
+  const size_t required = batch * image_size;
+
+  packed.shape = {batch, channels, height, width};
+  packed.data.resize(required);
+
+  std::array<cv::Mat, RGB_CHANNELS> channel_planes;
+  const size_t plane_elements = height * width;
+  const size_t plane_bytes = plane_elements * sizeof(float);
+  for (auto & plane : channel_planes) {
+    plane.create(static_cast<int>(height), static_cast<int>(width), CV_32F);
+  }
+
+  for (size_t b = 0; b < batch; ++b) {
+    const cv::Mat & img = images[b];
+    if (img.empty()) {
+      throw std::runtime_error("Image at batch index " + std::to_string(b) + " is empty");
+    }
+    if (img.channels() != static_cast<int>(RGB_CHANNELS) || img.type() != CV_32FC3) {
+      throw std::runtime_error(
+        "Preprocessed image at batch index " + std::to_string(b) + " must be CV_32FC3 (expected " +
+        std::to_string(RGB_CHANNELS) + " channels)");
+    }
+    if (img.rows != static_cast<int>(height) || img.cols != static_cast<int>(width)) {
+      throw std::runtime_error(
+        "Preprocessed image at batch index " + std::to_string(b) + " has size " + std::to_string(img.cols) + "x" +
+        std::to_string(img.rows) + ", expected " + std::to_string(width) + "x" + std::to_string(height));
+    }
+
+    cv::split(img, channel_planes.data());
+    float * batch_base = packed.data.data() + b * image_size;
+    for (size_t c = 0; c < channels; ++c) {
+      const float * src_ptr = channel_planes[c].ptr<float>();
+      float * dst_ptr = batch_base + c * plane_elements;
+      std::memcpy(dst_ptr, src_ptr, plane_bytes);
+    }
+  }
+
+  return packed;
+}
+
+}  // namespace deep_object_detection
diff --git a/deep_object_detection/src/main.cpp b/deep_object_detection/src/main.cpp
new file mode 100644
index 0000000..ae96955
--- /dev/null
+++ b/deep_object_detection/src/main.cpp
@@ -0,0 +1,34 @@
+// Copyright (c) 2025-present WATonomous. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
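+
+// Entry point: spins the detection node on a multi-threaded executor so image
+// callbacks and inference can proceed concurrently.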
+
+#include <iostream>
+
+#include <rclcpp/rclcpp.hpp>
+
+#include "deep_object_detection/deep_object_detection_node.hpp"
+
+int main(int argc, char ** argv)
+{
+  rclcpp::init(argc, argv);
+  try {
+    auto node = deep_object_detection::createDeepObjectDetectionNode();
+    rclcpp::executors::MultiThreadedExecutor executor(rclcpp::ExecutorOptions(), 4);
+    executor.add_node(node->get_node_base_interface());
+    executor.spin();
+  } catch (const std::exception & e) {
+    std::cerr << "Failed to start deep_object_detection_node: " << e.what() << std::endl;
+  }
+  rclcpp::shutdown();
+  return 0;
+}
diff --git a/deep_ort_backend_plugin/src/ort_backend_executor.cpp b/deep_ort_backend_plugin/src/ort_backend_executor.cpp
index b8805a6..48fa989 100644
--- a/deep_ort_backend_plugin/src/ort_backend_executor.cpp
+++ b/deep_ort_backend_plugin/src/ort_backend_executor.cpp
@@ -78,9 +78,7 @@ deep_ros::Tensor OrtBackendExecutor::run_inference_impl(deep_ros::Tensor & input
     ONNXTensorElementDataType onnx_type = convert_to_onnx_type(input.dtype());
     std::vector<int64_t> input_shape_int64(input.shape().begin(), input.shape().end());
 
-    // Get our custom allocator for output binding
     auto custom_allocator_shared = get_ort_cpu_allocator();
-    auto * custom_allocator = static_cast<OrtCpuMemoryAllocator *>(custom_allocator_shared.get());
 
     // Create input tensor that wraps existing input memory (zero-copy!)
     size_t input_size_bytes = input.byte_size();
@@ -91,28 +89,22 @@ deep_ros::Tensor OrtBackendExecutor::run_inference_impl(deep_ros::Tensor & input
     Ort::AllocatorWithDefaultOptions allocator;
     auto input_name = session_->GetInputNameAllocated(0, allocator);
     auto output_name = session_->GetOutputNameAllocated(0, allocator);
+    const char * input_names[] = {input_name.get()};
+    const char * output_names[] = {output_name.get()};
 
-    // Create IO binding for zero-copy inference
-    Ort::IoBinding binding(*session_);
-    binding.BindInput(input_name.get(), ort_input);
+    auto output_tensors = session_->Run(Ort::RunOptions{nullptr}, input_names, &ort_input, 1, output_names, 1);
 
-    // Bind output to use our custom allocator - ONNX Runtime will allocate using our allocator
-    binding.BindOutput(output_name.get(), custom_allocator->get_ort_memory_info());
-
-    // Run inference with IO binding (zero-copy for both input and output!)
-    Ort::RunOptions run_options;
-    session_->Run(run_options, binding);
-
-    // Get output values allocated by ONNX Runtime using our custom allocator
-    Ort::AllocatorWithDefaultOptions default_allocator;
-    std::vector<Ort::Value> output_tensors = binding.GetOutputValues(default_allocator);
+    if (output_tensors.empty()) {
+      throw std::runtime_error("ONNX Runtime inference returned no outputs");
+    }
 
-    // Get output shape and create our tensor wrapping the ONNX-allocated memory
-    auto output_shape = get_output_shape(input.shape());
-    void * output_data = output_tensors[0].GetTensorMutableData<void>();
+    auto & output_tensor = output_tensors.front();
+    auto output_info = output_tensor.GetTensorTypeAndShapeInfo();
+    auto output_shape_int64 = output_info.GetShape();
+    std::vector<size_t> output_shape(output_shape_int64.begin(), output_shape_int64.end());
 
-    // Create deep_ros tensor that wraps the ONNX-allocated memory (zero-copy!)
-    deep_ros::Tensor output(output_data, output_shape, input.dtype());
+    deep_ros::Tensor output(output_shape, input.dtype(), custom_allocator_shared);
+    std::memcpy(output.data(), output_tensor.GetTensorData<void>(), output.byte_size());
 
     return output;
   } catch (const std::exception & e) {
diff --git a/deep_ort_backend_plugin/src/ort_cpu_memory_allocator.cpp b/deep_ort_backend_plugin/src/ort_cpu_memory_allocator.cpp
index d8ca4c2..697df84 100644
--- a/deep_ort_backend_plugin/src/ort_cpu_memory_allocator.cpp
+++ b/deep_ort_backend_plugin/src/ort_cpu_memory_allocator.cpp
@@ -67,7 +67,7 @@ const OrtMemoryInfo * OrtCpuMemoryAllocator::get_ort_memory_info() const
   return ort_memory_info_;
 }
 
-void * ORT_API_CALL OrtCpuMemoryAllocator::ort_alloc(OrtAllocator * this_, size_t size)
+void * ORT_API_CALL OrtCpuMemoryAllocator::ort_alloc(OrtAllocator * /*this_*/, size_t size)
 {
   if (instance_) {
     return instance_->allocate(size);
@@ -75,14 +75,14 @@ void * ORT_API_CALL OrtCpuMemoryAllocator::ort_alloc(OrtAllocator * this_, size_
   return nullptr;
 }
 
-void ORT_API_CALL OrtCpuMemoryAllocator::ort_free(OrtAllocator * this_, void * p)
+void ORT_API_CALL OrtCpuMemoryAllocator::ort_free(OrtAllocator * /*this_*/, void * p)
 {
   if (instance_) {
     instance_->deallocate(p);
   }
 }
 
-const OrtMemoryInfo * ORT_API_CALL OrtCpuMemoryAllocator::ort_info(const OrtAllocator * this_)
+const OrtMemoryInfo * ORT_API_CALL OrtCpuMemoryAllocator::ort_info(const OrtAllocator * /*this_*/)
 {
   if (instance_) {
     return instance_->get_ort_memory_info();
@@ -90,7 +90,7 @@ const OrtMemoryInfo * ORT_API_CALL OrtCpuMemoryAllocator::ort_info(const OrtAllo
   return nullptr;
 }
 
-void * ORT_API_CALL OrtCpuMemoryAllocator::ort_reserve(OrtAllocator * this_, size_t size)
+void * ORT_API_CALL OrtCpuMemoryAllocator::ort_reserve(OrtAllocator * /*this_*/, size_t size)
 {
   if (instance_) {
     return instance_->allocate(size);
diff --git a/deep_ort_gpu_backend_plugin/include/deep_ort_gpu_backend_plugin/ort_gpu_backend_executor.hpp b/deep_ort_gpu_backend_plugin/include/deep_ort_gpu_backend_plugin/ort_gpu_backend_executor.hpp
index 6035f1c..c21785a 100644
--- a/deep_ort_gpu_backend_plugin/include/deep_ort_gpu_backend_plugin/ort_gpu_backend_executor.hpp
+++ b/deep_ort_gpu_backend_plugin/include/deep_ort_gpu_backend_plugin/ort_gpu_backend_executor.hpp
@@ -16,6 +16,7 @@
 #include <onnxruntime_cxx_api.h>
 
+#include <chrono>
 #include <filesystem>
 #include <memory>
 #include <string>
@@ -43,7 +44,12 @@ class OrtGpuBackendExecutor : public deep_ros::BackendInferenceExecutor
    * @param execution_provider GPU execution provider: "cuda" or "tensorrt"
    * @param logger ROS logger for diagnostic messages
    */
-  explicit OrtGpuBackendExecutor(int device_id, const std::string & execution_provider, const rclcpp::Logger & logger);
+  explicit OrtGpuBackendExecutor(
+    int device_id,
+    const std::string & execution_provider,
+    const rclcpp::Logger & logger,
+    bool enable_trt_engine_cache,
+    std::string trt_engine_cache_path);
 
   /**
    * @brief Destructor
@@ -88,6 +94,8 @@ class OrtGpuBackendExecutor : public deep_ros::BackendInferenceExecutor
   int device_id_;
   std::string execution_provider_;
   rclcpp::Logger logger_;
+  bool enable_trt_engine_cache_;
+  std::string trt_engine_cache_path_;
   std::unique_ptr<Ort::Env> env_;
   std::unique_ptr<Ort::Session> session_;
   std::unique_ptr<Ort::SessionOptions> session_options_;
@@ -122,14 +130,19 @@ class OrtGpuBackendExecutor : public deep_ros::BackendInferenceExecutor
    * @return Expected output tensor shape
    * @throws std::runtime_error if model not loaded or shape inference fails
    */
-  std::vector<size_t> get_output_shape(const std::vector<size_t> & input_shape) const;
-
   /**
   * @brief Get element size in bytes for a data type
   * @param dtype Data type
   * @return Size in bytes per element
   */
  size_t get_element_size(deep_ros::DataType dtype) const;
+
+  bool is_tensorrt_provider() const;
+
+  std::string normalized_provider_;
+  bool tensorrt_engine_ready_{false};
+  bool tensorrt_engine_build_in_progress_{false};
+  std::chrono::steady_clock::time_point tensorrt_engine_build_start_;
 };
 
 }  // namespace deep_ort_gpu_backend
diff --git a/deep_ort_gpu_backend_plugin/include/deep_ort_gpu_backend_plugin/ort_gpu_backend_plugin.hpp b/deep_ort_gpu_backend_plugin/include/deep_ort_gpu_backend_plugin/ort_gpu_backend_plugin.hpp
index e456f73..f822e52 100644
--- a/deep_ort_gpu_backend_plugin/include/deep_ort_gpu_backend_plugin/ort_gpu_backend_plugin.hpp
+++ b/deep_ort_gpu_backend_plugin/include/deep_ort_gpu_backend_plugin/ort_gpu_backend_plugin.hpp
@@ -80,6 +80,8 @@ class OrtGpuBackendPlugin : public deep_ros::DeepBackendPlugin
   rclcpp_lifecycle::LifecycleNode::SharedPtr node_;
   int device_id_;
   std::string execution_provider_;
+  bool enable_trt_engine_cache_;
+  std::string trt_engine_cache_path_;
   std::shared_ptr<deep_ros::BackendMemoryAllocator> allocator_;
   std::shared_ptr<OrtGpuBackendExecutor> executor_;
diff --git a/deep_ort_gpu_backend_plugin/src/ort_gpu_backend_executor.cpp b/deep_ort_gpu_backend_plugin/src/ort_gpu_backend_executor.cpp
index 38d5362..22bd521 100644
--- a/deep_ort_gpu_backend_plugin/src/ort_gpu_backend_executor.cpp
+++ b/deep_ort_gpu_backend_plugin/src/ort_gpu_backend_executor.cpp
@@ -26,6 +26,7 @@
 #include <algorithm>
 #include <chrono>
 #include <cstring>
+#include <filesystem>
 #include <memory>
 
 #include "deep_ort_gpu_backend_plugin/ort_gpu_memory_allocator.hpp"
@@ -39,10 +40,16 @@ namespace deep_ort_gpu_backend
 class OrtGpuCpuMemoryAllocator;
 
 OrtGpuBackendExecutor::OrtGpuBackendExecutor(
-  int device_id, const std::string & execution_provider, const rclcpp::Logger & logger)
+  int device_id,
+  const std::string & execution_provider,
+  const rclcpp::Logger & logger,
+  bool enable_trt_engine_cache,
+  std::string trt_engine_cache_path)
 : device_id_(device_id)
 , execution_provider_(execution_provider)
 , logger_(logger)
+, enable_trt_engine_cache_(enable_trt_engine_cache)
+, trt_engine_cache_path_(std::move(trt_engine_cache_path))
 , memory_info_(nullptr)
 {
   // Initialize ORT environment with minimal logging
@@ -80,7 +87,28 @@ bool OrtGpuBackendExecutor::load_model_impl(const std::filesystem::path & model_
     model_path_ = model_path;
 
     // Create session with GPU execution provider
+    // Note: for TensorRT, this is where the engine is built or loaded from cache.
+    // Even with caching, this can take 5-30+ seconds depending on model size and GPU.
+    const bool using_tensorrt = is_tensorrt_provider();
+    if (using_tensorrt) {
+      if (enable_trt_engine_cache_) {
+        RCLCPP_INFO(
+          logger_,
+          "Creating TensorRT session (loading cached engine from %s if available)...",
+          trt_engine_cache_path_.c_str());
+      } else {
+        RCLCPP_INFO(logger_, "Creating TensorRT session (building engine, caching disabled)...");
+      }
+    } else {
+      RCLCPP_INFO(logger_, "Creating CUDA session...");
+    }
+
+    auto start_time = std::chrono::steady_clock::now();
     session_ = std::make_unique<Ort::Session>(*env_, model_path_.c_str(), *session_options_);
+    auto end_time = std::chrono::steady_clock::now();
+    auto duration_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count();
+
+    RCLCPP_INFO(logger_, "Session created in %.2f seconds", duration_ms / 1000.0);
 
     return true;
   } catch (const std::exception & e) {
@@ -95,8 +123,19 @@ deep_ros::Tensor OrtGpuBackendExecutor::run_inference_impl(deep_ros::Tensor & in
   if (!session_) {
     throw std::runtime_error("No ONNX session available");
   }
-
+  // The first inference on TensorRT triggers the engine build; track it for logging.
   try {
+    const bool using_tensorrt = is_tensorrt_provider();
+    if (using_tensorrt && !tensorrt_engine_ready_ && !tensorrt_engine_build_in_progress_) {
+      tensorrt_engine_build_in_progress_ = true;
+      tensorrt_engine_build_start_ = std::chrono::steady_clock::now();
+      RCLCPP_INFO(
+        logger_,
+        "TensorRT engine build started (model=%s, batch_size=%zu)",
+        model_path_.filename().c_str(),
+        input.shape().empty() ? 0U : input.shape().front());
+    }
+
     // Convert deep_ros::DataType to ONNX tensor element type
     ONNXTensorElementDataType onnx_type = convert_to_onnx_type(input.dtype());
     std::vector<int64_t> input_shape_int64(input.shape().begin(), input.shape().end());
@@ -131,15 +170,39 @@ deep_ros::Tensor OrtGpuBackendExecutor::run_inference_impl(deep_ros::Tensor & in
     Ort::AllocatorWithDefaultOptions default_allocator;
     std::vector<Ort::Value> output_tensors = binding.GetOutputValues(default_allocator);
 
-    // Get output shape and data
-    auto output_shape = get_output_shape(input.shape());
-    void * output_data = output_tensors[0].GetTensorMutableData<void>();
+    if (output_tensors.empty()) {
+      throw std::runtime_error("TensorRT inference returned no outputs");
+    }
+
+    auto & output_tensor = output_tensors.front();
+    auto output_info = output_tensor.GetTensorTypeAndShapeInfo();
+    auto output_shape_int64 = output_info.GetShape();
+
+    std::vector<size_t> output_shape;
+    output_shape.reserve(output_shape_int64.size());
+    for (int64_t dim : output_shape_int64) {
+      if (dim <= 0) {
+        throw std::runtime_error("Invalid runtime output dimension: " + std::to_string(dim));
+      }
+      output_shape.push_back(static_cast<size_t>(dim));
+    }
+
+    void * output_data = output_tensor.GetTensorMutableData<void>();
 
     // Create deep_ros tensor wrapping the ONNX-allocated memory
     deep_ros::Tensor output(output_data, output_shape, input.dtype());
 
+    if (tensorrt_engine_build_in_progress_) {
+      const auto elapsed =
+        std::chrono::duration<double>(std::chrono::steady_clock::now() - tensorrt_engine_build_start_);
+      RCLCPP_INFO(logger_, "TensorRT engine build completed in %.2f seconds", elapsed.count());
+      tensorrt_engine_build_in_progress_ = false;
+      tensorrt_engine_ready_ = true;
+    }
+
     return output;
   } catch (const std::exception & e) {
+    tensorrt_engine_build_in_progress_ = false;
    throw std::runtime_error("GPU inference failed: " + std::string(e.what()));
   }
 }
@@ -157,12 +220,12 @@ void OrtGpuBackendExecutor::initialize_session_options()
   session_options_->SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
 
   // Configure execution provider based on string
-  std::string provider_lower = execution_provider_;
-  std::transform(provider_lower.begin(), provider_lower.end(), provider_lower.begin(), ::tolower);
+  normalized_provider_ = execution_provider_;
+  std::transform(normalized_provider_.begin(), normalized_provider_.end(), normalized_provider_.begin(), ::tolower);
 
-  if (provider_lower == "cuda") {
+  if (normalized_provider_ == "cuda") {
     configure_cuda_provider();
-  } else if (provider_lower == "tensorrt") {
+  } else if (normalized_provider_ == "tensorrt") {
     configure_tensorrt_provider();
   } else {
     throw std::runtime_error(
@@ -206,13 +269,28 @@ void OrtGpuBackendExecutor::configure_tensorrt_provider()
   tensorrt_options.trt_max_workspace_size = 67108864;  // 64MB
   tensorrt_options.trt_max_partition_iterations = 1;
   tensorrt_options.trt_min_subgraph_size = 1;
-  tensorrt_options.trt_engine_cache_enable = 0;
+  tensorrt_options.trt_engine_cache_enable = enable_trt_engine_cache_ ? 1 : 0;
   tensorrt_options.trt_force_sequential_engine_build = 1;
   tensorrt_options.trt_fp16_enable = 0;
   tensorrt_options.trt_int8_enable = 0;
   tensorrt_options.has_user_compute_stream = 0;
   tensorrt_options.user_compute_stream = nullptr;
 
+  std::string cache_path = trt_engine_cache_path_;
+  if (enable_trt_engine_cache_) {
+    if (cache_path.empty()) {
+      cache_path = "/tmp/deep_ros_ort_trt_cache";
+    }
+    std::error_code ec;
+    std::filesystem::create_directories(cache_path, ec);
+    if (ec) {
+      RCLCPP_WARN(
+        logger_, "Failed to create TensorRT cache directory %s: %s", cache_path.c_str(), ec.message().c_str());
+    }
+    trt_engine_cache_path_ = cache_path;
+    tensorrt_options.trt_engine_cache_path = trt_engine_cache_path_.c_str();
+  }
+
   RCLCPP_INFO(logger_, "Configuring TensorRT execution provider on device %d", device_id_);
   session_options_->AppendExecutionProvider_TensorRT(tensorrt_options);
   RCLCPP_INFO(logger_, "TensorRT provider registered successfully");
@@ -239,34 +317,6 @@ ONNXTensorElementDataType OrtGpuBackendExecutor::convert_to_onnx_type(deep_ros::
   }
 }
 
-std::vector<size_t> OrtGpuBackendExecutor::get_output_shape(const std::vector<size_t> & input_shape) const
-{
-  if (!session_) {
-    throw std::runtime_error("No model loaded");
-  }
-
-  try {
-    // Get output tensor info from the session
-    auto output_type_info = session_->GetOutputTypeInfo(0);
-    auto tensor_info = output_type_info.GetTensorTypeAndShapeInfo();
-    auto shape = tensor_info.GetShape();
-
-    std::vector<size_t> output_shape;
-    for (int64_t dim : shape) {
-      if (dim == -1) {
-        // Dynamic dimension - use corresponding input dimension
-        output_shape.push_back(input_shape[output_shape.size()]);
-      } else {
-        output_shape.push_back(static_cast<size_t>(dim));
-      }
-    }
-
-    return output_shape;
-  } catch (const std::exception & e) {
-    throw std::runtime_error("Failed to get output shape: " + std::string(e.what()));
-  }
-}
-
 size_t OrtGpuBackendExecutor::get_element_size(deep_ros::DataType dtype) const
 {
   switch (dtype) {
@@ -285,4 +335,9 @@ size_t OrtGpuBackendExecutor::get_element_size(deep_ros::DataType dtype) const
   }
 }
 
+bool OrtGpuBackendExecutor::is_tensorrt_provider() const
+{
+  return normalized_provider_ == "tensorrt";
+}
+
 }  // namespace deep_ort_gpu_backend
diff --git a/deep_ort_gpu_backend_plugin/src/ort_gpu_backend_plugin.cpp b/deep_ort_gpu_backend_plugin/src/ort_gpu_backend_plugin.cpp
index 5bf0757..6709ca4 100644
--- a/deep_ort_gpu_backend_plugin/src/ort_gpu_backend_plugin.cpp
+++ b/deep_ort_gpu_backend_plugin/src/ort_gpu_backend_plugin.cpp
@@ -29,6 +29,8 @@ namespace deep_ort_gpu_backend
 OrtGpuBackendPlugin::OrtGpuBackendPlugin()
 : device_id_(0)
 , execution_provider_("cuda")
+, enable_trt_engine_cache_(false)
+, trt_engine_cache_path_("/tmp/deep_ros_ort_trt_cache")
 {
   // GPU components will be initialized in initialize() after parameters are loaded
 }
@@ -37,13 +39,10 @@ void OrtGpuBackendPlugin::initialize(rclcpp_lifecycle::LifecycleNode::SharedPtr
 {
   node_ = node;
 
-  // Declare parameters with defaults
-  node_->declare_parameter("Backend.device_id", 0);
-  node_->declare_parameter("Backend.execution_provider", "cuda");
-
-  // Read parameters
   device_id_ = node_->get_parameter("Backend.device_id").as_int();
   execution_provider_ = node_->get_parameter("Backend.execution_provider").as_string();
+  enable_trt_engine_cache_ = node_->get_parameter("Backend.trt_engine_cache_enable").as_bool();
+  trt_engine_cache_path_ = node_->get_parameter("Backend.trt_engine_cache_path").as_string();
 
   RCLCPP_INFO(
     node_->get_logger(),
@@ -88,7 +87,8 @@ bool OrtGpuBackendPlugin::initialize_gpu_components()
   }
 
   // Create GPU executor
-  executor_ = std::make_shared<OrtGpuBackendExecutor>(device_id_, execution_provider_, node_->get_logger());
+  executor_ = std::make_shared<OrtGpuBackendExecutor>(
+    device_id_, execution_provider_, node_->get_logger(), enable_trt_engine_cache_, trt_engine_cache_path_);
   if (!executor_) {
     RCLCPP_ERROR(node_->get_logger(), "Failed to create GPU backend executor");
     return false;
diff --git a/deep_ort_gpu_backend_plugin/test/test_ort_gpu_backend.cpp b/deep_ort_gpu_backend_plugin/test/test_ort_gpu_backend.cpp
index 6896eb3..8dd0102 100644
--- a/deep_ort_gpu_backend_plugin/test/test_ort_gpu_backend.cpp
+++ b/deep_ort_gpu_backend_plugin/test/test_ort_gpu_backend.cpp
@@ -54,7 +54,7 @@ bool is_cuda_available()
   try {
     // Try creating a simple CUDA execution provider to verify CUDA availability
     auto logger = rclcpp::get_logger("test_cuda_availability");
-    OrtGpuBackendExecutor test_executor(0, "cuda", logger);
+    OrtGpuBackendExecutor test_executor(0, "cuda", logger, false, "");
     return true;
   } catch (...) {
     return false;
@@ -122,7 +122,7 @@ TEST_CASE("OrtGpuBackendExecutor basic functionality", "[executor]")
   }
 
   auto logger = rclcpp::get_logger("test_executor");
-  OrtGpuBackendExecutor executor(0, "cuda", logger);
+  OrtGpuBackendExecutor executor(0, "cuda", logger, false, "");
   auto allocator = get_ort_gpu_cpu_allocator();
 
   // Test supported formats
@@ -222,7 +222,7 @@ TEST_CASE("TensorRT execution provider", "[tensorrt][!mayfail]")
 {
   try {
     auto logger = rclcpp::get_logger("test_tensorrt");
-    OrtGpuBackendExecutor executor(0, "tensorrt", logger);
+    OrtGpuBackendExecutor executor(0, "tensorrt", logger, false, "");
 
     REQUIRE(executor.get_device_id() == 0);
diff --git a/deep_tools/camera_sync/config/multi_camera_sync_params.yaml b/deep_tools/camera_sync/config/multi_camera_sync_params.yaml
index 6c633fb..a5feb66 100644
--- a/deep_tools/camera_sync/config/multi_camera_sync_params.yaml
+++ b/deep_tools/camera_sync/config/multi_camera_sync_params.yaml
@@ -24,10 +24,10 @@ multi_camera_sync:
     # Camera topics to synchronize
     # For raw images, use topics like: ["/camera1/image_raw", "/camera2/image_raw"]
    # For compressed images, use topics like: ["/camera1/image_raw/compressed", "/camera2/image_raw/compressed"]
-    camera_topics: ["/front/image_compressed", "/back/image_compressed", "/port/image_compressed"]
+    camera_topics: ["/CAM_FRONT/image_rect_compressed", "/CAM_FRONT_LEFT/image_rect_compressed", "/CAM_FRONT_RIGHT/image_rect_compressed"]
 
     # Optional names for the cameras (if not provided, will auto-generate camera_1, camera_2, etc.)
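+    # Each name pairs by position with the topic at the same index in camera_topics.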
-    camera_names: ["front", "back", "port"]
+    camera_names: ["front", "left", "right"]
 
     # Whether to use compressed images (sensor_msgs/CompressedImage) instead of raw (sensor_msgs/Image)
     # Set to true if your camera topics publish compressed images
diff --git a/deep_tools/camera_sync/include/camera_sync/multi_camera_sync_node.hpp b/deep_tools/camera_sync/include/camera_sync/multi_camera_sync_node.hpp
index 8a77696..82248e2 100644
--- a/deep_tools/camera_sync/include/camera_sync/multi_camera_sync_node.hpp
+++ b/deep_tools/camera_sync/include/camera_sync/multi_camera_sync_node.hpp
@@ -15,7 +15,13 @@
 #ifndef CAMERA_SYNC__MULTI_CAMERA_SYNC_NODE_HPP_
 #define CAMERA_SYNC__MULTI_CAMERA_SYNC_NODE_HPP_
 
-#include <cv_bridge/cv_bridge.h>
+#if __has_include(<cv_bridge/cv_bridge.hpp>)
+  #include <cv_bridge/cv_bridge.hpp>
+#elif __has_include(<cv_bridge/cv_bridge.h>)
+  #include <cv_bridge/cv_bridge.h>
+#else
+  #error "cv_bridge headers not found"
+#endif
 #include <memory>
 #include <string>
 #include <vector>
diff --git a/onnxruntime_gpu_vendor/env_hook/onnxruntime_library_path.sh.in b/onnxruntime_gpu_vendor/env_hook/onnxruntime_library_path.sh.in
new file mode 100644
index 0000000..394042e
--- /dev/null
+++ b/onnxruntime_gpu_vendor/env_hook/onnxruntime_library_path.sh.in
@@ -0,0 +1,2 @@
+# colcon hooks: set LD_LIBRARY_PATH
+ament_prepend_unique_value LD_LIBRARY_PATH "$AMENT_CURRENT_PREFIX/lib"
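
Note for downstream configs: `initialize()` now reads `Backend.trt_engine_cache_enable` and `Backend.trt_engine_cache_path` without declaring them, so the hosting lifecycle node must declare and supply these parameters. A minimal parameter-file sketch of what that could look like (the node name and values here are illustrative assumptions, not part of this diff):

```yaml
deep_object_detection_node:
  ros__parameters:
    Backend:
      device_id: 0
      execution_provider: "tensorrt"
      trt_engine_cache_enable: true   # reuse serialized engines across runs
      trt_engine_cache_path: "/tmp/deep_ros_ort_trt_cache"
```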