diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 9ca7f59..9e4f615 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -34,7 +34,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ && rm -rf /var/lib/apt/lists/* ENV DEBIAN_FRONTEND=interactive -# Install Tensorrt Runtime RUN curl -fsSL -o cuda-keyring_1.1-1_all.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb \ && dpkg -i cuda-keyring_1.1-1_all.deb \ && apt-get update && apt-get install -y --no-install-recommends \ @@ -59,8 +58,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ curl \ && rm -rf /var/lib/apt/lists/* -# Install pre-commit -RUN python3 -m pip install --no-cache-dir pre-commit==3.8.0 +# Install pre-commit (Ubuntu 24.04 ships PEP 668 managed Python, so allow system install) +# RUN python3 -m pip install --no-cache-dir --break-system-packages pre-commit==3.8.0 # Set working dir (matches VSCode workspace) WORKDIR /deep_ros_ws diff --git a/DEVELOPING.md b/DEVELOPING.md index c0009f5..e290f3a 100644 --- a/DEVELOPING.md +++ b/DEVELOPING.md @@ -26,6 +26,75 @@ This project includes VS Code dev container configurations for easy ROS2 develop - **Build tools**: Includes `colcon` and `rosdep` for ROS development - **Extensions**: C++, CMake, Python, and XML support pre-installed +### Stopping Containers + +After using "Rebuild and Reopen in Container", you can stop containers using: + +**Method 1: VS Code Command (Recommended)** +- Press `Ctrl+Shift+P` (or `Cmd+Shift+P` on Mac) +- Type: `Dev Containers: Reopen Folder Locally` +- This closes the container and returns to local mode + +**Method 2: Stop Container** +- Press `Ctrl+Shift+P` +- Type: `Dev Containers: Stop Container` +- This stops the container but keeps it available for later use + +**Method 3: Using Docker Commands** +From a terminal (outside the container): + +```bash +# List running containers +docker ps + +# Stop a specific container by name +docker stop <container_name> + +# Stop all dev containers +docker ps -q --filter "name=vsc-deep_ros" | xargs -r docker stop + +# Stop and remove containers +docker stop <container_name> +docker rm <container_name> +``` + +**Method 4: Close VS Code Window** +- Simply closing the VS Code window will stop the container when you exit +- The container will remain stopped until you reopen the folder in container mode + +### Restarting Containers + +After stopping a container, you can restart it using: + +**Method 1: VS Code Command (Recommended)** +- Press `Ctrl+Shift+P` (or `Cmd+Shift+P` on Mac) +- Type: `Dev Containers: Reopen in Container` +- This will start the existing container or create a new one if needed + +**Method 2: Rebuild and Reopen** +- Press `Ctrl+Shift+P` +- Type: `Dev Containers: Rebuild and Reopen in Container` +- Use this if you want to rebuild the container from scratch (e.g., after Dockerfile changes) + +**Method 3: Using Docker Commands** +From a terminal (outside the container): + +```bash +# List all containers (including stopped) +docker ps -a + +# Start a stopped container by name +docker start <container_name> + +# Start and attach to container +docker start <container_name> +docker attach <container_name> +``` + +**Method 4: Reopen Folder in VS Code** +- If VS Code detects a devcontainer configuration, it will prompt you to "Reopen in Container" +- Click the notification or use the command palette option + ### Common Commands Inside the container, you can run ros2 commands, colcon commands, rosdep, etc.
diff --git a/deep_core/include/deep_core/plugin_interfaces/backend_memory_allocator.hpp b/deep_core/include/deep_core/plugin_interfaces/backend_memory_allocator.hpp index 2c6728c..30adf59 100644 --- a/deep_core/include/deep_core/plugin_interfaces/backend_memory_allocator.hpp +++ b/deep_core/include/deep_core/plugin_interfaces/backend_memory_allocator.hpp @@ -101,6 +101,17 @@ class BackendMemoryAllocator */ void copy_device_to_device(void * dst, const void * src, size_t bytes); + /** + * @brief Copy data from host (CPU) to device memory - alias for copy_from_host + * @param dst Destination device memory pointer + * @param src Source host memory pointer + * @param bytes Number of bytes to copy + */ + void copy_host_to_device(void * dst, const void * src, size_t bytes) + { + copy_from_host(dst, src, bytes); + } + protected: /** * @brief Implementation of copy_from_host (to be overridden by backends) diff --git a/deep_object_detection/CMakeLists.txt b/deep_object_detection/CMakeLists.txt new file mode 100644 index 0000000..4b09bcd --- /dev/null +++ b/deep_object_detection/CMakeLists.txt @@ -0,0 +1,128 @@ +# Copyright (c) 2025-present WATonomous. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +cmake_minimum_required(VERSION 3.22) +project(deep_object_detection) + +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() + +if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + add_compile_options(-Wall -Wextra -Wpedantic) +endif() + +find_package(ament_cmake REQUIRED) +find_package(rclcpp REQUIRED) +find_package(rclcpp_components REQUIRED) +find_package(rclcpp_lifecycle REQUIRED) +find_package(sensor_msgs REQUIRED) +find_package(cv_bridge REQUIRED) +find_package(vision_msgs REQUIRED) +find_package(deep_core REQUIRED) +find_package(deep_msgs REQUIRED) +find_package(pluginlib REQUIRED) +find_package(rcl_interfaces REQUIRED) +find_package(OpenCV REQUIRED COMPONENTS core imgproc) + +add_executable(deep_object_detection_node + src/main.cpp +) + +add_library(deep_object_detection_lib SHARED + src/backend_manager.cpp + src/generic_postprocessor.cpp + src/image_preprocessor.cpp + src/deep_object_detection_node.cpp +) + +target_include_directories(deep_object_detection_lib PUBLIC + $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include> + $<INSTALL_INTERFACE:include> +) + +target_link_libraries(deep_object_detection_lib + PUBLIC + ${rclcpp_TARGETS} + ${rclcpp_lifecycle_TARGETS} + ${deep_core_TARGETS} + ${pluginlib_TARGETS} + ${vision_msgs_TARGETS} + ${deep_msgs_TARGETS} + deep_core::deep_core_lib + PRIVATE + ${rclcpp_components_TARGETS} + ${sensor_msgs_TARGETS} + ${cv_bridge_TARGETS} + ${rcl_interfaces_TARGETS} + ${OpenCV_LIBRARIES} +) + +target_include_directories(deep_object_detection_lib SYSTEM PUBLIC ${OpenCV_INCLUDE_DIRS}) + +rclcpp_components_register_nodes(deep_object_detection_lib "deep_object_detection::DeepObjectDetectionNode") + +target_link_libraries(deep_object_detection_node + deep_object_detection_lib +) + +target_include_directories(deep_object_detection_node PUBLIC + $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include> + $<INSTALL_INTERFACE:include> +) + +install(TARGETS deep_object_detection_lib +
ARCHIVE DESTINATION lib + LIBRARY DESTINATION lib + RUNTIME DESTINATION bin +) + +install(TARGETS deep_object_detection_node + RUNTIME DESTINATION lib/${PROJECT_NAME} +) + +install(DIRECTORY include/ + DESTINATION include +) + +install(DIRECTORY config launch + DESTINATION share/${PROJECT_NAME} +) + +if(BUILD_TESTING) + find_package(deep_test REQUIRED) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/test/test_postprocessors.cpp) + add_deep_test(test_postprocessors test/test_postprocessors.cpp + LIBRARIES + deep_object_detection_lib + ${OpenCV_LIBRARIES} + ${cv_bridge_TARGETS} + ${sensor_msgs_TARGETS} + ${rcl_interfaces_TARGETS} + ) + endif() + + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/test/test_preprocessor.cpp) + add_deep_test(test_preprocessor test/test_preprocessor.cpp + LIBRARIES + deep_object_detection_lib + ${OpenCV_LIBRARIES} + ${cv_bridge_TARGETS} + ${sensor_msgs_TARGETS} + ${rcl_interfaces_TARGETS} + ) + endif() +endif() + +ament_package() diff --git a/deep_object_detection/README.md b/deep_object_detection/README.md new file mode 100644 index 0000000..4fc9bb3 --- /dev/null +++ b/deep_object_detection/README.md @@ -0,0 +1,296 @@ +# Deep Object Detection Node + +A deep learning object detection node for ROS 2. Simply provide any ONNX-compatible object detection model, and the node will automatically detect and adapt to its output format. + +## Overview + +The `deep_object_detection` package provides: +- Model-agnostic object detection using ONNX models +- Automatic output format detection and adaptation +- Multi-camera support via MultiImage messages with flexible batching +- Configurable preprocessing and postprocessing pipelines +- Explicit execution provider selection (TensorRT, CUDA, CPU) with fail-fast behavior +- Full ROS 2 lifecycle node support + +## Package Contents + +### Nodes + +- **`deep_object_detection_node`**: A lifecycle node that subscribes to MultiImage messages (synchronized multi-camera input), runs object detection inference, and publishes detection results. + +### Configuration + +- **`config/generic_model_params.yaml`**: Example configuration showing the parameter structure for model, preprocessing, postprocessing, and execution provider settings. + +### Launch Files + +- **`launch/deep_object_detection.launch.yaml`**: Launch file for the object detection node with lifecycle management. + +## Usage + +### Building the Package + +```bash +cd /path/to/your/ros2_workspace +colcon build --packages-select deep_object_detection +source install/setup.bash +``` + +### Running the Node + +1. **Basic usage**: + + ```bash + ros2 launch deep_object_detection deep_object_detection.launch.yaml \ + config_file:=/path/to/generic_model_params.yaml + ``` + +2. **With topic remappings**: + + ```bash + ros2 launch deep_object_detection deep_object_detection.launch.yaml \ + config_file:=/path/to/config.yaml \ + input_topic:=/my_camera/multi_image \ + output_detections_topic:=/my_detections + ``` + +3. **With a specific execution provider**: + + ```bash + ros2 launch deep_object_detection deep_object_detection.launch.yaml \ + config_file:=/path/to/config.yaml \ + preferred_provider:=tensorrt + ``` + +4. **Manual lifecycle transitions** (if not using lifecycle manager): + + ```bash + # Configure the node + ros2 lifecycle set /deep_object_detection_node configure + + # Activate the node + ros2 lifecycle set /deep_object_detection_node activate + ``` + +### Running with Camera Sync Node + +The object detection node requires the `camera_sync` node for synchronized multi-camera processing. + +1.
**Launch camera sync node**: + + ```bash + ros2 launch camera_sync multi_camera_sync.launch.yaml + ``` + +2. **Launch object detection node**: + + ```bash + ros2 launch deep_object_detection deep_object_detection.launch.yaml \ + config_file:=/path/to/config.yaml \ + input_topic:=/multi_camera_sync/multi_image_compressed + ``` + + Configure the detection node to subscribe to the MultiImage topic either via: + - **Remapping** (recommended): `input_topic:=/multi_camera_sync/multi_image_compressed` in launch command + - **Parameter**: Set `input_topic: "/multi_camera_sync/multi_image_compressed"` in config file + + Also configure batching to match the number of cameras: + + ```yaml + min_batch_size: 3 # Minimum images before processing (match the number of cameras) + max_batch_size: 3 # Maximum images per batch + ``` + +### Zero-Copy Component Container (High Performance) + +For maximum performance, run both `multi_camera_sync` and `deep_object_detection` nodes in a single component container with intra-process communication: + +```yaml +launch: + - node_container: + pkg: rclcpp_components + exec: component_container_mt + name: detection_container + param: + - name: use_intra_process_comms + value: true + composable_node: + - pkg: camera_sync + plugin: camera_sync::MultiCameraSyncNode + name: multi_camera_sync + extra_arg: + - name: use_intra_process_comms + value: "true" + - pkg: deep_object_detection + plugin: deep_object_detection::DeepObjectDetectionNode + name: deep_object_detection_node + extra_arg: + - name: use_intra_process_comms + value: "true" +``` + +## Configuration + +The node uses a nested parameter structure for configuration. Many parameters have sensible defaults. + +### Configuration Example + +```yaml +deep_object_detection_node: + ros__parameters: + # Required + model_path: "/path/to/model.onnx" # Absolute path to ONNX model file + input_topic: "/multi_camera_sync/multi_image_compressed" + + # Model configuration + model: + num_classes: 80 + bbox_format: "cxcywh" # cxcywh, xyxy, or xywh + class_names_path: "/path/to/classes.txt" # Optional: absolute path to class names file + + # Preprocessing + preprocessing: + input_width: 640 + input_height: 640 + normalization_type: "scale_0_1" # scale_0_1, imagenet, custom, none + resize_method: "letterbox" # letterbox, resize, crop, pad + color_format: "rgb" # rgb or bgr + mean: [0.0, 0.0, 0.0] # For custom normalization + std: [1.0, 1.0, 1.0] # For custom normalization + + # Postprocessing + postprocessing: + score_threshold: 0.25 + nms_iou_threshold: 0.45 + max_detections: 300 + score_activation: "sigmoid" # sigmoid, softmax, none + class_score_mode: "all_classes" # all_classes or single_confidence + use_multi_output: false + layout: + auto_detect: true + + # Execution provider + preferred_provider: "tensorrt" # tensorrt, cuda, or cpu + device_id: 0 + enable_trt_engine_cache: false + trt_engine_cache_path: "/tmp/deep_ros_ort_trt_cache" + + # Batching + min_batch_size: 1 + max_batch_size: 3 + max_batch_latency_ms: 0 # 0 = wait for min_batch_size + queue_size: 10 # 0 = unlimited + + # Output + output_detections_topic: "/detections" +``` + +### Execution Provider Selection + +The node supports explicit provider selection with fail-fast behavior. If the specified provider is unavailable or fails to initialize, the node will immediately throw an error (no silent fallbacks).
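To illustrate the fail-fast contract, here is a minimal C++ sketch (illustrative only; `createExecutorFor` is a hypothetical stand-in for the real plugin factory, not this package's API):

```cpp
#include <memory>
#include <stdexcept>
#include <string>

// Hypothetical stand-in for the backend factory: returns nullptr when the
// requested provider cannot be initialized on this machine.
std::shared_ptr<int> createExecutorFor(const std::string & provider)
{
  return provider == "cpu" ? std::make_shared<int>(0) : nullptr;
}

std::shared_ptr<int> initProvider(const std::string & preferred_provider)
{
  auto executor = createExecutorFor(preferred_provider);  // try exactly the requested provider
  if (!executor) {
    // Fail fast: surface misconfiguration at configure time instead of
    // silently falling back to a slower provider.
    throw std::runtime_error("execution provider unavailable: " + preferred_provider);
  }
  return executor;
}
```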
+ +**Available Providers:** +- `tensorrt` - TensorRT execution provider (requires CUDA and TensorRT) +- `cuda` - CUDA execution provider (requires CUDA) +- `cpu` - CPU execution provider (always available) + +## Expected Model Format + +The node automatically detects and adapts to various ONNX model output formats: + +- `[batch, detections, features]` - standard format +- `[batch, features, detections]` - transposed format +- `[batch, queries, 4+classes]` - query-based models (e.g., DETR) +- `[batch, channels, anchors]` - anchor-based models +- Any other layout - automatically detected and handled + +**Input:** The node expects MultiImage messages (deep_msgs/MultiImage) containing synchronized compressed images from multiple cameras. + +**Output:** Detection2DArray messages containing bounding boxes, scores, and class IDs for each image in the batch. + +## Parameters + +### Required Parameters +- **`model_path`** (string): Absolute path to ONNX model file (e.g., `/workspaces/deep_ros/yolov8m.onnx`). +- **`input_topic`** (string): MultiImage topic name to subscribe to. + +### Key Parameters +- **`class_names_path`** (string, optional): Absolute path to text file with class names, one per line (e.g., `/workspaces/deep_ros/deep_object_detection/config/coco_classes.txt`). If not provided, class IDs will be used in output messages. +- **`model.num_classes`** (int, default: 80): Number of detection classes. +- **`model.bbox_format`** (string, default: "cxcywh"): Bounding box format (cxcywh, xyxy, or xywh). +- **`preprocessing.input_width/input_height`** (int, default: 640): Model input image dimensions. +- **`preprocessing.normalization_type`** (string, default: "scale_0_1"): Normalization method. +- **`preprocessing.resize_method`** (string, default: "letterbox"): Image resizing method. +- **`postprocessing.score_threshold`** (float, default: 0.25): Minimum confidence score. +- **`postprocessing.nms_iou_threshold`** (float, default: 0.45): IoU threshold for NMS. +- **`preferred_provider`** (string, default: "tensorrt"): Execution provider (tensorrt, cuda, or cpu). +- **`min_batch_size`** (int, default: 1): Minimum images before processing. +- **`max_batch_size`** (int, default: 3): Maximum images per batch. + +See `config/generic_model_params.yaml` for a complete parameter reference. + +## Topics + +Topic names can be configured either via parameters in the config file or via remappings in the launch file. Remappings take precedence over parameter values. + +### Input Topics +- **`input_topic`**: MultiImage messages (deep_msgs/MultiImage) containing synchronized compressed images from multiple cameras. + +**Note:** The node only supports MultiImage messages. Individual camera topics are not supported. + +**Configuration:** +- Via parameter: Set `input_topic` in the config file +- Via remapping: Use `input_topic:=/your/topic/name` in the launch command or add remapping in launch file + +### Output Topics +- **`output_detections_topic`**: Detection2DArray messages (default: "/detections") containing bounding boxes, scores, and class IDs for each image in the batch. 
+ +**Configuration:** +- Via parameter: Set `output_detections_topic` in the config file +- Via remapping: Use `output_detections_topic:=/your/topic/name` in the launch command or add remapping in launch file + +**Example launch file with remappings:** + +```yaml +- node: + pkg: "deep_object_detection" + exec: "deep_object_detection_node" + name: "deep_object_detection_node" + remap: + - from: "/multi_camera_sync/multi_image_compressed" + to: "/my_camera/multi_image" + - from: "/detections" + to: "/my_detections" + param: + - from: "$(var config_file)" +``` + +## Batching Behavior + +The node supports flexible batching with three parameters: + +- **`min_batch_size`**: Minimum number of images required before processing (default: 1) +- **`max_batch_size`**: Maximum number of images per batch (default: 3) +- **`max_batch_latency_ms`**: Maximum wait time in milliseconds before processing even if `min_batch_size` is not met (default: 0 = wait indefinitely) + +**Use Cases:** +- **Single camera**: `min_batch_size=1, max_batch_size=1` - process each image immediately +- **Multi-camera synchronized**: `min_batch_size=N, max_batch_size=N` where N = number of cameras +- **Best-effort batching**: `min_batch_size=1, max_batch_size=N` with optional `max_batch_latency_ms` timeout + +## Limitations + +1. **MultiImage input only**: The node only supports MultiImage messages. Individual camera topics are not supported. +2. **Compressed images only**: Only compressed images (sensor_msgs/CompressedImage) are supported. Raw images are not supported. +3. **Fail-fast provider selection**: If the specified execution provider is unavailable, the node fails immediately. No automatic fallback to other providers. +4. **No dynamic reconfiguration**: Parameters cannot be changed at runtime. Node must be reconfigured to change parameters. 
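For reference, the batching rules from the *Batching Behavior* section can be summarized in a small decision function (an illustrative sketch with hypothetical names, not the node's actual code):

```cpp
#include <chrono>
#include <cstddef>

using Clock = std::chrono::steady_clock;

struct BatchPolicy
{
  std::size_t min_batch_size{1};
  std::size_t max_batch_size{3};
  std::chrono::milliseconds max_batch_latency{0};  // 0 = wait indefinitely for min_batch_size
};

// Returns true when the queued images should be dispatched as a batch.
bool shouldDispatch(const BatchPolicy & p, std::size_t queued, Clock::time_point oldest_enqueue_time)
{
  if (queued == 0) {
    return false;                                 // nothing to process
  }
  if (queued >= p.min_batch_size) {
    return true;                                  // enough images accumulated
  }
  if (p.max_batch_latency.count() == 0) {
    return false;                                 // keep waiting for min_batch_size
  }
  return Clock::now() - oldest_enqueue_time >= p.max_batch_latency;  // timeout reached
}
// The batch actually taken is the first min(queued, max_batch_size) images.
```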
+ +## Troubleshooting + +- **"No plugin loaded"**: Check that the backend plugin name is correct in the configuration +- **"No model loaded"**: Verify the model path exists and is a valid ONNX file +- **"Provider initialization failed"**: Check that the specified execution provider (tensorrt/cuda) is available and properly configured +- **Lifecycle errors**: Ensure the node is properly configured before activation +- **No detections**: Verify that the model output format matches the expected format, or enable auto-detection +- **Plugin discovery issues**: Check that `deep_ort_backend_plugin` is built and sourced +- **CUDA/TensorRT errors**: Ensure CUDA libraries and TensorRT (if using) are properly installed and accessible diff --git a/deep_object_detection/config/coco_classes.txt b/deep_object_detection/config/coco_classes.txt new file mode 100644 index 0000000..ca76c80 --- /dev/null +++ b/deep_object_detection/config/coco_classes.txt @@ -0,0 +1,80 @@ +person +bicycle +car +motorbike +aeroplane +bus +train +truck +boat +traffic light +fire hydrant +stop sign +parking meter +bench +bird +cat +dog +horse +sheep +cow +elephant +bear +zebra +giraffe +backpack +umbrella +handbag +tie +suitcase +frisbee +skis +snowboard +sports ball +kite +baseball bat +baseball glove +skateboard +surfboard +tennis racket +bottle +wine glass +cup +fork +knife +spoon +bowl +banana +apple +sandwich +orange +broccoli +carrot +hot dog +pizza +donut +cake +chair +sofa +pottedplant +bed +diningtable +toilet +tvmonitor +laptop +mouse +remote +keyboard +cell phone +microwave +oven +toaster +sink +refrigerator +book +clock +vase +scissors +teddy bear +hair drier +toothbrush diff --git a/deep_object_detection/config/generic_model_params.yaml b/deep_object_detection/config/generic_model_params.yaml new file mode 100644 index 0000000..b9d90e8 --- /dev/null +++ b/deep_object_detection/config/generic_model_params.yaml @@ -0,0 +1,46 @@ +deep_object_detection_node: + ros__parameters: + model_path: "/workspaces/deep_ros/yolov8m.onnx" + class_names_path: "/workspaces/deep_ros/deep_object_detection/config/coco_classes.txt" + + model: + num_classes: 80 + bbox_format: "cxcywh" + + preprocessing: + input_width: 640 + input_height: 640 + normalization_type: "scale_0_1" + resize_method: "letterbox" + color_format: "rgb" + + postprocessing: + score_threshold: 0.25 + nms_iou_threshold: 0.45 + max_detections: 300 + score_activation: "sigmoid" + use_multi_output: false + + class_score_mode: "all_classes" + class_score_start_idx: -1 + class_score_count: -1 + + coordinate_space: "preprocessed" + + layout: + auto_detect: true + batch_dim: 0 + detection_dim: 1 + feature_dim: 2 + bbox_start_idx: 0 + bbox_count: 4 + score_idx: 4 + class_idx: 5 + + max_batch_size: 3 + queue_size: 10 + + preferred_provider: "tensorrt" + device_id: 0 + enable_trt_engine_cache: true + trt_engine_cache_path: "/tmp/deep_ros_ort_trt_cache" diff --git a/deep_object_detection/include/deep_object_detection/backend_manager.hpp b/deep_object_detection/include/deep_object_detection/backend_manager.hpp new file mode 100644 index 0000000..b3d2ba8 --- /dev/null +++ b/deep_object_detection/include/deep_object_detection/backend_manager.hpp @@ -0,0 +1,232 @@ +// Copyright (c) 2025-present WATonomous. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * @file backend_manager.hpp + * @brief Backend plugin manager for ONNX Runtime inference + * + * This header defines the BackendManager class which: + * - Loads ONNX Runtime backend plugins (CPU, CUDA, TensorRT) + * - Initializes models with selected execution providers + * - Provides memory allocators for tensor operations + * - Executes inference on batched input tensors + * - Supports fail-fast behavior (no silent fallbacks) + */ + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "deep_object_detection/detection_types.hpp" + +namespace deep_ros +{ +class BackendInferenceExecutor; +class BackendMemoryAllocator; +} // namespace deep_ros + +namespace deep_object_detection +{ + +/** + * @brief Manages backend plugin loading and inference execution + * + * Handles loading of ONNX Runtime backend plugins (CPU, CUDA, TensorRT), + * model loading, and inference execution. Supports fail-fast behavior: + * if the requested provider is unavailable, initialization fails immediately. + * + * The manager: + * - Loads the appropriate backend plugin based on preferred_provider + * - Initializes the model with the selected execution provider + * - Provides memory allocator for tensor operations + * - Executes inference on batched input tensors + * - Warms up tensor shape cache for optimal performance + */ +class BackendManager +{ +public: + /** + * @brief Construct the backend manager + * @param node Lifecycle node reference for parameter access and logging + * @param params Detection parameters containing model path and provider settings + * + * Does not initialize the backend. Call initialize() after construction. + */ + BackendManager(rclcpp_lifecycle::LifecycleNode & node, const DetectionParams & params); + + /** + * @brief Destructor: releases plugin resources + */ + ~BackendManager(); + + /** + * @brief Initialize the backend: load plugin, model, and warmup + * + * Loads the backend plugin, initializes the model with the selected execution + * provider, and warms up tensor shape cache. Throws if provider is unavailable + * or model loading fails (fail-fast behavior). + */ + void initialize(); + + /** + * @brief Run inference on a batched input tensor + * @param input Packed input tensor (batched preprocessed images) + * @return Output tensor from model inference + * + * Converts PackedInput to deep_ros::Tensor and runs inference. + * Returns the first output tensor (for single-output models). + */ + deep_ros::Tensor infer(const PackedInput & input); + + /** + * @brief Run inference and return all output tensors + * @param input Packed input tensor (batched preprocessed images) + * @return Vector of all output tensors from model + * + * Useful for multi-output models (e.g., separate boxes, scores, classes). 
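 *
 * Illustrative calling code (hypothetical, not part of this header):
 * @code
 * std::vector<deep_ros::Tensor> outputs = backend_manager->inferAllOutputs(packed_input);
 * // e.g. for a 3-output model: outputs[0] = boxes, outputs[1] = scores, outputs[2] = classes
 * @endcode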
*/ + std::vector<deep_ros::Tensor> inferAllOutputs(const PackedInput & input); + + /** + * @brief Get the active execution provider name + * @return String name of the active provider ("tensorrt", "cuda", or "cpu") + */ + const std::string & activeProvider() const + { + return active_provider_; + } + + /** + * @brief Get the memory allocator for tensor operations + * @return Shared pointer to backend memory allocator + * + * Used by preprocessor to allocate tensors in backend-appropriate memory + * (e.g., GPU memory for CUDA/TensorRT, CPU memory for CPU provider). + */ + std::shared_ptr<deep_ros::BackendMemoryAllocator> allocator() const + { + return allocator_; + } + + /** + * @brief Check if executor is initialized + * @return True if executor is available, false otherwise + */ + bool hasExecutor() const + { + return static_cast<bool>(executor_); + } + + /** + * @brief Get output tensor shape for given input shape + * @param input_shape Input tensor shape [batch, channels, height, width] + * @return Output tensor shape + * + * Uses cached shape information from warmup. Returns shape of first output + * tensor for the given input shape. + */ + std::vector<size_t> getOutputShape(const std::vector<size_t> & input_shape) const; + +private: + /** + * @brief Initialize the backend plugin and load model + * + * Loads the plugin using pluginlib, creates executor and allocator, + * and loads the ONNX model. Throws if provider is unavailable or model + * loading fails. + */ + void initializeBackend(); + + /** + * @brief Warmup tensor shape cache for optimal performance + * + * Runs dummy inference with expected input shapes to warm up TensorRT + * engine cache and optimize memory allocation. Always enabled (not configurable). + */ + void warmupTensorShapeCache(); + + /** + * @brief Parse provider string to Provider enum + * @param provider_str Provider name ("tensorrt", "cuda", or "cpu") + * @return Provider enum value + */ + Provider parseProvider(const std::string & provider_str) const; + + /** + * @brief Check if CUDA runtime is available + * @return True if CUDA is available, false otherwise + * + * Used to validate CUDA/TensorRT provider requirements. + */ + bool isCudaRuntimeAvailable() const; + + /** + * @brief Convert Provider enum to string + * @param provider Provider enum value + * @return String representation ("tensorrt", "cuda", or "cpu") + */ + std::string providerToString(Provider provider) const; + + /** + * @brief Declare active_provider parameter for runtime visibility + * @param value Provider name to declare + * + * Makes the active provider visible in ROS parameter server for debugging. + */ + void declareActiveProviderParameter(const std::string & value); + + /** + * @brief Build input tensor from PackedInput + * @param packed Packed input data (flattened float array and shape) + * @return Tensor ready for inference + * + * Converts PackedInput (CPU memory) to deep_ros::Tensor using the + * backend allocator (may copy to GPU memory for CUDA/TensorRT).
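 *
 * Conceptually (an illustrative sketch; allocate() is a hypothetical allocator
 * call, while copy_host_to_device() is the alias introduced in
 * backend_memory_allocator.hpp):
 * @code
 * size_t bytes = packed.data.size() * sizeof(float);
 * void * device_ptr = allocator_->allocate(bytes);                        // hypothetical
 * allocator_->copy_host_to_device(device_ptr, packed.data.data(), bytes);
 * // wrap device_ptr and packed.shape in a deep_ros::Tensor
 * @endcode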
*/ + deep_ros::Tensor buildInputTensor(const PackedInput & packed) const; + + /// Reference to lifecycle node (for parameters and logging) + rclcpp_lifecycle::LifecycleNode & node_; + /// Detection parameters (model path, provider settings) + const DetectionParams & params_; + /// Selected execution provider enum + Provider provider_; + /// Active provider name (for logging and parameter server) + std::string active_provider_{"unknown"}; + /// Inference executor (runs model inference) + std::shared_ptr<deep_ros::BackendInferenceExecutor> executor_; + /// Memory allocator (for tensor allocation) + std::shared_ptr<deep_ros::BackendMemoryAllocator> allocator_; + /// Plugin loader (loads backend plugin via pluginlib) + std::unique_ptr<pluginlib::ClassLoader<deep_ros::BackendPlugin>> plugin_loader_; + /// Plugin instance (holds loaded plugin) + pluginlib::UniquePtr<deep_ros::BackendPlugin> plugin_holder_; + + /** + * @brief Map Provider enum to plugin name + * @param provider Provider enum value + * @return Plugin name for pluginlib (e.g., "deep_ort_backend_plugin::OrtBackendPlugin") + */ + std::string providerToPluginName(Provider provider) const; +}; + +} // namespace deep_object_detection diff --git a/deep_object_detection/include/deep_object_detection/deep_object_detection_node.hpp b/deep_object_detection/include/deep_object_detection/deep_object_detection_node.hpp new file mode 100644 index 0000000..b1a4107 --- /dev/null +++ b/deep_object_detection/include/deep_object_detection/deep_object_detection_node.hpp @@ -0,0 +1,277 @@ +// Copyright (c) 2025-present WATonomous. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * @file deep_object_detection_node.hpp + * @brief ROS 2 lifecycle node for object detection using ONNX models + * + * This header defines the main DeepObjectDetectionNode class which: + * - Subscribes to MultiImage messages (synchronized multi-camera input) + * - Batches images for efficient inference + * - Runs preprocessing, inference, and postprocessing + * - Publishes Detection2DArray messages with bounding boxes and scores + * - Manages lifecycle states (configure, activate, deactivate, cleanup, shutdown) + */ + +#pragma once + +#include <deque> +#include <memory> +#include <mutex> +#include <string> +#include <vector> + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "deep_object_detection/backend_manager.hpp" +#include "deep_object_detection/detection_types.hpp" +#include "deep_object_detection/generic_postprocessor.hpp" +#include "deep_object_detection/image_preprocessor.hpp" + +namespace deep_object_detection +{ + +/** + * @brief ROS2 lifecycle node for object detection using ONNX models + * + * This node performs object detection on synchronized multi-camera streams via MultiImage messages.
* It supports: + * - MultiImage input: synchronized compressed images from multiple cameras + * - Batch processing: groups images for efficient inference + * - Multiple backends: CPU, CUDA, or TensorRT execution providers + * - Configurable preprocessing: resizing, normalization, color format conversion + * - Flexible postprocessing: NMS, score thresholds, multiple output formats + * + * The node follows the ROS2 lifecycle pattern: + * - unconfigured -> configuring -> inactive -> activating -> active + * - Can be deactivated/activated without full cleanup + * - Subscriptions and publishers are only active when the node is in the active state + */ +class DeepObjectDetectionNode : public rclcpp_lifecycle::LifecycleNode +{ +public: + /** + * @brief Construct the object detection node + * @param options Node options for ROS2 configuration + */ + explicit DeepObjectDetectionNode(const rclcpp::NodeOptions & options = rclcpp::NodeOptions()); + + /** + * @brief Configure the node: load model, initialize backends, setup postprocessor + * @return SUCCESS if configuration succeeds, FAILURE otherwise + * + * Loads class names, initializes image preprocessor, backend manager, and postprocessor. + * Detects model output shape and configures layout. Does not start subscriptions. + */ + rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn on_configure( + const rclcpp_lifecycle::State &) override; + + /** + * @brief Activate the node: start subscriptions and batch processing timer + * @return SUCCESS if activation succeeds, FAILURE otherwise + * + * Creates the MultiImage subscription, starts the batch processing timer, + * and activates the detection publisher. + */ + rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn on_activate( + const rclcpp_lifecycle::State &) override; + + /** + * @brief Deactivate the node: stop subscriptions and timer + * @return SUCCESS + * + * Stops all subscriptions, cancels the batch timer, and deactivates the publisher. + * Node can be reactivated without full cleanup. + */ + rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn on_deactivate( + const rclcpp_lifecycle::State &) override; + + /** + * @brief Cleanup the node: release all resources + * @return SUCCESS + * + * Releases backend manager, preprocessor, postprocessor, and clears all data. + * Node must be reconfigured before reactivation. + */ + rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn on_cleanup( + const rclcpp_lifecycle::State &) override; + + /** + * @brief Shutdown the node: final cleanup + * @return SUCCESS + * + * Performs final cleanup. Node cannot be reactivated after shutdown. + */ + rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn on_shutdown( + const rclcpp_lifecycle::State &) override; + +private: + /** + * @brief Declare and read all ROS parameters from the parameter server + * + * Reads configuration for model, preprocessing, postprocessing, execution provider, + * batching, and topic names. Validates required parameters and sets defaults. + */ + void declareAndReadParameters(); + + /** + * @brief Setup the MultiImage subscription + * + * Creates a subscription to the input_topic_ with best_effort QoS. + * Subscription is only active when the node is in the active state.
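 *
 * Illustrative sketch (hypothetical, not the exact implementation):
 * @code
 * auto qos = rclcpp::QoS(rclcpp::KeepLast(10)).best_effort();
 * multi_image_sub_ = create_subscription<deep_msgs::msg::MultiImage>(
 *   input_topic_, qos,
 *   [this](deep_msgs::msg::MultiImage::ConstSharedPtr msg) {onMultiImage(msg);});
 * @endcode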
*/ + void setupSubscription(); + + /** + * @brief Callback for MultiImage messages + * @param msg Shared pointer to MultiImage message containing synchronized compressed images + * + * Decodes each compressed image in the MultiImage message and enqueues them + * for batch processing. Failed decodes are dropped according to decode_failure_policy. + */ + void onMultiImage(const deep_msgs::msg::MultiImage::ConstSharedPtr & msg); + + /** + * @brief Handle a single compressed image from MultiImage message + * @param msg Compressed image message to decode and enqueue + * + * Decodes the compressed image using cv::imdecode(). If decoding fails, + * the image is dropped according to decode_failure_policy. + */ + void handleCompressedImage(const sensor_msgs::msg::CompressedImage & msg); + + /** + * @brief Enqueue an image for batch processing + * @param image Decoded BGR image (OpenCV Mat) + * @param header ROS message header with timestamp and frame_id + * + * Adds the image to the processing queue. If queue is full, drops oldest + * image according to queue_overflow_policy. Thread-safe. + */ + void enqueueImage(cv::Mat image, const std_msgs::msg::Header & header); + + /** + * @brief Timer callback for batch processing + * + * Called periodically (5ms) to check if enough images have accumulated + * to form a batch. Processes batch if min_batch_size is met or max_batch_latency_ms + * timeout has elapsed. + */ + void onBatchTimer(); + + /** + * @brief Process a batch of images through the inference pipeline + * @param batch Vector of queued images to process + * + * Runs preprocessing, inference, and postprocessing for the batch. + * Publishes detection results. Runs asynchronously to avoid blocking timer callback. + */ + void processBatch(const std::vector<QueuedImage> & batch); + + /** + * @brief Publish detection results for a batch + * @param batch_detections Detections for each image in the batch + * @param headers ROS headers for each image (for timestamp and frame_id) + * @param metas Image metadata for coordinate transformations + * + * Converts SimpleDetection objects to Detection2DArray messages and publishes + * them. Each image in the batch gets its own Detection2DArray message. + */ + void publishDetections( + const std::vector<std::vector<SimpleDetection>> & batch_detections, + const std::vector<std_msgs::msg::Header> & headers, + const std::vector<ImageMeta> & metas); + + /** + * @brief Load class names from file + * + * Reads class names from class_names_path (one per line) and stores them + * in params_.class_names. If file doesn't exist or is empty, class_names + * remains empty and class IDs will be used in output messages. + */ + void loadClassNames(); + + /** + * @brief Cleanup resources if configuration partially fails + * + * Releases resources that were allocated during on_configure() if an error + * occurs partway through configuration. Ensures no resource leaks. + */ + void cleanupPartialConfiguration(); + + /** + * @brief Cleanup all node resources + * + * Releases backend_manager_, preprocessor_, postprocessor_, and clears + * all subscriptions and timers. Called during on_cleanup() and on_shutdown(). + */ + void cleanupAllResources(); + + /** + * @brief Stop all subscriptions and cancel batch timer + * + * Called during on_deactivate() to stop processing without full cleanup. + * Node can be reactivated without reconfiguration.
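 *
 * Conceptually (illustrative only):
 * @code
 * multi_image_sub_.reset();                 // stop receiving MultiImage messages
 * if (batch_timer_) {batch_timer_->cancel();}
 * detection_pub_->on_deactivate();          // lifecycle publisher stops publishing
 * @endcode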
*/ + void stopSubscriptionsAndTimer(); + + /// Configuration parameters loaded from ROS parameter server + DetectionParams params_; + + /// Subscription to MultiImage topic (synchronized multi-camera input) + rclcpp::Subscription<deep_msgs::msg::MultiImage>::SharedPtr multi_image_sub_; + /// Input topic name (can be set via parameter or remapping) + std::string input_topic_; + /// Publisher for Detection2DArray messages + rclcpp_lifecycle::LifecyclePublisher<Detection2DArrayMsg>::SharedPtr detection_pub_; + /// Timer for periodic batch processing checks (5ms period) + rclcpp::TimerBase::SharedPtr batch_timer_; + + /// Queue of images waiting to be processed (thread-safe) + std::deque<QueuedImage> image_queue_; + /// Mutex protecting image_queue_ access + std::mutex queue_mutex_; + /// Callback group for subscription (allows concurrent execution) + rclcpp::CallbackGroup::SharedPtr callback_group_; + + /// Counter for dropped images (queue overflow or decode failures) + size_t dropped_images_count_; + + /// Image preprocessor (resize, normalize, color conversion) + std::unique_ptr<ImagePreprocessor> preprocessor_; + /// Postprocessor (NMS, coordinate transformation, message formatting) + std::unique_ptr<GenericPostprocessor> postprocessor_; + /// Backend manager (handles ONNX Runtime plugin loading and inference) + std::unique_ptr<BackendManager> backend_manager_; +}; + +/** + * @brief Factory function to create a DeepObjectDetectionNode instance + * @param options ROS2 node options for configuration + * @return Shared pointer to lifecycle node instance + * + * Used by rclcpp_components for composable node loading. + * Allows the node to be loaded as a component in a component container. + */ +std::shared_ptr<rclcpp_lifecycle::LifecycleNode> createDeepObjectDetectionNode( + const rclcpp::NodeOptions & options = rclcpp::NodeOptions()); + +} // namespace deep_object_detection diff --git a/deep_object_detection/include/deep_object_detection/detection_types.hpp b/deep_object_detection/include/deep_object_detection/detection_types.hpp new file mode 100644 index 0000000..f601af9 --- /dev/null +++ b/deep_object_detection/include/deep_object_detection/detection_types.hpp @@ -0,0 +1,447 @@ +// Copyright (c) 2025-present WATonomous. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * @file detection_types.hpp + * @brief Type definitions and enums for deep object detection + * + * This header defines: + * - Enum types for configuration options (Provider, BboxFormat, NormalizationType, etc.)
* - Configuration structures (PreprocessingConfig, PostprocessingConfig, DetectionParams) + * - Data structures (ImageMeta, QueuedImage, PackedInput, SimpleDetection) + * - Helper functions for string-to-enum conversion + * - ROS message type aliases (Detection2DMsg, Detection2DArrayMsg) + */ + +#pragma once + +#include <string> +#include <vector> + +#include <opencv2/core.hpp> +#include <std_msgs/msg/header.hpp> + +#if __has_include(<deep_msgs/msg/detection2_d.hpp>) + #include <deep_msgs/msg/detection2_d.hpp> + #include <deep_msgs/msg/detection2_d_array.hpp> + +namespace deep_object_detection +{ +using Detection2DMsg = deep_msgs::msg::Detection2D; +using Detection2DArrayMsg = deep_msgs::msg::Detection2DArray; +} // namespace deep_object_detection +#else + #include <vision_msgs/msg/detection2_d.hpp> + #include <vision_msgs/msg/detection2_d_array.hpp> + #include <vision_msgs/msg/object_hypothesis_with_pose.hpp> + +namespace deep_object_detection +{ +using Detection2DMsg = vision_msgs::msg::Detection2D; +using Detection2DArrayMsg = vision_msgs::msg::Detection2DArray; +} // namespace deep_object_detection +#endif + +namespace deep_object_detection +{ + +/// Number of RGB color channels +constexpr size_t RGB_CHANNELS = 3; + +/** + * @brief Execution provider for ONNX Runtime inference + */ +enum class Provider +{ + TENSORRT, ///< TensorRT execution provider (requires CUDA and TensorRT) + CUDA, ///< CUDA execution provider (requires CUDA) + CPU ///< CPU execution provider (always available) +}; + +/** + * @brief Postprocessor type (currently only generic is supported) + */ +enum class PostprocessorType +{ + GENERIC ///< Generic postprocessor (supports various output formats) +}; + +/** + * @brief Bounding box coordinate format + */ +enum class BboxFormat +{ + CXCYWH, ///< Center x, center y, width, height (YOLO format) + XYXY, ///< Top-left x, top-left y, bottom-right x, bottom-right y + XYWH ///< Top-left x, top-left y, width, height +}; + +/** + * @brief Image normalization method + */ +enum class NormalizationType +{ + IMAGENET, ///< ImageNet normalization (mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + SCALE_0_1, ///< Scale to [0, 1] range (divide by 255.0) + CUSTOM, ///< Custom mean and std values + NONE ///< No normalization +}; + +/** + * @brief Image resizing method + */ +enum class ResizeMethod +{ + LETTERBOX, ///< Maintain aspect ratio, pad with gray (114) to fit input size + RESIZE, ///< Stretch to input size (may distort aspect ratio) + CROP, ///< Center crop to input size + PAD ///< Pad to input size +}; + +/** + * @brief Score activation function for raw model outputs + */ +enum class ScoreActivation +{ + SIGMOID, ///< Sigmoid activation: 1 / (1 + exp(-x)) + SOFTMAX, ///< Softmax activation (not fully implemented, returns raw) + NONE ///< No activation (use raw scores) +}; + +/** + * @brief How class scores are extracted from model output + */ +enum class ClassScoreMode +{ + ALL_CLASSES, ///< Extract class scores from all class logits (multi-class) + SINGLE_CONFIDENCE ///< Use a single confidence score (single-class models) +}; + +/** + * @brief Coordinate space for bounding boxes + */ +enum class CoordinateSpace +{ + PREPROCESSED, ///< Coordinates in preprocessed image space (model input size) + ORIGINAL ///< Coordinates in original image space (not currently used) +}; + +/** + * @brief Policy for handling queue overflow + */ +enum class QueueOverflowPolicy +{ + DROP_OLDEST, ///< Drop oldest image when queue is full (default) + DROP_NEWEST, ///< Drop newest image when queue is full + THROW ///< Throw exception when queue is full (not currently used) +}; + +/** + * @brief Policy for handling image decode failures + */ +enum class DecodeFailurePolicy +{ + DROP, ///< Drop failed decode and log warning (default) + THROW ///< Throw exception on decode failure
(not currently used) +}; + +/** + * @brief Image metadata for coordinate transformation + * + * Stores information about preprocessing transformations needed to + * transform bounding box coordinates from preprocessed image space + * back to original image space. + */ +struct ImageMeta +{ + int original_width = 0; ///< Original image width before preprocessing + int original_height = 0; ///< Original image height before preprocessing + float scale_x = 1.0f; ///< Horizontal scale factor (original_width / input_width) + float scale_y = 1.0f; ///< Vertical scale factor (original_height / input_height) + float pad_x = 0.0f; ///< Horizontal padding offset (for letterbox) + float pad_y = 0.0f; ///< Vertical padding offset (for letterbox) +}; + +/** + * @brief Queued image waiting for batch processing + */ +struct QueuedImage +{ + cv::Mat bgr; ///< Decoded BGR image (OpenCV Mat) + std_msgs::msg::Header header; ///< ROS message header with timestamp and frame_id +}; + +/** + * @brief Packed input tensor for model inference + * + * Flattened float array in NCHW format (batch, channels, height, width) + * ready for model input. Shape vector describes tensor dimensions. + */ +struct PackedInput +{ + std::vector<float> data; ///< Flattened float array (NCHW format) + std::vector<size_t> shape; ///< Tensor shape [batch, channels, height, width] +}; + +/** + * @brief Preprocessing configuration parameters + */ +struct PreprocessingConfig +{ + int input_width = 640; ///< Model input image width in pixels + int input_height = 640; ///< Model input image height in pixels + NormalizationType normalization_type = NormalizationType::SCALE_0_1; ///< Normalization method + std::vector<float> mean = {0.0f, 0.0f, 0.0f}; ///< Mean values for custom normalization (RGB order) + std::vector<float> std = {1.0f, 1.0f, 1.0f}; ///< Std values for custom normalization (RGB order) + ResizeMethod resize_method = ResizeMethod::LETTERBOX; ///< Image resizing method + int pad_value = 114; ///< Padding value for letterbox (gray, YOLO standard) + std::string color_format = "bgr"; ///< Input color format ("bgr" or "rgb") +}; + +/** + * @brief Model metadata configuration + */ +struct ModelMetadata +{ + int num_classes = 80; ///< Number of detection classes + std::string class_names_file; ///< Path to class names file (one per line, optional) + BboxFormat bbox_format = BboxFormat::CXCYWH; ///< Bounding box format used by model +}; + +/** + * @brief Output tensor layout configuration + * + * Describes the structure of model output tensors for manual layout specification. + * Only used when auto_detect is false.
+ */ +struct OutputLayoutConfig +{ + bool auto_detect = true; ///< True to auto-detect layout, false to use manual config + int batch_dim = 0; ///< Batch dimension index + int detection_dim = 1; ///< Detection dimension index + int feature_dim = 2; ///< Feature dimension index + int bbox_start_idx = 0; ///< Starting index for bbox coordinates in feature dimension + int bbox_count = 4; ///< Number of bbox coordinates (always 4) + int score_idx = 4; ///< Index for confidence score in feature dimension + int class_idx = 5; ///< Index for class ID in feature dimension +}; + +/** + * @brief Postprocessing configuration parameters + */ +struct PostprocessingConfig +{ + float score_threshold = 0.25f; ///< Minimum confidence score (detections below are filtered) + float nms_iou_threshold = 0.45f; ///< IoU threshold for Non-Maximum Suppression + int max_detections = 300; ///< Maximum number of detections per image (after NMS) + ScoreActivation score_activation = ScoreActivation::SIGMOID; ///< Score activation function + bool enable_nms = true; ///< Enable Non-Maximum Suppression (always true, not configurable) + bool use_multi_output = false; ///< True if model has separate outputs for boxes, scores, classes + int output_boxes_idx = 0; ///< Output index for bounding boxes (if use_multi_output) + int output_scores_idx = 1; ///< Output index for scores (if use_multi_output) + int output_classes_idx = 2; ///< Output index for class IDs (if use_multi_output) + ClassScoreMode class_score_mode = ClassScoreMode::ALL_CLASSES; ///< How class scores are extracted + int class_score_start_idx = -1; ///< Start index for class scores (-1 = use all) + int class_score_count = -1; ///< Count of class scores (-1 = use all) + CoordinateSpace coordinate_space = CoordinateSpace::PREPROCESSED; ///< Coordinate space (always PREPROCESSED) + OutputLayoutConfig layout; ///< Output layout configuration +}; + +/** + * @brief Complete detection node configuration parameters + * + * Aggregates all configuration for model, preprocessing, postprocessing, + * execution provider, batching, and topics. 
*/ +struct DetectionParams +{ + std::string model_path; ///< Absolute path to ONNX model file + ModelMetadata model_metadata; ///< Model metadata (classes, bbox format) + PreprocessingConfig preprocessing; ///< Preprocessing configuration + PostprocessingConfig postprocessing; ///< Postprocessing configuration + std::string input_qos_reliability{"best_effort"}; ///< Input topic QoS reliability (always "best_effort") + std::string output_detections_topic{"/detections"}; ///< Output topic for detections + int max_batch_size{3}; ///< Maximum images per batch + int queue_size{10}; ///< Maximum queue size (0 = unlimited) + QueueOverflowPolicy queue_overflow_policy{QueueOverflowPolicy::DROP_OLDEST}; ///< Queue overflow policy + DecodeFailurePolicy decode_failure_policy{DecodeFailurePolicy::DROP}; ///< Decode failure policy + std::string preferred_provider{"tensorrt"}; ///< Preferred execution provider ("tensorrt", "cuda", or "cpu") + int device_id{0}; ///< GPU device ID (for CUDA/TensorRT) + bool warmup_tensor_shapes{true}; ///< Warmup tensor shapes (always true, not configurable) + bool enable_trt_engine_cache{false}; ///< Enable TensorRT engine caching + std::string trt_engine_cache_path{"/tmp/deep_ros_ort_trt_cache"}; ///< TensorRT engine cache directory + std::vector<std::string> class_names; ///< Class name strings (loaded from file or empty) +}; + +/** + * @brief Convert string to PostprocessorType enum + * @param type Postprocessor type string (currently unused, always returns GENERIC) + * @return PostprocessorType::GENERIC + */ +inline PostprocessorType stringToPostprocessorType(const std::string & /*type*/) +{ + return PostprocessorType::GENERIC; +} + +/** + * @brief Convert string to BboxFormat enum + * @param format Format string ("cxcywh", "xyxy", or "xywh", case-insensitive) + * @return BboxFormat enum (defaults to CXCYWH if unknown) + */ +inline BboxFormat stringToBboxFormat(const std::string & format) +{ + if (format == "cxcywh" || format == "CXCYWH") { + return BboxFormat::CXCYWH; + } else if (format == "xyxy" || format == "XYXY") { + return BboxFormat::XYXY; + } else if (format == "xywh" || format == "XYWH") { + return BboxFormat::XYWH; + } + return BboxFormat::CXCYWH; +} + +/** + * @brief Convert string to NormalizationType enum + * @param type Normalization type string ("imagenet", "scale_0_1", "yolo", "custom", or "none", case-insensitive) + * @return NormalizationType enum (defaults to SCALE_0_1 if unknown) + */ +inline NormalizationType stringToNormalizationType(const std::string & type) +{ + if (type == "imagenet" || type == "IMAGENET") { + return NormalizationType::IMAGENET; + } else if (type == "scale_0_1" || type == "SCALE_0_1" || type == "yolo" || type == "YOLO") { + return NormalizationType::SCALE_0_1; + } else if (type == "custom" || type == "CUSTOM") { + return NormalizationType::CUSTOM; + } else if (type == "none" || type == "NONE") { + return NormalizationType::NONE; + } + return NormalizationType::SCALE_0_1; +} + +/** + * @brief Convert string to ResizeMethod enum + * @param method Resize method string ("letterbox", "resize", "crop", or "pad", case-insensitive) + * @return ResizeMethod enum (defaults to LETTERBOX if unknown) + */ +inline ResizeMethod stringToResizeMethod(const std::string & method) +{ + if (method == "letterbox" || method == "LETTERBOX") { + return ResizeMethod::LETTERBOX; + } else if (method == "resize" || method == "RESIZE") { + return ResizeMethod::RESIZE; + } else if (method == "crop" || method == "CROP") { + return ResizeMethod::CROP; + } else if
(method == "pad" || method == "PAD") { + return ResizeMethod::PAD; + } + return ResizeMethod::LETTERBOX; +} + +/** + * @brief Convert string to ScoreActivation enum + * @param activation Activation string ("sigmoid", "softmax", or "none", case-insensitive) + * @return ScoreActivation enum (defaults to SIGMOID if unknown) + */ +inline ScoreActivation stringToScoreActivation(const std::string & activation) +{ + if (activation == "sigmoid" || activation == "SIGMOID") { + return ScoreActivation::SIGMOID; + } else if (activation == "softmax" || activation == "SOFTMAX") { + return ScoreActivation::SOFTMAX; + } else if (activation == "none" || activation == "NONE") { + return ScoreActivation::NONE; + } + return ScoreActivation::SIGMOID; +} + +/** + * @brief Convert string to ClassScoreMode enum + * @param mode Mode string ("all_classes" or "single_confidence", case-insensitive) + * @return ClassScoreMode enum (defaults to ALL_CLASSES if unknown) + */ +inline ClassScoreMode stringToClassScoreMode(const std::string & mode) +{ + if (mode == "all_classes" || mode == "ALL_CLASSES") { + return ClassScoreMode::ALL_CLASSES; + } else if (mode == "single_confidence" || mode == "SINGLE_CONFIDENCE") { + return ClassScoreMode::SINGLE_CONFIDENCE; + } + return ClassScoreMode::ALL_CLASSES; +} + +/** + * @brief Convert string to CoordinateSpace enum + * @param space Space string ("preprocessed" or "original", case-insensitive) + * @return CoordinateSpace enum (defaults to PREPROCESSED if unknown) + */ +inline CoordinateSpace stringToCoordinateSpace(const std::string & space) +{ + if (space == "preprocessed" || space == "PREPROCESSED") { + return CoordinateSpace::PREPROCESSED; + } else if (space == "original" || space == "ORIGINAL") { + return CoordinateSpace::ORIGINAL; + } + return CoordinateSpace::PREPROCESSED; +} + +/** + * @brief Convert string to QueueOverflowPolicy enum + * @param policy Policy string ("drop_oldest", "drop_newest", or "throw", case-insensitive) + * @return QueueOverflowPolicy enum (defaults to DROP_OLDEST if unknown) + */ +inline QueueOverflowPolicy stringToQueueOverflowPolicy(const std::string & policy) +{ + if (policy == "drop_oldest" || policy == "DROP_OLDEST") { + return QueueOverflowPolicy::DROP_OLDEST; + } else if (policy == "drop_newest" || policy == "DROP_NEWEST") { + return QueueOverflowPolicy::DROP_NEWEST; + } else if (policy == "throw" || policy == "THROW") { + return QueueOverflowPolicy::THROW; + } + return QueueOverflowPolicy::DROP_OLDEST; +} + +/** + * @brief Convert string to DecodeFailurePolicy enum + * @param policy Policy string ("drop" or "throw", case-insensitive) + * @return DecodeFailurePolicy enum (defaults to DROP if unknown) + */ +inline DecodeFailurePolicy stringToDecodeFailurePolicy(const std::string & policy) +{ + if (policy == "drop" || policy == "DROP") { + return DecodeFailurePolicy::DROP; + } else if (policy == "throw" || policy == "THROW") { + return DecodeFailurePolicy::THROW; + } + return DecodeFailurePolicy::DROP; +} + +/** + * @brief Simple detection structure (internal representation) + * + * Stores a single detection with bounding box, score, and class ID. + * Coordinates are in original image space (x, y, width, height). 
+ */ +struct SimpleDetection +{ + float x = 0.0f; ///< Top-left x coordinate in original image space + float y = 0.0f; ///< Top-left y coordinate in original image space + float width = 0.0f; ///< Bounding box width in original image space + float height = 0.0f; ///< Bounding box height in original image space + float score = 0.0f; ///< Confidence score [0, 1] + int32_t class_id = -1; ///< Class ID (-1 if unknown) +}; + +} // namespace deep_object_detection diff --git a/deep_object_detection/include/deep_object_detection/generic_postprocessor.hpp b/deep_object_detection/include/deep_object_detection/generic_postprocessor.hpp new file mode 100644 index 0000000..888028a --- /dev/null +++ b/deep_object_detection/include/deep_object_detection/generic_postprocessor.hpp @@ -0,0 +1,277 @@ +// Copyright (c) 2025-present WATonomous. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * @file generic_postprocessor.hpp + * @brief Generic postprocessor for object detection model outputs + * + * This header defines the GenericPostprocessor class which: + * - Automatically detects output tensor layouts + * - Applies score activation and thresholding + * - Performs Non-Maximum Suppression (NMS) + * - Transforms coordinates from preprocessed to original image space + * - Formats detections into ROS Detection2DArray messages + * - Supports both single-output and multi-output models + */ + +#pragma once + +#include +#include +#include + +#include +#include + +#include "deep_object_detection/detection_types.hpp" + +namespace deep_object_detection +{ + +/** + * @brief Generic postprocessor for object detection models + * + * Handles postprocessing pipeline for various ONNX model output formats: + * - Automatic layout detection (supports [batch, detections, features], [batch, features, detections], etc.) + * - Score activation (sigmoid, softmax, or none) + * - Score thresholding + * - Non-maximum suppression (NMS) + * - Coordinate transformation from preprocessed to original image space + * - Message formatting for ROS Detection2DArray + * + * Supports both single-output models (boxes + scores + classes in one tensor) + * and multi-output models (separate tensors for boxes, scores, classes). 
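 *
 * Typical usage (an illustrative sketch; the surrounding variable names and
 * shapes are hypothetical):
 * @code
 * auto layout = GenericPostprocessor::detectLayout(output_shape);
 * GenericPostprocessor post(config, layout, BboxFormat::CXCYWH, 80, class_names, true);
 * auto per_image = post.decode(output_tensor, metas);   // one vector per batch image
 * Detection2DArrayMsg msg;
 * post.fillDetectionMessage(header, per_image[0], metas[0], msg);
 * @endcode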
+ */
+class GenericPostprocessor
+{
+public:
+  /**
+   * @brief Output tensor layout configuration
+   *
+   * Describes the structure of model output tensors:
+   * - Dimension indices for batch, detections, and features
+   * - Feature indices for bbox coordinates, scores, and class IDs
+   * - Support for transposed layouts and multi-output models
+   */
+  struct OutputLayout
+  {
+    std::vector<size_t> shape;  ///< Output tensor shape
+    size_t batch_dim = 0;  ///< Batch dimension index
+    size_t detection_dim = 1;  ///< Detection dimension index
+    size_t feature_dim = 2;  ///< Feature dimension index
+    size_t bbox_start_idx = 0;  ///< Starting index for bbox coordinates in feature dimension
+    size_t bbox_count = 4;  ///< Number of bbox coordinates (always 4)
+    size_t score_idx = 4;  ///< Index for confidence score in feature dimension
+    size_t class_idx = 5;  ///< Index for class ID in feature dimension
+    bool has_separate_class_output = false;  ///< True if class IDs are in separate output tensor
+    size_t class_output_idx = 0;  ///< Output index for separate class tensor (if applicable)
+    bool auto_detect = true;  ///< True if layout should be auto-detected
+  };
+
+  /**
+   * @brief Construct the postprocessor
+   * @param config Postprocessing configuration (thresholds, NMS, activation)
+   * @param layout Output tensor layout configuration
+   * @param bbox_format Bounding box format (cxcywh, xyxy, or xywh)
+   * @param num_classes Number of detection classes
+   * @param class_names Vector of class name strings (empty if using class IDs)
+   * @param use_letterbox True if letterbox resize was used (affects coordinate transformation)
+   */
+  GenericPostprocessor(
+    const PostprocessingConfig & config,
+    const OutputLayout & layout,
+    BboxFormat bbox_format,
+    int num_classes,
+    const std::vector<std::string> & class_names,
+    bool use_letterbox);
+
+  /**
+   * @brief Automatically detect output tensor layout from shape
+   * @param output_shape Model output tensor shape
+   * @return Detected OutputLayout
+   *
+   * Analyzes tensor shape to determine layout:
+   * - [batch, detections, features] -> standard layout
+   * - [batch, features, detections] -> transposed layout
+   * - [batch, queries, 4+classes] -> query-based (DETR-style)
+   * - Other shapes -> heuristic-based detection
+   */
+  static OutputLayout detectLayout(const std::vector<size_t> & output_shape);
+
+  /**
+   * @brief Auto-configure output layout based on config and optional output shape
+   * @param output_shape Model output shape (can be empty for deferred detection)
+   * @param layout_config Layout configuration from parameters
+   * @return Configured OutputLayout
+   *
+   * Handles both manual and auto-detection modes. If auto_detect is true and output_shape
+   * is available, automatically detects layout. Otherwise uses manual config or defers detection.
+   */
+  static OutputLayout autoConfigure(const std::vector<size_t> & output_shape, const OutputLayoutConfig & layout_config);
+
+  /**
+   * @brief Decode model output tensor to detections
+   * @param output Model output tensor (single output)
+   * @param metas Image metadata for coordinate transformation
+   * @return Vector of detections per image in batch
+   *
+   * Extracts bounding boxes, scores, and class IDs from output tensor.
+   * Applies score thresholding, NMS, and coordinate transformation.
+   * Returns one vector of detections per image in the batch.
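+   *
+   * Sketch of a typical call (names are illustrative):
+   * @code
+   * // metas: one ImageMeta per image in the batch, produced by preprocessing
+   * auto per_image = postprocessor.decode(output_tensor, metas);
+   * // per_image[i] holds the detections for image i, NMS-filtered when
+   * // NMS is enabled and mapped back to original image coordinates
+   * @endcode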
+   */
+  std::vector<std::vector<SimpleDetection>> decode(
+    const deep_ros::Tensor & output, const std::vector<ImageMeta> & metas) const;
+
+  /**
+   * @brief Decode multi-output model to detections
+   * @param outputs Vector of model output tensors (boxes, scores, classes separately)
+   * @param metas Image metadata for coordinate transformation
+   * @return Vector of detections per image in batch
+   *
+   * For models with separate outputs for boxes, scores, and classes.
+   * Uses output_boxes_idx, output_scores_idx, output_classes_idx from config.
+   */
+  std::vector<std::vector<SimpleDetection>> decodeMultiOutput(
+    const std::vector<deep_ros::Tensor> & outputs, const std::vector<ImageMeta> & metas) const;
+
+  /**
+   * @brief Get postprocessor format name
+   * @return Format name string ("generic")
+   */
+  std::string getFormatName() const
+  {
+    return "generic";
+  }
+
+  /**
+   * @brief Fill ROS Detection2DArray message with detections
+   * @param header ROS message header (timestamp and frame_id)
+   * @param detections Vector of detections for one image
+   * @param meta Image metadata
+   * @param out_msg Output message to fill
+   *
+   * Converts SimpleDetection objects to ROS Detection2D messages
+   * and populates the Detection2DArray message.
+   */
+  void fillDetectionMessage(
+    const std_msgs::msg::Header & header,
+    const std::vector<SimpleDetection> & detections,
+    const ImageMeta & meta,
+    Detection2DArrayMsg & out_msg) const;
+
+protected:
+  /**
+   * @brief Transform detection coordinates from preprocessed to original image space
+   * @param det Detection to transform (modified in-place)
+   * @param meta Image metadata (scale, padding)
+   * @param use_letterbox True if letterbox resize was used
+   *
+   * Transforms bounding box coordinates from preprocessed image space
+   * (model input size) to original image space using scale and padding
+   * information from preprocessing.
+   */
+  void adjustToOriginal(SimpleDetection & det, const ImageMeta & meta, bool use_letterbox) const;
+
+  /**
+   * @brief Apply Non-Maximum Suppression to detections
+   * @param dets Vector of detections (passed by value; the return value is the filtered set)
+   * @param iou_threshold IoU threshold for NMS
+   * @return Filtered detections after NMS
+   *
+   * Removes overlapping detections with lower scores. Detections are
+   * sorted by score (descending) and suppressed if IoU > threshold.
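+   *
+   * For reference, the IoU used by the suppression step is the standard
+   * ratio:
+   * @code
+   * IoU(a, b) = intersection(a, b) / (area(a) + area(b) - intersection(a, b))
+   * @endcode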
+   */
+  std::vector<SimpleDetection> applyNms(std::vector<SimpleDetection> dets, float iou_threshold) const;
+
+  /**
+   * @brief Calculate Intersection over Union (IoU) between two detections
+   * @param a First detection
+   * @param b Second detection
+   * @return IoU value [0, 1]
+   */
+  static float iou(const SimpleDetection & a, const SimpleDetection & b);
+
+  /**
+   * @brief Get class label string for a class ID
+   * @param class_id Class ID
+   * @param class_names Vector of class name strings
+   * @return Class label string (name if available, otherwise ID as string)
+   */
+  std::string classLabel(int class_id, const std::vector<std::string> & class_names) const;
+
+  /**
+   * @brief Apply score activation function
+   * @param raw_score Raw score from model
+   * @return Activated score
+   *
+   * Applies activation according to config:
+   * - sigmoid: 1 / (1 + exp(-x))
+   * - softmax: handled over the whole class vector in decode(), so this returns the raw score
+   * - none: return raw score
+   */
+  float applyActivation(float raw_score) const;
+
+private:
+  /**
+   * @brief Extract a single value from output tensor
+   * @param data Pointer to tensor data (flattened array)
+   * @param batch_idx Batch index
+   * @param detection_idx Detection index
+   * @param feature_idx Feature index
+   * @param shape Tensor shape for index calculation
+   * @return Extracted float value
+   *
+   * Calculates linear index from multi-dimensional indices and extracts value.
+   * Handles both standard and transposed layouts.
+   */
+  float extractValue(
+    const float * data,
+    size_t batch_idx,
+    size_t detection_idx,
+    size_t feature_idx,
+    const std::vector<size_t> & shape) const;
+
+  /**
+   * @brief Convert bbox data from tensor to SimpleDetection
+   * @param bbox_data Pointer to bbox coordinates in tensor
+   * @param batch_idx Batch index
+   * @param detection_idx Detection index
+   * @param shape Tensor shape
+   * @param det Output detection (bbox coordinates filled)
+   *
+   * Extracts 4 bbox coordinates and converts from model format (cxcywh, xyxy, or xywh)
+   * to SimpleDetection format (x, y, width, height in original image space).
+   */
+  void convertBbox(
+    const float * bbox_data,
+    size_t batch_idx,
+    size_t detection_idx,
+    const std::vector<size_t> & shape,
+    SimpleDetection & det) const;
+
+  /// Postprocessing configuration (thresholds, NMS, activation)
+  PostprocessingConfig config_;
+  /// Output tensor layout configuration
+  OutputLayout layout_;
+  /// Bounding box format (cxcywh, xyxy, or xywh)
+  BboxFormat bbox_format_;
+  /// Number of detection classes
+  int num_classes_;
+  /// Class name strings (empty if using class IDs)
+  std::vector<std::string> class_names_;
+  /// True if letterbox resize was used (affects coordinate transformation)
+  bool use_letterbox_;
+};
+
+}  // namespace deep_object_detection
diff --git a/deep_object_detection/include/deep_object_detection/image_preprocessor.hpp b/deep_object_detection/include/deep_object_detection/image_preprocessor.hpp
new file mode 100644
index 0000000..fd59651
--- /dev/null
+++ b/deep_object_detection/include/deep_object_detection/image_preprocessor.hpp
@@ -0,0 +1,162 @@
+// Copyright (c) 2025-present WATonomous. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * @file image_preprocessor.hpp
+ * @brief Image preprocessing for object detection models
+ *
+ * This header defines the ImagePreprocessor class which:
+ * - Resizes images (letterbox, resize, crop, or pad)
+ * - Converts color formats (BGR to RGB)
+ * - Applies normalization (scale_0_1, imagenet, custom, or none)
+ * - Batches multiple images into a single tensor
+ * - Tracks metadata for coordinate transformation
+ */
+
+#pragma once
+
+#include
+#include
+
+#include
+
+#include "deep_object_detection/detection_types.hpp"
+
+namespace deep_object_detection
+{
+
+/**
+ * @brief Preprocesses images for model input
+ *
+ * Handles image preprocessing pipeline:
+ * - Resizing (letterbox, resize, crop, or pad)
+ * - Color format conversion (BGR to RGB)
+ * - Normalization (scale_0_1, imagenet, custom, or none)
+ * - Batching multiple images into a single tensor
+ *
+ * The preprocessor maintains aspect ratio when using letterbox resize
+ * and tracks metadata (scale, padding) for coordinate transformation
+ * in postprocessing.
+ */
+class ImagePreprocessor
+{
+public:
+  /**
+   * @brief Construct the preprocessor with configuration
+   * @param config Preprocessing configuration (resize method, normalization, etc.)
+   */
+  explicit ImagePreprocessor(const PreprocessingConfig & config);
+
+  /**
+   * @brief Preprocess a single BGR image
+   * @param bgr Input BGR image (OpenCV Mat)
+   * @param meta Output metadata (original size, scale, padding) for coordinate transformation
+   * @return Preprocessed image ready for model input
+   *
+   * Applies resize, color conversion, and normalization according to config.
+   * Updates meta with transformation parameters needed for postprocessing.
+   */
+  cv::Mat preprocess(const cv::Mat & bgr, ImageMeta & meta) const;
+
+  /**
+   * @brief Pack a batch of preprocessed images into a single tensor
+   * @param images Vector of preprocessed images (same size, normalized)
+   * @return Reference to PackedInput containing flattened float array and shape
+   *
+   * Converts batch of OpenCV Mats to a single flattened float array
+   * in NCHW format (batch, channels, height, width). Uses internal cache
+   * to avoid reallocation. Thread-safe for read-only access.
+   */
+  const PackedInput & pack(const std::vector<cv::Mat> & images) const;
+
+  /**
+   * @brief Get preprocessing configuration
+   * @return Reference to preprocessing configuration
+   */
+  const PreprocessingConfig & config() const
+  {
+    return config_;
+  }
+
+private:
+  /**
+   * @brief Apply letterbox resize (maintain aspect ratio, pad with gray)
+   * @param bgr Input BGR image
+   * @param meta Output metadata (scale and padding info)
+   * @return Resized image with letterbox padding
+   *
+   * Maintains aspect ratio by scaling to fit within input dimensions,
+   * then pads with gray (114) to reach exact input size. Updates meta
+   * with scale factors and padding offsets.
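+   *
+   * The usual letterbox arithmetic, as a sketch (the implementation below
+   * is authoritative):
+   * @code
+   * scale = min(input_width / w, input_height / h)
+   * new_w = round(w * scale);  new_h = round(h * scale)
+   * pad_x = (input_width - new_w) / 2;  pad_y = (input_height - new_h) / 2
+   * @endcode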
+ */ + cv::Mat applyLetterbox(const cv::Mat & bgr, ImageMeta & meta) const; + + /** + * @brief Apply simple resize (stretch to input size) + * @param bgr Input BGR image + * @param meta Output metadata (scale info) + * @return Resized image (may distort aspect ratio) + * + * Stretches image to exact input dimensions. Updates meta with scale factors. + */ + cv::Mat applyResize(const cv::Mat & bgr, ImageMeta & meta) const; + + /** + * @brief Apply center crop to input size + * @param bgr Input BGR image + * @param meta Output metadata (scale info) + * @return Cropped image + * + * Crops center region to input dimensions. Updates meta with scale factors. + */ + cv::Mat applyCrop(const cv::Mat & bgr, ImageMeta & meta) const; + + /** + * @brief Apply padding to input size + * @param bgr Input BGR image + * @param meta Output metadata (padding info) + * @return Padded image + * + * Pads image with gray (114) to reach input dimensions. Updates meta with padding offsets. + */ + cv::Mat applyPad(const cv::Mat & bgr, ImageMeta & meta) const; + + /** + * @brief Apply normalization to image + * @param image Image to normalize (modified in-place) + * + * Applies normalization according to config: + * - scale_0_1: divide by 255.0 + * - imagenet: subtract mean [0.485, 0.456, 0.406], divide by std [0.229, 0.224, 0.225] + * - custom: use config mean and std + * - none: no normalization + */ + void applyNormalization(cv::Mat & image) const; + + /** + * @brief Convert BGR to RGB if needed + * @param image Image to convert (modified in-place) + * + * Converts BGR to RGB if color_format is "rgb" in config. + * No-op if color_format is "bgr". + */ + void applyColorConversion(cv::Mat & image) const; + + /// Preprocessing configuration + PreprocessingConfig config_; + /// Cached packed input (reused to avoid reallocation) + mutable PackedInput packed_input_cache_; +}; + +} // namespace deep_object_detection diff --git a/deep_object_detection/launch/deep_object_detection.launch.yaml b/deep_object_detection/launch/deep_object_detection.launch.yaml new file mode 100644 index 0000000..2f3d8ef --- /dev/null +++ b/deep_object_detection/launch/deep_object_detection.launch.yaml @@ -0,0 +1,64 @@ +# Copyright (c) 2025-present WATonomous. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
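+
+# Example invocation (the topic names below are illustrative):
+#   ros2 launch deep_object_detection deep_object_detection.launch.yaml \
+#     input_topic:=/camera/front/image_compressed \
+#     output_detections_topic:=/perception/detections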
+
+launch:
+  - arg:
+      name: "config_file"
+      default: "$(find-pkg-share deep_object_detection)/config/generic_model_params.yaml"
+      description: "Path to YAML config for object detection"
+
+  - arg:
+      name: "input_topic"
+      default: "/multi_camera_sync/multi_image_compressed"
+      description: "Input MultiImage topic (configured via remapping)"
+
+  - arg:
+      name: "output_detections_topic"
+      default: "/detections"
+      description: "Output detections topic (configured via remapping)"
+
+  - arg:
+      name: "use_lifecycle_manager"
+      default: "true"
+      description: "Whether to use nav2_lifecycle_manager for automatic lifecycle transitions"
+
+  - node:
+      pkg: "deep_object_detection"
+      exec: "deep_object_detection_node"
+      name: "deep_object_detection_node"
+      output: "screen"
+      remap:
+        - from: "/multi_camera_sync/multi_image_compressed"
+          to: "$(var input_topic)"
+        - from: "/detections"
+          to: "$(var output_detections_topic)"
+      param:
+        - from: "$(var config_file)"
+        - name: "input_topic"
+          value: "$(var input_topic)"
+        - name: "output_detections_topic"
+          value: "$(var output_detections_topic)"
+
+  - node:
+      pkg: "nav2_lifecycle_manager"
+      exec: "lifecycle_manager"
+      name: "detection_lifecycle_manager"
+      output: "screen"
+      param:
+        - name: "node_names"
+          value: ["deep_object_detection_node"]
+        - name: "autostart"
+          value: true
+        - name: "bond_timeout"
+          value: 0.0
diff --git a/deep_object_detection/package.xml b/deep_object_detection/package.xml
new file mode 100644
index 0000000..7f9ca80
--- /dev/null
+++ b/deep_object_detection/package.xml
@@ -0,0 +1,36 @@
+<?xml version="1.0"?>
+<package format="3">
+  <name>deep_object_detection</name>
+  <version>0.1.0</version>
+  <description>
+    Generic model-agnostic deep learning object detection node using ONNX Runtime with explicit provider selection (TensorRT/CUDA/CPU).
+    Works with any ONNX-compatible object detection model - automatically detects and adapts to model output format.
+    Features dynamic batching, multi-camera support, configurable preprocessing/postprocessing, and fail-fast error handling.
+  </description>
+
+  <maintainer>WATonomous</maintainer>
+  <license>Apache License 2.0</license>
+
+  <buildtool_depend>ament_cmake</buildtool_depend>
+
+  <depend>rclcpp</depend>
+  <depend>rclcpp_components</depend>
+  <depend>rclcpp_lifecycle</depend>
+  <depend>sensor_msgs</depend>
+  <depend>cv_bridge</depend>
+  <depend>vision_msgs</depend>
+  <depend>deep_core</depend>
+  <depend>deep_msgs</depend>
+  <depend>deep_ort_backend_plugin</depend>
+  <depend>deep_ort_gpu_backend_plugin</depend>
+  <depend>onnxruntime_vendor</depend>
+  <depend>onnxruntime_gpu_vendor</depend>
+  <depend>opencv2</depend>
+  <depend>rcl_interfaces</depend>
+
+  <test_depend>deep_test</test_depend>
+
+  <export>
+    <build_type>ament_cmake</build_type>
+  </export>
+</package>
diff --git a/deep_object_detection/src/backend_manager.cpp b/deep_object_detection/src/backend_manager.cpp
new file mode 100644
index 0000000..2492819
--- /dev/null
+++ b/deep_object_detection/src/backend_manager.cpp
@@ -0,0 +1,277 @@
+// Copyright (c) 2025-present WATonomous. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "deep_object_detection/backend_manager.hpp" + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace deep_object_detection +{ + +BackendManager::BackendManager(rclcpp_lifecycle::LifecycleNode & node, const DetectionParams & params) +: node_(node) +, params_(params) +, plugin_loader_( + std::make_unique>("deep_core", "deep_ros::DeepBackendPlugin")) +{} + +BackendManager::~BackendManager() +{ + plugin_holder_.reset(); + executor_.reset(); + allocator_.reset(); +} + +void BackendManager::initialize() +{ + provider_ = parseProvider(params_.preferred_provider); + initializeBackend(); +} + +deep_ros::Tensor BackendManager::infer(const PackedInput & input) +{ + if (!executor_) { + RCLCPP_ERROR(node_.get_logger(), "Cannot perform inference: no backend executor initialized"); + throw std::runtime_error("Cannot perform inference: no backend executor initialized"); + } + + auto input_tensor = buildInputTensor(input); + return executor_->run_inference(input_tensor); +} + +std::vector BackendManager::inferAllOutputs(const PackedInput & input) +{ + if (!executor_) { + RCLCPP_ERROR(node_.get_logger(), "Cannot perform inference: no backend executor initialized"); + throw std::runtime_error("Cannot perform inference: no backend executor initialized"); + } + + auto input_tensor = buildInputTensor(input); + auto output = executor_->run_inference(input_tensor); + return {output}; +} + +Provider BackendManager::parseProvider(const std::string & provider_str) const +{ + std::string normalized = provider_str; + std::transform(normalized.begin(), normalized.end(), normalized.begin(), [](unsigned char c) { + return static_cast(std::tolower(c)); + }); + + if (normalized == "cpu") { + return Provider::CPU; + } else if (normalized == "cuda") { + return Provider::CUDA; + } else if (normalized == "tensorrt") { + return Provider::TENSORRT; + } else { + throw std::runtime_error("Unknown provider: " + provider_str + ". 
Valid options: cpu, cuda, tensorrt"); + } +} + +void BackendManager::initializeBackend() +{ + if ((provider_ == Provider::TENSORRT || provider_ == Provider::CUDA) && !isCudaRuntimeAvailable()) { + std::string error = "Provider " + providerToString(provider_) + + " requires CUDA runtime libraries (libcudart/libcuda) which are not available"; + throw std::runtime_error(error); + } + + const std::string plugin_name = providerToPluginName(provider_); + if (plugin_name.empty()) { + throw std::runtime_error("No plugin name for provider: " + providerToString(provider_)); + } + + const auto provider_name = providerToString(provider_); + + if (!node_.has_parameter("Backend.execution_provider")) { + node_.declare_parameter("Backend.execution_provider", provider_name); + } else { + node_.set_parameters({rclcpp::Parameter("Backend.execution_provider", provider_name)}); + } + + if (!node_.has_parameter("Backend.device_id")) { + node_.declare_parameter("Backend.device_id", params_.device_id); + } else { + node_.set_parameters({rclcpp::Parameter("Backend.device_id", params_.device_id)}); + } + + if (!node_.has_parameter("Backend.trt_engine_cache_enable")) { + node_.declare_parameter("Backend.trt_engine_cache_enable", params_.enable_trt_engine_cache); + } else { + node_.set_parameters({rclcpp::Parameter("Backend.trt_engine_cache_enable", params_.enable_trt_engine_cache)}); + } + + if (!node_.has_parameter("Backend.trt_engine_cache_path")) { + node_.declare_parameter("Backend.trt_engine_cache_path", params_.trt_engine_cache_path); + } else { + node_.set_parameters({rclcpp::Parameter("Backend.trt_engine_cache_path", params_.trt_engine_cache_path)}); + } + + auto node_ptr = node_.shared_from_this(); + plugin_holder_ = plugin_loader_->createUniqueInstance(plugin_name); + plugin_holder_->initialize(node_ptr); + allocator_ = plugin_holder_->get_allocator(); + executor_ = plugin_holder_->get_inference_executor(); + + if (!executor_ || !allocator_) { + throw std::runtime_error("Executor or allocator is null after plugin initialization"); + } + + if (params_.model_path.empty()) { + throw std::runtime_error("Model path is not set"); + } + + if (!executor_->load_model(params_.model_path)) { + throw std::runtime_error("Failed to load model: " + params_.model_path); + } + + active_provider_ = providerToString(provider_); + declareActiveProviderParameter(active_provider_); + warmupTensorShapeCache(); + + RCLCPP_INFO( + node_.get_logger(), + "Initialized backend using provider: %s (device %d)", + active_provider_.c_str(), + params_.device_id); +} + +void BackendManager::warmupTensorShapeCache() +{ + if (!params_.warmup_tensor_shapes) { + return; + } + if (provider_ == Provider::CPU) { + return; + } + if (!executor_ || !allocator_) { + return; + } + const size_t channels = RGB_CHANNELS; + const size_t height = static_cast(params_.preprocessing.input_height); + const size_t width = static_cast(params_.preprocessing.input_width); + const size_t per_image = channels * height * width; + + const int batch = params_.max_batch_size; + RCLCPP_INFO( + node_.get_logger(), "Priming %s backend tensor shapes for batch size %d", active_provider_.c_str(), batch); + + PackedInput dummy; + dummy.shape = {static_cast(batch), channels, height, width}; + dummy.data.assign(static_cast(batch) * per_image, 0.0f); + + auto input_tensor = buildInputTensor(dummy); + (void)executor_->run_inference(input_tensor); + RCLCPP_DEBUG(node_.get_logger(), "Cached tensor shape for batch size %d", batch); +} + +bool BackendManager::isCudaRuntimeAvailable() 
const +{ + const char * libs[] = {"libcudart.so.12", "libcudart.so.11", "libcudart.so", "libcuda.so.1", "libcuda.so"}; + for (const auto * lib : libs) { + void * handle = dlopen(lib, RTLD_LAZY | RTLD_LOCAL); + if (handle) { + dlclose(handle); + return true; + } + } + return false; +} + +std::string BackendManager::providerToString(Provider provider) const +{ + switch (provider) { + case Provider::TENSORRT: + return "tensorrt"; + case Provider::CUDA: + return "cuda"; + case Provider::CPU: + return "cpu"; + default: + return "unknown"; + } +} + +void BackendManager::declareActiveProviderParameter(const std::string & value) +{ + rcl_interfaces::msg::ParameterDescriptor desc; + desc.read_only = true; + desc.description = "Current execution provider in use (read-only)"; + + if (node_.has_parameter("active_provider")) { + (void)node_.set_parameters({rclcpp::Parameter("active_provider", value)}); + } else { + node_.declare_parameter("active_provider", value, desc); + } +} + +deep_ros::Tensor BackendManager::buildInputTensor(const PackedInput & packed) const +{ + deep_ros::Tensor tensor(packed.shape, deep_ros::DataType::FLOAT32, allocator_); + const size_t bytes = packed.data.size() * sizeof(float); + allocator_->copy_from_host(tensor.data(), packed.data.data(), bytes); + return tensor; +} + +std::string BackendManager::providerToPluginName(Provider provider) const +{ + switch (provider) { + case Provider::TENSORRT: + case Provider::CUDA: + return "onnxruntime_gpu"; + case Provider::CPU: + return "onnxruntime_cpu"; + default: + return ""; + } +} + +std::vector BackendManager::getOutputShape(const std::vector & input_shape) const +{ + if (!executor_ || !allocator_) { + RCLCPP_ERROR( + node_.get_logger(), + "Cannot get output shape: backend not initialized (executor: %s, allocator: %s)", + executor_ ? "available" : "null", + allocator_ ? "available" : "null"); + throw std::runtime_error("Cannot get output shape: backend executor or allocator not available"); + } + + // NO TRY-CATCH - let exceptions propagate + PackedInput dummy; + dummy.shape = input_shape; + size_t total_elements = 1; + for (size_t dim : input_shape) { + total_elements *= dim; + } + dummy.data.assign(total_elements, 0.0f); + + auto input_tensor = buildInputTensor(dummy); + auto output_tensor = executor_->run_inference(input_tensor); + return output_tensor.shape(); +} + +} // namespace deep_object_detection diff --git a/deep_object_detection/src/deep_object_detection_node.cpp b/deep_object_detection/src/deep_object_detection_node.cpp new file mode 100644 index 0000000..84cbd91 --- /dev/null +++ b/deep_object_detection/src/deep_object_detection_node.cpp @@ -0,0 +1,599 @@ +// Copyright (c) 2025-present WATonomous. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
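+
+// Overview of the processing pipeline implemented in this file (a summary,
+// in execution order):
+//   MultiImage msg -> cv::imdecode -> bounded image queue -> batch timer
+//   -> preprocess -> pack (NCHW float batch) -> backend inference
+//   -> postprocess (decode + NMS) -> publish Detection2DArray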
+ +#include "deep_object_detection/deep_object_detection_node.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "deep_object_detection/backend_manager.hpp" +#include "deep_object_detection/detection_types.hpp" +#include "deep_object_detection/generic_postprocessor.hpp" + +namespace deep_object_detection +{ + +DeepObjectDetectionNode::DeepObjectDetectionNode(const rclcpp::NodeOptions & options) +: LifecycleNode("deep_object_detection_node", options) +, dropped_images_count_(0) +{ + declareAndReadParameters(); + RCLCPP_INFO(this->get_logger(), "Deep object detection node created, waiting for configuration"); +} + +void DeepObjectDetectionNode::declareAndReadParameters() +{ + params_.model_path = this->declare_parameter("model_path", ""); + + auto class_names_path = this->declare_parameter("class_names_path", ""); + params_.model_metadata.num_classes = this->declare_parameter("model.num_classes", 80); + params_.model_metadata.class_names_file = class_names_path; + auto bbox_format_str = this->declare_parameter("model.bbox_format", "cxcywh"); + params_.model_metadata.bbox_format = stringToBboxFormat(bbox_format_str); + + params_.preprocessing.input_width = this->declare_parameter("preprocessing.input_width", 640); + params_.preprocessing.input_height = this->declare_parameter("preprocessing.input_height", 640); + auto normalization_type_str = this->declare_parameter("preprocessing.normalization_type", "scale_0_1"); + params_.preprocessing.normalization_type = stringToNormalizationType(normalization_type_str); + auto mean_d = this->declare_parameter>("preprocessing.mean", {0.0, 0.0, 0.0}); + auto std_d = this->declare_parameter>("preprocessing.std", {1.0, 1.0, 1.0}); + params_.preprocessing.mean.clear(); + params_.preprocessing.std.clear(); + params_.preprocessing.mean.reserve(mean_d.size()); + params_.preprocessing.std.reserve(std_d.size()); + std::transform(mean_d.begin(), mean_d.end(), std::back_inserter(params_.preprocessing.mean), [](double v) { + return static_cast(v); + }); + std::transform(std_d.begin(), std_d.end(), std::back_inserter(params_.preprocessing.std), [](double v) { + return static_cast(v); + }); + auto resize_method_str = this->declare_parameter("preprocessing.resize_method", "letterbox"); + params_.preprocessing.resize_method = stringToResizeMethod(resize_method_str); + params_.preprocessing.pad_value = 114; + params_.preprocessing.color_format = this->declare_parameter("preprocessing.color_format", "rgb"); + + params_.postprocessing.score_threshold = + static_cast(this->declare_parameter("postprocessing.score_threshold", 0.25)); + params_.postprocessing.nms_iou_threshold = + static_cast(this->declare_parameter("postprocessing.nms_iou_threshold", 0.45)); + params_.postprocessing.max_detections = this->declare_parameter("postprocessing.max_detections", 300); + auto score_activation_str = this->declare_parameter("postprocessing.score_activation", "sigmoid"); + params_.postprocessing.score_activation = stringToScoreActivation(score_activation_str); + params_.postprocessing.enable_nms = true; + + auto class_score_mode_str = this->declare_parameter("postprocessing.class_score_mode", "all_classes"); + params_.postprocessing.class_score_mode = stringToClassScoreMode(class_score_mode_str); + params_.postprocessing.class_score_start_idx = -1; + params_.postprocessing.class_score_count = -1; + 
params_.postprocessing.coordinate_space = CoordinateSpace::PREPROCESSED; + + params_.postprocessing.use_multi_output = this->declare_parameter("postprocessing.use_multi_output", false); + if (params_.postprocessing.use_multi_output) { + params_.postprocessing.output_boxes_idx = this->declare_parameter("postprocessing.output_boxes_idx", 0); + params_.postprocessing.output_scores_idx = this->declare_parameter("postprocessing.output_scores_idx", 1); + params_.postprocessing.output_classes_idx = this->declare_parameter("postprocessing.output_classes_idx", 2); + } else { + params_.postprocessing.output_boxes_idx = 0; + params_.postprocessing.output_scores_idx = 1; + params_.postprocessing.output_classes_idx = 2; + } + + params_.postprocessing.layout.auto_detect = this->declare_parameter("postprocessing.layout.auto_detect", true); + if (!params_.postprocessing.layout.auto_detect) { + params_.postprocessing.layout.batch_dim = this->declare_parameter("postprocessing.layout.batch_dim", 0); + params_.postprocessing.layout.detection_dim = + this->declare_parameter("postprocessing.layout.detection_dim", 1); + params_.postprocessing.layout.feature_dim = this->declare_parameter("postprocessing.layout.feature_dim", 2); + params_.postprocessing.layout.bbox_start_idx = + this->declare_parameter("postprocessing.layout.bbox_start_idx", 0); + params_.postprocessing.layout.bbox_count = this->declare_parameter("postprocessing.layout.bbox_count", 4); + params_.postprocessing.layout.score_idx = this->declare_parameter("postprocessing.layout.score_idx", 4); + params_.postprocessing.layout.class_idx = this->declare_parameter("postprocessing.layout.class_idx", 5); + } else { + params_.postprocessing.layout.batch_dim = 0; + params_.postprocessing.layout.detection_dim = 1; + params_.postprocessing.layout.feature_dim = 2; + params_.postprocessing.layout.bbox_start_idx = 0; + params_.postprocessing.layout.bbox_count = 4; + params_.postprocessing.layout.score_idx = 4; + params_.postprocessing.layout.class_idx = 5; + } + + input_topic_ = this->declare_parameter("input_topic", ""); + params_.input_qos_reliability = "best_effort"; + params_.output_detections_topic = this->declare_parameter("output_detections_topic", "/detections"); + + params_.max_batch_size = this->declare_parameter("max_batch_size", 3); + params_.queue_size = this->declare_parameter("queue_size", 10); + if (params_.queue_size < 0) { + RCLCPP_WARN( + this->get_logger(), + "queue_size (%d) is negative, clamping to 0 (unlimited internal queue). 
" + "Note: ROS QoS depth will use minimum of 1.", + params_.queue_size); + params_.queue_size = 0; + } + params_.queue_overflow_policy = QueueOverflowPolicy::DROP_OLDEST; + params_.decode_failure_policy = DecodeFailurePolicy::DROP; + + params_.preferred_provider = this->declare_parameter("preferred_provider", "tensorrt"); + params_.device_id = this->declare_parameter("device_id", 0); + params_.warmup_tensor_shapes = true; + params_.enable_trt_engine_cache = this->declare_parameter("enable_trt_engine_cache", false); + params_.trt_engine_cache_path = + this->declare_parameter("trt_engine_cache_path", "/tmp/deep_ros_ort_trt_cache"); +} + +rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn DeepObjectDetectionNode::on_configure( + const rclcpp_lifecycle::State &) +{ + RCLCPP_INFO(this->get_logger(), "Configuring deep object detection node"); + + try { + loadClassNames(); + preprocessor_ = std::make_unique(params_.preprocessing); + backend_manager_ = std::make_unique(*this, params_); + + try { + backend_manager_->initialize(); + } catch (const std::exception & e) { + RCLCPP_ERROR(this->get_logger(), "Backend initialization failed: %s", e.what()); + cleanupPartialConfiguration(); + return rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn::FAILURE; + } + + const size_t batch = static_cast(params_.max_batch_size); + const size_t channels = RGB_CHANNELS; + const size_t height = static_cast(params_.preprocessing.input_height); + const size_t width = static_cast(params_.preprocessing.input_width); + std::vector input_shape = {batch, channels, height, width}; + std::vector output_shape = backend_manager_->getOutputShape(input_shape); + + auto formatShape = [](const std::vector & shape) { + if (shape.empty()) return std::string("auto-detect"); + std::string result; + for (size_t i = 0; i < shape.size(); ++i) { + result += std::to_string(shape[i]); + if (i + 1 < shape.size()) result += ", "; + } + return result; + }; + + if (!output_shape.empty()) { + RCLCPP_INFO(this->get_logger(), "Detected model output shape: [%s]", formatShape(output_shape).c_str()); + } + + const auto & config = params_.postprocessing; + const auto & model_meta = params_.model_metadata; + const bool use_letterbox = (params_.preprocessing.resize_method == ResizeMethod::LETTERBOX); + + GenericPostprocessor::OutputLayout layout = + GenericPostprocessor::autoConfigure(output_shape, params_.postprocessing.layout); + if (layout.auto_detect && !output_shape.empty()) { + RCLCPP_INFO( + this->get_logger(), + "Auto-detected layout: batch_dim=%zu, detection_dim=%zu, feature_dim=%zu", + layout.batch_dim, + layout.detection_dim, + layout.feature_dim); + } else if (!layout.auto_detect) { + RCLCPP_INFO( + this->get_logger(), + "Using manual layout: batch_dim=%zu, detection_dim=%zu, feature_dim=%zu", + layout.batch_dim, + layout.detection_dim, + layout.feature_dim); + } else { + RCLCPP_INFO(this->get_logger(), "Layout will be auto-detected from first inference"); + } + + postprocessor_ = std::make_unique( + config, layout, model_meta.bbox_format, model_meta.num_classes, params_.class_names, use_letterbox); + + auto pub = this->create_publisher(params_.output_detections_topic, rclcpp::SensorDataQoS{}); + detection_pub_ = std::static_pointer_cast>(pub); + + if (input_topic_.empty()) { + RCLCPP_ERROR(this->get_logger(), "input_topic is empty. 
+      cleanupPartialConfiguration();
+      return rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn::FAILURE;
+    }
+    RCLCPP_INFO(this->get_logger(), "Will subscribe to MultiImage topic: %s", input_topic_.c_str());
+
+    RCLCPP_INFO(
+      this->get_logger(),
+      "Deep object detection node configured. Model: %s, input size: %dx%d, batch_size: %d, provider: %s, "
+      "postprocessor: %s, output shape: [%s]",
+      params_.model_path.c_str(),
+      params_.preprocessing.input_width,
+      params_.preprocessing.input_height,
+      params_.max_batch_size,
+      backend_manager_->activeProvider().c_str(),
+      postprocessor_->getFormatName().c_str(),
+      formatShape(output_shape).c_str());
+
+    return rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn::SUCCESS;
+  } catch (const std::exception & e) {
+    RCLCPP_ERROR(this->get_logger(), "Failed to configure: %s", e.what());
+    cleanupPartialConfiguration();
+    return rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn::FAILURE;
+  }
+}
+
+rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn DeepObjectDetectionNode::on_activate(
+  const rclcpp_lifecycle::State &)
+{
+  RCLCPP_INFO(this->get_logger(), "Activating deep object detection node");
+
+  try {
+    callback_group_ = this->create_callback_group(rclcpp::CallbackGroupType::Reentrant);
+
+    setupSubscription();
+
+    batch_timer_ = this->create_wall_timer(
+      std::chrono::milliseconds(5), std::bind(&DeepObjectDetectionNode::onBatchTimer, this), callback_group_);
+
+    if (detection_pub_) {
+      detection_pub_->on_activate();
+    }
+
+    RCLCPP_INFO(this->get_logger(), "Deep object detection node activated and ready to process images");
+    return rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn::SUCCESS;
+  } catch (const std::exception & e) {
+    RCLCPP_ERROR(this->get_logger(), "Failed to activate: %s", e.what());
+    return rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn::FAILURE;
+  }
+}
+
+void DeepObjectDetectionNode::cleanupPartialConfiguration()
+{
+  postprocessor_.reset();
+  backend_manager_.reset();
+  preprocessor_.reset();
+  detection_pub_.reset();
+}
+
+void DeepObjectDetectionNode::cleanupAllResources()
+{
+  stopSubscriptionsAndTimer();
+  backend_manager_.reset();
+  preprocessor_.reset();
+  postprocessor_.reset();
+  params_.class_names.clear();
+  detection_pub_.reset();
+}
+
+void DeepObjectDetectionNode::stopSubscriptionsAndTimer()
+{
+  if (batch_timer_) {
+    batch_timer_->cancel();
+    batch_timer_.reset();
+  }
+
+  if (multi_image_sub_) {
+    multi_image_sub_.reset();
+  }
+
+  {
+    std::lock_guard<std::mutex> lock(queue_mutex_);
+    image_queue_.clear();
+  }
+}
+
+rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn DeepObjectDetectionNode::on_deactivate(
+  const rclcpp_lifecycle::State &)
+{
+  RCLCPP_INFO(this->get_logger(), "Deactivating deep object detection node");
+  stopSubscriptionsAndTimer();
+  if (detection_pub_) {
+    detection_pub_->on_deactivate();
+  }
+
+  RCLCPP_INFO(this->get_logger(), "Deep object detection node deactivated");
+  return rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn::SUCCESS;
+}
+
+rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn DeepObjectDetectionNode::on_cleanup(
+  const rclcpp_lifecycle::State &)
+{
+  RCLCPP_INFO(this->get_logger(), "Cleaning up deep object detection node");
+  cleanupAllResources();
+  RCLCPP_INFO(this->get_logger(), "Deep object detection node cleaned up");
+  return rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn::SUCCESS;
+}
+
+rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn DeepObjectDetectionNode::on_shutdown(
+  const rclcpp_lifecycle::State &)
+{
+  RCLCPP_INFO(this->get_logger(), "Shutting down deep object detection node");
+  cleanupAllResources();
+  RCLCPP_INFO(this->get_logger(), "Deep object detection node shut down");
+  return rclcpp_lifecycle::node_interfaces::LifecycleNodeInterface::CallbackReturn::SUCCESS;
+}
+
+void DeepObjectDetectionNode::setupSubscription()
+{
+  const size_t qos_depth = std::max(static_cast<size_t>(1), static_cast<size_t>(params_.queue_size));
+  auto qos_profile = rclcpp::QoS(rclcpp::KeepLast(qos_depth));
+  qos_profile.best_effort();
+
+  rclcpp::SubscriptionOptions options;
+  options.callback_group = callback_group_;
+
+  multi_image_sub_ = this->create_subscription<deep_msgs::msg::MultiImage>(
+    input_topic_,
+    qos_profile,
+    [this](const deep_msgs::msg::MultiImage::ConstSharedPtr msg) {
+      try {
+        this->onMultiImage(msg);
+      } catch (const std::exception & e) {
+        RCLCPP_ERROR(this->get_logger(), "Exception in onMultiImage callback: %s", e.what());
+      }
+    },
+    options);
+  RCLCPP_INFO(this->get_logger(), "Subscribed to MultiImage topic: %s", input_topic_.c_str());
+}
+
+void DeepObjectDetectionNode::onMultiImage(const deep_msgs::msg::MultiImage::ConstSharedPtr & msg)
+{
+  RCLCPP_DEBUG(this->get_logger(), "Received MultiImage message with %zu images", msg->images.size());
+  for (size_t i = 0; i < msg->images.size(); ++i) {
+    const auto & compressed_img = msg->images[i];
+    try {
+      handleCompressedImage(compressed_img);
+    } catch (const std::exception & e) {
+      RCLCPP_ERROR(this->get_logger(), "Exception processing image %zu in MultiImage: %s", i, e.what());
+    }
+  }
+  RCLCPP_DEBUG(this->get_logger(), "Processed MultiImage message with %zu synchronized images", msg->images.size());
+}
+
+void DeepObjectDetectionNode::handleCompressedImage(const sensor_msgs::msg::CompressedImage & msg)
+{
+  cv::Mat decoded = cv::imdecode(msg.data, cv::IMREAD_COLOR);
+  if (decoded.empty()) {
+    RCLCPP_WARN_THROTTLE(
+      this->get_logger(), *this->get_clock(), 1000, "Failed to decode compressed image, dropping frame");
+    return;
+  }
+
+  const int width = decoded.cols;
+  const int height = decoded.rows;
+  enqueueImage(std::move(decoded), msg.header);
+  RCLCPP_DEBUG(
+    this->get_logger(), "Enqueued decoded image (size: %zu bytes, decoded: %dx%d)", msg.data.size(), width, height);
+}
+
+void DeepObjectDetectionNode::enqueueImage(cv::Mat image, const std_msgs::msg::Header & header)
+{
+  size_t queue_size_after = 0;
+  bool dropped = false;
+  {
+    std::lock_guard<std::mutex> lock(queue_mutex_);
+    const size_t limit = static_cast<size_t>(params_.queue_size);
+    if (limit > 0 && image_queue_.size() >= limit) {
+      if (!image_queue_.empty()) {
+        image_queue_.pop_front();
+        dropped = true;
+        dropped_images_count_++;
+      }
+    }
+
+    image_queue_.push_back({std::move(image), header});
+    queue_size_after = image_queue_.size();
+  }
+
+  if (dropped) {
+    RCLCPP_WARN(
+      this->get_logger(),
+      "Queue overflow: dropped oldest image (queue size: %zu, limit: %zu, total dropped: %zu)",
+      queue_size_after,
+      static_cast<size_t>(params_.queue_size),
+      dropped_images_count_);
+  }
+}
+
+void DeepObjectDetectionNode::onBatchTimer()
+{
+  if (this->get_current_state().id() != lifecycle_msgs::msg::State::PRIMARY_STATE_ACTIVE) {
+    return;
+  }
+
+  std::vector batch;
+  const size_t batch_size = static_cast<size_t>(params_.max_batch_size);
+
+  {
+    std::lock_guard<std::mutex> lock(queue_mutex_);
+    if (image_queue_.size() < batch_size) {
+      return;
+    }
+
+    batch.reserve(batch_size);
+    for (size_t i = 0; i < batch_size; ++i) {
+      batch.push_back(std::move(image_queue_.front()));
+      image_queue_.pop_front();
+    }
+  }
+
+  if (!batch.empty()) {
+    try {
+      processBatch(batch);
+    } catch (const std::exception & e) {
+      RCLCPP_ERROR(this->get_logger(), "Exception in batch processing: %s", e.what());
+    }
+  }
+}
+
+void DeepObjectDetectionNode::processBatch(const std::vector & batch)
+{
+  if (!backend_manager_ || !backend_manager_->hasExecutor()) {
+    RCLCPP_ERROR(
+      this->get_logger(),
+      "Cannot process batch: backend not initialized (backend_manager_: %s, hasExecutor: %s)",
+      backend_manager_ ? "exists" : "null",
+      (backend_manager_ && backend_manager_->hasExecutor()) ? "true" : "false");
+    return;
+  }
+
+  auto start_time = std::chrono::steady_clock::now();
+  std::vector<cv::Mat> processed;
+  std::vector<ImageMeta> metas;
+  std::vector<std_msgs::msg::Header> headers;
+  processed.reserve(batch.size());
+  metas.reserve(batch.size());
+  headers.reserve(batch.size());
+
+  for (const auto & item : batch) {
+    ImageMeta meta;
+    cv::Mat preprocessed = preprocessor_->preprocess(item.bgr, meta);
+    if (preprocessed.empty()) {
+      RCLCPP_WARN(this->get_logger(), "Preprocessing returned empty image, skipping");
+      continue;
+    }
+    processed.push_back(std::move(preprocessed));
+    metas.push_back(meta);
+    headers.push_back(item.header);
+  }
+
+  if (processed.empty()) {
+    RCLCPP_WARN(this->get_logger(), "No valid images after preprocessing, skipping batch");
+    return;
+  }
+
+  const auto & packed_input = preprocessor_->pack(processed);
+  if (packed_input.data.empty()) {
+    RCLCPP_ERROR(this->get_logger(), "Packed input is empty after preprocessing %zu images", processed.size());
+    return;
+  }
+
+  std::vector<std::vector<SimpleDetection>> batch_detections;
+  if (params_.postprocessing.use_multi_output) {
+    auto output_tensors = backend_manager_->inferAllOutputs(packed_input);
+    batch_detections = postprocessor_->decodeMultiOutput(output_tensors, metas);
+  } else {
+    auto output_tensor = backend_manager_->infer(packed_input);
+    batch_detections = postprocessor_->decode(output_tensor, metas);
+  }
+
+  auto end_time = std::chrono::steady_clock::now();
+  auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count();
+
+  size_t dropped_count = 0;
+  {
+    std::lock_guard<std::mutex> lock(queue_mutex_);
+    dropped_count = dropped_images_count_;
+  }
+
+  RCLCPP_INFO_THROTTLE(
+    this->get_logger(),
+    *this->get_clock(),
+    2000,
+    "Batch processing completed: %zu images in %" PRId64 " ms, total detections: %zu, dropped images: %zu",
+    batch.size(),
+    static_cast<int64_t>(elapsed_ms),
+    std::accumulate(
+      batch_detections.begin(),
+      batch_detections.end(),
+      size_t(0),
+      [](size_t sum, const auto & dets) { return sum + dets.size(); }),
+    dropped_count);
+
+  publishDetections(batch_detections, headers, metas);
+}
+
+void DeepObjectDetectionNode::publishDetections(
+  const std::vector<std::vector<SimpleDetection>> & batch_detections,
+  const std::vector<std_msgs::msg::Header> & headers,
+  const std::vector<ImageMeta> & metas)
+{
+  if (this->get_current_state().id() != lifecycle_msgs::msg::State::PRIMARY_STATE_ACTIVE) {
+    return;
+  }
+
+  if (!detection_pub_) {
+    RCLCPP_WARN_THROTTLE(
+      this->get_logger(), *this->get_clock(), 1000, "Cannot publish detections: publisher not initialized");
+    return;
+  }
+
+  size_t total_published = 0;
+  for (size_t i = 0; i < batch_detections.size() && i < headers.size() && i < metas.size(); ++i) {
+    Detection2DArrayMsg msg;
+    postprocessor_->fillDetectionMessage(headers[i], batch_detections[i], metas[i], msg);
+    detection_pub_->publish(msg);
+    total_published += batch_detections[i].size();
+  }
+  RCLCPP_INFO_THROTTLE(
+    this->get_logger(),
+    *this->get_clock(),
+    2000,
+    "Published %zu detection messages (%zu total detections)",
+    batch_detections.size(),
+    total_published);
+}
+
+void DeepObjectDetectionNode::loadClassNames()
+{
+  const auto & class_names_path = params_.model_metadata.class_names_file;
+  if (class_names_path.empty()) {
+    return;
+  }
+
+  params_.class_names.clear();
+
+  std::ifstream file(class_names_path);
+  if (!file.is_open()) {
+    throw std::runtime_error("Failed to open class_names_path: " + class_names_path);
+  }
+
+  std::string line;
+  while (std::getline(file, line)) {
+    if (!line.empty()) {
+      params_.class_names.push_back(line);
+    }
+  }
+
+  RCLCPP_INFO(
+    this->get_logger(), "Loaded %zu class names from %s", params_.class_names.size(), class_names_path.c_str());
+}
+
+std::shared_ptr<DeepObjectDetectionNode> createDeepObjectDetectionNode(const rclcpp::NodeOptions & options)
+{
+  return std::make_shared<DeepObjectDetectionNode>(options);
+}
+
+}  // namespace deep_object_detection
+
+RCLCPP_COMPONENTS_REGISTER_NODE(deep_object_detection::DeepObjectDetectionNode)
diff --git a/deep_object_detection/src/generic_postprocessor.cpp b/deep_object_detection/src/generic_postprocessor.cpp
new file mode 100644
index 0000000..54e3b04
--- /dev/null
+++ b/deep_object_detection/src/generic_postprocessor.cpp
@@ -0,0 +1,619 @@
+// Copyright (c) 2025-present WATonomous. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
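+
+// Reference for the score activations implemented below (applyActivation
+// and the softmax branch of decode):
+//   sigmoid(x)   = 1 / (1 + exp(-x))
+//   softmax(x_i) = exp(x_i - max_j x_j) / sum_k exp(x_k - max_j x_j)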
+ +#include "deep_object_detection/generic_postprocessor.hpp" + +#include +#include +#include +#include +#include +#include + +namespace deep_object_detection +{ + +GenericPostprocessor::GenericPostprocessor( + const PostprocessingConfig & config, + const OutputLayout & layout, + BboxFormat bbox_format, + int num_classes, + const std::vector & class_names, + bool use_letterbox) +: config_(config) +, layout_(layout) +, bbox_format_(bbox_format) +, num_classes_(num_classes) +, class_names_(class_names) +, use_letterbox_(use_letterbox) +{} + +GenericPostprocessor::OutputLayout GenericPostprocessor::detectLayout(const std::vector & output_shape) +{ + OutputLayout layout; + layout.shape = output_shape; + layout.auto_detect = true; + + if (output_shape.size() < 2) { + throw std::runtime_error("Output tensor must have at least 2 dimensions"); + } + + layout.batch_dim = 0; + + if (output_shape.size() == 2) { + layout.detection_dim = 0; + layout.feature_dim = 1; + layout.bbox_start_idx = 0; + layout.bbox_count = 4; + layout.score_idx = 4; + layout.class_idx = 5; + } else if (output_shape.size() == 3) { + size_t dim1 = output_shape[1]; + size_t dim2 = output_shape[2]; + + if (dim1 <= 8 && dim2 > 8) { + layout.detection_dim = 2; + layout.feature_dim = 1; + } else if (dim1 > 8 && dim2 <= 8) { + layout.detection_dim = 1; + layout.feature_dim = 2; + } else { + layout.detection_dim = 1; + layout.feature_dim = 2; + } + + layout.bbox_start_idx = 0; + layout.bbox_count = 4; + layout.score_idx = 4; + layout.class_idx = 5; + } else { + layout.detection_dim = 1; + layout.feature_dim = 2; + layout.bbox_start_idx = 0; + layout.bbox_count = 4; + layout.score_idx = 4; + layout.class_idx = 5; + } + + return layout; +} + +GenericPostprocessor::OutputLayout GenericPostprocessor::autoConfigure( + const std::vector & output_shape, const OutputLayoutConfig & layout_config) +{ + OutputLayout layout; + + if (!layout_config.auto_detect) { + layout.auto_detect = false; + layout.batch_dim = static_cast(layout_config.batch_dim); + layout.detection_dim = static_cast(layout_config.detection_dim); + layout.feature_dim = static_cast(layout_config.feature_dim); + layout.bbox_start_idx = static_cast(layout_config.bbox_start_idx); + layout.bbox_count = static_cast(layout_config.bbox_count); + layout.score_idx = static_cast(layout_config.score_idx); + layout.class_idx = (layout_config.class_idx >= 0) ? 
static_cast(layout_config.class_idx) : SIZE_MAX; + if (!output_shape.empty()) { + layout.shape = output_shape; + } + } else if (!output_shape.empty()) { + layout = detectLayout(output_shape); + } else { + layout.auto_detect = true; + } + + return layout; +} + +float GenericPostprocessor::applyActivation(float raw_score) const +{ + switch (config_.score_activation) { + case ScoreActivation::SIGMOID: + return 1.0f / (1.0f + std::exp(-raw_score)); + case ScoreActivation::SOFTMAX: + return raw_score; + case ScoreActivation::NONE: + default: + return raw_score; + } +} + +float GenericPostprocessor::extractValue( + const float * data, + size_t batch_idx, + size_t detection_idx, + size_t feature_idx, + const std::vector & shape) const +{ + if (shape.size() == 2) { + return data[batch_idx * shape[1] + feature_idx]; + } else if (shape.size() == 3) { + if (layout_.detection_dim == 1 && layout_.feature_dim == 2) { + return data[(batch_idx * shape[1] + detection_idx) * shape[2] + feature_idx]; + } else if (layout_.detection_dim == 2 && layout_.feature_dim == 1) { + return data[(batch_idx * shape[1] + feature_idx) * shape[2] + detection_idx]; + } + } + + size_t index = 0; + size_t stride = 1; + for (int i = static_cast(shape.size()) - 1; i >= 0; --i) { + size_t dim_idx = 0; + if (i == static_cast(layout_.batch_dim)) { + dim_idx = batch_idx; + } else if (i == static_cast(layout_.detection_dim)) { + dim_idx = detection_idx; + } else if (i == static_cast(layout_.feature_dim)) { + dim_idx = feature_idx; + } + index += dim_idx * stride; + stride *= shape[i]; + } + + return data[index]; +} + +void GenericPostprocessor::convertBbox( + const float * bbox_data, + size_t batch_idx, + size_t detection_idx, + const std::vector & shape, + SimpleDetection & det) const +{ + float coords[4]; + for (size_t i = 0; i < 4; ++i) { + coords[i] = extractValue(bbox_data, batch_idx, detection_idx, layout_.bbox_start_idx + i, shape); + } + + switch (bbox_format_) { + case BboxFormat::XYXY: { + det.x = (coords[0] + coords[2]) * 0.5f; + det.y = (coords[1] + coords[3]) * 0.5f; + det.width = coords[2] - coords[0]; + det.height = coords[3] - coords[1]; + break; + } + case BboxFormat::XYWH: { + det.x = coords[0] + coords[2] * 0.5f; + det.y = coords[1] + coords[3] * 0.5f; + det.width = coords[2]; + det.height = coords[3]; + break; + } + case BboxFormat::CXCYWH: + default: { + det.x = coords[0]; + det.y = coords[1]; + det.width = coords[2]; + det.height = coords[3]; + break; + } + } +} + +std::vector> GenericPostprocessor::decode( + const deep_ros::Tensor & output, const std::vector & metas) const +{ + std::vector> batch_detections; + + const auto & shape = output.shape(); + if (shape.empty()) { + throw std::runtime_error("Output tensor has empty shape"); + } + + const float * data = output.data_as(); + if (!data) { + throw std::runtime_error("Output tensor has null data"); + } + + OutputLayout layout = layout_; + if (layout.auto_detect) { + layout = detectLayout(shape); + } + + size_t batch_size = shape[layout.batch_dim]; + size_t num_detections = shape.size() > layout.detection_dim ? shape[layout.detection_dim] : 1; + size_t num_features = shape.size() > layout.feature_dim ? 
shape[layout.feature_dim] : shape.back(); + + batch_detections.reserve(std::min(batch_size, metas.size())); + + for (size_t b = 0; b < batch_size && b < metas.size(); ++b) { + std::vector dets; + dets.reserve(num_detections); + + for (size_t d = 0; d < num_detections; ++d) { + float score = 0.0f; + int32_t class_id = -1; + + size_t class_start_idx = + (config_.class_score_start_idx >= 0) ? static_cast(config_.class_score_start_idx) : layout.bbox_count; + size_t class_count = (config_.class_score_count > 0) ? static_cast(config_.class_score_count) + : static_cast(num_classes_); + + if (config_.class_score_mode == ClassScoreMode::ALL_CLASSES && num_features > class_start_idx) { + size_t class_end = std::min(num_features, class_start_idx + class_count); + std::vector class_logits; + class_logits.reserve(class_end - class_start_idx); + + for (size_t c = class_start_idx; c < class_end; ++c) { + float cls_logit = extractValue(data, b, d, c, shape); + class_logits.push_back(cls_logit); + } + + if (config_.score_activation == ScoreActivation::SOFTMAX) { + float max_logit = *std::max_element(class_logits.begin(), class_logits.end()); + float sum_exp = 0.0f; + for (float & logit : class_logits) { + logit = std::exp(logit - max_logit); + sum_exp += logit; + } + + float max_score = -std::numeric_limits::max(); + size_t best_class = 0; + for (size_t i = 0; i < class_logits.size(); ++i) { + float prob = class_logits[i] / sum_exp; + if (prob > max_score) { + max_score = prob; + best_class = i; + } + } + score = max_score; + class_id = static_cast(best_class); + } else { + float max_score = -std::numeric_limits::max(); + size_t best_class = 0; + for (size_t i = 0; i < class_logits.size(); ++i) { + float cls_score = applyActivation(class_logits[i]); + if (cls_score > max_score) { + max_score = cls_score; + best_class = i; + } + } + score = max_score; + class_id = static_cast(best_class); + } + } else if (config_.class_score_mode == ClassScoreMode::SINGLE_CONFIDENCE) { + if (layout.score_idx < num_features) { + float raw_score = extractValue(data, b, d, layout.score_idx, shape); + score = applyActivation(raw_score); + } + if (layout.class_idx < num_features && layout.class_idx != SIZE_MAX) { + class_id = static_cast(std::round(extractValue(data, b, d, layout.class_idx, shape))); + } + } else if (layout.score_idx < num_features) { + float raw_score = extractValue(data, b, d, layout.score_idx, shape); + score = applyActivation(raw_score); + if (layout.class_idx < num_features && layout.class_idx != SIZE_MAX) { + class_id = static_cast(std::round(extractValue(data, b, d, layout.class_idx, shape))); + } + } + + if (score < config_.score_threshold) { + continue; + } + + SimpleDetection det; + convertBbox(data, b, d, shape, det); + det.score = score; + det.class_id = class_id; + + if (config_.coordinate_space == CoordinateSpace::PREPROCESSED) { + adjustToOriginal(det, metas[b], use_letterbox_); + } + dets.push_back(det); + } + + if (config_.enable_nms) { + batch_detections.push_back(applyNms(dets, config_.nms_iou_threshold)); + } else { + batch_detections.push_back(dets); + } + } + + return batch_detections; +} + +std::vector> GenericPostprocessor::decodeMultiOutput( + const std::vector & outputs, const std::vector & metas) const +{ + std::vector> batch_detections; + + if (outputs.empty()) { + throw std::runtime_error("No output tensors provided"); + } + + const int boxes_idx = config_.output_boxes_idx; + const int scores_idx = config_.output_scores_idx; + const int classes_idx = config_.output_classes_idx; + + 
+  if (boxes_idx < 0 || boxes_idx >= static_cast<int>(outputs.size())) {
+    throw std::runtime_error("Invalid output_boxes_idx: " + std::to_string(boxes_idx));
+  }
+  if (scores_idx < 0 || scores_idx >= static_cast<int>(outputs.size())) {
+    throw std::runtime_error("Invalid output_scores_idx: " + std::to_string(scores_idx));
+  }
+
+  const auto & boxes_tensor = outputs[boxes_idx];
+  const auto & scores_tensor = outputs[scores_idx];
+
+  const auto & boxes_shape = boxes_tensor.shape();
+  const auto & scores_shape = scores_tensor.shape();
+
+  if (boxes_shape.empty() || scores_shape.empty()) {
+    throw std::runtime_error("Output tensors have empty shapes");
+  }
+
+  const float * boxes_data = boxes_tensor.data_as<float>();
+  const float * scores_data = scores_tensor.data_as<float>();
+
+  if (!boxes_data || !scores_data) {
+    throw std::runtime_error("Output tensors have null data");
+  }
+
+  size_t batch_size = boxes_shape[0];
+  size_t num_detections = boxes_shape.size() > 1 ? boxes_shape[1] : 1;
+  size_t bbox_dims = boxes_shape.size() > 2 ? boxes_shape[2] : boxes_shape.back();
+
+  if (bbox_dims < 4) {
+    throw std::runtime_error("Boxes tensor must have at least 4 values per detection");
+  }
+
+  batch_detections.reserve(std::min(batch_size, metas.size()));
+
+  for (size_t b = 0; b < batch_size && b < metas.size(); ++b) {
+    std::vector<SimpleDetection> dets;
+    dets.reserve(num_detections);
+
+    for (size_t d = 0; d < num_detections; ++d) {
+      size_t box_offset = (b * num_detections + d) * bbox_dims;
+      float x = boxes_data[box_offset];
+      float y = boxes_data[box_offset + 1];
+      float w = boxes_data[box_offset + 2];
+      float h = boxes_data[box_offset + 3];
+
+      float score = 0.0f;
+      int32_t class_id = -1;
+
+      size_t scores_dim1 = scores_shape.size() > 1 ? scores_shape[1] : 1;
+      size_t scores_dim2 = scores_shape.size() > 2 ? scores_shape[2] : 1;
+
+      if (scores_dim1 == static_cast<size_t>(num_classes_) || scores_dim2 == static_cast<size_t>(num_classes_)) {
+        bool detections_first = (scores_dim1 == num_detections);
+
+        if (detections_first) {
+          // [batch, num_detections, num_classes]
+          size_t score_base = (b * num_detections + d) * num_classes_;
+          float max_score = -std::numeric_limits<float>::max();
+          size_t best_class = 0;
+
+          for (size_t c = 0; c < static_cast<size_t>(num_classes_); ++c) {
+            float raw_score = scores_data[score_base + c];
+            float activated_score = applyActivation(raw_score);
+            if (activated_score > max_score) {
+              max_score = activated_score;
+              best_class = c;
+            }
+          }
+          score = max_score;
+          class_id = static_cast<int32_t>(best_class);
+        } else {
+          // [batch, num_classes, num_detections]
+          float max_score = -std::numeric_limits<float>::max();
+          size_t best_class = 0;
+
+          for (size_t c = 0; c < static_cast<size_t>(num_classes_); ++c) {
+            float raw_score = scores_data[b * num_classes_ * num_detections + c * num_detections + d];
+            float activated_score = applyActivation(raw_score);
+            if (activated_score > max_score) {
+              max_score = activated_score;
+              best_class = c;
+            }
+          }
+          score = max_score;
+          class_id = static_cast<int32_t>(best_class);
+        }
+      } else {
+        size_t score_offset = b * num_detections + d;
+        float raw_score = scores_data[score_offset];
+        score = applyActivation(raw_score);
+
+        if (classes_idx >= 0 && classes_idx < static_cast<int>(outputs.size())) {
+          const auto & classes_tensor = outputs[classes_idx];
+          const auto & classes_shape = classes_tensor.shape();
+          const float * classes_data = classes_tensor.data_as<float>();
+          if (classes_data && classes_shape.size() >= 2) {
+            size_t class_offset = b * classes_shape[1] + d;
+            class_id = static_cast<int32_t>(std::round(classes_data[class_offset]));
+          }
+        }
+      }
+
+      if (score < config_.score_threshold) {
+        continue;
+      }
+
+      SimpleDetection det;
+      if (bbox_format_ == BboxFormat::CXCYWH) {
+        det.x = x;
+        det.y = y;
+        det.width = w;
+        det.height = h;
+      } else if (bbox_format_ == BboxFormat::XYXY) {
+        // Here (x, y, w, h) actually hold (x1, y1, x2, y2); convert to the
+        // center/size convention that adjustToOriginal() expects.
+        det.x = (x + w) * 0.5f;
+        det.y = (y + h) * 0.5f;
+        det.width = w - x;
+        det.height = h - y;
+      } else if (bbox_format_ == BboxFormat::XYWH) {
+        // Top-left corner plus size; convert to center/size as in convertBbox().
+        det.x = x + w * 0.5f;
+        det.y = y + h * 0.5f;
+        det.width = w;
+        det.height = h;
+      }
+
+      det.score = score;
+      det.class_id = class_id;
+
+      if (config_.coordinate_space == CoordinateSpace::PREPROCESSED) {
+        adjustToOriginal(det, metas[b], use_letterbox_);
+      }
+      dets.push_back(det);
+    }
+
+    if (config_.enable_nms) {
+      batch_detections.push_back(applyNms(dets, config_.nms_iou_threshold));
+    } else {
+      batch_detections.push_back(dets);
+    }
+  }
+
+  return batch_detections;
+}
+
+void GenericPostprocessor::adjustToOriginal(SimpleDetection & det, const ImageMeta & meta, bool use_letterbox) const
+{
+  float cx = det.x;
+  float cy = det.y;
+  float w = det.width;
+  float h = det.height;
+
+  cx -= meta.pad_x;
+  cy -= meta.pad_y;
+
+  if (use_letterbox) {
+    // Letterbox stores the original->preprocessed scale factor, so invert it here.
+    const float inv_scale = meta.scale_x > 0.0f ? 1.0f / meta.scale_x : 1.0f;
+    cx *= inv_scale;
+    cy *= inv_scale;
+    w *= inv_scale;
+    h *= inv_scale;
+  } else {
+    // Plain resize stores the preprocessed->original ratio directly.
+    cx *= meta.scale_x;
+    cy *= meta.scale_y;
+    w *= meta.scale_x;
+    h *= meta.scale_y;
+  }
+
+  det.x = std::max(0.0f, cx - w * 0.5f);
+  det.y = std::max(0.0f, cy - h * 0.5f);
+  det.width = w;
+  det.height = h;
+
+  det.x = std::min(det.x, static_cast<float>(meta.original_width));
+  det.y = std::min(det.y, static_cast<float>(meta.original_height));
+  det.width = std::min(det.width, static_cast<float>(meta.original_width) - det.x);
+  det.height = std::min(det.height, static_cast<float>(meta.original_height) - det.y);
+}
+
+float GenericPostprocessor::iou(const SimpleDetection & a, const SimpleDetection & b)
+{
+  const float x1 = std::max(a.x, b.x);
+  const float y1 = std::max(a.y, b.y);
+  const float x2 = std::min(a.x + a.width, b.x + b.width);
+  const float y2 = std::min(a.y + a.height, b.y + b.height);
+
+  const float inter_w = std::max(0.0f, x2 - x1);
+  const float inter_h = std::max(0.0f, y2 - y1);
+  const float inter_area = inter_w * inter_h;
+  const float area_a = a.width * a.height;
+  const float area_b = b.width * b.height;
+  const float union_area = area_a + area_b - inter_area;
+
+  if (union_area <= 0.0f) {
+    return 0.0f;
+  }
+  return inter_area / union_area;
+}
+
+std::vector<SimpleDetection> GenericPostprocessor::applyNms(
+  std::vector<SimpleDetection> dets, float iou_threshold) const
+{
+  std::vector<SimpleDetection> result;
+  if (dets.empty()) {
+    return result;
+  }
+
+  std::stable_sort(
+    dets.begin(), dets.end(), [](const SimpleDetection & a, const SimpleDetection & b) { return a.score > b.score; });
+
+  std::vector<bool> suppressed(dets.size(), false);
+  for (size_t i = 0; i < dets.size(); ++i) {
+    if (suppressed[i]) {
+      continue;
+    }
+    result.push_back(dets[i]);
+    for (size_t j = i + 1; j < dets.size(); ++j) {
+      if (suppressed[j]) {
+        continue;
+      }
+      if (dets[i].class_id != dets[j].class_id) {
+        continue;
+      }
+      if (iou(dets[i], dets[j]) > iou_threshold) {
+        suppressed[j] = true;
+      }
+    }
+  }
+
+  return result;
+}
+
+std::string GenericPostprocessor::classLabel(int class_id, const std::vector<std::string> & class_names) const
+{
+  if (class_id >= 0 && static_cast<size_t>(class_id) < class_names.size()) {
+    return class_names[static_cast<size_t>(class_id)];
+  }
+  return std::to_string(class_id);
+}
+
+void GenericPostprocessor::fillDetectionMessage(
+  const std_msgs::msg::Header & header,
+  const std::vector<SimpleDetection> & detections,
+  const ImageMeta & meta,
+  Detection2DArrayMsg & out_msg) const
+{
+  out_msg.header = header;
+
+#if __has_include(<deep_msgs/msg/detection2_d_array.hpp>)
+  (void)meta;
+  out_msg.detections.clear();
+  out_msg.detections.reserve(detections.size());
+  for (const auto & det : detections) {
+    Detection2DMsg d;
+    d.x = det.x;
+    d.y = det.y;
+    d.width = det.width;
+    d.height = det.height;
+    d.score = det.score;
+    d.class_id = det.class_id;
+    d.label = classLabel(det.class_id, class_names_);
+    out_msg.detections.push_back(d);
+  }
+#else
+  (void)meta;
+  out_msg.detections.clear();
+  out_msg.detections.reserve(detections.size());
+  for (const auto & det : detections) {
+    Detection2DMsg d;
+    d.header = header;
+    d.bbox.center.position.x = det.x + det.width * 0.5;
+    d.bbox.center.position.y = det.y + det.height * 0.5;
+    d.bbox.size_x = det.width;
+    d.bbox.size_y = det.height;
+
+    vision_msgs::msg::ObjectHypothesisWithPose hyp;
+    hyp.hypothesis.class_id = classLabel(det.class_id, class_names_);
+    hyp.hypothesis.score = det.score;
+    d.results.push_back(hyp);
+    out_msg.detections.push_back(d);
+  }
+#endif
+}
+
+}  // namespace deep_object_detection
diff --git a/deep_object_detection/src/image_preprocessor.cpp b/deep_object_detection/src/image_preprocessor.cpp
new file mode 100644
index 0000000..3a963d5
--- /dev/null
+++ b/deep_object_detection/src/image_preprocessor.cpp
@@ -0,0 +1,282 @@
+// Copyright (c) 2025-present WATonomous. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "deep_object_detection/image_preprocessor.hpp"
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <cstring>
+#include <stdexcept>
+#include <string>
+
+#include <opencv2/imgproc.hpp>
+
+namespace deep_object_detection
+{
+
+ImagePreprocessor::ImagePreprocessor(const PreprocessingConfig & config)
+: config_(config)
+{}
+
+cv::Mat ImagePreprocessor::preprocess(const cv::Mat & bgr, ImageMeta & meta) const
+{
+  if (bgr.empty()) {
+    throw std::runtime_error("Input image is empty");
+  }
+
+  meta.original_width = bgr.cols;
+  meta.original_height = bgr.rows;
+
+  cv::Mat resized;
+
+  switch (config_.resize_method) {
+    case ResizeMethod::LETTERBOX:
+      resized = applyLetterbox(bgr, meta);
+      break;
+    case ResizeMethod::CROP:
+      resized = applyCrop(bgr, meta);
+      break;
+    case ResizeMethod::PAD:
+      resized = applyPad(bgr, meta);
+      break;
+    case ResizeMethod::RESIZE:
+    default:
+      resized = applyResize(bgr, meta);
+      break;
+  }
+
+  cv::Mat float_image;
+  resized.convertTo(float_image, CV_32F, 1.0 / 255.0);
+  applyNormalization(float_image);
+  applyColorConversion(float_image);
+
+  if (float_image.empty()) {
+    throw std::runtime_error("Preprocessing failed: result image is empty");
+  }
+  if (float_image.rows != config_.input_height || float_image.cols != config_.input_width) {
+    throw std::runtime_error(
+      "Preprocessing failed: result image size (" + std::to_string(float_image.cols) + "x" +
+      std::to_string(float_image.rows) + ") doesn't match expected size (" + std::to_string(config_.input_width) +
+      "x" + std::to_string(config_.input_height) + ")");
+  }
+  if (float_image.channels() != static_cast<int>(RGB_CHANNELS)) {
+    throw std::runtime_error(
+      "Preprocessing failed: result image has " + std::to_string(float_image.channels()) + " channels, expected " +
+      std::to_string(RGB_CHANNELS));
+  }
+
+  return float_image;
+}
+
+cv::Mat ImagePreprocessor::applyLetterbox(const cv::Mat & bgr, ImageMeta & meta) const
+{
+  const float scale = std::min(
+    static_cast<float>(config_.input_width) / static_cast<float>(bgr.cols),
+    static_cast<float>(config_.input_height) / static_cast<float>(bgr.rows));
+  const int new_w = std::max(1, static_cast<int>(std::round(bgr.cols * scale)));
+  const int new_h = std::max(1, static_cast<int>(std::round(bgr.rows * scale)));
+
+  cv::Mat resized;
+  cv::resize(bgr, resized, cv::Size(new_w, new_h));
+
+  const int pad_w = config_.input_width - new_w;
+  const int pad_h = config_.input_height - new_h;
+  const int pad_left = pad_w / 2;
+  const int pad_right = pad_w - pad_left;
+  const int pad_top = pad_h / 2;
+  const int pad_bottom = pad_h - pad_top;
+
+  cv::copyMakeBorder(
+    resized,
+    resized,
+    pad_top,
+    pad_bottom,
+    pad_left,
+    pad_right,
+    cv::BORDER_CONSTANT,
+    cv::Scalar(config_.pad_value, config_.pad_value, config_.pad_value));
+
+  meta.scale_x = meta.scale_y = scale;
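+  // Scale and pad offsets are recorded so adjustToOriginal() can undo the letterbox.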
+  meta.pad_x = static_cast<float>(pad_left);
+  meta.pad_y = static_cast<float>(pad_top);
+
+  return resized;
+}
+
+cv::Mat ImagePreprocessor::applyResize(const cv::Mat & bgr, ImageMeta & meta) const
+{
+  cv::Mat resized;
+  cv::resize(bgr, resized, cv::Size(config_.input_width, config_.input_height));
+  meta.scale_x = static_cast<float>(meta.original_width) / static_cast<float>(config_.input_width);
+  meta.scale_y = static_cast<float>(meta.original_height) / static_cast<float>(config_.input_height);
+  meta.pad_x = meta.pad_y = 0.0f;
+  return resized;
+}
+
+cv::Mat ImagePreprocessor::applyCrop(const cv::Mat & bgr, ImageMeta & meta) const
+{
+  const int crop_x = (bgr.cols - config_.input_width) / 2;
+  const int crop_y = (bgr.rows - config_.input_height) / 2;
+
+  cv::Mat cropped;
+  if (bgr.cols >= config_.input_width && bgr.rows >= config_.input_height) {
+    cropped = bgr(cv::Rect(crop_x, crop_y, config_.input_width, config_.input_height)).clone();
+  } else {
+    float scale =
+      std::max(static_cast<float>(config_.input_width) / bgr.cols, static_cast<float>(config_.input_height) / bgr.rows);
+    cv::Mat scaled;
+    cv::resize(bgr, scaled, cv::Size(), scale, scale);
+    const int new_crop_x = (scaled.cols - config_.input_width) / 2;
+    const int new_crop_y = (scaled.rows - config_.input_height) / 2;
+    cropped = scaled(cv::Rect(new_crop_x, new_crop_y, config_.input_width, config_.input_height)).clone();
+  }
+
+  meta.scale_x = static_cast<float>(meta.original_width) / static_cast<float>(config_.input_width);
+  meta.scale_y = static_cast<float>(meta.original_height) / static_cast<float>(config_.input_height);
+  meta.pad_x = static_cast<float>(-crop_x);
+  meta.pad_y = static_cast<float>(-crop_y);
+
+  return cropped;
+}
+
+cv::Mat ImagePreprocessor::applyPad(const cv::Mat & bgr, ImageMeta & meta) const
+{
+  const int pad_w = config_.input_width - bgr.cols;
+  const int pad_h = config_.input_height - bgr.rows;
+
+  cv::Mat padded;
+  if (pad_w >= 0 && pad_h >= 0) {
+    const int pad_left = pad_w / 2;
+    const int pad_right = pad_w - pad_left;
+    const int pad_top = pad_h / 2;
+    const int pad_bottom = pad_h - pad_top;
+
+    cv::copyMakeBorder(
+      bgr,
+      padded,
+      pad_top,
+      pad_bottom,
+      pad_left,
+      pad_right,
+      cv::BORDER_CONSTANT,
+      cv::Scalar(config_.pad_value, config_.pad_value, config_.pad_value));
+
+    meta.pad_x = static_cast<float>(pad_left);
+    meta.pad_y = static_cast<float>(pad_top);
+  } else {
+    return applyResize(bgr, meta);
+  }
+
+  meta.scale_x = meta.scale_y = 1.0f;
+  return padded;
+}
+
+void ImagePreprocessor::applyNormalization(cv::Mat & image) const
+{
+  switch (config_.normalization_type) {
+    case NormalizationType::IMAGENET: {
+      static const std::array<float, 3> imagenet_mean = {0.485f, 0.456f, 0.406f};
+      static const std::array<float, 3> imagenet_std = {0.229f, 0.224f, 0.225f};
+
+      // Channels are still in BGR order here, so the RGB constants are indexed in reverse.
+      std::vector<cv::Mat> channels;
+      cv::split(image, channels);
+      channels[0] = (channels[0] - imagenet_mean[2]) / imagenet_std[2];
+      channels[1] = (channels[1] - imagenet_mean[1]) / imagenet_std[1];
+      channels[2] = (channels[2] - imagenet_mean[0]) / imagenet_std[0];
+
+      cv::merge(channels, image);
+      break;
+    }
+    case NormalizationType::CUSTOM: {
+      if (config_.mean.size() >= RGB_CHANNELS && config_.std.size() >= RGB_CHANNELS) {
+        std::vector<cv::Mat> channels;
+        cv::split(image, channels);
+        for (size_t i = 0; i < RGB_CHANNELS; ++i) {
+          channels[i] = (channels[i] - config_.mean[i]) / config_.std[i];
+        }
+
+        cv::merge(channels, image);
+      }
+      break;
+    }
+    case NormalizationType::SCALE_0_1:
+    case NormalizationType::NONE:
+    default:
+      break;
+  }
+}
+
+void ImagePreprocessor::applyColorConversion(cv::Mat & image) const
+{
+  if (config_.color_format == "rgb" || config_.color_format == "RGB") {
+    cv::cvtColor(image, image, cv::COLOR_BGR2RGB);
+  }
+}
+
+const PackedInput & ImagePreprocessor::pack(const std::vector<cv::Mat> & images) const
+{
+  auto & packed = packed_input_cache_;
+  packed.data.clear();
+  packed.shape.clear();
+  if (images.empty()) {
+    return packed;
+  }
+
+  const size_t batch = images.size();
+  const size_t channels = RGB_CHANNELS;
+  const size_t height = images[0].rows;
+  const size_t width = images[0].cols;
+  const size_t image_size = channels * height * width;
+  const size_t required = batch * image_size;
+
+  packed.shape = {batch, channels, height, width};
+  packed.data.resize(required);
+
+  std::array<cv::Mat, RGB_CHANNELS> channel_planes;
+  const size_t plane_elements = height * width;
+  const size_t plane_bytes = plane_elements * sizeof(float);
+  for (auto & plane : channel_planes) {
+    plane.create(static_cast<int>(height), static_cast<int>(width), CV_32F);
+  }
+
+  for (size_t b = 0; b < batch; ++b) {
+    const cv::Mat & img = images[b];
+    if (img.empty()) {
+      throw std::runtime_error("Image at batch index " + std::to_string(b) + " is empty");
+    }
+    if (img.channels() != static_cast<int>(RGB_CHANNELS) || img.type() != CV_32FC3) {
+      throw std::runtime_error(
+        "Preprocessed image at batch index " + std::to_string(b) + " must be CV_32FC3 (expected " +
+        std::to_string(RGB_CHANNELS) + " channels)");
+    }
+    if (img.rows != static_cast<int>(height) || img.cols != static_cast<int>(width)) {
+      throw std::runtime_error(
+        "Preprocessed image at batch index " + std::to_string(b) + " has size " + std::to_string(img.cols) + "x" +
+        std::to_string(img.rows) + ", expected " + std::to_string(width) + "x" + std::to_string(height));
+    }
+
+    cv::split(img, channel_planes.data());
+    float * batch_base = packed.data.data() + b * image_size;
+    for (size_t c = 0; c < channels; ++c) {
+      const float * src_ptr = channel_planes[c].ptr<float>();
+      float * dst_ptr = batch_base + c * plane_elements;
+      std::memcpy(dst_ptr, src_ptr, plane_bytes);
+    }
+  }
+
+  return packed;
+}
+
+}  // namespace deep_object_detection
diff --git a/deep_object_detection/src/main.cpp b/deep_object_detection/src/main.cpp
new file mode 100644
index 0000000..ae96955
--- /dev/null
+++ b/deep_object_detection/src/main.cpp
@@ -0,0 +1,34 @@
+// Copyright (c) 2025-present WATonomous. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
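+
+// Entry point: spins the detection node on a multi-threaded executor so image
+// callbacks and inference can proceed concurrently.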
+
+#include <iostream>
+
+#include <rclcpp/rclcpp.hpp>
+
+#include "deep_object_detection/deep_object_detection_node.hpp"
+
+int main(int argc, char ** argv)
+{
+  rclcpp::init(argc, argv);
+  try {
+    auto node = deep_object_detection::createDeepObjectDetectionNode();
+    rclcpp::executors::MultiThreadedExecutor executor(rclcpp::ExecutorOptions(), 4);
+    executor.add_node(node->get_node_base_interface());
+    executor.spin();
+  } catch (const std::exception & e) {
+    std::cerr << "Failed to start deep_object_detection_node: " << e.what() << std::endl;
+  }
+  rclcpp::shutdown();
+  return 0;
+}
diff --git a/deep_ort_backend_plugin/src/ort_backend_executor.cpp b/deep_ort_backend_plugin/src/ort_backend_executor.cpp
index b8805a6..48fa989 100644
--- a/deep_ort_backend_plugin/src/ort_backend_executor.cpp
+++ b/deep_ort_backend_plugin/src/ort_backend_executor.cpp
@@ -78,9 +78,7 @@ deep_ros::Tensor OrtBackendExecutor::run_inference_impl(deep_ros::Tensor & input
     ONNXTensorElementDataType onnx_type = convert_to_onnx_type(input.dtype());
     std::vector<int64_t> input_shape_int64(input.shape().begin(), input.shape().end());
 
-    // Get our custom allocator for output binding
     auto custom_allocator_shared = get_ort_cpu_allocator();
-    auto * custom_allocator = static_cast<OrtCpuMemoryAllocator *>(custom_allocator_shared.get());
 
     // Create input tensor that wraps existing input memory (zero-copy!)
     size_t input_size_bytes = input.byte_size();
@@ -91,28 +89,22 @@ deep_ros::Tensor OrtBackendExecutor::run_inference_impl(deep_ros::Tensor & input
     Ort::AllocatorWithDefaultOptions allocator;
     auto input_name = session_->GetInputNameAllocated(0, allocator);
     auto output_name = session_->GetOutputNameAllocated(0, allocator);
+    const char * input_names[] = {input_name.get()};
+    const char * output_names[] = {output_name.get()};
 
-    // Create IO binding for zero-copy inference
-    Ort::IoBinding binding(*session_);
-    binding.BindInput(input_name.get(), ort_input);
+    auto output_tensors = session_->Run(Ort::RunOptions{nullptr}, input_names, &ort_input, 1, output_names, 1);
 
-    // Bind output to use our custom allocator - ONNX Runtime will allocate using our allocator
-    binding.BindOutput(output_name.get(), custom_allocator->get_ort_memory_info());
-
-    // Run inference with IO binding (zero-copy for both input and output!)
-    Ort::RunOptions run_options;
-    session_->Run(run_options, binding);
-
-    // Get output values allocated by ONNX Runtime using our custom allocator
-    Ort::AllocatorWithDefaultOptions default_allocator;
-    std::vector<Ort::Value> output_tensors = binding.GetOutputValues(default_allocator);
+    if (output_tensors.empty()) {
+      throw std::runtime_error("ONNX Runtime inference returned no outputs");
+    }
 
-    // Get output shape and create our tensor wrapping the ONNX-allocated memory
-    auto output_shape = get_output_shape(input.shape());
-    void * output_data = output_tensors[0].GetTensorMutableData<void>();
+    auto & output_tensor = output_tensors.front();
+    auto output_info = output_tensor.GetTensorTypeAndShapeInfo();
+    auto output_shape_int64 = output_info.GetShape();
+    std::vector<size_t> output_shape(output_shape_int64.begin(), output_shape_int64.end());
 
-    // Create deep_ros tensor that wraps the ONNX-allocated memory (zero-copy!)
-    deep_ros::Tensor output(output_data, output_shape, input.dtype());
+    deep_ros::Tensor output(output_shape, input.dtype(), custom_allocator_shared);
+    std::memcpy(output.data(), output_tensor.GetTensorData<void>(), output.byte_size());
 
     return output;
   } catch (const std::exception & e) {
diff --git a/deep_ort_backend_plugin/src/ort_cpu_memory_allocator.cpp b/deep_ort_backend_plugin/src/ort_cpu_memory_allocator.cpp
index d8ca4c2..697df84 100644
--- a/deep_ort_backend_plugin/src/ort_cpu_memory_allocator.cpp
+++ b/deep_ort_backend_plugin/src/ort_cpu_memory_allocator.cpp
@@ -67,7 +67,7 @@ const OrtMemoryInfo * OrtCpuMemoryAllocator::get_ort_memory_info() const
   return ort_memory_info_;
 }
 
-void * ORT_API_CALL OrtCpuMemoryAllocator::ort_alloc(OrtAllocator * this_, size_t size)
+void * ORT_API_CALL OrtCpuMemoryAllocator::ort_alloc(OrtAllocator * /*this_*/, size_t size)
 {
   if (instance_) {
     return instance_->allocate(size);
@@ -75,14 +75,14 @@ void * ORT_API_CALL OrtCpuMemoryAllocator::ort_alloc(OrtAllocator * this_, size_
   return nullptr;
 }
 
-void ORT_API_CALL OrtCpuMemoryAllocator::ort_free(OrtAllocator * this_, void * p)
+void ORT_API_CALL OrtCpuMemoryAllocator::ort_free(OrtAllocator * /*this_*/, void * p)
 {
   if (instance_) {
     instance_->deallocate(p);
   }
 }
 
-const OrtMemoryInfo * ORT_API_CALL OrtCpuMemoryAllocator::ort_info(const OrtAllocator * this_)
+const OrtMemoryInfo * ORT_API_CALL OrtCpuMemoryAllocator::ort_info(const OrtAllocator * /*this_*/)
 {
   if (instance_) {
     return instance_->get_ort_memory_info();
@@ -90,7 +90,7 @@ const OrtMemoryInfo * ORT_API_CALL OrtCpuMemoryAllocator::ort_info(const OrtAllo
   return nullptr;
 }
 
-void * ORT_API_CALL OrtCpuMemoryAllocator::ort_reserve(OrtAllocator * this_, size_t size)
+void * ORT_API_CALL OrtCpuMemoryAllocator::ort_reserve(OrtAllocator * /*this_*/, size_t size)
 {
   if (instance_) {
     return instance_->allocate(size);
diff --git a/deep_ort_gpu_backend_plugin/include/deep_ort_gpu_backend_plugin/ort_gpu_backend_executor.hpp b/deep_ort_gpu_backend_plugin/include/deep_ort_gpu_backend_plugin/ort_gpu_backend_executor.hpp
index 6035f1c..c21785a 100644
--- a/deep_ort_gpu_backend_plugin/include/deep_ort_gpu_backend_plugin/ort_gpu_backend_executor.hpp
+++ b/deep_ort_gpu_backend_plugin/include/deep_ort_gpu_backend_plugin/ort_gpu_backend_executor.hpp
@@ -16,6 +16,7 @@
 #include <onnxruntime_cxx_api.h>
 
+#include <chrono>
 #include <filesystem>
 #include <memory>
 #include <string>
@@ -43,7 +44,12 @@ class OrtGpuBackendExecutor : public deep_ros::BackendInferenceExecutor
    * @param execution_provider GPU execution provider: "cuda" or "tensorrt"
    * @param logger ROS logger for diagnostic messages
    */
-  explicit OrtGpuBackendExecutor(int device_id, const std::string & execution_provider, const rclcpp::Logger & logger);
+  explicit OrtGpuBackendExecutor(
+    int device_id,
+    const std::string & execution_provider,
+    const rclcpp::Logger & logger,
+    bool enable_trt_engine_cache,
+    std::string trt_engine_cache_path);
 
   /**
    * @brief Destructor
@@ -88,6 +94,8 @@ class OrtGpuBackendExecutor : public deep_ros::BackendInferenceExecutor
   int device_id_;
   std::string execution_provider_;
   rclcpp::Logger logger_;
+  bool enable_trt_engine_cache_;
+  std::string trt_engine_cache_path_;
   std::unique_ptr<Ort::Env> env_;
   std::unique_ptr<Ort::Session> session_;
   std::unique_ptr<Ort::SessionOptions> session_options_;
@@ -122,14 +130,19 @@ class OrtGpuBackendExecutor : public deep_ros::BackendInferenceExecutor
    * @return Expected output tensor shape
    * @throws std::runtime_error if model not loaded or shape inference fails
    */
-  std::vector<size_t> get_output_shape(const std::vector<size_t> & input_shape) const;
-
   /**
   * @brief Get element size in bytes for a data type
   * @param dtype Data type
   * @return Size in bytes per element
   */
  size_t get_element_size(deep_ros::DataType dtype) const;
+
+  bool is_tensorrt_provider() const;
+
+  std::string normalized_provider_;
+  bool tensorrt_engine_ready_{false};
+  bool tensorrt_engine_build_in_progress_{false};
+  std::chrono::steady_clock::time_point tensorrt_engine_build_start_;
 };
 
 }  // namespace deep_ort_gpu_backend
diff --git a/deep_ort_gpu_backend_plugin/include/deep_ort_gpu_backend_plugin/ort_gpu_backend_plugin.hpp b/deep_ort_gpu_backend_plugin/include/deep_ort_gpu_backend_plugin/ort_gpu_backend_plugin.hpp
index e456f73..f822e52 100644
--- a/deep_ort_gpu_backend_plugin/include/deep_ort_gpu_backend_plugin/ort_gpu_backend_plugin.hpp
+++ b/deep_ort_gpu_backend_plugin/include/deep_ort_gpu_backend_plugin/ort_gpu_backend_plugin.hpp
@@ -80,6 +80,8 @@ class OrtGpuBackendPlugin : public deep_ros::DeepBackendPlugin
   rclcpp_lifecycle::LifecycleNode::SharedPtr node_;
   int device_id_;
   std::string execution_provider_;
+  bool enable_trt_engine_cache_;
+  std::string trt_engine_cache_path_;
   std::shared_ptr<deep_ros::BackendMemoryAllocator> allocator_;
   std::shared_ptr<OrtGpuBackendExecutor> executor_;
diff --git a/deep_ort_gpu_backend_plugin/src/ort_gpu_backend_executor.cpp b/deep_ort_gpu_backend_plugin/src/ort_gpu_backend_executor.cpp
index 38d5362..22bd521 100644
--- a/deep_ort_gpu_backend_plugin/src/ort_gpu_backend_executor.cpp
+++ b/deep_ort_gpu_backend_plugin/src/ort_gpu_backend_executor.cpp
@@ -26,6 +26,7 @@
 #include <algorithm>
 #include <chrono>
 #include <cstring>
+#include <filesystem>
 #include <memory>
 
 #include "deep_ort_gpu_backend_plugin/ort_gpu_memory_allocator.hpp"
@@ -39,10 +40,16 @@ namespace deep_ort_gpu_backend
 class OrtGpuCpuMemoryAllocator;
 
 OrtGpuBackendExecutor::OrtGpuBackendExecutor(
-  int device_id, const std::string & execution_provider, const rclcpp::Logger & logger)
+  int device_id,
+  const std::string & execution_provider,
+  const rclcpp::Logger & logger,
+  bool enable_trt_engine_cache,
+  std::string trt_engine_cache_path)
 : device_id_(device_id)
 , execution_provider_(execution_provider)
 , logger_(logger)
+, enable_trt_engine_cache_(enable_trt_engine_cache)
+, trt_engine_cache_path_(std::move(trt_engine_cache_path))
 , memory_info_(nullptr)
 {
   // Initialize ORT environment with minimal logging
@@ -80,7 +87,28 @@ bool OrtGpuBackendExecutor::load_model_impl(const std::filesystem::path & model_
     model_path_ = model_path;
 
     // Create session with GPU execution provider
+    // Note: for TensorRT, this is where the engine is built or loaded from cache.
+    // Even with caching, this can take 5-30+ seconds depending on model size and GPU.
+    const bool using_tensorrt = is_tensorrt_provider();
+    if (using_tensorrt) {
+      if (enable_trt_engine_cache_) {
+        RCLCPP_INFO(
+          logger_,
+          "Creating TensorRT session (loading cached engine from %s if available)...",
+          trt_engine_cache_path_.c_str());
+      } else {
+        RCLCPP_INFO(logger_, "Creating TensorRT session (building engine, caching disabled)...");
+      }
+    } else {
+      RCLCPP_INFO(logger_, "Creating CUDA session...");
+    }
+
+    auto start_time = std::chrono::steady_clock::now();
     session_ = std::make_unique<Ort::Session>(*env_, model_path_.c_str(), *session_options_);
+    auto end_time = std::chrono::steady_clock::now();
+    auto duration_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count();
+
+    RCLCPP_INFO(logger_, "Session created in %.2f seconds", duration_ms / 1000.0);
 
     return true;
   } catch (const std::exception & e) {
@@ -95,8 +123,19 @@ deep_ros::Tensor OrtGpuBackendExecutor::run_inference_impl(deep_ros::Tensor & in
   if (!session_) {
     throw std::runtime_error("No ONNX session available");
   }
-
+  // The first inference on TensorRT triggers the engine build; track it for logging.
   try {
+    const bool using_tensorrt = is_tensorrt_provider();
+    if (using_tensorrt && !tensorrt_engine_ready_ && !tensorrt_engine_build_in_progress_) {
+      tensorrt_engine_build_in_progress_ = true;
+      tensorrt_engine_build_start_ = std::chrono::steady_clock::now();
+      RCLCPP_INFO(
+        logger_,
+        "TensorRT engine build started (model=%s, batch_size=%zu)",
+        model_path_.filename().c_str(),
+        input.shape().empty() ? 0U : input.shape().front());
+    }
+
     // Convert deep_ros::DataType to ONNX tensor element type
     ONNXTensorElementDataType onnx_type = convert_to_onnx_type(input.dtype());
     std::vector<int64_t> input_shape_int64(input.shape().begin(), input.shape().end());
@@ -131,15 +170,39 @@ deep_ros::Tensor OrtGpuBackendExecutor::run_inference_impl(deep_ros::Tensor & in
     Ort::AllocatorWithDefaultOptions default_allocator;
     std::vector<Ort::Value> output_tensors = binding.GetOutputValues(default_allocator);
 
-    // Get output shape and data
-    auto output_shape = get_output_shape(input.shape());
-    void * output_data = output_tensors[0].GetTensorMutableData<void>();
+    if (output_tensors.empty()) {
+      throw std::runtime_error("TensorRT inference returned no outputs");
+    }
+
+    auto & output_tensor = output_tensors.front();
+    auto output_info = output_tensor.GetTensorTypeAndShapeInfo();
+    auto output_shape_int64 = output_info.GetShape();
+
+    std::vector<size_t> output_shape;
+    output_shape.reserve(output_shape_int64.size());
+    for (int64_t dim : output_shape_int64) {
+      if (dim <= 0) {
+        throw std::runtime_error("Invalid runtime output dimension: " + std::to_string(dim));
+      }
+      output_shape.push_back(static_cast<size_t>(dim));
+    }
+
+    void * output_data = output_tensor.GetTensorMutableData<void>();
 
     // Create deep_ros tensor wrapping the ONNX-allocated memory
     deep_ros::Tensor output(output_data, output_shape, input.dtype());
 
+    if (tensorrt_engine_build_in_progress_) {
+      const auto elapsed =
+        std::chrono::duration<double>(std::chrono::steady_clock::now() - tensorrt_engine_build_start_);
+      RCLCPP_INFO(logger_, "TensorRT engine build completed in %.2f seconds", elapsed.count());
+      tensorrt_engine_build_in_progress_ = false;
+      tensorrt_engine_ready_ = true;
+    }
+
     return output;
   } catch (const std::exception & e) {
+    tensorrt_engine_build_in_progress_ = false;
    throw std::runtime_error("GPU inference failed: " + std::string(e.what()));
   }
 }
@@ -157,12 +220,12 @@ void OrtGpuBackendExecutor::initialize_session_options()
   session_options_->SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
 
   // Configure execution provider based on string
-  std::string provider_lower = execution_provider_;
-  std::transform(provider_lower.begin(), provider_lower.end(), provider_lower.begin(), ::tolower);
+  normalized_provider_ = execution_provider_;
+  std::transform(normalized_provider_.begin(), normalized_provider_.end(), normalized_provider_.begin(), ::tolower);
 
-  if (provider_lower == "cuda") {
+  if (normalized_provider_ == "cuda") {
     configure_cuda_provider();
-  } else if (provider_lower == "tensorrt") {
+  } else if (normalized_provider_ == "tensorrt") {
     configure_tensorrt_provider();
   } else {
     throw std::runtime_error(
@@ -206,13 +269,28 @@ void OrtGpuBackendExecutor::configure_tensorrt_provider()
   tensorrt_options.trt_max_workspace_size = 67108864;  // 64MB
   tensorrt_options.trt_max_partition_iterations = 1;
   tensorrt_options.trt_min_subgraph_size = 1;
-  tensorrt_options.trt_engine_cache_enable = 0;
+  tensorrt_options.trt_engine_cache_enable = enable_trt_engine_cache_ ? 1 : 0;
   tensorrt_options.trt_force_sequential_engine_build = 1;
   tensorrt_options.trt_fp16_enable = 0;
   tensorrt_options.trt_int8_enable = 0;
   tensorrt_options.has_user_compute_stream = 0;
   tensorrt_options.user_compute_stream = nullptr;
 
+  std::string cache_path = trt_engine_cache_path_;
+  if (enable_trt_engine_cache_) {
+    if (cache_path.empty()) {
+      cache_path = "/tmp/deep_ros_ort_trt_cache";
+    }
+    std::error_code ec;
+    std::filesystem::create_directories(cache_path, ec);
+    if (ec) {
+      RCLCPP_WARN(
+        logger_, "Failed to create TensorRT cache directory %s: %s", cache_path.c_str(), ec.message().c_str());
+    }
+    trt_engine_cache_path_ = cache_path;
+    tensorrt_options.trt_engine_cache_path = trt_engine_cache_path_.c_str();
+  }
+
   RCLCPP_INFO(logger_, "Configuring TensorRT execution provider on device %d", device_id_);
   session_options_->AppendExecutionProvider_TensorRT(tensorrt_options);
   RCLCPP_INFO(logger_, "TensorRT provider registered successfully");
@@ -239,34 +317,6 @@ ONNXTensorElementDataType OrtGpuBackendExecutor::convert_to_onnx_type(deep_ros::
   }
 }
 
-std::vector<size_t> OrtGpuBackendExecutor::get_output_shape(const std::vector<size_t> & input_shape) const
-{
-  if (!session_) {
-    throw std::runtime_error("No model loaded");
-  }
-
-  try {
-    // Get output tensor info from the session
-    auto output_type_info = session_->GetOutputTypeInfo(0);
-    auto tensor_info = output_type_info.GetTensorTypeAndShapeInfo();
-    auto shape = tensor_info.GetShape();
-
-    std::vector<size_t> output_shape;
-    for (int64_t dim : shape) {
-      if (dim == -1) {
-        // Dynamic dimension - use corresponding input dimension
-        output_shape.push_back(input_shape[output_shape.size()]);
-      } else {
-        output_shape.push_back(static_cast<size_t>(dim));
-      }
-    }
-
-    return output_shape;
-  } catch (const std::exception & e) {
-    throw std::runtime_error("Failed to get output shape: " + std::string(e.what()));
-  }
-}
-
 size_t OrtGpuBackendExecutor::get_element_size(deep_ros::DataType dtype) const
 {
   switch (dtype) {
@@ -285,4 +335,9 @@ size_t OrtGpuBackendExecutor::get_element_size(deep_ros::DataType dtype) const
   }
 }
 
+bool OrtGpuBackendExecutor::is_tensorrt_provider() const
+{
+  return normalized_provider_ == "tensorrt";
+}
+
 }  // namespace deep_ort_gpu_backend
diff --git a/deep_ort_gpu_backend_plugin/src/ort_gpu_backend_plugin.cpp b/deep_ort_gpu_backend_plugin/src/ort_gpu_backend_plugin.cpp
index 5bf0757..6709ca4 100644
--- a/deep_ort_gpu_backend_plugin/src/ort_gpu_backend_plugin.cpp
+++ b/deep_ort_gpu_backend_plugin/src/ort_gpu_backend_plugin.cpp
@@ -29,6 +29,8 @@ namespace deep_ort_gpu_backend
 OrtGpuBackendPlugin::OrtGpuBackendPlugin()
 : device_id_(0)
 , execution_provider_("cuda")
+, enable_trt_engine_cache_(false)
+, trt_engine_cache_path_("/tmp/deep_ros_ort_trt_cache")
 {
   // GPU components will be initialized in initialize() after parameters are loaded
 }
@@ -37,13 +39,10 @@ void OrtGpuBackendPlugin::initialize(rclcpp_lifecycle::LifecycleNode::SharedPtr
 {
   node_ = node;
 
-  // Declare parameters with defaults
-  node_->declare_parameter("Backend.device_id", 0);
-  node_->declare_parameter("Backend.execution_provider", "cuda");
-
-  // Read parameters
   device_id_ = node_->get_parameter("Backend.device_id").as_int();
   execution_provider_ = node_->get_parameter("Backend.execution_provider").as_string();
+  enable_trt_engine_cache_ = node_->get_parameter("Backend.trt_engine_cache_enable").as_bool();
+  trt_engine_cache_path_ = node_->get_parameter("Backend.trt_engine_cache_path").as_string();
 
   RCLCPP_INFO(
     node_->get_logger(),
@@ -88,7 +87,8 @@ bool OrtGpuBackendPlugin::initialize_gpu_components()
   }
 
   // Create GPU executor
-  executor_ = std::make_shared<OrtGpuBackendExecutor>(device_id_, execution_provider_, node_->get_logger());
+  executor_ = std::make_shared<OrtGpuBackendExecutor>(
+    device_id_, execution_provider_, node_->get_logger(), enable_trt_engine_cache_, trt_engine_cache_path_);
   if (!executor_) {
     RCLCPP_ERROR(node_->get_logger(), "Failed to create GPU backend executor");
     return false;
diff --git a/deep_ort_gpu_backend_plugin/test/test_ort_gpu_backend.cpp b/deep_ort_gpu_backend_plugin/test/test_ort_gpu_backend.cpp
index 6896eb3..8dd0102 100644
--- a/deep_ort_gpu_backend_plugin/test/test_ort_gpu_backend.cpp
+++ b/deep_ort_gpu_backend_plugin/test/test_ort_gpu_backend.cpp
@@ -54,7 +54,7 @@ bool is_cuda_available()
   try {
     // Try creating a simple CUDA execution provider to verify CUDA availability
     auto logger = rclcpp::get_logger("test_cuda_availability");
-    OrtGpuBackendExecutor test_executor(0, "cuda", logger);
+    OrtGpuBackendExecutor test_executor(0, "cuda", logger, false, "");
     return true;
   } catch (...) {
     return false;
@@ -122,7 +122,7 @@ TEST_CASE("OrtGpuBackendExecutor basic functionality", "[executor]")
   }
 
   auto logger = rclcpp::get_logger("test_executor");
-  OrtGpuBackendExecutor executor(0, "cuda", logger);
+  OrtGpuBackendExecutor executor(0, "cuda", logger, false, "");
   auto allocator = get_ort_gpu_cpu_allocator();
 
   // Test supported formats
@@ -222,7 +222,7 @@ TEST_CASE("TensorRT execution provider", "[tensorrt][!mayfail]")
 {
   try {
     auto logger = rclcpp::get_logger("test_tensorrt");
-    OrtGpuBackendExecutor executor(0, "tensorrt", logger);
+    OrtGpuBackendExecutor executor(0, "tensorrt", logger, false, "");
 
     REQUIRE(executor.get_device_id() == 0);
diff --git a/deep_tools/camera_sync/config/multi_camera_sync_params.yaml b/deep_tools/camera_sync/config/multi_camera_sync_params.yaml
index 6c633fb..a5feb66 100644
--- a/deep_tools/camera_sync/config/multi_camera_sync_params.yaml
+++ b/deep_tools/camera_sync/config/multi_camera_sync_params.yaml
@@ -24,10 +24,10 @@ multi_camera_sync:
     # Camera topics to synchronize
     # For raw images, use topics like: ["/camera1/image_raw", "/camera2/image_raw"]
    # For compressed images, use topics like: ["/camera1/image_raw/compressed", "/camera2/image_raw/compressed"]
-    camera_topics: ["/front/image_compressed", "/back/image_compressed", "/port/image_compressed"]
+    camera_topics: ["/CAM_FRONT/image_rect_compressed", "/CAM_FRONT_LEFT/image_rect_compressed", "/CAM_FRONT_RIGHT/image_rect_compressed"]
 
     # Optional names for the cameras (if not provided, will auto-generate camera_1, camera_2, etc.)
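+    # Each name pairs by position with the topic at the same index in camera_topics.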
-    camera_names: ["front", "back", "port"]
+    camera_names: ["front", "left", "right"]
 
     # Whether to use compressed images (sensor_msgs/CompressedImage) instead of raw (sensor_msgs/Image)
     # Set to true if your camera topics publish compressed images
diff --git a/deep_tools/camera_sync/include/camera_sync/multi_camera_sync_node.hpp b/deep_tools/camera_sync/include/camera_sync/multi_camera_sync_node.hpp
index 8a77696..82248e2 100644
--- a/deep_tools/camera_sync/include/camera_sync/multi_camera_sync_node.hpp
+++ b/deep_tools/camera_sync/include/camera_sync/multi_camera_sync_node.hpp
@@ -15,7 +15,13 @@
 #ifndef CAMERA_SYNC__MULTI_CAMERA_SYNC_NODE_HPP_
 #define CAMERA_SYNC__MULTI_CAMERA_SYNC_NODE_HPP_
 
-#include <cv_bridge/cv_bridge.h>
+#if __has_include(<cv_bridge/cv_bridge.hpp>)
+  #include <cv_bridge/cv_bridge.hpp>
+#elif __has_include(<cv_bridge/cv_bridge.h>)
+  #include <cv_bridge/cv_bridge.h>
+#else
+  #error "cv_bridge headers not found"
+#endif
 #include <memory>
 #include <string>
 #include <vector>
diff --git a/onnxruntime_gpu_vendor/env_hook/onnxruntime_library_path.sh.in b/onnxruntime_gpu_vendor/env_hook/onnxruntime_library_path.sh.in
new file mode 100644
index 0000000..394042e
--- /dev/null
+++ b/onnxruntime_gpu_vendor/env_hook/onnxruntime_library_path.sh.in
@@ -0,0 +1,2 @@
+# colcon hooks: set LD_LIBRARY_PATH
+ament_prepend_unique_value LD_LIBRARY_PATH "$AMENT_CURRENT_PREFIX/lib"
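
Note for downstream configs: `initialize()` now reads `Backend.trt_engine_cache_enable` and `Backend.trt_engine_cache_path` without declaring them, so the hosting lifecycle node must declare and supply these parameters. A minimal parameter-file sketch of what that could look like (the node name and values here are illustrative assumptions, not part of this diff):

```yaml
deep_object_detection_node:
  ros__parameters:
    Backend:
      device_id: 0
      execution_provider: "tensorrt"
      trt_engine_cache_enable: true   # reuse serialized engines across runs
      trt_engine_cache_path: "/tmp/deep_ros_ort_trt_cache"
```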