From 67ef765eff2f9118fb1242e0e79203e511a38d93 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Thu, 23 Apr 2026 15:44:34 -0700 Subject: [PATCH 1/6] Initial impl --- cmake/onnxruntime_providers_cuda_plugin.cmake | 7 + docs/cuda_plugin_ep/cuda_plugin_ep_design.md | 97 ++++++++++-- .../core/providers/cuda/plugin/cuda_ep.cc | 25 +++ .../core/providers/cuda/plugin/cuda_ep.h | 6 + .../cuda/plugin/cuda_profiler_plugin.cc | 148 ++++++++++++++++++ .../cuda/plugin/cuda_profiler_plugin.h | 43 +++++ .../transformers/test_cuda_plugin_ep.py | 95 +++++++++++ 7 files changed, 410 insertions(+), 11 deletions(-) create mode 100644 onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.cc create mode 100644 onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake index f7b9c7be7c765..6bb13a923c11e 100644 --- a/cmake/onnxruntime_providers_cuda_plugin.cmake +++ b/cmake/onnxruntime_providers_cuda_plugin.cmake @@ -265,6 +265,13 @@ target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE ${PROTOBUF_LIB} ) +if (onnxruntime_ENABLE_CUDA_PROFILING) + target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE CUDA::cupti) + # USE_CUDA is required by cupti_manager.h guards. ENABLE_CUDA_PROFILING activates + # the profiler implementation in cuda_profiler_plugin.cc. 
+ target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE USE_CUDA ENABLE_CUDA_PROFILING) +endif() + # Symbol visibility — only export CreateEpFactories and ReleaseEpFactory target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE ORT_API_MANUAL_INIT BUILD_CUDA_EP_AS_PLUGIN ORT_USE_EP_API_ADAPTERS=1 ONNX_ML=1 ONNX_NAMESPACE=onnx ONNX_USE_LITE_PROTO=1) diff --git a/docs/cuda_plugin_ep/cuda_plugin_ep_design.md b/docs/cuda_plugin_ep/cuda_plugin_ep_design.md index ba7b07b97535e..a7d33ffd62e30 100644 --- a/docs/cuda_plugin_ep/cuda_plugin_ep_design.md +++ b/docs/cuda_plugin_ep/cuda_plugin_ep_design.md @@ -831,29 +831,104 @@ include/onnxruntime/ep/ --- -## 14. Future Work +## 14. Profiling and Observability -1. **Profiling and observability** — ORT's generic plugin EP bridge now supports `OrtEp::CreateProfiler`, but the CUDA plugin EP does not implement that callback yet. Future work should add CUDA-plugin-specific profiler wiring, integrate CUDA/NVTX/CUPTI-based tracing where appropriate, and make plugin execution visible in the same profiling flows users already rely on for the bundled CUDA EP. +The CUDA plugin EP implements the `OrtEpProfilerImpl` interface (introduced in ORT 1.25 via [PR #27649](https://github.com/microsoft/onnxruntime/pull/27649)) to participate in ORT's profiling system. When profiling is enabled, GPU kernel executions (CUDA kernels, memory copies) captured by NVIDIA CUPTI appear alongside ORT's CPU-side events in the profiling output. -2. **Remaining stream/adapter parity for framework-style `Stream*` consumers** — Much of the broad `Stream*` gap has already been addressed: the plugin adapter now provides an `OrtStreamAdapter` / `PluginStreamShim` path for framework-style `Stream*` call sites, FFT is included, and quantization/diffusion kernels are no longer excluded as a class. Remaining work is narrower: +### 14.1 Architecture + +The profiling stack has three layers: + +1. 
**ORT Core** (`Profiler` in `profiler.cc`) — drives the profiling lifecycle. It calls `PluginExecutionProvider::GetProfiler()`, which invokes `OrtEp::CreateProfiler` on the plugin and wraps the returned `OrtEpProfilerImpl` in a `PluginEpProfiler` bridge. +2. **Bridge** (`PluginEpProfiler` in `ep_event_profiling.cc`) — adapts the C++ `EpProfiler` interface to the C `OrtEpProfilerImpl` callbacks. It handles clock synchronization (provides an epoch-independent offset in `StartProfiling`) and converts relative ORT event IDs to absolute epoch-based correlation IDs for `StartEvent`/`StopEvent`. +3. **Plugin-side profiler** (`CudaPluginEpProfiler` in `cuda_profiler_plugin.h/.cc`) — implements `OrtEpProfilerImpl` inside the plugin DLL. Delegates to `CUPTIManager` for GPU activity tracing. + +``` +ORT Profiler + └─ PluginEpProfiler (bridge, in ORT core) + └─ OrtEpProfilerImpl callbacks (C API boundary) + └─ CudaPluginEpProfiler (in plugin DLL) + └─ CUPTIManager singleton (in plugin DLL) + └─ CUPTI activity APIs (GPU tracing) +``` + +### 14.2 CUPTI Integration + +The plugin DLL links `CUDA::cupti` and compiles `cupti_manager.cc` when `onnxruntime_ENABLE_CUDA_PROFILING` is ON. The `CUPTIManager` singleton lives inside the plugin DLL, isolated from any in-tree CUDA EP in the same process. This is the expected isolation model for plugin EPs. + +CUPTI activities enabled: +- `CUPTI_ACTIVITY_KIND_RUNTIME` — CUDA runtime API calls +- `CUPTI_ACTIVITY_KIND_DRIVER` — CUDA driver API calls +- `CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL` — GPU kernel execution +- `CUPTI_ACTIVITY_KIND_MEMCPY` — device memory transfers +- `CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION` — maps GPU activities to ORT event correlation IDs + +### 14.3 Correlation ID Flow + +The plugin API's `StartEvent`/`StopEvent` receive **absolute epoch-based** correlation IDs (converted by the `PluginEpProfiler` bridge from ORT's relative event IDs). 
These are pushed directly to CUPTI's external correlation stack via `cuptiActivityPushExternalCorrelationId`, allowing CUPTI to tag GPU activities with the corresponding ORT event. When `StopEvent` is called, the correlation ID is popped. This matches the pattern used by the in-tree CUDA EP's `GPUTracerManager::PushCorrelation`/`PopCorrelation`. + +### 14.4 Event Collection (EndProfiling) + +When ORT calls `EndProfiling`: +1. CUPTI activity buffers are flushed (`cuptiActivityFlushAll`). +2. GPU activity records are processed — kernel names, timestamps, durations, and stream/grid metadata are extracted. +3. Events are converted to `Ort::ProfilingEvent` instances with `OrtProfilingEventCategory_KERNEL`. +4. Events are appended to the `OrtProfilingEventsContainer` via `AddEvents`. + +The plugin does **not** perform the post-hoc merge/sort that the in-tree `GPUProfilerBase::EndProfiling` does. The plugin API is append-only; the `PluginEpProfiler` bridge on the ORT side handles merging EP events into the global event timeline. 
+ +### 14.5 Design Differences from In-Tree CUDA EP Profiler + +| Aspect | In-tree CUDA EP | CUDA Plugin EP | +|--------|----------------|----------------| +| Event merge | `GPUProfilerBase::MergeEvents` interleaves GPU events into ORT's array (has known sort-order bug) | Append-only; ORT-side bridge merges | +| Correlation IDs | Relative → absolute conversion in `GPUTracerManager::PushCorrelation` | Bridge provides absolute IDs directly; plugin pushes to CUPTI as-is | +| `StopEvent` metadata | Ignored (just pops correlation) | ORT event metadata available; currently unused, can annotate GPU events in future | +| Singleton scope | Process-wide `CUPTIManager` in main ORT DLL | DLL-local `CUPTIManager` in plugin (process isolation) | + +### 14.6 Build Configuration + +CUPTI profiling is conditional: +- **CMake flag**: `onnxruntime_ENABLE_CUDA_PROFILING=ON` +- **Compile definition**: `ENABLE_CUDA_PROFILING` added to the plugin target +- **Link**: `CUDA::cupti` linked to `onnxruntime_providers_cuda_plugin` +- **Source**: `cupti_manager.cc` compiled into the plugin + +When profiling is disabled (default), `CudaEp::CreateProfiler` is set to `nullptr` and no CUPTI code is compiled. + +### 14.7 Files + +| File | Role | +|------|------| +| `plugin/cuda_profiler_plugin.h` | `CudaPluginEpProfiler` struct definition | +| `plugin/cuda_profiler_plugin.cc` | Profiler callback implementations | +| `plugin/cuda_ep.h` | `CreateProfilerImpl` declaration | +| `plugin/cuda_ep.cc` | `CreateProfiler` callback wiring | +| `cmake/onnxruntime_providers_cuda_plugin.cmake` | Conditional CUPTI linkage | + +--- + +## 15. Future Work + +1. **Remaining stream/adapter parity for framework-style `Stream*` consumers** — Much of the broad `Stream*` gap has already been addressed: the plugin adapter now provides an `OrtStreamAdapter` / `PluginStreamShim` path for framework-style `Stream*` call sites, FFT is included, and quantization/diffusion kernels are no longer excluded as a class. 
Remaining work is narrower: - Continue using `Stream(context)` / `GetOrtStream(context)` patterns for migrated kernels rather than adding raw-stream-only forks. - Audit still-excluded directories that require more than a stream handle: `contrib_ops/cuda/llm/*`, `contrib_ops/cuda/transformers/*`, and `contrib_ops/cuda/collective/*`. - For each re-inclusion pass, add or extend focused plugin tests before removing the CMake exclusion. -3. **Contrib LLM migration pass** — Still open. The core CUDA LLM attention path is now adapter-safe, but `contrib_ops/cuda/llm/*` remains excluded in `cmake/onnxruntime_providers_cuda_plugin.cmake`. The remaining work is a dedicated contrib-LLM adapter pass: resolve any plugin build failures under `ORT_USE_EP_API_ADAPTERS`, keep the normal stream/scratch-buffer helpers, remove the `contrib_ops/cuda/llm/*` CMake filters, and add focused tests or parity-report coverage for the first re-included kernels. +2. **Contrib LLM migration pass** — Still open. The core CUDA LLM attention path is now adapter-safe, but `contrib_ops/cuda/llm/*` remains excluded in `cmake/onnxruntime_providers_cuda_plugin.cmake`. The remaining work is a dedicated contrib-LLM adapter pass: resolve any plugin build failures under `ORT_USE_EP_API_ADAPTERS`, keep the normal stream/scratch-buffer helpers, remove the `contrib_ops/cuda/llm/*` CMake filters, and add focused tests or parity-report coverage for the first re-included kernels. -4. **Tunable ops** — Implement a plugin-side `ITuningContext` and remove the `ORT_USE_EP_API_ADAPTERS` guards in `matmul.cc`/`gemm.cc` so the plugin can recover runtime kernel selection and profiling-based tuning behavior. +3. **Tunable ops** — Implement a plugin-side `ITuningContext` and remove the `ORT_USE_EP_API_ADAPTERS` guards in `matmul.cc`/`gemm.cc` so the plugin can recover runtime kernel selection and profiling-based tuning behavior. -5. 
**TensorSeq and additional C API coverage** — Add enough sequence/tensor-sequence support to unblock `sequence_op.cc` (the last remaining TensorSeq-dependent file), and extend the ORT C API where needed for remaining framework-style attribute accessors such as string-array attributes used by RNN kernels. Note: `identity_op.cc` is now included in the plugin build — its TensorSeq code path is guarded by `#ifndef BUILD_CUDA_EP_AS_PLUGIN` and opset 14+ registrations use `AllFixedSizeTensorTypes()` (Tensor-only) instead of `AllFixedSizeTensorAndSequenceTensorTypes()`. +4. **TensorSeq and additional C API coverage** — Add enough sequence/tensor-sequence support to unblock `sequence_op.cc` (the last remaining TensorSeq-dependent file), and extend the ORT C API where needed for remaining framework-style attribute accessors such as string-array attributes used by RNN kernels. Note: `identity_op.cc` is now included in the plugin build — its TensorSeq code path is guarded by `#ifndef BUILD_CUDA_EP_AS_PLUGIN` and opset 14+ registrations use `AllFixedSizeTensorTypes()` (Tensor-only) instead of `AllFixedSizeTensorAndSequenceTensorTypes()`. -6. **Remaining contrib exclusions** — Remaining contrib exclusions are: `shrunken_gather.cc` (training), `transformers/*` (subgraph), `aten_ops/*` (ATen), `collective/*` (NCCL), and `llm/*` (contrib LLM pass). +5. **Remaining contrib exclusions** — Remaining contrib exclusions are: `shrunken_gather.cc` (training), `transformers/*` (subgraph), `aten_ops/*` (ATen), `collective/*` (NCCL), and `llm/*` (contrib LLM pass). -7. **CI integration and targeted benchmarking** — Partially complete. Basic CUDA plugin build + `test_cuda_plugin_ep.py` coverage now exists in Linux and Windows plugin CI workflows. Remaining work is perf-oriented and feature-specific validation: add targeted benchmarks or perf gates for graph replay and allocator behavior, and extend CI once profiling and tunable-op support land. +6. 
**CI integration and targeted benchmarking** — Partially complete. Basic CUDA plugin build + `test_cuda_plugin_ep.py` coverage now exists in Linux and Windows plugin CI workflows. Remaining work is perf-oriented and feature-specific validation: add targeted benchmarks or perf gates for graph replay and allocator behavior, and extend CI once profiling and tunable-op support land. -8. **NHWC cleanup and hardening** — Partially complete. Runtime NHWC callbacks, second-pass capability handling for pre-assigned NHWC nodes, cached provider-config access, and focused Conv/BatchNormalization/Pool tests are in place. Remaining work is the cleanup described in [Section 5.3.1](#531-nhwc-layout-transformation-support): unify the conversion allowlist with the bundled CUDA EP, improve internal-domain kernel-miss diagnostics, and add stronger structural assertions that plugin-backed NHWC execution was actually selected. +7. **NHWC cleanup and hardening** — Partially complete. Runtime NHWC callbacks, second-pass capability handling for pre-assigned NHWC nodes, cached provider-config access, and focused Conv/BatchNormalization/Pool tests are in place. Remaining work is the cleanup described in [Section 5.3.1](#531-nhwc-layout-transformation-support): unify the conversion allowlist with the bundled CUDA EP, improve internal-domain kernel-miss diagnostics, and add stronger structural assertions that plugin-backed NHWC execution was actually selected. -9. **OpSchema-validated kernel registration after PR #27713** — PR #27713 has already landed, so the `OrtEpApi` and C++ wrappers for querying ONNX operator schemas are available (see [Section 3.5.1](#351-type-constraint-names-and-opschema-access)). The remaining work is plugin-side adoption: +8. **OpSchema-validated kernel registration after PR #27713** — PR #27713 has already landed, so the `OrtEpApi` and C++ wrappers for querying ONNX operator schemas are available (see [Section 3.5.1](#351-type-constraint-names-and-opschema-access)). 
The remaining work is plugin-side adoption: **A. Registration-time validation pass** @@ -881,7 +956,7 @@ include/onnxruntime/ep/ | `cuda_ep.cc` / `GetCapabilityImpl()` | (Optional) Add schema-based diagnostic when `EpGraphSupportInfo_LookUpKernel` returns nullptr | | `test_cuda_plugin_ep.py` | Add a validation stage that exercises schema-validated registration | -10. **Resource accounting and annotation-based partitioning after PR #27595** — PR #27595 has already landed, so ORT now has framework-side resource accounting and layering annotations. The remaining CUDA plugin work is to bridge those capabilities through the plugin EP API and plugin capability implementation. +9. **Resource accounting and annotation-based partitioning after PR #27595** — PR #27595 has already landed, so ORT now has framework-side resource accounting and layering annotations. The remaining CUDA plugin work is to bridge those capabilities through the plugin EP API and plugin capability implementation. **A. Resource accounting** diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc index 7c2970c468216..68ad66a11b1d1 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc @@ -134,6 +134,13 @@ CudaEp::CudaEp(CudaEpFactory& factory, const Config& config, const OrtLogger& lo // Resource accounting — allows ORT to query available device memory for budget enforcement GetAvailableResource = GetAvailableResourceImpl; + // Profiling — CUPTI-based GPU activity tracing when profiling is enabled at build time +#if defined(ENABLE_CUDA_PROFILING) + CreateProfiler = CreateProfilerImpl; +#else + CreateProfiler = nullptr; +#endif + const OrtApi& ort_api = factory_.GetOrtApi(); Ort::Status log_status(ort_api.Logger_LogMessage(&logger_, ORT_LOGGING_LEVEL_INFO, "CUDA Plugin EP created", @@ -651,5 +658,23 @@ OrtStatus* ORT_API_CALL CudaEp::GetAvailableResourceImpl( EXCEPTION_TO_STATUS_END 
 }
+#if defined(ENABLE_CUDA_PROFILING)
+/*static*/
+OrtStatus* ORT_API_CALL CudaEp::CreateProfilerImpl(
+    OrtEp* this_ptr, OrtEpProfilerImpl** profiler) noexcept {
+  EXCEPTION_TO_STATUS_BEGIN
+
+  if (profiler == nullptr) {
+    return Ort::GetApi().CreateStatus(ORT_INVALID_ARGUMENT, "`profiler` must not be null");
+  }
+
+  auto* ep = static_cast<CudaEp*>(this_ptr);
+  *profiler = new CudaPluginEpProfiler(ep->factory_.GetEpApi());
+  return nullptr;
+
+  EXCEPTION_TO_STATUS_END
+}
+#endif  // defined(ENABLE_CUDA_PROFILING)
+
 }  // namespace cuda_plugin
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep.h b/onnxruntime/core/providers/cuda/plugin/cuda_ep.h
index 502902c53070b..faaeebf9ceae0 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_ep.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep.h
@@ -5,6 +5,7 @@
 #include "cuda_plugin_utils.h"
 #include "cuda_graph_plugin.h"
+#include "cuda_profiler_plugin.h"
 #include "ep/adapters.h"
 #include
@@ -91,6 +92,11 @@ class CudaEp : public onnxruntime::ep::adapter::Ep {
   static OrtStatus* ORT_API_CALL GetAvailableResourceImpl(
       const OrtEp* this_ptr, OrtResourceCount* available) noexcept;
 
+#if defined(ENABLE_CUDA_PROFILING)
+  static OrtStatus* ORT_API_CALL CreateProfilerImpl(
+      OrtEp* this_ptr, OrtEpProfilerImpl** profiler) noexcept;
+#endif
+
   /// Helper to parse the graph annotation ID from run options.
   CudaGraphAnnotation_t GetGraphAnnotationId(const OrtRunOptions* run_options) const;
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.cc
new file mode 100644
index 0000000000000..4e8dc039a794c
--- /dev/null
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.cc
@@ -0,0 +1,148 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "cuda_profiler_plugin.h"
+
+#if defined(ENABLE_CUDA_PROFILING)
+
+#include <chrono>
+#include <map>
+#include <vector>
+
+namespace onnxruntime {
+namespace cuda_plugin {
+
+CudaPluginEpProfiler::CudaPluginEpProfiler(const OrtEpApi& api)
+    : OrtEpProfilerImpl{}, ep_api(api) {
+  ort_version_supported = ORT_API_VERSION;
+  Release = ReleaseImpl;
+  StartProfiling = StartProfilingImpl;
+  EndProfiling = EndProfilingImpl;
+  StartEvent = StartEventImpl;
+  StopEvent = StopEventImpl;
+
+  auto& manager = profiling::CUPTIManager::GetInstance();
+  client_handle_ = manager.RegisterClient();
+}
+
+CudaPluginEpProfiler::~CudaPluginEpProfiler() {
+  auto& manager = profiling::CUPTIManager::GetInstance();
+  manager.DeregisterClient(client_handle_);
+}
+
+/*static*/
+void ORT_API_CALL CudaPluginEpProfiler::ReleaseImpl(OrtEpProfilerImpl* this_ptr) noexcept {
+  delete static_cast<CudaPluginEpProfiler*>(this_ptr);
+}
+
+/*static*/
+OrtStatus* ORT_API_CALL CudaPluginEpProfiler::StartProfilingImpl(
+    OrtEpProfilerImpl* this_ptr,
+    int64_t ep_profiling_start_offset_ns) noexcept {
+  EXCEPTION_TO_STATUS_BEGIN
+  auto* self = static_cast<CudaPluginEpProfiler*>(this_ptr);
+
+  self->ep_profiling_start_offset_ns_ = ep_profiling_start_offset_ns;
+  self->start_time_point_ = TimePoint::clock::now();
+
+  // Reconstruct the approximate ORT profiling start time so that GPU event
+  // timestamps (computed by CUPTIManager::Consume) are relative to ORT's start.
+  self->ort_profiling_start_ = self->start_time_point_ -
+                               std::chrono::duration_cast<TimePoint::duration>(
+                                   std::chrono::nanoseconds(ep_profiling_start_offset_ns));
+
+  auto& manager = profiling::CUPTIManager::GetInstance();
+  manager.StartLogging();
+
+  return nullptr;
+  EXCEPTION_TO_STATUS_END
+}
+
+/*static*/
+OrtStatus* ORT_API_CALL CudaPluginEpProfiler::StartEventImpl(
+    OrtEpProfilerImpl* this_ptr,
+    uint64_t ort_event_correlation_id) noexcept {
+  EXCEPTION_TO_STATUS_BEGIN
+  auto* self = static_cast<CudaPluginEpProfiler*>(this_ptr);
+
+  // The bridge provides an absolute epoch-based correlation ID.
Pass TimePoint{}
+  // (epoch) so PushCorrelation adds zero offset and the unique_cid equals the
+  // correlation ID directly. This avoids double-adding the epoch offset that
+  // GPUTracerManager::PushCorrelation normally computes.
+  auto& manager = profiling::CUPTIManager::GetInstance();
+  manager.PushCorrelation(self->client_handle_, ort_event_correlation_id, TimePoint{});
+
+  return nullptr;
+  EXCEPTION_TO_STATUS_END
+}
+
+/*static*/
+OrtStatus* ORT_API_CALL CudaPluginEpProfiler::StopEventImpl(
+    OrtEpProfilerImpl* /*this_ptr*/,
+    uint64_t /*ort_event_correlation_id*/,
+    const OrtProfilingEvent* /*ort_event*/) noexcept {
+  EXCEPTION_TO_STATUS_BEGIN
+
+  auto& manager = profiling::CUPTIManager::GetInstance();
+  manager.PopCorrelation();
+
+  return nullptr;
+  EXCEPTION_TO_STATUS_END
+}
+
+/*static*/
+OrtStatus* ORT_API_CALL CudaPluginEpProfiler::EndProfilingImpl(
+    OrtEpProfilerImpl* this_ptr,
+    OrtProfilingEventsContainer* c_events_container) noexcept {
+  EXCEPTION_TO_STATUS_BEGIN
+  auto* self = static_cast<CudaPluginEpProfiler*>(this_ptr);
+
+  auto& manager = profiling::CUPTIManager::GetInstance();
+
+  // Consume GPU events. Timestamps are computed relative to ort_profiling_start_
+  // by CUPTIManager::ProcessActivityBuffers, so they match ORT's timeline.
+  std::map<uint64_t, profiling::Events> event_map;
+  manager.Consume(self->client_handle_, self->ort_profiling_start_, event_map);
+
+  // Flatten all GPU events and convert to OrtProfilingEvent.
+  std::vector<Ort::ProfilingEvent> events;
+  for (auto& [correlation_id, event_list] : event_map) {
+    for (const auto& record : event_list) {
+      // Build parallel key/value arrays to use the raw-pointer ProfilingEvent
+      // constructor, avoiding a copy from InlinedHashMap to std::unordered_map.
+      InlinedVector<const char*> arg_keys;
+      InlinedVector<const char*> arg_values;
+      arg_keys.reserve(record.args.size());
+      arg_values.reserve(record.args.size());
+      for (const auto& [k, v] : record.args) {
+        arg_keys.push_back(k.c_str());
+        arg_values.push_back(v.c_str());
+      }
+
+      events.emplace_back(
+          OrtProfilingEventCategory_KERNEL,
+          record.pid,
+          record.tid,
+          record.name.c_str(),
+          record.ts,
+          record.dur,
+          arg_keys.data(),
+          arg_values.data(),
+          arg_keys.size());
+    }
+  }
+
+  if (!events.empty()) {
+    Ort::UnownedProfilingEventsContainer events_container(c_events_container);
+    Ort::Status status = events_container.AddEvents(events);
+    return status.release();
+  }
+
+  return nullptr;
+  EXCEPTION_TO_STATUS_END
+}
+
+}  // namespace cuda_plugin
+}  // namespace onnxruntime
+
+#endif  // defined(ENABLE_CUDA_PROFILING)
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h
new file mode 100644
index 0000000000000..061dc583bd00e
--- /dev/null
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h
@@ -0,0 +1,43 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#if defined(ENABLE_CUDA_PROFILING)
+
+#include "cuda_plugin_utils.h"
+#include "cupti_manager.h"
+#include "core/common/gpu_profiler_common.h"
+
+namespace onnxruntime {
+namespace cuda_plugin {
+
+/// Plugin-side implementation of OrtEpProfilerImpl for CUDA.
+/// Delegates to CUPTIManager (within the plugin DLL) for GPU activity tracing
+/// and implements the C callback interface expected by ORT's PluginEpProfiler bridge.
+struct CudaPluginEpProfiler : OrtEpProfilerImpl { + const OrtEpApi& ep_api; + uint64_t client_handle_ = 0; + int64_t ep_profiling_start_offset_ns_ = 0; + TimePoint start_time_point_; + TimePoint ort_profiling_start_; + + explicit CudaPluginEpProfiler(const OrtEpApi& api); + ~CudaPluginEpProfiler(); + + static void ORT_API_CALL ReleaseImpl(OrtEpProfilerImpl* this_ptr) noexcept; + static OrtStatus* ORT_API_CALL StartProfilingImpl(OrtEpProfilerImpl* this_ptr, + int64_t ep_profiling_start_offset_ns) noexcept; + static OrtStatus* ORT_API_CALL StartEventImpl(OrtEpProfilerImpl* this_ptr, + uint64_t ort_event_correlation_id) noexcept; + static OrtStatus* ORT_API_CALL StopEventImpl(OrtEpProfilerImpl* this_ptr, + uint64_t ort_event_correlation_id, + const OrtProfilingEvent* ort_event) noexcept; + static OrtStatus* ORT_API_CALL EndProfilingImpl(OrtEpProfilerImpl* this_ptr, + OrtProfilingEventsContainer* events_container) noexcept; +}; + +} // namespace cuda_plugin +} // namespace onnxruntime + +#endif // defined(ENABLE_CUDA_PROFILING) diff --git a/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py b/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py index 7b05d364309d9..def500450b234 100644 --- a/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py +++ b/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py @@ -1,6 +1,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. +import json import os import tempfile import unittest @@ -2379,6 +2380,100 @@ def test_iobinding_matmul(self): if os.path.exists(model_path): os.remove(model_path) + # ---- Profiling tests ---- + + def _run_profiling_test(self): + """Run a model with session-level profiling enabled and verify the JSON output. + + When CUDA profiling is enabled (ENABLE_CUDA_PROFILING), also verify + that GPU kernel events appear in the profile with expected metadata + (category Kernel, stream, block_x). 
+ """ + target_device = get_cuda_plugin_device() + + with tempfile.NamedTemporaryFile(suffix=".onnx", delete=False) as tmp: + model_path = tmp.name + profile_file = None + try: + create_matmul_model(model_path) + sess_options = _create_session_options() + sess_options.add_provider_for_devices([target_device], {}) + + profile_prefix = os.path.join(tempfile.gettempdir(), "cuda_plugin_ep_profiling_test") + sess_options.enable_profiling = True + sess_options.profile_file_prefix = profile_prefix + + sess = onnxrt.InferenceSession(model_path, sess_options=sess_options) + + assigned_nodes, assignment_info = _get_assigned_nodes(sess, CUDA_PLUGIN_EP_NAME) + self.assertTrue( + assigned_nodes, + f"{CUDA_PLUGIN_EP_NAME} was assigned no nodes. " + f"Assignments: {_format_assignment_summary(assignment_info)}", + ) + + a = np.random.rand(3, 4).astype(np.float32) + b = np.random.rand(4, 5).astype(np.float32) + sess.run(None, {"A": a, "B": b}) + + profile_file = sess.end_profiling() + self.assertTrue(profile_file, "No profile file returned") + self.assertTrue(os.path.exists(profile_file), f"Profile file not found: {profile_file}") + + with open(profile_file, "r") as f: + profile_data = json.load(f) + + self.assertIsInstance(profile_data, list) + self.assertGreater(len(profile_data), 0, "Profile JSON is empty") + + # Every event entry must have standard tracing fields. + required_keys = {"pid", "dur", "ts", "ph", "name", "args"} + for entry in profile_data: + if not isinstance(entry, dict): + continue + if "name" not in entry: + continue + for key in required_keys: + self.assertIn(key, entry, f"Missing '{key}' in profile entry: {entry}") + + # Check for GPU kernel events. These only appear when the build has + # ENABLE_CUDA_PROFILING=ON. The test is written to pass either way: + # without CUDA profiling the events list simply won't contain Kernel + # entries, and the test validates the basic profiling infrastructure. 
+ kernel_events = [ + e for e in profile_data + if isinstance(e, dict) and e.get("cat") == "Kernel" + ] + has_cuda_profiling = len(kernel_events) > 0 + + if has_cuda_profiling: + # Validate GPU kernel event metadata. + for event in kernel_events: + self.assertIn("ts", event) + self.assertIn("dur", event) + self.assertGreaterEqual(event["dur"], 0) + args = event.get("args", {}) + # CUPTI events include stream and block dimensions. + self.assertIn("stream", args, f"GPU kernel event missing 'stream': {event}") + self.assertIn("block_x", args, f"GPU kernel event missing 'block_x': {event}") + else: + # No GPU kernel events — CUDA profiling is likely not enabled. + # The test still validates the basic profiling JSON structure above. + print( + "Note: No GPU Kernel events found in profile. " + "CUDA profiling may not be enabled in this build." + ) + + finally: + if os.path.exists(model_path): + os.remove(model_path) + if profile_file and os.path.exists(profile_file): + os.remove(profile_file) + + def test_session_profiling(self): + """Verify session-level profiling produces valid output with the CUDA Plugin EP.""" + self._run_profiling_test() + if __name__ == "__main__": unittest.main() From bfd9445b953faa40048cdca4c76d47c9adf4bd27 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Thu, 23 Apr 2026 19:07:19 -0700 Subject: [PATCH 2/6] Address feedback --- docs/cuda_plugin_ep/cuda_plugin_ep_design.md | 4 ++-- onnxruntime/core/providers/cuda/plugin/cuda_ep.cc | 5 ++++- .../providers/cuda/plugin/cuda_profiler_plugin.cc | 3 ++- .../providers/cuda/plugin/cuda_profiler_plugin.h | 4 ++-- .../test/python/transformers/test_cuda_plugin_ep.py | 12 +++--------- 5 files changed, 13 insertions(+), 15 deletions(-) diff --git a/docs/cuda_plugin_ep/cuda_plugin_ep_design.md b/docs/cuda_plugin_ep/cuda_plugin_ep_design.md index a7d33ffd62e30..09d117ba2aac1 100644 --- a/docs/cuda_plugin_ep/cuda_plugin_ep_design.md +++ b/docs/cuda_plugin_ep/cuda_plugin_ep_design.md @@ -875,13 +875,13 @@ When 
ORT calls `EndProfiling`: 3. Events are converted to `Ort::ProfilingEvent` instances with `OrtProfilingEventCategory_KERNEL`. 4. Events are appended to the `OrtProfilingEventsContainer` via `AddEvents`. -The plugin does **not** perform the post-hoc merge/sort that the in-tree `GPUProfilerBase::EndProfiling` does. The plugin API is append-only; the `PluginEpProfiler` bridge on the ORT side handles merging EP events into the global event timeline. +The plugin does **not** perform the post-hoc merge/sort that the in-tree `GPUProfilerBase::EndProfiling` does. The plugin API is append-only, and the `PluginEpProfiler` bridge on the ORT side likewise appends EP events to ORT's profiling event collection without merge/sort by timestamp or correlation ID. Any ordering or interleaving into a global timeline is handled by downstream trace consumers. ### 14.5 Design Differences from In-Tree CUDA EP Profiler | Aspect | In-tree CUDA EP | CUDA Plugin EP | |--------|----------------|----------------| -| Event merge | `GPUProfilerBase::MergeEvents` interleaves GPU events into ORT's array (has known sort-order bug) | Append-only; ORT-side bridge merges | +| Event merge | `GPUProfilerBase::MergeEvents` interleaves GPU events into ORT's array (has known sort-order bug) | Append-only; ORT-side bridge appends only, and trace consumers handle ordering | | Correlation IDs | Relative → absolute conversion in `GPUTracerManager::PushCorrelation` | Bridge provides absolute IDs directly; plugin pushes to CUPTI as-is | | `StopEvent` metadata | Ignored (just pops correlation) | ORT event metadata available; currently unused, can annotate GPU events in future | | Singleton scope | Process-wide `CUPTIManager` in main ORT DLL | DLL-local `CUPTIManager` in plugin (process isolation) | diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc index 68ad66a11b1d1..df6ce15f2376d 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc +++ 
b/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc
@@ -668,8 +668,11 @@ OrtStatus* ORT_API_CALL CudaEp::CreateProfilerImpl(
     return Ort::GetApi().CreateStatus(ORT_INVALID_ARGUMENT, "`profiler` must not be null");
   }
 
+  *profiler = nullptr;
+
   auto* ep = static_cast<CudaEp*>(this_ptr);
-  *profiler = new CudaPluginEpProfiler(ep->factory_.GetEpApi());
+  auto profiler_impl = std::make_unique<CudaPluginEpProfiler>(ep->factory_.GetEpApi());
+  *profiler = profiler_impl.release();
   return nullptr;
 
   EXCEPTION_TO_STATUS_END
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.cc
index 4e8dc039a794c..33622a52a6c42 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.cc
@@ -106,7 +106,8 @@ OrtStatus* ORT_API_CALL CudaPluginEpProfiler::EndProfilingImpl(
 
   // Flatten all GPU events and convert to OrtProfilingEvent.
   std::vector<Ort::ProfilingEvent> events;
-  for (auto& [correlation_id, event_list] : event_map) {
+  for (auto& kv : event_map) {
+    auto& event_list = kv.second;
     for (const auto& record : event_list) {
       // Build parallel key/value arrays to use the raw-pointer ProfilingEvent
       // constructor, avoiding a copy from InlinedHashMap to std::unordered_map.
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h index 061dc583bd00e..89e786b462eab 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h @@ -3,7 +3,7 @@ #pragma once -#if defined(ENABLE_CUDA_PROFILING) +#if defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING) #include "cuda_plugin_utils.h" #include "cupti_manager.h" @@ -40,4 +40,4 @@ struct CudaPluginEpProfiler : OrtEpProfilerImpl { } // namespace cuda_plugin } // namespace onnxruntime -#endif // defined(ENABLE_CUDA_PROFILING) +#endif // defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING) diff --git a/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py b/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py index def500450b234..15ff77d29dd07 100644 --- a/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py +++ b/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py @@ -2420,7 +2420,7 @@ def _run_profiling_test(self): self.assertTrue(profile_file, "No profile file returned") self.assertTrue(os.path.exists(profile_file), f"Profile file not found: {profile_file}") - with open(profile_file, "r") as f: + with open(profile_file) as f: profile_data = json.load(f) self.assertIsInstance(profile_data, list) @@ -2440,10 +2440,7 @@ def _run_profiling_test(self): # ENABLE_CUDA_PROFILING=ON. The test is written to pass either way: # without CUDA profiling the events list simply won't contain Kernel # entries, and the test validates the basic profiling infrastructure. 
- kernel_events = [ - e for e in profile_data - if isinstance(e, dict) and e.get("cat") == "Kernel" - ] + kernel_events = [e for e in profile_data if isinstance(e, dict) and e.get("cat") == "Kernel"] has_cuda_profiling = len(kernel_events) > 0 if has_cuda_profiling: @@ -2459,10 +2456,7 @@ def _run_profiling_test(self): else: # No GPU kernel events — CUDA profiling is likely not enabled. # The test still validates the basic profiling JSON structure above. - print( - "Note: No GPU Kernel events found in profile. " - "CUDA profiling may not be enabled in this build." - ) + print("Note: No GPU Kernel events found in profile. CUDA profiling may not be enabled in this build.") finally: if os.path.exists(model_path): From cae16bb0db4b3b6172987fc8d38acca59e267202 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 27 Apr 2026 12:45:22 -0700 Subject: [PATCH 3/6] address review comments --- cmake/onnxruntime_providers_cuda_plugin.cmake | 4 +--- docs/cuda_plugin_ep/cuda_plugin_ep_design.md | 1 + .../core/providers/cuda/cupti_manager.cc | 4 ++-- .../core/providers/cuda/cupti_manager.h | 9 ++------ .../cuda/plugin/cuda_profiler_plugin.h | 4 ++-- .../transformers/test_cuda_plugin_ep.py | 22 ++++++++++++++----- 6 files changed, 24 insertions(+), 20 deletions(-) diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake index 6bb13a923c11e..5275a273e6012 100644 --- a/cmake/onnxruntime_providers_cuda_plugin.cmake +++ b/cmake/onnxruntime_providers_cuda_plugin.cmake @@ -267,9 +267,7 @@ target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE if (onnxruntime_ENABLE_CUDA_PROFILING) target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE CUDA::cupti) - # USE_CUDA is required by cupti_manager.h guards. ENABLE_CUDA_PROFILING activates - # the profiler implementation in cuda_profiler_plugin.cc. 
- target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE USE_CUDA ENABLE_CUDA_PROFILING) + target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE ENABLE_CUDA_PROFILING) endif() # Symbol visibility — only export CreateEpFactories and ReleaseEpFactory diff --git a/docs/cuda_plugin_ep/cuda_plugin_ep_design.md b/docs/cuda_plugin_ep/cuda_plugin_ep_design.md index 09d117ba2aac1..15f8188505b37 100644 --- a/docs/cuda_plugin_ep/cuda_plugin_ep_design.md +++ b/docs/cuda_plugin_ep/cuda_plugin_ep_design.md @@ -884,6 +884,7 @@ The plugin does **not** perform the post-hoc merge/sort that the in-tree `GPUPro | Event merge | `GPUProfilerBase::MergeEvents` interleaves GPU events into ORT's array (has known sort-order bug) | Append-only; ORT-side bridge appends only, and trace consumers handle ordering | | Correlation IDs | Relative → absolute conversion in `GPUTracerManager::PushCorrelation` | Bridge provides absolute IDs directly; plugin pushes to CUPTI as-is | | `StopEvent` metadata | Ignored (just pops correlation) | ORT event metadata available; currently unused, can annotate GPU events in future | +| GPU→ORT event linkage | Implicit via CUPTI external correlation IDs merged into timeline | GPU events carry only CUPTI metadata (`stream`, `grid_*`, `block_*`); no ORT correlation or parent identifier is attached. Downstream consumers must relate GPU kernels to ORT nodes via timestamp proximity. 
This is a known limitation; future work may attach `correlation_id` or parent event name via `StopEvent`'s `OrtProfilingEvent` parameter | | Singleton scope | Process-wide `CUPTIManager` in main ORT DLL | DLL-local `CUPTIManager` in plugin (process isolation) | ### 14.6 Build Configuration diff --git a/onnxruntime/core/providers/cuda/cupti_manager.cc b/onnxruntime/core/providers/cuda/cupti_manager.cc index a2c6daea1e0cd..6ce129bce4fdb 100644 --- a/onnxruntime/core/providers/cuda/cupti_manager.cc +++ b/onnxruntime/core/providers/cuda/cupti_manager.cc @@ -8,7 +8,7 @@ namespace onnxruntime { namespace profiling { -#if defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING) +#if defined(ENABLE_CUDA_PROFILING) static inline std::string GetMemcpyKindString(CUpti_ActivityMemcpyKind kind) { switch (kind) { @@ -179,7 +179,7 @@ void CUPTIAPI CUPTIManager::BufferCompleted(CUcontext, uint32_t, uint8_t* buffer ProfilerActivityBuffer::CreateFromPreallocatedBuffer(std::move(buffer_ptr), valid_size)); } -#endif /* defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING) */ +#endif /* defined(ENABLE_CUDA_PROFILING) */ } // namespace profiling } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/cupti_manager.h b/onnxruntime/core/providers/cuda/cupti_manager.h index cca78dcec5ea5..0977023981ce6 100644 --- a/onnxruntime/core/providers/cuda/cupti_manager.h +++ b/onnxruntime/core/providers/cuda/cupti_manager.h @@ -3,7 +3,7 @@ #pragma once -#if defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING) +#if defined(ENABLE_CUDA_PROFILING) #include #include @@ -11,10 +11,6 @@ #include -// Do not move the check for CUDA_VERSION above #include -// the macros are defined in cupti.h -#if defined(USE_CUDA) - #include "core/common/gpu_profiler_common.h" #include "core/common/inlined_containers.h" @@ -51,5 +47,4 @@ class CUPTIManager : public GPUTracerManager { } /* namespace profiling */ } /* namespace onnxruntime */ -#endif /* #if defined(USE_CUDA) */ -#endif /* #if defined (USE_CUDA) && 
defined(ENABLE_CUDA_PROFILING) */ +#endif /* #if defined(ENABLE_CUDA_PROFILING) */ diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h index 89e786b462eab..061dc583bd00e 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h @@ -3,7 +3,7 @@ #pragma once -#if defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING) +#if defined(ENABLE_CUDA_PROFILING) #include "cuda_plugin_utils.h" #include "cupti_manager.h" @@ -40,4 +40,4 @@ struct CudaPluginEpProfiler : OrtEpProfilerImpl { } // namespace cuda_plugin } // namespace onnxruntime -#endif // defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING) +#endif // defined(ENABLE_CUDA_PROFILING) diff --git a/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py b/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py index 15ff77d29dd07..62bfe1a6852f9 100644 --- a/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py +++ b/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py @@ -2437,13 +2437,23 @@ def _run_profiling_test(self): self.assertIn(key, entry, f"Missing '{key}' in profile entry: {entry}") # Check for GPU kernel events. These only appear when the build has - # ENABLE_CUDA_PROFILING=ON. The test is written to pass either way: - # without CUDA profiling the events list simply won't contain Kernel - # entries, and the test validates the basic profiling infrastructure. + # ENABLE_CUDA_PROFILING=ON. + # + # When the env var ORT_CUDA_PROFILING_ENABLED=1 is set (e.g. by the + # profiling-enabled CI job), we require at least one Kernel event so + # that a broken CUPTI integration cannot ship green. 
kernel_events = [e for e in profile_data if isinstance(e, dict) and e.get("cat") == "Kernel"] - has_cuda_profiling = len(kernel_events) > 0 + expect_cuda_profiling = os.environ.get("ORT_CUDA_PROFILING_ENABLED") == "1" + + if expect_cuda_profiling: + self.assertGreater( + len(kernel_events), + 0, + "ORT_CUDA_PROFILING_ENABLED=1 but no GPU Kernel events found in profile. " + "CUPTI integration may be broken.", + ) - if has_cuda_profiling: + if len(kernel_events) > 0: # Validate GPU kernel event metadata. for event in kernel_events: self.assertIn("ts", event) @@ -2453,7 +2463,7 @@ def _run_profiling_test(self): # CUPTI events include stream and block dimensions. self.assertIn("stream", args, f"GPU kernel event missing 'stream': {event}") self.assertIn("block_x", args, f"GPU kernel event missing 'block_x': {event}") - else: + elif not expect_cuda_profiling: # No GPU kernel events — CUDA profiling is likely not enabled. # The test still validates the basic profiling JSON structure above. print("Note: No GPU Kernel events found in profile. 
CUDA profiling may not be enabled in this build.") From 2ea62ece59a0c624c72fa05ed98ca35beb6827e1 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 27 Apr 2026 13:59:20 -0700 Subject: [PATCH 4/6] Address review comments --- cmake/onnxruntime_providers_cuda_plugin.cmake | 1 + onnxruntime/core/providers/cuda/plugin/cuda_ep.cc | 1 + 2 files changed, 2 insertions(+) diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake index 920fe7850acaf..4825c334f44ea 100644 --- a/cmake/onnxruntime_providers_cuda_plugin.cmake +++ b/cmake/onnxruntime_providers_cuda_plugin.cmake @@ -268,6 +268,7 @@ target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE if (onnxruntime_ENABLE_CUDA_PROFILING) target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE CUDA::cupti) target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE ENABLE_CUDA_PROFILING) +endif() # Default plugin EP version to ORT_VERSION with "-dev" suffix if not explicitly provided. 
if(NOT DEFINED onnxruntime_PLUGIN_EP_VERSION) diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc index df6ce15f2376d..e00b64eb2b9bd 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc @@ -12,6 +12,7 @@ #include #include +#include #include #include #include From d59b1af81191b4135779cb0221ba14940090c56a Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 27 Apr 2026 18:09:58 -0700 Subject: [PATCH 5/6] Modify CI to enable profiling and signal the test to run --- .github/workflows/linux_cuda_plugin_ci.yml | 5 +++- .github/workflows/windows_cuda_plugin.yml | 4 +++ .../transformers/test_cuda_plugin_ep.py | 29 +++++++++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/.github/workflows/linux_cuda_plugin_ci.yml b/.github/workflows/linux_cuda_plugin_ci.yml index 362a4dcc8f2bf..84985819597a2 100644 --- a/.github/workflows/linux_cuda_plugin_ci.yml +++ b/.github/workflows/linux_cuda_plugin_ci.yml @@ -36,6 +36,7 @@ jobs: --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 + --enable_cuda_profiling --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_BUILD_CUDA_EP_AS_PLUGIN=ON python_path_prefix: 'PATH=/opt/python/cp312-cp312/bin:$PATH' @@ -120,7 +121,9 @@ jobs: export ORT_CUDA_PLUGIN_PATH=/build/Release/Release/libonnxruntime_providers_cuda_plugin.so echo \"ORT_CUDA_PLUGIN_PATH=\$ORT_CUDA_PLUGIN_PATH\" ls -la \$ORT_CUDA_PLUGIN_PATH - + # Signal the test that CUPTI profiling is available so it + # asserts GPU kernel events are present in the profile. 
+ export ORT_CUDA_PROFILING_ENABLED=1 cd /onnxruntime_src/onnxruntime/test/python/transformers python test_cuda_plugin_ep.py " diff --git a/.github/workflows/windows_cuda_plugin.yml b/.github/workflows/windows_cuda_plugin.yml index 52219ae8fc071..0b3b3f59dbe13 100644 --- a/.github/workflows/windows_cuda_plugin.yml +++ b/.github/workflows/windows_cuda_plugin.yml @@ -83,6 +83,7 @@ jobs: --skip_tests ` --use_vcpkg ` --use_vcpkg_ms_internal_asset_cache ` + --enable_cuda_profiling ` --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 ` --cmake_extra_defines onnxruntime_BUILD_CUDA_EP_AS_PLUGIN=ON @@ -194,6 +195,9 @@ jobs: Write-Error "CUDA plugin EP library not found at $env:ORT_CUDA_PLUGIN_PATH" exit 1 } + # Signal the test that CUPTI profiling is available so it + # asserts GPU kernel events are present in the profile. + $env:ORT_CUDA_PROFILING_ENABLED = "1" python test_cuda_plugin_ep.py if ($lastExitCode -ne 0) { exit $lastExitCode diff --git a/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py b/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py index 0bc757f7a5fca..c2cd928eabb97 100644 --- a/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py +++ b/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py @@ -2479,6 +2479,35 @@ def _run_profiling_test(self): # CUPTI events include stream and block dimensions. self.assertIn("stream", args, f"GPU kernel event missing 'stream': {event}") self.assertIn("block_x", args, f"GPU kernel event missing 'block_x': {event}") + + # Timeline plausibility: GPU kernel timestamps must fall within + # the session's profiling window. All ts values are in + # microseconds relative to profiling start (epoch 0). Compute + # the window from CPU-side events (Session/Node/Api) and assert + # kernel events fit. This catches timestamp-domain mismatches + # (e.g., missing NormalizeGPUTimestampToCPUEpoch in + # cupti_manager.cc). 
+ cpu_events = [ + e + for e in profile_data + if isinstance(e, dict) and e.get("cat") in ("Session", "Node", "Api") and "ts" in e and "dur" in e + ] + if cpu_events: + session_end_us = max(e["ts"] + e["dur"] for e in cpu_events) + # Allow a small margin for GPU-side clock skew (100ms). + margin_us = 100_000 + for event in kernel_events: + ts = event["ts"] + self.assertGreaterEqual( + ts, + -margin_us, + f"GPU kernel event timestamp before profiling start (domain mismatch?): {event}", + ) + self.assertLessEqual( + ts, + session_end_us + margin_us, + f"GPU kernel event timestamp beyond session end (domain mismatch?): {event}", + ) elif not expect_cuda_profiling: # No GPU kernel events — CUDA profiling is likely not enabled. # The test still validates the basic profiling JSON structure above. From 82c0034609a926eb64a93acf14eb557f56f82010 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Tue, 28 Apr 2026 13:46:44 -0700 Subject: [PATCH 6/6] Fix build problem --- onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h index 061dc583bd00e..0d605a5ef95f6 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h @@ -6,7 +6,7 @@ #if defined(ENABLE_CUDA_PROFILING) #include "cuda_plugin_utils.h" -#include "cupti_manager.h" +#include "core/providers/cuda/cupti_manager.h" #include "core/common/gpu_profiler_common.h" namespace onnxruntime {