Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/workflows/linux_cuda_plugin_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ jobs:
--cuda_version=12.8
--cuda_home=/usr/local/cuda-12.8
--cudnn_home=/usr/local/cuda-12.8
--enable_cuda_profiling
--cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
--cmake_extra_defines onnxruntime_BUILD_CUDA_EP_AS_PLUGIN=ON
python_path_prefix: 'PATH=/opt/python/cp312-cp312/bin:$PATH'
Expand Down Expand Up @@ -120,7 +121,9 @@ jobs:
export ORT_CUDA_PLUGIN_PATH=/build/Release/Release/libonnxruntime_providers_cuda_plugin.so
echo \"ORT_CUDA_PLUGIN_PATH=\$ORT_CUDA_PLUGIN_PATH\"
ls -la \$ORT_CUDA_PLUGIN_PATH

# Signal the test that CUPTI profiling is available so it
# asserts GPU kernel events are present in the profile.
export ORT_CUDA_PROFILING_ENABLED=1
cd /onnxruntime_src/onnxruntime/test/python/transformers
python test_cuda_plugin_ep.py
"
4 changes: 4 additions & 0 deletions .github/workflows/windows_cuda_plugin.yml
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ jobs:
--skip_tests `
--use_vcpkg `
--use_vcpkg_ms_internal_asset_cache `
--enable_cuda_profiling `
--cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 `
--cmake_extra_defines onnxruntime_BUILD_CUDA_EP_AS_PLUGIN=ON

Expand Down Expand Up @@ -194,6 +195,9 @@ jobs:
Write-Error "CUDA plugin EP library not found at $env:ORT_CUDA_PLUGIN_PATH"
exit 1
}
# Signal the test that CUPTI profiling is available so it
# asserts GPU kernel events are present in the profile.
$env:ORT_CUDA_PROFILING_ENABLED = "1"
python test_cuda_plugin_ep.py
if ($lastExitCode -ne 0) {
exit $lastExitCode
Expand Down
5 changes: 5 additions & 0 deletions cmake/onnxruntime_providers_cuda_plugin.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,11 @@ target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE
${PROTOBUF_LIB}
)

# When CUPTI-based GPU profiling is enabled at configure time, link the CUPTI
# library into the plugin target and define ENABLE_CUDA_PROFILING so the
# profiler code paths (guarded by #if defined(ENABLE_CUDA_PROFILING)) compile in.
if (onnxruntime_ENABLE_CUDA_PROFILING)
  target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE CUDA::cupti)
  target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE ENABLE_CUDA_PROFILING)
endif()
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nitpick: trailing whitespace on the blank line after endif().


Comment thread
yuslepukhin marked this conversation as resolved.
# Default plugin EP version to ORT_VERSION with "-dev" suffix if not explicitly provided.
if(NOT DEFINED onnxruntime_PLUGIN_EP_VERSION)
set(onnxruntime_PLUGIN_EP_VERSION "${ORT_VERSION}-dev")
Expand Down
98 changes: 87 additions & 11 deletions docs/cuda_plugin_ep/cuda_plugin_ep_design.md
Original file line number Diff line number Diff line change
Expand Up @@ -831,29 +831,105 @@ include/onnxruntime/ep/

---

## 14. Future Work
## 14. Profiling and Observability

1. **Profiling and observability** — ORT's generic plugin EP bridge now supports `OrtEp::CreateProfiler`, but the CUDA plugin EP does not implement that callback yet. Future work should add CUDA-plugin-specific profiler wiring, integrate CUDA/NVTX/CUPTI-based tracing where appropriate, and make plugin execution visible in the same profiling flows users already rely on for the bundled CUDA EP.
The CUDA plugin EP implements the `OrtEpProfilerImpl` interface (introduced in ORT 1.25 via [PR #27649](https://github.com/microsoft/onnxruntime/pull/27649)) to participate in ORT's profiling system. When profiling is enabled, GPU kernel executions (CUDA kernels, memory copies) captured by NVIDIA CUPTI appear alongside ORT's CPU-side events in the profiling output.

2. **Remaining stream/adapter parity for framework-style `Stream*` consumers** — Much of the broad `Stream*` gap has already been addressed: the plugin adapter now provides an `OrtStreamAdapter` / `PluginStreamShim` path for framework-style `Stream*` call sites, FFT is included, and quantization/diffusion kernels are no longer excluded as a class. Remaining work is narrower:
### 14.1 Architecture

The profiling stack has three layers:

1. **ORT Core** (`Profiler` in `profiler.cc`) — drives the profiling lifecycle. It calls `PluginExecutionProvider::GetProfiler()`, which invokes `OrtEp::CreateProfiler` on the plugin and wraps the returned `OrtEpProfilerImpl` in a `PluginEpProfiler` bridge.
2. **Bridge** (`PluginEpProfiler` in `ep_event_profiling.cc`) — adapts the C++ `EpProfiler` interface to the C `OrtEpProfilerImpl` callbacks. It handles clock synchronization (provides an epoch-independent offset in `StartProfiling`) and converts relative ORT event IDs to absolute epoch-based correlation IDs for `StartEvent`/`StopEvent`.
3. **Plugin-side profiler** (`CudaPluginEpProfiler` in `cuda_profiler_plugin.h/.cc`) — implements `OrtEpProfilerImpl` inside the plugin DLL. Delegates to `CUPTIManager` for GPU activity tracing.

```
ORT Profiler
└─ PluginEpProfiler (bridge, in ORT core)
└─ OrtEpProfilerImpl callbacks (C API boundary)
└─ CudaPluginEpProfiler (in plugin DLL)
└─ CUPTIManager singleton (in plugin DLL)
└─ CUPTI activity APIs (GPU tracing)
```

### 14.2 CUPTI Integration

The plugin DLL links `CUDA::cupti` and compiles `cupti_manager.cc` when `onnxruntime_ENABLE_CUDA_PROFILING` is ON. The `CUPTIManager` singleton lives inside the plugin DLL, isolated from any in-tree CUDA EP in the same process. This is the expected isolation model for plugin EPs.

CUPTI activities enabled:
- `CUPTI_ACTIVITY_KIND_RUNTIME` — CUDA runtime API calls
- `CUPTI_ACTIVITY_KIND_DRIVER` — CUDA driver API calls
- `CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL` — GPU kernel execution
- `CUPTI_ACTIVITY_KIND_MEMCPY` — device memory transfers
- `CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION` — maps GPU activities to ORT event correlation IDs

### 14.3 Correlation ID Flow

The plugin API's `StartEvent`/`StopEvent` receive **absolute epoch-based** correlation IDs (converted by the `PluginEpProfiler` bridge from ORT's relative event IDs). These are pushed directly to CUPTI's external correlation stack via `cuptiActivityPushExternalCorrelationId`, allowing CUPTI to tag GPU activities with the corresponding ORT event. When `StopEvent` is called, the correlation ID is popped. This matches the pattern used by the in-tree CUDA EP's `GPUTracerManager::PushCorrelation`/`PopCorrelation`.

### 14.4 Event Collection (EndProfiling)

When ORT calls `EndProfiling`:
1. CUPTI activity buffers are flushed (`cuptiActivityFlushAll`).
2. GPU activity records are processed — kernel names, timestamps, durations, and stream/grid metadata are extracted.
3. Events are converted to `Ort::ProfilingEvent` instances with `OrtProfilingEventCategory_KERNEL`.
4. Events are appended to the `OrtProfilingEventsContainer` via `AddEvents`.

The plugin does **not** perform the post-hoc merge/sort that the in-tree `GPUProfilerBase::EndProfiling` does. The plugin API is append-only, and the `PluginEpProfiler` bridge on the ORT side likewise appends EP events to ORT's profiling event collection without merge/sort by timestamp or correlation ID. Any ordering or interleaving into a global timeline is handled by downstream trace consumers.
Comment thread
yuslepukhin marked this conversation as resolved.

### 14.5 Design Differences from In-Tree CUDA EP Profiler

| Aspect | In-tree CUDA EP | CUDA Plugin EP |
|--------|----------------|----------------|
| Event merge | `GPUProfilerBase::MergeEvents` interleaves GPU events into ORT's array (has a known sort-order bug) | Append-only: the ORT-side bridge appends events without merging or sorting, and trace consumers handle ordering |
| Correlation IDs | Relative → absolute conversion in `GPUTracerManager::PushCorrelation` | Bridge provides absolute IDs directly; plugin pushes to CUPTI as-is |
| `StopEvent` metadata | Ignored (just pops correlation) | ORT event metadata available; currently unused, can annotate GPU events in future |
| GPU→ORT event linkage | Implicit via CUPTI external correlation IDs merged into timeline | GPU events carry only CUPTI metadata (`stream`, `grid_*`, `block_*`); no ORT correlation or parent identifier is attached. Downstream consumers must relate GPU kernels to ORT nodes via timestamp proximity. This is a known limitation; future work may attach `correlation_id` or parent event name via `StopEvent`'s `OrtProfilingEvent` parameter |
| Singleton scope | Process-wide `CUPTIManager` in main ORT DLL | DLL-local `CUPTIManager` in plugin (process isolation) |

### 14.6 Build Configuration

CUPTI profiling is conditional:
- **CMake flag**: `onnxruntime_ENABLE_CUDA_PROFILING=ON`
- **Compile definition**: `ENABLE_CUDA_PROFILING` added to the plugin target
- **Link**: `CUDA::cupti` linked to `onnxruntime_providers_cuda_plugin`
- **Source**: `cupti_manager.cc` compiled into the plugin

When profiling is disabled (default), `CudaEp::CreateProfiler` is set to `nullptr` and no CUPTI code is compiled.

### 14.7 Files

| File | Role |
|------|------|
| `plugin/cuda_profiler_plugin.h` | `CudaPluginEpProfiler` struct definition |
| `plugin/cuda_profiler_plugin.cc` | Profiler callback implementations |
| `plugin/cuda_ep.h` | `CreateProfilerImpl` declaration |
| `plugin/cuda_ep.cc` | `CreateProfiler` callback wiring |
| `cmake/onnxruntime_providers_cuda_plugin.cmake` | Conditional CUPTI linkage |

---

## 15. Future Work

1. **Remaining stream/adapter parity for framework-style `Stream*` consumers** — Much of the broad `Stream*` gap has already been addressed: the plugin adapter now provides an `OrtStreamAdapter` / `PluginStreamShim` path for framework-style `Stream*` call sites, FFT is included, and quantization/diffusion kernels are no longer excluded as a class. Remaining work is narrower:

- Continue using `Stream(context)` / `GetOrtStream(context)` patterns for migrated kernels rather than adding raw-stream-only forks.
- Audit still-excluded directories that require more than a stream handle: `contrib_ops/cuda/llm/*`, `contrib_ops/cuda/transformers/*`, and `contrib_ops/cuda/collective/*`.
- For each re-inclusion pass, add or extend focused plugin tests before removing the CMake exclusion.

3. **Contrib LLM migration pass** — Still open. The core CUDA LLM attention path is now adapter-safe, but `contrib_ops/cuda/llm/*` remains excluded in `cmake/onnxruntime_providers_cuda_plugin.cmake`. The remaining work is a dedicated contrib-LLM adapter pass: resolve any plugin build failures under `ORT_USE_EP_API_ADAPTERS`, keep the normal stream/scratch-buffer helpers, remove the `contrib_ops/cuda/llm/*` CMake filters, and add focused tests or parity-report coverage for the first re-included kernels.
2. **Contrib LLM migration pass** — Still open. The core CUDA LLM attention path is now adapter-safe, but `contrib_ops/cuda/llm/*` remains excluded in `cmake/onnxruntime_providers_cuda_plugin.cmake`. The remaining work is a dedicated contrib-LLM adapter pass: resolve any plugin build failures under `ORT_USE_EP_API_ADAPTERS`, keep the normal stream/scratch-buffer helpers, remove the `contrib_ops/cuda/llm/*` CMake filters, and add focused tests or parity-report coverage for the first re-included kernels.

4. **Tunable ops** — Implement a plugin-side `ITuningContext` and remove the `ORT_USE_EP_API_ADAPTERS` guards in `matmul.cc`/`gemm.cc` so the plugin can recover runtime kernel selection and profiling-based tuning behavior.
3. **Tunable ops** — Implement a plugin-side `ITuningContext` and remove the `ORT_USE_EP_API_ADAPTERS` guards in `matmul.cc`/`gemm.cc` so the plugin can recover runtime kernel selection and profiling-based tuning behavior.

5. **TensorSeq and additional C API coverage** — Add enough sequence/tensor-sequence support to unblock `sequence_op.cc` (the last remaining TensorSeq-dependent file), and extend the ORT C API where needed for remaining framework-style attribute accessors such as string-array attributes used by RNN kernels. Note: `identity_op.cc` is now included in the plugin build — its TensorSeq code path is guarded by `#ifndef BUILD_CUDA_EP_AS_PLUGIN` and opset 14+ registrations use `AllFixedSizeTensorTypes()` (Tensor-only) instead of `AllFixedSizeTensorAndSequenceTensorTypes()`.
4. **TensorSeq and additional C API coverage** — Add enough sequence/tensor-sequence support to unblock `sequence_op.cc` (the last remaining TensorSeq-dependent file), and extend the ORT C API where needed for remaining framework-style attribute accessors such as string-array attributes used by RNN kernels. Note: `identity_op.cc` is now included in the plugin build — its TensorSeq code path is guarded by `#ifndef BUILD_CUDA_EP_AS_PLUGIN` and opset 14+ registrations use `AllFixedSizeTensorTypes()` (Tensor-only) instead of `AllFixedSizeTensorAndSequenceTensorTypes()`.

6. **Remaining contrib exclusions** — Remaining contrib exclusions are: `shrunken_gather.cc` (training), `transformers/*` (subgraph), `aten_ops/*` (ATen), `collective/*` (NCCL), and `llm/*` (contrib LLM pass).
5. **Remaining contrib exclusions** — Remaining contrib exclusions are: `shrunken_gather.cc` (training), `transformers/*` (subgraph), `aten_ops/*` (ATen), `collective/*` (NCCL), and `llm/*` (contrib LLM pass).

7. **CI integration and targeted benchmarking** — Partially complete. Basic CUDA plugin build + `test_cuda_plugin_ep.py` coverage now exists in Linux and Windows plugin CI workflows. Remaining work is perf-oriented and feature-specific validation: add targeted benchmarks or perf gates for graph replay and allocator behavior, and extend CI once profiling and tunable-op support land.
6. **CI integration and targeted benchmarking** — Partially complete. Basic CUDA plugin build + `test_cuda_plugin_ep.py` coverage now exists in Linux and Windows plugin CI workflows. Remaining work is perf-oriented and feature-specific validation: add targeted benchmarks or perf gates for graph replay and allocator behavior, and extend CI once profiling and tunable-op support land.

8. **NHWC cleanup and hardening** — Partially complete. Runtime NHWC callbacks, second-pass capability handling for pre-assigned NHWC nodes, cached provider-config access, and focused Conv/BatchNormalization/Pool tests are in place. Remaining work is the cleanup described in [Section 5.3.1](#531-nhwc-layout-transformation-support): unify the conversion allowlist with the bundled CUDA EP, improve internal-domain kernel-miss diagnostics, and add stronger structural assertions that plugin-backed NHWC execution was actually selected.
7. **NHWC cleanup and hardening** — Partially complete. Runtime NHWC callbacks, second-pass capability handling for pre-assigned NHWC nodes, cached provider-config access, and focused Conv/BatchNormalization/Pool tests are in place. Remaining work is the cleanup described in [Section 5.3.1](#531-nhwc-layout-transformation-support): unify the conversion allowlist with the bundled CUDA EP, improve internal-domain kernel-miss diagnostics, and add stronger structural assertions that plugin-backed NHWC execution was actually selected.

9. **OpSchema-validated kernel registration after PR #27713** — PR #27713 has already landed, so the `OrtEpApi` and C++ wrappers for querying ONNX operator schemas are available (see [Section 3.5.1](#351-type-constraint-names-and-opschema-access)). The remaining work is plugin-side adoption:
8. **OpSchema-validated kernel registration after PR #27713** — PR #27713 has already landed, so the `OrtEpApi` and C++ wrappers for querying ONNX operator schemas are available (see [Section 3.5.1](#351-type-constraint-names-and-opschema-access)). The remaining work is plugin-side adoption:

**A. Registration-time validation pass**

Expand Down Expand Up @@ -881,7 +957,7 @@ include/onnxruntime/ep/
| `cuda_ep.cc` / `GetCapabilityImpl()` | (Optional) Add schema-based diagnostic when `EpGraphSupportInfo_LookUpKernel` returns nullptr |
| `test_cuda_plugin_ep.py` | Add a validation stage that exercises schema-validated registration |

10. **Resource accounting and annotation-based partitioning after PR #27595** — PR #27595 has already landed, so ORT now has framework-side resource accounting and layering annotations. The remaining CUDA plugin work is to bridge those capabilities through the plugin EP API and plugin capability implementation.
9. **Resource accounting and annotation-based partitioning after PR #27595** — PR #27595 has already landed, so ORT now has framework-side resource accounting and layering annotations. The remaining CUDA plugin work is to bridge those capabilities through the plugin EP API and plugin capability implementation.

**A. Resource accounting**

Expand Down
4 changes: 2 additions & 2 deletions onnxruntime/core/providers/cuda/cupti_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
namespace onnxruntime {
namespace profiling {

#if defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING)
#if defined(ENABLE_CUDA_PROFILING)
Comment thread
yuslepukhin marked this conversation as resolved.

static inline std::string GetMemcpyKindString(CUpti_ActivityMemcpyKind kind) {
switch (kind) {
Expand Down Expand Up @@ -179,7 +179,7 @@ void CUPTIAPI CUPTIManager::BufferCompleted(CUcontext, uint32_t, uint8_t* buffer
ProfilerActivityBuffer::CreateFromPreallocatedBuffer(std::move(buffer_ptr), valid_size));
}

#endif /* defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING) */
#endif /* defined(ENABLE_CUDA_PROFILING) */

} // namespace profiling
} // namespace onnxruntime
9 changes: 2 additions & 7 deletions onnxruntime/core/providers/cuda/cupti_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,14 @@

#pragma once

#if defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING)
#if defined(ENABLE_CUDA_PROFILING)

#include <atomic>
#include <mutex>
#include <vector>

#include <cupti.h>

Comment thread
yuslepukhin marked this conversation as resolved.
// Do not move the check for CUDA_VERSION above #include <cupti.h>
// the macros are defined in cupti.h
#if defined(USE_CUDA)

#include "core/common/gpu_profiler_common.h"
#include "core/common/inlined_containers.h"

Expand Down Expand Up @@ -51,5 +47,4 @@ class CUPTIManager : public GPUTracerManager<CUPTIManager> {
} /* namespace profiling */
} /* namespace onnxruntime */

#endif /* #if defined(USE_CUDA) */
#endif /* #if defined (USE_CUDA) && defined(ENABLE_CUDA_PROFILING) */
#endif /* #if defined(ENABLE_CUDA_PROFILING) */
29 changes: 29 additions & 0 deletions onnxruntime/core/providers/cuda/plugin/cuda_ep.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

#include <cstring>
#include <limits>
#include <memory>

Check warning on line 15 in onnxruntime/core/providers/cuda/plugin/cuda_ep.cc

View workflow job for this annotation

GitHub Actions / Optional Lint C++

[cpplint] reported by reviewdog 🐶 Found C++ system header after other header. Should be: cuda_ep.h, c system, c++ system, other. [build/include_order] [4] Raw Output: onnxruntime/core/providers/cuda/plugin/cuda_ep.cc:15: Found C++ system header after other header. Should be: cuda_ep.h, c system, c++ system, other. [build/include_order] [4]
#include <stdexcept>
#include <string>
#include <string_view>
Expand Down Expand Up @@ -134,6 +135,13 @@
// Resource accounting — allows ORT to query available device memory for budget enforcement
GetAvailableResource = GetAvailableResourceImpl;

// Profiling — CUPTI-based GPU activity tracing when profiling is enabled at build time
#if defined(ENABLE_CUDA_PROFILING)
CreateProfiler = CreateProfilerImpl;
#else
CreateProfiler = nullptr;
#endif

const OrtApi& ort_api = factory_.GetOrtApi();
Ort::Status log_status(ort_api.Logger_LogMessage(&logger_, ORT_LOGGING_LEVEL_INFO,
"CUDA Plugin EP created",
Expand Down Expand Up @@ -651,5 +659,26 @@
EXCEPTION_TO_STATUS_END
}

#if defined(ENABLE_CUDA_PROFILING)
/*static*/
// OrtEp::CreateProfiler callback for the CUDA plugin EP.
// Allocates a CudaPluginEpProfiler and transfers ownership of the raw pointer
// to the caller through the `profiler` out-parameter (presumably released by
// ORT core's profiler bridge — confirm against the PluginEpProfiler contract).
OrtStatus* ORT_API_CALL CudaEp::CreateProfilerImpl(
    OrtEp* this_ptr, OrtEpProfilerImpl** profiler) noexcept {
  EXCEPTION_TO_STATUS_BEGIN

  // Validate the out-parameter before writing through it.
  if (profiler == nullptr) {
    return Ort::GetApi().CreateStatus(ORT_INVALID_ARGUMENT, "`profiler` must not be null");
  }

  // Ensure the out-parameter has a well-defined value even if construction
  // throws (EXCEPTION_TO_STATUS_BEGIN/END converts exceptions to an OrtStatus).
  *profiler = nullptr;

  auto* ep = static_cast<CudaEp*>(this_ptr);
  // Construct the plugin-side profiler; release() hands the raw pointer across
  // the C API boundary, so no smart pointer crosses the DLL interface.
  auto profiler_impl = std::make_unique<CudaPluginEpProfiler>(ep->factory_.GetEpApi());
  *profiler = profiler_impl.release();
  return nullptr;

  EXCEPTION_TO_STATUS_END
}
#endif  // defined(ENABLE_CUDA_PROFILING)

} // namespace cuda_plugin
} // namespace onnxruntime
6 changes: 6 additions & 0 deletions onnxruntime/core/providers/cuda/plugin/cuda_ep.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

#include "cuda_plugin_utils.h"
#include "cuda_graph_plugin.h"
#include "cuda_profiler_plugin.h"

Check warning on line 8 in onnxruntime/core/providers/cuda/plugin/cuda_ep.h

View workflow job for this annotation

GitHub Actions / Optional Lint C++

[cpplint] reported by reviewdog 🐶 Include the directory when naming header files [build/include_subdir] [4] Raw Output: onnxruntime/core/providers/cuda/plugin/cuda_ep.h:8: Include the directory when naming header files [build/include_subdir] [4]
#include "ep/adapters.h"

#include <memory>
Expand Down Expand Up @@ -91,6 +92,11 @@
static OrtStatus* ORT_API_CALL GetAvailableResourceImpl(
const OrtEp* this_ptr, OrtResourceCount* available) noexcept;

#if defined(ENABLE_CUDA_PROFILING)
static OrtStatus* ORT_API_CALL CreateProfilerImpl(
OrtEp* this_ptr, OrtEpProfilerImpl** profiler) noexcept;
#endif

/// Helper to parse the graph annotation ID from run options.
CudaGraphAnnotation_t GetGraphAnnotationId(const OrtRunOptions* run_options) const;

Expand Down
Loading
Loading