From 67ef765eff2f9118fb1242e0e79203e511a38d93 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Thu, 23 Apr 2026 15:44:34 -0700 Subject: [PATCH 1/6] Initial impl --- cmake/onnxruntime_providers_cuda_plugin.cmake | 7 + docs/cuda_plugin_ep/cuda_plugin_ep_design.md | 97 ++++++++++-- .../core/providers/cuda/plugin/cuda_ep.cc | 25 +++ .../core/providers/cuda/plugin/cuda_ep.h | 6 + .../cuda/plugin/cuda_profiler_plugin.cc | 148 ++++++++++++++++++ .../cuda/plugin/cuda_profiler_plugin.h | 43 +++++ .../transformers/test_cuda_plugin_ep.py | 95 +++++++++++ 7 files changed, 410 insertions(+), 11 deletions(-) create mode 100644 onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.cc create mode 100644 onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake index f7b9c7be7c765..6bb13a923c11e 100644 --- a/cmake/onnxruntime_providers_cuda_plugin.cmake +++ b/cmake/onnxruntime_providers_cuda_plugin.cmake @@ -265,6 +265,13 @@ target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE ${PROTOBUF_LIB} ) +if (onnxruntime_ENABLE_CUDA_PROFILING) + target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE CUDA::cupti) + # USE_CUDA is required by cupti_manager.h guards. ENABLE_CUDA_PROFILING activates + # the profiler implementation in cuda_profiler_plugin.cc. 
+ target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE USE_CUDA ENABLE_CUDA_PROFILING) +endif() + # Symbol visibility — only export CreateEpFactories and ReleaseEpFactory target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE ORT_API_MANUAL_INIT BUILD_CUDA_EP_AS_PLUGIN ORT_USE_EP_API_ADAPTERS=1 ONNX_ML=1 ONNX_NAMESPACE=onnx ONNX_USE_LITE_PROTO=1) diff --git a/docs/cuda_plugin_ep/cuda_plugin_ep_design.md b/docs/cuda_plugin_ep/cuda_plugin_ep_design.md index ba7b07b97535e..a7d33ffd62e30 100644 --- a/docs/cuda_plugin_ep/cuda_plugin_ep_design.md +++ b/docs/cuda_plugin_ep/cuda_plugin_ep_design.md @@ -831,29 +831,104 @@ include/onnxruntime/ep/ --- -## 14. Future Work +## 14. Profiling and Observability -1. **Profiling and observability** — ORT's generic plugin EP bridge now supports `OrtEp::CreateProfiler`, but the CUDA plugin EP does not implement that callback yet. Future work should add CUDA-plugin-specific profiler wiring, integrate CUDA/NVTX/CUPTI-based tracing where appropriate, and make plugin execution visible in the same profiling flows users already rely on for the bundled CUDA EP. +The CUDA plugin EP implements the `OrtEpProfilerImpl` interface (introduced in ORT 1.25 via [PR #27649](https://github.com/microsoft/onnxruntime/pull/27649)) to participate in ORT's profiling system. When profiling is enabled, GPU kernel executions (CUDA kernels, memory copies) captured by NVIDIA CUPTI appear alongside ORT's CPU-side events in the profiling output. -2. **Remaining stream/adapter parity for framework-style `Stream*` consumers** — Much of the broad `Stream*` gap has already been addressed: the plugin adapter now provides an `OrtStreamAdapter` / `PluginStreamShim` path for framework-style `Stream*` call sites, FFT is included, and quantization/diffusion kernels are no longer excluded as a class. Remaining work is narrower: +### 14.1 Architecture + +The profiling stack has three layers: + +1. 
**ORT Core** (`Profiler` in `profiler.cc`) — drives the profiling lifecycle. It calls `PluginExecutionProvider::GetProfiler()`, which invokes `OrtEp::CreateProfiler` on the plugin and wraps the returned `OrtEpProfilerImpl` in a `PluginEpProfiler` bridge. +2. **Bridge** (`PluginEpProfiler` in `ep_event_profiling.cc`) — adapts the C++ `EpProfiler` interface to the C `OrtEpProfilerImpl` callbacks. It handles clock synchronization (provides an epoch-independent offset in `StartProfiling`) and converts relative ORT event IDs to absolute epoch-based correlation IDs for `StartEvent`/`StopEvent`. +3. **Plugin-side profiler** (`CudaPluginEpProfiler` in `cuda_profiler_plugin.h/.cc`) — implements `OrtEpProfilerImpl` inside the plugin DLL. Delegates to `CUPTIManager` for GPU activity tracing. + +``` +ORT Profiler + └─ PluginEpProfiler (bridge, in ORT core) + └─ OrtEpProfilerImpl callbacks (C API boundary) + └─ CudaPluginEpProfiler (in plugin DLL) + └─ CUPTIManager singleton (in plugin DLL) + └─ CUPTI activity APIs (GPU tracing) +``` + +### 14.2 CUPTI Integration + +The plugin DLL links `CUDA::cupti` and compiles `cupti_manager.cc` when `onnxruntime_ENABLE_CUDA_PROFILING` is ON. The `CUPTIManager` singleton lives inside the plugin DLL, isolated from any in-tree CUDA EP in the same process. This is the expected isolation model for plugin EPs. + +CUPTI activities enabled: +- `CUPTI_ACTIVITY_KIND_RUNTIME` — CUDA runtime API calls +- `CUPTI_ACTIVITY_KIND_DRIVER` — CUDA driver API calls +- `CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL` — GPU kernel execution +- `CUPTI_ACTIVITY_KIND_MEMCPY` — device memory transfers +- `CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION` — maps GPU activities to ORT event correlation IDs + +### 14.3 Correlation ID Flow + +The plugin API's `StartEvent`/`StopEvent` receive **absolute epoch-based** correlation IDs (converted by the `PluginEpProfiler` bridge from ORT's relative event IDs). 
These are pushed directly to CUPTI's external correlation stack via `cuptiActivityPushExternalCorrelationId`, allowing CUPTI to tag GPU activities with the corresponding ORT event. When `StopEvent` is called, the correlation ID is popped. This matches the pattern used by the in-tree CUDA EP's `GPUTracerManager::PushCorrelation`/`PopCorrelation`. + +### 14.4 Event Collection (EndProfiling) + +When ORT calls `EndProfiling`: +1. CUPTI activity buffers are flushed (`cuptiActivityFlushAll`). +2. GPU activity records are processed — kernel names, timestamps, durations, and stream/grid metadata are extracted. +3. Events are converted to `Ort::ProfilingEvent` instances with `OrtProfilingEventCategory_KERNEL`. +4. Events are appended to the `OrtProfilingEventsContainer` via `AddEvents`. + +The plugin does **not** perform the post-hoc merge/sort that the in-tree `GPUProfilerBase::EndProfiling` does. The plugin API is append-only; the `PluginEpProfiler` bridge on the ORT side handles merging EP events into the global event timeline. 
+ +### 14.5 Design Differences from In-Tree CUDA EP Profiler + +| Aspect | In-tree CUDA EP | CUDA Plugin EP | +|--------|----------------|----------------| +| Event merge | `GPUProfilerBase::MergeEvents` interleaves GPU events into ORT's array (has known sort-order bug) | Append-only; ORT-side bridge merges | +| Correlation IDs | Relative → absolute conversion in `GPUTracerManager::PushCorrelation` | Bridge provides absolute IDs directly; plugin pushes to CUPTI as-is | +| `StopEvent` metadata | Ignored (just pops correlation) | ORT event metadata available; currently unused, can annotate GPU events in future | +| Singleton scope | Process-wide `CUPTIManager` in main ORT DLL | DLL-local `CUPTIManager` in plugin (process isolation) | + +### 14.6 Build Configuration + +CUPTI profiling is conditional: +- **CMake flag**: `onnxruntime_ENABLE_CUDA_PROFILING=ON` +- **Compile definition**: `ENABLE_CUDA_PROFILING` added to the plugin target +- **Link**: `CUDA::cupti` linked to `onnxruntime_providers_cuda_plugin` +- **Source**: `cupti_manager.cc` compiled into the plugin + +When profiling is disabled (default), `CudaEp::CreateProfiler` is set to `nullptr` and no CUPTI code is compiled. + +### 14.7 Files + +| File | Role | +|------|------| +| `plugin/cuda_profiler_plugin.h` | `CudaPluginEpProfiler` struct definition | +| `plugin/cuda_profiler_plugin.cc` | Profiler callback implementations | +| `plugin/cuda_ep.h` | `CreateProfilerImpl` declaration | +| `plugin/cuda_ep.cc` | `CreateProfiler` callback wiring | +| `cmake/onnxruntime_providers_cuda_plugin.cmake` | Conditional CUPTI linkage | + +--- + +## 15. Future Work + +1. **Remaining stream/adapter parity for framework-style `Stream*` consumers** — Much of the broad `Stream*` gap has already been addressed: the plugin adapter now provides an `OrtStreamAdapter` / `PluginStreamShim` path for framework-style `Stream*` call sites, FFT is included, and quantization/diffusion kernels are no longer excluded as a class. 
Remaining work is narrower: - Continue using `Stream(context)` / `GetOrtStream(context)` patterns for migrated kernels rather than adding raw-stream-only forks. - Audit still-excluded directories that require more than a stream handle: `contrib_ops/cuda/llm/*`, `contrib_ops/cuda/transformers/*`, and `contrib_ops/cuda/collective/*`. - For each re-inclusion pass, add or extend focused plugin tests before removing the CMake exclusion. -3. **Contrib LLM migration pass** — Still open. The core CUDA LLM attention path is now adapter-safe, but `contrib_ops/cuda/llm/*` remains excluded in `cmake/onnxruntime_providers_cuda_plugin.cmake`. The remaining work is a dedicated contrib-LLM adapter pass: resolve any plugin build failures under `ORT_USE_EP_API_ADAPTERS`, keep the normal stream/scratch-buffer helpers, remove the `contrib_ops/cuda/llm/*` CMake filters, and add focused tests or parity-report coverage for the first re-included kernels. +2. **Contrib LLM migration pass** — Still open. The core CUDA LLM attention path is now adapter-safe, but `contrib_ops/cuda/llm/*` remains excluded in `cmake/onnxruntime_providers_cuda_plugin.cmake`. The remaining work is a dedicated contrib-LLM adapter pass: resolve any plugin build failures under `ORT_USE_EP_API_ADAPTERS`, keep the normal stream/scratch-buffer helpers, remove the `contrib_ops/cuda/llm/*` CMake filters, and add focused tests or parity-report coverage for the first re-included kernels. -4. **Tunable ops** — Implement a plugin-side `ITuningContext` and remove the `ORT_USE_EP_API_ADAPTERS` guards in `matmul.cc`/`gemm.cc` so the plugin can recover runtime kernel selection and profiling-based tuning behavior. +3. **Tunable ops** — Implement a plugin-side `ITuningContext` and remove the `ORT_USE_EP_API_ADAPTERS` guards in `matmul.cc`/`gemm.cc` so the plugin can recover runtime kernel selection and profiling-based tuning behavior. -5. 
**TensorSeq and additional C API coverage** — Add enough sequence/tensor-sequence support to unblock `sequence_op.cc` (the last remaining TensorSeq-dependent file), and extend the ORT C API where needed for remaining framework-style attribute accessors such as string-array attributes used by RNN kernels. Note: `identity_op.cc` is now included in the plugin build — its TensorSeq code path is guarded by `#ifndef BUILD_CUDA_EP_AS_PLUGIN` and opset 14+ registrations use `AllFixedSizeTensorTypes()` (Tensor-only) instead of `AllFixedSizeTensorAndSequenceTensorTypes()`. +4. **TensorSeq and additional C API coverage** — Add enough sequence/tensor-sequence support to unblock `sequence_op.cc` (the last remaining TensorSeq-dependent file), and extend the ORT C API where needed for remaining framework-style attribute accessors such as string-array attributes used by RNN kernels. Note: `identity_op.cc` is now included in the plugin build — its TensorSeq code path is guarded by `#ifndef BUILD_CUDA_EP_AS_PLUGIN` and opset 14+ registrations use `AllFixedSizeTensorTypes()` (Tensor-only) instead of `AllFixedSizeTensorAndSequenceTensorTypes()`. -6. **Remaining contrib exclusions** — Remaining contrib exclusions are: `shrunken_gather.cc` (training), `transformers/*` (subgraph), `aten_ops/*` (ATen), `collective/*` (NCCL), and `llm/*` (contrib LLM pass). +5. **Remaining contrib exclusions** — Remaining contrib exclusions are: `shrunken_gather.cc` (training), `transformers/*` (subgraph), `aten_ops/*` (ATen), `collective/*` (NCCL), and `llm/*` (contrib LLM pass). -7. **CI integration and targeted benchmarking** — Partially complete. Basic CUDA plugin build + `test_cuda_plugin_ep.py` coverage now exists in Linux and Windows plugin CI workflows. Remaining work is perf-oriented and feature-specific validation: add targeted benchmarks or perf gates for graph replay and allocator behavior, and extend CI once profiling and tunable-op support land. +6. 
**CI integration and targeted benchmarking** — Partially complete. Basic CUDA plugin build + `test_cuda_plugin_ep.py` coverage now exists in Linux and Windows plugin CI workflows. Remaining work is perf-oriented and feature-specific validation: add targeted benchmarks or perf gates for graph replay and allocator behavior, and extend CI once profiling and tunable-op support land. -8. **NHWC cleanup and hardening** — Partially complete. Runtime NHWC callbacks, second-pass capability handling for pre-assigned NHWC nodes, cached provider-config access, and focused Conv/BatchNormalization/Pool tests are in place. Remaining work is the cleanup described in [Section 5.3.1](#531-nhwc-layout-transformation-support): unify the conversion allowlist with the bundled CUDA EP, improve internal-domain kernel-miss diagnostics, and add stronger structural assertions that plugin-backed NHWC execution was actually selected. +7. **NHWC cleanup and hardening** — Partially complete. Runtime NHWC callbacks, second-pass capability handling for pre-assigned NHWC nodes, cached provider-config access, and focused Conv/BatchNormalization/Pool tests are in place. Remaining work is the cleanup described in [Section 5.3.1](#531-nhwc-layout-transformation-support): unify the conversion allowlist with the bundled CUDA EP, improve internal-domain kernel-miss diagnostics, and add stronger structural assertions that plugin-backed NHWC execution was actually selected. -9. **OpSchema-validated kernel registration after PR #27713** — PR #27713 has already landed, so the `OrtEpApi` and C++ wrappers for querying ONNX operator schemas are available (see [Section 3.5.1](#351-type-constraint-names-and-opschema-access)). The remaining work is plugin-side adoption: +8. **OpSchema-validated kernel registration after PR #27713** — PR #27713 has already landed, so the `OrtEpApi` and C++ wrappers for querying ONNX operator schemas are available (see [Section 3.5.1](#351-type-constraint-names-and-opschema-access)). 
The remaining work is plugin-side adoption: **A. Registration-time validation pass** @@ -881,7 +956,7 @@ include/onnxruntime/ep/ | `cuda_ep.cc` / `GetCapabilityImpl()` | (Optional) Add schema-based diagnostic when `EpGraphSupportInfo_LookUpKernel` returns nullptr | | `test_cuda_plugin_ep.py` | Add a validation stage that exercises schema-validated registration | -10. **Resource accounting and annotation-based partitioning after PR #27595** — PR #27595 has already landed, so ORT now has framework-side resource accounting and layering annotations. The remaining CUDA plugin work is to bridge those capabilities through the plugin EP API and plugin capability implementation. +9. **Resource accounting and annotation-based partitioning after PR #27595** — PR #27595 has already landed, so ORT now has framework-side resource accounting and layering annotations. The remaining CUDA plugin work is to bridge those capabilities through the plugin EP API and plugin capability implementation. **A. Resource accounting** diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc index 7c2970c468216..68ad66a11b1d1 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc @@ -134,6 +134,13 @@ CudaEp::CudaEp(CudaEpFactory& factory, const Config& config, const OrtLogger& lo // Resource accounting — allows ORT to query available device memory for budget enforcement GetAvailableResource = GetAvailableResourceImpl; + // Profiling — CUPTI-based GPU activity tracing when profiling is enabled at build time +#if defined(ENABLE_CUDA_PROFILING) + CreateProfiler = CreateProfilerImpl; +#else + CreateProfiler = nullptr; +#endif + const OrtApi& ort_api = factory_.GetOrtApi(); Ort::Status log_status(ort_api.Logger_LogMessage(&logger_, ORT_LOGGING_LEVEL_INFO, "CUDA Plugin EP created", @@ -651,5 +658,23 @@ OrtStatus* ORT_API_CALL CudaEp::GetAvailableResourceImpl( EXCEPTION_TO_STATUS_END 
 }
+#if defined(ENABLE_CUDA_PROFILING)
+/*static*/
+OrtStatus* ORT_API_CALL CudaEp::CreateProfilerImpl(
+    OrtEp* this_ptr, OrtEpProfilerImpl** profiler) noexcept {
+  EXCEPTION_TO_STATUS_BEGIN
+
+  if (profiler == nullptr) {
+    return Ort::GetApi().CreateStatus(ORT_INVALID_ARGUMENT, "`profiler` must not be null");
+  }
+
+  auto* ep = static_cast<CudaEp*>(this_ptr);
+  *profiler = new CudaPluginEpProfiler(ep->factory_.GetEpApi());
+  return nullptr;
+
+  EXCEPTION_TO_STATUS_END
+}
+#endif  // defined(ENABLE_CUDA_PROFILING)
+
 }  // namespace cuda_plugin
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep.h b/onnxruntime/core/providers/cuda/plugin/cuda_ep.h
index 502902c53070b..faaeebf9ceae0 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_ep.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep.h
@@ -5,6 +5,7 @@
 #include "cuda_plugin_utils.h"
 #include "cuda_graph_plugin.h"
+#include "cuda_profiler_plugin.h"
 #include "ep/adapters.h"
 #include
@@ -91,6 +92,11 @@ class CudaEp : public onnxruntime::ep::adapter::Ep {
   static OrtStatus* ORT_API_CALL GetAvailableResourceImpl(
       const OrtEp* this_ptr, OrtResourceCount* available) noexcept;
 
+#if defined(ENABLE_CUDA_PROFILING)
+  static OrtStatus* ORT_API_CALL CreateProfilerImpl(
+      OrtEp* this_ptr, OrtEpProfilerImpl** profiler) noexcept;
+#endif
+
   /// Helper to parse the graph annotation ID from run options.
   CudaGraphAnnotation_t GetGraphAnnotationId(const OrtRunOptions* run_options) const;
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.cc
new file mode 100644
index 0000000000000..4e8dc039a794c
--- /dev/null
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.cc
@@ -0,0 +1,148 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "cuda_profiler_plugin.h"
+
+#if defined(ENABLE_CUDA_PROFILING)
+
+#include <chrono>
+#include <map>
+#include <vector>
+
+namespace onnxruntime {
+namespace cuda_plugin {
+
+CudaPluginEpProfiler::CudaPluginEpProfiler(const OrtEpApi& api)
+    : OrtEpProfilerImpl{}, ep_api(api) {
+  ort_version_supported = ORT_API_VERSION;
+  Release = ReleaseImpl;
+  StartProfiling = StartProfilingImpl;
+  EndProfiling = EndProfilingImpl;
+  StartEvent = StartEventImpl;
+  StopEvent = StopEventImpl;
+
+  auto& manager = profiling::CUPTIManager::GetInstance();
+  client_handle_ = manager.RegisterClient();
+}
+
+CudaPluginEpProfiler::~CudaPluginEpProfiler() {
+  auto& manager = profiling::CUPTIManager::GetInstance();
+  manager.DeregisterClient(client_handle_);
+}
+
+/*static*/
+void ORT_API_CALL CudaPluginEpProfiler::ReleaseImpl(OrtEpProfilerImpl* this_ptr) noexcept {
+  delete static_cast<CudaPluginEpProfiler*>(this_ptr);
+}
+
+/*static*/
+OrtStatus* ORT_API_CALL CudaPluginEpProfiler::StartProfilingImpl(
+    OrtEpProfilerImpl* this_ptr,
+    int64_t ep_profiling_start_offset_ns) noexcept {
+  EXCEPTION_TO_STATUS_BEGIN
+  auto* self = static_cast<CudaPluginEpProfiler*>(this_ptr);
+
+  self->ep_profiling_start_offset_ns_ = ep_profiling_start_offset_ns;
+  self->start_time_point_ = TimePoint::clock::now();
+
+  // Reconstruct the approximate ORT profiling start time so that GPU event
+  // timestamps (computed by CUPTIManager::Consume) are relative to ORT's start.
+  self->ort_profiling_start_ = self->start_time_point_ -
+                               std::chrono::duration_cast<TimePoint::duration>(
+                                   std::chrono::nanoseconds(ep_profiling_start_offset_ns));
+
+  auto& manager = profiling::CUPTIManager::GetInstance();
+  manager.StartLogging();
+
+  return nullptr;
+  EXCEPTION_TO_STATUS_END
+}
+
+/*static*/
+OrtStatus* ORT_API_CALL CudaPluginEpProfiler::StartEventImpl(
+    OrtEpProfilerImpl* this_ptr,
+    uint64_t ort_event_correlation_id) noexcept {
+  EXCEPTION_TO_STATUS_BEGIN
+  auto* self = static_cast<CudaPluginEpProfiler*>(this_ptr);
+
+  // The bridge provides an absolute epoch-based correlation ID.
Pass TimePoint{}
+  // (epoch) so PushCorrelation adds zero offset and the unique_cid equals the
+  // correlation ID directly. This avoids double-adding the epoch offset that
+  // GPUTracerManager::PushCorrelation normally computes.
+  auto& manager = profiling::CUPTIManager::GetInstance();
+  manager.PushCorrelation(self->client_handle_, ort_event_correlation_id, TimePoint{});
+
+  return nullptr;
+  EXCEPTION_TO_STATUS_END
+}
+
+/*static*/
+OrtStatus* ORT_API_CALL CudaPluginEpProfiler::StopEventImpl(
+    OrtEpProfilerImpl* /*this_ptr*/,
+    uint64_t /*ort_event_correlation_id*/,
+    const OrtProfilingEvent* /*ort_event*/) noexcept {
+  EXCEPTION_TO_STATUS_BEGIN
+
+  auto& manager = profiling::CUPTIManager::GetInstance();
+  manager.PopCorrelation();
+
+  return nullptr;
+  EXCEPTION_TO_STATUS_END
+}
+
+/*static*/
+OrtStatus* ORT_API_CALL CudaPluginEpProfiler::EndProfilingImpl(
+    OrtEpProfilerImpl* this_ptr,
+    OrtProfilingEventsContainer* c_events_container) noexcept {
+  EXCEPTION_TO_STATUS_BEGIN
+  auto* self = static_cast<CudaPluginEpProfiler*>(this_ptr);
+
+  auto& manager = profiling::CUPTIManager::GetInstance();
+
+  // Consume GPU events. Timestamps are computed relative to ort_profiling_start_
+  // by CUPTIManager::ProcessActivityBuffers, so they match ORT's timeline.
+  std::map<uint64_t, profiling::Events> event_map;
+  manager.Consume(self->client_handle_, self->ort_profiling_start_, event_map);
+
+  // Flatten all GPU events and convert to OrtProfilingEvent.
+  std::vector<Ort::ProfilingEvent> events;
+  for (auto& [correlation_id, event_list] : event_map) {
+    for (const auto& record : event_list) {
+      // Build parallel key/value arrays to use the raw-pointer ProfilingEvent
+      // constructor, avoiding a copy from InlinedHashMap to std::unordered_map.
+      InlinedVector<const char*> arg_keys;
+      InlinedVector<const char*> arg_values;
+      arg_keys.reserve(record.args.size());
+      arg_values.reserve(record.args.size());
+      for (const auto& [k, v] : record.args) {
+        arg_keys.push_back(k.c_str());
+        arg_values.push_back(v.c_str());
+      }
+
+      events.emplace_back(
+          OrtProfilingEventCategory_KERNEL,
+          record.pid,
+          record.tid,
+          record.name.c_str(),
+          record.ts,
+          record.dur,
+          arg_keys.data(),
+          arg_values.data(),
+          arg_keys.size());
+    }
+  }
+
+  if (!events.empty()) {
+    Ort::UnownedProfilingEventsContainer events_container(c_events_container);
+    Ort::Status status = events_container.AddEvents(events);
+    return status.release();
+  }
+
+  return nullptr;
+  EXCEPTION_TO_STATUS_END
+}
+
+}  // namespace cuda_plugin
+}  // namespace onnxruntime
+
+#endif  // defined(ENABLE_CUDA_PROFILING)
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h
new file mode 100644
index 0000000000000..061dc583bd00e
--- /dev/null
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h
@@ -0,0 +1,43 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#if defined(ENABLE_CUDA_PROFILING)
+
+#include "cuda_plugin_utils.h"
+#include "cupti_manager.h"
+#include "core/common/gpu_profiler_common.h"
+
+namespace onnxruntime {
+namespace cuda_plugin {
+
+/// Plugin-side implementation of OrtEpProfilerImpl for CUDA.
+/// Delegates to CUPTIManager (within the plugin DLL) for GPU activity tracing
+/// and implements the C callback interface expected by ORT's PluginEpProfiler bridge.
+struct CudaPluginEpProfiler : OrtEpProfilerImpl { + const OrtEpApi& ep_api; + uint64_t client_handle_ = 0; + int64_t ep_profiling_start_offset_ns_ = 0; + TimePoint start_time_point_; + TimePoint ort_profiling_start_; + + explicit CudaPluginEpProfiler(const OrtEpApi& api); + ~CudaPluginEpProfiler(); + + static void ORT_API_CALL ReleaseImpl(OrtEpProfilerImpl* this_ptr) noexcept; + static OrtStatus* ORT_API_CALL StartProfilingImpl(OrtEpProfilerImpl* this_ptr, + int64_t ep_profiling_start_offset_ns) noexcept; + static OrtStatus* ORT_API_CALL StartEventImpl(OrtEpProfilerImpl* this_ptr, + uint64_t ort_event_correlation_id) noexcept; + static OrtStatus* ORT_API_CALL StopEventImpl(OrtEpProfilerImpl* this_ptr, + uint64_t ort_event_correlation_id, + const OrtProfilingEvent* ort_event) noexcept; + static OrtStatus* ORT_API_CALL EndProfilingImpl(OrtEpProfilerImpl* this_ptr, + OrtProfilingEventsContainer* events_container) noexcept; +}; + +} // namespace cuda_plugin +} // namespace onnxruntime + +#endif // defined(ENABLE_CUDA_PROFILING) diff --git a/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py b/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py index 7b05d364309d9..def500450b234 100644 --- a/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py +++ b/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py @@ -1,6 +1,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. +import json import os import tempfile import unittest @@ -2379,6 +2380,100 @@ def test_iobinding_matmul(self): if os.path.exists(model_path): os.remove(model_path) + # ---- Profiling tests ---- + + def _run_profiling_test(self): + """Run a model with session-level profiling enabled and verify the JSON output. + + When CUDA profiling is enabled (ENABLE_CUDA_PROFILING), also verify + that GPU kernel events appear in the profile with expected metadata + (category Kernel, stream, block_x). 
+ """ + target_device = get_cuda_plugin_device() + + with tempfile.NamedTemporaryFile(suffix=".onnx", delete=False) as tmp: + model_path = tmp.name + profile_file = None + try: + create_matmul_model(model_path) + sess_options = _create_session_options() + sess_options.add_provider_for_devices([target_device], {}) + + profile_prefix = os.path.join(tempfile.gettempdir(), "cuda_plugin_ep_profiling_test") + sess_options.enable_profiling = True + sess_options.profile_file_prefix = profile_prefix + + sess = onnxrt.InferenceSession(model_path, sess_options=sess_options) + + assigned_nodes, assignment_info = _get_assigned_nodes(sess, CUDA_PLUGIN_EP_NAME) + self.assertTrue( + assigned_nodes, + f"{CUDA_PLUGIN_EP_NAME} was assigned no nodes. " + f"Assignments: {_format_assignment_summary(assignment_info)}", + ) + + a = np.random.rand(3, 4).astype(np.float32) + b = np.random.rand(4, 5).astype(np.float32) + sess.run(None, {"A": a, "B": b}) + + profile_file = sess.end_profiling() + self.assertTrue(profile_file, "No profile file returned") + self.assertTrue(os.path.exists(profile_file), f"Profile file not found: {profile_file}") + + with open(profile_file, "r") as f: + profile_data = json.load(f) + + self.assertIsInstance(profile_data, list) + self.assertGreater(len(profile_data), 0, "Profile JSON is empty") + + # Every event entry must have standard tracing fields. + required_keys = {"pid", "dur", "ts", "ph", "name", "args"} + for entry in profile_data: + if not isinstance(entry, dict): + continue + if "name" not in entry: + continue + for key in required_keys: + self.assertIn(key, entry, f"Missing '{key}' in profile entry: {entry}") + + # Check for GPU kernel events. These only appear when the build has + # ENABLE_CUDA_PROFILING=ON. The test is written to pass either way: + # without CUDA profiling the events list simply won't contain Kernel + # entries, and the test validates the basic profiling infrastructure. 
+ kernel_events = [ + e for e in profile_data + if isinstance(e, dict) and e.get("cat") == "Kernel" + ] + has_cuda_profiling = len(kernel_events) > 0 + + if has_cuda_profiling: + # Validate GPU kernel event metadata. + for event in kernel_events: + self.assertIn("ts", event) + self.assertIn("dur", event) + self.assertGreaterEqual(event["dur"], 0) + args = event.get("args", {}) + # CUPTI events include stream and block dimensions. + self.assertIn("stream", args, f"GPU kernel event missing 'stream': {event}") + self.assertIn("block_x", args, f"GPU kernel event missing 'block_x': {event}") + else: + # No GPU kernel events — CUDA profiling is likely not enabled. + # The test still validates the basic profiling JSON structure above. + print( + "Note: No GPU Kernel events found in profile. " + "CUDA profiling may not be enabled in this build." + ) + + finally: + if os.path.exists(model_path): + os.remove(model_path) + if profile_file and os.path.exists(profile_file): + os.remove(profile_file) + + def test_session_profiling(self): + """Verify session-level profiling produces valid output with the CUDA Plugin EP.""" + self._run_profiling_test() + if __name__ == "__main__": unittest.main() From bfd9445b953faa40048cdca4c76d47c9adf4bd27 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Thu, 23 Apr 2026 19:07:19 -0700 Subject: [PATCH 2/6] Address feedback --- docs/cuda_plugin_ep/cuda_plugin_ep_design.md | 4 ++-- onnxruntime/core/providers/cuda/plugin/cuda_ep.cc | 5 ++++- .../providers/cuda/plugin/cuda_profiler_plugin.cc | 3 ++- .../providers/cuda/plugin/cuda_profiler_plugin.h | 4 ++-- .../test/python/transformers/test_cuda_plugin_ep.py | 12 +++--------- 5 files changed, 13 insertions(+), 15 deletions(-) diff --git a/docs/cuda_plugin_ep/cuda_plugin_ep_design.md b/docs/cuda_plugin_ep/cuda_plugin_ep_design.md index a7d33ffd62e30..09d117ba2aac1 100644 --- a/docs/cuda_plugin_ep/cuda_plugin_ep_design.md +++ b/docs/cuda_plugin_ep/cuda_plugin_ep_design.md @@ -875,13 +875,13 @@ When 
ORT calls `EndProfiling`: 3. Events are converted to `Ort::ProfilingEvent` instances with `OrtProfilingEventCategory_KERNEL`. 4. Events are appended to the `OrtProfilingEventsContainer` via `AddEvents`. -The plugin does **not** perform the post-hoc merge/sort that the in-tree `GPUProfilerBase::EndProfiling` does. The plugin API is append-only; the `PluginEpProfiler` bridge on the ORT side handles merging EP events into the global event timeline. +The plugin does **not** perform the post-hoc merge/sort that the in-tree `GPUProfilerBase::EndProfiling` does. The plugin API is append-only, and the `PluginEpProfiler` bridge on the ORT side likewise appends EP events to ORT's profiling event collection without merge/sort by timestamp or correlation ID. Any ordering or interleaving into a global timeline is handled by downstream trace consumers. ### 14.5 Design Differences from In-Tree CUDA EP Profiler | Aspect | In-tree CUDA EP | CUDA Plugin EP | |--------|----------------|----------------| -| Event merge | `GPUProfilerBase::MergeEvents` interleaves GPU events into ORT's array (has known sort-order bug) | Append-only; ORT-side bridge merges | +| Event merge | `GPUProfilerBase::MergeEvents` interleaves GPU events into ORT's array (has known sort-order bug) | Append-only; ORT-side bridge appends only, and trace consumers handle ordering | | Correlation IDs | Relative → absolute conversion in `GPUTracerManager::PushCorrelation` | Bridge provides absolute IDs directly; plugin pushes to CUPTI as-is | | `StopEvent` metadata | Ignored (just pops correlation) | ORT event metadata available; currently unused, can annotate GPU events in future | | Singleton scope | Process-wide `CUPTIManager` in main ORT DLL | DLL-local `CUPTIManager` in plugin (process isolation) | diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc index 68ad66a11b1d1..df6ce15f2376d 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc +++ 
b/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc
@@ -668,8 +668,11 @@ OrtStatus* ORT_API_CALL CudaEp::CreateProfilerImpl(
     return Ort::GetApi().CreateStatus(ORT_INVALID_ARGUMENT, "`profiler` must not be null");
   }
 
+  *profiler = nullptr;
+
   auto* ep = static_cast<CudaEp*>(this_ptr);
-  *profiler = new CudaPluginEpProfiler(ep->factory_.GetEpApi());
+  auto profiler_impl = std::make_unique<CudaPluginEpProfiler>(ep->factory_.GetEpApi());
+  *profiler = profiler_impl.release();
   return nullptr;
 
   EXCEPTION_TO_STATUS_END
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.cc
index 4e8dc039a794c..33622a52a6c42 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.cc
@@ -106,7 +106,8 @@ OrtStatus* ORT_API_CALL CudaPluginEpProfiler::EndProfilingImpl(
 
   // Flatten all GPU events and convert to OrtProfilingEvent.
   std::vector<Ort::ProfilingEvent> events;
-  for (auto& [correlation_id, event_list] : event_map) {
+  for (auto& kv : event_map) {
+    auto& event_list = kv.second;
     for (const auto& record : event_list) {
       // Build parallel key/value arrays to use the raw-pointer ProfilingEvent
       // constructor, avoiding a copy from InlinedHashMap to std::unordered_map.
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h index 061dc583bd00e..89e786b462eab 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h @@ -3,7 +3,7 @@ #pragma once -#if defined(ENABLE_CUDA_PROFILING) +#if defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING) #include "cuda_plugin_utils.h" #include "cupti_manager.h" @@ -40,4 +40,4 @@ struct CudaPluginEpProfiler : OrtEpProfilerImpl { } // namespace cuda_plugin } // namespace onnxruntime -#endif // defined(ENABLE_CUDA_PROFILING) +#endif // defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING) diff --git a/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py b/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py index def500450b234..15ff77d29dd07 100644 --- a/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py +++ b/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py @@ -2420,7 +2420,7 @@ def _run_profiling_test(self): self.assertTrue(profile_file, "No profile file returned") self.assertTrue(os.path.exists(profile_file), f"Profile file not found: {profile_file}") - with open(profile_file, "r") as f: + with open(profile_file) as f: profile_data = json.load(f) self.assertIsInstance(profile_data, list) @@ -2440,10 +2440,7 @@ def _run_profiling_test(self): # ENABLE_CUDA_PROFILING=ON. The test is written to pass either way: # without CUDA profiling the events list simply won't contain Kernel # entries, and the test validates the basic profiling infrastructure. 
- kernel_events = [ - e for e in profile_data - if isinstance(e, dict) and e.get("cat") == "Kernel" - ] + kernel_events = [e for e in profile_data if isinstance(e, dict) and e.get("cat") == "Kernel"] has_cuda_profiling = len(kernel_events) > 0 if has_cuda_profiling: @@ -2459,10 +2456,7 @@ def _run_profiling_test(self): else: # No GPU kernel events — CUDA profiling is likely not enabled. # The test still validates the basic profiling JSON structure above. - print( - "Note: No GPU Kernel events found in profile. " - "CUDA profiling may not be enabled in this build." - ) + print("Note: No GPU Kernel events found in profile. CUDA profiling may not be enabled in this build.") finally: if os.path.exists(model_path): From cae16bb0db4b3b6172987fc8d38acca59e267202 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 27 Apr 2026 12:45:22 -0700 Subject: [PATCH 3/6] address review comments --- cmake/onnxruntime_providers_cuda_plugin.cmake | 4 +--- docs/cuda_plugin_ep/cuda_plugin_ep_design.md | 1 + .../core/providers/cuda/cupti_manager.cc | 4 ++-- .../core/providers/cuda/cupti_manager.h | 9 ++------ .../cuda/plugin/cuda_profiler_plugin.h | 4 ++-- .../transformers/test_cuda_plugin_ep.py | 22 ++++++++++++++----- 6 files changed, 24 insertions(+), 20 deletions(-) diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake index 6bb13a923c11e..5275a273e6012 100644 --- a/cmake/onnxruntime_providers_cuda_plugin.cmake +++ b/cmake/onnxruntime_providers_cuda_plugin.cmake @@ -267,9 +267,7 @@ target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE if (onnxruntime_ENABLE_CUDA_PROFILING) target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE CUDA::cupti) - # USE_CUDA is required by cupti_manager.h guards. ENABLE_CUDA_PROFILING activates - # the profiler implementation in cuda_profiler_plugin.cc. 
- target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE USE_CUDA ENABLE_CUDA_PROFILING) + target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE ENABLE_CUDA_PROFILING) endif() # Symbol visibility — only export CreateEpFactories and ReleaseEpFactory diff --git a/docs/cuda_plugin_ep/cuda_plugin_ep_design.md b/docs/cuda_plugin_ep/cuda_plugin_ep_design.md index 09d117ba2aac1..15f8188505b37 100644 --- a/docs/cuda_plugin_ep/cuda_plugin_ep_design.md +++ b/docs/cuda_plugin_ep/cuda_plugin_ep_design.md @@ -884,6 +884,7 @@ The plugin does **not** perform the post-hoc merge/sort that the in-tree `GPUPro | Event merge | `GPUProfilerBase::MergeEvents` interleaves GPU events into ORT's array (has known sort-order bug) | Append-only; ORT-side bridge appends only, and trace consumers handle ordering | | Correlation IDs | Relative → absolute conversion in `GPUTracerManager::PushCorrelation` | Bridge provides absolute IDs directly; plugin pushes to CUPTI as-is | | `StopEvent` metadata | Ignored (just pops correlation) | ORT event metadata available; currently unused, can annotate GPU events in future | +| GPU→ORT event linkage | Implicit via CUPTI external correlation IDs merged into timeline | GPU events carry only CUPTI metadata (`stream`, `grid_*`, `block_*`); no ORT correlation or parent identifier is attached. Downstream consumers must relate GPU kernels to ORT nodes via timestamp proximity. 
This is a known limitation; future work may attach `correlation_id` or parent event name via `StopEvent`'s `OrtProfilingEvent` parameter | | Singleton scope | Process-wide `CUPTIManager` in main ORT DLL | DLL-local `CUPTIManager` in plugin (process isolation) | ### 14.6 Build Configuration diff --git a/onnxruntime/core/providers/cuda/cupti_manager.cc b/onnxruntime/core/providers/cuda/cupti_manager.cc index a2c6daea1e0cd..6ce129bce4fdb 100644 --- a/onnxruntime/core/providers/cuda/cupti_manager.cc +++ b/onnxruntime/core/providers/cuda/cupti_manager.cc @@ -8,7 +8,7 @@ namespace onnxruntime { namespace profiling { -#if defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING) +#if defined(ENABLE_CUDA_PROFILING) static inline std::string GetMemcpyKindString(CUpti_ActivityMemcpyKind kind) { switch (kind) { @@ -179,7 +179,7 @@ void CUPTIAPI CUPTIManager::BufferCompleted(CUcontext, uint32_t, uint8_t* buffer ProfilerActivityBuffer::CreateFromPreallocatedBuffer(std::move(buffer_ptr), valid_size)); } -#endif /* defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING) */ +#endif /* defined(ENABLE_CUDA_PROFILING) */ } // namespace profiling } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/cupti_manager.h b/onnxruntime/core/providers/cuda/cupti_manager.h index cca78dcec5ea5..0977023981ce6 100644 --- a/onnxruntime/core/providers/cuda/cupti_manager.h +++ b/onnxruntime/core/providers/cuda/cupti_manager.h @@ -3,7 +3,7 @@ #pragma once -#if defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING) +#if defined(ENABLE_CUDA_PROFILING) #include #include @@ -11,10 +11,6 @@ #include -// Do not move the check for CUDA_VERSION above #include -// the macros are defined in cupti.h -#if defined(USE_CUDA) - #include "core/common/gpu_profiler_common.h" #include "core/common/inlined_containers.h" @@ -51,5 +47,4 @@ class CUPTIManager : public GPUTracerManager { } /* namespace profiling */ } /* namespace onnxruntime */ -#endif /* #if defined(USE_CUDA) */ -#endif /* #if defined (USE_CUDA) && 
defined(ENABLE_CUDA_PROFILING) */ +#endif /* #if defined(ENABLE_CUDA_PROFILING) */ diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h index 89e786b462eab..061dc583bd00e 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h @@ -3,7 +3,7 @@ #pragma once -#if defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING) +#if defined(ENABLE_CUDA_PROFILING) #include "cuda_plugin_utils.h" #include "cupti_manager.h" @@ -40,4 +40,4 @@ struct CudaPluginEpProfiler : OrtEpProfilerImpl { } // namespace cuda_plugin } // namespace onnxruntime -#endif // defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING) +#endif // defined(ENABLE_CUDA_PROFILING) diff --git a/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py b/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py index 15ff77d29dd07..62bfe1a6852f9 100644 --- a/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py +++ b/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py @@ -2437,13 +2437,23 @@ def _run_profiling_test(self): self.assertIn(key, entry, f"Missing '{key}' in profile entry: {entry}") # Check for GPU kernel events. These only appear when the build has - # ENABLE_CUDA_PROFILING=ON. The test is written to pass either way: - # without CUDA profiling the events list simply won't contain Kernel - # entries, and the test validates the basic profiling infrastructure. + # ENABLE_CUDA_PROFILING=ON. + # + # When the env var ORT_CUDA_PROFILING_ENABLED=1 is set (e.g. by the + # profiling-enabled CI job), we require at least one Kernel event so + # that a broken CUPTI integration cannot ship green. 
kernel_events = [e for e in profile_data if isinstance(e, dict) and e.get("cat") == "Kernel"] - has_cuda_profiling = len(kernel_events) > 0 + expect_cuda_profiling = os.environ.get("ORT_CUDA_PROFILING_ENABLED") == "1" + + if expect_cuda_profiling: + self.assertGreater( + len(kernel_events), + 0, + "ORT_CUDA_PROFILING_ENABLED=1 but no GPU Kernel events found in profile. " + "CUPTI integration may be broken.", + ) - if has_cuda_profiling: + if len(kernel_events) > 0: # Validate GPU kernel event metadata. for event in kernel_events: self.assertIn("ts", event) @@ -2453,7 +2463,7 @@ def _run_profiling_test(self): # CUPTI events include stream and block dimensions. self.assertIn("stream", args, f"GPU kernel event missing 'stream': {event}") self.assertIn("block_x", args, f"GPU kernel event missing 'block_x': {event}") - else: + elif not expect_cuda_profiling: # No GPU kernel events — CUDA profiling is likely not enabled. # The test still validates the basic profiling JSON structure above. print("Note: No GPU Kernel events found in profile. 
CUDA profiling may not be enabled in this build.") From 2ea62ece59a0c624c72fa05ed98ca35beb6827e1 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 27 Apr 2026 13:59:20 -0700 Subject: [PATCH 4/6] Address review comments --- cmake/onnxruntime_providers_cuda_plugin.cmake | 1 + onnxruntime/core/providers/cuda/plugin/cuda_ep.cc | 1 + 2 files changed, 2 insertions(+) diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake index 920fe7850acaf..4825c334f44ea 100644 --- a/cmake/onnxruntime_providers_cuda_plugin.cmake +++ b/cmake/onnxruntime_providers_cuda_plugin.cmake @@ -268,6 +268,7 @@ target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE if (onnxruntime_ENABLE_CUDA_PROFILING) target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE CUDA::cupti) target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE ENABLE_CUDA_PROFILING) +endif() # Default plugin EP version to ORT_VERSION with "-dev" suffix if not explicitly provided. 
if(NOT DEFINED onnxruntime_PLUGIN_EP_VERSION) diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc index df6ce15f2376d..e00b64eb2b9bd 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc @@ -12,6 +12,7 @@ #include #include +#include #include #include #include From d59b1af81191b4135779cb0221ba14940090c56a Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 27 Apr 2026 18:09:58 -0700 Subject: [PATCH 5/6] Modify CI to enable profiling and signal the test to run --- .github/workflows/linux_cuda_plugin_ci.yml | 5 +++- .github/workflows/windows_cuda_plugin.yml | 4 +++ .../transformers/test_cuda_plugin_ep.py | 29 +++++++++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/.github/workflows/linux_cuda_plugin_ci.yml b/.github/workflows/linux_cuda_plugin_ci.yml index 362a4dcc8f2bf..84985819597a2 100644 --- a/.github/workflows/linux_cuda_plugin_ci.yml +++ b/.github/workflows/linux_cuda_plugin_ci.yml @@ -36,6 +36,7 @@ jobs: --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 + --enable_cuda_profiling --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_BUILD_CUDA_EP_AS_PLUGIN=ON python_path_prefix: 'PATH=/opt/python/cp312-cp312/bin:$PATH' @@ -120,7 +121,9 @@ jobs: export ORT_CUDA_PLUGIN_PATH=/build/Release/Release/libonnxruntime_providers_cuda_plugin.so echo \"ORT_CUDA_PLUGIN_PATH=\$ORT_CUDA_PLUGIN_PATH\" ls -la \$ORT_CUDA_PLUGIN_PATH - + # Signal the test that CUPTI profiling is available so it + # asserts GPU kernel events are present in the profile. 
+ export ORT_CUDA_PROFILING_ENABLED=1 cd /onnxruntime_src/onnxruntime/test/python/transformers python test_cuda_plugin_ep.py " diff --git a/.github/workflows/windows_cuda_plugin.yml b/.github/workflows/windows_cuda_plugin.yml index 52219ae8fc071..0b3b3f59dbe13 100644 --- a/.github/workflows/windows_cuda_plugin.yml +++ b/.github/workflows/windows_cuda_plugin.yml @@ -83,6 +83,7 @@ jobs: --skip_tests ` --use_vcpkg ` --use_vcpkg_ms_internal_asset_cache ` + --enable_cuda_profiling ` --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 ` --cmake_extra_defines onnxruntime_BUILD_CUDA_EP_AS_PLUGIN=ON @@ -194,6 +195,9 @@ jobs: Write-Error "CUDA plugin EP library not found at $env:ORT_CUDA_PLUGIN_PATH" exit 1 } + # Signal the test that CUPTI profiling is available so it + # asserts GPU kernel events are present in the profile. + $env:ORT_CUDA_PROFILING_ENABLED = "1" python test_cuda_plugin_ep.py if ($lastExitCode -ne 0) { exit $lastExitCode diff --git a/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py b/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py index 0bc757f7a5fca..c2cd928eabb97 100644 --- a/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py +++ b/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py @@ -2479,6 +2479,35 @@ def _run_profiling_test(self): # CUPTI events include stream and block dimensions. self.assertIn("stream", args, f"GPU kernel event missing 'stream': {event}") self.assertIn("block_x", args, f"GPU kernel event missing 'block_x': {event}") + + # Timeline plausibility: GPU kernel timestamps must fall within + # the session's profiling window. All ts values are in + # microseconds relative to profiling start (epoch 0). Compute + # the window from CPU-side events (Session/Node/Api) and assert + # kernel events fit. This catches timestamp-domain mismatches + # (e.g., missing NormalizeGPUTimestampToCPUEpoch in + # cupti_manager.cc). 
+ cpu_events = [ + e + for e in profile_data + if isinstance(e, dict) and e.get("cat") in ("Session", "Node", "Api") and "ts" in e and "dur" in e + ] + if cpu_events: + session_end_us = max(e["ts"] + e["dur"] for e in cpu_events) + # Allow a small margin for GPU-side clock skew (100ms). + margin_us = 100_000 + for event in kernel_events: + ts = event["ts"] + self.assertGreaterEqual( + ts, + -margin_us, + f"GPU kernel event timestamp before profiling start (domain mismatch?): {event}", + ) + self.assertLessEqual( + ts, + session_end_us + margin_us, + f"GPU kernel event timestamp beyond session end (domain mismatch?): {event}", + ) elif not expect_cuda_profiling: # No GPU kernel events — CUDA profiling is likely not enabled. # The test still validates the basic profiling JSON structure above. From 82c0034609a926eb64a93acf14eb557f56f82010 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Tue, 28 Apr 2026 13:46:44 -0700 Subject: [PATCH 6/6] Fix build problem --- onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h index 061dc583bd00e..0d605a5ef95f6 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h @@ -6,7 +6,7 @@ #if defined(ENABLE_CUDA_PROFILING) #include "cuda_plugin_utils.h" -#include "cupti_manager.h" +#include "core/providers/cuda/cupti_manager.h" #include "core/common/gpu_profiler_common.h" namespace onnxruntime {