diff --git a/.github/workflows/linux_cuda_plugin_ci.yml b/.github/workflows/linux_cuda_plugin_ci.yml index 362a4dcc8f2bf..3b532c486cdfc 100644 --- a/.github/workflows/linux_cuda_plugin_ci.yml +++ b/.github/workflows/linux_cuda_plugin_ci.yml @@ -36,6 +36,7 @@ jobs: --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 + --enable_cuda_profiling --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_BUILD_CUDA_EP_AS_PLUGIN=ON python_path_prefix: 'PATH=/opt/python/cp312-cp312/bin:$PATH' diff --git a/.github/workflows/windows_cuda_plugin.yml b/.github/workflows/windows_cuda_plugin.yml index 52219ae8fc071..07083a5caa08a 100644 --- a/.github/workflows/windows_cuda_plugin.yml +++ b/.github/workflows/windows_cuda_plugin.yml @@ -83,6 +83,7 @@ jobs: --skip_tests ` --use_vcpkg ` --use_vcpkg_ms_internal_asset_cache ` + --enable_cuda_profiling ` --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 ` --cmake_extra_defines onnxruntime_BUILD_CUDA_EP_AS_PLUGIN=ON diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake index 93fe8e0921986..e345c944dccf8 100644 --- a/cmake/onnxruntime_providers_cuda_plugin.cmake +++ b/cmake/onnxruntime_providers_cuda_plugin.cmake @@ -265,6 +265,11 @@ target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE ${PROTOBUF_LIB} ) +if (onnxruntime_ENABLE_CUDA_PROFILING) + target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE CUDA::cupti) + target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE ENABLE_CUDA_PROFILING) +endif() + # Default plugin EP version to ORT_VERSION with "-dev" suffix if not explicitly provided. 
if(NOT DEFINED onnxruntime_PLUGIN_EP_VERSION) set(onnxruntime_PLUGIN_EP_VERSION "${ORT_VERSION}-dev") diff --git a/docs/cuda_plugin_ep/cuda_plugin_ep_design.md b/docs/cuda_plugin_ep/cuda_plugin_ep_design.md index ba7b07b97535e..15f8188505b37 100644 --- a/docs/cuda_plugin_ep/cuda_plugin_ep_design.md +++ b/docs/cuda_plugin_ep/cuda_plugin_ep_design.md @@ -831,29 +831,105 @@ include/onnxruntime/ep/ --- -## 14. Future Work +## 14. Profiling and Observability -1. **Profiling and observability** — ORT's generic plugin EP bridge now supports `OrtEp::CreateProfiler`, but the CUDA plugin EP does not implement that callback yet. Future work should add CUDA-plugin-specific profiler wiring, integrate CUDA/NVTX/CUPTI-based tracing where appropriate, and make plugin execution visible in the same profiling flows users already rely on for the bundled CUDA EP. +The CUDA plugin EP implements the `OrtEpProfilerImpl` interface (introduced in ORT 1.25 via [PR #27649](https://github.com/microsoft/onnxruntime/pull/27649)) to participate in ORT's profiling system. When profiling is enabled, GPU kernel executions (CUDA kernels, memory copies) captured by NVIDIA CUPTI appear alongside ORT's CPU-side events in the profiling output. -2. **Remaining stream/adapter parity for framework-style `Stream*` consumers** — Much of the broad `Stream*` gap has already been addressed: the plugin adapter now provides an `OrtStreamAdapter` / `PluginStreamShim` path for framework-style `Stream*` call sites, FFT is included, and quantization/diffusion kernels are no longer excluded as a class. Remaining work is narrower: +### 14.1 Architecture + +The profiling stack has three layers: + +1. **ORT Core** (`Profiler` in `profiler.cc`) — drives the profiling lifecycle. It calls `PluginExecutionProvider::GetProfiler()`, which invokes `OrtEp::CreateProfiler` on the plugin and wraps the returned `OrtEpProfilerImpl` in a `PluginEpProfiler` bridge. +2. 
**Bridge** (`PluginEpProfiler` in `ep_event_profiling.cc`) — adapts the C++ `EpProfiler` interface to the C `OrtEpProfilerImpl` callbacks. It handles clock synchronization (provides an epoch-independent offset in `StartProfiling`) and converts relative ORT event IDs to absolute epoch-based correlation IDs for `StartEvent`/`StopEvent`. +3. **Plugin-side profiler** (`CudaPluginEpProfiler` in `cuda_profiler_plugin.h/.cc`) — implements `OrtEpProfilerImpl` inside the plugin DLL. Delegates to `CUPTIManager` for GPU activity tracing. + +``` +ORT Profiler + └─ PluginEpProfiler (bridge, in ORT core) + └─ OrtEpProfilerImpl callbacks (C API boundary) + └─ CudaPluginEpProfiler (in plugin DLL) + └─ CUPTIManager singleton (in plugin DLL) + └─ CUPTI activity APIs (GPU tracing) +``` + +### 14.2 CUPTI Integration + +The plugin DLL links `CUDA::cupti` and compiles `cupti_manager.cc` when `onnxruntime_ENABLE_CUDA_PROFILING` is ON. The `CUPTIManager` singleton lives inside the plugin DLL, isolated from any in-tree CUDA EP in the same process. This is the expected isolation model for plugin EPs. + +CUPTI activities enabled: +- `CUPTI_ACTIVITY_KIND_RUNTIME` — CUDA runtime API calls +- `CUPTI_ACTIVITY_KIND_DRIVER` — CUDA driver API calls +- `CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL` — GPU kernel execution +- `CUPTI_ACTIVITY_KIND_MEMCPY` — device memory transfers +- `CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION` — maps GPU activities to ORT event correlation IDs + +### 14.3 Correlation ID Flow + +The plugin API's `StartEvent`/`StopEvent` receive **absolute epoch-based** correlation IDs (converted by the `PluginEpProfiler` bridge from ORT's relative event IDs). These are pushed directly to CUPTI's external correlation stack via `cuptiActivityPushExternalCorrelationId`, allowing CUPTI to tag GPU activities with the corresponding ORT event. When `StopEvent` is called, the correlation ID is popped. 
This matches the pattern used by the in-tree CUDA EP's `GPUTracerManager::PushCorrelation`/`PopCorrelation`. + +### 14.4 Event Collection (EndProfiling) + +When ORT calls `EndProfiling`: +1. CUPTI activity buffers are flushed (`cuptiActivityFlushAll`). +2. GPU activity records are processed — kernel names, timestamps, durations, and stream/grid metadata are extracted. +3. Events are converted to `Ort::ProfilingEvent` instances with `OrtProfilingEventCategory_KERNEL`. +4. Events are appended to the `OrtProfilingEventsContainer` via `AddEvents`. + +The plugin does **not** perform the post-hoc merge/sort that the in-tree `GPUProfilerBase::EndProfiling` does. The plugin API is append-only, and the `PluginEpProfiler` bridge on the ORT side likewise appends EP events to ORT's profiling event collection without merge/sort by timestamp or correlation ID. Any ordering or interleaving into a global timeline is handled by downstream trace consumers. + +### 14.5 Design Differences from In-Tree CUDA EP Profiler + +| Aspect | In-tree CUDA EP | CUDA Plugin EP | +|--------|----------------|----------------| +| Event merge | `GPUProfilerBase::MergeEvents` interleaves GPU events into ORT's array (has known sort-order bug) | Append-only; ORT-side bridge appends only, and trace consumers handle ordering | +| Correlation IDs | Relative → absolute conversion in `GPUTracerManager::PushCorrelation` | Bridge provides absolute IDs directly; plugin pushes to CUPTI as-is | +| `StopEvent` metadata | Ignored (just pops correlation) | ORT event metadata available; currently unused, can annotate GPU events in future | +| GPU→ORT event linkage | Implicit via CUPTI external correlation IDs merged into timeline | GPU events carry only CUPTI metadata (`stream`, `grid_*`, `block_*`); no ORT correlation or parent identifier is attached. Downstream consumers must relate GPU kernels to ORT nodes via timestamp proximity. 
This is a known limitation; future work may attach `correlation_id` or parent event name via `StopEvent`'s `OrtProfilingEvent` parameter | +| Singleton scope | Process-wide `CUPTIManager` in main ORT DLL | DLL-local `CUPTIManager` in plugin (process isolation) | + +### 14.6 Build Configuration + +CUPTI profiling is conditional: +- **CMake flag**: `onnxruntime_ENABLE_CUDA_PROFILING=ON` +- **Compile definition**: `ENABLE_CUDA_PROFILING` added to the plugin target +- **Link**: `CUDA::cupti` linked to `onnxruntime_providers_cuda_plugin` +- **Source**: `cupti_manager.cc` compiled into the plugin + +When profiling is disabled (default), `CudaEp::CreateProfiler` is set to `nullptr` and no CUPTI code is compiled. + +### 14.7 Files + +| File | Role | +|------|------| +| `plugin/cuda_profiler_plugin.h` | `CudaPluginEpProfiler` struct definition | +| `plugin/cuda_profiler_plugin.cc` | Profiler callback implementations | +| `plugin/cuda_ep.h` | `CreateProfilerImpl` declaration | +| `plugin/cuda_ep.cc` | `CreateProfiler` callback wiring | +| `cmake/onnxruntime_providers_cuda_plugin.cmake` | Conditional CUPTI linkage | + +--- + +## 15. Future Work + +1. **Remaining stream/adapter parity for framework-style `Stream*` consumers** — Much of the broad `Stream*` gap has already been addressed: the plugin adapter now provides an `OrtStreamAdapter` / `PluginStreamShim` path for framework-style `Stream*` call sites, FFT is included, and quantization/diffusion kernels are no longer excluded as a class. Remaining work is narrower: - Continue using `Stream(context)` / `GetOrtStream(context)` patterns for migrated kernels rather than adding raw-stream-only forks. - Audit still-excluded directories that require more than a stream handle: `contrib_ops/cuda/llm/*`, `contrib_ops/cuda/transformers/*`, and `contrib_ops/cuda/collective/*`. - For each re-inclusion pass, add or extend focused plugin tests before removing the CMake exclusion. -3. **Contrib LLM migration pass** — Still open. 
The core CUDA LLM attention path is now adapter-safe, but `contrib_ops/cuda/llm/*` remains excluded in `cmake/onnxruntime_providers_cuda_plugin.cmake`. The remaining work is a dedicated contrib-LLM adapter pass: resolve any plugin build failures under `ORT_USE_EP_API_ADAPTERS`, keep the normal stream/scratch-buffer helpers, remove the `contrib_ops/cuda/llm/*` CMake filters, and add focused tests or parity-report coverage for the first re-included kernels. +2. **Contrib LLM migration pass** — Still open. The core CUDA LLM attention path is now adapter-safe, but `contrib_ops/cuda/llm/*` remains excluded in `cmake/onnxruntime_providers_cuda_plugin.cmake`. The remaining work is a dedicated contrib-LLM adapter pass: resolve any plugin build failures under `ORT_USE_EP_API_ADAPTERS`, keep the normal stream/scratch-buffer helpers, remove the `contrib_ops/cuda/llm/*` CMake filters, and add focused tests or parity-report coverage for the first re-included kernels. -4. **Tunable ops** — Implement a plugin-side `ITuningContext` and remove the `ORT_USE_EP_API_ADAPTERS` guards in `matmul.cc`/`gemm.cc` so the plugin can recover runtime kernel selection and profiling-based tuning behavior. +3. **Tunable ops** — Implement a plugin-side `ITuningContext` and remove the `ORT_USE_EP_API_ADAPTERS` guards in `matmul.cc`/`gemm.cc` so the plugin can recover runtime kernel selection and profiling-based tuning behavior. -5. **TensorSeq and additional C API coverage** — Add enough sequence/tensor-sequence support to unblock `sequence_op.cc` (the last remaining TensorSeq-dependent file), and extend the ORT C API where needed for remaining framework-style attribute accessors such as string-array attributes used by RNN kernels. Note: `identity_op.cc` is now included in the plugin build — its TensorSeq code path is guarded by `#ifndef BUILD_CUDA_EP_AS_PLUGIN` and opset 14+ registrations use `AllFixedSizeTensorTypes()` (Tensor-only) instead of `AllFixedSizeTensorAndSequenceTensorTypes()`. +4. 
**TensorSeq and additional C API coverage** — Add enough sequence/tensor-sequence support to unblock `sequence_op.cc` (the last remaining TensorSeq-dependent file), and extend the ORT C API where needed for remaining framework-style attribute accessors such as string-array attributes used by RNN kernels. Note: `identity_op.cc` is now included in the plugin build — its TensorSeq code path is guarded by `#ifndef BUILD_CUDA_EP_AS_PLUGIN` and opset 14+ registrations use `AllFixedSizeTensorTypes()` (Tensor-only) instead of `AllFixedSizeTensorAndSequenceTensorTypes()`. -6. **Remaining contrib exclusions** — Remaining contrib exclusions are: `shrunken_gather.cc` (training), `transformers/*` (subgraph), `aten_ops/*` (ATen), `collective/*` (NCCL), and `llm/*` (contrib LLM pass). +5. **Remaining contrib exclusions** — Remaining contrib exclusions are: `shrunken_gather.cc` (training), `transformers/*` (subgraph), `aten_ops/*` (ATen), `collective/*` (NCCL), and `llm/*` (contrib LLM pass). -7. **CI integration and targeted benchmarking** — Partially complete. Basic CUDA plugin build + `test_cuda_plugin_ep.py` coverage now exists in Linux and Windows plugin CI workflows. Remaining work is perf-oriented and feature-specific validation: add targeted benchmarks or perf gates for graph replay and allocator behavior, and extend CI once profiling and tunable-op support land. +6. **CI integration and targeted benchmarking** — Partially complete. Basic CUDA plugin build + `test_cuda_plugin_ep.py` coverage now exists in Linux and Windows plugin CI workflows. Remaining work is perf-oriented and feature-specific validation: add targeted benchmarks or perf gates for graph replay and allocator behavior, and extend CI once profiling and tunable-op support land. -8. **NHWC cleanup and hardening** — Partially complete. Runtime NHWC callbacks, second-pass capability handling for pre-assigned NHWC nodes, cached provider-config access, and focused Conv/BatchNormalization/Pool tests are in place. 
Remaining work is the cleanup described in [Section 5.3.1](#531-nhwc-layout-transformation-support): unify the conversion allowlist with the bundled CUDA EP, improve internal-domain kernel-miss diagnostics, and add stronger structural assertions that plugin-backed NHWC execution was actually selected. +7. **NHWC cleanup and hardening** — Partially complete. Runtime NHWC callbacks, second-pass capability handling for pre-assigned NHWC nodes, cached provider-config access, and focused Conv/BatchNormalization/Pool tests are in place. Remaining work is the cleanup described in [Section 5.3.1](#531-nhwc-layout-transformation-support): unify the conversion allowlist with the bundled CUDA EP, improve internal-domain kernel-miss diagnostics, and add stronger structural assertions that plugin-backed NHWC execution was actually selected. -9. **OpSchema-validated kernel registration after PR #27713** — PR #27713 has already landed, so the `OrtEpApi` and C++ wrappers for querying ONNX operator schemas are available (see [Section 3.5.1](#351-type-constraint-names-and-opschema-access)). The remaining work is plugin-side adoption: +8. **OpSchema-validated kernel registration after PR #27713** — PR #27713 has already landed, so the `OrtEpApi` and C++ wrappers for querying ONNX operator schemas are available (see [Section 3.5.1](#351-type-constraint-names-and-opschema-access)). The remaining work is plugin-side adoption: **A. Registration-time validation pass** @@ -881,7 +957,7 @@ include/onnxruntime/ep/ | `cuda_ep.cc` / `GetCapabilityImpl()` | (Optional) Add schema-based diagnostic when `EpGraphSupportInfo_LookUpKernel` returns nullptr | | `test_cuda_plugin_ep.py` | Add a validation stage that exercises schema-validated registration | -10. **Resource accounting and annotation-based partitioning after PR #27595** — PR #27595 has already landed, so ORT now has framework-side resource accounting and layering annotations. 
The remaining CUDA plugin work is to bridge those capabilities through the plugin EP API and plugin capability implementation. +9. **Resource accounting and annotation-based partitioning after PR #27595** — PR #27595 has already landed, so ORT now has framework-side resource accounting and layering annotations. The remaining CUDA plugin work is to bridge those capabilities through the plugin EP API and plugin capability implementation. **A. Resource accounting** diff --git a/include/onnxruntime/core/common/gpu_profiler_common.h b/include/onnxruntime/core/common/gpu_profiler_common.h index df50f558ed8b9..8a86039b19680 100644 --- a/include/onnxruntime/core/common/gpu_profiler_common.h +++ b/include/onnxruntime/core/common/gpu_profiler_common.h @@ -122,6 +122,10 @@ class GPUTracerManager { tracing_enabled_ = this_as_derived->OnStartLogging(); } + bool IsTracingEnabled() const noexcept { + return tracing_enabled_; + } + void Consume(uint64_t client_handle, const TimePoint& start_time, std::map& events) { auto this_as_derived = static_cast(this); events.clear(); @@ -442,11 +446,16 @@ class GPUProfilerBase : public EpProfiler { TimePoint profiling_start_time_; public: - virtual bool StartProfiling(TimePoint profiling_start_time) override { + virtual Status StartProfiling(TimePoint profiling_start_time) override { auto& manager = TManager::GetInstance(); manager.StartLogging(); profiling_start_time_ = profiling_start_time; - return true; + if (!manager.IsTracingEnabled()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "GPU activity tracing failed to start. 
" + "The tracing library may be unavailable or blocked by system policy."); + } + return Status::OK(); } virtual void EndProfiling(TimePoint start_time, Events& events) override { diff --git a/include/onnxruntime/core/common/profiler_common.h b/include/onnxruntime/core/common/profiler_common.h index 236249253fb39..48a75731eb5e1 100644 --- a/include/onnxruntime/core/common/profiler_common.h +++ b/include/onnxruntime/core/common/profiler_common.h @@ -87,8 +87,11 @@ class EpProfiler { /// Allows EP profiler to initialize profiling utilities and record the profiling start time. /// /// Timepoint denoting the start of profiling. - /// True if profiling was started successfully. - virtual bool StartProfiling(TimePoint profiling_start_time) = 0; + /// Status::OK() if profiling was started successfully, or an error Status + /// describing why profiling could not start (e.g., CUPTI unavailable, tracing blocked by policy). + /// Callers should not treat a failed start as fatal — the session can still execute without + /// profiling, but the status should be surfaced for diagnostic purposes. + virtual Status StartProfiling(TimePoint profiling_start_time) = 0; /// /// Called when profiling ends to collect the EP's new profiling events since the last call to StartProfiling. 
diff --git a/onnxruntime/core/common/profiler.cc b/onnxruntime/core/common/profiler.cc index 19508723f6cf0..43bb45b898191 100644 --- a/onnxruntime/core/common/profiler.cc +++ b/onnxruntime/core/common/profiler.cc @@ -49,7 +49,10 @@ void Profiler::StartProfiling(const logging::Logger* custom_logger) { custom_logger_ = custom_logger; profiling_start_time_ = std::chrono::high_resolution_clock::now(); for (const auto& ep_profiler : ep_profilers_) { - ep_profiler->StartProfiling(profiling_start_time_); + auto status = ep_profiler->StartProfiling(profiling_start_time_); + if (!status.IsOK() && ep_start_profiling_status_.IsOK()) { + ep_start_profiling_status_ = status; + } } } @@ -62,7 +65,10 @@ void Profiler::StartProfiling(const std::basic_string& file_name) { profile_stream_file_ = ToUTF8String(file_name); profiling_start_time_ = std::chrono::high_resolution_clock::now(); for (const auto& ep_profiler : ep_profilers_) { - ep_profiler->StartProfiling(profiling_start_time_); + auto status = ep_profiler->StartProfiling(profiling_start_time_); + if (!status.IsOK() && ep_start_profiling_status_.IsOK()) { + ep_start_profiling_status_ = status; + } } } diff --git a/onnxruntime/core/common/profiler.h b/onnxruntime/core/common/profiler.h index fe23593bdc2d0..5d4c0fc3e5070 100644 --- a/onnxruntime/core/common/profiler.h +++ b/onnxruntime/core/common/profiler.h @@ -114,11 +114,22 @@ class Profiler { if (ep_profiler) { ep_profilers_.push_back(std::move(ep_profiler)); if (enabled_) { - ep_profilers_.back()->StartProfiling(profiling_start_time_); + auto status = ep_profilers_.back()->StartProfiling(profiling_start_time_); + if (!status.IsOK() && ep_start_profiling_status_.IsOK()) { + ep_start_profiling_status_ = status; + } } } } + /// Returns the aggregate status from calling StartProfiling on EP profilers. + /// OK if all EP profilers started successfully (or if none are registered). + /// Returns the first error status encountered otherwise. 
+ const Status& GetEpProfilingStatus() const { return ep_start_profiling_status_; } + + /// Returns true if at least one EP profiler was registered. + bool HasEpProfilers() const { return !ep_profilers_.empty(); } + private: ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Profiler); @@ -156,6 +167,10 @@ class Profiler { #endif std::vector> ep_profilers_; + + // Aggregate status from EP profiler StartProfiling calls. + // Stores the first error encountered. + Status ep_start_profiling_status_; }; } // namespace profiling diff --git a/onnxruntime/core/providers/cuda/cuda_profiler.h b/onnxruntime/core/providers/cuda/cuda_profiler.h index cc67bb8a0f285..be26d1425fdb1 100644 --- a/onnxruntime/core/providers/cuda/cuda_profiler.h +++ b/onnxruntime/core/providers/cuda/cuda_profiler.h @@ -32,7 +32,7 @@ class CudaProfiler final : public EpProfiler { CudaProfiler() = default; ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(CudaProfiler); ~CudaProfiler() {} - bool StartProfiling(TimePoint) override { return true; } + Status StartProfiling(TimePoint) override { return Status::OK(); } void EndProfiling(TimePoint, Events&) override {} void Start(uint64_t) override {} void Stop(uint64_t, const EventRecord&) override {} diff --git a/onnxruntime/core/providers/cuda/cupti_manager.cc b/onnxruntime/core/providers/cuda/cupti_manager.cc index a2c6daea1e0cd..6ce129bce4fdb 100644 --- a/onnxruntime/core/providers/cuda/cupti_manager.cc +++ b/onnxruntime/core/providers/cuda/cupti_manager.cc @@ -8,7 +8,7 @@ namespace onnxruntime { namespace profiling { -#if defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING) +#if defined(ENABLE_CUDA_PROFILING) static inline std::string GetMemcpyKindString(CUpti_ActivityMemcpyKind kind) { switch (kind) { @@ -179,7 +179,7 @@ void CUPTIAPI CUPTIManager::BufferCompleted(CUcontext, uint32_t, uint8_t* buffer ProfilerActivityBuffer::CreateFromPreallocatedBuffer(std::move(buffer_ptr), valid_size)); } -#endif /* defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING) */ +#endif /* 
defined(ENABLE_CUDA_PROFILING) */ } // namespace profiling } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/cupti_manager.h b/onnxruntime/core/providers/cuda/cupti_manager.h index cca78dcec5ea5..0977023981ce6 100644 --- a/onnxruntime/core/providers/cuda/cupti_manager.h +++ b/onnxruntime/core/providers/cuda/cupti_manager.h @@ -3,7 +3,7 @@ #pragma once -#if defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING) +#if defined(ENABLE_CUDA_PROFILING) #include #include @@ -11,10 +11,6 @@ #include -// Do not move the check for CUDA_VERSION above #include -// the macros are defined in cupti.h -#if defined(USE_CUDA) - #include "core/common/gpu_profiler_common.h" #include "core/common/inlined_containers.h" @@ -51,5 +47,4 @@ class CUPTIManager : public GPUTracerManager { } /* namespace profiling */ } /* namespace onnxruntime */ -#endif /* #if defined(USE_CUDA) */ -#endif /* #if defined (USE_CUDA) && defined(ENABLE_CUDA_PROFILING) */ +#endif /* #if defined(ENABLE_CUDA_PROFILING) */ diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc index 7c2970c468216..e00b64eb2b9bd 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -134,6 +135,13 @@ CudaEp::CudaEp(CudaEpFactory& factory, const Config& config, const OrtLogger& lo // Resource accounting — allows ORT to query available device memory for budget enforcement GetAvailableResource = GetAvailableResourceImpl; + // Profiling — CUPTI-based GPU activity tracing when profiling is enabled at build time +#if defined(ENABLE_CUDA_PROFILING) + CreateProfiler = CreateProfilerImpl; +#else + CreateProfiler = nullptr; +#endif + const OrtApi& ort_api = factory_.GetOrtApi(); Ort::Status log_status(ort_api.Logger_LogMessage(&logger_, ORT_LOGGING_LEVEL_INFO, "CUDA Plugin EP created", @@ -651,5 +659,26 @@ OrtStatus* ORT_API_CALL 
CudaEp::GetAvailableResourceImpl( EXCEPTION_TO_STATUS_END } +#if defined(ENABLE_CUDA_PROFILING) +/*static*/ +OrtStatus* ORT_API_CALL CudaEp::CreateProfilerImpl( + OrtEp* this_ptr, OrtEpProfilerImpl** profiler) noexcept { + EXCEPTION_TO_STATUS_BEGIN + + if (profiler == nullptr) { + return Ort::GetApi().CreateStatus(ORT_INVALID_ARGUMENT, "`profiler` must not be null"); + } + + *profiler = nullptr; + + auto* ep = static_cast<CudaEp*>(this_ptr); + auto profiler_impl = std::make_unique<CudaPluginEpProfiler>(ep->factory_.GetEpApi()); + *profiler = profiler_impl.release(); + return nullptr; + + EXCEPTION_TO_STATUS_END +} +#endif // defined(ENABLE_CUDA_PROFILING) + } // namespace cuda_plugin } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep.h b/onnxruntime/core/providers/cuda/plugin/cuda_ep.h index 502902c53070b..faaeebf9ceae0 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_ep.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep.h @@ -5,6 +5,7 @@ #include "cuda_plugin_utils.h" #include "cuda_graph_plugin.h" +#include "cuda_profiler_plugin.h" #include "ep/adapters.h" #include @@ -91,6 +92,11 @@ class CudaEp : public onnxruntime::ep::adapter::Ep { static OrtStatus* ORT_API_CALL GetAvailableResourceImpl( const OrtEp* this_ptr, OrtResourceCount* available) noexcept; +#if defined(ENABLE_CUDA_PROFILING) + static OrtStatus* ORT_API_CALL CreateProfilerImpl( + OrtEp* this_ptr, OrtEpProfilerImpl** profiler) noexcept; +#endif + /// Helper to parse the graph annotation ID from run options. CudaGraphAnnotation_t GetGraphAnnotationId(const OrtRunOptions* run_options) const; diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.cc new file mode 100644 index 0000000000000..b7d3c656112cb --- /dev/null +++ b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.cc @@ -0,0 +1,158 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License.
+ +#include "cuda_profiler_plugin.h" + +#if defined(ENABLE_CUDA_PROFILING) + +#include +#include +#include + +namespace onnxruntime { +namespace cuda_plugin { + +CudaPluginEpProfiler::CudaPluginEpProfiler(const OrtEpApi& api) + : OrtEpProfilerImpl{}, ep_api(api) { + ort_version_supported = ORT_API_VERSION; + Release = ReleaseImpl; + StartProfiling = StartProfilingImpl; + EndProfiling = EndProfilingImpl; + StartEvent = StartEventImpl; + StopEvent = StopEventImpl; + + auto& manager = profiling::CUPTIManager::GetInstance(); + client_handle_ = manager.RegisterClient(); +} + +CudaPluginEpProfiler::~CudaPluginEpProfiler() { + auto& manager = profiling::CUPTIManager::GetInstance(); + manager.DeregisterClient(client_handle_); +} + +/*static*/ +void ORT_API_CALL CudaPluginEpProfiler::ReleaseImpl(OrtEpProfilerImpl* this_ptr) noexcept { + delete static_cast<CudaPluginEpProfiler*>(this_ptr); +} + +/*static*/ +OrtStatus* ORT_API_CALL CudaPluginEpProfiler::StartProfilingImpl( + OrtEpProfilerImpl* this_ptr, + int64_t ep_profiling_start_offset_ns) noexcept { + EXCEPTION_TO_STATUS_BEGIN + auto* self = static_cast<CudaPluginEpProfiler*>(this_ptr); + + auto now = TimePoint::clock::now(); + + // Reconstruct the approximate ORT profiling start time so that GPU event + // timestamps (computed by CUPTIManager::Consume) are relative to ORT's start. + // The result equals (ORT's profiling start) + (cross-DLL call latency), which + // is typically < 1 µs — acceptable for profiling-level accuracy. + self->ort_profiling_start_ = now - + std::chrono::duration_cast<TimePoint::duration>( + std::chrono::nanoseconds(ep_profiling_start_offset_ns)); + + auto& manager = profiling::CUPTIManager::GetInstance(); + manager.StartLogging(); + + if (!manager.IsTracingEnabled()) { + return Ort::GetApi().CreateStatus( + ORT_EP_FAIL, + "CUPTI activity tracing failed to start. " + "GPU kernel events will not be available in the profile. " +        "Check that the CUDA driver supports CUPTI and the CUPTI library is accessible."); + } + + return nullptr; + EXCEPTION_TO_STATUS_END +} + +/*static*/ +OrtStatus* ORT_API_CALL CudaPluginEpProfiler::StartEventImpl( + OrtEpProfilerImpl* this_ptr, + uint64_t ort_event_correlation_id) noexcept { + EXCEPTION_TO_STATUS_BEGIN + auto* self = static_cast<CudaPluginEpProfiler*>(this_ptr); + + // The bridge provides an absolute epoch-based correlation ID. Pass TimePoint{} + // (epoch) so PushCorrelation adds zero offset and the unique_cid equals the + // correlation ID directly. This avoids double-adding the epoch offset that + // GPUTracerManager::PushCorrelation normally computes. + auto& manager = profiling::CUPTIManager::GetInstance(); + manager.PushCorrelation(self->client_handle_, ort_event_correlation_id, TimePoint{}); + + return nullptr; + EXCEPTION_TO_STATUS_END +} + +/*static*/ +OrtStatus* ORT_API_CALL CudaPluginEpProfiler::StopEventImpl( + OrtEpProfilerImpl* /*this_ptr*/, + uint64_t /*ort_event_correlation_id*/, + const OrtProfilingEvent* /*ort_event*/) noexcept { + EXCEPTION_TO_STATUS_BEGIN + + auto& manager = profiling::CUPTIManager::GetInstance(); + manager.PopCorrelation(); + + return nullptr; + EXCEPTION_TO_STATUS_END +} + +/*static*/ +OrtStatus* ORT_API_CALL CudaPluginEpProfiler::EndProfilingImpl( + OrtEpProfilerImpl* this_ptr, + OrtProfilingEventsContainer* c_events_container) noexcept { + EXCEPTION_TO_STATUS_BEGIN + auto* self = static_cast<CudaPluginEpProfiler*>(this_ptr); + + auto& manager = profiling::CUPTIManager::GetInstance(); + + // Consume GPU events. Timestamps are computed relative to ort_profiling_start_ + // by CUPTIManager::ProcessActivityBuffers, so they match ORT's timeline. + std::map<uint64_t, profiling::Events> event_map; + manager.Consume(self->client_handle_, self->ort_profiling_start_, event_map); + + // Flatten all GPU events and convert to OrtProfilingEvent. 
+ std::vector<Ort::ProfilingEvent> events; + for (auto& kv : event_map) { + auto& event_list = kv.second; + for (const auto& record : event_list) { + // Build parallel key/value arrays to use the raw-pointer ProfilingEvent + // constructor, avoiding a copy from InlinedHashMap to std::unordered_map. + InlinedVector<const char*> arg_keys; + InlinedVector<const char*> arg_values; + arg_keys.reserve(record.args.size()); + arg_values.reserve(record.args.size()); + for (const auto& [k, v] : record.args) { + arg_keys.push_back(k.c_str()); + arg_values.push_back(v.c_str()); + } + + events.emplace_back( + OrtProfilingEventCategory_KERNEL, + record.pid, + record.tid, + record.name.c_str(), + record.ts, + record.dur, + arg_keys.data(), + arg_values.data(), + arg_keys.size()); + } + } + + if (!events.empty()) { + Ort::UnownedProfilingEventsContainer events_container(c_events_container); + Ort::Status status = events_container.AddEvents(events); + return status.release(); + } + + return nullptr; + EXCEPTION_TO_STATUS_END +} + +} // namespace cuda_plugin +} // namespace onnxruntime + +#endif // defined(ENABLE_CUDA_PROFILING) diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h new file mode 100644 index 0000000000000..77460a40341ac --- /dev/null +++ b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h @@ -0,0 +1,41 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#if defined(ENABLE_CUDA_PROFILING) + +#include "cuda_plugin_utils.h" +#include "core/providers/cuda/cupti_manager.h" +#include "core/common/gpu_profiler_common.h" + +namespace onnxruntime { +namespace cuda_plugin { + +/// Plugin-side implementation of OrtEpProfilerImpl for CUDA. +/// Delegates to CUPTIManager (within the plugin DLL) for GPU activity tracing +/// and implements the C callback interface expected by ORT's PluginEpProfiler bridge.
+struct CudaPluginEpProfiler : OrtEpProfilerImpl { + const OrtEpApi& ep_api; + uint64_t client_handle_ = 0; + TimePoint ort_profiling_start_; + + explicit CudaPluginEpProfiler(const OrtEpApi& api); + ~CudaPluginEpProfiler(); + + static void ORT_API_CALL ReleaseImpl(OrtEpProfilerImpl* this_ptr) noexcept; + static OrtStatus* ORT_API_CALL StartProfilingImpl(OrtEpProfilerImpl* this_ptr, + int64_t ep_profiling_start_offset_ns) noexcept; + static OrtStatus* ORT_API_CALL StartEventImpl(OrtEpProfilerImpl* this_ptr, + uint64_t ort_event_correlation_id) noexcept; + static OrtStatus* ORT_API_CALL StopEventImpl(OrtEpProfilerImpl* this_ptr, + uint64_t ort_event_correlation_id, + const OrtProfilingEvent* ort_event) noexcept; + static OrtStatus* ORT_API_CALL EndProfilingImpl(OrtEpProfilerImpl* this_ptr, + OrtProfilingEventsContainer* events_container) noexcept; +}; + +} // namespace cuda_plugin +} // namespace onnxruntime + +#endif // defined(ENABLE_CUDA_PROFILING) diff --git a/onnxruntime/core/providers/vitisai/vitisai_profiler.cc b/onnxruntime/core/providers/vitisai/vitisai_profiler.cc index 2f6255f79441a..cd59d9427115f 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_profiler.cc +++ b/onnxruntime/core/providers/vitisai/vitisai_profiler.cc @@ -10,12 +10,12 @@ namespace profiling { #if defined(USE_VITISAI) -bool VitisaiProfiler::StartProfiling(TimePoint tp) { +Status VitisaiProfiler::StartProfiling(TimePoint tp) { // Notify VAIP EP that profiling has started with base timestamp profiler_start(std::chrono::duration_cast( tp.time_since_epoch()) .count()); - return true; + return Status::OK(); } void VitisaiProfiler::EndProfiling(TimePoint tp, Events& events) { diff --git a/onnxruntime/core/providers/vitisai/vitisai_profiler.h b/onnxruntime/core/providers/vitisai/vitisai_profiler.h index 35e97011cb95c..7c72ae8a8eaef 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_profiler.h +++ b/onnxruntime/core/providers/vitisai/vitisai_profiler.h @@ -12,7 +12,7 @@ class 
VitisaiProfiler final : public EpProfiler { VitisaiProfiler() = default; ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(VitisaiProfiler); ~VitisaiProfiler() {} - bool StartProfiling(TimePoint) override; + Status StartProfiling(TimePoint) override; void EndProfiling(TimePoint, Events&) override; void Start(uint64_t) override {} void Stop(uint64_t, const EventRecord&) override {} diff --git a/onnxruntime/core/providers/webgpu/webgpu_profiler.cc b/onnxruntime/core/providers/webgpu/webgpu_profiler.cc index 6e826218464cf..146deb5fe7bba 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_profiler.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_profiler.cc @@ -11,9 +11,9 @@ namespace webgpu { WebGpuProfiler::WebGpuProfiler(WebGpuContext& context) : context_{context} {} -bool WebGpuProfiler::StartProfiling(TimePoint) { +Status WebGpuProfiler::StartProfiling(TimePoint) { enabled_ = true; - return true; + return Status::OK(); } void WebGpuProfiler::EndProfiling(TimePoint tp, onnxruntime::profiling::Events& events) { diff --git a/onnxruntime/core/providers/webgpu/webgpu_profiler.h b/onnxruntime/core/providers/webgpu/webgpu_profiler.h index 2b530563b2c5c..25becb827f3f3 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_profiler.h +++ b/onnxruntime/core/providers/webgpu/webgpu_profiler.h @@ -15,7 +15,7 @@ class WebGpuProfiler final : public onnxruntime::profiling::EpProfiler { WebGpuProfiler(WebGpuContext& context); ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(WebGpuProfiler); ~WebGpuProfiler() {} - bool StartProfiling(TimePoint) override; + Status StartProfiling(TimePoint) override; void EndProfiling(TimePoint, onnxruntime::profiling::Events&) override; void Start(uint64_t) override { } diff --git a/onnxruntime/core/session/plugin_ep/ep_event_profiling.cc b/onnxruntime/core/session/plugin_ep/ep_event_profiling.cc index e6538de2546f0..5cfe777681078 100644 --- a/onnxruntime/core/session/plugin_ep/ep_event_profiling.cc +++ b/onnxruntime/core/session/plugin_ep/ep_event_profiling.cc @@ 
-47,7 +47,7 @@ PluginEpProfiler::~PluginEpProfiler() { profiler_impl_.Release(&profiler_impl_); } -bool PluginEpProfiler::StartProfiling(TimePoint profiling_start_time) { +Status PluginEpProfiler::StartProfiling(TimePoint profiling_start_time) { // Store the epoch-based profiling start time for computing absolute correlation IDs in Start()/Stop(). profiling_start_time_epoch_us_ = static_cast( std::chrono::duration_cast(profiling_start_time.time_since_epoch()).count()); @@ -56,17 +56,15 @@ bool PluginEpProfiler::StartProfiling(TimePoint profiling_start_time) { int64_t offset_ns = std::chrono::duration_cast( std::chrono::high_resolution_clock::now() - profiling_start_time) .count(); - bool success = true; + Status status = ToStatusAndRelease(profiler_impl_.StartProfiling(&profiler_impl_, offset_ns)); if (!status.IsOK()) { - // Log error but don't throw as profiling failures shouldn't break execution. LOGS(logger_, ERROR) << "OrtEpProfilerImpl::StartProfiling() for " << ep_name_ << " returned an error OrtStatus: " << status.ErrorMessage(); - success = false; } - return success; + return status; } void PluginEpProfiler::EndProfiling(TimePoint /*profiling_start_time*/, profiling::Events& events) { diff --git a/onnxruntime/core/session/plugin_ep/ep_event_profiling.h b/onnxruntime/core/session/plugin_ep/ep_event_profiling.h index 8147162dcb56e..b7d8bdcb37c9c 100644 --- a/onnxruntime/core/session/plugin_ep/ep_event_profiling.h +++ b/onnxruntime/core/session/plugin_ep/ep_event_profiling.h @@ -53,7 +53,7 @@ class PluginEpProfiler final : public profiling::EpProfiler { ~PluginEpProfiler() override; ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(PluginEpProfiler); - bool StartProfiling(TimePoint profiling_start_time) override; + Status StartProfiling(TimePoint profiling_start_time) override; void EndProfiling(TimePoint start_time, profiling::Events& events) override; void Start(uint64_t relative_ort_event_id) override; diff --git 
a/onnxruntime/core/session/plugin_ep/ep_kernel_registration.cc b/onnxruntime/core/session/plugin_ep/ep_kernel_registration.cc index 6f29361502a73..e28d9a4b4753b 100644 --- a/onnxruntime/core/session/plugin_ep/ep_kernel_registration.cc +++ b/onnxruntime/core/session/plugin_ep/ep_kernel_registration.cc @@ -207,6 +207,23 @@ class PluginEpOpKernel final : public controlflow::IControlFlowKernel { Status PluginEpOpKernel::Create(FuncManager& /*fn_manager*/, const OpKernelInfo& info, OrtKernelCreateFunc kernel_create_func, void* kernel_create_func_state, /*out*/ std::unique_ptr& op_kernel) { + const auto* ep = info.GetExecutionProvider(); + ORT_ENFORCE(ep != nullptr, "IExecutionProvider* retrieved from OpKernelInfo should never be nullptr"); + const auto* ort_ep = ep->GetOrtEp(); + ORT_ENFORCE(ort_ep != nullptr, "GetOrtEp() returned nullptr for EP '", ep->Type(), "'"); + + // Sanity-check the OrtEp to detect corruption early (e.g., garbage pointer from vtable mismatch). + ORT_ENFORCE(ort_ep->ort_version_supported > 0 && ort_ep->ort_version_supported <= ORT_API_VERSION, + "OrtEp for '", ep->Type(), "' has invalid ort_version_supported=", ort_ep->ort_version_supported, + " (expected 1..", ORT_API_VERSION, "). Possible pointer corruption or stale plugin DLL."); + ORT_ENFORCE(ort_ep->GetName != nullptr, + "OrtEp for '", ep->Type(), "' has null GetName function pointer. Possible pointer corruption."); + const char* ort_ep_name = ort_ep->GetName(ort_ep); + ORT_ENFORCE(ort_ep_name != nullptr && ep->Type() == ort_ep_name, + "OrtEp::GetName() returned '", (ort_ep_name ? ort_ep_name : ""), + "' but IExecutionProvider::Type() is '", ep->Type(), + "'. Possible pointer corruption or EP mismatch."); + // OpKernel's constructor *copies* the OpKernelInfo. // Therefore, must create the OpKernel instance immediately so that we can pass the actual OpKernelInfo // to the plugin EP's kernel creation function. 
@@ -218,8 +235,6 @@ Status PluginEpOpKernel::Create(FuncManager& /*fn_manager*/, const OpKernelInfo& const auto& op_type = info.node().OpType(); const auto& node_name = info.node().Name(); - const auto* ep = info.GetExecutionProvider(); - ORT_ENFORCE(ep != nullptr, "IExecutionProvider* retrieved from OpKernelInfo should never be nullptr"); const auto& ep_name = ep->Type(); // Do some basic checks for the OrtKernelImpl provided by the EP. Other checks for missing function implementations diff --git a/onnxruntime/test/providers/cuda/plugin/cuda_plugin_profiling_test.cc b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_profiling_test.cc new file mode 100644 index 0000000000000..bf57b2fae4591 --- /dev/null +++ b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_profiling_test.cc @@ -0,0 +1,289 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +// Tests for the CUDA plugin EP profiling integration. +// Uses InferenceSessionWrapper to directly query the Profiler's EP start status, +// which propagates the OrtStatus* returned by the plugin's StartProfiling C API. +// This distinguishes "profiling not compiled in / CUPTI unavailable" (skip) +// from "profiling started but no kernel events appeared" (regression/fail). 
+ +#if defined(ORT_UNIT_TEST_HAS_CUDA_PLUGIN_EP) && defined(ENABLE_CUDA_PROFILING) + +#include +#include +#include +#include +#include + +#include +#include +#include +#include "nlohmann/json.hpp" + +#include "core/framework/data_types.h" +#include "core/framework/error_code_helper.h" +#include "core/framework/run_options.h" +#include "core/framework/tensor.h" +#include "core/session/abi_session_options_impl.h" +#include "core/session/inference_session.h" +#include "core/session/onnxruntime_cxx_api.h" +#include "core/session/ort_env.h" +#include "core/session/utils.h" +#include "test/util/include/asserts.h" +#include "test/util/include/file_util.h" +#include "test/util/include/inference_session_wrapper.h" + +extern std::unique_ptr ort_env; + +namespace onnxruntime { +namespace test { +namespace { + +constexpr const char* kCudaPluginEpName = "CudaPluginExecutionProvider"; +constexpr const char* kRegistrationName = "CudaPluginProfilingTest"; + +std::filesystem::path GetCudaPluginLibraryPath() { + return GetSharedLibraryFileName(ORT_TSTR("onnxruntime_providers_cuda_plugin")); +} + +// Get the internal OrtEnv from the C++ Ort::Env wrapper. +OrtEnv& GetOrtEnv() { + return *static_cast(*ort_env); +} + +// RAII handle that registers/unregisters the CUDA plugin EP library. +// Uses the C API directly to avoid exceptions (the plugin DLL may fail to load +// if CUPTI is not on PATH due to the hard import dependency). +// OrtStatus* returns are wrapped in Ort::Status for leak-safe RAII. 
+class ScopedCudaPluginRegistration { + public: + ScopedCudaPluginRegistration(Ort::Env& env, const char* registration_name) + : env_(env), name_(registration_name) { + auto lib_path = GetCudaPluginLibraryPath(); + if (!std::filesystem::exists(lib_path)) { + load_error_ = "Plugin library not found: " + lib_path.string(); + return; + } + Ort::Status status(Ort::GetApi().RegisterExecutionProviderLibrary( + env_, name_.c_str(), lib_path.c_str())); + if (!status.IsOK()) { + load_error_ = status.GetErrorMessage(); + return; + } + available_ = true; + } + + ~ScopedCudaPluginRegistration() { + if (available_) { + Ort::Status status(Ort::GetApi().UnregisterExecutionProviderLibrary( + env_, name_.c_str())); + ORT_UNUSED_PARAMETER(status); // intentionally ignore unregister errors during teardown + } + } + + bool IsAvailable() const { return available_; } + const std::string& LoadError() const { return load_error_; } + + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(ScopedCudaPluginRegistration); + + private: + Ort::Env& env_; + std::string name_; + std::string load_error_; + bool available_ = false; +}; + +// Find the CUDA plugin EP device after registration. 
+Ort::ConstEpDevice FindCudaPluginDevice(Ort::Env& env) { + auto ep_devices = env.GetEpDevices(); + for (const auto& device : ep_devices) { + if (strcmp(device.EpName(), kCudaPluginEpName) == 0) { + return device; + } + } + return Ort::ConstEpDevice{nullptr}; +} + +} // namespace + +class CudaPluginProfilingTest : public ::testing::Test { + protected: + void SetUp() override { + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + if (err != cudaSuccess || device_count == 0) { + GTEST_SKIP() << "No CUDA device available."; + } + + registration_ = std::make_unique( + *ort_env, kRegistrationName); + if (!registration_->IsAvailable()) { + GTEST_SKIP() << "CUDA plugin EP library not available: " + << registration_->LoadError(); + } + + cuda_device_ = FindCudaPluginDevice(*ort_env); + if (!cuda_device_) { + GTEST_SKIP() << "No CUDA plugin EP device found after registration."; + } + } + + void TearDown() override { + registration_.reset(); + cudaDeviceSynchronize(); + } + + std::unique_ptr registration_; + Ort::ConstEpDevice cuda_device_{nullptr}; +}; + +// Test that session-level profiling produces valid JSON with GPU Kernel events +// when CUPTI is functional. Uses InferenceSessionWrapper to directly query the +// profiler's EP start status for definitive pass/skip/fail decisions. +TEST_F(CudaPluginProfilingTest, SessionProfiling_ProducesValidProfile) { + const ORTCHAR_T* model_path = ORT_TSTR("testdata/matmul_1.onnx"); + + // Set up session options with the plugin EP and profiling enabled. 
+ OrtSessionOptions ort_options; + + const OrtEpDevice* device_ptr = static_cast(cuda_device_); + auto ep_devices_span = gsl::make_span(&device_ptr, 1); + + std::unique_ptr factory; + ASSERT_STATUS_OK(CreateIExecutionProviderFactoryForEpDevices( + GetOrtEnv().GetEnvironment(), ep_devices_span, factory)); + ort_options.provider_factories.push_back(std::move(factory)); + + auto profile_prefix = std::filesystem::temp_directory_path() / ORT_TSTR("cuda_plugin_profiling_test"); + ort_options.value.enable_profiling = true; + ort_options.value.profile_file_prefix = profile_prefix.native(); + + // Create session via InferenceSessionWrapper for internal access. + InferenceSessionWrapper session(ort_options.value, GetOrtEnv().GetEnvironment()); + ASSERT_STATUS_OK(session.Load(model_path)); + + OrtStatus* init_status = InitializeSession(&ort_options, session); + ASSERT_STATUS_OK(ToStatusAndRelease(init_status)); + + // Check EP profiling status. Three scenarios: + // 1. No EP profiler registered (plugin not built with profiling) → skip + // 2. EP profiler registered but StartProfiling failed (CUPTI unavailable/blocked) → skip + // 3. EP profiler started successfully → kernel events MUST appear + const auto& profiler = session.GetProfiling(); + if (!profiler.HasEpProfilers()) { + GTEST_SKIP() << "Plugin EP did not register a profiler. " + << "It may have been built without ENABLE_CUDA_PROFILING."; + } + + const auto& ep_profiling_status = profiler.GetEpProfilingStatus(); + if (!ep_profiling_status.IsOK()) { + GTEST_SKIP() << "EP profiling did not start: " << ep_profiling_status.ErrorMessage(); + } + + // Profiling started successfully — run inference. + // Input X:[3,2] float, output Y:[3,1]. 
+ std::vector x_data(6, 1.0f); + int64_t x_shape[] = {3, 2}; + + OrtValue input_tensor; + Tensor::InitOrtValue(DataTypeImpl::GetType(), + TensorShape(x_shape, 2), + x_data.data(), OrtMemoryInfo(), + input_tensor); + + std::vector feed_names = {"X"}; + std::vector feeds = {input_tensor}; + std::vector output_names = {"Y"}; + std::vector fetches; + + RunOptions run_options; + ASSERT_STATUS_OK(session.Run(run_options, feed_names, feeds, output_names, &fetches)); + + // End profiling and read the output file. + std::string profile_file_path = session.EndProfiling(); + + auto cleanup_profile = gsl::finally([&profile_file_path] { + std::error_code ec; + std::filesystem::remove(profile_file_path, ec); + }); + + ASSERT_TRUE(std::filesystem::exists(profile_file_path)) + << "Profile file not found: " << profile_file_path; + + std::ifstream profile_stream(profile_file_path); + ASSERT_TRUE(profile_stream.is_open()) << "Could not open: " << profile_file_path; + + std::string content(std::istreambuf_iterator{profile_stream}, + std::istreambuf_iterator{}); + profile_stream.close(); + + auto profile_json = nlohmann::json::parse(content); + ASSERT_TRUE(profile_json.is_array()) << "Profile JSON is not an array"; + ASSERT_GT(profile_json.size(), 0u) << "Profile JSON is empty"; + + // Validate standard fields on all entries. + for (const auto& entry : profile_json) { + if (!entry.is_object() || !entry.contains("name")) { + continue; + } + EXPECT_TRUE(entry.contains("pid")) << "Missing 'pid': " << entry; + EXPECT_TRUE(entry.contains("ts")) << "Missing 'ts': " << entry; + EXPECT_TRUE(entry.contains("dur")) << "Missing 'dur': " << entry; + EXPECT_TRUE(entry.contains("ph")) << "Missing 'ph': " << entry; + EXPECT_TRUE(entry.contains("args")) << "Missing 'args': " << entry; + } + + // Since EP profiling started OK, GPU Kernel events MUST be present. + // Their absence indicates a regression in the profiling wiring. 
+ std::vector kernel_events; + for (const auto& entry : profile_json) { + if (entry.is_object() && entry.contains("cat") && entry["cat"] == "Kernel") { + kernel_events.push_back(entry); + } + } + + ASSERT_FALSE(kernel_events.empty()) + << "EP profiling started successfully (CUPTI tracing is active) but no GPU " + << "Kernel events were found in the profile output. This is a regression.\n" + << "Profile content (first 2000 chars): " << content.substr(0, 2000); + + // Validate kernel event metadata. + for (const auto& event : kernel_events) { + EXPECT_TRUE(event.contains("ts")) << event; + EXPECT_TRUE(event.contains("dur")) << event; + EXPECT_GE(event["dur"].get(), 0) << event; + ASSERT_TRUE(event.contains("args")) << "Kernel event missing 'args': " << event; + const auto& args = event["args"]; + EXPECT_TRUE(args.contains("stream")) << "Kernel missing 'stream': " << event; + EXPECT_TRUE(args.contains("block_x")) << "Kernel missing 'block_x': " << event; + } + + // Timeline plausibility: kernel timestamps should fall within the session + // profiling window (derived from CPU-side events). 
+ int64_t session_end_us = 0; + for (const auto& entry : profile_json) { + if (!entry.is_object() || !entry.contains("cat")) continue; + std::string cat = entry["cat"].get(); + if ((cat == "Session" || cat == "Node" || cat == "Api") && + entry.contains("ts") && entry.contains("dur")) { + int64_t end = entry["ts"].get() + entry["dur"].get(); + session_end_us = std::max(session_end_us, end); + } + } + + if (session_end_us > 0) { + constexpr int64_t kMarginUs = 10'000; // 10ms margin for GPU clock skew + for (const auto& event : kernel_events) { + int64_t ts = event["ts"].get(); + EXPECT_GE(ts, -kMarginUs) + << "GPU kernel ts before profiling start (domain mismatch?): " << event; + EXPECT_LE(ts, session_end_us + kMarginUs) + << "GPU kernel ts beyond session end (domain mismatch?): " << event; + } + } +} + +} // namespace test +} // namespace onnxruntime + +#endif // defined(ORT_UNIT_TEST_HAS_CUDA_PLUGIN_EP) && defined(ENABLE_CUDA_PROFILING) diff --git a/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py b/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py index ff9f2edd9d002..c03545fc31435 100644 --- a/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py +++ b/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py @@ -1,6 +1,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. +import json import os import tempfile import unittest @@ -2395,6 +2396,85 @@ def test_iobinding_matmul(self): if os.path.exists(model_path): os.remove(model_path) + # ---- Profiling tests ---- + + def _run_profiling_test(self): + """Run a model with session-level profiling enabled and verify the JSON output. + + Validates that profiling produces a valid JSON file with standard event + fields. GPU Kernel event validation (CUPTI) is handled by the C++ test + (cuda_plugin_profiling_test.cc) which can directly probe CUPTI availability. 
+ """ + target_device = get_cuda_plugin_device() + + with tempfile.NamedTemporaryFile(suffix=".onnx", delete=False) as tmp: + model_path = tmp.name + profile_file = None + try: + create_matmul_model(model_path) + sess_options = _create_session_options() + sess_options.add_provider_for_devices([target_device], {}) + + profile_prefix = os.path.join(tempfile.gettempdir(), "cuda_plugin_ep_profiling_test") + sess_options.enable_profiling = True + sess_options.profile_file_prefix = profile_prefix + + sess = onnxrt.InferenceSession(model_path, sess_options=sess_options) + + assigned_nodes, assignment_info = _get_assigned_nodes(sess, CUDA_PLUGIN_EP_NAME) + self.assertTrue( + assigned_nodes, + f"{CUDA_PLUGIN_EP_NAME} was assigned no nodes. " + f"Assignments: {_format_assignment_summary(assignment_info)}", + ) + + a = np.random.rand(3, 4).astype(np.float32) + b = np.random.rand(4, 5).astype(np.float32) + sess.run(None, {"A": a, "B": b}) + + profile_file = sess.end_profiling() + self.assertTrue(profile_file, "No profile file returned") + self.assertTrue(os.path.exists(profile_file), f"Profile file not found: {profile_file}") + + with open(profile_file) as f: + profile_data = json.load(f) + + self.assertIsInstance(profile_data, list) + self.assertGreater(len(profile_data), 0, "Profile JSON is empty") + + # Every event entry must have standard tracing fields. + required_keys = {"pid", "dur", "ts", "ph", "name", "args"} + for entry in profile_data: + if not isinstance(entry, dict): + continue + if "name" not in entry: + continue + for key in required_keys: + self.assertIn(key, entry, f"Missing '{key}' in profile entry: {entry}") + + # If GPU kernel events are present, validate their metadata. 
+ kernel_events = [e for e in profile_data if isinstance(e, dict) and e.get("cat") == "Kernel"] + if kernel_events: + for event in kernel_events: + self.assertIn("ts", event) + self.assertIn("dur", event) + self.assertGreaterEqual(event["dur"], 0) + args = event.get("args", {}) + self.assertIn("stream", args, f"GPU kernel event missing 'stream': {event}") + self.assertIn("block_x", args, f"GPU kernel event missing 'block_x': {event}") + else: + print("Note: No GPU Kernel events in profile (CUPTI may not be available).") + + finally: + if os.path.exists(model_path): + os.remove(model_path) + if profile_file and os.path.exists(profile_file): + os.remove(profile_file) + + def test_session_profiling(self): + """Verify session-level profiling produces valid output with the CUDA Plugin EP.""" + self._run_profiling_test() + if __name__ == "__main__": unittest.main()