Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/workflows/linux_cuda_plugin_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ jobs:
--cuda_version=12.8
--cuda_home=/usr/local/cuda-12.8
--cudnn_home=/usr/local/cuda-12.8
--enable_cuda_profiling
--cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
--cmake_extra_defines onnxruntime_BUILD_CUDA_EP_AS_PLUGIN=ON
python_path_prefix: 'PATH=/opt/python/cp312-cp312/bin:$PATH'
Expand Down Expand Up @@ -120,7 +121,9 @@ jobs:
export ORT_CUDA_PLUGIN_PATH=/build/Release/Release/libonnxruntime_providers_cuda_plugin.so
echo \"ORT_CUDA_PLUGIN_PATH=\$ORT_CUDA_PLUGIN_PATH\"
ls -la \$ORT_CUDA_PLUGIN_PATH

# Signal the test that CUPTI profiling is available so it
# asserts GPU kernel events are present in the profile.
export ORT_CUDA_PROFILING_ENABLED=1
cd /onnxruntime_src/onnxruntime/test/python/transformers
python test_cuda_plugin_ep.py
"
4 changes: 4 additions & 0 deletions .github/workflows/windows_cuda_plugin.yml
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ jobs:
--skip_tests `
--use_vcpkg `
--use_vcpkg_ms_internal_asset_cache `
--enable_cuda_profiling `
--cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 `
--cmake_extra_defines onnxruntime_BUILD_CUDA_EP_AS_PLUGIN=ON

Expand Down Expand Up @@ -194,6 +195,9 @@ jobs:
Write-Error "CUDA plugin EP library not found at $env:ORT_CUDA_PLUGIN_PATH"
exit 1
}
# Signal the test that CUPTI profiling is available so it
# asserts GPU kernel events are present in the profile.
$env:ORT_CUDA_PROFILING_ENABLED = "1"
python test_cuda_plugin_ep.py
if ($lastExitCode -ne 0) {
exit $lastExitCode
Expand Down
5 changes: 5 additions & 0 deletions cmake/onnxruntime_providers_cuda_plugin.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,11 @@ target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE
${PROTOBUF_LIB}
)

# When CUPTI-based GPU profiling is enabled at configure time, link the CUPTI
# library into the plugin target and define ENABLE_CUDA_PROFILING so the
# profiler code paths (guarded by #if defined(ENABLE_CUDA_PROFILING)) compile in.
if (onnxruntime_ENABLE_CUDA_PROFILING)
  target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE CUDA::cupti)
  target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE ENABLE_CUDA_PROFILING)
endif()
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nitpick: trailing whitespace on the blank line after endif().


Comment thread
yuslepukhin marked this conversation as resolved.
# Default plugin EP version to ORT_VERSION with "-dev" suffix if not explicitly provided.
if(NOT DEFINED onnxruntime_PLUGIN_EP_VERSION)
set(onnxruntime_PLUGIN_EP_VERSION "${ORT_VERSION}-dev")
Expand Down
98 changes: 87 additions & 11 deletions docs/cuda_plugin_ep/cuda_plugin_ep_design.md
Original file line number Diff line number Diff line change
Expand Up @@ -831,29 +831,105 @@ include/onnxruntime/ep/

---

## 14. Future Work
## 14. Profiling and Observability

1. **Profiling and observability** — ORT's generic plugin EP bridge now supports `OrtEp::CreateProfiler`, but the CUDA plugin EP does not implement that callback yet. Future work should add CUDA-plugin-specific profiler wiring, integrate CUDA/NVTX/CUPTI-based tracing where appropriate, and make plugin execution visible in the same profiling flows users already rely on for the bundled CUDA EP.
The CUDA plugin EP implements the `OrtEpProfilerImpl` interface (introduced in ORT 1.25 via [PR #27649](https://github.com/microsoft/onnxruntime/pull/27649)) to participate in ORT's profiling system. When profiling is enabled, GPU kernel executions (CUDA kernels, memory copies) captured by NVIDIA CUPTI appear alongside ORT's CPU-side events in the profiling output.

2. **Remaining stream/adapter parity for framework-style `Stream*` consumers** — Much of the broad `Stream*` gap has already been addressed: the plugin adapter now provides an `OrtStreamAdapter` / `PluginStreamShim` path for framework-style `Stream*` call sites, FFT is included, and quantization/diffusion kernels are no longer excluded as a class. Remaining work is narrower:
### 14.1 Architecture

The profiling stack has three layers:

1. **ORT Core** (`Profiler` in `profiler.cc`) — drives the profiling lifecycle. It calls `PluginExecutionProvider::GetProfiler()`, which invokes `OrtEp::CreateProfiler` on the plugin and wraps the returned `OrtEpProfilerImpl` in a `PluginEpProfiler` bridge.
2. **Bridge** (`PluginEpProfiler` in `ep_event_profiling.cc`) — adapts the C++ `EpProfiler` interface to the C `OrtEpProfilerImpl` callbacks. It handles clock synchronization (provides an epoch-independent offset in `StartProfiling`) and converts relative ORT event IDs to absolute epoch-based correlation IDs for `StartEvent`/`StopEvent`.
3. **Plugin-side profiler** (`CudaPluginEpProfiler` in `cuda_profiler_plugin.h/.cc`) — implements `OrtEpProfilerImpl` inside the plugin DLL. Delegates to `CUPTIManager` for GPU activity tracing.

```
ORT Profiler
└─ PluginEpProfiler (bridge, in ORT core)
└─ OrtEpProfilerImpl callbacks (C API boundary)
└─ CudaPluginEpProfiler (in plugin DLL)
└─ CUPTIManager singleton (in plugin DLL)
└─ CUPTI activity APIs (GPU tracing)
```

### 14.2 CUPTI Integration

The plugin DLL links `CUDA::cupti` and compiles `cupti_manager.cc` when `onnxruntime_ENABLE_CUDA_PROFILING` is ON. The `CUPTIManager` singleton lives inside the plugin DLL, isolated from any in-tree CUDA EP in the same process. This is the expected isolation model for plugin EPs.

CUPTI activities enabled:
- `CUPTI_ACTIVITY_KIND_RUNTIME` — CUDA runtime API calls
- `CUPTI_ACTIVITY_KIND_DRIVER` — CUDA driver API calls
- `CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL` — GPU kernel execution
- `CUPTI_ACTIVITY_KIND_MEMCPY` — device memory transfers
- `CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION` — maps GPU activities to ORT event correlation IDs

### 14.3 Correlation ID Flow

The plugin API's `StartEvent`/`StopEvent` receive **absolute epoch-based** correlation IDs (converted by the `PluginEpProfiler` bridge from ORT's relative event IDs). These are pushed directly to CUPTI's external correlation stack via `cuptiActivityPushExternalCorrelationId`, allowing CUPTI to tag GPU activities with the corresponding ORT event. When `StopEvent` is called, the correlation ID is popped. This matches the pattern used by the in-tree CUDA EP's `GPUTracerManager::PushCorrelation`/`PopCorrelation`.

### 14.4 Event Collection (EndProfiling)

When ORT calls `EndProfiling`:
1. CUPTI activity buffers are flushed (`cuptiActivityFlushAll`).
2. GPU activity records are processed — kernel names, timestamps, durations, and stream/grid metadata are extracted.
3. Events are converted to `Ort::ProfilingEvent` instances with `OrtProfilingEventCategory_KERNEL`.
4. Events are appended to the `OrtProfilingEventsContainer` via `AddEvents`.

The plugin does **not** perform the post-hoc merge/sort that the in-tree `GPUProfilerBase::EndProfiling` does. The plugin API is append-only, and the `PluginEpProfiler` bridge on the ORT side likewise appends EP events to ORT's profiling event collection without merge/sort by timestamp or correlation ID. Any ordering or interleaving into a global timeline is handled by downstream trace consumers.
Comment thread
yuslepukhin marked this conversation as resolved.

### 14.5 Design Differences from In-Tree CUDA EP Profiler

| Aspect | In-tree CUDA EP | CUDA Plugin EP |
|--------|----------------|----------------|
| Event merge | `GPUProfilerBase::MergeEvents` interleaves GPU events into ORT's array (has a known sort-order bug) | Append-only: the ORT-side bridge appends events without merging or sorting, and trace consumers handle ordering |
| Correlation IDs | Relative → absolute conversion in `GPUTracerManager::PushCorrelation` | Bridge provides absolute IDs directly; plugin pushes to CUPTI as-is |
| `StopEvent` metadata | Ignored (just pops correlation) | ORT event metadata available; currently unused, can annotate GPU events in future |
| GPU→ORT event linkage | Implicit via CUPTI external correlation IDs merged into timeline | GPU events carry only CUPTI metadata (`stream`, `grid_*`, `block_*`); no ORT correlation or parent identifier is attached. Downstream consumers must relate GPU kernels to ORT nodes via timestamp proximity. This is a known limitation; future work may attach `correlation_id` or parent event name via `StopEvent`'s `OrtProfilingEvent` parameter |
| Singleton scope | Process-wide `CUPTIManager` in main ORT DLL | DLL-local `CUPTIManager` in plugin (process isolation) |

### 14.6 Build Configuration

CUPTI profiling is conditional:
- **CMake flag**: `onnxruntime_ENABLE_CUDA_PROFILING=ON`
- **Compile definition**: `ENABLE_CUDA_PROFILING` added to the plugin target
- **Link**: `CUDA::cupti` linked to `onnxruntime_providers_cuda_plugin`
- **Source**: `cupti_manager.cc` compiled into the plugin

When profiling is disabled (default), `CudaEp::CreateProfiler` is set to `nullptr` and no CUPTI code is compiled.

### 14.7 Files

| File | Role |
|------|------|
| `plugin/cuda_profiler_plugin.h` | `CudaPluginEpProfiler` struct definition |
| `plugin/cuda_profiler_plugin.cc` | Profiler callback implementations |
| `plugin/cuda_ep.h` | `CreateProfilerImpl` declaration |
| `plugin/cuda_ep.cc` | `CreateProfiler` callback wiring |
| `cmake/onnxruntime_providers_cuda_plugin.cmake` | Conditional CUPTI linkage |

---

## 15. Future Work

1. **Remaining stream/adapter parity for framework-style `Stream*` consumers** — Much of the broad `Stream*` gap has already been addressed: the plugin adapter now provides an `OrtStreamAdapter` / `PluginStreamShim` path for framework-style `Stream*` call sites, FFT is included, and quantization/diffusion kernels are no longer excluded as a class. Remaining work is narrower:

- Continue using `Stream(context)` / `GetOrtStream(context)` patterns for migrated kernels rather than adding raw-stream-only forks.
- Audit still-excluded directories that require more than a stream handle: `contrib_ops/cuda/llm/*`, `contrib_ops/cuda/transformers/*`, and `contrib_ops/cuda/collective/*`.
- For each re-inclusion pass, add or extend focused plugin tests before removing the CMake exclusion.

3. **Contrib LLM migration pass** — Still open. The core CUDA LLM attention path is now adapter-safe, but `contrib_ops/cuda/llm/*` remains excluded in `cmake/onnxruntime_providers_cuda_plugin.cmake`. The remaining work is a dedicated contrib-LLM adapter pass: resolve any plugin build failures under `ORT_USE_EP_API_ADAPTERS`, keep the normal stream/scratch-buffer helpers, remove the `contrib_ops/cuda/llm/*` CMake filters, and add focused tests or parity-report coverage for the first re-included kernels.
2. **Contrib LLM migration pass** — Still open. The core CUDA LLM attention path is now adapter-safe, but `contrib_ops/cuda/llm/*` remains excluded in `cmake/onnxruntime_providers_cuda_plugin.cmake`. The remaining work is a dedicated contrib-LLM adapter pass: resolve any plugin build failures under `ORT_USE_EP_API_ADAPTERS`, keep the normal stream/scratch-buffer helpers, remove the `contrib_ops/cuda/llm/*` CMake filters, and add focused tests or parity-report coverage for the first re-included kernels.

4. **Tunable ops** — Implement a plugin-side `ITuningContext` and remove the `ORT_USE_EP_API_ADAPTERS` guards in `matmul.cc`/`gemm.cc` so the plugin can recover runtime kernel selection and profiling-based tuning behavior.
3. **Tunable ops** — Implement a plugin-side `ITuningContext` and remove the `ORT_USE_EP_API_ADAPTERS` guards in `matmul.cc`/`gemm.cc` so the plugin can recover runtime kernel selection and profiling-based tuning behavior.

5. **TensorSeq and additional C API coverage** — Add enough sequence/tensor-sequence support to unblock `sequence_op.cc` (the last remaining TensorSeq-dependent file), and extend the ORT C API where needed for remaining framework-style attribute accessors such as string-array attributes used by RNN kernels. Note: `identity_op.cc` is now included in the plugin build — its TensorSeq code path is guarded by `#ifndef BUILD_CUDA_EP_AS_PLUGIN` and opset 14+ registrations use `AllFixedSizeTensorTypes()` (Tensor-only) instead of `AllFixedSizeTensorAndSequenceTensorTypes()`.
4. **TensorSeq and additional C API coverage** — Add enough sequence/tensor-sequence support to unblock `sequence_op.cc` (the last remaining TensorSeq-dependent file), and extend the ORT C API where needed for remaining framework-style attribute accessors such as string-array attributes used by RNN kernels. Note: `identity_op.cc` is now included in the plugin build — its TensorSeq code path is guarded by `#ifndef BUILD_CUDA_EP_AS_PLUGIN` and opset 14+ registrations use `AllFixedSizeTensorTypes()` (Tensor-only) instead of `AllFixedSizeTensorAndSequenceTensorTypes()`.

6. **Remaining contrib exclusions** — Remaining contrib exclusions are: `shrunken_gather.cc` (training), `transformers/*` (subgraph), `aten_ops/*` (ATen), `collective/*` (NCCL), and `llm/*` (contrib LLM pass).
5. **Remaining contrib exclusions** — Remaining contrib exclusions are: `shrunken_gather.cc` (training), `transformers/*` (subgraph), `aten_ops/*` (ATen), `collective/*` (NCCL), and `llm/*` (contrib LLM pass).

7. **CI integration and targeted benchmarking** — Partially complete. Basic CUDA plugin build + `test_cuda_plugin_ep.py` coverage now exists in Linux and Windows plugin CI workflows. Remaining work is perf-oriented and feature-specific validation: add targeted benchmarks or perf gates for graph replay and allocator behavior, and extend CI once profiling and tunable-op support land.
6. **CI integration and targeted benchmarking** — Partially complete. Basic CUDA plugin build + `test_cuda_plugin_ep.py` coverage now exists in Linux and Windows plugin CI workflows. Remaining work is perf-oriented and feature-specific validation: add targeted benchmarks or perf gates for graph replay and allocator behavior, and extend CI once profiling and tunable-op support land.

8. **NHWC cleanup and hardening** — Partially complete. Runtime NHWC callbacks, second-pass capability handling for pre-assigned NHWC nodes, cached provider-config access, and focused Conv/BatchNormalization/Pool tests are in place. Remaining work is the cleanup described in [Section 5.3.1](#531-nhwc-layout-transformation-support): unify the conversion allowlist with the bundled CUDA EP, improve internal-domain kernel-miss diagnostics, and add stronger structural assertions that plugin-backed NHWC execution was actually selected.
7. **NHWC cleanup and hardening** — Partially complete. Runtime NHWC callbacks, second-pass capability handling for pre-assigned NHWC nodes, cached provider-config access, and focused Conv/BatchNormalization/Pool tests are in place. Remaining work is the cleanup described in [Section 5.3.1](#531-nhwc-layout-transformation-support): unify the conversion allowlist with the bundled CUDA EP, improve internal-domain kernel-miss diagnostics, and add stronger structural assertions that plugin-backed NHWC execution was actually selected.

9. **OpSchema-validated kernel registration after PR #27713** — PR #27713 has already landed, so the `OrtEpApi` and C++ wrappers for querying ONNX operator schemas are available (see [Section 3.5.1](#351-type-constraint-names-and-opschema-access)). The remaining work is plugin-side adoption:
8. **OpSchema-validated kernel registration after PR #27713** — PR #27713 has already landed, so the `OrtEpApi` and C++ wrappers for querying ONNX operator schemas are available (see [Section 3.5.1](#351-type-constraint-names-and-opschema-access)). The remaining work is plugin-side adoption:

**A. Registration-time validation pass**

Expand Down Expand Up @@ -881,7 +957,7 @@ include/onnxruntime/ep/
| `cuda_ep.cc` / `GetCapabilityImpl()` | (Optional) Add schema-based diagnostic when `EpGraphSupportInfo_LookUpKernel` returns nullptr |
| `test_cuda_plugin_ep.py` | Add a validation stage that exercises schema-validated registration |

10. **Resource accounting and annotation-based partitioning after PR #27595** — PR #27595 has already landed, so ORT now has framework-side resource accounting and layering annotations. The remaining CUDA plugin work is to bridge those capabilities through the plugin EP API and plugin capability implementation.
9. **Resource accounting and annotation-based partitioning after PR #27595** — PR #27595 has already landed, so ORT now has framework-side resource accounting and layering annotations. The remaining CUDA plugin work is to bridge those capabilities through the plugin EP API and plugin capability implementation.

**A. Resource accounting**

Expand Down
4 changes: 2 additions & 2 deletions onnxruntime/core/providers/cuda/cupti_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
namespace onnxruntime {
namespace profiling {

#if defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING)
#if defined(ENABLE_CUDA_PROFILING)
Comment thread
yuslepukhin marked this conversation as resolved.

static inline std::string GetMemcpyKindString(CUpti_ActivityMemcpyKind kind) {
switch (kind) {
Expand Down Expand Up @@ -179,7 +179,7 @@ void CUPTIAPI CUPTIManager::BufferCompleted(CUcontext, uint32_t, uint8_t* buffer
ProfilerActivityBuffer::CreateFromPreallocatedBuffer(std::move(buffer_ptr), valid_size));
}

#endif /* defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING) */
#endif /* defined(ENABLE_CUDA_PROFILING) */

} // namespace profiling
} // namespace onnxruntime
9 changes: 2 additions & 7 deletions onnxruntime/core/providers/cuda/cupti_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,14 @@

#pragma once

#if defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING)
#if defined(ENABLE_CUDA_PROFILING)

#include <atomic>
#include <mutex>
#include <vector>

#include <cupti.h>

Comment thread
yuslepukhin marked this conversation as resolved.
// Do not move the check for CUDA_VERSION above #include <cupti.h>
// the macros are defined in cupti.h
#if defined(USE_CUDA)

#include "core/common/gpu_profiler_common.h"
#include "core/common/inlined_containers.h"

Expand Down Expand Up @@ -51,5 +47,4 @@ class CUPTIManager : public GPUTracerManager<CUPTIManager> {
} /* namespace profiling */
} /* namespace onnxruntime */

#endif /* #if defined(USE_CUDA) */
#endif /* #if defined (USE_CUDA) && defined(ENABLE_CUDA_PROFILING) */
#endif /* #if defined(ENABLE_CUDA_PROFILING) */
29 changes: 29 additions & 0 deletions onnxruntime/core/providers/cuda/plugin/cuda_ep.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

#include <cstring>
#include <limits>
#include <memory>

Check warning on line 15 in onnxruntime/core/providers/cuda/plugin/cuda_ep.cc

View workflow job for this annotation

GitHub Actions / Optional Lint C++

[cpplint] reported by reviewdog 🐶 Found C++ system header after other header. Should be: cuda_ep.h, c system, c++ system, other. [build/include_order] [4] Raw Output: onnxruntime/core/providers/cuda/plugin/cuda_ep.cc:15: Found C++ system header after other header. Should be: cuda_ep.h, c system, c++ system, other. [build/include_order] [4]
#include <stdexcept>
#include <string>
#include <string_view>
Expand Down Expand Up @@ -134,6 +135,13 @@
// Resource accounting — allows ORT to query available device memory for budget enforcement
GetAvailableResource = GetAvailableResourceImpl;

// Profiling — CUPTI-based GPU activity tracing when profiling is enabled at build time
#if defined(ENABLE_CUDA_PROFILING)
CreateProfiler = CreateProfilerImpl;
#else
CreateProfiler = nullptr;
#endif

const OrtApi& ort_api = factory_.GetOrtApi();
Ort::Status log_status(ort_api.Logger_LogMessage(&logger_, ORT_LOGGING_LEVEL_INFO,
"CUDA Plugin EP created",
Expand Down Expand Up @@ -651,5 +659,26 @@
EXCEPTION_TO_STATUS_END
}

#if defined(ENABLE_CUDA_PROFILING)
/*static*/
// OrtEp::CreateProfiler callback for the CUDA plugin EP.
// Allocates a CudaPluginEpProfiler and transfers ownership of the raw pointer
// to the caller through the `profiler` out-parameter (presumably released by
// ORT core's profiler bridge — confirm against the PluginEpProfiler contract).
OrtStatus* ORT_API_CALL CudaEp::CreateProfilerImpl(
    OrtEp* this_ptr, OrtEpProfilerImpl** profiler) noexcept {
  EXCEPTION_TO_STATUS_BEGIN

  // Validate the out-parameter before writing through it.
  if (profiler == nullptr) {
    return Ort::GetApi().CreateStatus(ORT_INVALID_ARGUMENT, "`profiler` must not be null");
  }

  // Ensure the out-parameter has a well-defined value even if construction
  // throws (EXCEPTION_TO_STATUS_BEGIN/END converts exceptions to an OrtStatus).
  *profiler = nullptr;

  auto* ep = static_cast<CudaEp*>(this_ptr);
  // Construct the plugin-side profiler; release() hands the raw pointer across
  // the C API boundary, so no smart pointer crosses the DLL interface.
  auto profiler_impl = std::make_unique<CudaPluginEpProfiler>(ep->factory_.GetEpApi());
  *profiler = profiler_impl.release();
  return nullptr;

  EXCEPTION_TO_STATUS_END
}
#endif  // defined(ENABLE_CUDA_PROFILING)

} // namespace cuda_plugin
} // namespace onnxruntime
6 changes: 6 additions & 0 deletions onnxruntime/core/providers/cuda/plugin/cuda_ep.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

#include "cuda_plugin_utils.h"
#include "cuda_graph_plugin.h"
#include "cuda_profiler_plugin.h"

Check warning on line 8 in onnxruntime/core/providers/cuda/plugin/cuda_ep.h

View workflow job for this annotation

GitHub Actions / Optional Lint C++

[cpplint] reported by reviewdog 🐶 Include the directory when naming header files [build/include_subdir] [4] Raw Output: onnxruntime/core/providers/cuda/plugin/cuda_ep.h:8: Include the directory when naming header files [build/include_subdir] [4]
#include "ep/adapters.h"

#include <memory>
Expand Down Expand Up @@ -91,6 +92,11 @@
static OrtStatus* ORT_API_CALL GetAvailableResourceImpl(
const OrtEp* this_ptr, OrtResourceCount* available) noexcept;

#if defined(ENABLE_CUDA_PROFILING)
static OrtStatus* ORT_API_CALL CreateProfilerImpl(
OrtEp* this_ptr, OrtEpProfilerImpl** profiler) noexcept;
#endif

/// Helper to parse the graph annotation ID from run options.
CudaGraphAnnotation_t GetGraphAnnotationId(const OrtRunOptions* run_options) const;

Expand Down
Loading
Loading