diff --git a/.github/workflows/linux_cuda_plugin_ci.yml b/.github/workflows/linux_cuda_plugin_ci.yml index 362a4dcc8f2bf..3b532c486cdfc 100644 --- a/.github/workflows/linux_cuda_plugin_ci.yml +++ b/.github/workflows/linux_cuda_plugin_ci.yml @@ -36,6 +36,7 @@ jobs: --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 + --enable_cuda_profiling --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_BUILD_CUDA_EP_AS_PLUGIN=ON python_path_prefix: 'PATH=/opt/python/cp312-cp312/bin:$PATH' diff --git a/.github/workflows/windows_cuda_plugin.yml b/.github/workflows/windows_cuda_plugin.yml index 52219ae8fc071..07083a5caa08a 100644 --- a/.github/workflows/windows_cuda_plugin.yml +++ b/.github/workflows/windows_cuda_plugin.yml @@ -83,6 +83,7 @@ jobs: --skip_tests ` --use_vcpkg ` --use_vcpkg_ms_internal_asset_cache ` + --enable_cuda_profiling ` --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 ` --cmake_extra_defines onnxruntime_BUILD_CUDA_EP_AS_PLUGIN=ON diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake index 93fe8e0921986..e345c944dccf8 100644 --- a/cmake/onnxruntime_providers_cuda_plugin.cmake +++ b/cmake/onnxruntime_providers_cuda_plugin.cmake @@ -265,6 +265,11 @@ target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE ${PROTOBUF_LIB} ) +if (onnxruntime_ENABLE_CUDA_PROFILING) + target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE CUDA::cupti) + target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE ENABLE_CUDA_PROFILING) +endif() + # Default plugin EP version to ORT_VERSION with "-dev" suffix if not explicitly provided. 
if(NOT DEFINED onnxruntime_PLUGIN_EP_VERSION) set(onnxruntime_PLUGIN_EP_VERSION "${ORT_VERSION}-dev") diff --git a/docs/cuda_plugin_ep/cuda_plugin_ep_design.md b/docs/cuda_plugin_ep/cuda_plugin_ep_design.md index ba7b07b97535e..15f8188505b37 100644 --- a/docs/cuda_plugin_ep/cuda_plugin_ep_design.md +++ b/docs/cuda_plugin_ep/cuda_plugin_ep_design.md @@ -831,29 +831,105 @@ include/onnxruntime/ep/ --- -## 14. Future Work +## 14. Profiling and Observability -1. **Profiling and observability** — ORT's generic plugin EP bridge now supports `OrtEp::CreateProfiler`, but the CUDA plugin EP does not implement that callback yet. Future work should add CUDA-plugin-specific profiler wiring, integrate CUDA/NVTX/CUPTI-based tracing where appropriate, and make plugin execution visible in the same profiling flows users already rely on for the bundled CUDA EP. +The CUDA plugin EP implements the `OrtEpProfilerImpl` interface (introduced in ORT 1.25 via [PR #27649](https://github.com/microsoft/onnxruntime/pull/27649)) to participate in ORT's profiling system. When profiling is enabled, GPU kernel executions (CUDA kernels, memory copies) captured by NVIDIA CUPTI appear alongside ORT's CPU-side events in the profiling output. -2. **Remaining stream/adapter parity for framework-style `Stream*` consumers** — Much of the broad `Stream*` gap has already been addressed: the plugin adapter now provides an `OrtStreamAdapter` / `PluginStreamShim` path for framework-style `Stream*` call sites, FFT is included, and quantization/diffusion kernels are no longer excluded as a class. Remaining work is narrower: +### 14.1 Architecture + +The profiling stack has three layers: + +1. **ORT Core** (`Profiler` in `profiler.cc`) — drives the profiling lifecycle. It calls `PluginExecutionProvider::GetProfiler()`, which invokes `OrtEp::CreateProfiler` on the plugin and wraps the returned `OrtEpProfilerImpl` in a `PluginEpProfiler` bridge. +2. 
**Bridge** (`PluginEpProfiler` in `ep_event_profiling.cc`) — adapts the C++ `EpProfiler` interface to the C `OrtEpProfilerImpl` callbacks. It handles clock synchronization (provides an epoch-independent offset in `StartProfiling`) and converts relative ORT event IDs to absolute epoch-based correlation IDs for `StartEvent`/`StopEvent`. +3. **Plugin-side profiler** (`CudaPluginEpProfiler` in `cuda_profiler_plugin.h/.cc`) — implements `OrtEpProfilerImpl` inside the plugin DLL. Delegates to `CUPTIManager` for GPU activity tracing. + +``` +ORT Profiler + └─ PluginEpProfiler (bridge, in ORT core) + └─ OrtEpProfilerImpl callbacks (C API boundary) + └─ CudaPluginEpProfiler (in plugin DLL) + └─ CUPTIManager singleton (in plugin DLL) + └─ CUPTI activity APIs (GPU tracing) +``` + +### 14.2 CUPTI Integration + +The plugin DLL links `CUDA::cupti` and compiles `cupti_manager.cc` when `onnxruntime_ENABLE_CUDA_PROFILING` is ON. The `CUPTIManager` singleton lives inside the plugin DLL, isolated from any in-tree CUDA EP in the same process. This is the expected isolation model for plugin EPs. + +CUPTI activities enabled: +- `CUPTI_ACTIVITY_KIND_RUNTIME` — CUDA runtime API calls +- `CUPTI_ACTIVITY_KIND_DRIVER` — CUDA driver API calls +- `CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL` — GPU kernel execution +- `CUPTI_ACTIVITY_KIND_MEMCPY` — device memory transfers +- `CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION` — maps GPU activities to ORT event correlation IDs + +### 14.3 Correlation ID Flow + +The plugin API's `StartEvent`/`StopEvent` receive **absolute epoch-based** correlation IDs (converted by the `PluginEpProfiler` bridge from ORT's relative event IDs). These are pushed directly to CUPTI's external correlation stack via `cuptiActivityPushExternalCorrelationId`, allowing CUPTI to tag GPU activities with the corresponding ORT event. When `StopEvent` is called, the correlation ID is popped. 
This matches the pattern used by the in-tree CUDA EP's `GPUTracerManager::PushCorrelation`/`PopCorrelation`. + +### 14.4 Event Collection (EndProfiling) + +When ORT calls `EndProfiling`: +1. CUPTI activity buffers are flushed (`cuptiActivityFlushAll`). +2. GPU activity records are processed — kernel names, timestamps, durations, and stream/grid metadata are extracted. +3. Events are converted to `Ort::ProfilingEvent` instances with `OrtProfilingEventCategory_KERNEL`. +4. Events are appended to the `OrtProfilingEventsContainer` via `AddEvents`. + +The plugin does **not** perform the post-hoc merge/sort that the in-tree `GPUProfilerBase::EndProfiling` does. The plugin API is append-only, and the `PluginEpProfiler` bridge on the ORT side likewise appends EP events to ORT's profiling event collection without merge/sort by timestamp or correlation ID. Any ordering or interleaving into a global timeline is handled by downstream trace consumers. + +### 14.5 Design Differences from In-Tree CUDA EP Profiler + +| Aspect | In-tree CUDA EP | CUDA Plugin EP | +|--------|----------------|----------------| +| Event merge | `GPUProfilerBase::MergeEvents` interleaves GPU events into ORT's array (has known sort-order bug) | Append-only; ORT-side bridge appends only, and trace consumers handle ordering | +| Correlation IDs | Relative → absolute conversion in `GPUTracerManager::PushCorrelation` | Bridge provides absolute IDs directly; plugin pushes to CUPTI as-is | +| `StopEvent` metadata | Ignored (just pops correlation) | ORT event metadata available; currently unused, can annotate GPU events in future | +| GPU→ORT event linkage | Implicit via CUPTI external correlation IDs merged into timeline | GPU events carry only CUPTI metadata (`stream`, `grid_*`, `block_*`); no ORT correlation or parent identifier is attached. Downstream consumers must relate GPU kernels to ORT nodes via timestamp proximity. 
This is a known limitation; future work may attach `correlation_id` or parent event name via `StopEvent`'s `OrtProfilingEvent` parameter | +| Singleton scope | Process-wide `CUPTIManager` in main ORT DLL | DLL-local `CUPTIManager` in plugin (process isolation) | + +### 14.6 Build Configuration + +CUPTI profiling is conditional: +- **CMake flag**: `onnxruntime_ENABLE_CUDA_PROFILING=ON` +- **Compile definition**: `ENABLE_CUDA_PROFILING` added to the plugin target +- **Link**: `CUDA::cupti` linked to `onnxruntime_providers_cuda_plugin` +- **Source**: `cupti_manager.cc` compiled into the plugin + +When profiling is disabled (default), `CudaEp::CreateProfiler` is set to `nullptr` and no CUPTI code is compiled. + +### 14.7 Files + +| File | Role | +|------|------| +| `plugin/cuda_profiler_plugin.h` | `CudaPluginEpProfiler` struct definition | +| `plugin/cuda_profiler_plugin.cc` | Profiler callback implementations | +| `plugin/cuda_ep.h` | `CreateProfilerImpl` declaration | +| `plugin/cuda_ep.cc` | `CreateProfiler` callback wiring | +| `cmake/onnxruntime_providers_cuda_plugin.cmake` | Conditional CUPTI linkage | + +--- + +## 15. Future Work + +1. **Remaining stream/adapter parity for framework-style `Stream*` consumers** — Much of the broad `Stream*` gap has already been addressed: the plugin adapter now provides an `OrtStreamAdapter` / `PluginStreamShim` path for framework-style `Stream*` call sites, FFT is included, and quantization/diffusion kernels are no longer excluded as a class. Remaining work is narrower: - Continue using `Stream(context)` / `GetOrtStream(context)` patterns for migrated kernels rather than adding raw-stream-only forks. - Audit still-excluded directories that require more than a stream handle: `contrib_ops/cuda/llm/*`, `contrib_ops/cuda/transformers/*`, and `contrib_ops/cuda/collective/*`. - For each re-inclusion pass, add or extend focused plugin tests before removing the CMake exclusion. -3. **Contrib LLM migration pass** — Still open. 
The core CUDA LLM attention path is now adapter-safe, but `contrib_ops/cuda/llm/*` remains excluded in `cmake/onnxruntime_providers_cuda_plugin.cmake`. The remaining work is a dedicated contrib-LLM adapter pass: resolve any plugin build failures under `ORT_USE_EP_API_ADAPTERS`, keep the normal stream/scratch-buffer helpers, remove the `contrib_ops/cuda/llm/*` CMake filters, and add focused tests or parity-report coverage for the first re-included kernels. +2. **Contrib LLM migration pass** — Still open. The core CUDA LLM attention path is now adapter-safe, but `contrib_ops/cuda/llm/*` remains excluded in `cmake/onnxruntime_providers_cuda_plugin.cmake`. The remaining work is a dedicated contrib-LLM adapter pass: resolve any plugin build failures under `ORT_USE_EP_API_ADAPTERS`, keep the normal stream/scratch-buffer helpers, remove the `contrib_ops/cuda/llm/*` CMake filters, and add focused tests or parity-report coverage for the first re-included kernels. -4. **Tunable ops** — Implement a plugin-side `ITuningContext` and remove the `ORT_USE_EP_API_ADAPTERS` guards in `matmul.cc`/`gemm.cc` so the plugin can recover runtime kernel selection and profiling-based tuning behavior. +3. **Tunable ops** — Implement a plugin-side `ITuningContext` and remove the `ORT_USE_EP_API_ADAPTERS` guards in `matmul.cc`/`gemm.cc` so the plugin can recover runtime kernel selection and profiling-based tuning behavior. -5. **TensorSeq and additional C API coverage** — Add enough sequence/tensor-sequence support to unblock `sequence_op.cc` (the last remaining TensorSeq-dependent file), and extend the ORT C API where needed for remaining framework-style attribute accessors such as string-array attributes used by RNN kernels. Note: `identity_op.cc` is now included in the plugin build — its TensorSeq code path is guarded by `#ifndef BUILD_CUDA_EP_AS_PLUGIN` and opset 14+ registrations use `AllFixedSizeTensorTypes()` (Tensor-only) instead of `AllFixedSizeTensorAndSequenceTensorTypes()`. +4. 
**TensorSeq and additional C API coverage** — Add enough sequence/tensor-sequence support to unblock `sequence_op.cc` (the last remaining TensorSeq-dependent file), and extend the ORT C API where needed for remaining framework-style attribute accessors such as string-array attributes used by RNN kernels. Note: `identity_op.cc` is now included in the plugin build — its TensorSeq code path is guarded by `#ifndef BUILD_CUDA_EP_AS_PLUGIN` and opset 14+ registrations use `AllFixedSizeTensorTypes()` (Tensor-only) instead of `AllFixedSizeTensorAndSequenceTensorTypes()`. -6. **Remaining contrib exclusions** — Remaining contrib exclusions are: `shrunken_gather.cc` (training), `transformers/*` (subgraph), `aten_ops/*` (ATen), `collective/*` (NCCL), and `llm/*` (contrib LLM pass). +5. **Remaining contrib exclusions** — Remaining contrib exclusions are: `shrunken_gather.cc` (training), `transformers/*` (subgraph), `aten_ops/*` (ATen), `collective/*` (NCCL), and `llm/*` (contrib LLM pass). -7. **CI integration and targeted benchmarking** — Partially complete. Basic CUDA plugin build + `test_cuda_plugin_ep.py` coverage now exists in Linux and Windows plugin CI workflows. Remaining work is perf-oriented and feature-specific validation: add targeted benchmarks or perf gates for graph replay and allocator behavior, and extend CI once profiling and tunable-op support land. +6. **CI integration and targeted benchmarking** — Partially complete. Basic CUDA plugin build + `test_cuda_plugin_ep.py` coverage now exists in Linux and Windows plugin CI workflows. Remaining work is perf-oriented and feature-specific validation: add targeted benchmarks or perf gates for graph replay and allocator behavior, and extend CI once profiling and tunable-op support land. -8. **NHWC cleanup and hardening** — Partially complete. Runtime NHWC callbacks, second-pass capability handling for pre-assigned NHWC nodes, cached provider-config access, and focused Conv/BatchNormalization/Pool tests are in place. 
Remaining work is the cleanup described in [Section 5.3.1](#531-nhwc-layout-transformation-support): unify the conversion allowlist with the bundled CUDA EP, improve internal-domain kernel-miss diagnostics, and add stronger structural assertions that plugin-backed NHWC execution was actually selected. +7. **NHWC cleanup and hardening** — Partially complete. Runtime NHWC callbacks, second-pass capability handling for pre-assigned NHWC nodes, cached provider-config access, and focused Conv/BatchNormalization/Pool tests are in place. Remaining work is the cleanup described in [Section 5.3.1](#531-nhwc-layout-transformation-support): unify the conversion allowlist with the bundled CUDA EP, improve internal-domain kernel-miss diagnostics, and add stronger structural assertions that plugin-backed NHWC execution was actually selected. -9. **OpSchema-validated kernel registration after PR #27713** — PR #27713 has already landed, so the `OrtEpApi` and C++ wrappers for querying ONNX operator schemas are available (see [Section 3.5.1](#351-type-constraint-names-and-opschema-access)). The remaining work is plugin-side adoption: +8. **OpSchema-validated kernel registration after PR #27713** — PR #27713 has already landed, so the `OrtEpApi` and C++ wrappers for querying ONNX operator schemas are available (see [Section 3.5.1](#351-type-constraint-names-and-opschema-access)). The remaining work is plugin-side adoption: **A. Registration-time validation pass** @@ -881,7 +957,7 @@ include/onnxruntime/ep/ | `cuda_ep.cc` / `GetCapabilityImpl()` | (Optional) Add schema-based diagnostic when `EpGraphSupportInfo_LookUpKernel` returns nullptr | | `test_cuda_plugin_ep.py` | Add a validation stage that exercises schema-validated registration | -10. **Resource accounting and annotation-based partitioning after PR #27595** — PR #27595 has already landed, so ORT now has framework-side resource accounting and layering annotations. 
The remaining CUDA plugin work is to bridge those capabilities through the plugin EP API and plugin capability implementation. +9. **Resource accounting and annotation-based partitioning after PR #27595** — PR #27595 has already landed, so ORT now has framework-side resource accounting and layering annotations. The remaining CUDA plugin work is to bridge those capabilities through the plugin EP API and plugin capability implementation. **A. Resource accounting** diff --git a/include/onnxruntime/core/common/gpu_profiler_common.h b/include/onnxruntime/core/common/gpu_profiler_common.h index df50f558ed8b9..8a86039b19680 100644 --- a/include/onnxruntime/core/common/gpu_profiler_common.h +++ b/include/onnxruntime/core/common/gpu_profiler_common.h @@ -122,6 +122,10 @@ class GPUTracerManager { tracing_enabled_ = this_as_derived->OnStartLogging(); } + bool IsTracingEnabled() const noexcept { + return tracing_enabled_; + } + void Consume(uint64_t client_handle, const TimePoint& start_time, std::map& events) { auto this_as_derived = static_cast(this); events.clear(); @@ -442,11 +446,16 @@ class GPUProfilerBase : public EpProfiler { TimePoint profiling_start_time_; public: - virtual bool StartProfiling(TimePoint profiling_start_time) override { + virtual Status StartProfiling(TimePoint profiling_start_time) override { auto& manager = TManager::GetInstance(); manager.StartLogging(); profiling_start_time_ = profiling_start_time; - return true; + if (!manager.IsTracingEnabled()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "GPU activity tracing failed to start. 
" + "The tracing library may be unavailable or blocked by system policy."); + } + return Status::OK(); } virtual void EndProfiling(TimePoint start_time, Events& events) override { diff --git a/include/onnxruntime/core/common/profiler_common.h b/include/onnxruntime/core/common/profiler_common.h index 236249253fb39..48a75731eb5e1 100644 --- a/include/onnxruntime/core/common/profiler_common.h +++ b/include/onnxruntime/core/common/profiler_common.h @@ -87,8 +87,11 @@ class EpProfiler { /// Allows EP profiler to initialize profiling utilities and record the profiling start time. /// /// Timepoint denoting the start of profiling. - /// True if profiling was started successfully. - virtual bool StartProfiling(TimePoint profiling_start_time) = 0; + /// Status::OK() if profiling was started successfully, or an error Status + /// describing why profiling could not start (e.g., CUPTI unavailable, tracing blocked by policy). + /// Callers should not treat a failed start as fatal — the session can still execute without + /// profiling, but the status should be surfaced for diagnostic purposes. + virtual Status StartProfiling(TimePoint profiling_start_time) = 0; /// /// Called when profiling ends to collect the EP's new profiling events since the last call to StartProfiling. 
diff --git a/onnxruntime/core/common/profiler.cc b/onnxruntime/core/common/profiler.cc index 19508723f6cf0..43bb45b898191 100644 --- a/onnxruntime/core/common/profiler.cc +++ b/onnxruntime/core/common/profiler.cc @@ -49,7 +49,10 @@ void Profiler::StartProfiling(const logging::Logger* custom_logger) { custom_logger_ = custom_logger; profiling_start_time_ = std::chrono::high_resolution_clock::now(); for (const auto& ep_profiler : ep_profilers_) { - ep_profiler->StartProfiling(profiling_start_time_); + auto status = ep_profiler->StartProfiling(profiling_start_time_); + if (!status.IsOK() && ep_start_profiling_status_.IsOK()) { + ep_start_profiling_status_ = status; + } } } @@ -62,7 +65,10 @@ void Profiler::StartProfiling(const std::basic_string& file_name) { profile_stream_file_ = ToUTF8String(file_name); profiling_start_time_ = std::chrono::high_resolution_clock::now(); for (const auto& ep_profiler : ep_profilers_) { - ep_profiler->StartProfiling(profiling_start_time_); + auto status = ep_profiler->StartProfiling(profiling_start_time_); + if (!status.IsOK() && ep_start_profiling_status_.IsOK()) { + ep_start_profiling_status_ = status; + } } } diff --git a/onnxruntime/core/common/profiler.h b/onnxruntime/core/common/profiler.h index fe23593bdc2d0..5d4c0fc3e5070 100644 --- a/onnxruntime/core/common/profiler.h +++ b/onnxruntime/core/common/profiler.h @@ -114,11 +114,22 @@ class Profiler { if (ep_profiler) { ep_profilers_.push_back(std::move(ep_profiler)); if (enabled_) { - ep_profilers_.back()->StartProfiling(profiling_start_time_); + auto status = ep_profilers_.back()->StartProfiling(profiling_start_time_); + if (!status.IsOK() && ep_start_profiling_status_.IsOK()) { + ep_start_profiling_status_ = status; + } } } } + /// Returns the aggregate status from calling StartProfiling on EP profilers. + /// OK if all EP profilers started successfully (or if none are registered). + /// Returns the first error status encountered otherwise. 
+ const Status& GetEpProfilingStatus() const { return ep_start_profiling_status_; } + + /// Returns true if at least one EP profiler was registered. + bool HasEpProfilers() const { return !ep_profilers_.empty(); } + private: ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Profiler); @@ -156,6 +167,10 @@ class Profiler { #endif std::vector> ep_profilers_; + + // Aggregate status from EP profiler StartProfiling calls. + // Stores the first error encountered. + Status ep_start_profiling_status_; }; } // namespace profiling diff --git a/onnxruntime/core/providers/cuda/cuda_profiler.h b/onnxruntime/core/providers/cuda/cuda_profiler.h index cc67bb8a0f285..be26d1425fdb1 100644 --- a/onnxruntime/core/providers/cuda/cuda_profiler.h +++ b/onnxruntime/core/providers/cuda/cuda_profiler.h @@ -32,7 +32,7 @@ class CudaProfiler final : public EpProfiler { CudaProfiler() = default; ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(CudaProfiler); ~CudaProfiler() {} - bool StartProfiling(TimePoint) override { return true; } + Status StartProfiling(TimePoint) override { return Status::OK(); } void EndProfiling(TimePoint, Events&) override {} void Start(uint64_t) override {} void Stop(uint64_t, const EventRecord&) override {} diff --git a/onnxruntime/core/providers/cuda/cupti_manager.cc b/onnxruntime/core/providers/cuda/cupti_manager.cc index a2c6daea1e0cd..6ce129bce4fdb 100644 --- a/onnxruntime/core/providers/cuda/cupti_manager.cc +++ b/onnxruntime/core/providers/cuda/cupti_manager.cc @@ -8,7 +8,7 @@ namespace onnxruntime { namespace profiling { -#if defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING) +#if defined(ENABLE_CUDA_PROFILING) static inline std::string GetMemcpyKindString(CUpti_ActivityMemcpyKind kind) { switch (kind) { @@ -179,7 +179,7 @@ void CUPTIAPI CUPTIManager::BufferCompleted(CUcontext, uint32_t, uint8_t* buffer ProfilerActivityBuffer::CreateFromPreallocatedBuffer(std::move(buffer_ptr), valid_size)); } -#endif /* defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING) */ +#endif /* 
defined(ENABLE_CUDA_PROFILING) */ } // namespace profiling } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/cupti_manager.h b/onnxruntime/core/providers/cuda/cupti_manager.h index cca78dcec5ea5..0977023981ce6 100644 --- a/onnxruntime/core/providers/cuda/cupti_manager.h +++ b/onnxruntime/core/providers/cuda/cupti_manager.h @@ -3,7 +3,7 @@ #pragma once -#if defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING) +#if defined(ENABLE_CUDA_PROFILING) #include #include @@ -11,10 +11,6 @@ #include -// Do not move the check for CUDA_VERSION above #include -// the macros are defined in cupti.h -#if defined(USE_CUDA) - #include "core/common/gpu_profiler_common.h" #include "core/common/inlined_containers.h" @@ -51,5 +47,4 @@ class CUPTIManager : public GPUTracerManager { } /* namespace profiling */ } /* namespace onnxruntime */ -#endif /* #if defined(USE_CUDA) */ -#endif /* #if defined (USE_CUDA) && defined(ENABLE_CUDA_PROFILING) */ +#endif /* #if defined(ENABLE_CUDA_PROFILING) */ diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc index 7c2970c468216..e00b64eb2b9bd 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -134,6 +135,13 @@ CudaEp::CudaEp(CudaEpFactory& factory, const Config& config, const OrtLogger& lo // Resource accounting — allows ORT to query available device memory for budget enforcement GetAvailableResource = GetAvailableResourceImpl; + // Profiling — CUPTI-based GPU activity tracing when profiling is enabled at build time +#if defined(ENABLE_CUDA_PROFILING) + CreateProfiler = CreateProfilerImpl; +#else + CreateProfiler = nullptr; +#endif + const OrtApi& ort_api = factory_.GetOrtApi(); Ort::Status log_status(ort_api.Logger_LogMessage(&logger_, ORT_LOGGING_LEVEL_INFO, "CUDA Plugin EP created", @@ -651,5 +659,26 @@ OrtStatus* ORT_API_CALL 
CudaEp::GetAvailableResourceImpl( EXCEPTION_TO_STATUS_END } +#if defined(ENABLE_CUDA_PROFILING) +/*static*/ +OrtStatus* ORT_API_CALL CudaEp::CreateProfilerImpl( + OrtEp* this_ptr, OrtEpProfilerImpl** profiler) noexcept { + EXCEPTION_TO_STATUS_BEGIN + + if (profiler == nullptr) { + return Ort::GetApi().CreateStatus(ORT_INVALID_ARGUMENT, "`profiler` must not be null"); + } + + *profiler = nullptr; + + auto* ep = static_cast<CudaEp*>(this_ptr); + auto profiler_impl = std::make_unique<CudaPluginEpProfiler>(ep->factory_.GetEpApi()); + *profiler = profiler_impl.release(); + return nullptr; + + EXCEPTION_TO_STATUS_END +} +#endif // defined(ENABLE_CUDA_PROFILING) + } // namespace cuda_plugin } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep.h b/onnxruntime/core/providers/cuda/plugin/cuda_ep.h index 502902c53070b..faaeebf9ceae0 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_ep.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep.h @@ -5,6 +5,7 @@ #include "cuda_plugin_utils.h" #include "cuda_graph_plugin.h" +#include "cuda_profiler_plugin.h" #include "ep/adapters.h" #include @@ -91,6 +92,11 @@ class CudaEp : public onnxruntime::ep::adapter::Ep { static OrtStatus* ORT_API_CALL GetAvailableResourceImpl( const OrtEp* this_ptr, OrtResourceCount* available) noexcept; +#if defined(ENABLE_CUDA_PROFILING) + static OrtStatus* ORT_API_CALL CreateProfilerImpl( + OrtEp* this_ptr, OrtEpProfilerImpl** profiler) noexcept; +#endif + /// Helper to parse the graph annotation ID from run options. CudaGraphAnnotation_t GetGraphAnnotationId(const OrtRunOptions* run_options) const; diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.cc new file mode 100644 index 0000000000000..b7d3c656112cb --- /dev/null +++ b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.cc @@ -0,0 +1,158 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License.
+ +#include "cuda_profiler_plugin.h" + +#if defined(ENABLE_CUDA_PROFILING) + +#include +#include +#include + +namespace onnxruntime { +namespace cuda_plugin { + +CudaPluginEpProfiler::CudaPluginEpProfiler(const OrtEpApi& api) + : OrtEpProfilerImpl{}, ep_api(api) { + ort_version_supported = ORT_API_VERSION; + Release = ReleaseImpl; + StartProfiling = StartProfilingImpl; + EndProfiling = EndProfilingImpl; + StartEvent = StartEventImpl; + StopEvent = StopEventImpl; + + auto& manager = profiling::CUPTIManager::GetInstance(); + client_handle_ = manager.RegisterClient(); +} + +CudaPluginEpProfiler::~CudaPluginEpProfiler() { + auto& manager = profiling::CUPTIManager::GetInstance(); + manager.DeregisterClient(client_handle_); +} + +/*static*/ +void ORT_API_CALL CudaPluginEpProfiler::ReleaseImpl(OrtEpProfilerImpl* this_ptr) noexcept { + delete static_cast<CudaPluginEpProfiler*>(this_ptr); +} + +/*static*/ +OrtStatus* ORT_API_CALL CudaPluginEpProfiler::StartProfilingImpl( + OrtEpProfilerImpl* this_ptr, + int64_t ep_profiling_start_offset_ns) noexcept { + EXCEPTION_TO_STATUS_BEGIN + auto* self = static_cast<CudaPluginEpProfiler*>(this_ptr); + + auto now = TimePoint::clock::now(); + + // Reconstruct the approximate ORT profiling start time so that GPU event + // timestamps (computed by CUPTIManager::Consume) are relative to ORT's start. + // The result equals (ORT's profiling start) + (cross-DLL call latency), which + // is typically < 1 µs — acceptable for profiling-level accuracy. + self->ort_profiling_start_ = now - + std::chrono::duration_cast<TimePoint::duration>( + std::chrono::nanoseconds(ep_profiling_start_offset_ns)); + + auto& manager = profiling::CUPTIManager::GetInstance(); + manager.StartLogging(); + + if (!manager.IsTracingEnabled()) { + return Ort::GetApi().CreateStatus( + ORT_EP_FAIL, + "CUPTI activity tracing failed to start. " + "GPU kernel events will not be available in the profile. " +        "Check that the CUDA driver supports CUPTI and the CUPTI library is accessible."); + } + + return nullptr; + EXCEPTION_TO_STATUS_END +} + +/*static*/ +OrtStatus* ORT_API_CALL CudaPluginEpProfiler::StartEventImpl( + OrtEpProfilerImpl* this_ptr, + uint64_t ort_event_correlation_id) noexcept { + EXCEPTION_TO_STATUS_BEGIN + auto* self = static_cast<CudaPluginEpProfiler*>(this_ptr); + + // The bridge provides an absolute epoch-based correlation ID. Pass TimePoint{} + // (epoch) so PushCorrelation adds zero offset and the unique_cid equals the + // correlation ID directly. This avoids double-adding the epoch offset that + // GPUTracerManager::PushCorrelation normally computes. + auto& manager = profiling::CUPTIManager::GetInstance(); + manager.PushCorrelation(self->client_handle_, ort_event_correlation_id, TimePoint{}); + + return nullptr; + EXCEPTION_TO_STATUS_END +} + +/*static*/ +OrtStatus* ORT_API_CALL CudaPluginEpProfiler::StopEventImpl( + OrtEpProfilerImpl* /*this_ptr*/, + uint64_t /*ort_event_correlation_id*/, + const OrtProfilingEvent* /*ort_event*/) noexcept { + EXCEPTION_TO_STATUS_BEGIN + + auto& manager = profiling::CUPTIManager::GetInstance(); + manager.PopCorrelation(); + + return nullptr; + EXCEPTION_TO_STATUS_END +} + +/*static*/ +OrtStatus* ORT_API_CALL CudaPluginEpProfiler::EndProfilingImpl( + OrtEpProfilerImpl* this_ptr, + OrtProfilingEventsContainer* c_events_container) noexcept { + EXCEPTION_TO_STATUS_BEGIN + auto* self = static_cast<CudaPluginEpProfiler*>(this_ptr); + + auto& manager = profiling::CUPTIManager::GetInstance(); + + // Consume GPU events. Timestamps are computed relative to ort_profiling_start_ + // by CUPTIManager::ProcessActivityBuffers, so they match ORT's timeline. + std::map<uint64_t, profiling::Events> event_map; + manager.Consume(self->client_handle_, self->ort_profiling_start_, event_map); + + // Flatten all GPU events and convert to OrtProfilingEvent. 
+ std::vector<Ort::ProfilingEvent> events; + for (auto& kv : event_map) { + auto& event_list = kv.second; + for (const auto& record : event_list) { + // Build parallel key/value arrays to use the raw-pointer ProfilingEvent + // constructor, avoiding a copy from InlinedHashMap to std::unordered_map. + InlinedVector<const char*> arg_keys; + InlinedVector<const char*> arg_values; + arg_keys.reserve(record.args.size()); + arg_values.reserve(record.args.size()); + for (const auto& [k, v] : record.args) { + arg_keys.push_back(k.c_str()); + arg_values.push_back(v.c_str()); + } + + events.emplace_back( + OrtProfilingEventCategory_KERNEL, + record.pid, + record.tid, + record.name.c_str(), + record.ts, + record.dur, + arg_keys.data(), + arg_values.data(), + arg_keys.size()); + } + } + + if (!events.empty()) { + Ort::UnownedProfilingEventsContainer events_container(c_events_container); + Ort::Status status = events_container.AddEvents(events); + return status.release(); + } + + return nullptr; + EXCEPTION_TO_STATUS_END +} + +} // namespace cuda_plugin +} // namespace onnxruntime + +#endif // defined(ENABLE_CUDA_PROFILING) diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h new file mode 100644 index 0000000000000..77460a40341ac --- /dev/null +++ b/onnxruntime/core/providers/cuda/plugin/cuda_profiler_plugin.h @@ -0,0 +1,41 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#if defined(ENABLE_CUDA_PROFILING) + +#include "cuda_plugin_utils.h" +#include "core/providers/cuda/cupti_manager.h" +#include "core/common/gpu_profiler_common.h" + +namespace onnxruntime { +namespace cuda_plugin { + +/// Plugin-side implementation of OrtEpProfilerImpl for CUDA. +/// Delegates to CUPTIManager (within the plugin DLL) for GPU activity tracing +/// and implements the C callback interface expected by ORT's PluginEpProfiler bridge.
+struct CudaPluginEpProfiler : OrtEpProfilerImpl { + const OrtEpApi& ep_api; + uint64_t client_handle_ = 0; + TimePoint ort_profiling_start_; + + explicit CudaPluginEpProfiler(const OrtEpApi& api); + ~CudaPluginEpProfiler(); + + static void ORT_API_CALL ReleaseImpl(OrtEpProfilerImpl* this_ptr) noexcept; + static OrtStatus* ORT_API_CALL StartProfilingImpl(OrtEpProfilerImpl* this_ptr, + int64_t ep_profiling_start_offset_ns) noexcept; + static OrtStatus* ORT_API_CALL StartEventImpl(OrtEpProfilerImpl* this_ptr, + uint64_t ort_event_correlation_id) noexcept; + static OrtStatus* ORT_API_CALL StopEventImpl(OrtEpProfilerImpl* this_ptr, + uint64_t ort_event_correlation_id, + const OrtProfilingEvent* ort_event) noexcept; + static OrtStatus* ORT_API_CALL EndProfilingImpl(OrtEpProfilerImpl* this_ptr, + OrtProfilingEventsContainer* events_container) noexcept; +}; + +} // namespace cuda_plugin +} // namespace onnxruntime + +#endif // defined(ENABLE_CUDA_PROFILING) diff --git a/onnxruntime/core/providers/vitisai/vitisai_profiler.cc b/onnxruntime/core/providers/vitisai/vitisai_profiler.cc index 2f6255f79441a..cd59d9427115f 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_profiler.cc +++ b/onnxruntime/core/providers/vitisai/vitisai_profiler.cc @@ -10,12 +10,12 @@ namespace profiling { #if defined(USE_VITISAI) -bool VitisaiProfiler::StartProfiling(TimePoint tp) { +Status VitisaiProfiler::StartProfiling(TimePoint tp) { // Notify VAIP EP that profiling has started with base timestamp profiler_start(std::chrono::duration_cast( tp.time_since_epoch()) .count()); - return true; + return Status::OK(); } void VitisaiProfiler::EndProfiling(TimePoint tp, Events& events) { diff --git a/onnxruntime/core/providers/vitisai/vitisai_profiler.h b/onnxruntime/core/providers/vitisai/vitisai_profiler.h index 35e97011cb95c..7c72ae8a8eaef 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_profiler.h +++ b/onnxruntime/core/providers/vitisai/vitisai_profiler.h @@ -12,7 +12,7 @@ class 
VitisaiProfiler final : public EpProfiler { VitisaiProfiler() = default; ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(VitisaiProfiler); ~VitisaiProfiler() {} - bool StartProfiling(TimePoint) override; + Status StartProfiling(TimePoint) override; void EndProfiling(TimePoint, Events&) override; void Start(uint64_t) override {} void Stop(uint64_t, const EventRecord&) override {} diff --git a/onnxruntime/core/providers/webgpu/webgpu_profiler.cc b/onnxruntime/core/providers/webgpu/webgpu_profiler.cc index 6e826218464cf..146deb5fe7bba 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_profiler.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_profiler.cc @@ -11,9 +11,9 @@ namespace webgpu { WebGpuProfiler::WebGpuProfiler(WebGpuContext& context) : context_{context} {} -bool WebGpuProfiler::StartProfiling(TimePoint) { +Status WebGpuProfiler::StartProfiling(TimePoint) { enabled_ = true; - return true; + return Status::OK(); } void WebGpuProfiler::EndProfiling(TimePoint tp, onnxruntime::profiling::Events& events) { diff --git a/onnxruntime/core/providers/webgpu/webgpu_profiler.h b/onnxruntime/core/providers/webgpu/webgpu_profiler.h index 2b530563b2c5c..25becb827f3f3 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_profiler.h +++ b/onnxruntime/core/providers/webgpu/webgpu_profiler.h @@ -15,7 +15,7 @@ class WebGpuProfiler final : public onnxruntime::profiling::EpProfiler { WebGpuProfiler(WebGpuContext& context); ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(WebGpuProfiler); ~WebGpuProfiler() {} - bool StartProfiling(TimePoint) override; + Status StartProfiling(TimePoint) override; void EndProfiling(TimePoint, onnxruntime::profiling::Events&) override; void Start(uint64_t) override { } diff --git a/onnxruntime/core/session/plugin_ep/ep_event_profiling.cc b/onnxruntime/core/session/plugin_ep/ep_event_profiling.cc index e6538de2546f0..5cfe777681078 100644 --- a/onnxruntime/core/session/plugin_ep/ep_event_profiling.cc +++ b/onnxruntime/core/session/plugin_ep/ep_event_profiling.cc @@ 
-47,7 +47,7 @@ PluginEpProfiler::~PluginEpProfiler() { profiler_impl_.Release(&profiler_impl_); } -bool PluginEpProfiler::StartProfiling(TimePoint profiling_start_time) { +Status PluginEpProfiler::StartProfiling(TimePoint profiling_start_time) { // Store the epoch-based profiling start time for computing absolute correlation IDs in Start()/Stop(). profiling_start_time_epoch_us_ = static_cast( std::chrono::duration_cast(profiling_start_time.time_since_epoch()).count()); @@ -56,17 +56,15 @@ bool PluginEpProfiler::StartProfiling(TimePoint profiling_start_time) { int64_t offset_ns = std::chrono::duration_cast( std::chrono::high_resolution_clock::now() - profiling_start_time) .count(); - bool success = true; + Status status = ToStatusAndRelease(profiler_impl_.StartProfiling(&profiler_impl_, offset_ns)); if (!status.IsOK()) { - // Log error but don't throw as profiling failures shouldn't break execution. LOGS(logger_, ERROR) << "OrtEpProfilerImpl::StartProfiling() for " << ep_name_ << " returned an error OrtStatus: " << status.ErrorMessage(); - success = false; } - return success; + return status; } void PluginEpProfiler::EndProfiling(TimePoint /*profiling_start_time*/, profiling::Events& events) { diff --git a/onnxruntime/core/session/plugin_ep/ep_event_profiling.h b/onnxruntime/core/session/plugin_ep/ep_event_profiling.h index 8147162dcb56e..b7d8bdcb37c9c 100644 --- a/onnxruntime/core/session/plugin_ep/ep_event_profiling.h +++ b/onnxruntime/core/session/plugin_ep/ep_event_profiling.h @@ -53,7 +53,7 @@ class PluginEpProfiler final : public profiling::EpProfiler { ~PluginEpProfiler() override; ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(PluginEpProfiler); - bool StartProfiling(TimePoint profiling_start_time) override; + Status StartProfiling(TimePoint profiling_start_time) override; void EndProfiling(TimePoint start_time, profiling::Events& events) override; void Start(uint64_t relative_ort_event_id) override; diff --git 
a/onnxruntime/core/session/plugin_ep/ep_kernel_registration.cc b/onnxruntime/core/session/plugin_ep/ep_kernel_registration.cc index 6f29361502a73..e28d9a4b4753b 100644 --- a/onnxruntime/core/session/plugin_ep/ep_kernel_registration.cc +++ b/onnxruntime/core/session/plugin_ep/ep_kernel_registration.cc @@ -207,6 +207,23 @@ class PluginEpOpKernel final : public controlflow::IControlFlowKernel { Status PluginEpOpKernel::Create(FuncManager& /*fn_manager*/, const OpKernelInfo& info, OrtKernelCreateFunc kernel_create_func, void* kernel_create_func_state, /*out*/ std::unique_ptr& op_kernel) { + const auto* ep = info.GetExecutionProvider(); + ORT_ENFORCE(ep != nullptr, "IExecutionProvider* retrieved from OpKernelInfo should never be nullptr"); + const auto* ort_ep = ep->GetOrtEp(); + ORT_ENFORCE(ort_ep != nullptr, "GetOrtEp() returned nullptr for EP '", ep->Type(), "'"); + + // Sanity-check the OrtEp to detect corruption early (e.g., garbage pointer from vtable mismatch). + ORT_ENFORCE(ort_ep->ort_version_supported > 0 && ort_ep->ort_version_supported <= ORT_API_VERSION, + "OrtEp for '", ep->Type(), "' has invalid ort_version_supported=", ort_ep->ort_version_supported, + " (expected 1..", ORT_API_VERSION, "). Possible pointer corruption or stale plugin DLL."); + ORT_ENFORCE(ort_ep->GetName != nullptr, + "OrtEp for '", ep->Type(), "' has null GetName function pointer. Possible pointer corruption."); + const char* ort_ep_name = ort_ep->GetName(ort_ep); + ORT_ENFORCE(ort_ep_name != nullptr && ep->Type() == ort_ep_name, + "OrtEp::GetName() returned '", (ort_ep_name ? ort_ep_name : ""), + "' but IExecutionProvider::Type() is '", ep->Type(), + "'. Possible pointer corruption or EP mismatch."); + // OpKernel's constructor *copies* the OpKernelInfo. // Therefore, must create the OpKernel instance immediately so that we can pass the actual OpKernelInfo // to the plugin EP's kernel creation function. 
@@ -218,8 +235,6 @@ Status PluginEpOpKernel::Create(FuncManager& /*fn_manager*/, const OpKernelInfo& const auto& op_type = info.node().OpType(); const auto& node_name = info.node().Name(); - const auto* ep = info.GetExecutionProvider(); - ORT_ENFORCE(ep != nullptr, "IExecutionProvider* retrieved from OpKernelInfo should never be nullptr"); const auto& ep_name = ep->Type(); // Do some basic checks for the OrtKernelImpl provided by the EP. Other checks for missing function implementations diff --git a/onnxruntime/test/providers/cuda/plugin/cuda_plugin_profiling_test.cc b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_profiling_test.cc new file mode 100644 index 0000000000000..bf57b2fae4591 --- /dev/null +++ b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_profiling_test.cc @@ -0,0 +1,289 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +// Tests for the CUDA plugin EP profiling integration. +// Uses InferenceSessionWrapper to directly query the Profiler's EP start status, +// which propagates the OrtStatus* returned by the plugin's StartProfiling C API. +// This distinguishes "profiling not compiled in / CUPTI unavailable" (skip) +// from "profiling started but no kernel events appeared" (regression/fail). 
+ +#if defined(ORT_UNIT_TEST_HAS_CUDA_PLUGIN_EP) && defined(ENABLE_CUDA_PROFILING) + +#include +#include +#include +#include +#include + +#include +#include +#include +#include "nlohmann/json.hpp" + +#include "core/framework/data_types.h" +#include "core/framework/error_code_helper.h" +#include "core/framework/run_options.h" +#include "core/framework/tensor.h" +#include "core/session/abi_session_options_impl.h" +#include "core/session/inference_session.h" +#include "core/session/onnxruntime_cxx_api.h" +#include "core/session/ort_env.h" +#include "core/session/utils.h" +#include "test/util/include/asserts.h" +#include "test/util/include/file_util.h" +#include "test/util/include/inference_session_wrapper.h" + +extern std::unique_ptr ort_env; + +namespace onnxruntime { +namespace test { +namespace { + +constexpr const char* kCudaPluginEpName = "CudaPluginExecutionProvider"; +constexpr const char* kRegistrationName = "CudaPluginProfilingTest"; + +std::filesystem::path GetCudaPluginLibraryPath() { + return GetSharedLibraryFileName(ORT_TSTR("onnxruntime_providers_cuda_plugin")); +} + +// Get the internal OrtEnv from the C++ Ort::Env wrapper. +OrtEnv& GetOrtEnv() { + return *static_cast(*ort_env); +} + +// RAII handle that registers/unregisters the CUDA plugin EP library. +// Uses the C API directly to avoid exceptions (the plugin DLL may fail to load +// if CUPTI is not on PATH due to the hard import dependency). +// OrtStatus* returns are wrapped in Ort::Status for leak-safe RAII. 
+class ScopedCudaPluginRegistration { + public: + ScopedCudaPluginRegistration(Ort::Env& env, const char* registration_name) + : env_(env), name_(registration_name) { + auto lib_path = GetCudaPluginLibraryPath(); + if (!std::filesystem::exists(lib_path)) { + load_error_ = "Plugin library not found: " + lib_path.string(); + return; + } + Ort::Status status(Ort::GetApi().RegisterExecutionProviderLibrary( + env_, name_.c_str(), lib_path.c_str())); + if (!status.IsOK()) { + load_error_ = status.GetErrorMessage(); + return; + } + available_ = true; + } + + ~ScopedCudaPluginRegistration() { + if (available_) { + Ort::Status status(Ort::GetApi().UnregisterExecutionProviderLibrary( + env_, name_.c_str())); + ORT_UNUSED_PARAMETER(status); // intentionally ignore unregister errors during teardown + } + } + + bool IsAvailable() const { return available_; } + const std::string& LoadError() const { return load_error_; } + + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(ScopedCudaPluginRegistration); + + private: + Ort::Env& env_; + std::string name_; + std::string load_error_; + bool available_ = false; +}; + +// Find the CUDA plugin EP device after registration. 
+Ort::ConstEpDevice FindCudaPluginDevice(Ort::Env& env) { + auto ep_devices = env.GetEpDevices(); + for (const auto& device : ep_devices) { + if (strcmp(device.EpName(), kCudaPluginEpName) == 0) { + return device; + } + } + return Ort::ConstEpDevice{nullptr}; +} + +} // namespace + +class CudaPluginProfilingTest : public ::testing::Test { + protected: + void SetUp() override { + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + if (err != cudaSuccess || device_count == 0) { + GTEST_SKIP() << "No CUDA device available."; + } + + registration_ = std::make_unique( + *ort_env, kRegistrationName); + if (!registration_->IsAvailable()) { + GTEST_SKIP() << "CUDA plugin EP library not available: " + << registration_->LoadError(); + } + + cuda_device_ = FindCudaPluginDevice(*ort_env); + if (!cuda_device_) { + GTEST_SKIP() << "No CUDA plugin EP device found after registration."; + } + } + + void TearDown() override { + registration_.reset(); + cudaDeviceSynchronize(); + } + + std::unique_ptr registration_; + Ort::ConstEpDevice cuda_device_{nullptr}; +}; + +// Test that session-level profiling produces valid JSON with GPU Kernel events +// when CUPTI is functional. Uses InferenceSessionWrapper to directly query the +// profiler's EP start status for definitive pass/skip/fail decisions. +TEST_F(CudaPluginProfilingTest, SessionProfiling_ProducesValidProfile) { + const ORTCHAR_T* model_path = ORT_TSTR("testdata/matmul_1.onnx"); + + // Set up session options with the plugin EP and profiling enabled. 
+ OrtSessionOptions ort_options; + + const OrtEpDevice* device_ptr = static_cast(cuda_device_); + auto ep_devices_span = gsl::make_span(&device_ptr, 1); + + std::unique_ptr factory; + ASSERT_STATUS_OK(CreateIExecutionProviderFactoryForEpDevices( + GetOrtEnv().GetEnvironment(), ep_devices_span, factory)); + ort_options.provider_factories.push_back(std::move(factory)); + + auto profile_prefix = std::filesystem::temp_directory_path() / ORT_TSTR("cuda_plugin_profiling_test"); + ort_options.value.enable_profiling = true; + ort_options.value.profile_file_prefix = profile_prefix.native(); + + // Create session via InferenceSessionWrapper for internal access. + InferenceSessionWrapper session(ort_options.value, GetOrtEnv().GetEnvironment()); + ASSERT_STATUS_OK(session.Load(model_path)); + + OrtStatus* init_status = InitializeSession(&ort_options, session); + ASSERT_STATUS_OK(ToStatusAndRelease(init_status)); + + // Check EP profiling status. Three scenarios: + // 1. No EP profiler registered (plugin not built with profiling) → skip + // 2. EP profiler registered but StartProfiling failed (CUPTI unavailable/blocked) → skip + // 3. EP profiler started successfully → kernel events MUST appear + const auto& profiler = session.GetProfiling(); + if (!profiler.HasEpProfilers()) { + GTEST_SKIP() << "Plugin EP did not register a profiler. " + << "It may have been built without ENABLE_CUDA_PROFILING."; + } + + const auto& ep_profiling_status = profiler.GetEpProfilingStatus(); + if (!ep_profiling_status.IsOK()) { + GTEST_SKIP() << "EP profiling did not start: " << ep_profiling_status.ErrorMessage(); + } + + // Profiling started successfully — run inference. + // Input X:[3,2] float, output Y:[3,1]. 
+ std::vector x_data(6, 1.0f); + int64_t x_shape[] = {3, 2}; + + OrtValue input_tensor; + Tensor::InitOrtValue(DataTypeImpl::GetType(), + TensorShape(x_shape, 2), + x_data.data(), OrtMemoryInfo(), + input_tensor); + + std::vector feed_names = {"X"}; + std::vector feeds = {input_tensor}; + std::vector output_names = {"Y"}; + std::vector fetches; + + RunOptions run_options; + ASSERT_STATUS_OK(session.Run(run_options, feed_names, feeds, output_names, &fetches)); + + // End profiling and read the output file. + std::string profile_file_path = session.EndProfiling(); + + auto cleanup_profile = gsl::finally([&profile_file_path] { + std::error_code ec; + std::filesystem::remove(profile_file_path, ec); + }); + + ASSERT_TRUE(std::filesystem::exists(profile_file_path)) + << "Profile file not found: " << profile_file_path; + + std::ifstream profile_stream(profile_file_path); + ASSERT_TRUE(profile_stream.is_open()) << "Could not open: " << profile_file_path; + + std::string content(std::istreambuf_iterator{profile_stream}, + std::istreambuf_iterator{}); + profile_stream.close(); + + auto profile_json = nlohmann::json::parse(content); + ASSERT_TRUE(profile_json.is_array()) << "Profile JSON is not an array"; + ASSERT_GT(profile_json.size(), 0u) << "Profile JSON is empty"; + + // Validate standard fields on all entries. + for (const auto& entry : profile_json) { + if (!entry.is_object() || !entry.contains("name")) { + continue; + } + EXPECT_TRUE(entry.contains("pid")) << "Missing 'pid': " << entry; + EXPECT_TRUE(entry.contains("ts")) << "Missing 'ts': " << entry; + EXPECT_TRUE(entry.contains("dur")) << "Missing 'dur': " << entry; + EXPECT_TRUE(entry.contains("ph")) << "Missing 'ph': " << entry; + EXPECT_TRUE(entry.contains("args")) << "Missing 'args': " << entry; + } + + // Since EP profiling started OK, GPU Kernel events MUST be present. + // Their absence indicates a regression in the profiling wiring. 
+ std::vector kernel_events; + for (const auto& entry : profile_json) { + if (entry.is_object() && entry.contains("cat") && entry["cat"] == "Kernel") { + kernel_events.push_back(entry); + } + } + + ASSERT_FALSE(kernel_events.empty()) + << "EP profiling started successfully (CUPTI tracing is active) but no GPU " + << "Kernel events were found in the profile output. This is a regression.\n" + << "Profile content (first 2000 chars): " << content.substr(0, 2000); + + // Validate kernel event metadata. + for (const auto& event : kernel_events) { + EXPECT_TRUE(event.contains("ts")) << event; + EXPECT_TRUE(event.contains("dur")) << event; + EXPECT_GE(event["dur"].get(), 0) << event; + ASSERT_TRUE(event.contains("args")) << "Kernel event missing 'args': " << event; + const auto& args = event["args"]; + EXPECT_TRUE(args.contains("stream")) << "Kernel missing 'stream': " << event; + EXPECT_TRUE(args.contains("block_x")) << "Kernel missing 'block_x': " << event; + } + + // Timeline plausibility: kernel timestamps should fall within the session + // profiling window (derived from CPU-side events). 
+ int64_t session_end_us = 0; + for (const auto& entry : profile_json) { + if (!entry.is_object() || !entry.contains("cat")) continue; + std::string cat = entry["cat"].get(); + if ((cat == "Session" || cat == "Node" || cat == "Api") && + entry.contains("ts") && entry.contains("dur")) { + int64_t end = entry["ts"].get() + entry["dur"].get(); + session_end_us = std::max(session_end_us, end); + } + } + + if (session_end_us > 0) { + constexpr int64_t kMarginUs = 10'000; // 10ms margin for GPU clock skew + for (const auto& event : kernel_events) { + int64_t ts = event["ts"].get(); + EXPECT_GE(ts, -kMarginUs) + << "GPU kernel ts before profiling start (domain mismatch?): " << event; + EXPECT_LE(ts, session_end_us + kMarginUs) + << "GPU kernel ts beyond session end (domain mismatch?): " << event; + } + } +} + +} // namespace test +} // namespace onnxruntime + +#endif // defined(ORT_UNIT_TEST_HAS_CUDA_PLUGIN_EP) && defined(ENABLE_CUDA_PROFILING) diff --git a/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py b/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py index ff9f2edd9d002..c03545fc31435 100644 --- a/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py +++ b/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py @@ -1,6 +1,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. +import json import os import tempfile import unittest @@ -2395,6 +2396,85 @@ def test_iobinding_matmul(self): if os.path.exists(model_path): os.remove(model_path) + # ---- Profiling tests ---- + + def _run_profiling_test(self): + """Run a model with session-level profiling enabled and verify the JSON output. + + Validates that profiling produces a valid JSON file with standard event + fields. GPU Kernel event validation (CUPTI) is handled by the C++ test + (cuda_plugin_profiling_test.cc) which can directly probe CUPTI availability. 
+ """ + target_device = get_cuda_plugin_device() + + with tempfile.NamedTemporaryFile(suffix=".onnx", delete=False) as tmp: + model_path = tmp.name + profile_file = None + try: + create_matmul_model(model_path) + sess_options = _create_session_options() + sess_options.add_provider_for_devices([target_device], {}) + + profile_prefix = os.path.join(tempfile.gettempdir(), "cuda_plugin_ep_profiling_test") + sess_options.enable_profiling = True + sess_options.profile_file_prefix = profile_prefix + + sess = onnxrt.InferenceSession(model_path, sess_options=sess_options) + + assigned_nodes, assignment_info = _get_assigned_nodes(sess, CUDA_PLUGIN_EP_NAME) + self.assertTrue( + assigned_nodes, + f"{CUDA_PLUGIN_EP_NAME} was assigned no nodes. " + f"Assignments: {_format_assignment_summary(assignment_info)}", + ) + + a = np.random.rand(3, 4).astype(np.float32) + b = np.random.rand(4, 5).astype(np.float32) + sess.run(None, {"A": a, "B": b}) + + profile_file = sess.end_profiling() + self.assertTrue(profile_file, "No profile file returned") + self.assertTrue(os.path.exists(profile_file), f"Profile file not found: {profile_file}") + + with open(profile_file) as f: + profile_data = json.load(f) + + self.assertIsInstance(profile_data, list) + self.assertGreater(len(profile_data), 0, "Profile JSON is empty") + + # Every event entry must have standard tracing fields. + required_keys = {"pid", "dur", "ts", "ph", "name", "args"} + for entry in profile_data: + if not isinstance(entry, dict): + continue + if "name" not in entry: + continue + for key in required_keys: + self.assertIn(key, entry, f"Missing '{key}' in profile entry: {entry}") + + # If GPU kernel events are present, validate their metadata. 
+ kernel_events = [e for e in profile_data if isinstance(e, dict) and e.get("cat") == "Kernel"] + if kernel_events: + for event in kernel_events: + self.assertIn("ts", event) + self.assertIn("dur", event) + self.assertGreaterEqual(event["dur"], 0) + args = event.get("args", {}) + self.assertIn("stream", args, f"GPU kernel event missing 'stream': {event}") + self.assertIn("block_x", args, f"GPU kernel event missing 'block_x': {event}") + else: + print("Note: No GPU Kernel events in profile (CUPTI may not be available).") + + finally: + if os.path.exists(model_path): + os.remove(model_path) + if profile_file and os.path.exists(profile_file): + os.remove(profile_file) + + def test_session_profiling(self): + """Verify session-level profiling produces valid output with the CUDA Plugin EP.""" + self._run_profiling_test() + if __name__ == "__main__": unittest.main()