CUDA Plugin EP: NHWC Cleanup & Hardening (#28612)

tianleiwu · web-flow · commit d165fba0abb8 · 2026-05-29T12:48:54.000-07:00
## Summary

Unifies the NHWC-eligible op allowlist between the bundled CUDA EP and
the CUDA plugin EP into a single shared header, adds kernel-miss
diagnostics, and expands NHWC test coverage from 4 ops to 11.

## Motivation

The bundled EP (`cuda_execution_provider.cc`) and the plugin EP
(`plugin/cuda_ep.cc`) independently maintained their own copies of the
NHWC allowlist. This created a maintenance hazard where ops could be
added to one but not the other, leading to silent divergence.
Additionally, there was no runtime diagnostic when the framework rewrote
a node to the NHWC domain but the plugin EP lacked a matching kernel —
failures were silent fallbacks to CPU.

## Key Changes

### Shared NHWC Allowlist (`cuda_nhwc_ops.h`)

| Item | Detail |
|------|--------|
| New file | `onnxruntime/core/providers/cuda/cuda_nhwc_ops.h` |
| Contents | `IsNhwcEligibleOnnxOp()`, `IsNhwcEligibleMsOp()`, `IsNhwcEligible()` inline functions |
| Ops covered | AveragePool, BatchNormalization, Conv, ConvTranspose, DepthToSpace, GlobalAveragePool, GlobalMaxPool, GridSample, LRN, MaxPool, SpaceToDepth (+ MS-domain GridSample) |

### Bundled EP Refactor (`cuda_execution_provider.cc`)

- Removed the static `std::unordered_set<std::string_view>
cuda_nhwc_onnx_ops` and the inline domain check logic.
- Replaced with a single call to `cuda::IsNhwcEligible(node_domain,
node_op_type)`.

### Plugin EP Refactor & Diagnostics (`plugin/cuda_ep.cc`)

- `ShouldConvertDataLayoutForOpImpl`: Replaced ~20 lines of static set +
domain checks with a single `cuda::IsNhwcEligible()` call.
- `GetCapabilityImpl`: Added a WARNING-level diagnostic in the `else`
branch (kernel not found). When a node in the `com.ms.internal.nhwc`
domain has no registered kernel, the log emits the op type, domain,
version, and node name — making future NHWC registration gaps
immediately visible at session creation.

### Expanded NHWC Test Coverage (`test_cuda_plugin_ep.py`)

- Added `_assert_nhwc_domain_assigned()` helper that verifies NHWC
layout transformation occurred by checking for framework-inserted
Transpose nodes in the EP's assignment info.
- Added `_run_nhwc_model_test()` helper combining domain assertion +
numerical validation.
- Updated 4 existing NHWC tests (Conv, BatchNormalization, MaxPool,
AveragePool) to include structural assertions.
- Added 7 new NHWC test methods:
  - `test_nhwc_conv_transpose`
  - `test_nhwc_global_max_pool`
  - `test_nhwc_global_average_pool`
  - `test_nhwc_depth_to_space`
  - `test_nhwc_space_to_depth`
  - `test_nhwc_lrn`
  - `test_nhwc_grid_sample`

## Testing Notes

Run the full CUDA plugin EP test suite with NHWC enabled:

```bash
bash .env/cuda13_plugin.sh --build --install --test_plugin
```

Or run only the NHWC tests directly:

```bash
cd onnxruntime/test/python/transformers
ORT_TEST_CUDA_PLUGIN_EP=1 python -m unittest \
  test_cuda_plugin_ep.TestCudaPluginEP.test_nhwc_conv \
  test_cuda_plugin_ep.TestCudaPluginEP.test_nhwc_batch_normalization \
  test_cuda_plugin_ep.TestCudaPluginEP.test_nhwc_maxpool \
  test_cuda_plugin_ep.TestCudaPluginEP.test_nhwc_avgpool \
  test_cuda_plugin_ep.TestCudaPluginEP.test_nhwc_conv_transpose \
  test_cuda_plugin_ep.TestCudaPluginEP.test_nhwc_global_max_pool \
  test_cuda_plugin_ep.TestCudaPluginEP.test_nhwc_global_average_pool \
  test_cuda_plugin_ep.TestCudaPluginEP.test_nhwc_depth_to_space \
  test_cuda_plugin_ep.TestCudaPluginEP.test_nhwc_space_to_depth \
  test_cuda_plugin_ep.TestCudaPluginEP.test_nhwc_lrn \
  test_cuda_plugin_ep.TestCudaPluginEP.test_nhwc_grid_sample
```

All 86 tests in the suite pass (11 NHWC + 75 non-NHWC), with no
regressions.
diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -12,6 +12,7 @@
 #include "core/platform/env_var_utils.h"
 #include "core/providers/cuda/cuda_execution_provider.h"
 #include "core/providers/cuda/cuda_common.h"
+#include "core/providers/cuda/cuda_nhwc_ops.h"
 #include "core/providers/cuda/cuda_allocator.h"
 #include "core/providers/cuda/cuda_fwd.h"
 #include "core/providers/cuda/gpu_data_transfer.h"
@@ -383,23 +384,7 @@ std::optional<bool> CUDAExecutionProvider::ShouldConvertDataLayoutForOp([[maybe_
     return std::nullopt;
   }
 
-  // TODO(mtavenrath) generate list from registered kernels using nhwc domain
-  static const std::unordered_set<std::string_view> cuda_nhwc_onnx_ops{
-      "BatchNormalization",
-      "Conv",
-      "ConvTranspose",
-      "GlobalMaxPool",
-      "MaxPool",
-      "GlobalAveragePool",
-      "AveragePool",
-      "GridSample",
-      "DepthToSpace",
-      "SpaceToDepth",
-      "LRN",
-  };
-
-  return (node_domain == kOnnxDomain && cuda_nhwc_onnx_ops.find(node_op_type) != cuda_nhwc_onnx_ops.end()) ||
-         (node_domain == kMSDomain && node_op_type == "GridSample");
+  return cuda::IsNhwcEligible(node_domain, node_op_type);
 
 #else  // defined(ENABLE_CUDA_NHWC_OPS)
   ORT_UNUSED_PARAMETER(node_domain);
diff --git a/onnxruntime/core/providers/cuda/cuda_nhwc_ops.h b/onnxruntime/core/providers/cuda/cuda_nhwc_ops.h
@@ -0,0 +1,48 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <string_view>
+
+namespace onnxruntime {
+namespace cuda {
+
+// Unified allowlist of ops eligible for NHWC layout conversion in both the
+// bundled CUDA EP and the CUDA plugin EP.  Maintaining a single source of truth
+// prevents silent divergence between the two implementations.
+
+inline bool IsNhwcEligibleOnnxOp(std::string_view op_type) {
+  // Exact-match allowlist of ONNX-domain op types; keep entries alphabetical.
+  return op_type == "AveragePool" ||
+         op_type == "BatchNormalization" ||
+         op_type == "Conv" ||
+         op_type == "ConvTranspose" ||
+         op_type == "DepthToSpace" ||
+         op_type == "GlobalAveragePool" ||
+         op_type == "GlobalMaxPool" ||
+         op_type == "GridSample" ||
+         op_type == "LRN" ||
+         op_type == "MaxPool" ||
+         op_type == "SpaceToDepth";
+}
+
+inline bool IsNhwcEligibleMsOp(std::string_view op_type) {
+  return op_type == "GridSample";  // GridSample is the only op this helper accepts.
+}
+
+// Returns true if the given (domain, op_type) pair is eligible for NHWC
+// conversion.  An empty |domain| (kOnnxDomain) selects the ONNX allowlist and
+// "com.microsoft" (kMSDomain) the MS one; any other domain returns false.
+inline bool IsNhwcEligible(std::string_view domain, std::string_view op_type) {
+  if (domain.empty()) {
+    return IsNhwcEligibleOnnxOp(op_type);
+  }
+  if (domain == "com.microsoft") {
+    return IsNhwcEligibleMsOp(op_type);
+  }
+  return false;
+}
+
+}  // namespace cuda
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep.cc
@@ -20,6 +20,7 @@
 #include <unordered_set>
 
 #include "core/graph/constants.h"
+#include "core/providers/cuda/cuda_nhwc_ops.h"
 
 namespace onnxruntime {
 namespace cuda_plugin {
@@ -214,7 +215,7 @@ OrtStatus* ORT_API_CALL CudaEp::GetCapabilityImpl(
   tentative_nodes.reserve(all_nodes.size());
 
   for (const auto& node : all_nodes) {
-    std::string ep_name = node.GetEpName();
+    const std::string& ep_name = node.GetEpName();
     if (!ep_name.empty()) {
       if (ep_name == ep->name_) {
         candidate_nodes.push_back(node);
@@ -229,6 +230,18 @@ OrtStatus* ORT_API_CALL CudaEp::GetCapabilityImpl(
     if (kernel_def != nullptr) {
       candidate_nodes.push_back(node);
       tentative_nodes.push_back(node);
+    } else {
+      // Emit a diagnostic when an NHWC-domain node has no matching kernel.
+      // This helps identify gaps between the layout conversion allowlist and
+      // the actually-registered NHWC kernels in the plugin build.
+      const std::string& node_domain = node.GetDomain();
+      if (node_domain == kMSInternalNHWCDomain) {
+        ORT_CXX_LOGF(Ort::Logger(&ep->logger_), ORT_LOGGING_LEVEL_WARNING,
+                     "NHWC kernel miss: op=%s domain=%s version=%d node=%s - "
+                     "no matching kernel registered in the CUDA plugin EP.",
+                     node.GetOperatorType().c_str(), node_domain.c_str(),
+                     node.GetSinceVersion(), node.GetName().c_str());
+      }
     }
   }
 
@@ -308,36 +321,11 @@ OrtStatus* ORT_API_CALL CudaEp::ShouldConvertDataLayoutForOpImpl(
     return nullptr;
   }
 
-  // ONNX domain ops that have NHWC kernel registrations.
-  static const std::unordered_set<std::string_view> cuda_nhwc_onnx_ops{
-      "BatchNormalization",
-      "Conv",
-      "ConvTranspose",
-      "GlobalMaxPool",
-      "MaxPool",
-      "GlobalAveragePool",
-      "AveragePool",
-      "GridSample",
-      "DepthToSpace",
-      "SpaceToDepth",
-      "LRN",
-  };
-
-  // Check ONNX domain (empty string) or MS domain (com.microsoft)
-  bool is_onnx_domain = (safe_domain[0] == '\0');
-  bool is_ms_domain = (std::strcmp(safe_domain, "com.microsoft") == 0);
-
-  if (is_onnx_domain && cuda_nhwc_onnx_ops.count(safe_op_type) > 0) {
+  if (cuda::IsNhwcEligible(safe_domain, safe_op_type)) {
     *should_convert = 1;  // Convert
-    return nullptr;
-  }
-
-  if (is_ms_domain && std::strcmp(safe_op_type, "GridSample") == 0) {
-    *should_convert = 1;  // Convert
-    return nullptr;
+  } else {
+    *should_convert = 0;  // Explicitly decline conversion for unsupported NHWC ops.
   }
-
-  *should_convert = 0;  // Explicitly decline conversion for unsupported NHWC ops.
   return nullptr;
 #endif
 }
diff --git a/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py b/onnxruntime/test/python/transformers/test_cuda_plugin_ep.py