Handle PyCUtensorMapObject in extractTmaDesc in the launcher

nputikhin · Google-ML-Automation · commit 0d402807af32 · 2025-10-30T04:55:52.000-07:00
Reenables failing tests

PiperOrigin-RevId: 825528658
diff --git a/third_party/triton/temporary/launcher_tma_desc_fix.patch b/third_party/triton/temporary/launcher_tma_desc_fix.patch
@@ -0,0 +1,144 @@
+diff --git a/third_party/nvidia/backend/cuda_utils.cc b/third_party/nvidia/backend/cuda_utils.cc
+--- a/third_party/nvidia/backend/cuda_utils.cc
++++ b/third_party/nvidia/backend/cuda_utils.cc
+@@ -270,51 +270,16 @@ bool extractPointer(PyObject* obj, void*
+   return true;
+ }
+ 
++CUtensorMap* getTmaDesc(PyObject* obj);
++
+ // Extract a CUtensorMap descriptor from a python object, and store it to the
+ // memory location pointed by ptr.
+ bool extractTmaDesc(PyObject* obj, void* ptr) {
+-  if (sizeof(CUtensorMap*) != 8) {
+-    PyErr_SetString(PyExc_SystemError,
+-                "extractTmaDesc() requires 64-bit compilation");
+-    return false;
+-  }
+-
+-  UniquePyObjectPtr method_ret(
+-      PyObject_CallMethod(obj, "tma_desc_cpu_ptr", nullptr));
+-  // Checking the error retains context if tma_desc_cpu_ptr raises an exception.
+-  if (PyErr_Occurred()) {
+-    return false;
+-  }
+-
+-  if (!method_ret) {
+-    PyErr_SetString(PyExc_SystemError, "Call to tma_desc_cpu_ptr() failed");
++  CUtensorMap* tensor_map = getTmaDesc(obj);
++  if (tensor_map == nullptr) {
+     return false;
+   }
+-
+-  if (!PyLong_Check(method_ret.get())) {
+-    PyErr_SetString(PyExc_TypeError,
+-                    "tma_desc_cpu_ptr() must return 64-bit int");
+-    return false;
+-  }
+-
+-  uint64_t ptr_as_uint = PyLong_AsUnsignedLongLong(method_ret.get());
+-  if (PyErr_Occurred()) {
+-    return false;
+-  }
+-
+-  if (!ptr_as_uint) {
+-    PyErr_SetString(PyExc_ValueError,
+-                    "received NULL ptr from tma_desc_cpu_ptr()");
+-    return false;
+-  }
+-  if (ptr_as_uint % 64 != 0) {
+-    PyErr_SetString(PyExc_ValueError,
+-                    "tma_desc_cpu_ptr() must be 64-byte aligned");
+-    return false;
+-  }
+-
+-  *static_cast<CUtensorMap*>(ptr) =
+-      *reinterpret_cast<CUtensorMap*>(ptr_as_uint);
++  *static_cast<CUtensorMap*>(ptr) = *tensor_map;
+   return true;
+ }
+ 
+@@ -392,6 +357,7 @@ struct ExtractionInfo {
+   // Prefixes of types reprs supported by the extractor.
+   llvm::SmallVector<llvm::StringRef> supported_type_repr_prefixes;
+   std::size_t size;         // Size required by the extracted value.
++  std::size_t alignment;    // Alignment requirement for the extracted value.
+   ExtractorType extractor;  // Function to call to extract the value.
+ 
+   // Builds an ExtractionInfo for a given type T and a list of type reprs that
+@@ -400,7 +366,7 @@ struct ExtractionInfo {
+   static ExtractionInfo build(
+       std::initializer_list<llvm::StringRef> supported_type_reprs,
+       ExtractorType extractor = extractValue<T>) {
+-    return {supported_type_reprs, sizeof(T), extractor};
++    return {supported_type_reprs, sizeof(T), alignof(T), extractor};
+   }
+ 
+   // Checks if the extractor supports extracting a given type repr.
+@@ -428,7 +394,7 @@ const ExtractionInfo kExtractionInfos[]{
+     // Note: types are e.g. '*fp32', so no closing quote is intentional.
+     ExtractionInfo::build<void*>({"'*"}, extractPointer),
+     ExtractionInfo{
+-        {"None", "'none'"}, 0, nullptr},  // Represent constexprs as None
++        {"None", "'none'"}, 0, 0, nullptr},  // Represent constexprs as None
+     ExtractionInfo::build<CUtensorMap>({"'nvTmaDesc'"}, extractTmaDesc),
+ };
+ 
+@@ -628,7 +594,19 @@ PyObject* launch(PyObject* self, PyObjec
+     if (extraction_info.size == 0) {
+       continue;  // skip adding constexpr parameters
+     }
+-    config.params[params_idx] = alloca(extraction_info.size);
++    size_t alignment = std::max(1UL, extraction_info.alignment);
++
++    // Allocate enough space on the stack to guarantee an aligned block.
++    size_t size_with_alignment = extraction_info.size + alignment - 1;
++    void *param_storage_ptr = alloca(size_with_alignment);
++
++    void *aligned_ptr = std::align(alignment, extraction_info.size,
++                                   param_storage_ptr, size_with_alignment);
++    if (aligned_ptr == nullptr) {
++      PyErr_SetString(PyExc_MemoryError, "Failed to align parameter storage");
++      return nullptr;
++    }
++    config.params[params_idx] = aligned_ptr;
+     if (!extraction_info.extractor(arg, config.params[params_idx])) {
+       return nullptr;
+     }
+@@ -940,6 +918,36 @@ static PyTypeObject PyCUtensorMapType = 
+ };
+ // clang-format on
+ 
++namespace {
++
++// Returns obj's embedded CUtensorMap, or nullptr with a Python error set.
++CUtensorMap* getTmaDesc(PyObject* obj) {
++  if (sizeof(CUtensorMap*) != 8) {
++    PyErr_SetString(PyExc_SystemError,
++                    "getTmaDesc() requires 64-bit compilation");
++    return nullptr;
++  }
++  if (Py_TYPE(obj) != &PyCUtensorMapType) {
++    PyErr_Format(PyExc_TypeError,
++                 "object must be of type PyCUtensorMap, got %s",
++                 Py_TYPE(obj)->tp_name);
++    return nullptr;
++  }
++  CUtensorMap* map = &reinterpret_cast<PyCUtensorMapObject*>(obj)->tensorMap;
++  // PyCUtensorMapObject aligns tensorMap to 128; the low 7 bits must be 0.
++  const uintptr_t misalign = reinterpret_cast<uintptr_t>(map) & (128 - 1);
++  if (misalign != 0) {
++    PyErr_Format(
++        PyExc_ValueError,
++        "CUtensorMap must be aligned to 128B, but got (&map) mod 128 = %lu",
++        static_cast<unsigned long>(misalign));  // %lu needs unsigned long
++    return nullptr;
++  }
++  return map;
++}
++
++}  // namespace
++
+ static PyObject *fillTMADescriptor(PyObject *self, PyObject *args) {
+   unsigned long long global_address;
+   int swizzle;
diff --git a/third_party/triton/temporary/series.bzl b/third_party/triton/temporary/series.bzl
@@ -15,5 +15,6 @@ those to this list.
 
 temporary_patch_list = [
     "//third_party/triton:temporary/utility-fix.patch",
+    "//third_party/triton:temporary/launcher_tma_desc_fix.patch",
     # Add new patches just above this line
 ]

Original file line number	Diff line number	Diff line change
`@@ -15,5 +15,6 @@ those to this list.`
`15`	`15`
`16`	`16`	`temporary_patch_list = [`
`17`	`17`	`"//third_party/triton:temporary/utility-fix.patch",`
	`18`	`+ "//third_party/triton:temporary/launcher_tma_desc_fix.patch",`
`18`	`19`	`# Add new patches just above this line`
`19`	`20`	`]`