@@ -26,39 +26,12 @@ using bf16__ = __nv_bfloat16;
 using bf16__ = __hip_bfloat16;
 #endif  // __HIP_PLATFORM_AMD__
 
-
-#ifdef __HIP_PLATFORM_AMD__
-
-template <int BLOCK_THREADS>
-__global__ void amax_final_reduce(const float * __restrict__ block_amax,
-                                  float * __restrict__ global_amax,
-                                  int num_blocks) {
-  float val = 0.f;
-
-  for (int i = threadIdx.x; i < num_blocks; i += BLOCK_THREADS) {
-    val = fmaxf(val, block_amax[i]);
-  }
-
-  const int warp_id = threadIdx.x / THREADS_PER_WARP;
-  const float block_max =
-      reduce_max<BLOCK_THREADS / THREADS_PER_WARP>(val, warp_id);
-
-  if (threadIdx.x == 0) {
-    *global_amax = block_max;
-  }
-}
-
-#endif
+constexpr int amax_kernel_threads = 512;
 
 template <int nvec, bool aligned, typename InputType>
 __launch_bounds__(amax_kernel_threads) __global__
-#ifdef __HIP_PLATFORM_AMD__
-void amax_kernel(const InputType *input, float *amax, float * __restrict__ block_amax, const size_t N,
-                 const size_t num_aligned_elements) {
-#else
     void amax_kernel(const InputType *input, float *amax, const size_t N,
                      const size_t num_aligned_elements) {
-#endif
   VectorizedLoader<InputType, nvec, aligned> loader(input, N);
   InputType max{0.f};
   const int warp_id = threadIdx.x / THREADS_PER_WARP;
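Note: with the two-stage path deleted, every block now folds its result into the global amax through `atomicMaxFloat`, which is defined elsewhere in the tree and not part of this diff. A minimal sketch of the usual formulation is below; it relies on the fact that non-negative IEEE-754 floats order the same as their signed-integer bit patterns, which holds here because amax is zero-initialized and all candidates are absolute values:

```cpp
// Sketch only -- the repo's actual atomicMaxFloat is not shown in this diff.
// Safe because amax is memset to 0 and all candidates are >= 0, so the
// integer reinterpretation of the float bits is monotonic.
__device__ inline float atomicMaxFloat(float *addr, float value) {
  const int old = atomicMax(reinterpret_cast<int *>(addr), __float_as_int(value));
  return __int_as_float(old);
}
```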
@@ -92,23 +65,12 @@ __launch_bounds__(amax_kernel_threads) __global__
   // Reduce amax over block
   max = reduce_max<amax_kernel_threads / THREADS_PER_WARP>(max, warp_id);
   if (threadIdx.x == 0) {
-#ifdef __HIP_PLATFORM_AMD__
-    if (block_amax != nullptr) {
-      // 2-stage: write per-block result
-      block_amax[blockIdx.x] = max;
-    } else {
-      // Atomic path: directly update global amax
-      atomicMaxFloat(amax, max);
-    }
-#else
     atomicMaxFloat(amax, max);
-#endif
   }
 }
 
 template <int nvec, typename InputType>
-void launch_amax_kernel(const InputType *input, float *amax, const size_t N, float *block_amax,
-                        size_t block_capacity, cudaStream_t stream) {
+void launch_amax_kernel(const InputType *input, float *amax, const size_t N, cudaStream_t stream) {
   // Zero out amax so we can update with atomic max
   (void)cudaMemsetAsync(amax, 0, sizeof(float), stream);
 
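Both the surviving kernel and the deleted `amax_final_reduce` lean on the shared `reduce_max<num_warps>` helper, also outside this diff. A sketch of the typical shape, assuming `THREADS_PER_WARP == 32` and CUDA shuffle intrinsics:

```cpp
// Illustrative block-wide max reduction in the style of reduce_max<num_warps>;
// the real helper lives in the repo's utils header. Result is valid on thread 0.
template <int num_warps>
__device__ float reduce_max_sketch(float val, int warp_id) {
  __shared__ float warp_max[num_warps];
  const int lane = threadIdx.x % 32;
  // Stage 1: reduce within each warp via shuffles.
  for (int offset = 16; offset > 0; offset /= 2)
    val = fmaxf(val, __shfl_down_sync(0xffffffff, val, offset));
  if (lane == 0) warp_max[warp_id] = val;
  __syncthreads();
  // Stage 2: the first warp reduces the per-warp maxima (padding with 0.f is
  // safe because amax candidates are non-negative).
  val = (threadIdx.x < num_warps) ? warp_max[threadIdx.x] : 0.f;
  if (warp_id == 0)
    for (int offset = 16; offset > 0; offset /= 2)
      val = fmaxf(val, __shfl_down_sync(0xffffffff, val, offset));
  return val;
}
```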
@@ -127,54 +89,24 @@ void launch_amax_kernel(const InputType *input, float *amax, const size_t N, flo
   constexpr size_t max_blocks = 65535;
   num_blocks = std::min(num_blocks, max_blocks);
 
-#ifdef __HIP_PLATFORM_AMD__
-  if (block_capacity < num_blocks)
-    block_amax = nullptr;
-#endif
-
   // Launch kernel
   switch (align) {
     case Alignment::SAME_ALIGNED:
-#ifdef __HIP_PLATFORM_AMD__
-      amax_kernel<nvec, true, InputType>
-          <<<num_blocks, threads, 0, stream>>>(input, amax, block_amax, N, num_aligned_elements);
-#else
       amax_kernel<nvec, true, InputType>
           <<<num_blocks, threads, 0, stream>>>(input, amax, N, num_aligned_elements);
-#endif
       break;
     case Alignment::SAME_UNALIGNED:
-#ifdef __HIP_PLATFORM_AMD__
-      amax_kernel<nvec, false, InputType>
-          <<<num_blocks, threads, 0, stream>>>(input, amax, block_amax, N, num_aligned_elements);
-#else
       amax_kernel<nvec, false, InputType>
           <<<num_blocks, threads, 0, stream>>>(input, amax, N, num_aligned_elements);
-#endif
       break;
     case Alignment::DIFFERENT: {
       // This case is a logic error, since there is only one pointer (input)
       // in the alignment check. Still safe to process without vectorization.
-#ifdef __HIP_PLATFORM_AMD__
-      amax_kernel<1, true, InputType><<<num_blocks, threads, 0, stream>>>(input, amax, block_amax, N, N);
-#else
       amax_kernel<1, true, InputType><<<num_blocks, threads, 0, stream>>>(input, amax, N, N);
-#endif
       break;
     }
   }
 
-#ifdef __HIP_PLATFORM_AMD__
-  if (block_amax != nullptr) {
-    constexpr int FINAL_REDUCE_THREADS = 256;
-    dim3 fr_block(FINAL_REDUCE_THREADS);
-    dim3 fr_grid(1);
-
-    amax_final_reduce<FINAL_REDUCE_THREADS>
-        <<<fr_grid, fr_block, 0, stream>>>(block_amax, amax, static_cast<int>(num_blocks));
-  }
-#endif
-
   // Check results
   NVTE_CHECK_CUDA(cudaGetLastError());
 }
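For context on the `switch (align)` above: with a single input pointer, the alignment check can only report whether that pointer is a multiple of the vector width, so `Alignment::DIFFERENT` is unreachable, as the in-code comment notes. A hypothetical sketch of that classification (the real check belongs to the `VectorizedLoader` machinery and is not in this diff):

```cpp
#include <cstdint>

enum class Alignment { SAME_ALIGNED, SAME_UNALIGNED, DIFFERENT };

// Sketch: classify a single pointer against the nvec-element vector width.
template <typename T>
Alignment check_alignment_sketch(const T *ptr, int nvec) {
  const auto addr = reinterpret_cast<std::uintptr_t>(ptr);
  return (addr % (nvec * sizeof(T)) == 0) ? Alignment::SAME_ALIGNED
                                          : Alignment::SAME_UNALIGNED;
}
```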
@@ -183,12 +115,6 @@ void launch_amax_kernel(const InputType *input, float *amax, const size_t N, flo
 }  // namespace transformer_engine
 
 void nvte_compute_amax(const NVTETensor input_, const NVTETensor output_, cudaStream_t stream) {
-#ifdef __HIP_PLATFORM_AMD__
-  nvte_compute_amax_with_workspace(input_, output_, /*workspace=*/nullptr, stream);
-}
-
-void nvte_compute_amax_with_workspace(const NVTETensor input_, const NVTETensor output_, const NVTETensor workspace_, cudaStream_t stream) {
-#endif
   NVTE_API_CALL(nvte_compute_amax);
   using namespace transformer_engine;
 
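Callers on the CUDA side are unaffected; on ROCm, code that previously went through the workspace variant now uses the single entry point:

```cpp
// Before this change (ROCm only, removed by this diff):
//   nvte_compute_amax_with_workspace(input, output, workspace, stream);
// After (all platforms):
nvte_compute_amax(input, output, stream);
```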
@@ -224,31 +150,11 @@ void nvte_compute_amax_with_workspace(const NVTETensor input_, const NVTETensor
               to_string(output.amax.dtype), ")");
   CheckOutputTensor(output, "output_compute_amax", true);
 
-#ifdef __HIP_PLATFORM_AMD__
-  // Optional workspace
-  float *block_amax = nullptr;
-  size_t block_capacity = 0;
-
-  if (workspace_ != nullptr) {
-    auto &workspace = *reinterpret_cast<Tensor *>(workspace_);
-    if (workspace.data.dptr != nullptr) {
-      NVTE_CHECK(workspace.data.dtype == DType::kFloat32,
-                 "Workspace tensor for amax computation must be FP32, got dtype=",
-                 to_string(workspace.data.dtype));
-      block_amax = reinterpret_cast<float *>(workspace.data.dptr);
-      block_capacity = workspace.data.numel();
-    }
-  }
-#endif
-
   // Compute amax
   TRANSFORMER_ENGINE_TYPE_SWITCH_INPUT(
       input.data.dtype, IType, constexpr int nvec = 32 / sizeof(IType);
       launch_amax_kernel<nvec>(reinterpret_cast<const IType *>(input.data.dptr),
                                reinterpret_cast<float *>(output.amax.dptr), input.data.numel(),
-#ifdef __HIP_PLATFORM_AMD__
-                               block_amax, block_capacity,
-#endif
                                stream););  // NOLINT(*)
 }
 
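As a reminder of what the `nvec = 32 / sizeof(IType)` dispatch works out to, the vector width is sized so each thread loads 32 bytes per vectorized access:

```cpp
#include <cuda_fp16.h>

// Per-dtype vector widths implied by nvec = 32 / sizeof(IType):
static_assert(32 / sizeof(float) == 8, "fp32 -> nvec = 8");
static_assert(32 / sizeof(__half) == 16, "fp16/bf16 -> nvec = 16");
// 1-byte FP8 element types would give nvec = 32.
```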