
Commit 0a94bb4

xw285cornell, alugorey, and jithunnair-amd authored and committed
[ROCm] CK Flash Attention Backend (pytorch#143695)
Replace pytorch#138947 for re-import. Replaces #1592.

This PR contains the initial implementation of SDPA with the composable_kernel (CK) backend. The CK path can be forced by calling torch.backends.cuda.preferred_rocm_fa_library("ck"). Similarly, you can force the incumbent aotriton implementation by passing in "aotriton" or "default". As you'd expect, not setting this option results in aotriton being used as the backend.

In the case of CK, if PyTorch deems flash attention usable, it will use the CK path in all the same places aotriton would have been used. This PR makes no changes to the heuristics that select which attention scheme to use (i.e. flash attention vs. memory-efficient attention vs. math, etc.). The CK path only gets called when flash attention is both enabled (via USE_FLASH_ATTENTION) and selected at runtime by the existing heuristics.

Files located in pytorch/aten/src/ATen/native/transformers/hip/flash_attn/ck/mha* have been pulled from https://github.com/Dao-AILab/flash-attention, courtesy of @tridao's hard work; he is credited as a co-author.

NOTE: In order to use this backend, the user MUST set USE_CK_FLASH_ATTENTION=1 in their environment when building PyTorch.

Pull Request resolved: pytorch#143695
Approved by: https://github.com/malfet
Co-authored-by: Andy Lugo <[email protected]>
Co-authored-by: Jithun Nair <[email protected]>
1 parent 3251171 commit 0a94bb4
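
For reference, a minimal usage sketch based on the commit message above (tensor shapes and dtypes are illustrative assumptions; PyTorch must have been built for ROCm with USE_CK_FLASH_ATTENTION=1 for the CK path to exist):

import torch
import torch.nn.functional as F

# Force the composable_kernel (CK) flash attention backend on ROCm.
# Passing "aotriton" or "default" instead keeps the incumbent aotriton path.
torch.backends.cuda.preferred_rocm_fa_library("ck")

# (batch, heads, seq_len, head_dim) -- shapes chosen only for illustration
q = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)
k = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)
v = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)

# The SDPA selection heuristics are unchanged; when they pick flash attention,
# the CK kernels run in place of aotriton.
out = F.scaled_dot_product_attention(q, k, v)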

File tree

1,840 files changed: +249,657 −38 lines

Note: large commits have some content hidden by default; only a subset of the 1,840 changed files is shown below.

LICENSE
Lines changed: 4 additions & 0 deletions

@@ -32,6 +32,10 @@ All contributions by Cruise LLC:
 Copyright (c) 2022 Cruise LLC.
 All rights reserved.

+All contributions by Tri Dao:
+Copyright (c) 2024 Tri Dao.
+All rights reserved.
+
 All contributions by Arm:
 Copyright (c) 2021, 2023-2024 Arm Limited and/or its affiliates

aten/src/ATen/CMakeLists.txt
Lines changed: 25 additions & 2 deletions

@@ -168,9 +168,28 @@ file(GLOB flash_attention_cuda_cu "native/transformers/cuda/flash_attn/*.cu")
 file(GLOB flash_attention_cuda_kernels_cu "native/transformers/cuda/flash_attn/kernels/*.cu")
 file(GLOB flash_attention_cuda_cpp "native/transformers/cuda/flash_attn/*.cpp")

-# flash_attention sources
+# flash_attention hip sources
 file(GLOB flash_attention_hip_hip "native/transformers/hip/flash_attn/*.hip")
-file(GLOB flash_attention_src_hip_hip "native/transformers/hip/flash_attn/src/*.hip")
+# if USE_FLASH_ATTENTION is set, ensure CK instances get generated
+if(USE_FLASH_ATTENTION)
+  if(DEFINED ENV{USE_CK_FLASH_ATTENTION})
+    set(USE_CK_FLASH_ATTENTION $ENV{USE_CK_FLASH_ATTENTION})
+    if(USE_CK_FLASH_ATTENTION STREQUAL "1")
+      if(DEFINED ENV{PYTORCH_ROCM_ARCH})
+        list(LENGTH PYTORCH_ROCM_ARCH NUM_ARCHS)
+        if(NUM_ARCHS GREATER 1)
+          message(WARNING "Building CK for multiple archs can increase build time considerably!
+          Consider setting PYTORCH_ROCM_ARCH env var value as the gfx arch you need to build for")
+        endif()
+      endif()
+      message(STATUS "USE_CK_FLASH_ATTENTION is set; building PyTorch with CK Flash Attention enabled")
+      file(GLOB flash_attention_hip_ck_hip "native/transformers/hip/flash_attn/ck/*.hip")
+      list(APPEND native_transformers_hip_hip ${flash_attention_hip_ck_hip})
+    endif()
+  endif()
+  file(GLOB flash_attention_hip_aot_hip "native/transformers/hip/flash_attn/aot/*.hip")
+  file(GLOB flash_attention_src_hip_hip "native/transformers/hip/flash_attn/src/*.hip")
+endif()

 #Mem_eff attention sources
 file(GLOB mem_eff_attention_cuda_cu "native/transformers/cuda/mem_eff_attention/*.cu")

@@ -185,6 +204,7 @@ if(USE_FLASH_ATTENTION)
   list(APPEND ATen_ATTENTION_KERNEL_SRCS ${flash_attention_cuda_kernels_cu})

   list(APPEND native_transformers_hip_hip ${flash_attention_hip_hip})
+  list(APPEND native_transformers_hip_hip ${flash_attention_hip_aot_hip})
   list(APPEND native_transformers_src_hip_hip ${flash_attention_src_hip_hip})
 endif()

@@ -325,6 +345,9 @@ if(USE_ROCM)
   # Next two lines are needed because TunableOp uses third-party/fmt
   list(APPEND ATen_HIP_INCLUDE $<TARGET_PROPERTY:fmt::fmt-header-only,INTERFACE_INCLUDE_DIRECTORIES>)
   list(APPEND ATen_HIP_DEPENDENCY_LIBS fmt::fmt-header-only)
+  if(USE_FLASH_ATTENTION)
+    list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/native/transformers/hip/flash_attn/ck)
+  endif()
   list(APPEND ATen_HIP_SRCS
     ${ATen_HIP_SRCS}
     ${hip_hip}

aten/src/ATen/Context.cpp
Lines changed: 34 additions & 0 deletions

@@ -343,6 +343,40 @@ void Context::setBlasPreferredBackend(at::BlasBackend b) {
 #endif
 }

+at::ROCmFABackend Context::getROCmFAPreferredBackend() const {
+  return rocm_fa_preferred_backend;
+}
+
+void Context::setROCmFAPreferredBackend(at::ROCmFABackend b) {
+
+  // TODO: add plumbing for hasCK for validity checking
+  TORCH_CHECK((b != at::ROCmFABackend::Ck) || hasROCM(),
+      "Cannot set preferred flash attention backend to Ck if PyTorch has not been compiled for ROCm.");
+#ifdef USE_ROCM
+  if(b == at::ROCmFABackend::Ck) {
+    static const bool ck_unsupported = []() {
+      static const std::vector<std::string> archs = {
+          "gfx90a", "gfx942"
+      };
+      for (auto index: c10::irange(getNumGPUs())) {
+        if (!detail::getCUDAHooks().isGPUArch(index, archs)) {
+          TORCH_WARN_ONCE(
+            "Attempting to use CK on an unsupported architecture! Cannot set backend to CK");
+          return true;
+        }
+      }
+      return false;
+    }();
+    if(!ck_unsupported) rocm_fa_preferred_backend = b;
+  }
+  else {
+    rocm_fa_preferred_backend = b;
+  }
+#endif
+  rocm_fa_preferred_backend = b;
+}
+
+
 bool Context::allowFP16ReductionCuBLAS() const {
   return allow_fp16_reduction_cublas;
 }

aten/src/ATen/Context.h
Lines changed: 8 additions & 0 deletions

@@ -4,6 +4,7 @@
 #include <ATen/CPUGeneratorImpl.h>
 #include <ATen/DeviceAccelerator.h>
 #include <ATen/LinalgBackend.h>
+#include <ATen/ROCmFABackend.h>
 #include <ATen/SDPBackend.h>
 #include <ATen/core/ATenGeneral.h>
 #include <ATen/core/DeprecatedTypeProperties.h>

@@ -239,6 +240,9 @@ class TORCH_API Context {
   at::BlasBackend blasPreferredBackend();
   void setBlasPreferredBackend(at::BlasBackend);

+  at::ROCmFABackend getROCmFAPreferredBackend() const;
+  void setROCmFAPreferredBackend(at::ROCmFABackend);
+
   // Note [Enabling Deterministic Operations]
   // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   // Operations in PyTorch that normally act nondeterministically, but have an

@@ -428,6 +432,10 @@
 #endif
       ? at::BlasBackend::Cublaslt
       : at::BlasBackend::Cublas;
+  at::ROCmFABackend rocm_fa_preferred_backend =
+      c10::utils::check_env("TORCH_ROCM_FA_PREFER_CK") == true
+          ? at::ROCmFABackend::Ck
+          : at::ROCmFABackend::Default;
 #ifdef C10_MOBILE
   bool release_original_weights = true;
 #else

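A small sketch of the environment-variable default shown in the Context.h hunk above: when TORCH_ROCM_FA_PREFER_CK is set, rocm_fa_preferred_backend initializes to Ck without any explicit call. The timing assumption (export the variable before torch is imported, since c10::utils::check_env is read when the global context is created) is mine, not stated in the diff:

import os

# Assumption: must be set before importing torch so that
# c10::utils::check_env("TORCH_ROCM_FA_PREFER_CK") sees it at context init.
os.environ["TORCH_ROCM_FA_PREFER_CK"] = "1"

import torch  # the preferred ROCm flash attention backend now defaults to CK
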
aten/src/ATen/ROCmFABackend.h
Lines changed: 31 additions & 0 deletions

@@ -0,0 +1,31 @@
+#pragma once
+
+#include <c10/util/Exception.h>
+
+#include <ostream>
+#include <string>
+
+namespace at {
+
+enum class ROCmFABackend : int8_t { Default, AOTriton, Ck };
+
+inline std::string ROCmFABackendToString(at::ROCmFABackend backend) {
+  switch (backend) {
+    case ROCmFABackend::Default:
+      return "at::ROCmFABackend::Default";
+    case ROCmFABackend::AOTriton:
+      return "at::ROCmFABackend::AOTriton";
+    case ROCmFABackend::Ck:
+      return "at::ROCmFABackend::Ck";
+    default:
+      TORCH_CHECK(false, "Unknown ROCm flash attention backend")
+  }
+}
+
+inline std::ostream& operator<<(
+    std::ostream& stream,
+    at::ROCmFABackend backend) {
+  return stream << ROCmFABackendToString(backend);
+}
+
+} // namespace at

aten/src/ATen/native/transformers/cuda/sdp_utils.cpp
Lines changed: 17 additions & 11 deletions

@@ -28,7 +28,7 @@
 #if USE_ROCM
 #if defined(USE_FLASH_ATTENTION) || defined(USE_MEM_EFF_ATTENTION)
 #include <aotriton/flash.h>
-#define USE_AOTRITON 1
+#define USE_ROCM_ATTENTION 1
 #endif
 #endif

@@ -219,15 +219,21 @@ bool check_flash_attention_hardware_support(sdp_params const& params, bool debug
   using sm80 = SMVersion<8, 0>;
   using sm90 = SMVersion<9, 0>;
 #if USE_ROCM
-#if USE_AOTRITON
-  auto stream = at::cuda::getCurrentCUDAStream().stream();
-  if (hipSuccess != aotriton::v2::flash::check_gpu(stream)) {
-      auto dprops = at::cuda::getCurrentDeviceProperties();
-      if (debug) {
-          TORCH_WARN(
-              "Flash attention was not compiled for current AMD GPU architecture. Attempting to run on architecture ", dprops->gcnArchName);
-      }
-      return false;
+#if USE_ROCM_ATTENTION
+  if(at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) {
+    // User explicitly set CK as the flash attention backend. Return true for now
+    // TODO: Flesh out sanity checks
+    return true;
+  } else {
+    auto stream = at::cuda::getCurrentCUDAStream().stream();
+    if (hipSuccess != aotriton::v2::flash::check_gpu(stream)) {
+        auto dprops = at::cuda::getCurrentDeviceProperties();
+        if (debug) {
+            TORCH_WARN(
+                "Flash attention was not compiled for current AMD GPU architecture. Attempting to run on architecture ", dprops->gcnArchName);
+        }
+        return false;
+    }
   }
 #else
   return false;

@@ -254,7 +260,7 @@ bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug)
   using sm50 = SMVersion<5, 0>;
   using sm90 = SMVersion<9, 0>;
 #if USE_ROCM
-#if USE_AOTRITON
+#if USE_ROCM_ATTENTION
   auto stream = at::cuda::getCurrentCUDAStream().stream();
   if (hipSuccess != aotriton::v2::flash::check_gpu(stream)) {
       auto dprops = at::cuda::getCurrentDeviceProperties();

aten/src/ATen/native/transformers/hip/aotriton_adapter.h
Lines changed: 1 addition & 1 deletion

@@ -124,7 +124,7 @@ inline aotriton::TensorView<0> mk_aoscalartensor(const at::Tensor& q)
 inline aotriton::TensorView<0> mk_philoxtensor(const int64_t* ptr)
 {
   return aotriton::TensorView<0>(reinterpret_cast<intptr_t>(ptr),
-                                 aotriton::DType::kUInt64); // AOTriton excepts unsigned int64
+                                 aotriton::DType::kUInt64); // AOTriton accepts unsigned int64
 }

 } // namespace aotriton_adapter

aten/src/ATen/native/transformers/hip/flash_attn/flash_api.hip renamed to aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip
Lines changed: 16 additions & 23 deletions

@@ -115,24 +115,18 @@ prepare_philox_arguments(float p_dropout, int64_t counter_offset) {
 #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")

 std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor>
-mha_fwd(const at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size
-        const at::Tensor &k, // batch_size x seqlen_k x num_heads_k x head_size
-        const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size
-        std::optional<at::Tensor> &out_, // batch_size x seqlen_q x num_heads x head_size
-        std::optional<at::Tensor> &alibi_slopes_, // num_heads or batch_size x num_heads
-        const float p_dropout,
-        const float softmax_scale,
-        bool is_causal,
-        int window_size_left,
-        int window_size_right,
-        const bool return_softmax,
-        std::optional<at::Generator> gen_) {
-  // Otherwise the kernel will be launched from cuda:0 device
-  // Cast to char to avoid compiler warning about narrowing
-  // [ROCM specific]: must be at the beginning of the function
-  // Otherwise check_gpu_arch() checks cuda:0 device.
-  at::hip::HIPGuardMasqueradingAsCUDA device_guard{(char)q.get_device()};
-
+mha_fwd_aot(const at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size
+            const at::Tensor &k, // batch_size x seqlen_k x num_heads_k x head_size
+            const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size
+            std::optional<at::Tensor> &out_, // batch_size x seqlen_q x num_heads x head_size
+            std::optional<at::Tensor> &alibi_slopes_, // num_heads or batch_size x num_heads
+            const float p_dropout,
+            const float softmax_scale,
+            bool is_causal,
+            int window_size_left,
+            int window_size_right,
+            const bool return_softmax,
+            std::optional<at::Generator> gen_) {
   auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA().stream();
   check_gpu_arch(stream);

@@ -242,7 +236,7 @@ mha_fwd(const at::Tensor &q, // batch_size x seqlen_q x num_heads x head
 }

 std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor>
-mha_varlen_fwd(const at::Tensor &q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
+mha_varlen_fwd_aot(const at::Tensor &q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
               const at::Tensor &k, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
               const at::Tensor &v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
               std::optional<at::Tensor> &out_, // total_q x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i

@@ -408,7 +402,7 @@ mha_varlen_fwd(const at::Tensor &q, // total_q x num_heads x head_size, total_q
 }

 std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
-mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_size_og
+mha_bwd_aot(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_size_og
         const at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size
         const at::Tensor &k, // batch_size x seqlen_k x num_heads_k x head_size
         const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size

@@ -559,7 +553,7 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si
 }

 std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
-mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size
+mha_varlen_bwd_aot(const at::Tensor &dout, // total_q x num_heads, x head_size
               const at::Tensor &q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
               const at::Tensor &k, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
               const at::Tensor &v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i

@@ -747,7 +741,6 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size

   return { dq, dk, dv, softmax_d };
 }
-
-} // namespace pytorch_fmha
+} // namespace pytorch_flash

 #endif
Lines changed: 100 additions & 0 deletions

@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <ostream>
+#include <string>
+#include <ck_tile/core.hpp>
+#include <ck_tile/ops/fmha.hpp>
+
+// keep sync with BlockAttentionBiasEnum
+enum class bias_enum
+{
+    no_bias          = 0,
+    elementwise_bias = 1,
+    alibi            = 2,
+};
+
+struct bias_info
+{
+    bias_enum type;
+    /*
+     * simple dispatch logic
+     *
+     * if type == elementwise_bias:
+     *      if rank_info == 0:
+     *           bias is 1*1*s*s
+     *      elif rank_info == 1:
+     *           bias is 1*h*s*s
+     *      elif rank_info == 2:
+     *           bias is b*h*s*s
+     *
+     * elif type == alibi:
+     *       if rank_info == 0:
+     *           alibi in 1*h
+     *       elif rank_info == 1:
+     *           alibi in b*h
+     */
+    int rank_info;
+
+    void serialize(std::ostream& os) const
+    {
+        if(type == bias_enum::no_bias)
+            os << "n";
+        else if(type == bias_enum::elementwise_bias)
+        {
+            os << "e";
+            if(rank_info != 0)
+            {
+                os << "[" << rank_info << "]";
+            }
+        }
+        else if(type == bias_enum::alibi)
+        {
+            os << "alibi";
+            if(rank_info != 0)
+            {
+                os << "[" << rank_info << "]";
+            }
+        }
+    }
+
+    static bias_info decode(std::string str)
+    {
+        bias_info info{bias_enum::no_bias, 0};
+        if(str == "0" || str == "n")
+        {
+            info.type = bias_enum::no_bias;
+        }
+        else if(str.compare(0, 1, "1") == 0 || str.compare(0, 1, "e") == 0 ||
+                str.compare(0, 11, "elementwise") == 0)
+        {
+            info.type = bias_enum::elementwise_bias;
+            auto found_0 = str.find(':');
+            if(found_0 != std::string::npos)
+            {
+                std::string e  = str.substr(found_0 + 1);
+                info.rank_info = atoi(e.c_str());
+            }
+        }
+        else if(str.compare(0, 1, "2") == 0 || str.compare(0, 1, "a") == 0 ||
+                str.compare(0, 5, "alibi") == 0)
+        {
+            info.type = bias_enum::alibi;
+            auto found_0 = str.find(':');
+            if(found_0 != std::string::npos)
+            {
+                std::string e  = str.substr(found_0 + 1);
+                info.rank_info = atoi(e.c_str());
+            }
+        }
+        return info;
+    }
+
+    friend std::ostream& operator<<(std::ostream& os, const bias_info& bi)
+    {
+        bi.serialize(os);
+        return os;
+    }
+};
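
To make the string convention accepted by bias_info::decode above concrete, here is a hypothetical Python mirror of the same dispatch logic (illustration only; not part of the commit):

def decode_bias(s: str):
    # "0"/"n" -> no bias; "1"/"e"/"elementwise..." -> elementwise bias; "2"/"a"/"alibi..." -> alibi.
    # An optional ":<rank_info>" suffix selects the layout (e.g. rank 2 elementwise bias is b*h*s*s).
    if s in ("0", "n"):
        return ("no_bias", 0)
    kind = "elementwise_bias" if s[0] in ("1", "e") else "alibi"
    rank = int(s.split(":", 1)[1]) if ":" in s else 0
    return (kind, rank)

print(decode_bias("e:2"))      # ('elementwise_bias', 2) -> bias laid out as b*h*s*s
print(decode_bias("alibi:1"))  # ('alibi', 1) -> alibi slopes laid out as b*h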
