Whisper Redesigned Solution #23549

Status: Open. Wants to merge 38 commits into base: main.

Changes from all commits (38 commits):
f314287  Add support for creating optimized whisper ONNX models without beam s… (kunal-vaishnavi, Apr 26, 2024)
6a44f72  Fix incorrect dynamic axes labels (kunal-vaishnavi, Apr 26, 2024)
58ec5eb  Fix fusion breaks for OpenAI implementation of Whisper (kunal-vaishnavi, May 3, 2024)
4c228ea  Merge branch 'main' into kvaishnavi/whisper-separate-export (kunal-vaishnavi, Jun 12, 2024)
dd20876  Merge branch 'main' into kvaishnavi/whisper-separate-export (kunal-vaishnavi, Jul 23, 2024)
b13cb22  Comment out DMMHA case temporarily (kunal-vaishnavi, Jul 23, 2024)
31db1a0  Replace MHA with DMMHA (kunal-vaishnavi, Jul 29, 2024)
3b92432  Merge branch 'main' into kvaishnavi/whisper-separate-export (kunal-vaishnavi, Aug 26, 2024)
7bb79f3  Debugging beam search output (kunal-vaishnavi, Sep 6, 2024)
14b7e77  Initial commit for new export (kunal-vaishnavi, Oct 22, 2024)
fa345fe  Add parity check after export and optimization (kunal-vaishnavi, Oct 22, 2024)
e050dea  Fix multiple attention kernel invocations (kunal-vaishnavi, Nov 2, 2024)
bf87062  Make output Q*K values optional (kunal-vaishnavi, Nov 4, 2024)
17fa0ab  Fix batch size check for cache indirection (kunal-vaishnavi, Nov 6, 2024)
52aeb58  Save checkpoint for working solution (kunal-vaishnavi, Nov 15, 2024)
240fe3b  Clean up code (kunal-vaishnavi, Nov 17, 2024)
ae98085  Fix string dumping (kunal-vaishnavi, Nov 20, 2024)
3d2c8fe  Fix out_qk dtype issue for half input case. (mindest, Nov 20, 2024)
287151f  Remove type cast for output QK (kunal-vaishnavi, Nov 21, 2024)
0805d1d  Enable release mode build (kunal-vaishnavi, Dec 4, 2024)
b629903  Make QK output dtype independent of attention dtype (kunal-vaishnavi, Dec 9, 2024)
648b389  Add batched jump times export (kunal-vaishnavi, Dec 9, 2024)
a6c6ee8  Get batched jump times ONNX model with parity check (kunal-vaishnavi, Dec 12, 2024)
c0a6ce4  Save checkpoint for working solution (kunal-vaishnavi, Dec 21, 2024)
008eeb9  Merge branch 'main' into kvaishnavi/whisper-separate-export (kunal-vaishnavi, Dec 22, 2024)
158d0a8  Fix build after merge (kunal-vaishnavi, Dec 22, 2024)
02cb5be  Fix model with beam search op (kunal-vaishnavi, Dec 23, 2024)
2acd593  Get model impl and beam search op export combinations working (kunal-vaishnavi, Dec 25, 2024)
612eb0c  Enable separate export of encoder and decoder init (kunal-vaishnavi, Dec 25, 2024)
f2d78fd  Add tests for multiple export types to CIs (kunal-vaishnavi, Dec 25, 2024)
cb93517  Update folder and file names in Whisper README (kunal-vaishnavi, Dec 25, 2024)
6da11ec  Add FP32 CPU DMMHA support (kunal-vaishnavi, Dec 28, 2024)
9640736  Add unit tests (kunal-vaishnavi, Jan 8, 2025)
75a342a  Merge branch 'main' into kvaishnavi/whisper-separate-export (kunal-vaishnavi, Jan 24, 2025)
7fe6b05  Change debug message for PrepareQkv (kunal-vaishnavi, Jan 25, 2025)
8620168  Fix seqlens_k after merge (kunal-vaishnavi, Jan 29, 2025)
b0a732b  Merge branch 'main' into kvaishnavi/whisper-separate-export (kunal-vaishnavi, Jan 29, 2025)
23808f7  Add changes suggested by linter (kunal-vaishnavi, Jan 31, 2025)
2 changes: 1 addition & 1 deletion onnxruntime/contrib_ops/cpu/bert/attention.cc
@@ -335,7 +335,7 @@ Status Attention<T>::Compute(OpKernelContext* context) const {

// Compute the attention score and apply the score to V
return ApplyAttention(Q, K, V, mask_index, past, nullptr /* past_key */, nullptr /* past_value */,
output, nullptr /* present_key */, nullptr /* present_value */,
output, nullptr /* present_key */, nullptr /* present_value */, nullptr /* output_qk */,
batch_size, sequence_length, sequence_length,
parameters.head_size, parameters.v_head_size, parameters.v_hidden_size,
attention_bias, context);
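Not part of the diff, for context: the only change in this call is the new trailing nullptr /* output_qk */ argument, which matches the commit messages above about making the Q*K output optional. A minimal hypothetical sketch of that optional-output pattern (illustrative names only, not the real ApplyAttention signature):

// Hypothetical sketch: an optional output_qk pointer lets callers request the
// raw Q*K^T scores; passing nullptr, as the CPU Attention kernel does above,
// skips the extra copy entirely.
#include <algorithm>
#include <cstddef>

template <typename T>
void ApplyAttentionSketch(const T* qk_scores, size_t qk_count,
                          T* output, T* output_qk /* may be nullptr */) {
  if (output_qk != nullptr) {
    std::copy(qk_scores, qk_scores + qk_count, output_qk);  // expose raw scores
  }
  // ... softmax over qk_scores and the weighted sum with V would follow here,
  // writing the final attention result into `output`.
}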
1 change: 1 addition & 0 deletions onnxruntime/contrib_ops/cpu/bert/attention_base.cc
@@ -3,6 +3,7 @@

#include "contrib_ops/cpu/bert/attention_base.h"
#include "contrib_ops/cpu/bert/multihead_attention_helper.h"
#include "contrib_ops/cpu/utils/dump_tensor.h"
Contributor comment on this include:

It might not be needed; I did not see code in this file that dumps tensors.

#include "core/providers/common.h"

namespace onnxruntime {
1 change: 1 addition & 0 deletions onnxruntime/contrib_ops/cpu/bert/attention_base.h
@@ -7,6 +7,7 @@
#include "core/common/common.h"
#include "core/framework/op_kernel.h"
#include "contrib_ops/cpu/bert/attention_common.h"
#include "contrib_ops/cpu/bert/attention_parameters.h"

namespace onnxruntime {
namespace contrib {
144 changes: 5 additions & 139 deletions onnxruntime/contrib_ops/cpu/bert/attention_common.h
@@ -49,148 +49,10 @@ enum AttentionKernelType {
AttentionKernel_FlashAttention,
AttentionKernel_CudnnFlashAttention,
AttentionKernel_LeanAttention,
AttentionKernel_FtCausalAttention,
AttentionKernel_Default
};

// Parameters deduced from node attributes and inputs/outputs.
struct AttentionParameters {
int batch_size;
int sequence_length;
int kv_sequence_length; // input sequence length of K or V
int past_sequence_length; // sequence length in past state of K or V
int total_sequence_length; // total sequence length of K or V
int max_sequence_length; // max sequence length from 4D mask
int input_hidden_size; // first dimension of weights for input projection
int hidden_size; // hidden size of Q or K
int head_size; // hidden size per head of Q or K
int v_hidden_size; // hidden size of V
int v_head_size; // hidden size per head of V
int num_heads;
int rotary_embedding;
bool is_unidirectional;
bool past_present_share_buffer;
bool do_rotary;
bool broadcast_attn_bias_dim_0;
bool broadcast_attn_bias_dim_1;
float mask_filter_value;
float scale;
bool use_tf32;
AttentionMaskType mask_type;
AttentionQkvFormat qkv_format;
};

struct DecoderMaskedMultiHeadAttentionParams : AttentionParameters {
int beam_width = 1;

// Only NeoX style rotary embedding is supported
int rotary_embedding_dim = 0;
int t_step = 0;

// Whether to use multihead attention(excludes matmul and bias)
bool is_mha = false;
bool is_cross_attention = false;
bool is_packed_qkv = false;

// Useful to better use global memory bandwidth on certain CUDA architectures.
// Turned off by default for now until we fully understand performance implications
// for all types of workloads.
// Can be turned on by appropriate environment variable (see attention_common.h).
bool kv_data_in_flight = false;

void* q = nullptr;
void* q_bias = nullptr;

void* k = nullptr;
void* k_bias = nullptr;

void* v = nullptr;
void* v_bias = nullptr;

void* attention_bias = nullptr;

void* k_cache = nullptr;
void* v_cache = nullptr;

void* out = nullptr;
void* out_qk = nullptr;

const int32_t* cache_indir = nullptr;
const int32_t* mask = nullptr; // [B, total_sequence_length]
};

// Parameters deduced from node attributes and inputs/outputs.
struct PackedAttentionParameters {
int batch_size;
int sequence_length;
int input_hidden_size; // hidden size of input
int hidden_size; // hidden size of Q or K
int head_size; // hidden size per head of Q or K
int v_hidden_size; // hidden size of V
int v_head_size; // hidden size per head of V
int num_heads;
float scale;
int token_count;
bool broadcast_attn_bias_dim_0;
bool broadcast_attn_bias_dim_1;
bool use_tf32;
};

// Parameters deduced from node attributes and inputs/outputs.
struct GroupQueryAttentionParameters {
int batch_size;
int sequence_length; // sequence length of input query, key, value
int seqlen_past_kv_cache; // sequence length of past kv tensor
int seqlen_present_kv_cache; // sequence length of present kv tensor
int total_sequence_length; // maximum total sequence length (past_sequence_length + sequence_length) among keys
int hidden_size;
int num_heads;
int head_size;
int kv_hidden_size;
int kv_num_heads;
int num_splits; // number of splits for splitkv
int rotary_dim; // rotary embedding dimension
bool is_unidirectional; // causal
int local_window_size;
bool kv_share_buffer;
bool is_packed_qkv;
bool is_subsequent_prompt; // indicates whether we have past context and seqlen > 1
bool is_first_prompt; // indicates whether this is first decoding step
bool do_rotary;
bool rotary_interleaved;
bool use_smooth_softmax;
float scale;
float softcap;
AttentionQkvFormat qkv_format;
AttentionQkvFormat past_kv_format;
int zeros_count;
int* zero_ptr;
};

// Parameters for sparse attention.
struct SparseAttentionParameters {
int batch_size; // batch size
int sequence_length; // sequence length of input query, key, value
int hidden_size; // hidden size of query
int num_heads; // number of heads of query
int head_size; // hidden size per head of query, key or value
int kv_hidden_size; // hidden size of key or value
int kv_num_heads; // number of heads of key or value
bool do_rotary; // whether to use rotary embedding
bool rotary_interleaved; // whether to use interleaved rotary embedding
int rotary_dim; // rotary embedding dimension
int sparse_block_size; // block size for sparse attention
int num_sparse_layout; // number of sparse layout
int stride_col_indices; // shape of block_col_indices is [num_sparse_layout, stride_col_indices]
int stride_row_indices; // shape of block_row_indices is [num_sparse_layout, stride_row_indices]
float scale; // scaling factor applied prior to softmax
bool is_packed_qkv; // whether qkv is packed
int total_sequence_length; // maximum total sequence length (past_sequence_length + sequence_length) among keys
int max_sequence_length; // max sequence length for sparse layout
int max_rotary_sequence_length; // max sequence length for rotary cos/sin cache
int max_cache_sequence_length; // max sequence length for kv cache buffer
bool past_present_share_buffer; // whether past_key and present_key share buffer, so is past_value and present_value
};

constexpr bool LAYOUT_BSNH = false;
constexpr bool LAYOUT_BNSH = true;

@@ -215,6 +77,7 @@ enum class AttentionBackend : int {

// Experimental kernels
LEAN_ATTENTION = 256,
FT_CAUSAL_ATTENTION = 512, // FasterTransformer's decoder masked multihead attention
};

// Environment variable to enable debug information of attention kernel to be printed. Default is 0 (disabled).
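Not part of the diff, for context: the AttentionBackend values are distinct powers of two (the new FT_CAUSAL_ATTENTION = 512 sits next to LEAN_ATTENTION = 256), so a set of enabled backends can be carried in a single integer bitmask. A minimal hypothetical sketch of testing one backend bit (assumed helper, not code from this PR):

// Hypothetical sketch: each backend occupies its own bit, so a kernel can check
// whether it was requested with a simple bitwise AND.
enum class AttentionBackendSketch : int {
  LEAN_ATTENTION = 256,
  FT_CAUSAL_ATTENTION = 512,  // FasterTransformer's decoder masked multihead attention
};

inline bool IsBackendRequested(int enabled_backends, AttentionBackendSketch backend) {
  return (enabled_backends & static_cast<int>(backend)) != 0;
}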
@@ -245,6 +108,9 @@ constexpr const char* kDisableFlashAttention = "ORT_DISABLE_FLASH_ATTENTION";
// Environment variable to enable or disable lean attention. Default is 0 (disabled).
constexpr const char* kEnableLeanAttention = "ORT_ENABLE_LEAN_ATTENTION";

// Environment variable to enable or disable FasterTransformer's decoder masked multi-head attention. Default is 0 (enabled).
constexpr const char* kDisableFtCausalAttention = "ORT_DISABLE_FT_CAUSAL_ATTENTION";

// Minimum sequence length to prefer memory efficient attention when data type is float32
constexpr const char* kMinSeqLenForEfficientAttentionFp32 = "ORT_MIN_SEQ_LEN_EFFICIENT_ATTENTION_FP32";

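Not part of the diff: like the other kill-switch variables in this header, ORT_DISABLE_FT_CAUSAL_ATTENTION defaults to 0, so the FasterTransformer causal attention path stays enabled unless the variable is set. A minimal hypothetical sketch of reading such a flag (the real code presumably goes through onnxruntime's environment-variable helpers rather than raw getenv):

// Hypothetical sketch: a "disable" flag that defaults to 0 (feature enabled).
// Setting ORT_DISABLE_FT_CAUSAL_ATTENTION=1 turns the kernel off.
#include <cstdlib>
#include <string>

inline bool IsFtCausalAttentionDisabled() {
  const char* value = std::getenv("ORT_DISABLE_FT_CAUSAL_ATTENTION");
  return value != nullptr && std::string(value) != "0";
}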