6 changes: 3 additions & 3 deletions docs/ContribOperators.md
@@ -2671,14 +2671,14 @@ This version of the operator has been available since version 1 of the 'com.micr
<dd>Scale tensor for past_value.</dd>
</dl>

#### Outputs (3 - 4)
#### Outputs (1 - 4)

<dl>
<dt><tt>output</tt> : T</dt>
<dd>3D output tensor with shape (batch_size, sequence_length, hidden_size)</dd>
<dt><tt>present_key</tt> : T_CACHE</dt>
<dt><tt>present_key</tt> (optional) : T_CACHE</dt>
<dd>present state key with support for format BNSH. When past_key uses same tensor as present_key(k-v buffer), it is of length max_sequence_length... otherwise of length past_sequence_length +kv_sequence_length.</dd>
<dt><tt>present_value</tt> : T_CACHE</dt>
<dt><tt>present_value</tt> (optional) : T_CACHE</dt>
<dd>present state value with support for format BNSH. When past_value uses same tensor as present_value(k-v buffer), it is of length max_sequence_length... otherwise of length past_sequence_length +kv_sequence_length.</dd>
Comment on lines +2674 to 2682

Copilot AI Apr 29, 2026

The docs correctly mark present_key/present_value as optional, but the CUDA EP does not currently support omitting them. Add a brief note here (mandatory if the CUDA limitation remains) stating that some execution providers (e.g., CUDA) may require these outputs to be connected, so users aren't surprised by provider-specific INVALID_ARGUMENT errors.

<dt><tt>output_qk</tt> (optional) : T</dt>
<dd>Values of QK matrix multiplication, either before or after softmax normalization</dd>
8 changes: 5 additions & 3 deletions onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h
@@ -85,7 +85,9 @@ class GQAAttentionBase {
if (past_key != nullptr && past_value != nullptr) {
seqlen_past_kv_cache = static_cast<int>(past_key->Shape().GetDims()[2]);
}
int seqlen_present_kv_cache = static_cast<int>(present_key->Shape().GetDims()[2]);
int seqlen_present_kv_cache = present_key != nullptr
? static_cast<int>(present_key->Shape().GetDims()[2])
: parameters.seqlen_present_kv_cache;

// Compute the attention score.
bool gqa_mlas_supported = MlasGQASupported<T>(CblasNoTrans, CblasTrans) &&
@@ -175,7 +177,7 @@ class GQAAttentionBase {
const size_t past_buff_chunk_length = past_buffer_sequence_length * head_size; // L x H
Contributor

When present_key/present_value are omitted, this guard prevents ConcatStateChunkGQA from ever building the past+current KV buffer, but the later GEMMs still use total_seqlen = seqlens_k[b] + 1 and seqlen_present_kv_cache as if that full buffer exists. For decoding/subsequent-prompt cases with past_key or total_sequence_length > sequence_length, k/v still point only at the current K/V input, so the CPU kernel can read past that tensor or attend over missing past tokens. Please either allocate an internal temporary present-KV buffer when these outputs are omitted, or reject omitted present outputs unless this is the first-prompt/no-past case where sequence_length == total_sequence_length.
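A minimal sketch of the second option suggested here (hypothetical placement and names, assuming past/present pointers and a parameters struct with sequence_length and total_sequence_length are in scope; not the actual patch):

```cpp
// Hypothetical guard: only allow omitted present outputs for the first-prompt / no-past
// case, where the current K/V inputs already cover the whole attention window and no
// past KV needs to be concatenated.
if (present_key == nullptr || present_value == nullptr) {
  const bool no_past = past_key == nullptr && past_value == nullptr &&
                       parameters.sequence_length == parameters.total_sequence_length;
  if (!no_past) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                           "present_key/present_value may only be omitted when there is no past KV, "
                           "i.e. sequence_length == total_sequence_length.");
  }
}
```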

const size_t present_buff_chunk_length = present_buffer_sequence_length * head_size; // T x H

if (!past_present_share_buffer) {
if (present_key && !past_present_share_buffer) {
memset((void*)present_key,
0,
batch_size * kv_num_heads_ * present_buffer_sequence_length * head_size * sizeof(T));
@@ -402,7 +404,7 @@ class GQAAttentionBase {
const size_t past_buff_chunk_length = past_buffer_sequence_length * head_size; // L x H
const size_t present_buff_chunk_length = present_buffer_sequence_length * head_size; // T x H

if (!past_present_share_buffer) {
if (present_value && !past_present_share_buffer) {
memset((void*)present_value,
0,
batch_size * kv_num_heads_ * present_buffer_sequence_length * head_size * sizeof(T));
4 changes: 3 additions & 1 deletion onnxruntime/contrib_ops/cpu/bert/group_query_attention.cc
@@ -233,7 +233,9 @@ Status GroupQueryAttention<T>::Compute(OpKernelContext* context) const {
const T* head_sink_data = (head_sink != nullptr) ? head_sink->Data<T>() : nullptr;

// Compute the attention score and apply the score to V
return ApplyAttention(q_rotary, packed_qkv ? nullptr : k_rotary, packed_qkv ? nullptr : V.Get<Tensor>().Data<T>(),
const T* k_data = packed_qkv ? nullptr : k_rotary;
const T* v_data = packed_qkv ? nullptr : V.Get<Tensor>().Data<T>();
return ApplyAttention(q_rotary, k_data, v_data,
head_sink_data, attention_bias, past_key, past_value, output, present_k, present_v,
output_qk, seqlens_k, parameters, allocator, context);
}
@@ -242,7 +242,7 @@ Status CheckInputs(const T* query,
int q_hidden_size = 0;
int kv_hidden_size = 0;
int head_size = 0;
const bool is_packed_qkv = key == nullptr;
const bool is_packed_qkv = (key == nullptr);
if (!is_packed_qkv) {
ORT_RETURN_IF_ERROR(Check_Q_K_V(query, key, value, num_heads, kv_num_heads, batch_size, sequence_length,
q_hidden_size, kv_hidden_size, head_size));
20 changes: 10 additions & 10 deletions onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc
@@ -227,6 +227,7 @@ Status GroupQueryAttention<T, U>::ComputeInternal(OpKernelContext* context) cons
attention_bias,
head_sink,
parameters));

parameters.local_window_size = local_window_size_;
parameters.is_unidirectional = is_unidirectional_;
parameters.use_smooth_softmax = use_smooth_softmax_ || head_sink != nullptr;
@@ -291,13 +292,10 @@ Status GroupQueryAttention<T, U>::ComputeInternal(OpKernelContext* context) cons

data.past_key = (past_key == nullptr) ? nullptr : reinterpret_cast<const CudaU*>(past_key->Data<U>());
data.past_value = (past_value == nullptr) ? nullptr : reinterpret_cast<const CudaU*>(past_value->Data<U>());

data.present_key = reinterpret_cast<CudaU*>(present_key_output->MutableData<U>());
data.present_value = reinterpret_cast<CudaU*>(present_value_output->MutableData<U>());

data.present_key = (present_key_output != nullptr) ? reinterpret_cast<CudaU*>(present_key_output->MutableData<U>()) : nullptr;
data.present_value = (present_value_output != nullptr) ? reinterpret_cast<CudaU*>(present_value_output->MutableData<U>()) : nullptr;
// Compute past_present_share_buffer early since it's needed for flash attention path selection.
// This compares the final pointer values after quantization handling.
parameters.past_present_share_buffer = (data.past_key == data.present_key);
parameters.past_present_share_buffer = (data.past_key != nullptr && data.past_key == data.present_key);

bool is_inputs_quantized = (k_quant_type_ != KVQuantizationType::NONE) || (v_quant_type_ != KVQuantizationType::NONE);
constexpr bool is_int8 = std::is_same<U, int8_t>::value;
@@ -562,10 +560,12 @@ Status GroupQueryAttention<T, U>::ComputeInternal(OpKernelContext* context) cons
}

// Validate past_value pointer consistency (past_present_share_buffer was computed early after pointer setup)
if (parameters.past_present_share_buffer) {
ORT_ENFORCE(data.past_value == data.present_value, "past_value and present_value must be the same tensor when past_present_share_buffer is true");
} else {
ORT_ENFORCE(data.past_value != data.present_value, "past_value and present_value must be different tensors when past_present_share_buffer is false");
if (data.present_value != nullptr) {
if (parameters.past_present_share_buffer) {
ORT_ENFORCE(data.past_value == data.present_value, "past_value and present_value must be the same tensor when past_present_share_buffer is true");
} else {
ORT_ENFORCE(data.past_value != data.present_value, "past_value and present_value must be different tensors when past_present_share_buffer is false");
}
}

data.output = reinterpret_cast<CudaT*>(output->MutableData<T>());
@@ -98,6 +98,14 @@ Status PrepareQKV(
q_out = nullptr;
}

// present_key/present_value are required for the CUDA path since flash attention
// and memory-efficient attention read directly from the present KV buffers.
// The CPU path supports optional present outputs for KV-shared layers.
if (data.present_key == nullptr || data.present_value == nullptr) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"present_key and present_value outputs are required for the CUDA GroupQueryAttention kernel.");
Comment on lines +101 to +106

Copilot AI Apr 29, 2026

The schema/docs now declare present_key/present_value as optional, but the CUDA EP hard-errors if they’re omitted. This makes behavior provider-dependent in a way that API consumers may not expect. Prefer (mandatory) supporting omitted present outputs on CUDA as well by allocating internal temporary present KV buffers (scratch) when the outputs are not requested, or by introducing a non-output internal KV storage path; if that’s not feasible, at least ensure the error message is explicitly actionable (e.g., indicates CUDA EP limitation and suggests connecting outputs or using CPU).

Suggested change
// present_key/present_value are required for the CUDA path since flash attention
// and memory-efficient attention read directly from the present KV buffers.
// The CPU path supports optional present outputs for KV-shared layers.
if (data.present_key == nullptr || data.present_value == nullptr) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"present_key and present_value outputs are required for the CUDA GroupQueryAttention kernel.");
// present_key/present_value are currently required for the CUDA path since flash attention
// and memory-efficient attention read directly from the present KV buffers.
// Note: although the operator schema/docs may declare these outputs optional,
// the CUDA EP does not currently support omitting them.
if (data.present_key == nullptr || data.present_value == nullptr) {
return ORT_MAKE_STATUS(
ONNXRUNTIME, INVALID_ARGUMENT,
"CUDA GroupQueryAttention currently requires both present_key and present_value outputs to be connected "
"because the CUDA kernels read directly from the present KV buffers. "
"If your model relies on omitted optional present outputs, either connect both outputs when using the CUDA "
"execution provider or run this node on CPU.");

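For the first (preferred) alternative mentioned in this comment, a rough sketch of backing omitted outputs with scratch memory in GroupQueryAttention<T, U>::ComputeInternal, before data.present_key/data.present_value are consumed; the sizing and GetScratchBuffer usage are assumptions about the surrounding kernel code, not the actual implementation:

```cpp
// Hypothetical scratch-buffer fallback: if the graph does not request present_key/present_value,
// still give the CUDA kernels a full present-KV buffer to write into and read from by allocating
// temporary device memory for the lifetime of this kernel invocation.
IAllocatorUniquePtr<CudaU> present_key_scratch;
IAllocatorUniquePtr<CudaU> present_value_scratch;
if (present_key_output == nullptr || present_value_output == nullptr) {
  const size_t present_kv_elems = static_cast<size_t>(parameters.batch_size) *
                                  static_cast<size_t>(parameters.kv_num_heads) *
                                  static_cast<size_t>(parameters.seqlen_present_kv_cache) *
                                  static_cast<size_t>(parameters.head_size);
  present_key_scratch = GetScratchBuffer<CudaU>(present_kv_elems, context->GetComputeStream());
  present_value_scratch = GetScratchBuffer<CudaU>(present_kv_elems, context->GetComputeStream());
  data.present_key = present_key_scratch.get();
  data.present_value = present_value_scratch.get();
}
```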
}

U* k = reinterpret_cast<U*>(data.present_key);
U* v = reinterpret_cast<U*>(data.present_value);
int max_cache_length = parameters.seqlen_present_kv_cache;
@@ -212,7 +212,7 @@ Status GroupQueryAttention::ComputeInternal(onnxruntime::webgpu::ComputeContext&
scale_,
softcap_,
0,
context.DeviceLimits().maxComputeInvocationsPerWorkgroup));
static_cast<int>(context.DeviceLimits().maxComputeInvocationsPerWorkgroup)));
params.use_smooth_softmax = use_smooth_softmax_;
params.rotary_interleaved = rotary_interleaved_;

6 changes: 4 additions & 2 deletions onnxruntime/core/graph/contrib_ops/bert_defs.cc
@@ -1323,13 +1323,15 @@ ONNX_MS_OPERATOR_SET_SCHEMA(
"present state key with support for format BNSH. When past_key uses same tensor as present_key"
"(k-v buffer), it is of length max_sequence_length... otherwise of length past_sequence_length +"
"kv_sequence_length.",
"T_CACHE")
"T_CACHE",
OpSchema::Optional)
.Output(2,
"present_value",
"present state value with support for format BNSH. When past_value uses same tensor as present_value"
"(k-v buffer), it is of length max_sequence_length... otherwise of length past_sequence_length +"
"kv_sequence_length.",
"T_CACHE")
"T_CACHE",
OpSchema::Optional)
.Output(3,
"output_qk",
"Values of QK matrix multiplication, either before or after softmax normalization",
103 changes: 103 additions & 0 deletions onnxruntime/test/contrib_ops/group_query_attention_op_test.cc
@@ -307,5 +307,108 @@ TEST(GroupQueryAttentionTest, SeqlensKWrongLength) {
{}, nullptr, &execution_providers);
}

// ============================================================================
// Optional present_key/present_value output tests
// ============================================================================

// Helper for tests with optional present outputs.
// When omit_present=true, present_key and present_value outputs are not connected.
static void RunGQAOptionalPresentTest(
int batch_size,
int sequence_length,
int total_seq_len,
bool omit_present,
OpTester::ExpectResult expect,
const std::string& expected_message) {
constexpr int num_heads = 2;
constexpr int kv_num_heads = 1;
constexpr int head_size = 8;
constexpr int hidden_size = num_heads * head_size;
constexpr int kv_hidden_size = kv_num_heads * head_size;

OpTester tester("GroupQueryAttention", 1, onnxruntime::kMSDomain);
tester.AddAttribute<int64_t>("num_heads", static_cast<int64_t>(num_heads));
tester.AddAttribute<int64_t>("kv_num_heads", static_cast<int64_t>(kv_num_heads));

std::vector<float> query_data(batch_size * sequence_length * hidden_size, 1.0f);
tester.AddInput<float>("query", {batch_size, sequence_length, hidden_size}, query_data);

std::vector<float> key_data(batch_size * sequence_length * kv_hidden_size, 0.5f);
tester.AddInput<float>("key", {batch_size, sequence_length, kv_hidden_size}, key_data);

std::vector<float> value_data(batch_size * sequence_length * kv_hidden_size, 0.5f);
tester.AddInput<float>("value", {batch_size, sequence_length, kv_hidden_size}, value_data);

tester.AddOptionalInputEdge<float>(); // past_key
tester.AddOptionalInputEdge<float>(); // past_value

std::vector<int32_t> seqlens_k_data(batch_size, static_cast<int32_t>(total_seq_len - 1));
tester.AddInput<int32_t>("seqlens_k", {batch_size}, seqlens_k_data);
tester.AddInput<int32_t>("total_sequence_length", {1}, {static_cast<int32_t>(total_seq_len)});

tester.AddOptionalInputEdge<float>(); // cos_cache
tester.AddOptionalInputEdge<float>(); // sin_cache
tester.AddOptionalInputEdge<int64_t>(); // position_ids
tester.AddOptionalInputEdge<float>(); // attention_bias
tester.AddOptionalInputEdge<float>(); // head_sink

// Output 0: output (always required)
tester.AddOutput<float>("output", {batch_size, sequence_length, hidden_size},
std::vector<float>(batch_size * sequence_length * hidden_size, 0.0f));

if (omit_present) {
// Omit present_key and present_value — they are optional
tester.AddOptionalOutputEdge<float>(); // present_key
tester.AddOptionalOutputEdge<float>(); // present_value
} else {
int present_seq_len = total_seq_len;
tester.AddOutput<float>("present_key", {batch_size, kv_num_heads, present_seq_len, head_size},
std::vector<float>(batch_size * kv_num_heads * present_seq_len * head_size, 0.0f));
tester.AddOutput<float>("present_value", {batch_size, kv_num_heads, present_seq_len, head_size},
std::vector<float>(batch_size * kv_num_heads * present_seq_len * head_size, 0.0f));
}

if (expect == OpTester::ExpectResult::kExpectSuccess) {
tester.SetOutputTolerance(1e6f);
}
Comment on lines +345 to +373

Copilot AI Apr 29, 2026

These tests effectively only verify the kernel doesn’t fail: the expected output is all zeros and the tolerance is set to 1e6f, which is likely large enough to pass regardless of correctness. Also seqlens_k is set to total_seq_len - 1, which is an unusual/possibly incorrect pairing with total_sequence_length and may reduce the chance of catching length/edge bugs. Consider (mandatory) asserting something meaningful (e.g., run once with present connected and once omitted and compare the resulting output, or compute a small expected output for a deterministic tiny case) and (optional) set seqlens_k to a consistent value like total_seq_len for the no-past scenario.
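One way to make the success cases assert real numerics, reusing the helper's own names (a sketch, assuming rotary and attention_bias remain disabled as in the helper): with query set to 1.0 and key/value to 0.5, every attended V row is identical, so the attention output is 0.5 in every element regardless of the softmax weights.

```cpp
// Hypothetical tightening of the helper above: replace the zero-filled expected output
// and the 1e6f tolerance with the analytically known result for this constant-input case.
std::vector<float> expected_output(batch_size * sequence_length * hidden_size, 0.5f);
tester.AddOutput<float>("output", {batch_size, sequence_length, hidden_size}, expected_output);
tester.SetOutputTolerance(1e-3f);  // tight enough to catch incorrect attention math
```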


std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
execution_providers.push_back(DefaultCpuExecutionProvider());
tester.Run(expect, expected_message, {}, nullptr, &execution_providers);
}

// Baseline: GQA with present outputs connected works as before
TEST(GroupQueryAttentionTest, OptionalPresent_WithPresent) {
RunGQAOptionalPresentTest(
/*batch_size=*/1,
/*sequence_length=*/4,
/*total_seq_len=*/4,
/*omit_present=*/false,
OpTester::ExpectResult::kExpectSuccess,
"");
}

// KV-shared layer scenario: present outputs omitted, attention uses K,V directly
TEST(GroupQueryAttentionTest, OptionalPresent_WithoutPresent) {
RunGQAOptionalPresentTest(
/*batch_size=*/1,
/*sequence_length=*/4,
/*total_seq_len=*/4,
/*omit_present=*/true,
OpTester::ExpectResult::kExpectSuccess,
"");
}

// Batched: present outputs omitted with batch_size > 1
TEST(GroupQueryAttentionTest, OptionalPresent_Batched) {
RunGQAOptionalPresentTest(
/*batch_size=*/2,
/*sequence_length=*/3,
/*total_seq_len=*/3,
/*omit_present=*/true,
OpTester::ExpectResult::kExpectSuccess,
"");
}

} // namespace test
} // namespace onnxruntime