Fix FlashAttentionDecodeSplitVx indirect dispatch input ordering

jchen10 · jchen10 · commit 03dc3519d660 · 2026-04-01T16:08:38.000+08:00
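Move SetIndirectDispatchTensor after all AddInput calls to ensure the
indirect buffer is the last program input. Previously, when has_head_sink
was true, head_sink was added via AddInput after SetIndirectDispatchTensor
had already appended the indirect buffer, so the indirect buffer was no
longer the last input. As a result the shader variable types were
swapped, causing a u32*f16 WGSL compilation error.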
diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc
@@ -344,14 +344,18 @@ Status ComputeFlashAttentionDecodeSplitVxScore(onnxruntime::webgpu::ComputeConte
   program.AddOutputs({{out_split_vx, ProgramTensorMetadataDependency::TypeAndRank, components}});  // [B, N, split_k, head_size]
   const uint32_t batch_heads = static_cast<uint32_t>(parameters.batch_size_ * parameters.num_heads_);
   if (use_indirect_dispatch) {
-    program.AddInput({seqlen_k, ProgramTensorMetadataDependency::None})
-        .SetIndirectDispatchTensor(indirect_buffer);
+    program.AddInput({seqlen_k, ProgramTensorMetadataDependency::None});
   } else {
     program.SetDispatchGroupSize(batch_heads * num_total_seq_length_tile);
   }
   if (has_head_sink) {
     program.AddInput({head_sink, ProgramTensorMetadataDependency::Type});
   }
+  // SetIndirectDispatchTensor must be called after all AddInput calls because it
+  // appends the indirect buffer as the last program input.
+  if (use_indirect_dispatch) {
+    program.SetIndirectDispatchTensor(indirect_buffer);
+  }
   program.CacheHint(tile_size, head_size_vec, use_indirect_dispatch, has_head_sink)
       .SetWorkgroupSize(64)
       .AddUniformVariables({{static_cast<uint32_t>(parameters.total_sequence_length_)},