@@ -258,7 +258,7 @@ def _ragged_paged_attention_kernel(
    q_hbm_ref,  # [actual_num_kv_heads, padded_num_tokens, num_q_heads_per_kv_head // q_packing, q_packing, head_dim]
    kv_hbm_ref,  # [padded_num_tokens, num_kv_heads_x2 // kv_packing, kv_packing, head_dim] - Fused KV with interleaved [K1,V1,K2,V2,...]
    kv_cache_fused_hbm_ref,  # [total_num_pages, page_size, num_kv_heads_interleaved // kv_packing, kv_packing, head_dim]
-   custom_mask_ref,  # (flatten_total_kv_len,), int8; DMA does not support bool dtype
+   custom_mask_ref,  # (flatten_total_kv_len,), int8; DMA does not support bool dtype
    # Output
    o_hbm_ref,  # [actual_num_kv_heads, max_num_tokens, num_q_heads_per_kv_head // q_packing, q_packing, head_dim]
    updated_kv_cache_fused_hbm_ref,  # [total_num_pages, page_size, num_kv_heads_interleaved // kv_packing, kv_packing, head_dim]
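
The hunk above switches `custom_mask_ref` to int8 because the mask is moved by DMA, which cannot transfer bool data. A minimal sketch of how a caller might build such a buffer is below; the helper name, the flattening layout, and the 0/1 polarity are assumptions for illustration, not the kernel's documented contract:

```python
import jax.numpy as jnp

def flatten_masks_to_int8(bool_masks):
    """Flatten per-request boolean masks into one 1D int8 vector (hypothetical helper).

    bool_masks: list of boolean arrays, one per request (shape [q_len_i, kv_len_i]).
    The result is a single (flatten_total_kv_len,)-style int8 array of 0/1 values,
    i.e. the kind of buffer custom_mask_ref above expects to receive via DMA.
    """
    flat = [m.reshape(-1) for m in bool_masks]
    return jnp.concatenate(flat).astype(jnp.int8)

# Two requests with different shapes.
m0 = jnp.tril(jnp.ones((4, 4), dtype=jnp.bool_))  # causal-style 4x4 mask
m1 = jnp.ones((2, 6), dtype=jnp.bool_)            # dense 2x6 mask
custom_mask = flatten_masks_to_int8([m0, m1])     # shape (28,), dtype int8
```
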
@@ -799,9 +799,10 @@ def flash_attention(q_batch, k_batch, v_batch):
    k_span = bkv_idx * bkv_sz + lax.broadcasted_iota(
        jnp.int32, s.shape, 2
    )
+   # convert custom_mask from int8 to bool
    mask = lax.select(
        causal == 0,
-       custom_mask,
+       custom_mask.astype(jnp.bool),
        q_span < k_span,
    )
    if sliding_window is not None:
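
Inside the kernel the int8 buffer is cast back to bool only at the point of use, so `lax.select` can pick between the user-supplied mask and the iota-based causal mask without branching. A self-contained sketch of that pattern under assumed shapes (the real kernel applies it to blocked, s-shaped tiles):

```python
import jax.numpy as jnp
from jax import lax

def select_mask(custom_mask_i8, causal, q_span, k_span):
    """Pick the user mask (causal == 0) or an iota-derived causal mask (hypothetical helper)."""
    return lax.select(
        causal == 0,                       # scalar predicate
        custom_mask_i8.astype(jnp.bool_),  # int8 -> bool, mirroring the diff above
        q_span < k_span,
    )

q_span = lax.broadcasted_iota(jnp.int32, (4, 4), 0)
k_span = lax.broadcasted_iota(jnp.int32, (4, 4), 1)
custom = jnp.eye(4, dtype=jnp.int8)  # arbitrary 0/1 mask
mask = select_mask(custom, jnp.array(0, dtype=jnp.int32), q_span, k_span)  # bool (4, 4)
```
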
@@ -1340,7 +1341,11 @@ def ragged_paged_attention(
    )
    if custom_mask is None:
        # fix bug: XLA layout ({0}) does not match Mosaic layout ({0:T(128)}) for an operand of shape s32[0]
-       custom_mask = jnp.empty((1, 128))
+       custom_mask = jnp.empty((1, 128), dtype=jnp.int8)
+   else:
+       assert (
+           custom_mask.dtype != jnp.bool
+       ), "custom_mask bool dtype is not supported; use an integer dtype such as int8 (0: False, 1: True)"

    grid = (distribution[2],)

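At the Python wrapper level the new branch keeps two invariants: when no custom mask is given, a small int8 placeholder is created so the XLA and Mosaic layouts agree, and when one is given, a bool dtype is rejected up front because the buffer will be DMA'd. A standalone sketch of that validation (the helper name and message wording are illustrative, not the wrapper's exact code):

```python
import jax.numpy as jnp

def prepare_custom_mask(custom_mask):
    """Return an int8 mask suitable for the kernel (hypothetical wrapper helper)."""
    if custom_mask is None:
        # Non-empty int8 placeholder; avoids the XLA/Mosaic layout mismatch on s32[0].
        return jnp.empty((1, 128), dtype=jnp.int8)
    assert custom_mask.dtype != jnp.bool_, (
        "custom_mask bool dtype is not supported; cast to an integer dtype (0/1) first"
    )
    return custom_mask

# Usage: a flattened 0/1 int8 mask passes; a bool mask would trip the assert.
ok = prepare_custom_mask(jnp.ones((28,), dtype=jnp.int8))
placeholder = prepare_custom_mask(None)  # shape (1, 128), dtype int8
```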