 EMBED = common_types.EMBED
 Quant = quantizations.AqtQuantization
 
+SELF_ATTN_HEAD = common_types.SELF_ATTN_HEAD
+SELF_ATTN_Q_LENGTH = common_types.SELF_ATTN_Q_LENGTH
+SELF_ATTN_KV_LENGTH = common_types.SELF_ATTN_KV_LENGTH
+CROSS_ATTN_HEAD = common_types.CROSS_ATTN_HEAD
+CROSS_ATTN_Q_LENGTH = common_types.CROSS_ATTN_Q_LENGTH
+CROSS_ATTN_KV_LENGTH = common_types.CROSS_ATTN_KV_LENGTH
+
+
 
 def _maybe_aqt_einsum(quant: Quant):
   return jnp.einsum if quant is None else quant.einsum()
@@ -174,7 +182,6 @@ def _tpu_flash_attention(
     flash_block_sizes: BlockSizes,
     dtype: jnp.dtype = jnp.float32,
     attention_kernel: str = "flash",
-    is_self_attention: Optional[bool] = None,
 ) -> jax.Array:
   """TPU Flash Attention"""
 
@@ -203,22 +210,8 @@ def _tpu_flash_attention(
   query = _reshape_data_for_flash(query, heads)
   key = _reshape_data_for_flash(key, heads)
   value = _reshape_data_for_flash(value, heads)
-
-  # Use different sharding strategy for self-attn vs cross-attn
-  if is_self_attention is not None:
-    if is_self_attention:
-      # Self-attention: Context Parallelism (sharding along num_heads)
-      q_axis_names = PartitionSpec("data", ("fsdp", "tensor"), None, None)
-      kv_axis_names = PartitionSpec("data", ("fsdp", "tensor"), None, None)
-    else:
-      # Cross-attention: Sequence Parallelism for Q
-      # Q's sequence is sharded; K/V are replicated
-      q_axis_names = PartitionSpec("data", None, ("fsdp", "tensor"), None)
-      kv_axis_names = PartitionSpec("data", None, None, None)
-  else:
-    # Fallback to original maxdiffusion behavior if the flag isn't provided
-    q_axis_names = nn.logical_to_mesh_axes(axis_names_q)
-    kv_axis_names = nn.logical_to_mesh_axes(axis_names_kv)
+  q_axis_names = nn.logical_to_mesh_axes(axis_names_q)
+  kv_axis_names = nn.logical_to_mesh_axes(axis_names_kv)
 
   @functools.partial(
       shard_map.shard_map,
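
The deleted branch hard-coded PartitionSpecs for the self- and cross-attention cases; the new code always resolves the sharding through nn.logical_to_mesh_axes, so the same layouts are now expressed as logical-axis rules instead of kernel code. A minimal, self-contained sketch of that resolution follows; the axis-name strings and the rule set are illustrative assumptions, not the project's actual config:

import flax.linen as nn

# Hypothetical logical axis-name strings standing in for the common_types constants.
BATCH = "activation_batch"
SELF_ATTN_HEAD = "activation_self_attn_heads"
SELF_ATTN_Q_LENGTH = "activation_self_attn_q_length"
D_KV = "activation_kv"

# Illustrative rules mapping logical names to mesh axes ("data", "fsdp", "tensor").
rules = (
    (BATCH, "data"),
    (SELF_ATTN_HEAD, ("fsdp", "tensor")),  # shard attention heads
    (SELF_ATTN_Q_LENGTH, None),            # replicate the query sequence
    (D_KV, None),                          # replicate the head dimension
)

axis_names_q = (BATCH, SELF_ATTN_HEAD, SELF_ATTN_Q_LENGTH, D_KV)
with nn.logical_axis_rules(rules):
  # Expected: PartitionSpec('data', ('fsdp', 'tensor'), None, None),
  # the same layout the deleted self-attention branch spelled out by hand.
  print(nn.logical_to_mesh_axes(axis_names_q))

Under rules like these the resolved spec matches the removed hard-coded self-attention case, and changing the rules, rather than the kernel, is what now controls the sharding.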
@@ -435,7 +428,6 @@ def _apply_attention(
     axis_names_kv: AxisNames,
     flash_block_sizes: BlockSizes,
     dpa_layer: Callable,
-    is_self_attention: bool = True,
 ):
   """Routes to different attention kernels."""
   _check_attention_inputs(query, key, value)
@@ -456,7 +448,7 @@ def _apply_attention(
     )
   elif attention_kernel == "flash":
     return _tpu_flash_attention(
-        query, key * scale, value, heads, mesh, axis_names_q, axis_names_kv, flash_block_sizes, dtype, attention_kernel, is_self_attention,
+        query, key * scale, value, heads, mesh, axis_names_q, axis_names_kv, flash_block_sizes, dtype, attention_kernel,
     )
   elif attention_kernel == "ring":
     return _tpu_flash_attention(
@@ -591,7 +583,6 @@ def __init__(
       flash_block_sizes: BlockSizes = None,
       dtype: DType = jnp.float32,
       quant: Quant = None,
-      is_self_attention: bool = True,
   ):
     self.dpa_layer = None
     if attention_kernel == "cudnn_flash_te":
@@ -611,7 +602,6 @@ def __init__(
     self.flash_block_sizes = flash_block_sizes
     self.dtype = dtype
     self.quant = quant
-    self.is_self_attention = is_self_attention
 
   def apply_attention(self, query: Array, key: Array, value: Array):
     return _apply_attention(
@@ -632,7 +622,6 @@ def apply_attention(self, query: Array, key: Array, value: Array):
         axis_names_kv=self.axis_names_kv,
         flash_block_sizes=self.flash_block_sizes,
         dpa_layer=self.dpa_layer,
-        is_self_attention=self.is_self_attention,
     )
 
 
@@ -738,6 +727,13 @@ def __init__(
     self.value_axis_names = value_axis_names
     self.out_axis_names = out_axis_names
 
+    if is_self_attention:
+      axis_names_q = (BATCH, SELF_ATTN_HEAD, SELF_ATTN_Q_LENGTH, D_KV)
+      axis_names_kv = (BATCH, SELF_ATTN_HEAD, SELF_ATTN_KV_LENGTH, D_KV)
+    else:
+      axis_names_q = (BATCH, CROSS_ATTN_HEAD, CROSS_ATTN_Q_LENGTH, D_KV)
+      axis_names_kv = (BATCH, CROSS_ATTN_HEAD, CROSS_ATTN_KV_LENGTH, D_KV)
+
     self.attention_op = NNXAttentionOp(
         mesh=mesh,
         attention_kernel=attention_kernel,
@@ -747,11 +743,12 @@ def __init__(
         use_memory_efficient_attention=use_memory_efficient_attention,
         split_head_dim=split_head_dim,
         float32_qk_product=False,
+        axis_names_q=axis_names_q,
+        axis_names_kv=axis_names_kv,
         flash_min_seq_length=flash_min_seq_length,
         flash_block_sizes=flash_block_sizes,
         dtype=dtype,
         quant=quant,
-        is_self_attention=is_self_attention,
     )
     # None axes corresponds to the stacked weights across all blocks
     # because of the use of nnx.vmap and nnx.scan.
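
Taken together, the two __init__ hunks mean is_self_attention now only selects logical axis names at layer construction time; the physical layout comes from whatever logical-axis rules are in effect. A standalone sketch of that selection and of rules that would reproduce the deleted behaviour (the helper name, axis-name strings, and rules below are hypothetical, for illustration only):

import flax.linen as nn

# Hypothetical axis-name strings standing in for the common_types constants.
BATCH, D_KV = "activation_batch", "activation_kv"
SELF_ATTN_HEAD = "activation_self_attn_heads"
SELF_ATTN_Q_LENGTH = "activation_self_attn_q_length"
SELF_ATTN_KV_LENGTH = "activation_self_attn_kv_length"
CROSS_ATTN_HEAD = "activation_cross_attn_heads"
CROSS_ATTN_Q_LENGTH = "activation_cross_attn_q_length"
CROSS_ATTN_KV_LENGTH = "activation_cross_attn_kv_length"


def pick_axis_names(is_self_attention: bool):
  # Mirrors the branch added to __init__: the flag picks logical names only.
  if is_self_attention:
    return ((BATCH, SELF_ATTN_HEAD, SELF_ATTN_Q_LENGTH, D_KV),
            (BATCH, SELF_ATTN_HEAD, SELF_ATTN_KV_LENGTH, D_KV))
  return ((BATCH, CROSS_ATTN_HEAD, CROSS_ATTN_Q_LENGTH, D_KV),
          (BATCH, CROSS_ATTN_HEAD, CROSS_ATTN_KV_LENGTH, D_KV))


# Illustrative rules matching the deleted hard-coded specs: shard heads for
# self-attention, shard the query sequence for cross-attention, and leave
# unmapped names (e.g. cross-attention K/V length) replicated.
rules = (
    (BATCH, "data"),
    (SELF_ATTN_HEAD, ("fsdp", "tensor")),
    (CROSS_ATTN_Q_LENGTH, ("fsdp", "tensor")),
)

with nn.logical_axis_rules(rules):
  q_names, kv_names = pick_axis_names(is_self_attention=False)
  print(nn.logical_to_mesh_axes(q_names))   # PartitionSpec('data', None, ('fsdp', 'tensor'), None)
  print(nn.logical_to_mesh_axes(kv_names))  # PartitionSpec('data', None, None, None)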