Shard LongCat Flash (ml-explore#771)

kernelpool · web-flow · commit 8daabcc7c1df · 2026-01-18T07:21:36.000-08:00
diff --git a/mlx_lm/models/longcat_flash.py b/mlx_lm/models/longcat_flash.py
@@ -4,6 +4,7 @@
 
 import mlx.core as mx
 import mlx.nn as nn
+from mlx.nn.layers.distributed import shard_inplace, shard_linear, sum_gradients
 
 from .activations import swiglu
 from .base import BaseModelArgs, create_attention_mask, scaled_dot_product_attention
@@ -238,8 +239,11 @@ def __init__(self, args: ModelArgs):
         )
 
         self.router = LongcatFlashTopkRouter(args)
+        self.sharding_group = None
 
     def __call__(self, hidden_states):
+        if self.sharding_group is not None:
+            hidden_states = sum_gradients(self.sharding_group)(hidden_states)
 
         topk_indices, topk_weights = self.router(hidden_states)
 
@@ -251,14 +255,20 @@ def __call__(self, hidden_states):
         regular_outputs = self.switch_mlp(hidden_states, topk_indices)
 
         weighted_outputs = regular_outputs * regular_weights[..., None]
+        final_output = mx.sum(weighted_outputs, axis=-2)
+
+        if self.sharding_group is not None:
+            final_output = mx.distributed.all_sum(
+                final_output, group=self.sharding_group
+            )
 
-        # Add identity expert contribution if needed
+        # Add identity expert contribution after all_sum to avoid summing it N times
         assert self.zero_expert_type == "identity"
-        identity_weights = mx.where(mask, topk_weights, 0.0)
-        identity_outputs = hidden_states[..., None, :] * identity_weights[..., None]
-        weighted_outputs = weighted_outputs + identity_outputs
+        identity_weights_sum = mx.sum(
+            mx.where(mask, topk_weights, 0.0), axis=-1, keepdims=True
+        )
+        final_output = final_output + hidden_states * identity_weights_sum
 
-        final_output = mx.sum(weighted_outputs, axis=-2)
         return final_output
 
 
@@ -394,3 +404,37 @@ def sanitize(self, weights):
 
     def make_cache(self):
         return [CacheList(KVCache(), KVCache()) for _ in self.model.layers]
+
+    def shard(self, group: Optional[mx.distributed.Group] = None):
+        group = group or mx.distributed.init()
+        N = group.size()
+
+        for layer in self.model.layers:
+            for attn in layer.self_attn:
+                if attn.q_lora_rank is None:
+                    attn.q_proj = shard_linear(
+                        attn.q_proj, "all-to-sharded", group=group
+                    )
+                else:
+                    attn.q_b_proj = shard_linear(
+                        attn.q_b_proj, "all-to-sharded", group=group
+                    )
+                attn.kv_b_proj = shard_linear(
+                    attn.kv_b_proj, "all-to-sharded", group=group
+                )
+                attn.o_proj = shard_linear(attn.o_proj, "sharded-to-all", group=group)
+                attn.num_attention_heads //= N
+
+            for mlp in layer.mlps:
+                mlp.gate_proj = shard_linear(
+                    mlp.gate_proj, "all-to-sharded", group=group
+                )
+                mlp.up_proj = shard_linear(mlp.up_proj, "all-to-sharded", group=group)
+                mlp.down_proj = shard_linear(
+                    mlp.down_proj, "sharded-to-all", group=group
+                )
+
+            layer.mlp.sharding_group = group
+            shard_inplace(layer.mlp.switch_mlp.gate_proj, "all-to-sharded", group=group)
+            shard_inplace(layer.mlp.switch_mlp.up_proj, "all-to-sharded", group=group)
+            shard_inplace(layer.mlp.switch_mlp.down_proj, "sharded-to-all", group=group)