import torch.nn.functional as F
from torch import nn
from torch.distributed.fsdp import MixedPrecisionPolicy
-from torch.distributed.tensor.placement_types import Replicate, Shard
+from torch.distributed.tensor.placement_types import Partial, Replicate, Shard
from torch.nn.attention import SDPBackend, sdpa_kernel
from torch.testing._internal.distributed.fake_pg import FakeStore

@@ -556,7 +556,7 @@ def forward(self, tokens: torch.Tensor, input_batch: torch.Tensor | None = None)
# AutoParallel code starts here
# ==============================================================

-world_size = 256
+world_size = 64

fake_store = FakeStore()
torch.distributed.init_process_group(
@@ -579,7 +579,7 @@ def forward(self, tokens: torch.Tensor, input_batch: torch.Tensor | None = None)
    ),
)

-batch_size = 4 * mesh.shape[0]
+batch_size = 2 * mesh.shape[0]
seqlen = 2048 * 4
vocab_size = 128256
use_vocab_parallel = not use_1d_mesh
@@ -588,6 +588,8 @@ def forward(self, tokens: torch.Tensor, input_batch: torch.Tensor | None = None)

def model_fn():
    model_args = TransformerModelArgs(
+        dim=4096,
+        n_heads=32,
        n_layers=32,
        vocab_size=vocab_size,
        max_seq_len=seqlen,
@@ -610,6 +612,90 @@ def input_fn():

mp_policy = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)

+
+def group_mm_nodes_with_its_gradients(nodes):
+    fwd_nodes = [n for n in nodes if "nn_module_stack" in n.meta]
+    bwd_nodes = [n for n in nodes if "fwd_nn_module_stack" in n.meta]
+    assert len(fwd_nodes) * 2 == len(bwd_nodes)
+    res = {}
+    for fwd_node in fwd_nodes:
+        o = []
+        for bwd_node in bwd_nodes:
+            if fwd_node.meta["nn_module_stack"] == bwd_node.meta["fwd_nn_module_stack"]:
+                o.append(bwd_node)
+        assert len(o) == 2
+        res[fwd_node] = o
+    return res
+
+
+def force_tp_constraints(autop, mm_nodes, feat_dim=1, bwd_constraint=False):
+    # out = x @ w - S(0)R, RS(1) -> S(0)S(1)
+    # g_w = g.T @ x - S(1)S(0), S(0)R -> PS(0)
+    # g_x = g @ w.T - S(0)S(1), RS(0) -> S(0)P
+
+    add_node_constraint = autop.sharding_optimizer.add_node_constraint
+    fwd_bwd_groups = group_mm_nodes_with_its_gradients(mm_nodes)
+    fwd_nodes = list(fwd_bwd_groups.keys())
+    dim1 = 0 if feat_dim == 1 else 1
+    dim2 = 1 if feat_dim == 1 else 0
+    # assume there are 7 mm nodes per transformer block
+    # skip last mm as it's the final projection layer
+    assert (
+        len(fwd_nodes) - 1
+    ) % 7 == 0, f"expected 7 mm nodes per transformer block, {len(fwd_nodes) - 1}"
+    for block in range(0, len(fwd_nodes) - 1, 7):
+        fwd_nodes_block = fwd_nodes[block : block + 7]
+        # force the first 3 mm nodes to be S(0)S(1)
+        the_nodes = fwd_nodes_block[:3] + fwd_nodes_block[4:6]
+        for n in the_nodes:
+            add_node_constraint(n, (Shard(0), Shard(feat_dim)))
+            add_node_constraint(n.all_input_nodes[0], (Shard(0), Replicate()))
+            add_node_constraint(n.all_input_nodes[1], (Replicate(), Shard(1)))
+
+            if bwd_constraint:
+                bwd_nodes = fwd_bwd_groups[n]
+                # first is g_w, second is g_x
+                add_node_constraint(bwd_nodes[0], (Partial(), Shard(dim1)))
+                add_node_constraint(bwd_nodes[1], (Shard(0), Partial()))
+
+        # add reduction to finish TP, yielding S(0)P
+        the_nodes = fwd_nodes_block[3:4] + fwd_nodes_block[6:7]
+        for n in the_nodes:
+            add_node_constraint(n, (Shard(0), Partial()))
+            add_node_constraint(n.all_input_nodes[0], (Shard(0), Shard(feat_dim)))
+            add_node_constraint(n.all_input_nodes[1], (Replicate(), Shard(0)))
+
+            if bwd_constraint:
+                bwd_nodes = fwd_bwd_groups[n]
+                # first is g_w, second is g_x
+                add_node_constraint(bwd_nodes[0], (Partial(), Shard(dim2)))
+                add_node_constraint(bwd_nodes[1], (Shard(0), Shard(feat_dim)))
+
+
+def add_tp_constraints(autop):
+    mm_nodes = autop.gm.graph.find_nodes(
+        op="call_function", target=torch.ops.aten.mm.default
+    )
+    einsum_nodes = autop.gm.graph.find_nodes(
+        op="call_function", target=torch.ops.aten.einsum.default
+    )
+    assert (len(mm_nodes) > 0) ^ (
+        len(einsum_nodes) > 0
+    ), f"only one should be non-empty, got {len(mm_nodes)} and {len(einsum_nodes)}"
+    feat_dim = 1 if len(mm_nodes) > 0 else 2
+    tgt_nodes = mm_nodes + einsum_nodes
+    force_tp_constraints(autop, tgt_nodes, feat_dim=feat_dim, bwd_constraint=True)
+
+    if einsum_nodes:
+        # add sequence parallelism if we have einsum nodes
+        autop.sharding_optimizer.add_node_constraint(
+            list(tgt_nodes[3].users)[0], (Shard(0), Shard(1))
+        )
+        autop.sharding_optimizer.add_node_constraint(
+            list(list(tgt_nodes[3].users)[0].users)[0], (Shard(0), Shard(1))
+        )
+
+
# parallelize the model
with AutoParallel(
    model, input_fn, mesh, mp_policy, compile=True, repeated_subgraphs=True
@@ -626,17 +712,9 @@ def input_fn():
    autop.add_input_constraints([x_sharding])
    autop.add_output_constraints([out_sharding])

-    # example of how to add manual constraints
-    if use_1d_mesh:
-        # add constraint on the output sharding of embedding bag
-        # otherwise it might decide that it's ok to replicate both inputs. This is indeed fine
-        # for 1d but the current cost model doesn't take output memory into account, so it thinks
-        # it is not expensive. I should add an activation memory constraint as well to avoid
-        # those cases
-        embedding_nodes = autop.gm.graph.find_nodes(
-            op="call_function", target=torch.ops.aten.embedding.default
-        )
-        autop.sharding_optimizer.add_node_constraint(embedding_nodes[0], x_sharding)
+    enable_manual_constraint = False
+    if enable_manual_constraint and not use_1d_mesh:
+        add_tp_constraints(autop)

    t = time.time()
    sharding_placement = autop.optimize_placement()
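
The placement notation in the comments of force_tp_constraints (S(d) = sharded along dim d, R = replicated, P = partial, i.e. pending a reduction) can be sanity-checked without any distributed setup. Below is a minimal single-process sketch, not part of this change and using plain tensors rather than DTensors, that reproduces the two matmul rules the constraints encode: a column-wise weight shard yields a feature-sharded output (S(0)R, RS(1) -> S(0)S(1)), while a row-wise shard yields partial outputs that only match the full matmul after the reduction that "finishes TP".

# Illustrative sketch only; names and sizes are made up for the example.
import torch

tp = 4                    # pretend tensor-parallel degree
x = torch.randn(8, 16)    # activations: S(0) over batch, replicated over tp
w = torch.randn(16, 32)   # weight

# Column-parallel: w sharded on dim 1 -> each "rank" computes a slice of the
# output features; concatenating along dim 1 recovers the full matmul (S(0)S(1)).
out_colwise = torch.cat([x @ w_shard for w_shard in w.chunk(tp, dim=1)], dim=1)
assert torch.allclose(out_colwise, x @ w, atol=1e-5)

# Row-parallel: x sharded on its feature dim, w sharded on dim 0 -> each "rank"
# produces a full-shaped but partial result (P); summing them is the reduction
# that recovers the full matmul (S(0)P -> S(0)R).
partials = [xc @ wr for xc, wr in zip(x.chunk(tp, dim=1), w.chunk(tp, dim=0))]
assert torch.allclose(torch.stack(partials).sum(dim=0), x @ w, atol=1e-5)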