Commit 19f1fab

Use custom redistribute_cost for optimal PP->S(0)S(0) cost (#109)
* Use custom redistribute_cost for optimal PP->S(0)S(0) cost. Previously, given the default iteration order, it was always less expensive to do PP->S(0)P->S(0)S(0) instead of directly PP->S(0)S(0). We now favor doing it in a single pass by using the optimal redistribution cost for a given operation. For now, we only consider the PP->S(0)S(0) case, but we should generalize this to all cases in the future.
* Add all-to-all cost. Need to verify if it's correct.
* Using a2a == ag*4 gives better results for llama3 8b.
1 parent c680107 commit 19f1fab
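A minimal, standalone sketch of the iteration-order rule that estimate_strategy_comms_cost (added below) applies. Plain strings stand in for the real Partial()/Shard(0) placement objects so it runs without a device mesh, and pick_iteration_order is a hypothetical name used only for this illustration:

def pick_iteration_order(src_placements, tgt_placements, ndim):
    # Default: walk the mesh dimensions in canonical order.
    order = list(range(ndim))
    # Only the PP -> S(0)S(0) case gets the reversed, non-canonical order for now.
    if src_placements == ("P", "P") and tgt_placements == ("S(0)", "S(0)"):
        order = [1, 0]
    return order

assert pick_iteration_order(("P", "P"), ("S(0)", "S(0)"), 2) == [1, 0]
assert pick_iteration_order(("P", "P"), ("S(0)", "P"), 2) == [0, 1]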

File tree

3 files changed

+119
-1
lines changed

3 files changed

+119
-1
lines changed
autoparallel/collective_runtime_estimation.py

Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.

import torch.distributed.tensor._dtensor_spec as dtensor_spec
from torch.distributed.tensor._collective_utils import (
    MeshTopoInfo,
    allgather_cost,
    allreduce_cost,
    reduce_scatter_cost,
    spec_to_bytes,
)
from torch.distributed.tensor.placement_types import Partial, Shard


def all_to_all_cost(bytes_gb: float, mesh_topo: MeshTopoInfo, mesh_dim: int) -> float:
    num_devices_on_mesh_dim = mesh_topo.mesh_dim_devices[mesh_dim]
    mesh_dim_bandwidth = mesh_topo.mesh_dim_bandwidth[mesh_dim]
    num_hops = num_devices_on_mesh_dim**2
    # base latency + comm latency
    latency = 6.6 + num_hops * mesh_topo.mesh_dim_latency[mesh_dim]  # us
    bw = (bytes_gb * num_hops / num_devices_on_mesh_dim) / mesh_dim_bandwidth  # s
    return latency + bw * 1e6  # rescale to us


# This is a copy-paste from https://github.com/pytorch/pytorch/blob/main/torch/distributed/tensor/_collective_utils.py
# with an iteration order introduced.
# TODO: this should be improved, as we really only use the non-canonical order for
# PP->S(0)S(0) for now
def redistribute_cost(
    current_spec: "dtensor_spec.DTensorSpec",
    target_spec: "dtensor_spec.DTensorSpec",
    order: list[int],
) -> float:
    """
    This function returns the cost of redistributing from the current to the target DTensorSpec.

    NOTE:
    1. Only the communication cost is considered here, since the computation cost of a
       redistribute is trivial (i.e. we only need a narrow or a simple division).
    2. Only the redistribute cost on the same mesh is considered; cross-mesh communication
       cost is not needed for operator strategy estimation/selection.
    """
    if current_spec.mesh != target_spec.mesh:
        # make the cost infinite if the meshes are not the same
        # TODO: see if we want to support this once there's cross-mesh communication
        return float("inf")

    if current_spec.is_replicated():
        # short-cut:
        # comm cost is 0 if the current spec is already fully replicated
        return 0.0

    mesh_topo = MeshTopoInfo.build_from_mesh(current_spec.mesh)
    cost = 0.0
    comm_bytes_gb = (
        spec_to_bytes(current_spec) / current_spec.num_shards / 1024 / 1024 / 1024
    )
    # Transformations considered for redistribute cost:
    # 1. allgather 2. alltoall
    # 3. allreduce 4. reduce_scatter
    curr_placements = [current_spec.placements[i] for i in order]
    tgt_placements = [target_spec.placements[i] for i in order]
    for i, current, target in zip(order, curr_placements, tgt_placements):
        if current == target:
            continue
        num_devices_on_mesh_dim = mesh_topo.mesh_dim_devices[i]
        if current.is_shard() and target.is_replicate():
            # allgather gives larger comm bytes
            comm_bytes_gb *= num_devices_on_mesh_dim
            # add up allgather comm cost
            cost += allgather_cost(comm_bytes_gb, mesh_topo, i)
        elif current.is_shard() and target.is_shard():
            # this should be an alltoall comm; since we haven't implemented it yet,
            # add a penalty to favor allgather instead
            # cost += all_to_all_cost(comm_bytes_gb, mesh_topo, i)
            cost += allgather_cost(comm_bytes_gb, mesh_topo, i) * 4.0
        elif current.is_partial() and target.is_replicate():
            # add up allreduce comm cost
            cost += allreduce_cost(comm_bytes_gb, mesh_topo, i)
        elif current.is_partial() and target.is_shard():
            # add up reduce_scatter comm cost
            cost += reduce_scatter_cost(comm_bytes_gb, mesh_topo, i)
            # after reduce_scatter the comm bytes for further collectives are halved
            comm_bytes_gb /= num_devices_on_mesh_dim
        elif current.is_shard() and target.is_partial():
            # ban shard -> partial as it does not make sense to perform
            # this redistribute
            return float("inf")

    return cost


def estimate_strategy_comms_cost(src_spec, tgt_spec):
    order = list(range(src_spec.mesh.ndim))
    if src_spec.placements == (Partial(), Partial()) and tgt_spec.placements == (
        Shard(0),
        Shard(0),
    ):
        order = [1, 0]
    comms_cost = redistribute_cost(src_spec, tgt_spec, order)
    return comms_cost
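For intuition, the new alltoall cost model above is the sum of a latency term and a bandwidth term. The sketch below restates the same formula with plain numbers instead of a MeshTopoInfo; the device count, link bandwidth, and per-hop latency are made-up illustrative values, not measurements:

def all_to_all_cost_example(bytes_gb, num_devices, bandwidth_gb_s, link_latency_us):
    # Mirrors all_to_all_cost above, but takes plain numbers instead of a MeshTopoInfo.
    num_hops = num_devices**2
    latency = 6.6 + num_hops * link_latency_us  # base latency + comm latency, in us
    bw = (bytes_gb * num_hops / num_devices) / bandwidth_gb_s  # seconds
    return latency + bw * 1e6  # total cost in us

# 1 GB payload across 8 devices on a 100 GB/s link with 1.7 us per-hop latency:
# latency term = 6.6 + 64 * 1.7 = 115.4 us, bandwidth term = (1 * 64 / 8) / 100 s = 80000 us,
# so the bandwidth term dominates.
print(all_to_all_cost_example(1.0, 8, 100.0, 1.7))  # ~80115.4 us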

autoparallel/optimize_sharding.py

Lines changed: 14 additions & 0 deletions
@@ -96,6 +96,7 @@
 from torch.distributed.tensor.placement_types import Placement, Replicate, Shard
 from torch.utils._pytree import tree_flatten, tree_map_only
 
+from .collective_runtime_estimation import estimate_strategy_comms_cost
 from .compute_estimation import (
     _get_sharded_shape_stride,
     estimate_strategy_runtime_cost,
@@ -301,6 +302,19 @@ def build_ds(self):
             if node.op != "placeholder":
                 argi_strat = self.strats[self._all_input_nodes(node)[argi]]
             for ii, comm_cost in enumerate(xxi):
+                if node.op != "placeholder":
+                    src_spec = argi_strat.strategies[ii].output_specs
+                    # TODO: operator.getitem being special is something
+                    # we might want to change in the future
+                    if node.target == operator.getitem:
+                        src_spec = src_spec[node.args[1]]
+                    tgt_spec = ssi.input_specs[argi]
+                    assert isinstance(src_spec, DTensorSpec)
+                    assert isinstance(tgt_spec, DTensorSpec)
+                    # we use our custom comm_cost function to estimate the cost
+                    # of the collective operation
+                    comm_cost = estimate_strategy_comms_cost(src_spec, tgt_spec)
+
                 if node in grad_param_nodes:
                     comm_cost = comm_cost / self.rescale_grad_comm_cost_for_mp
                 # Imagine we start node_i from S(0)S(0) and we want to reach node_{i+2} at

autoparallel/ordered_sharding.py

Lines changed: 2 additions & 1 deletion
@@ -37,7 +37,8 @@ def _optimize_same_nd_sharding_as_1d(
         return redistribute_local_tensor(arg, curr_spec, tgt_spec)
 
     # TODO: make this more general, I'm playing safe for now
-    if not (curr_spec_first == Shard(0) and tgt_spec_first == Replicate()):
+    allowed_placements = [(Shard(0), Replicate()), (Partial(), Shard(0))]
+    if (curr_spec_first, tgt_spec_first) not in allowed_placements:
         print(f"NOT doing optimization for {str(curr_spec)} -> {str(tgt_spec)}")
         return redistribute_local_tensor(arg, curr_spec, tgt_spec)
 
0 commit comments
