Draft
Commits (30)
52ea0c1
[WIP] Add basic DeepSeekV3
fmassa Jul 4, 2025
0d3ae2d
Lint
fmassa Jul 4, 2025
98d9dfd
Workarounds to make graph capture pass
fmassa Jul 4, 2025
61a63c4
Add dummy propagation rules just to see what we need to implement
fmassa Jul 4, 2025
67eb264
Cleanup
fmassa Jul 4, 2025
86d53ff
prims.fma comes from softmax_backward
fmassa Jul 4, 2025
7864f4d
Make _geenrate_dummy_strategy more generic
fmassa Jul 5, 2025
60ccf1a
Add proper redistribute_cost to dummy strategies
fmassa Jul 5, 2025
dbbc205
Hack around missing dtypes in compute estimation and handle grouped_m…
fmassa Jul 5, 2025
d92f8c6
Add representative batch size
fmassa Jul 5, 2025
e25ff7b
Fix grouped_mm stride issue
wconstab Jul 18, 2025
3b7e7fa
get DS3 running forward, OOM at backward
wconstab Jul 18, 2025
3833a06
WIP factory_strategy
wconstab Jul 18, 2025
3740b45
Start rebasing on top of main
fmassa Jul 25, 2025
39fedfd
Merge branch 'main' of github.com:pytorch-labs/autoparallel into fmas…
fmassa Jul 25, 2025
6bec5f5
Fixes so that it runs
fmassa Jul 25, 2025
ce1c0a5
[WIP] Plumb fake_mode to avoid materializing memory
fmassa Jul 26, 2025
5d79bec
Use more representative values for DS3 example
fmassa Jul 26, 2025
daea5a2
Add approximate flop formula to grouped_mm
fmassa Jul 26, 2025
6d350e0
Merge branch 'main' of github.com:pytorch-labs/autoparallel into fmas…
fmassa Jul 27, 2025
418ad55
Glimpses of having DeepSeekV3 returning a reasonable solution
fmassa Jul 27, 2025
fce321f
Merge branch 'main' of github.com:pytorch-labs/autoparallel into fmas…
fmassa Jul 30, 2025
6d5747a
Use with_implicit_strategies instead of my generate_dummy_strategy
fmassa Jul 30, 2025
e0ae8a2
[WIP] Convert view->mm->view into matmul
fmassa Jul 30, 2025
1b83581
Merge branch 'main' of github.com:pytorch-labs/autoparallel into fmas…
fmassa Jul 31, 2025
cf1229d
Merge branch 'main' of github.com:pytorch-labs/autoparallel into fmas…
fmassa Aug 4, 2025
4fe5a40
Merge branch 'main' of github.com:meta-pytorch/autoparallel into fmas…
fmassa Aug 9, 2025
67542ad
Remove sharding rules that have been since moved to PyTorch
fmassa Aug 9, 2025
779e808
Merge branch 'main' of github.com:meta-pytorch/autoparallel into fmas…
fmassa Sep 4, 2025
124034e
Fixes after rebase
fmassa Sep 4, 2025
3 changes: 2 additions & 1 deletion autoparallel/api.py
@@ -184,6 +184,7 @@ def _get_decomp_table():
decomp_table.pop(torch.ops.aten.native_layer_norm.default)
decomp_table.pop(torch.ops.aten.embedding_dense_backward.default)
decomp_table.pop(torch.ops.aten.native_layer_norm_backward.default)
decomp_table.pop(torch.ops.aten._softmax_backward_data.default)

# decompose addmm to allow for TP on mm
decomp_table.pop(torch.ops.aten.addmm.default)
@@ -246,7 +247,7 @@ def __init__(self, model, input_fn, mesh: DeviceMesh):
self.mesh = mesh
self.build_model_graph()

sharding_optimizer = ShardingOptimizer(self.gm, self.mesh)
sharding_optimizer = ShardingOptimizer(self.gm, self.mesh, self.fake_mode)
# makes sharding of params and gradients the same
sharding_optimizer.add_grad_param_constraints()
self.sharding_optimizer = sharding_optimizer
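For context on the new `decomp_table.pop` line above: per the commit notes, decomposing `aten._softmax_backward_data` introduces `prims.fma`, which has no sharding rule, so the op is kept whole. A minimal sketch of the pattern, assuming the table starts from `core_aten_decompositions()` (an assumption here; `_get_decomp_table` may build it differently):

```python
import torch
from torch._decomp import core_aten_decompositions

# Start from the default core ATen decompositions and keep selected ops whole
# so their dedicated sharding rules apply instead of decomposed prims.
decomp_table = core_aten_decompositions()
for op in (
    torch.ops.aten.native_layer_norm.default,
    torch.ops.aten._softmax_backward_data.default,
):
    decomp_table.pop(op, None)  # pop(..., None): no-op if the op is absent
```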
46 changes: 39 additions & 7 deletions autoparallel/compute_estimation.py
@@ -8,7 +8,38 @@

import torch
from torch.utils._pytree import tree_map_only
from torch.utils.flop_counter import FlopCounterMode
from torch.utils.flop_counter import FlopCounterMode, register_flop_formula


@register_flop_formula(torch.ops.aten._grouped_mm)
def gmm_flop(
a_shape, b_shape, offs_shape=None, bias_shape=None, out_shape=None, **kwargs
) -> int:
"""Count flops for the gmm operation."""
# a_shape and b_shape are the shapes of the two operands; offs_shape, when
# present, gives the number of groups for the jagged (2D) layout.
if len(a_shape) == 2:
assert offs_shape is not None
(b,) = offs_shape
m0, k = a_shape
# assumption: groups are roughly balanced, so this falls back to a bmm estimate
m = m0 // b
else:
assert offs_shape is None
b, m, k = a_shape
if len(b_shape) == 2:
assert offs_shape is not None
(b2,) = offs_shape
k2, n0 = b_shape
# assumption: groups are roughly balanced, so this falls back to a bmm estimate
n = n0 // b2
else:
b2, k2, n = b_shape
assert b == b2
assert k == k2
# NB(chilli): Should be 2 * k - 1 technically for FLOPs.
flop = b * m * n * 2 * k
return flop


@dataclass
@@ -147,12 +178,13 @@ def _get_device_tflops(dtype):
f"Unsupported device: {device_name}. Supported devices: {[limit.name for limit in DEVICE_LIMITS]}"
)

if dtype not in device_limit.gemm_tflops:
raise ValueError(
f"Dtype {dtype} not supported on {device_limit.name}. Supported dtypes: {list(device_limit.gemm_tflops.keys())}"
)
# TODO: add proper support for int64 etc
# if dtype not in device_limit.gemm_tflops:
# raise ValueError(
# f"Dtype {dtype} not supported on {device_limit.name}. Supported dtypes: {list(device_limit.gemm_tflops.keys())}"
# )

return device_limit.gemm_tflops[dtype]
return device_limit.gemm_tflops.get(dtype, 1)


def _get_sharded_shape_stride(spec):
@@ -213,7 +245,7 @@ def estimate_strategy_runtime_cost(node, strategy):

# TODO: maybe cache the flop_counter to avoid recreating it
# all the time
with FlopCounterMode(display=False) as flop_counter:
with FlopCounterMode(display=False) as flop_counter, fake_mode:
node.target(*args, **kwargs)

flops = flop_counter.get_total_flops()
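As a sanity check of the `gmm_flop` formula added above, here is a self-contained restatement evaluated on made-up shapes (the helper name and shapes are illustrative, not part of the diff):

```python
def approx_grouped_mm_flops(a_shape, b_shape, offs_shape=None):
    # Mirrors gmm_flop above: when an operand is 2D, the offsets tensor gives
    # the number of groups and rows are assumed to split roughly evenly.
    if len(a_shape) == 2:
        (groups,) = offs_shape
        m0, k = a_shape
        m = m0 // groups
    else:
        groups, m, k = a_shape
    if len(b_shape) == 2:
        k2, n0 = b_shape
        n = n0 // offs_shape[0]
    else:
        _, k2, n = b_shape
    assert k == k2
    return groups * m * n * 2 * k  # ~2*m*k*n multiply-adds per (balanced) group


# e.g. 8192 tokens routed across 4 experts, each expert a 1024x2048 matmul
assert approx_grouped_mm_flops(
    (8192, 1024), (4, 1024, 2048), offs_shape=(4,)
) == 4 * 2048 * 2048 * 2 * 1024
```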
11 changes: 10 additions & 1 deletion autoparallel/export_module.py
@@ -204,32 +204,41 @@ def rename_nodes(fx_g, nodes, new_name, idxs=None):
# TODO: align number of grad names with inputs everywhere?
all_output_nodes = fx_g.graph.find_nodes(op="output")[0].all_input_nodes
output_nodes = all_output_nodes[: metadata.num_outputs]
print("output")
rename_nodes(fx_g, output_nodes, "output")
param_grad = all_output_nodes[
metadata.num_outputs : metadata.num_outputs + params_len
]
print("grad_param")
rename_nodes(fx_g, param_grad, "grad_param")
grad_inputs = all_output_nodes[metadata.num_outputs + params_len :]
inputs_that_require_grad = [
i for i, n in enumerate(metadata.input_info[params_len:]) if n.requires_grad
]
rename_nodes(fx_g, grad_inputs, "grad_input", inputs_that_require_grad)
print("grad_input")

# TODO: figure out and fix why this is not working
# rename_nodes(fx_g, grad_inputs, "grad_input", inputs_that_require_grad)
Review comment (Contributor Author): Can someone have a look and see what was going wrong here and why we had more nodes than expected?

Reply (@bdhirsh, Jul 7, 2025): Took a look - if we're willing to support a copy_ mutable epilogue at the end of the joint graph (similar to what compile handles today), I can uncomment rename_nodes after patching in these PRs:

pytorch/pytorch#157730

#32


tangent_nodes = fx_g.graph.find_nodes(op="placeholder")[
-len(metadata.traced_tangents) :
]
outputs_that_require_grad = [
i for i, n in enumerate(metadata.output_info) if n.requires_grad
]
print("tangents")
rename_nodes(fx_g, tangent_nodes, "tangents", outputs_that_require_grad)
input_nodes = fx_g.graph.find_nodes(op="placeholder")[
params_len + buffer_len : -len(metadata.traced_tangents)
]
print("input")
rename_nodes(fx_g, input_nodes, "input")
param_nodes = fx_g.graph.find_nodes(op="placeholder")[:params_len]
print("param")
rename_nodes(fx_g, param_nodes, "param")

buffer_nodes = fx_g.graph.find_nodes(op="placeholder")[
params_len : params_len + buffer_len
]
print("buffer")
rename_nodes(fx_g, buffer_nodes, "buffer")
20 changes: 16 additions & 4 deletions autoparallel/optimize_sharding.py
@@ -114,10 +114,11 @@ def _get_next_name(name):


class ShardingOptimizer:
def __init__(self, gm, mesh):
def __init__(self, gm, mesh, fake_mode):
self.gm = gm
self.graph = gm.graph
self.mesh = mesh
self.fake_mode = fake_mode
self.node_map = {node: i for i, node in enumerate(self.graph.nodes)}
self.strats = self.build_sharding_metadata()
# ds: Decision variables dictionary mapping (s_i, argi, ss, ii) -> ILP variable data
@@ -168,7 +169,12 @@ def build_sharding_metadata(self):
strats[node] = strat
else:
strat = get_placement_options(
self.mesh, node.target, user_strats, user_args, user_kwargs
self.mesh,
node.target,
user_strats,
user_args,
user_kwargs,
self.fake_mode,
)
strats[node] = strat
elif node.op == "output":
@@ -215,7 +221,8 @@ def build_ds(self):
"num_output_strat": len(s.strategies),
}
for ss, ssi in enumerate(s.strategies):
compute_cost = estimate_strategy_runtime_cost(node, ssi)
with self.fake_mode:
compute_cost = estimate_strategy_runtime_cost(node, ssi)
for argi, xxi in enumerate(ssi.redistribute_cost):
for ii, comm_cost in enumerate(xxi):
va = pulp.LpVariable(
@@ -483,7 +490,12 @@ def print_costs_for_node(self, node, arg=0, **kwargs):
from torch.distributed.tensor._op_schema import _pretty_print_spec

tgt_strat = self.strats[node]
src_strat = self.strats[node.args[arg]]
# Use this instead of node.all_input_nodes because there could be
# duplicate nodes that get removed
all_input_nodes = [
x for x in tree_flatten(node.args)[0] if isinstance(x, torch.fx.Node)
]
src_strat = self.strats[all_input_nodes[arg]]
src_placements = [""] + [
_pretty_print_spec(x.output_specs) for x in src_strat.strategies
]
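The reason `fake_mode` is now threaded through `ShardingOptimizer` and wrapped around `estimate_strategy_runtime_cost` is so operators can be re-executed for flop counting without allocating real memory. A minimal sketch of the pattern, with a standalone `FakeTensorMode` standing in for the graph's fake mode (an assumption for illustration):

```python
import torch
from torch._subclasses.fake_tensor import FakeTensorMode
from torch.utils.flop_counter import FlopCounterMode

fake_mode = FakeTensorMode()
with FlopCounterMode(display=False) as flop_counter, fake_mode:
    a = torch.randn(4096, 1024)  # fake tensors: shapes/dtypes only, no storage
    b = torch.randn(1024, 2048)
    torch.mm(a, b)

print(flop_counter.get_total_flops())  # 2 * 4096 * 1024 * 2048
```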
153 changes: 142 additions & 11 deletions autoparallel/propagation_rules.py
@@ -50,11 +50,15 @@
_op_rules = {}


def register_rule(op):
def register_rule(ops):
global _op_rules

def wrapper(impl):
_op_rules[op] = impl
if isinstance(ops, list):
for op in ops:
_op_rules[op] = impl
else:
_op_rules[ops] = impl
return impl

return wrapper
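A self-contained sketch of what the extended `register_rule` enables, i.e. one implementation registered under several ops at once (the op keys and rule body are made up for illustration):

```python
_rules = {}

def register_rule(ops):
    def wrapper(impl):
        # Accept either a single op or a list of ops, as in the diff above.
        for op in ops if isinstance(ops, list) else [ops]:
            _rules[op] = impl
        return impl
    return wrapper

@register_rule(["aten.sin.default", "aten.cos.default"])  # string stand-ins for real overloads
def pointwise_rule(mesh, op_schema):
    return "same strategy for both ops"

assert _rules["aten.sin.default"] is _rules["aten.cos.default"] is pointwise_rule
```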
@@ -382,7 +386,7 @@ def factory_rule(mesh, op_schema: OpSchema) -> OpStrategy:
This util applies to any factory function that takes 'size' as the first argument,
and supports Replication and Shard placements all at zero cost.
"""
assert isinstance(op_schema.args_schema[0], torch.Size)
assert isinstance(op_schema.args_schema[0], (torch.Size, list))
shape = op_schema.args_schema[0]
x = torch.empty(shape, device="meta")
stride = x.stride()
@@ -424,8 +428,11 @@ def factory_rule(mesh, op_schema: OpSchema) -> OpStrategy:
* len(strategy_combs)
]

# TODO: should we add an input_spec here, so that we can ensure we always
# have input and output specs? For now I hacked it in utils.py
strategy = OpSpec(
output_specs=output_specs,
input_specs=[output_specs],
redistribute_cost=redistribute_cost,
)
all_strategies.append(strategy)
@@ -617,23 +624,26 @@ def _unsafe_index_rule(mesh, op_schema):
raise NotImplementedError()


@register_opschema_rule(torch.ops.aten.index.Tensor)
# Disabled: this rule's implementation is inferior to the baseline
# @register_opschema_rule(torch.ops.aten.index.Tensor)
def index_rule(mesh, op_schema):
raise NotImplementedError("Needs hardening, only tested on a few cases")
print(f"Ops that need to be implemented {torch.ops.aten.index.Tensor}")
# raise NotImplementedError("Needs hardening, only tested on a few cases")
strat = op_schema.args_schema
specs = strat # TODO: clean this up
res = []
idxs_placements = [(Replicate(), Replicate()), (Shard(0), Replicate())]
if strat[1].childs[0] is None:
idxs_placements = idxs_placements[:1]
else:
idxs_placements = idxs_placements[1:]
idxs_placements = [(Replicate(),) * mesh.ndim]
# if strat[1].childs[0] is None:
# idxs_placements = idxs_placements[:1]
# else:
# idxs_placements = idxs_placements[1:]
# TODO: this is a nasty hack and won't work for most of the cases
for i, ss in enumerate(strat[0].strategies):
for i, ss in enumerate(strat[0].strategies[:1]):
for plt in idxs_placements:
ispec = ss.input_specs[0]
ospec = DTensorSpec(mesh=mesh, placements=ispec.placements)
assert ss.output_spec == ispec
# assert ss.output_spec == ispec, f"{ss.output_spec}, {ispec}"
idxs_strats = [
DTensorSpec(mesh, placements=plt)
for x in strat[1].childs
@@ -658,6 +668,127 @@ def index_rule(mesh, op_schema):
return out_strat


@register_opschema_rule(torch.ops.aten.sort.stable)
def sort_rule(mesh, op_schema):
op = torch.ops.aten.topk.default
out_strat = torch.distributed.tensor.DTensor._op_dispatcher.sharding_propagator.op_strategy_funcs[
op
](
op_schema
)
return out_strat


@register_opschema_rule(torch.ops.aten.gather.default)
def gather_strategy(mesh, op_schema):
from torch.distributed.tensor._op_schema import PlacementList
from torch.distributed.tensor._ops._embedding_ops import _MaskPartial
from torch.distributed.tensor._ops.utils import expand_to_full_mesh_op_strategy

input_strategy = op_schema.args_schema[0]
dim = op_schema.args_schema[1]
index_strategy = op_schema.args_schema[2]

input_shape = input_strategy.shape
index_shape = index_strategy.shape

single_mesh_dim_strategies = []

# placement list stores placements of [output, input, index]
# first we always have replicate all for inputs and output
all_replicate: PlacementList = [Replicate()] * 3
single_mesh_dim_strategies.append(all_replicate)

# input sharding, input sharded, index accepts mask partial, output follows index
# this only works when the input is sharded on the gather dimension, and
# index has size 1 on the gather dimension
if index_shape[dim] == 1:
index_partial_placement = _MaskPartial(offset_shape=input_shape, offset_dim=dim)
input_sharding: PlacementList = [
index_partial_placement,
Shard(dim),
index_partial_placement,
]
single_mesh_dim_strategies.append(input_sharding)

# index sharding, input replicated, index sharded, output follows index
# this only works when the sharding dimension is the gather dimension
index_sharding: PlacementList = [Shard(dim), Replicate(), Shard(dim)]
single_mesh_dim_strategies.append(index_sharding)

if len(input_shape) == len(index_shape):
for d in range(len(input_shape)):
if d != dim:
sharding = [Shard(d), Shard(d), Shard(d)]
single_mesh_dim_strategies.append(sharding)

return expand_to_full_mesh_op_strategy(
mesh, op_schema, single_mesh_dim_strategies, input_index=1
)


@register_opschema_rule(torch.ops.aten.scatter_add.default)
def scatter_add_strategy(mesh, op_schema):
Review comment (Contributor Author): @wconstab @zpcore can we double-check these added rules and make sure they are valid / make sense?

The strategy for scatter_add is basically following what I've added for gather, which is that we can allow all tensors to be sharded on any dimension which is not the dim from gather.

from torch.distributed.tensor._op_schema import PlacementList

# from torch.distributed.tensor._ops._embedding_ops import _MaskPartial
from torch.distributed.tensor._ops.utils import expand_to_full_mesh_op_strategy

input_strategy = op_schema.args_schema[0]
dim = op_schema.args_schema[1]
index_strategy = op_schema.args_schema[2]
# src_strategy = op_schema.args_schema[3]

input_shape = input_strategy.shape
index_shape = index_strategy.shape

single_mesh_dim_strategies = []

# placement list stores placements of [output, input, index]
# first we always have replicate all for inputs and output
all_replicate: PlacementList = [Replicate()] * 4
single_mesh_dim_strategies.append(all_replicate)

"""
# input sharding, input sharded, index accepts mask partial, output follows index
# this only works when the input is sharded on the gather dimension, and
# index has size 1 on the gather dimension
if index_shape[dim] == 1:
index_partial_placement = _MaskPartial(offset_shape=input_shape, offset_dim=dim)
input_sharding: PlacementList = [
index_partial_placement,
Shard(dim),
index_partial_placement,
]
single_mesh_dim_strategies.append(input_sharding)
"""
# index sharding, input replicated, index sharded, output follows index
# this only works when the sharding dimension is the gather dimension
index_sharding: PlacementList = [Shard(dim), Replicate(), Shard(dim), Shard(dim)]
single_mesh_dim_strategies.append(index_sharding)

if len(input_shape) == len(index_shape):
for d in range(len(input_shape)):
if d != dim:
sharding = [Shard(d), Shard(d), Shard(d), Shard(d)]
single_mesh_dim_strategies.append(sharding)

return expand_to_full_mesh_op_strategy(
mesh, op_schema, single_mesh_dim_strategies, input_index=1
)
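To support the double-check requested in the review comment above, here is a local, single-process sanity check of the invariant behind the `Shard(d)`, `d != dim` strategies: gather along `dim` commutes with splitting every operand along another dimension, so such shardings need no communication; the same argument carries over to `scatter_add`. Shapes are illustrative only:

```python
import torch

dim = 1
x = torch.randn(8, 16)
idx = torch.randint(0, 16, (8, 4))

# Replicated result vs. "sharded" result: chunk input and index along dim 0
# (!= gather dim), gather each shard locally, then concatenate the pieces.
full = torch.gather(x, dim, idx)
pieces = [
    torch.gather(xs, dim, ixs)
    for xs, ixs in zip(x.chunk(2, dim=0), idx.chunk(2, dim=0))
]
torch.testing.assert_close(full, torch.cat(pieces, dim=0))
```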


@register_opschema_rule(torch.ops.aten.slice_scatter.default)
def slice_scatter_rule(mesh, op_schema):
op = torch.ops.aten.slice_scatter.default
out_strat = torch.distributed.tensor.DTensor._op_dispatcher.sharding_propagator.op_strategy_funcs[
op
](
op_schema
)
return out_strat
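`sort_rule`, `slice_scatter_rule`, and `sdpa_rule` below all use the same delegation pattern: look up a strategy function DTensor already registered and reuse it for a related op. A minimal sketch of the lookup (whether a given op has a registered strategy depends on the PyTorch build, so treat this as illustrative):

```python
import torch
from torch.distributed.tensor import DTensor

propagator = DTensor._op_dispatcher.sharding_propagator
topk_strategy = propagator.op_strategy_funcs.get(torch.ops.aten.topk.default)
# sort_rule above feeds the sort op's OpSchema to this function, relying on
# aten.sort.stable and aten.topk sharding the same way along the sorted dim.
```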


def sdpa_rule(op, mesh, op_schema):
out_strat = torch.distributed.tensor.DTensor._op_dispatcher.sharding_propagator.op_strategy_funcs[
op