meta-pytorch · fmassa · Jul 4, 2025 · Jul 4, 2025 · Jul 4, 2025 · Jul 4, 2025
diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py
@@ -490,7 +490,12 @@ def print_costs_for_node(self, node, arg=0, **kwargs):
         from torch.distributed.tensor._op_schema import _pretty_print_spec
 
         tgt_strat = self.strats[node]
-        src_strat = self.strats[node.args[arg]]
+        # Flatten node.args ourselves instead of using node.all_input_nodes:
+        # the latter drops duplicate nodes, so `arg` could index the wrong input.
+        all_input_nodes = [
+            x for x in tree_flatten(node.args)[0] if isinstance(x, torch.fx.Node)
+        ]
+        src_strat = self.strats[all_input_nodes[arg]]
         src_placements = [""] + [
             _pretty_print_spec(x.output_specs) for x in src_strat.strategies
         ]

diff --git a/autoparallel/propagation_rules.py b/autoparallel/propagation_rules.py
@@ -668,6 +668,127 @@ def index_rule(mesh, op_schema):
     return out_strat
 
 
+@register_opschema_rule(torch.ops.aten.sort.stable)
+def sort_rule(mesh, op_schema):
+    """Build the aten.sort.stable strategy by reusing DTensor's topk strategy."""
+    # ``mesh`` is unused here; only ``op_schema`` is handed to the topk function.
+    # NOTE(review): sort.stable and topk take different arguments -- confirm
+    # that the topk strategy reads the intended sort dimension from op_schema.
+    propagator = torch.distributed.tensor.DTensor._op_dispatcher.sharding_propagator
+    topk_strategy_fn = propagator.op_strategy_funcs[torch.ops.aten.topk.default]
+    return topk_strategy_fn(op_schema)
+
+
+@register_opschema_rule(torch.ops.aten.gather.default)
+def gather_strategy(mesh, op_schema):
+    """Enumerate sharding strategies for ``aten.gather(input, dim, index)``.
+
+    Each per-mesh-dim candidate is a placement list [output, input, index];
+    the candidates are expanded over ``mesh`` by
+    ``expand_to_full_mesh_op_strategy`` and the result is returned.
+
+    A negative ``dim`` is normalized against the input rank before use.
+    """
+    from torch.distributed.tensor._op_schema import PlacementList
+    from torch.distributed.tensor._ops._embedding_ops import _MaskPartial
+    from torch.distributed.tensor._ops.utils import expand_to_full_mesh_op_strategy
+
+    input_shape = op_schema.args_schema[0].shape
+    index_shape = op_schema.args_schema[2].shape
+    dim = op_schema.args_schema[1]
+    if dim < 0:
+        # Normalize so Shard(dim) and the ``d != dim`` test below see the real
+        # axis; a raw negative dim never equals any ``d`` in range(ndim).
+        dim += len(input_shape)
+
+    # First candidate: output and both tensor inputs replicated.
+    all_replicate: PlacementList = [Replicate()] * 3
+    single_mesh_dim_strategies = [all_replicate]
+
+    # Input sharded on the gather dim; index and output use a mask partial.
+    # Only offered when index has size 1 on the gather dim.
+    if index_shape[dim] == 1:
+        mask_partial = _MaskPartial(offset_shape=input_shape, offset_dim=dim)
+        single_mesh_dim_strategies.append([mask_partial, Shard(dim), mask_partial])
+
+    # Index sharded on the gather dim, input replicated, output follows index.
+    index_sharding: PlacementList = [Shard(dim), Replicate(), Shard(dim)]
+    single_mesh_dim_strategies.append(index_sharding)
+
+    # With equal ranks, shard a non-gather dim on all three tensors.
+    # NOTE(review): assumes input and index have the same size on d -- confirm.
+    if len(input_shape) == len(index_shape):
+        for d in range(len(input_shape)):
+            if d != dim:
+                single_mesh_dim_strategies.append([Shard(d), Shard(d), Shard(d)])
+
+    return expand_to_full_mesh_op_strategy(
+        mesh, op_schema, single_mesh_dim_strategies, input_index=1
+    )
+
+
+@register_opschema_rule(torch.ops.aten.scatter_add.default)
+def scatter_add_strategy(mesh, op_schema):
+    """Enumerate sharding strategies for ``aten.scatter_add(input, dim, index, src)``.
+
+    Each per-mesh-dim candidate is a placement list [output, input, index, src];
+    the candidates are expanded over ``mesh`` by
+    ``expand_to_full_mesh_op_strategy`` and the result is returned.
+
+    A negative ``dim`` is normalized against the input rank before use. ``src``
+    (args_schema[3]) is not inspected; only its placement slot is filled in.
+    """
+    from torch.distributed.tensor._op_schema import PlacementList
+    from torch.distributed.tensor._ops.utils import expand_to_full_mesh_op_strategy
+
+    input_shape = op_schema.args_schema[0].shape
+    index_shape = op_schema.args_schema[2].shape
+    dim = op_schema.args_schema[1]
+    if dim < 0:
+        # Normalize so Shard(dim) and the ``d != dim`` test below see the real
+        # axis; a raw negative dim never equals any ``d`` in range(ndim).
+        dim += len(input_shape)
+
+    # First candidate: output and all three tensor inputs replicated.
+    all_replicate: PlacementList = [Replicate()] * 4
+    single_mesh_dim_strategies = [all_replicate]
+
+    # TODO: no _MaskPartial candidate (input sharded on the scatter dim) is
+    # offered here yet.
+
+    # Index and src sharded on the scatter dim, input replicated, output
+    # declared as Shard(dim).
+    # NOTE(review): the output has the input's shape, so confirm that Shard(dim)
+    # (rather than a partial placement) is really what this candidate produces.
+    index_sharding: PlacementList = [Shard(dim), Replicate(), Shard(dim), Shard(dim)]
+    single_mesh_dim_strategies.append(index_sharding)
+
+    # With equal ranks, shard a non-scatter dim on all four tensors.
+    # NOTE(review): assumes input, index and src have the same size on d -- confirm.
+    if len(input_shape) == len(index_shape):
+        for d in range(len(input_shape)):
+            if d != dim:
+                all_on_d: PlacementList = [Shard(d), Shard(d), Shard(d), Shard(d)]
+                single_mesh_dim_strategies.append(all_on_d)
+
+    # input_index=1 marks where the inputs start in each placement list: one
+    # output placement first, then the inputs.
+    return expand_to_full_mesh_op_strategy(
+        mesh, op_schema, single_mesh_dim_strategies, input_index=1
+    )
+
+
+@register_opschema_rule(torch.ops.aten.slice_scatter.default)
+def slice_scatter_rule(mesh, op_schema):
+    """Return what DTensor's own slice_scatter strategy function produces."""
+    # Look the function up in the sharding propagator's op_strategy_funcs table
+    # and call it with ``op_schema`` only; ``mesh`` is not used by this rule.
+    slice_scatter_op = torch.ops.aten.slice_scatter.default
+    propagator = torch.distributed.tensor.DTensor._op_dispatcher.sharding_propagator
+    strategy_fn = propagator.op_strategy_funcs[slice_scatter_op]
+    return strategy_fn(op_schema)
+
+
 def sdpa_rule(op, mesh, op_schema):
     out_strat = torch.distributed.tensor.DTensor._op_dispatcher.sharding_propagator.op_strategy_funcs[
         op

diff --git a/autoparallel/utils.py b/autoparallel/utils.py
@@ -176,12 +176,32 @@ def _generate_dummy_strategy(
     return out_strat
 
 
+def keep_unique_configs(op_strat):
+    """Drop strategies whose (input_specs, output_specs) pair was already seen.
+
+    The first occurrence of each pair is kept, in the original order.
+    """
+    def _frozen(specs):
+        # Lists are unhashable; turn them into tuples for the set lookup.
+        return tuple(specs) if isinstance(specs, list) else specs
+
+    seen = set()
+    unique = []
+    for candidate in op_strat.strategies:
+        signature = (_frozen(candidate.input_specs), _frozen(candidate.output_specs))
+        if signature not in seen:
+            seen.add(signature)
+            unique.append(candidate)
+    return OpStrategy(unique)
+
+
 def get_placement_options(mesh, op, specs, user_args, user_kwargs, fake_mode):
     # print(op)
 
     if op in _op_rules:
         out_strat = _op_rules[op](mesh, specs)
         out_strat = remove_invalid_configs(out_strat, mesh)
+        out_strat = keep_unique_configs(out_strat)
         return out_strat
 
     strat = []
@@ -224,6 +244,7 @@ def get_placement_options(mesh, op, specs, user_args, user_kwargs, fake_mode):
     propagate_tensor_meta(op, user_args, user_kwargs, out_strat, fake_mode)
     fill_missing_redistribute_cost(op, specs, out_strat)
     out_strat = remove_invalid_configs(out_strat, mesh)
+    out_strat = keep_unique_configs(out_strat)
 
     return out_strat