From bbd4d7f12791df71c1a1a601fddfcb5a5bb4f407 Mon Sep 17 00:00:00 2001 From: hsmallbone Date: Wed, 8 Jan 2025 13:21:33 +0800 Subject: [PATCH] feat: Add no_ssh multinode launcher option for deepspeed --- src/accelerate/commands/config/cluster.py | 2 +- src/accelerate/utils/constants.py | 2 +- src/accelerate/utils/launch.py | 8 ++++++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/accelerate/commands/config/cluster.py b/src/accelerate/commands/config/cluster.py index 43c369f8c7b..8e19d25f736 100644 --- a/src/accelerate/commands/config/cluster.py +++ b/src/accelerate/commands/config/cluster.py @@ -350,7 +350,7 @@ def get_cluster_input(): "Please specify the location of hostfile: ", str, ) - + is_exclusion_filter = _ask_field( "Do you want to specify exclusion filter string? [yes/NO]: ", _convert_yes_no_to_bool, diff --git a/src/accelerate/utils/constants.py b/src/accelerate/utils/constants.py index a6d7d262678..af5a95da123 100644 --- a/src/accelerate/utils/constants.py +++ b/src/accelerate/utils/constants.py @@ -41,7 +41,7 @@ "2.1.0.a0+32f93b1" # Technically should be 2.1.0, but MS-AMP uses this specific prerelease in their Docker image. ) FSDP_MODEL_NAME = "pytorch_model_fsdp" -DEEPSPEED_MULTINODE_LAUNCHERS = ["pdsh", "standard", "openmpi", "mvapich", "mpich"] +DEEPSPEED_MULTINODE_LAUNCHERS = ["pdsh", "standard", "openmpi", "mvapich", "mpich", "nossh"] TORCH_DYNAMO_MODES = ["default", "reduce-overhead", "max-autotune"] ELASTIC_LOG_LINE_PREFIX_TEMPLATE_PYTORCH_VERSION = "2.2.0" XPU_PROFILING_AVAILABLE_PYTORCH_VERSION = "2.4.0" diff --git a/src/accelerate/utils/launch.py b/src/accelerate/utils/launch.py index c6f3d60031d..7e413ee5fb4 100644 --- a/src/accelerate/utils/launch.py +++ b/src/accelerate/utils/launch.py @@ -321,8 +321,12 @@ def prepare_deepspeed_cmd_env(args: argparse.Namespace) -> Tuple[List[str], Dict args.deepspeed_multinode_launcher = DEEPSPEED_MULTINODE_LAUNCHERS[0] if num_machines > 1 and args.deepspeed_multinode_launcher != DEEPSPEED_MULTINODE_LAUNCHERS[1]: - cmd = ["deepspeed", "--no_local_rank"] - cmd.extend(["--hostfile", str(args.deepspeed_hostfile), "--launcher", str(args.deepspeed_multinode_launcher)]) + cmd = ["deepspeed"] + cmd.extend(["--hostfile", str(args.deepspeed_hostfile)]) + if args.deepspeed_multinode_launcher == "nossh": + cmd.extend(["--node_rank", str(args.machine_rank), "--no_ssh"]) + else: + cmd.extend(["--no_local_rank", "--launcher", str(args.deepspeed_multinode_launcher)]) if args.deepspeed_exclusion_filter is not None: cmd.extend( [