From fcc5af5b948318a53042bc82dce4c6c9e3794e3d Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Tue, 31 Oct 2023 23:46:00 -0400 Subject: [PATCH] Edge-casing for multi-GPU HF-to-NeoX conversion (#1065) * edge-casing for multiGPU hf to sequential case * cleanup whitespace * Update NeoXArgs docs automatically * Update NeoXArgs docs automatically --------- Co-authored-by: github-actions Co-authored-by: Quentin Anthony --- configs/neox_arguments.md | 2 +- tools/ckpts/convert_hf_to_sequential.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 2d7794105..2b23f2207 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 2ff807d + Default = 7c50e77 current git hash of repository diff --git a/tools/ckpts/convert_hf_to_sequential.py b/tools/ckpts/convert_hf_to_sequential.py index 8a3902bce..be445ec72 100644 --- a/tools/ckpts/convert_hf_to_sequential.py +++ b/tools/ckpts/convert_hf_to_sequential.py @@ -519,7 +519,7 @@ def get_non_existing_dir(tmp_dir): model, optimizer, _, lr_scheduler = deepspeed.initialize( model=model, optimizer=optimizer, - args=neox_args, + # args=neox_args, lr_scheduler=lr_scheduler, dist_init_required=False, model_parameters=None, @@ -527,7 +527,7 @@ def get_non_existing_dir(tmp_dir): mpu=mpu if not neox_args.is_pipe_parallel else None, ) - if os.environ["OMPI_COMM_WORLD_RANK"] == "0": + if os.environ.get("OMPI_COMM_WORLD_RANK", "1") == "0": os.makedirs(f"{tmp_cache_dir}", exist_ok=True) torch.distributed.barrier() @@ -566,7 +566,7 @@ def get_non_existing_dir(tmp_dir): print("==========================================") convert(hf_model, ckpt_dir=ckpt_dir, output_dir=args.output_dir) - if os.environ["OMPI_COMM_WORLD_RANK"] == "0": + if os.environ.get("OMPI_COMM_WORLD_RANK", "1") == "0": # cleanup temp dir os.system(f"rm -r {tmp_cache_dir}")