diff --git a/3.test_cases/pytorch/deepspeed/1.build-image.sbatch b/3.test_cases/pytorch/deepspeed/1.build-image.sbatch
index ebd7b0b04..632259a37 100644
--- a/3.test_cases/pytorch/deepspeed/1.build-image.sbatch
+++ b/3.test_cases/pytorch/deepspeed/1.build-image.sbatch
@@ -11,8 +11,8 @@ set -euxo pipefail
 # default variables for Enroot, if these variables are defined then use them
-: "${APPS_PATH:=/fsx/apps}"
-: "${IMAGE:=$APPS_PATH/deepspeed.sqsh}"
+: "${CONTAINER_PATH:=/fsxl/containers}"
+: "${IMAGE:=$CONTAINER_PATH/deepspeed.sqsh}"
 ENROOT_IMAGE=deepspeed
 docker build -t ${ENROOT_IMAGE} -f 0.deepspeed.dockerfile .
diff --git a/3.test_cases/pytorch/deepspeed/README.md b/3.test_cases/pytorch/deepspeed/README.md
index fd2ef7524..50ce348d5 100644
--- a/3.test_cases/pytorch/deepspeed/README.md
+++ b/3.test_cases/pytorch/deepspeed/README.md
@@ -10,12 +10,12 @@ This guide assumes that you have the following:
 * Docker, [Pyxis](https://github.com/NVIDIA/pyxis) and [Enroot](https://github.com/NVIDIA/enroot) installed.
 * An FSx for Lustre filesystem mounted on `/fsx`.
-We recommend that you set up a Slurm cluster using the templates in the architectures [directory](../../1.architectures). You need to set the following environment variables to run these test cases:
+We recommend that you set up a Slurm cluster using the templates in the architectures [directory](../../../1.architectures). You need to set the following environment variables to run these test cases:
 ```bash
-export APPS_PATH=/fsx/apps
-export ENROOT_IMAGE=$APPS_PATH/deepspeed.sqsh
-export FSX_PATH=/fsx
+export CONTAINER_PATH=/fsxl/containers
+export ENROOT_IMAGE=$CONTAINER_PATH/deepspeed.sqsh
+export FSX_PATH=/fsxl
 export MODEL_PATH=$FSX_PATH/deepspeed
 export TEST_CASE_PATH=${HOME}/18.deepspeed # where you copy the test case or set to your test case path
 cd $TEST_CASE_PATH # Note that we assume that you are here during the following command executions
diff --git a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/README.md b/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/README.md
index c0405d042..47702ea35 100644
--- a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/README.md
+++ b/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/README.md
@@ -6,8 +6,8 @@ You need to follow steps in `../README.md` to prepare AWS-optimized DeepSpeed container.
 Also, set the following environment variables to run the test cases:
 ```bash
-export APPS_PATH=/fsx/apps
-export ENROOT_IMAGE=$APPS_PATH/deepspeed.sqsh
-export FSX_PATH=/fsx
+export CONTAINER_PATH=/fsxl/containers
+export ENROOT_IMAGE=$CONTAINER_PATH/deepspeed.sqsh
+export FSX_PATH=/fsxl
 export MODEL_PATH=$FSX_PATH/deepspeed
 export TEST_CASE_PATH=${HOME}/18.deepspeed # where you copy the test case or set to your test case path
diff --git a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/1.convert-weights-to-hf.sbatch b/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/1.convert-weights-to-hf.sbatch
index 5eefce6d2..1144a11f3 100644
--- a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/1.convert-weights-to-hf.sbatch
+++ b/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/1.convert-weights-to-hf.sbatch
@@ -4,9 +4,9 @@ #SBATCH --output=logs/%x_%j.out # logfile for stdout/stderr
 #SBATCH --nodes 1
-: "${APPS_PATH:=/fsx/apps}"
-: "${IMAGE:=$APPS_PATH/deepspeed.sqsh}"
-: "${FSX_PATH:=/fsx}"
+: "${CONTAINER_PATH:=/fsxl/containers}"
+: "${IMAGE:=$CONTAINER_PATH/deepspeed.sqsh}"
+: "${FSX_PATH:=/fsxl}"
 : "${DATASET:=c4_subset}"
 : "${DATA_PATH:=$FSX_PATH/$DATASET}"
 : "${MODEL_PATH:=$FSX_PATH/deepspeed}"
@@ -15,7 +15,8 @@ declare -a ARGS=(
     --container-image ${IMAGE}
-    --container-mounts /fsx
+    --container-mount-home
+    --container-mounts ${FSX_PATH}
 )
 srun -l "${ARGS[@]}" python3 ${PWD}/src/convert_llama_weights_to_hf.py \
diff --git a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/2.convert-weights-to-mega-ds.sh b/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/2.convert-weights-to-mega-ds.sh
index b2b4fd224..c21ef8245 100644
--- a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/2.convert-weights-to-mega-ds.sh
+++ b/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/2.convert-weights-to-mega-ds.sh
@@ -1,3 +1,3 @@
 #!/bin/bash
-sbatch --nodes=1 --job-name=cvtw-mgtds scripts/finetune_llama.sbatch convert
\ No newline at end of file
+sbatch --nodes=1 --job-name=cvtw-mgtds finetune_llama.sbatch convert_hf2mds
\ No newline at end of file
diff --git a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/3.finetune-llama.sh b/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/3.finetune-llama.sh
index 546967e86..a7264e7fc 100644
--- a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/3.finetune-llama.sh
+++ b/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/3.finetune-llama.sh
@@ -1,4 +1,4 @@
 #!/bin/bash
-sbatch --nodes=1 --job-name=finetune-llama scripts/finetune_llama.sbatch finetune
\ No newline at end of file
+sbatch --nodes=1 --job-name=finetune-llama finetune_llama.sbatch finetune
\ No newline at end of file
diff --git a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/README.md b/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/README.md
index fa38fb968..965b3b5ac 100644
--- a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/README.md
+++ b/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/README.md
@@ -6,9 +6,9 @@ This test case showcase how to finetune Llama2 model from HuuggingFace Weights u
 Set the following environment variables to run the test cases:
 ```bash
-export APPS_PATH=/fsx/apps
-export ENROOT_IMAGE=$APPS_PATH/deepspeed.sqsh
-export FSX_PATH=/fsx
+export CONTAINER_PATH=/fsxl/containers
+export ENROOT_IMAGE=$CONTAINER_PATH/deepspeed.sqsh
+export FSX_PATH=/fsxl
 export MODEL_PATH=$FSX_PATH/deepspeed
 export DATA_PATH=$FSX_PATH/alpaca
 ```
@@ -21,8 +21,24 @@ mkdir -p ${DATA_PATH}
 wget https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json -O ${DATA_PATH}/alpaca_data.json
 ```
-Llama2 model, which governed by the Meta license and must be downloaded and converted to the standard [Hugging Face](https://huggingface.co/) format prior to running this sample.
-You can submit access request from [here](https://ai.meta.com/resources/models-and-libraries/llama-downloads/), we need "Llama 2 & Llama Chat" to be checked. Use the [download.sh](https://github.com/facebookresearch/llama/blob/main/download.sh) in the official repository. You will be asked to input an URL from the email you recieve from meta.
+The Llama2 model is governed by the Meta license and must be downloaded and converted to the standard [Hugging Face](https://huggingface.co/) format prior to running this sample.
+
+### Option 1: Download from Hugging Face (Recommended)
+1. Visit [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b) on Hugging Face
+2. Accept the license terms and submit an access request (processed hourly)
+3. Install the Hugging Face CLI: `pip install -U "huggingface_hub[cli]"`
+4. Log in to Hugging Face: `huggingface-cli login`
+5. Download the model to your desired location:
+   ```bash
+   hf download meta-llama/Llama-2-7b --local-dir ${MODEL_PATH}/Llama2-meta/7B
+   hf download meta-llama/Llama-2-7b tokenizer.model --local-dir ${MODEL_PATH}/Llama2-meta
+   ```
+
+### Option 2: Download from Meta
+1. Submit an access request on [Meta's Llama downloads page](https://www.llama.com/llama-downloads/)
+2. You will receive an email with a signed download URL (valid for 24 hours)
+3. Use the [download.sh](https://github.com/meta-llama/llama/blob/main/download.sh) script from the official repository
+4. Run `./download.sh` and paste the URL from your email when prompted
 We will assume that you had placed the model and tokenizer as follows on cluster:
@@ -60,7 +76,7 @@ ${MODEL_PATH}/Llama2-7b-hf
 Finally, transforms the checkpoint into Megatron DeepSpeed format:
-``bash
+```bash
 bash 2.convert-weights-to-mega-ds.sh
 ```
diff --git a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/finetune_llama.sbatch b/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/finetune_llama.sbatch
new file mode 100644
index 000000000..32ea9e546
--- /dev/null
+++ b/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/finetune_llama.sbatch
@@ -0,0 +1,154 @@
+#!/bin/bash
+#SBATCH --exclusive
+#SBATCH --job-name=convert-llama-weights-to-megatron-deepspeed
+#SBATCH --output=logs/%x_%j.out # logfile for stdout/stderr
+#SBATCH --nodes 1
+
+set -euxo pipefail
+: "${CONTAINER_PATH:=/fsxl/containers}"
+: "${IMAGE:=$CONTAINER_PATH/deepspeed.sqsh}"
+: "${FSX_PATH:=/fsxl}"
+: "${DATA_PATH:=$FSX_PATH/alpaca/alpaca_data.json}"
+: "${MODEL_PATH:=$FSX_PATH/deepspeed}"
+: "${CONTAINER_MOUNT:=$FSX_PATH:$FSX_PATH}"
+: "${HF_LLAMA_PATH:=$FSX_PATH/deepspeed/Llama2-7b-hf}"
+
+export NODES=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) )
+export NODES_ARRAY=($NODES)
+export HEAD_NODE=${NODES_ARRAY[0]}
+export MASTER_ADDR=$(hostname --ip-address)
+export MASTER_PORT=$((RANDOM + 10000))
+export NNODES=$SLURM_JOB_NUM_NODES
+export NUM_GPUS_PER_NODE=8
+## EFA settings
+export FI_LOG_LEVEL=1
+export FI_PROVIDER=efa # change to eth if you want to use ENA for comparisons
+export FI_EFA_USE_HUGE_PAGE=0
+# https://discuss.pytorch.org/t/nccl-network-is-unreachable-connection-refused-when-initializing-ddp/137352
+# https://github.com/pytorch/pytorch/issues/68893
+export NCCL_SOCKET_IFNAME=en
+export NCCL_ASYNC_ERROR_HANDLING=1
+export OMPI_MCA_plm=^slurm
+export MICRO_BATCH_SIZE=16
+export GLOBAL_BATCH_SIZE=256
+export TP=4
+export PP=2
+# require to align with weight dimensions
+export HIDDEN_SIZE=4096
+export FFN_HIDDEN_SIZE=11008
+export NUM_LAYERS=32
+export NUM_HEADS=32
+export SEQ_LENGTH=512
+export MEGA_DS_LLAMA_PATH=${MODEL_PATH}/Llama2-7b-mega-ds-T${TP}P${PP}
+cat <<EOF > configs/ds_config.json
+{
+  "train_batch_size": ${GLOBAL_BATCH_SIZE},
+  "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE},
+  "steps_per_print": 100,
+  "zero_optimization": {
+    "stage": 0
+  },
+  "bf16": {
+    "enabled": true
+  }
+}
+EOF
+
+declare -a ARGS=(
+    --container-image ${IMAGE}
+    --container-mounts ${FSX_PATH}
+    --container-mount-home
+)
+
+declare -a DIST_ARGS=(
+    --nnodes ${NNODES}
+    --nproc-per-node ${NUM_GPUS_PER_NODE}
+    --master_addr ${MASTER_ADDR}
+    --master_port ${MASTER_PORT}
+    --rdzv_id $RANDOM
+    --rdzv_backend c10d
+    --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT}
+)
+
+declare -a CONVERT_HF2MDS_ARGS=(
+    --hf-ckpt-num-shards 2
+    --hf-ckpt-dir $HF_LLAMA_PATH
+    --load-mode auto
+    --save $MEGA_DS_LLAMA_PATH
+)
+
+declare -a CONVERT_MDS2HF_ARGS=(
+    --hf-ckpt-num-shards 2
+    --hf-ckpt-dir $HF_LLAMA_PATH
+    --load-mode auto
+    --to-hf-ckpt
+    --load $MEGA_DS_LLAMA_PATH
+    --save ${HF_LLAMA_PATH}-hf-out
+)
+
+declare -a FINETUNE_ARGS=(
+    --load $MEGA_DS_LLAMA_PATH
+)
+
+declare -a COMM_ARGS=(
+    --tensor-model-parallel-size $TP
+    --pipeline-model-parallel-size $PP
+    --lr-warmup-iters 2000
+    --weight-decay 0.1
+    --clip-grad 1
+    --num-layers $NUM_LAYERS
+    --hidden-size $HIDDEN_SIZE
+    --num-attention-heads $NUM_HEADS
+    --ffn-hidden-size $FFN_HIDDEN_SIZE
+    --attention-dropout 0
+    --hidden-dropout 0
+    --no-query-key-layer-scaling
+    --disable-bias-linear
+    --normalization rmsnorm
+    --use-rotary-position-embeddings
+    --untie-embeddings-and-output-weights
+    --swiglu
+    --seq-length $SEQ_LENGTH
+    --max-position-embeddings $SEQ_LENGTH
+    --micro-batch-size $MICRO_BATCH_SIZE
+    --global-batch-size $GLOBAL_BATCH_SIZE
+    --train-iters 3500
+    --lr 2e-5
+    --tensorboard-dir tensorboard_output
+    --lr-decay-iters 320000
+    --lr-decay-style cosine
+    --log-interval 1
+    --eval-iters 100
+    --eval-interval 100
+    --data-path $DATA_PATH
+    --save-interval 1500
+    --split 100,0,0
+    --bf16
+    --zero-stage 0
+    --tokenizer-type HFTokenizer
+    --tokenizer-model $HF_LLAMA_PATH
+    --deepspeed_config ${PWD}/configs/ds_config.json
+    --deepspeed
+    --distributed-backend nccl
+    --num-workers 0
+    --no-masked-softmax-fusion
+    --no-bias-gelu-fusion
+    --no-bias-dropout-fusion
+    --no-gradient-accumulation-fusion
+    --repeated-dataloader
+)
+
+
+if [ "$1" = "convert_hf2mds" ]; then
+    srun -l "${ARGS[@]}" python3 \
+        ${PWD}/../Megatron-DeepSpeed/tools/hf2megads_weight_converter.py \
+        "${CONVERT_HF2MDS_ARGS[@]}" "${COMM_ARGS[@]}"
+elif [ "$1" = "convert_mds2hf" ]; then
+    srun -l "${ARGS[@]}" python3 \
+        ${PWD}/../Megatron-DeepSpeed/tools/hf2megads_weight_converter.py \
+        "${CONVERT_MDS2HF_ARGS[@]}" "${COMM_ARGS[@]}"
+else
+    srun -l "${ARGS[@]}" torchrun "${DIST_ARGS[@]}" \
+        ${PWD}/../Megatron-DeepSpeed/finetune_llama.py \
+        "${FINETUNE_ARGS[@]}" "${COMM_ARGS[@]}"
+fi
\ No newline at end of file
diff --git a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/scripts/convert-weights-hf-to-megatron-deepspeed.sh b/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/scripts/convert-weights-hf-to-megatron-deepspeed.sh
deleted file mode 100644
index e69de29bb..000000000
diff --git a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/scripts/finetune_llama.sbatch b/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/scripts/finetune_llama.sbatch
deleted file mode 100644
index 3d0a4a54b..000000000
--- a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/scripts/finetune_llama.sbatch
+++ /dev/null
@@ -1,161 +0,0 @@
-#!/bin/bash
-#SBATCH --exclusive
-#SBATCH --job-name=convert-llama-weights-to-megatron-deepspeed
-#SBATCH --output=logs/%x_%j.out # logfile for stdout/stderr
-#SBATCH --nodes 1
-
-set -euxo pipefail
-: "${APPS_PATH:=/fsx/apps}"
-: "${IMAGE:=$APPS_PATH/deepspeed.sqsh}"
-: "${FSX_PATH:=/fsx}"
-: "${DATA_PATH:=/fsx/alpaca/alpaca_data.json}"
-: "${MODEL_PATH:=$FSX_PATH/deepspeed}"
-: "${CONTAINER_MOUNT:=$FSX_PATH:$FSX_PATH}"
-: "${HF_LLAMA_PATH:=/fsx/deepspeed/Llama2-7b-hf}"
-
-export NODES=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) )
-export NODES_ARRAY=($NODES)
-export HEAD_NODE=${NODES_ARRAY[0]}
-export MASTER_ADDR=$(hostname --ip-address)
-export MASTER_PORT=$((RANDOM + 10000))
-export NNODES=$SLURM_JOB_NUM_NODES
-export NUM_GPUS_PER_NODE=8
-## EFA settings
-export FI_LOG_LEVEL=1
-export FI_PROVIDER=efa # change to eth if you want to use ENA for comparisons
-export FI_EFA_USE_HUGE_PAGE=0
-# https://discuss.pytorch.org/t/nccl-network-is-unreachable-connection-refused-when-initializing-ddp/137352
-# https://github.com/pytorch/pytorch/issues/68893
-export NCCL_SOCKET_IFNAME=en
-export NCCL_ASYNC_ERROR_HANDLING=1
-export OMPI_MCA_plm=^slurm
-export MICRO_BATCH_SIZE=16
-export GLOBAL_BATCH_SIZE=256
-export TP=4
-export PP=2
-# require to align with weight dimensions
-export HIDDEN_SIZE=4096
-export FFN_HIDDEN_SIZE=11008
-export NUM_LAYERS=32
-export NUM_HEADS=32
-export SEQ_LENGTH=512
-export MEGA_DS_LLAMA_PATH=${MODEL_PATH}/Llama2-7b-mega-ds-T${TP}P${PP}
-cat <<EOF > configs/ds_config.json
-{
-  "train_batch_size": ${GLOBAL_BATCH_SIZE},
-  "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE},
-  "steps_per_print": 100,
-  "zero_optimization": {
-    "stage": 0
-  },
-  "bf16": {
-    "enabled": true
-  }
-}
-EOF
-
-export HOSTFILE=/fsx/hostfile
-# create hostfile on the fly
-# https://github.com/microsoft/DeepSpeed/issues/3489
-function makehostfile() {
-perl -e '$slots=split /,/, $ENV{"SLURM_STEP_GPUS"};
-$slots=8 if $slots==0; # workaround 8 gpu machines
-@nodes = split /\n/, qx[scontrol show hostnames $ENV{"SLURM_JOB_NODELIST"}];
-print map { "$b$_ slots=$slots\n" } @nodes'
-}
-makehostfile > ${HOSTFILE}
-
-
-declare -a ARGS=(
-    --container-image ${IMAGE}
-    --container-mounts /fsx,/opt/slurm/bin
-)
-declare -a DIST_ARGS=(
-    --nnodes ${NNODES}
-    --nproc-per-node ${NUM_GPUS_PER_NODE}
-    --master_addr ${MASTER_ADDR}
-    --master_port ${MASTER_PORT}
-    --rdzv_id $RANDOM
-    --rdzv_backend c10d
-    --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT}
-)
-declare -a CONVERT_ARGS=(
-    --hf-ckpt-num-shards 3
-    --origin-hf-ckpt-dir ${MODEL_PATH}/Llama2-7b-hf
-    --save ${MEGA_DS_LLAMA_PATH}
-)
-declare -a COMM_ARGS=(
-    --tensor-model-parallel-size $TP
-    --pipeline-model-parallel-size $PP
-    --lr-warmup-iters 2000
-    --weight-decay 0.1
-    --clip-grad 1
-    --num-layers $NUM_LAYERS
-    --hidden-size $HIDDEN_SIZE
-    --num-attention-heads $NUM_HEADS
-    --ffn-hidden-size $FFN_HIDDEN_SIZE
-    --attention-dropout 0
-    --hidden-dropout 0
-    --no-query-key-layer-scaling
-    --disable-bias-linear
-    --normalization rmsnorm
-    --use-rotary-position-embeddings
-    --untie-embeddings-and-output-weights
-    --swiglu
-    --seq-length $SEQ_LENGTH
-    --max-position-embeddings $SEQ_LENGTH
-    --micro-batch-size $MICRO_BATCH_SIZE
-    --global-batch-size $GLOBAL_BATCH_SIZE
-    --train-iters 3500
-    --lr 2e-5
-    --tensorboard-dir tensorboard_output
-    --lr-decay-iters 320000
-    --lr-decay-style cosine
-    --log-interval 1
-    --eval-iters 100
-    --eval-interval 100
-    --data-path $DATA_PATH
-    --save-interval 1500
-    --split 100,0,0
-    --bf16
-    --zero-stage 0
-    --tokenizer-type HFTokenizer
-    --tokenizer-model $HF_LLAMA_PATH
-    --deepspeed_config ${PWD}/configs/ds_config.json
-    --deepspeed
-    --distributed-backend nccl
-    --num-workers 0
-    --no-masked-softmax-fusion
-    --no-bias-gelu-fusion
-    --no-bias-dropout-fusion
-    --no-gradient-accumulation-fusion
-    --repeated-dataloader
-)
-
-if [ "$1" = "convert" ]; then
-    srun -l "${ARGS[@]}" torchrun "${DIST_ARGS[@]}" \
-        ${PWD}/../Megatron-DeepSpeed/tools/hf2megads_weight_converter.py \
-        "${CONVERT_ARGS[@]}" "${COMM_ARGS[@]}"
-else
-    srun -l "${ARGS[@]}" torchrun "${DIST_ARGS[@]}" \
-        ${PWD}/../Megatron-DeepSpeed/finetune_llama.py \
-        --load ${MEGA_DS_LLAMA_PATH} "${COMM_ARGS[@]}"
-    # function run_deepspeed() {
-    #     srun --nodelist=${NODE} --ntasks=1 -l "${ARGS[@]}" deepspeed "${DIST_ARGS[@]}" \
-    #         ${PWD}/../Megatron-DeepSpeed/finetune_llama.py \
-    #         --load ${MEGA_DS_LLAMA_PATH} "${COMM_ARGS[@]}"
-    # }
-    # # run deepspeed
-    # NODE_RANK=1
-    # for (( NODE_RANK=1; NODE_RANK<${NNODES}; NODE_RANK++ ))
-    # do
-    #     NODE=${NODES[$NODE_RANK]}
-    #     echo "Run compute node ${NODE} for rank: ${NODE_RANK}"
-    #     run_deepspeed &
-    # done
-    # NODE_RANK=0
-    # NODE=${HEAD_NODE}
-    # echo "Run main node ${NODE} for rank: ${NODE_RANK}"
-    # run_deepspeed
-    # wait
-fi
diff --git a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/scripts/finetune_llama.sh b/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/scripts/finetune_llama.sh
deleted file mode 100644
index d0acbcf34..000000000
--- a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/scripts/finetune_llama.sh
+++ /dev/null
@@ -1,94 +0,0 @@
-# The code is adopted from https://github.com/microsoft/Megatron-DeepSpeed/blob/main/examples_deepspeed/finetune_hf_llama/finetune_llama.sh
-DS_CONFIG=./examples_deepspeed/finetune_hf_llama/ds_config.json
-DATASET_PATH=./alpaca_data.json
-# dataset link: https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json
-
-# weights link: https://huggingface.co/huggyllama/llama-7b
-
-MICRO_BATCH_SIZE=16
-GLOBAL_BATCH_SIZE=256
-TP=2
-PP=2
-# require to align with weight dimensions
-HIDDEN_SIZE=4096
-FFN_HIDDEN_SIZE=11008
-NUM_LAYERS=32
-NUM_HEADS=32
-SEQ_LENGTH=512
-######################################
-
-MEGA_DS_LLAMA_PATH=./"llama-7b-mega-ds-T${TP}P${PP}"
-
-# Below configuration required for llama model as per llama paper
-# --no-query-key-layer-scaling \
-# --attention-dropout 0 \
-# --hidden-dropout 0 \
-# --use-rotary-position-embeddings \
-# --untie-embeddings-and-output-weights \
-# --swiglu \
-# --normalization rmsnorm \
-# --disable-bias-linear \
-######################################
-covert_args="deepspeed tools/hf2megads_weight_converter.py \
---hf-ckpt-num-shards 2 \
---origin-hf-ckpt-dir $HF_LLAMA_PATH \
---save $MEGA_DS_LLAMA_PATH"
-
-finetune_args="deepspeed finetune_llama.py \
---load $MEGA_DS_LLAMA_PATH"
-
-comm_args="--tensor-model-parallel-size $TP \
---pipeline-model-parallel-size $PP \
---lr-warmup-iters 2000 \
---weight-decay 0.1 \
---clip-grad 1 \
---num-layers $NUM_LAYERS \
---hidden-size $HIDDEN_SIZE \
---num-attention-heads $NUM_HEADS \
---ffn-hidden-size $FFN_HIDDEN_SIZE \
---attention-dropout 0 \
---hidden-dropout 0 \
---no-query-key-layer-scaling \
---disable-bias-linear \
---normalization rmsnorm \
---use-rotary-position-embeddings \
---untie-embeddings-and-output-weights \
---swiglu \
---seq-length $SEQ_LENGTH \
---max-position-embeddings $SEQ_LENGTH \
---micro-batch-size $MICRO_BATCH_SIZE \
---global-batch-size $GLOBAL_BATCH_SIZE \
---train-iters 3500 \
---lr 2e-5 \
---tensorboard-dir tensorboard_output \
---lr-decay-iters 320000 \
---lr-decay-style cosine \
---log-interval 1 \
---eval-iters 100 \
---eval-interval 100 \
---data-path $DATASET_PATH \
---save-interval 1500 \
---split 100,0,0 \
---bf16 \
---zero-stage 0 \
---tokenizer-type HFTokenizer \
---tokenizer-model $HF_LLAMA_PATH \
---deepspeed_config ./examples_deepspeed/finetune_hf_llama/ds_config.json \
---deepspeed \
---distributed-backend nccl \
---num-workers 0 \
---no-masked-softmax-fusion \
---no-bias-gelu-fusion \
---no-bias-dropout-fusion \
---no-gradient-accumulation-fusion \
---repeated-dataloader"
-
-if [ "$1" = "convert" ]; then
-    task_args="$covert_args"
-else
-    task_args="$finetune_args"
-fi
-
-full_cmd="$task_args $comm_args"
-
-eval "$full_cmd"
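
Taken together, the renamed scripts give the following end-to-end flow for this test case. The sketch below is illustrative rather than part of the patch: it assumes the `/fsxl` defaults introduced above, that step 1 is run from `3.test_cases/pytorch/deepspeed` and the remaining steps from `examples_megatron_deepspeed/finetune_hf_llama`, and that the job name in the last step is arbitrary.

```bash
# Minimal walkthrough of the updated test case (paths and job names are assumptions).
export CONTAINER_PATH=/fsxl/containers
export ENROOT_IMAGE=$CONTAINER_PATH/deepspeed.sqsh
export FSX_PATH=/fsxl
export MODEL_PATH=$FSX_PATH/deepspeed
export DATA_PATH=$FSX_PATH/alpaca

# 1. Build the DeepSpeed container and import it as an Enroot squashfs.
sbatch 1.build-image.sbatch

# The finetune_hf_llama sbatch scripts write their output to logs/%x_%j.out.
mkdir -p logs

# 2. Convert the Meta checkpoint to Hugging Face format.
sbatch 1.convert-weights-to-hf.sbatch

# 3. Convert the Hugging Face checkpoint to Megatron-DeepSpeed format
#    (submits: finetune_llama.sbatch convert_hf2mds).
bash 2.convert-weights-to-mega-ds.sh

# 4. Fine-tune on the Alpaca dataset (submits: finetune_llama.sbatch finetune).
bash 3.finetune-llama.sh

# 5. Optionally convert the fine-tuned checkpoint back to Hugging Face format;
#    the job name here is only an example.
sbatch --nodes=1 --job-name=cvt-mds2hf finetune_llama.sbatch convert_mds2hf
```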