`3.test_cases/pytorch/deepspeed/1.build-image.sbatch` (2 additions, 2 deletions)

```diff
@@ -11,8 +11,8 @@
 set -euxo pipefail
 
 # default variables for Enroot, if these variables are defined then use them
-: "${APPS_PATH:=/fsx/apps}"
-: "${IMAGE:=$APPS_PATH/deepspeed.sqsh}"
+: "${CONTAINER_PATH:=/fsxl/containers}"
+: "${IMAGE:=$CONTAINER_PATH/deepspeed.sqsh}"
 
 ENROOT_IMAGE=deepspeed
 docker build -t ${ENROOT_IMAGE} -f 0.deepspeed.dockerfile .
```
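The hunk ends at the `docker build` line, but `IMAGE` points at an Enroot squash file, so the rest of the script presumably converts the freshly built Docker image with `enroot import`. A minimal sketch of that step, assuming the `deepspeed` tag used above:

```bash
# Sketch of the step the hunk cuts off: export the freshly built Docker
# image to the squash file that the Pyxis/Enroot Slurm jobs consume.
rm -f ${IMAGE}   # enroot will not overwrite an existing output file
enroot import -o ${IMAGE} dockerd://${ENROOT_IMAGE}:latest
```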
`3.test_cases/pytorch/deepspeed/README.md` (4 additions, 4 deletions)

````diff
@@ -10,12 +10,12 @@ This guide assumes that you have the following:
 * Docker, [Pyxis](https://github.com/NVIDIA/pyxis) and [Enroot](https://github.com/NVIDIA/enroot) installed.
 * An FSx for Lustre filesystem mounted on `/fsx`.
 
-We recommend that you set up a Slurm cluster using the templates in the architectures [directory](../../1.architectures). You need to set the following environment variables to run these test cases:
+We recommend that you set up a Slurm cluster using the templates in the architectures [directory](../../../1.architectures). You need to set the following environment variables to run these test cases:
 
 ```bash
-export APPS_PATH=/fsx/apps
-export ENROOT_IMAGE=$APPS_PATH/deepspeed.sqsh
-export FSX_PATH=/fsx
+export CONTAINER_PATH=/fsxl/containers
+export ENROOT_IMAGE=$CONTAINER_PATH/deepspeed.sqsh
+export FSX_PATH=/fsxl
 export MODEL_PATH=$FSX_PATH/deepspeed
 export TEST_CASE_PATH=${HOME}/18.deepspeed # where you copy the test case or set to your test case path
 cd $TEST_CASE_PATH # Note that we assume that you are here during the following command executions
````
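Since `TEST_CASE_PATH` is "where you copy the test case", a staging sketch may help; the clone location below is an assumption, and only the destination comes from the variables above:

```bash
# Illustrative staging step: copy this test case to the path the README expects.
git clone https://github.com/aws-samples/awsome-distributed-training.git ~/awsome-distributed-training
mkdir -p ${TEST_CASE_PATH}
cp -r ~/awsome-distributed-training/3.test_cases/pytorch/deepspeed/* ${TEST_CASE_PATH}/
```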
````diff
@@ -6,8 +6,8 @@
 You need to follow the steps in `../README.md` to prepare the AWS-optimized DeepSpeed container. Also, set the following environment variables to run the test cases:
 
 ```bash
-export APPS_PATH=/fsx/apps
-export ENROOT_IMAGE=$APPS_PATH/deepspeed.sqsh
+export CONTAINER_PATH=/fsxl/containers
+export ENROOT_IMAGE=$CONTAINER_PATH/deepspeed.sqsh
 export FSX_PATH=/fsx
 export MODEL_PATH=$FSX_PATH/deepspeed
 export TEST_CASE_PATH=${HOME}/18.deepspeed # where you copy the test case or set to your test case path
````
```diff
@@ -4,9 +4,9 @@
 #SBATCH --output=logs/%x_%j.out # logfile for stdout/stderr
 #SBATCH --nodes 1
 
-: "${APPS_PATH:=/fsx/apps}"
-: "${IMAGE:=$APPS_PATH/deepspeed.sqsh}"
-: "${FSX_PATH:=/fsx}"
+: "${CONTAINER_PATH:=/fsxl/containers}"
+: "${IMAGE:=$CONTAINER_PATH/deepspeed.sqsh}"
+: "${FSX_PATH:=/fsxl}"
 : "${DATASET:=c4_subset}"
 : "${DATA_PATH:=$FSX_PATH/$DATASET}"
 : "${MODEL_PATH:=$FSX_PATH/deepspeed}"
@@ -15,7 +15,8 @@
 
 declare -a ARGS=(
     --container-image ${IMAGE}
-    --container-mounts /fsx
+    --container-mount-home
+    --container-mounts ${FSX_PATH}
 )
 
 srun -l "${ARGS[@]}" python3 ${PWD}/src/convert_llama_weights_to_hf.py \
```
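All of these scripts take their defaults through the same bash idiom, `: "${VAR:=default}"`: the `:` builtin is a no-op, and the `:=` expansion assigns the default only when the variable is unset or empty, so values exported in the shell before `sbatch` win. A small demonstration:

```bash
# The := expansion assigns only when the variable is unset or empty.
unset CONTAINER_PATH
: "${CONTAINER_PATH:=/fsxl/containers}"
echo "${CONTAINER_PATH}"   # prints /fsxl/containers (default applied)

export CONTAINER_PATH=/my/containers
: "${CONTAINER_PATH:=/fsxl/containers}"
echo "${CONTAINER_PATH}"   # prints /my/containers (existing value kept)
```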
```diff
@@ -1,3 +1,3 @@
 #!/bin/bash
 
-sbatch --nodes=1 --job-name=cvtw-mgtds scripts/finetune_llama.sbatch convert
+sbatch --nodes=1 --job-name=cvtw-mgtds finetune_llama.sbatch convert_hf2mds
```
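The conversion has to finish before finetuning starts, since finetuning loads the converted checkpoint. If you want to queue both at once, Slurm job dependencies handle the ordering; a sketch, assuming both one-liners sit in the same directory:

```bash
# Submit the weight conversion, then queue finetuning to start only if it succeeds.
CONVERT_JOB=$(sbatch --parsable --nodes=1 --job-name=cvtw-mgtds finetune_llama.sbatch convert_hf2mds)
sbatch --nodes=1 --dependency=afterok:${CONVERT_JOB} --job-name=finetune-llama finetune_llama.sbatch finetune
```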
```diff
@@ -1,4 +1,4 @@
 
 #!/bin/bash
 
-sbatch --nodes=1 --job-name=finetune-llama scripts/finetune_llama.sbatch finetune
+sbatch --nodes=1 --job-name=finetune-llama finetune_llama.sbatch finetune
```
````diff
@@ -6,9 +6,9 @@ This test case showcase how to finetune Llama2 model from HuuggingFace Weights u
 Set the following environment variables to run the test cases:
 
 ```bash
-export APPS_PATH=/fsx/apps
-export ENROOT_IMAGE=$APPS_PATH/deepspeed.sqsh
-export FSX_PATH=/fsx
+export CONTAINER_PATH=/fsxl/containers
+export ENROOT_IMAGE=$CONTAINER_PATH/deepspeed.sqsh
+export FSX_PATH=/fsxl
 export MODEL_PATH=$FSX_PATH/deepspeed
 export DATA_PATH=$FSX_PATH/alpaca
 ```
@@ -21,8 +21,24 @@ mkdir -p ${DATA_PATH}
 wget https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json -O ${DATA_PATH}/alpaca_data.json
 ```
 
-Llama2 model, which governed by the Meta license and must be downloaded and converted to the standard [Hugging Face](https://huggingface.co/) format prior to running this sample.
-You can submit access request from [here](https://ai.meta.com/resources/models-and-libraries/llama-downloads/), we need "Llama 2 & Llama Chat" to be checked. Use the [download.sh](https://github.com/facebookresearch/llama/blob/main/download.sh) in the official repository. You will be asked to input an URL from the email you recieve from meta.
+The Llama2 model is governed by the Meta license and must be downloaded and converted to the standard [Hugging Face](https://huggingface.co/) format prior to running this sample.
+
+### Option 1: Download from Hugging Face (Recommended)
+1. Visit [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b) on Hugging Face
+2. Accept the license terms and submit an access request (processed hourly)
+3. Install the Hugging Face CLI: `pip install -U "huggingface_hub[cli]"`
+4. Log in to Hugging Face: `huggingface-cli login`
+5. Download the model to your desired location:
+```bash
+hf download meta-llama/Llama-2-7b --local-dir ${MODEL_PATH}/Llama2-meta/7B
+hf download meta-llama/Llama-2-7b tokenizer.model --local-dir ${MODEL_PATH}/Llama2-meta
+```
+
+### Option 2: Download from Meta
+1. Submit an access request from [Meta's Llama downloads page](https://www.llama.com/llama-downloads/)
+2. You will receive an email with a signed download URL (valid for 24 hours)
+3. Use the [download.sh](https://github.com/meta-llama/llama/blob/main/download.sh) script from the official repository
+4. Run `./download.sh` and paste the URL from your email when prompted
 
 We will assume that you have placed the model and tokenizer as follows on the cluster:
````

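Whichever route you take, the downstream converter expects the raw Meta-format checkpoint. A quick sanity check of the layout, assuming the paths from Option 1 (the file names are those of the Meta 7B release):

```bash
# Illustrative check: the 7B Meta checkpoint ships as a single shard
# plus params.json, with the tokenizer one level up.
ls ${MODEL_PATH}/Llama2-meta/7B   # expect consolidated.00.pth, params.json, ...
ls ${MODEL_PATH}/Llama2-meta      # expect tokenizer.model
```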
````diff
@@ -60,7 +76,7 @@ ${MODEL_PATH}/Llama2-7b-hf
 
 Finally, transform the checkpoint into Megatron-DeepSpeed format:
 
-``bash
+```bash
 bash 2.convert-weights-to-mega-ds.sh
 ```
 
````
New file with 154 additions; given the submit one-liners above, this is presumably `finetune_llama.sbatch`:

```bash
#!/bin/bash
#SBATCH --exclusive
#SBATCH --job-name=convert-llama-weights-to-megatron-deepspeed
#SBATCH --output=logs/%x_%j.out # logfile for stdout/stderr
#SBATCH --nodes 1

set -euxo pipefail
: "${CONTAINER_PATH:=/fsxl/containers}"
: "${IMAGE:=$CONTAINER_PATH/deepspeed.sqsh}"
: "${FSX_PATH:=/fsxl}"
: "${DATA_PATH:=$FSX_PATH/alpaca/alpaca_data.json}"
: "${MODEL_PATH:=$FSX_PATH/deepspeed}"
: "${CONTAINER_MOUNT:=$FSX_PATH:$FSX_PATH}"
: "${HF_LLAMA_PATH:=$FSX_PATH/deepspeed/Llama2-7b-hf}"

# Bash arrays cannot be exported; expand the node list locally instead.
NODES=( $(scontrol show hostnames $SLURM_JOB_NODELIST) )
HEAD_NODE=${NODES[0]}
export MASTER_ADDR=$(hostname --ip-address)
export MASTER_PORT=$((RANDOM + 10000))
export NNODES=$SLURM_JOB_NUM_NODES
export NUM_GPUS_PER_NODE=8
## EFA settings
export FI_LOG_LEVEL=1
export FI_PROVIDER=efa # change to eth if you want to use ENA for comparisons
export FI_EFA_USE_HUGE_PAGE=0
# https://discuss.pytorch.org/t/nccl-network-is-unreachable-connection-refused-when-initializing-ddp/137352
# https://github.com/pytorch/pytorch/issues/68893
export NCCL_SOCKET_IFNAME=en
export NCCL_ASYNC_ERROR_HANDLING=1
export OMPI_MCA_plm=^slurm
export MICRO_BATCH_SIZE=16
export GLOBAL_BATCH_SIZE=256
export TP=4
export PP=2
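# With one 8-GPU node, TP*PP = 4*2 = 8 occupies every GPU, so the
# data-parallel degree is (NNODES*NUM_GPUS_PER_NODE)/(TP*PP) = 1 and DeepSpeed
# accumulates GLOBAL_BATCH_SIZE/(MICRO_BATCH_SIZE*1) = 256/16 = 16 micro-batches per step.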
# required to align with the weight dimensions
export HIDDEN_SIZE=4096
export FFN_HIDDEN_SIZE=11008
export NUM_LAYERS=32
export NUM_HEADS=32
export SEQ_LENGTH=512
export MEGA_DS_LLAMA_PATH=${MODEL_PATH}/Llama2-7b-mega-ds-T${TP}P${PP}
cat <<EOF > configs/ds_config.json
{
"train_batch_size": ${GLOBAL_BATCH_SIZE},
"train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE},
"steps_per_print": 100,
"zero_optimization": {
"stage": 0
},
"bf16": {
"enabled": true
}
}
EOF
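# Note on the generated config: ZeRO stage 0 means plain data parallelism
# (no optimizer or gradient state partitioning), bf16 matches the --bf16
# flag passed below, and DeepSpeed checks at startup that train_batch_size
# equals micro_batch * gradient_accumulation_steps * data_parallel_size.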

declare -a ARGS=(
--container-image ${IMAGE}
--container-mounts ${FSX_PATH}
--container-mount-home
)

declare -a DIST_ARGS=(
--nnodes ${NNODES}
--nproc-per-node ${NUM_GPUS_PER_NODE}
--master_addr ${MASTER_ADDR}
--master_port ${MASTER_PORT}
--rdzv_id $RANDOM
--rdzv_backend c10d
--rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT}
)
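# torchrun note: with --rdzv_backend c10d the rendezvous endpoint drives
# process-group setup, so the --master_addr/--master_port pair above is
# effectively redundant here, though harmless.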

declare -a CONVERT_HF2MDS_ARGS=(
--hf-ckpt-num-shards 2
--hf-ckpt-dir $HF_LLAMA_PATH
--load-mode auto
--save $MEGA_DS_LLAMA_PATH
)
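# --hf-ckpt-num-shards 2 matches the two-shard layout of the Llama-2-7b-hf
# checkpoint (pytorch_model-00001-of-00002.bin and -00002-of-00002.bin).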

declare -a CONVERT_MDS2HF_ARGS=(
--hf-ckpt-num-shards 2
--hf-ckpt-dir $HF_LLAMA_PATH
--load-mode auto
--to-hf-ckpt
--load $MEGA_DS_LLAMA_PATH
--save ${HF_LLAMA_PATH}-hf-out
)

declare -a FINETUNE_ARGS=(
--load $MEGA_DS_LLAMA_PATH
)

declare -a COMM_ARGS=(
--tensor-model-parallel-size $TP
--pipeline-model-parallel-size $PP
--lr-warmup-iters 2000
--weight-decay 0.1
--clip-grad 1
--num-layers $NUM_LAYERS
--hidden-size $HIDDEN_SIZE
--num-attention-heads $NUM_HEADS
--ffn-hidden-size $FFN_HIDDEN_SIZE
--attention-dropout 0
--hidden-dropout 0
--no-query-key-layer-scaling
--disable-bias-linear
--normalization rmsnorm
--use-rotary-position-embeddings
--untie-embeddings-and-output-weights
--swiglu
--seq-length $SEQ_LENGTH
--max-position-embeddings $SEQ_LENGTH
--micro-batch-size $MICRO_BATCH_SIZE
--global-batch-size $GLOBAL_BATCH_SIZE
--train-iters 3500
--lr 2e-5
--tensorboard-dir tensorboard_output
--lr-decay-iters 320000
--lr-decay-style cosine
--log-interval 1
--eval-iters 100
--eval-interval 100
--data-path $DATA_PATH
--save-interval 1500
--split 100,0,0
--bf16
--zero-stage 0
--tokenizer-type HFTokenizer
--tokenizer-model $HF_LLAMA_PATH
--deepspeed_config ${PWD}/configs/ds_config.json
--deepspeed
--distributed-backend nccl
--num-workers 0
--no-masked-softmax-fusion
--no-bias-gelu-fusion
--no-bias-dropout-fusion
--no-gradient-accumulation-fusion
--repeated-dataloader
)


if [ "$1" = "convert_hf2mds" ]; then
srun -l "${ARGS[@]}" python3 \
${PWD}/../Megatron-DeepSpeed/tools/hf2megads_weight_converter.py \
"${CONVERT_HF2MDS_ARGS[@]}" "${COMM_ARGS[@]}"
elif [ "$1" = "convert_mds2hf" ]; then
srun -l "${ARGS[@]}" python3 \
${PWD}/../Megatron-DeepSpeed/tools/hf2megads_weight_converter.py \
"${CONVERT_MDS2HF_ARGS[@]}" "${COMM_ARGS[@]}"
else
srun -l "${ARGS[@]}" torchrun "${DIST_ARGS[@]}" \
${PWD}/../Megatron-DeepSpeed/finetune_llama.py \
"${FINETUNE_ARGS[@]}" "${COMM_ARGS[@]}"
fi
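```

The positional argument selects the job's mode. The `else` branch means any unrecognized argument launches finetuning via torchrun; of the invocations below, only the first and last appear in the submit scripts earlier in the diff, while the `convert_mds2hf` line is inferred from the dispatch above:

```bash
sbatch --nodes=1 --job-name=cvtw-mgtds finetune_llama.sbatch convert_hf2mds   # HF checkpoint -> Megatron-DeepSpeed
sbatch --nodes=1 --job-name=mds2hf finetune_llama.sbatch convert_mds2hf       # Megatron-DeepSpeed -> HF checkpoint
sbatch --nodes=1 --job-name=finetune-llama finetune_llama.sbatch finetune     # finetune from the converted weights
```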