diff --git a/3.test_cases/pytorch/deepspeed/1.build-image.sbatch b/3.test_cases/pytorch/deepspeed/1.build-image.sbatch
index ebd7b0b04..632259a37 100644
--- a/3.test_cases/pytorch/deepspeed/1.build-image.sbatch
+++ b/3.test_cases/pytorch/deepspeed/1.build-image.sbatch
@@ -11,8 +11,8 @@ set -euxo pipefail
 # default variables for Enroot, if these variables are defined then use them
-: "${APPS_PATH:=/fsx/apps}"
-: "${IMAGE:=$APPS_PATH/deepspeed.sqsh}"
+: "${CONTAINER_PATH:=/fsxl/containers}"
+: "${IMAGE:=$CONTAINER_PATH/deepspeed.sqsh}"
 ENROOT_IMAGE=deepspeed
 docker build -t ${ENROOT_IMAGE} -f 0.deepspeed.dockerfile .
diff --git a/3.test_cases/pytorch/deepspeed/README.md b/3.test_cases/pytorch/deepspeed/README.md
index fd2ef7524..50ce348d5 100644
--- a/3.test_cases/pytorch/deepspeed/README.md
+++ b/3.test_cases/pytorch/deepspeed/README.md
@@ -10,12 +10,12 @@ This guide assumes that you have the following:
 * Docker, [Pyxis](https://github.com/NVIDIA/pyxis) and [Enroot](https://github.com/NVIDIA/enroot) installed.
 * An FSx for Lustre filesystem mounted on `/fsx`.
-We recommend that you set up a Slurm cluster using the templates in the architectures [directory](../../1.architectures). You need to set the following environment variables to run these test cases:
+We recommend that you set up a Slurm cluster using the templates in the architectures [directory](../../../1.architectures). You need to set the following environment variables to run these test cases:
 ```bash
-export APPS_PATH=/fsx/apps
-export ENROOT_IMAGE=$APPS_PATH/deepspeed.sqsh
-export FSX_PATH=/fsx
+export CONTAINER_PATH=/fsxl/containers
+export ENROOT_IMAGE=$CONTAINER_PATH/deepspeed.sqsh
+export FSX_PATH=/fsxl
 export MODEL_PATH=$FSX_PATH/deepspeed
 export TEST_CASE_PATH=${HOME}/18.deepspeed # where you copy the test case or set to your test case path
 cd $TEST_CASE_PATH # Note that we assume that you are here during the following command executions
diff --git a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/README.md b/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/README.md
index c0405d042..47702ea35 100644
--- a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/README.md
+++ b/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/README.md
@@ -6,8 +6,8 @@ You need to follow steps in `../README.md` to prepare AWS-optimized DeepSpeed container.
 Also, set the following environment variables to run the test cases:
 ```bash
-export APPS_PATH=/fsx/apps
-export ENROOT_IMAGE=$APPS_PATH/deepspeed.sqsh
-export FSX_PATH=/fsx
+export CONTAINER_PATH=/fsxl/containers
+export ENROOT_IMAGE=$CONTAINER_PATH/deepspeed.sqsh
+export FSX_PATH=/fsxl
 export MODEL_PATH=$FSX_PATH/deepspeed
 export TEST_CASE_PATH=${HOME}/18.deepspeed # where you copy the test case or set to your test case path
diff --git a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/1.convert-weights-to-hf.sbatch b/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/1.convert-weights-to-hf.sbatch
index 5eefce6d2..1144a11f3 100644
--- a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/1.convert-weights-to-hf.sbatch
+++ b/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/1.convert-weights-to-hf.sbatch
@@ -4,9 +4,9 @@ #SBATCH --output=logs/%x_%j.out # logfile for stdout/stderr
 #SBATCH --nodes 1
-: "${APPS_PATH:=/fsx/apps}"
-: "${IMAGE:=$APPS_PATH/deepspeed.sqsh}"
-: "${FSX_PATH:=/fsx}"
+: "${CONTAINER_PATH:=/fsxl/containers}"
+: "${IMAGE:=$CONTAINER_PATH/deepspeed.sqsh}"
+: "${FSX_PATH:=/fsxl}"
 : "${DATASET:=c4_subset}"
 : "${DATA_PATH:=$FSX_PATH/$DATASET}"
 : "${MODEL_PATH:=$FSX_PATH/deepspeed}"
@@ -15,7 +15,8 @@ declare -a ARGS=(
     --container-image ${IMAGE}
-    --container-mounts /fsx
+    --container-mount-home
+    --container-mounts ${FSX_PATH}
 )
 srun -l "${ARGS[@]}" python3 ${PWD}/src/convert_llama_weights_to_hf.py \
diff --git a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/2.convert-weights-to-mega-ds.sh b/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/2.convert-weights-to-mega-ds.sh
index b2b4fd224..c21ef8245 100644
--- a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/2.convert-weights-to-mega-ds.sh
+++ b/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/2.convert-weights-to-mega-ds.sh
@@ -1,3 +1,3 @@
 #!/bin/bash
-sbatch --nodes=1 --job-name=cvtw-mgtds scripts/finetune_llama.sbatch convert
\ No newline at end of file
+sbatch --nodes=1 --job-name=cvtw-mgtds finetune_llama.sbatch convert_hf2mds
\ No newline at end of file
diff --git a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/3.finetune-llama.sh b/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/3.finetune-llama.sh
index 546967e86..a7264e7fc 100644
--- a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/3.finetune-llama.sh
+++ b/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/3.finetune-llama.sh
@@ -1,4 +1,4 @@
 #!/bin/bash
-sbatch --nodes=1 --job-name=finetune-llama scripts/finetune_llama.sbatch finetune
\ No newline at end of file
+sbatch --nodes=1 --job-name=finetune-llama finetune_llama.sbatch finetune
\ No newline at end of file
diff --git a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/README.md b/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/README.md
index fa38fb968..965b3b5ac 100644
--- a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/README.md
+++ b/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/README.md
@@ -6,9 +6,9 @@ This test case showcase how to finetune Llama2 model from HuuggingFace Weights u
 Set the following environment variables to run the test cases:
 ```bash
-export APPS_PATH=/fsx/apps
-export ENROOT_IMAGE=$APPS_PATH/deepspeed.sqsh
-export FSX_PATH=/fsx
+export CONTAINER_PATH=/fsxl/containers
+export ENROOT_IMAGE=$CONTAINER_PATH/deepspeed.sqsh
+export FSX_PATH=/fsxl
 export MODEL_PATH=$FSX_PATH/deepspeed
 export DATA_PATH=$FSX_PATH/alpaca
 ```
@@ -21,8 +21,24 @@ mkdir -p ${DATA_PATH}
 wget https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json -O ${DATA_PATH}/alpaca_data.json
 ```
-Llama2 model, which governed by the Meta license and must be downloaded and converted to the standard [Hugging Face](https://huggingface.co/) format prior to running this sample.
-You can submit access request from [here](https://ai.meta.com/resources/models-and-libraries/llama-downloads/), we need "Llama 2 & Llama Chat" to be checked. Use the [download.sh](https://github.com/facebookresearch/llama/blob/main/download.sh) in the official repository. You will be asked to input an URL from the email you recieve from meta.
+The Llama2 model is governed by the Meta license and must be downloaded and converted to the standard [Hugging Face](https://huggingface.co/) format prior to running this sample.
+
+### Option 1: Download from Hugging Face (Recommended)
+1. Visit [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b) on Hugging Face
+2. Accept the license terms and submit an access request (processed hourly)
+3. Install the Hugging Face CLI: `pip install -U "huggingface_hub[cli]"`
+4. Log in to Hugging Face: `huggingface-cli login`
+5. Download the model to your desired location:
+   ```bash
+   hf download meta-llama/Llama-2-7b --local-dir ${MODEL_PATH}/Llama2-meta/7B
+   hf download meta-llama/Llama-2-7b tokenizer.model --local-dir ${MODEL_PATH}/Llama2-meta
+   ```
+
+### Option 2: Download from Meta
+1. Submit an access request on [Meta's Llama downloads page](https://www.llama.com/llama-downloads/)
+2. You will receive an email with a signed download URL (valid for 24 hours)
+3. Use the [download.sh](https://github.com/meta-llama/llama/blob/main/download.sh) script from the official repository
+4. Run `./download.sh` and paste the URL from your email when prompted
 We will assume that you had placed the model and tokenizer as follows on cluster:
@@ -60,7 +76,7 @@ ${MODEL_PATH}/Llama2-7b-hf
 Finally, transforms the checkpoint into Megatron DeepSpeed format:
-``bash
+```bash
 bash 2.convert-weights-to-mega-ds.sh
 ```
diff --git a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/finetune_llama.sbatch b/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/finetune_llama.sbatch
new file mode 100644
index 000000000..32ea9e546
--- /dev/null
+++ b/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/finetune_llama.sbatch
@@ -0,0 +1,154 @@
+#!/bin/bash
+#SBATCH --exclusive
+#SBATCH --job-name=convert-llama-weights-to-megatron-deepspeed
+#SBATCH --output=logs/%x_%j.out # logfile for stdout/stderr
+#SBATCH --nodes 1
+
+set -euxo pipefail
+: "${CONTAINER_PATH:=/fsxl/containers}"
+: "${IMAGE:=$CONTAINER_PATH/deepspeed.sqsh}"
+: "${FSX_PATH:=/fsxl}"
+: "${DATA_PATH:=$FSX_PATH/alpaca/alpaca_data.json}"
+: "${MODEL_PATH:=$FSX_PATH/deepspeed}"
+: "${CONTAINER_MOUNT:=$FSX_PATH:$FSX_PATH}"
+: "${HF_LLAMA_PATH:=$FSX_PATH/deepspeed/Llama2-7b-hf}"
+
+export NODES=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) )
+export NODES_ARRAY=($NODES)
+export HEAD_NODE=${NODES_ARRAY[0]}
+export MASTER_ADDR=$(hostname --ip-address)
+export MASTER_PORT=$((RANDOM + 10000))
+export NNODES=$SLURM_JOB_NUM_NODES
+export NUM_GPUS_PER_NODE=8
+## EFA settings
+export FI_LOG_LEVEL=1
+export FI_PROVIDER=efa # change to eth if you want to use ENA for comparisons
+export FI_EFA_USE_HUGE_PAGE=0
+# https://discuss.pytorch.org/t/nccl-network-is-unreachable-connection-refused-when-initializing-ddp/137352
+# https://github.com/pytorch/pytorch/issues/68893
+export NCCL_SOCKET_IFNAME=en
+export NCCL_ASYNC_ERROR_HANDLING=1
+export OMPI_MCA_plm=^slurm
+export MICRO_BATCH_SIZE=16
+export GLOBAL_BATCH_SIZE=256
+export TP=4
+export PP=2
+# require to align with weight dimensions
+export HIDDEN_SIZE=4096
+export FFN_HIDDEN_SIZE=11008
+export NUM_LAYERS=32
+export NUM_HEADS=32
+export SEQ_LENGTH=512
+export MEGA_DS_LLAMA_PATH=${MODEL_PATH}/Llama2-7b-mega-ds-T${TP}P${PP}
+cat <<EOF > configs/ds_config.json
+{
+  "train_batch_size": ${GLOBAL_BATCH_SIZE},
+  "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE},
+  "steps_per_print": 100,
+  "zero_optimization": {
+    "stage": 0
+  },
+  "bf16": {
+    "enabled": true
+  }
+}
+EOF
+
+declare -a ARGS=(
+    --container-image ${IMAGE}
+    --container-mounts ${FSX_PATH}
+    --container-mount-home
+)
+
+declare -a DIST_ARGS=(
+    --nnodes ${NNODES}
+    --nproc-per-node ${NUM_GPUS_PER_NODE}
+    --master_addr ${MASTER_ADDR}
+    --master_port ${MASTER_PORT}
+    --rdzv_id $RANDOM
+    --rdzv_backend c10d
+    --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT}
+)
+
+declare -a CONVERT_HF2MDS_ARGS=(
+    --hf-ckpt-num-shards 2
+    --hf-ckpt-dir $HF_LLAMA_PATH
+    --load-mode auto
+    --save $MEGA_DS_LLAMA_PATH
+)
+
+declare -a CONVERT_MDS2HF_ARGS=(
+    --hf-ckpt-num-shards 2
+    --hf-ckpt-dir $HF_LLAMA_PATH
+    --load-mode auto
+    --to-hf-ckpt
+    --load $MEGA_DS_LLAMA_PATH
+    --save ${HF_LLAMA_PATH}-hf-out
+)
+
+declare -a FINETUNE_ARGS=(
+    --load $MEGA_DS_LLAMA_PATH
+)
+
+declare -a COMM_ARGS=(
+    --tensor-model-parallel-size $TP
+    --pipeline-model-parallel-size $PP
+    --lr-warmup-iters 2000
+    --weight-decay 0.1
+    --clip-grad 1
+    --num-layers $NUM_LAYERS
+    --hidden-size $HIDDEN_SIZE
+    --num-attention-heads $NUM_HEADS
+    --ffn-hidden-size $FFN_HIDDEN_SIZE
+    --attention-dropout 0
+    --hidden-dropout 0
+    --no-query-key-layer-scaling
+    --disable-bias-linear
+    --normalization rmsnorm
+    --use-rotary-position-embeddings
+    --untie-embeddings-and-output-weights
+    --swiglu
+    --seq-length $SEQ_LENGTH
+    --max-position-embeddings $SEQ_LENGTH
+    --micro-batch-size $MICRO_BATCH_SIZE
+    --global-batch-size $GLOBAL_BATCH_SIZE
+    --train-iters 3500
+    --lr 2e-5
+    --tensorboard-dir tensorboard_output
+    --lr-decay-iters 320000
+    --lr-decay-style cosine
+    --log-interval 1
+    --eval-iters 100
+    --eval-interval 100
+    --data-path $DATA_PATH
+    --save-interval 1500
+    --split 100,0,0
+    --bf16
+    --zero-stage 0
+    --tokenizer-type HFTokenizer
+    --tokenizer-model $HF_LLAMA_PATH
+    --deepspeed_config ${PWD}/configs/ds_config.json
+    --deepspeed
+    --distributed-backend nccl
+    --num-workers 0
+    --no-masked-softmax-fusion
+    --no-bias-gelu-fusion
+    --no-bias-dropout-fusion
+    --no-gradient-accumulation-fusion
+    --repeated-dataloader
+)
+
+
+if [ "$1" = "convert_hf2mds" ]; then
+    srun -l "${ARGS[@]}" python3 \
+        ${PWD}/../Megatron-DeepSpeed/tools/hf2megads_weight_converter.py \
+        "${CONVERT_HF2MDS_ARGS[@]}" "${COMM_ARGS[@]}"
+elif [ "$1" = "convert_mds2hf" ]; then
+    srun -l "${ARGS[@]}" python3 \
+        ${PWD}/../Megatron-DeepSpeed/tools/hf2megads_weight_converter.py \
+        "${CONVERT_MDS2HF_ARGS[@]}" "${COMM_ARGS[@]}"
+else
+    srun -l "${ARGS[@]}" torchrun "${DIST_ARGS[@]}" \
+        ${PWD}/../Megatron-DeepSpeed/finetune_llama.py \
+        "${FINETUNE_ARGS[@]}" "${COMM_ARGS[@]}"
+fi
\ No newline at end of file
diff --git a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/scripts/convert-weights-hf-to-megatron-deepspeed.sh b/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/scripts/convert-weights-hf-to-megatron-deepspeed.sh
deleted file mode 100644
index e69de29bb..000000000
diff --git a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/scripts/finetune_llama.sbatch b/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/scripts/finetune_llama.sbatch
deleted file mode 100644
index 3d0a4a54b..000000000
--- a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/scripts/finetune_llama.sbatch
+++ /dev/null
@@ -1,161 +0,0 @@
-#!/bin/bash
-#SBATCH --exclusive
-#SBATCH --job-name=convert-llama-weights-to-megatron-deepspeed
-#SBATCH --output=logs/%x_%j.out # logfile for stdout/stderr
-#SBATCH --nodes 1
-
-set -euxo pipefail
-: "${APPS_PATH:=/fsx/apps}"
-: "${IMAGE:=$APPS_PATH/deepspeed.sqsh}"
-: "${FSX_PATH:=/fsx}"
-: "${DATA_PATH:=/fsx/alpaca/alpaca_data.json}"
-: "${MODEL_PATH:=$FSX_PATH/deepspeed}"
-: "${CONTAINER_MOUNT:=$FSX_PATH:$FSX_PATH}"
-: "${HF_LLAMA_PATH:=/fsx/deepspeed/Llama2-7b-hf}"
-
-export NODES=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) )
-export NODES_ARRAY=($NODES)
-export HEAD_NODE=${NODES_ARRAY[0]}
-export MASTER_ADDR=$(hostname --ip-address)
-export MASTER_PORT=$((RANDOM + 10000))
-export NNODES=$SLURM_JOB_NUM_NODES
-export NUM_GPUS_PER_NODE=8
-## EFA settings
-export FI_LOG_LEVEL=1
-export FI_PROVIDER=efa # change to eth if you want to use ENA for comparisons
-export FI_EFA_USE_HUGE_PAGE=0
-# https://discuss.pytorch.org/t/nccl-network-is-unreachable-connection-refused-when-initializing-ddp/137352
-# https://github.com/pytorch/pytorch/issues/68893
-export NCCL_SOCKET_IFNAME=en
-export NCCL_ASYNC_ERROR_HANDLING=1
-export OMPI_MCA_plm=^slurm
-export MICRO_BATCH_SIZE=16
-export GLOBAL_BATCH_SIZE=256
-export TP=4
-export PP=2
-# require to align with weight dimensions
-export HIDDEN_SIZE=4096
-export FFN_HIDDEN_SIZE=11008
-export NUM_LAYERS=32
-export NUM_HEADS=32
-export SEQ_LENGTH=512
-export MEGA_DS_LLAMA_PATH=${MODEL_PATH}/Llama2-7b-mega-ds-T${TP}P${PP}
-cat <<EOF > configs/ds_config.json
-{
-  "train_batch_size": ${GLOBAL_BATCH_SIZE},
-  "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE},
-  "steps_per_print": 100,
-  "zero_optimization": {
-    "stage": 0
-  },
-  "bf16": {
-    "enabled": true
-  }
-}
-EOF
-
-export HOSTFILE=/fsx/hostfile
-# create hostfile on the fly
-# https://github.com/microsoft/DeepSpeed/issues/3489
-function makehostfile() {
-perl -e '$slots=split /,/, $ENV{"SLURM_STEP_GPUS"};
-$slots=8 if $slots==0; # workaround 8 gpu machines
-@nodes = split /\n/, qx[scontrol show hostnames $ENV{"SLURM_JOB_NODELIST"}];
-print map { "$b$_ slots=$slots\n" } @nodes'
-}
-makehostfile > ${HOSTFILE}
-
-
-declare -a ARGS=(
-    --container-image ${IMAGE}
-    --container-mounts /fsx,/opt/slurm/bin
-)
-declare -a DIST_ARGS=(
-    --nnodes ${NNODES}
-    --nproc-per-node ${NUM_GPUS_PER_NODE}
-    --master_addr ${MASTER_ADDR}
-    --master_port ${MASTER_PORT}
-    --rdzv_id $RANDOM
-    --rdzv_backend c10d
-    --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT}
-)
-declare -a CONVERT_ARGS=(
-    --hf-ckpt-num-shards 3
-    --origin-hf-ckpt-dir ${MODEL_PATH}/Llama2-7b-hf
-    --save ${MEGA_DS_LLAMA_PATH}
-)
-declare -a COMM_ARGS=(
-    --tensor-model-parallel-size $TP
-    --pipeline-model-parallel-size $PP
-    --lr-warmup-iters 2000
-    --weight-decay 0.1
-    --clip-grad 1
-    --num-layers $NUM_LAYERS
-    --hidden-size $HIDDEN_SIZE
-    --num-attention-heads $NUM_HEADS
-    --ffn-hidden-size $FFN_HIDDEN_SIZE
-    --attention-dropout 0
-    --hidden-dropout 0
-    --no-query-key-layer-scaling
-    --disable-bias-linear
-    --normalization rmsnorm
-    --use-rotary-position-embeddings
-    --untie-embeddings-and-output-weights
-    --swiglu
-    --seq-length $SEQ_LENGTH
-    --max-position-embeddings $SEQ_LENGTH
-    --micro-batch-size $MICRO_BATCH_SIZE
-    --global-batch-size $GLOBAL_BATCH_SIZE
-    --train-iters 3500
-    --lr 2e-5
-    --tensorboard-dir tensorboard_output
-    --lr-decay-iters 320000
-    --lr-decay-style cosine
-    --log-interval 1
-    --eval-iters 100
-    --eval-interval 100
-    --data-path $DATA_PATH
-    --save-interval 1500
-    --split 100,0,0
-    --bf16
-    --zero-stage 0
-    --tokenizer-type HFTokenizer
-    --tokenizer-model $HF_LLAMA_PATH
-    --deepspeed_config ${PWD}/configs/ds_config.json
-    --deepspeed
-    --distributed-backend nccl
-    --num-workers 0
-    --no-masked-softmax-fusion
-    --no-bias-gelu-fusion
-    --no-bias-dropout-fusion
-    --no-gradient-accumulation-fusion
-    --repeated-dataloader
-)
-
-if [ "$1" = "convert" ]; then
-    srun -l "${ARGS[@]}" torchrun "${DIST_ARGS[@]}" \
-        ${PWD}/../Megatron-DeepSpeed/tools/hf2megads_weight_converter.py \
-        "${CONVERT_ARGS[@]}" "${COMM_ARGS[@]}"
-else
-    srun -l "${ARGS[@]}" torchrun "${DIST_ARGS[@]}" \
-        ${PWD}/../Megatron-DeepSpeed/finetune_llama.py \
-        --load ${MEGA_DS_LLAMA_PATH} "${COMM_ARGS[@]}"
-    # function run_deepspeed() {
-    #     srun --nodelist=${NODE} --ntasks=1 -l "${ARGS[@]}" deepspeed "${DIST_ARGS[@]}" \
-    #         ${PWD}/../Megatron-DeepSpeed/finetune_llama.py \
-    #         --load ${MEGA_DS_LLAMA_PATH} "${COMM_ARGS[@]}"
-    # }
-    # # run deepspeed
-    # NODE_RANK=1
-    # for (( NODE_RANK=1; NODE_RANK<${NNODES}; NODE_RANK++ ))
-    # do
-    #     NODE=${NODES[$NODE_RANK]}
-    #     echo "Run compute node ${NODE} for rank: ${NODE_RANK}"
-    #     run_deepspeed &
-    # done
-    # NODE_RANK=0
-    # NODE=${HEAD_NODE}
-    # echo "Run main node ${NODE} for rank: ${NODE_RANK}"
-    # run_deepspeed
-    # wait
-fi
diff --git a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/scripts/finetune_llama.sh b/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/scripts/finetune_llama.sh
deleted file mode 100644
index d0acbcf34..000000000
--- a/3.test_cases/pytorch/deepspeed/examples_megatron_deepspeed/finetune_hf_llama/scripts/finetune_llama.sh
+++ /dev/null
@@ -1,94 +0,0 @@
-# The code is adopted from https://github.com/microsoft/Megatron-DeepSpeed/blob/main/examples_deepspeed/finetune_hf_llama/finetune_llama.sh
-DS_CONFIG=./examples_deepspeed/finetune_hf_llama/ds_config.json
-DATASET_PATH=./alpaca_data.json
-# dataset link: https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json
-
-# weights link: https://huggingface.co/huggyllama/llama-7b
-
-MICRO_BATCH_SIZE=16
-GLOBAL_BATCH_SIZE=256
-TP=2
-PP=2
-# require to align with weight dimensions
-HIDDEN_SIZE=4096
-FFN_HIDDEN_SIZE=11008
-NUM_LAYERS=32
-NUM_HEADS=32
-SEQ_LENGTH=512
-######################################
-
-MEGA_DS_LLAMA_PATH=./"llama-7b-mega-ds-T${TP}P${PP}"
-
-# Below configuration required for llama model as per llama paper
-# --no-query-key-layer-scaling \
-# --attention-dropout 0 \
-# --hidden-dropout 0 \
-# --use-rotary-position-embeddings \
-# --untie-embeddings-and-output-weights \
-# --swiglu \
-# --normalization rmsnorm \
-# --disable-bias-linear \
-######################################
-covert_args="deepspeed tools/hf2megads_weight_converter.py \
---hf-ckpt-num-shards 2 \
---origin-hf-ckpt-dir $HF_LLAMA_PATH \
---save $MEGA_DS_LLAMA_PATH"
-
-finetune_args="deepspeed finetune_llama.py \
---load $MEGA_DS_LLAMA_PATH"
-
-comm_args="--tensor-model-parallel-size $TP \
---pipeline-model-parallel-size $PP \
---lr-warmup-iters 2000 \
---weight-decay 0.1 \
---clip-grad 1 \
---num-layers $NUM_LAYERS \
---hidden-size $HIDDEN_SIZE \
---num-attention-heads $NUM_HEADS \
---ffn-hidden-size $FFN_HIDDEN_SIZE \
---attention-dropout 0 \
---hidden-dropout 0 \
---no-query-key-layer-scaling \
---disable-bias-linear \
---normalization rmsnorm \
---use-rotary-position-embeddings \
---untie-embeddings-and-output-weights \
---swiglu \
---seq-length $SEQ_LENGTH \
---max-position-embeddings $SEQ_LENGTH \
---micro-batch-size $MICRO_BATCH_SIZE \
---global-batch-size $GLOBAL_BATCH_SIZE \
---train-iters 3500 \
---lr 2e-5 \
---tensorboard-dir tensorboard_output \
---lr-decay-iters 320000 \
---lr-decay-style cosine \
---log-interval 1 \
---eval-iters 100 \
---eval-interval 100 \
---data-path $DATASET_PATH \
---save-interval 1500 \
---split 100,0,0 \
---bf16 \
---zero-stage 0 \
---tokenizer-type HFTokenizer \
---tokenizer-model $HF_LLAMA_PATH \
---deepspeed_config ./examples_deepspeed/finetune_hf_llama/ds_config.json \
---deepspeed \
---distributed-backend nccl \
---num-workers 0 \
---no-masked-softmax-fusion \
---no-bias-gelu-fusion \
---no-bias-dropout-fusion \
---no-gradient-accumulation-fusion \
---repeated-dataloader"
-
-if [ "$1" = "convert" ]; then
-    task_args="$covert_args"
-else
-    task_args="$finetune_args"
-fi
-
-full_cmd="$task_args $comm_args"
-
-eval "$full_cmd"
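
Taken together, the renamed scripts give the following end-to-end flow for this test case. The sketch below is illustrative rather than part of the patch: it assumes the `/fsxl` defaults introduced above, that step 1 is run from `3.test_cases/pytorch/deepspeed` and the remaining steps from `examples_megatron_deepspeed/finetune_hf_llama`, and that the job name in the last step is arbitrary.

```bash
# Minimal walkthrough of the updated test case (paths and job names are assumptions).
export CONTAINER_PATH=/fsxl/containers
export ENROOT_IMAGE=$CONTAINER_PATH/deepspeed.sqsh
export FSX_PATH=/fsxl
export MODEL_PATH=$FSX_PATH/deepspeed
export DATA_PATH=$FSX_PATH/alpaca

# 1. Build the DeepSpeed container and import it as an Enroot squashfs.
sbatch 1.build-image.sbatch

# The finetune_hf_llama sbatch scripts write their output to logs/%x_%j.out.
mkdir -p logs

# 2. Convert the Meta checkpoint to Hugging Face format.
sbatch 1.convert-weights-to-hf.sbatch

# 3. Convert the Hugging Face checkpoint to Megatron-DeepSpeed format
#    (submits: finetune_llama.sbatch convert_hf2mds).
bash 2.convert-weights-to-mega-ds.sh

# 4. Fine-tune on the Alpaca dataset (submits: finetune_llama.sbatch finetune).
bash 3.finetune-llama.sh

# 5. Optionally convert the fine-tuned checkpoint back to Hugging Face format;
#    the job name here is only an example.
sbatch --nodes=1 --job-name=cvt-mds2hf finetune_llama.sbatch convert_mds2hf
```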