From 3ff17dd16ad2f5192d2049d193dd6e7fe780fc10 Mon Sep 17 00:00:00 2001
From: Chendi Xue
Date: Sat, 8 Feb 2025 00:15:35 +0000
Subject: [PATCH] Update benchmark test scripts

Switch run_example_tp.py to VLLM_EP_SIZE=4, and rework the static online
benchmark scripts: skip warmup, use the mp distributed executor backend,
comment out the bucket environment overrides, derive all log file names from
the benchmark configuration, capture hl-smi output during the run, and run
the serving benchmark twice back to back.

Signed-off-by: Chendi Xue
---
 scripts/run_example_tp.py        |  3 ++-
 scripts/run_static-online.sh     | 51 ++++++++++++++++++++-----------
 scripts/run_static-online_ep4.sh | 41 +++++++++++++++++---------
 3 files changed, 63 insertions(+), 32 deletions(-)

diff --git a/scripts/run_example_tp.py b/scripts/run_example_tp.py
index 283764cbbfdf1..a934dea15a405 100644
--- a/scripts/run_example_tp.py
+++ b/scripts/run_example_tp.py
@@ -4,6 +4,7 @@
 import os
 
 model_path = "/data/models/DeepSeek-R1/"
+#model_path = "/software/data/DeepSeek-R1/"
 # model_path = "deepseek-ai/DeepSeek-V2-Lite"
 
 # Parse the command-line arguments.
@@ -22,7 +23,7 @@
 # os.environ["VLLM_RAY_DISABLE_LOG_TO_DRIVER"] = "1"
 # os.environ["RAY_IGNORE_UNHANDLED_ERRORS"] = "1"
 os.environ["VLLM_MOE_N_SLICE"] = "1"
-os.environ["VLLM_EP_SIZE"] = "8"
+os.environ["VLLM_EP_SIZE"] = "4"
 os.environ["VLLM_MLA_DISABLE_REQUANTIZATION"] = "1"
 os.environ["PT_HPU_WEIGHT_SHARING"] = "0"
 
diff --git a/scripts/run_static-online.sh b/scripts/run_static-online.sh
index 16483f4172b2b..61091a89aaf81 100644
--- a/scripts/run_static-online.sh
+++ b/scripts/run_static-online.sh
@@ -1,12 +1,16 @@
 #!/bin/bash
 tp_parrallel=8
 bs=96
 in_len=1024
 out_len=1024
 multi_step=1
 total_len=$((in_len + out_len))
 VLLM_DECODE_BLOCK_BUCKET_MIN=$((in_len * bs / 128))
 VLLM_DECODE_BLOCK_BUCKET_MAX=$((total_len * bs / 128 + 128))
+num_prompts=300
+request_rate=1
+gpu_utils=0.9
+log_name="static-online-gaudi3-${gpu_utils}util-TPparallel${tp_parrallel}-multistep${multi_step}_nprompt${num_prompts}_rrate${request_rate}_bs${bs}_i${in_len}_o${out_len}_0207"
 
 # model="/data/models/DeepSeek-R1/"
 # tokenizer="/data/models/DeepSeek-R1/"
@@ -14,20 +18,20 @@ model="/data/models/DeepSeek-R1/"
 tokenizer="/data/models/DeepSeek-R1/"
 model_name="DeepSeek-R1"
 
+# VLLM_PROMPT_BS_BUCKET_MIN=1 \
+# VLLM_PROMPT_BS_BUCKET_MAX=${bs} \
+# VLLM_PROMPT_SEQ_BUCKET_MIN=${in_len} \
+# VLLM_PROMPT_SEQ_BUCKET_MAX=${total_len} \
+# VLLM_DECODE_BS_BUCKET_MIN=${bs} \
+# VLLM_DECODE_BS_BUCKET_MAX=${bs} \
+# VLLM_DECODE_BLOCK_BUCKET_MIN=${VLLM_DECODE_BLOCK_BUCKET_MIN} \
+# VLLM_DECODE_BLOCK_BUCKET_MAX=${VLLM_DECODE_BLOCK_BUCKET_MAX} \
+
+VLLM_SKIP_WARMUP=true \
 HABANA_VISIBLE_DEVICES="ALL" \
 VLLM_MOE_N_SLICE=4 \
 VLLM_MLA_DISABLE_REQUANTIZATION=1 \
 PT_HPU_ENABLE_LAZY_COLLECTIVES="true" \
-VLLM_RAY_DISABLE_LOG_TO_DRIVER="1" \
-RAY_IGNORE_UNHANDLED_ERRORS="1" \
-VLLM_PROMPT_BS_BUCKET_MIN=1 \
-VLLM_PROMPT_BS_BUCKET_MAX=${bs} \
-VLLM_PROMPT_SEQ_BUCKET_MIN=${in_len} \
-VLLM_PROMPT_SEQ_BUCKET_MAX=${total_len} \
-VLLM_DECODE_BS_BUCKET_MIN=${bs} \
-VLLM_DECODE_BS_BUCKET_MAX=${bs} \
-VLLM_DECODE_BLOCK_BUCKET_MIN=${VLLM_DECODE_BLOCK_BUCKET_MIN} \
-VLLM_DECODE_BLOCK_BUCKET_MAX=${VLLM_DECODE_BLOCK_BUCKET_MAX} \
 python -m vllm.entrypoints.openai.api_server \
     --port 8080 \
     --model ${model} \
@@ -38,14 +42,14 @@ python -m vllm.entrypoints.openai.api_server \
     --use-v2-block-manager \
     --num_scheduler_steps ${multi_step}\
     --max-model-len 2048 \
-    --distributed_executor_backend ray \
-    --gpu_memory_utilization 0.9 \
-    --trust_remote_code 2>&1 | tee benchmark_logs/serving.log &
+    --distributed_executor_backend mp \
+    --gpu_memory_utilization ${gpu_utils} \
+    --trust_remote_code 2>&1 | tee benchmark_logs/${log_name}_serving.log &
 pid=$(($!-1))
 
 until [[ "$n" -ge 100 ]] || [[ $ready == true ]]; do
     n=$((n+1))
-    if grep -q "Uvicorn running on" benchmark_logs/serving.log; then
+    if grep -q "Uvicorn running on" benchmark_logs/${log_name}_serving.log; then
         break
     fi
     sleep 5s
@@ -53,16 +57,27 @@ done
 sleep 5s
 echo ${pid}
 
-num_prompts=300
-request_rate=1
+hl-smi -l > benchmark_logs/${log_name}_hlsmi.log &
+hl_pid=$!
+
+
+start_time=$(date +%s)
+echo "Start to benchmark"
+python benchmarks/benchmark_serving.py --backend vllm --model ${model} --tokenizer ${tokenizer} --dataset-name sonnet --dataset-path benchmarks/sonnet.txt --request-rate ${request_rate} --num-prompts ${num_prompts} --port 8080 --sonnet-input-len ${in_len} --sonnet-output-len ${out_len} --sonnet-prefix-len 100 2>&1 | tee benchmark_logs/${log_name}_run1.log
+end_time=$(date +%s)
+echo "Time elapsed: $((end_time - start_time))s"
+
+sleep 10
+
+
 start_time=$(date +%s)
 echo "Start to benchmark"
-python benchmarks/benchmark_serving.py --backend vllm --model ${model} --tokenizer ${tokenizer} --dataset-name sonnet --dataset-path benchmarks/sonnet.txt --request-rate ${request_rate} --num-prompts ${num_prompts} --port 8080 --sonnet-input-len ${in_len} --sonnet-output-len ${out_len} --sonnet-prefix-len 100 \
---save-result 2>&1 | tee benchmark_logs/static-online-gaudi3-0.9util-TPparallel${tp_parrallel}-multistep${multi_step}_nprompt${num_prompts}_rrate${request_rate}_bs${bs}_i${in_len}_o${out_len}_prepad.log
+python benchmarks/benchmark_serving.py --backend vllm --model ${model} --tokenizer ${tokenizer} --dataset-name sonnet --dataset-path benchmarks/sonnet.txt --request-rate ${request_rate} --num-prompts ${num_prompts} --port 8080 --sonnet-input-len ${in_len} --sonnet-output-len ${out_len} --sonnet-prefix-len 100 2>&1 | tee benchmark_logs/${log_name}_run2.log
 end_time=$(date +%s)
 echo "Time elapsed: $((end_time - start_time))s"
 
 sleep 10
 kill ${pid}
+kill ${hl_pid}
 
 #--backend openai-chat --endpoint "v1/chat/completions"
\ No newline at end of file
diff --git a/scripts/run_static-online_ep4.sh b/scripts/run_static-online_ep4.sh
index 2c6681e0a5b5e..0dcaa6c3d6b8e 100644
--- a/scripts/run_static-online_ep4.sh
+++ b/scripts/run_static-online_ep4.sh
@@ -10,6 +10,9 @@ VLLM_DECODE_BLOCK_BUCKET_MAX=$((total_len * bs / 128 + 128))
 ep_size=4
 moe_n_slice=1
 gpu_utils=0.8
+num_prompts=96
+request_rate=96
+log_name="static-online-gaudi3-${gpu_utils}util-TPparallel${tp_parrallel}-EP${ep_size}-loop${moe_n_slice}moegroups-multistep${multi_step}_nprompt${num_prompts}_rrate${request_rate}_bs${bs}_i${in_len}_o${out_len}_0207"
 
 # model="/data/models/DeepSeek-R1/"
 # tokenizer="/data/models/DeepSeek-R1/"
@@ -17,6 +20,15 @@ model="/data/models/DeepSeek-R1/"
 tokenizer="/data/models/DeepSeek-R1/"
 model_name="DeepSeek-R1"
 
+# VLLM_PROMPT_BS_BUCKET_MIN=1 \
+# VLLM_PROMPT_BS_BUCKET_MAX=${bs} \
+# VLLM_PROMPT_SEQ_BUCKET_MIN=${in_len} \
+# VLLM_PROMPT_SEQ_BUCKET_MAX=${total_len} \
+# VLLM_DECODE_BS_BUCKET_MIN=${bs} \
+# VLLM_DECODE_BS_BUCKET_MAX=${bs} \
+# VLLM_DECODE_BLOCK_BUCKET_MIN=${VLLM_DECODE_BLOCK_BUCKET_MIN} \
+# VLLM_DECODE_BLOCK_BUCKET_MAX=${VLLM_DECODE_BLOCK_BUCKET_MAX} \
+VLLM_SKIP_WARMUP=true \
 HABANA_VISIBLE_DEVICES="ALL" \
 VLLM_MOE_N_SLICE=${moe_n_slice} \
 VLLM_EP_SIZE=${ep_size} \
@@ -25,14 +37,6 @@ PT_HPU_ENABLE_LAZY_COLLECTIVES="true" \
 VLLM_RAY_DISABLE_LOG_TO_DRIVER="1" \
 RAY_IGNORE_UNHANDLED_ERRORS="1" \
 PT_HPU_WEIGHT_SHARING=0 \
-VLLM_PROMPT_BS_BUCKET_MIN=1 \
-VLLM_PROMPT_BS_BUCKET_MAX=${bs} \
-VLLM_PROMPT_SEQ_BUCKET_MIN=${in_len} \
-VLLM_PROMPT_SEQ_BUCKET_MAX=${total_len} \
-VLLM_DECODE_BS_BUCKET_MIN=${bs} \
-VLLM_DECODE_BS_BUCKET_MAX=${bs} \
-VLLM_DECODE_BLOCK_BUCKET_MIN=${VLLM_DECODE_BLOCK_BUCKET_MIN} \
-VLLM_DECODE_BLOCK_BUCKET_MAX=${VLLM_DECODE_BLOCK_BUCKET_MAX} \
 python -m vllm.entrypoints.openai.api_server \
     --port 8080 \
     --model ${model} \
@@ -45,12 +49,12 @@ python -m vllm.entrypoints.openai.api_server \
     --max-model-len 2048 \
     --distributed_executor_backend mp \
     --gpu_memory_utilization ${gpu_utils} \
-    --trust_remote_code 2>&1 | tee benchmark_logs/serving.log &
+    --trust_remote_code 2>&1 | tee benchmark_logs/${log_name}_serving.log &
 pid=$(($!-1))
 
 until [[ "$n" -ge 100 ]] || [[ $ready == true ]]; do
     n=$((n+1))
-    if grep -q "Uvicorn running on" benchmark_logs/serving.log; then
+    if grep -q "Uvicorn running on" benchmark_logs/${log_name}_serving.log; then
         break
     fi
     sleep 5s
@@ -58,15 +62,26 @@ done
 sleep 10s
 echo ${pid}
 
-num_prompts=96
-request_rate=96
+hl-smi -l > benchmark_logs/${log_name}_hlsmi.log &
+hl_pid=$!
+
+
+start_time=$(date +%s)
+echo "Start to benchmark"
+python benchmarks/benchmark_serving.py --backend vllm --model ${model} --tokenizer ${tokenizer} --dataset-name sonnet --dataset-path benchmarks/sonnet.txt --request-rate ${request_rate} --num-prompts ${num_prompts} --port 8080 --sonnet-input-len ${in_len} --sonnet-output-len ${out_len} --sonnet-prefix-len 100 2>&1 | tee benchmark_logs/${log_name}_run1.log
+end_time=$(date +%s)
+echo "Time elapsed: $((end_time - start_time))s"
+
+sleep 10
+
 start_time=$(date +%s)
 echo "Start to benchmark"
-python benchmarks/benchmark_serving.py --backend vllm --model ${model} --tokenizer ${tokenizer} --dataset-name sonnet --dataset-path benchmarks/sonnet.txt --request-rate ${request_rate} --num-prompts ${num_prompts} --port 8080 --sonnet-input-len ${in_len} --sonnet-output-len ${out_len} --sonnet-prefix-len 100 2>&1 | tee benchmark_logs/static-online-gaudi3-${gpu_utils}util-TPparallel${tp_parrallel}-EP${ep_size}-loop${moe_n_slice}moegroups-multistep${multi_step}_nprompt${num_prompts}_rrate${request_rate}_bs${bs}_i${in_len}_o${out_len}_2.log
+python benchmarks/benchmark_serving.py --backend vllm --model ${model} --tokenizer ${tokenizer} --dataset-name sonnet --dataset-path benchmarks/sonnet.txt --request-rate ${request_rate} --num-prompts ${num_prompts} --port 8080 --sonnet-input-len ${in_len} --sonnet-output-len ${out_len} --sonnet-prefix-len 100 2>&1 | tee benchmark_logs/${log_name}_run2.log
 end_time=$(date +%s)
 echo "Time elapsed: $((end_time - start_time))s"
 
 sleep 10
 kill ${pid}
+kill ${hl_pid}
 
 #--backend openai-chat --endpoint "v1/chat/completions"
\ No newline at end of file
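
For reference, below is a standalone sketch (illustration only, not part of the patch) of the decode-block bucket arithmetic both shell scripts compute, assuming the 128-token block size implied by their divisor and using their default values.

#!/bin/bash
# Same formulas as VLLM_DECODE_BLOCK_BUCKET_MIN/MAX in the scripts above.
# These values only take effect if the commented-out bucket exports are re-enabled.
bs=96
in_len=1024
out_len=1024
total_len=$((in_len + out_len))              # 2048
echo "MIN=$((in_len * bs / 128))"            # 1024 * 96 / 128        -> MIN=768
echo "MAX=$((total_len * bs / 128 + 128))"   # 2048 * 96 / 128 + 128  -> MAX=1664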