From 3ff17dd16ad2f5192d2049d193dd6e7fe780fc10 Mon Sep 17 00:00:00 2001
From: Chendi Xue
Date: Sat, 8 Feb 2025 00:15:35 +0000
Subject: [PATCH] Update benchmark test scripts

Switch run_example_tp.py to VLLM_EP_SIZE=4, and rework the static online
benchmark scripts: skip warmup, use the mp distributed executor backend,
comment out the bucket environment overrides, derive all log file names from
the benchmark configuration, capture hl-smi output during the run, and run
the serving benchmark twice back to back.

Signed-off-by: Chendi Xue
---
 scripts/run_example_tp.py        |  3 ++-
 scripts/run_static-online.sh     | 51 ++++++++++++++++++++-----------
 scripts/run_static-online_ep4.sh | 41 +++++++++++++++++---------
 3 files changed, 63 insertions(+), 32 deletions(-)

diff --git a/scripts/run_example_tp.py b/scripts/run_example_tp.py
index 283764cbbfdf1..a934dea15a405 100644
--- a/scripts/run_example_tp.py
+++ b/scripts/run_example_tp.py
@@ -4,6 +4,7 @@
 import os
 
 model_path = "/data/models/DeepSeek-R1/"
+#model_path = "/software/data/DeepSeek-R1/"
 # model_path = "deepseek-ai/DeepSeek-V2-Lite"
 
 # Parse the command-line arguments.
@@ -22,7 +23,7 @@
 # os.environ["VLLM_RAY_DISABLE_LOG_TO_DRIVER"] = "1"
 # os.environ["RAY_IGNORE_UNHANDLED_ERRORS"] = "1"
 os.environ["VLLM_MOE_N_SLICE"] = "1"
-os.environ["VLLM_EP_SIZE"] = "8"
+os.environ["VLLM_EP_SIZE"] = "4"
 os.environ["VLLM_MLA_DISABLE_REQUANTIZATION"] = "1"
 os.environ["PT_HPU_WEIGHT_SHARING"] = "0"
 
diff --git a/scripts/run_static-online.sh b/scripts/run_static-online.sh
index 16483f4172b2b..61091a89aaf81 100644
--- a/scripts/run_static-online.sh
+++ b/scripts/run_static-online.sh
@@ -1,12 +1,16 @@
 #!/bin/bash
 tp_parrallel=8
 bs=96
 in_len=1024
 out_len=1024
 multi_step=1
 total_len=$((in_len + out_len))
 VLLM_DECODE_BLOCK_BUCKET_MIN=$((in_len * bs / 128))
 VLLM_DECODE_BLOCK_BUCKET_MAX=$((total_len * bs / 128 + 128))
+num_prompts=300
+request_rate=1
+gpu_utils=0.9
+log_name="static-online-gaudi3-${gpu_utils}util-TPparallel${tp_parrallel}-multistep${multi_step}_nprompt${num_prompts}_rrate${request_rate}_bs${bs}_i${in_len}_o${out_len}_0207"
 
 # model="/data/models/DeepSeek-R1/"
 # tokenizer="/data/models/DeepSeek-R1/"
@@ -14,20 +18,20 @@ model="/data/models/DeepSeek-R1/"
 tokenizer="/data/models/DeepSeek-R1/"
 model_name="DeepSeek-R1"
 
+# VLLM_PROMPT_BS_BUCKET_MIN=1 \
+# VLLM_PROMPT_BS_BUCKET_MAX=${bs} \
+# VLLM_PROMPT_SEQ_BUCKET_MIN=${in_len} \
+# VLLM_PROMPT_SEQ_BUCKET_MAX=${total_len} \
+# VLLM_DECODE_BS_BUCKET_MIN=${bs} \
+# VLLM_DECODE_BS_BUCKET_MAX=${bs} \
+# VLLM_DECODE_BLOCK_BUCKET_MIN=${VLLM_DECODE_BLOCK_BUCKET_MIN} \
+# VLLM_DECODE_BLOCK_BUCKET_MAX=${VLLM_DECODE_BLOCK_BUCKET_MAX} \
+
+VLLM_SKIP_WARMUP=true \
 HABANA_VISIBLE_DEVICES="ALL" \
 VLLM_MOE_N_SLICE=4 \
 VLLM_MLA_DISABLE_REQUANTIZATION=1 \
 PT_HPU_ENABLE_LAZY_COLLECTIVES="true" \
-VLLM_RAY_DISABLE_LOG_TO_DRIVER="1" \
-RAY_IGNORE_UNHANDLED_ERRORS="1" \
-VLLM_PROMPT_BS_BUCKET_MIN=1 \
-VLLM_PROMPT_BS_BUCKET_MAX=${bs} \
-VLLM_PROMPT_SEQ_BUCKET_MIN=${in_len} \
-VLLM_PROMPT_SEQ_BUCKET_MAX=${total_len} \
-VLLM_DECODE_BS_BUCKET_MIN=${bs} \
-VLLM_DECODE_BS_BUCKET_MAX=${bs} \
-VLLM_DECODE_BLOCK_BUCKET_MIN=${VLLM_DECODE_BLOCK_BUCKET_MIN} \
-VLLM_DECODE_BLOCK_BUCKET_MAX=${VLLM_DECODE_BLOCK_BUCKET_MAX} \
 python -m vllm.entrypoints.openai.api_server \
     --port 8080 \
     --model ${model} \
@@ -38,14 +42,14 @@ python -m vllm.entrypoints.openai.api_server \
     --use-v2-block-manager \
     --num_scheduler_steps ${multi_step}\
     --max-model-len 2048 \
-    --distributed_executor_backend ray \
-    --gpu_memory_utilization 0.9 \
-    --trust_remote_code 2>&1 | tee benchmark_logs/serving.log &
+    --distributed_executor_backend mp \
+    --gpu_memory_utilization ${gpu_utils} \
+    --trust_remote_code 2>&1 | tee benchmark_logs/${log_name}_serving.log &
 pid=$(($!-1))
 
 until [[ "$n" -ge 100 ]] || [[ $ready == true ]]; do
     n=$((n+1))
-    if grep -q "Uvicorn running on" benchmark_logs/serving.log; then
+    if grep -q "Uvicorn running on" benchmark_logs/${log_name}_serving.log; then
         break
     fi
     sleep 5s
@@ -53,16 +57,27 @@ done
 sleep 5s
 echo ${pid}
 
-num_prompts=300
-request_rate=1
+hl-smi -l > benchmark_logs/${log_name}_hlsmi.log &
+hl_pid=$!
+
+
+start_time=$(date +%s)
+echo "Start to benchmark"
+python benchmarks/benchmark_serving.py --backend vllm --model ${model} --tokenizer ${tokenizer} --dataset-name sonnet --dataset-path benchmarks/sonnet.txt --request-rate ${request_rate} --num-prompts ${num_prompts} --port 8080 --sonnet-input-len ${in_len} --sonnet-output-len ${out_len} --sonnet-prefix-len 100 2>&1 | tee benchmark_logs/${log_name}_run1.log
+end_time=$(date +%s)
+echo "Time elapsed: $((end_time - start_time))s"
+
+sleep 10
+
+
 start_time=$(date +%s)
 echo "Start to benchmark"
-python benchmarks/benchmark_serving.py --backend vllm --model ${model} --tokenizer ${tokenizer} --dataset-name sonnet --dataset-path benchmarks/sonnet.txt --request-rate ${request_rate} --num-prompts ${num_prompts} --port 8080 --sonnet-input-len ${in_len} --sonnet-output-len ${out_len} --sonnet-prefix-len 100 \
---save-result 2>&1 | tee benchmark_logs/static-online-gaudi3-0.9util-TPparallel${tp_parrallel}-multistep${multi_step}_nprompt${num_prompts}_rrate${request_rate}_bs${bs}_i${in_len}_o${out_len}_prepad.log
+python benchmarks/benchmark_serving.py --backend vllm --model ${model} --tokenizer ${tokenizer} --dataset-name sonnet --dataset-path benchmarks/sonnet.txt --request-rate ${request_rate} --num-prompts ${num_prompts} --port 8080 --sonnet-input-len ${in_len} --sonnet-output-len ${out_len} --sonnet-prefix-len 100 2>&1 | tee benchmark_logs/${log_name}_run2.log
 end_time=$(date +%s)
 echo "Time elapsed: $((end_time - start_time))s"
 
 sleep 10
 kill ${pid}
+kill ${hl_pid}
 
 #--backend openai-chat --endpoint "v1/chat/completions"
\ No newline at end of file
diff --git a/scripts/run_static-online_ep4.sh b/scripts/run_static-online_ep4.sh
index 2c6681e0a5b5e..0dcaa6c3d6b8e 100644
--- a/scripts/run_static-online_ep4.sh
+++ b/scripts/run_static-online_ep4.sh
@@ -10,6 +10,9 @@ VLLM_DECODE_BLOCK_BUCKET_MAX=$((total_len * bs / 128 + 128))
 ep_size=4
 moe_n_slice=1
 gpu_utils=0.8
+num_prompts=96
+request_rate=96
+log_name="static-online-gaudi3-${gpu_utils}util-TPparallel${tp_parrallel}-EP${ep_size}-loop${moe_n_slice}moegroups-multistep${multi_step}_nprompt${num_prompts}_rrate${request_rate}_bs${bs}_i${in_len}_o${out_len}_0207"
 
 # model="/data/models/DeepSeek-R1/"
 # tokenizer="/data/models/DeepSeek-R1/"
@@ -17,6 +20,15 @@ model="/data/models/DeepSeek-R1/"
 tokenizer="/data/models/DeepSeek-R1/"
 model_name="DeepSeek-R1"
 
+# VLLM_PROMPT_BS_BUCKET_MIN=1 \
+# VLLM_PROMPT_BS_BUCKET_MAX=${bs} \
+# VLLM_PROMPT_SEQ_BUCKET_MIN=${in_len} \
+# VLLM_PROMPT_SEQ_BUCKET_MAX=${total_len} \
+# VLLM_DECODE_BS_BUCKET_MIN=${bs} \
+# VLLM_DECODE_BS_BUCKET_MAX=${bs} \
+# VLLM_DECODE_BLOCK_BUCKET_MIN=${VLLM_DECODE_BLOCK_BUCKET_MIN} \
+# VLLM_DECODE_BLOCK_BUCKET_MAX=${VLLM_DECODE_BLOCK_BUCKET_MAX} \
+VLLM_SKIP_WARMUP=true \
 HABANA_VISIBLE_DEVICES="ALL" \
 VLLM_MOE_N_SLICE=${moe_n_slice} \
 VLLM_EP_SIZE=${ep_size} \
@@ -25,14 +37,6 @@ PT_HPU_ENABLE_LAZY_COLLECTIVES="true" \
 VLLM_RAY_DISABLE_LOG_TO_DRIVER="1" \
 RAY_IGNORE_UNHANDLED_ERRORS="1" \
 PT_HPU_WEIGHT_SHARING=0 \
-VLLM_PROMPT_BS_BUCKET_MIN=1 \
-VLLM_PROMPT_BS_BUCKET_MAX=${bs} \
-VLLM_PROMPT_SEQ_BUCKET_MIN=${in_len} \
-VLLM_PROMPT_SEQ_BUCKET_MAX=${total_len} \
-VLLM_DECODE_BS_BUCKET_MIN=${bs} \
-VLLM_DECODE_BS_BUCKET_MAX=${bs} \
-VLLM_DECODE_BLOCK_BUCKET_MIN=${VLLM_DECODE_BLOCK_BUCKET_MIN} \
-VLLM_DECODE_BLOCK_BUCKET_MAX=${VLLM_DECODE_BLOCK_BUCKET_MAX} \
 python -m vllm.entrypoints.openai.api_server \
     --port 8080 \
     --model ${model} \
@@ -45,12 +49,12 @@ python -m vllm.entrypoints.openai.api_server \
     --max-model-len 2048 \
     --distributed_executor_backend mp \
     --gpu_memory_utilization ${gpu_utils} \
-    --trust_remote_code 2>&1 | tee benchmark_logs/serving.log &
+    --trust_remote_code 2>&1 | tee benchmark_logs/${log_name}_serving.log &
 pid=$(($!-1))
 
 until [[ "$n" -ge 100 ]] || [[ $ready == true ]]; do
     n=$((n+1))
-    if grep -q "Uvicorn running on" benchmark_logs/serving.log; then
+    if grep -q "Uvicorn running on" benchmark_logs/${log_name}_serving.log; then
         break
     fi
     sleep 5s
@@ -58,15 +62,26 @@ done
 sleep 10s
 echo ${pid}
 
-num_prompts=96
-request_rate=96
+hl-smi -l > benchmark_logs/${log_name}_hlsmi.log &
+hl_pid=$!
+
+
+start_time=$(date +%s)
+echo "Start to benchmark"
+python benchmarks/benchmark_serving.py --backend vllm --model ${model} --tokenizer ${tokenizer} --dataset-name sonnet --dataset-path benchmarks/sonnet.txt --request-rate ${request_rate} --num-prompts ${num_prompts} --port 8080 --sonnet-input-len ${in_len} --sonnet-output-len ${out_len} --sonnet-prefix-len 100 2>&1 | tee benchmark_logs/${log_name}_run1.log
+end_time=$(date +%s)
+echo "Time elapsed: $((end_time - start_time))s"
+
+sleep 10
+
 start_time=$(date +%s)
 echo "Start to benchmark"
-python benchmarks/benchmark_serving.py --backend vllm --model ${model} --tokenizer ${tokenizer} --dataset-name sonnet --dataset-path benchmarks/sonnet.txt --request-rate ${request_rate} --num-prompts ${num_prompts} --port 8080 --sonnet-input-len ${in_len} --sonnet-output-len ${out_len} --sonnet-prefix-len 100 2>&1 | tee benchmark_logs/static-online-gaudi3-${gpu_utils}util-TPparallel${tp_parrallel}-EP${ep_size}-loop${moe_n_slice}moegroups-multistep${multi_step}_nprompt${num_prompts}_rrate${request_rate}_bs${bs}_i${in_len}_o${out_len}_2.log
+python benchmarks/benchmark_serving.py --backend vllm --model ${model} --tokenizer ${tokenizer} --dataset-name sonnet --dataset-path benchmarks/sonnet.txt --request-rate ${request_rate} --num-prompts ${num_prompts} --port 8080 --sonnet-input-len ${in_len} --sonnet-output-len ${out_len} --sonnet-prefix-len 100 2>&1 | tee benchmark_logs/${log_name}_run2.log
 end_time=$(date +%s)
 echo "Time elapsed: $((end_time - start_time))s"
 
 sleep 10
 kill ${pid}
+kill ${hl_pid}
 
 #--backend openai-chat --endpoint "v1/chat/completions"
\ No newline at end of file
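
For reference, below is a standalone sketch (illustration only, not part of the patch) of the decode-block bucket arithmetic both shell scripts compute, assuming the 128-token block size implied by their divisor and using their default values.

#!/bin/bash
# Same formulas as VLLM_DECODE_BLOCK_BUCKET_MIN/MAX in the scripts above.
# These values only take effect if the commented-out bucket exports are re-enabled.
bs=96
in_len=1024
out_len=1024
total_len=$((in_len + out_len))              # 2048
echo "MIN=$((in_len * bs / 128))"            # 1024 * 96 / 128        -> MIN=768
echo "MAX=$((total_len * bs / 128 + 128))"   # 2048 * 96 / 128 + 128  -> MAX=1664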