
Commit 3ff17dd

update test script
Signed-off-by: Chendi Xue <[email protected]>
xuechendi authored and jikunshang committed Feb 11, 2025
1 parent 6084c79 commit 3ff17dd
Showing 3 changed files with 65 additions and 34 deletions.
scripts/run_example_tp.py (3 changes: 2 additions & 1 deletion)
@@ -4,6 +4,7 @@
import os

model_path = "/data/models/DeepSeek-R1/"
+#model_path = "/software/data/DeepSeek-R1/"
# model_path = "deepseek-ai/DeepSeek-V2-Lite"

# Parse the command-line arguments.
@@ -22,7 +23,7 @@
# os.environ["VLLM_RAY_DISABLE_LOG_TO_DRIVER"] = "1"
# os.environ["RAY_IGNORE_UNHANDLED_ERRORS"] = "1"
os.environ["VLLM_MOE_N_SLICE"] = "1"
os.environ["VLLM_EP_SIZE"] = "8"
os.environ["VLLM_EP_SIZE"] = "4"
os.environ["VLLM_MLA_DISABLE_REQUANTIZATION"] = "1"
os.environ["PT_HPU_WEIGHT_SHARING"] = "0"

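The run_example_tp.py change adds an alternate (commented) model path and halves the expert-parallel size from 8 to 4 while keeping a single MoE slice. For reference, the pattern the script relies on is setting these knobs in-process before vLLM is initialized; a minimal sketch of that pattern follows (the LLM constructor arguments are illustrative assumptions, since the diff does not show the script's actual engine setup):

    # Minimal sketch: export the HPU/vLLM knobs via os.environ *before*
    # constructing the engine, so they are read at initialization time.
    import os

    os.environ["VLLM_MOE_N_SLICE"] = "1"
    os.environ["VLLM_EP_SIZE"] = "4"  # this commit lowers it from "8"
    os.environ["VLLM_MLA_DISABLE_REQUANTIZATION"] = "1"
    os.environ["PT_HPU_WEIGHT_SHARING"] = "0"

    from vllm import LLM, SamplingParams

    llm = LLM(
        model="/data/models/DeepSeek-R1/",
        tensor_parallel_size=8,  # assumption: mirrors tp_parrallel=8 in the shell scripts
        trust_remote_code=True,
    )
    print(llm.generate(["Hello"], SamplingParams(max_tokens=16)))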
scripts/run_static-online.sh (53 changes: 34 additions & 19 deletions)
@@ -1,33 +1,37 @@
#!/bin/bash
tp_parrallel=8
-bs=96
in_len=1024
out_len=1024
multi_step=1
total_len=$((in_len + out_len))
VLLM_DECODE_BLOCK_BUCKET_MIN=$((in_len * bs / 128))
VLLM_DECODE_BLOCK_BUCKET_MAX=$((total_len * bs / 128 + 128))
+bs=96
+num_prompts=300
+request_rate=1
+gpu_utils=0.9
+log_name="static-online-gaudi3-${gpu_utils}util-TPparallel${tp_parrallel}-EP${ep_size}-loop${moe_n_slice}moegroups-multistep${multi_step}_nprompt${num_prompts}_rrate${request_rate}_bs${bs}_i${in_len}_o${out_len}_0207"

-# model="/data/models/DeepSeek-R1/"
-# tokenizer="/data/models/DeepSeek-R1/"
model="/data/models/DeepSeek-R1/"
tokenizer="/data/models/DeepSeek-R1/"
model_name="DeepSeek-R1"

+# VLLM_PROMPT_BS_BUCKET_MIN=1 \
+# VLLM_PROMPT_BS_BUCKET_MAX=${bs} \
+# VLLM_PROMPT_SEQ_BUCKET_MIN=${in_len} \
+# VLLM_PROMPT_SEQ_BUCKET_MAX=${total_len} \
+# VLLM_DECODE_BS_BUCKET_MIN=${bs} \
+# VLLM_DECODE_BS_BUCKET_MAX=${bs} \
+# VLLM_DECODE_BLOCK_BUCKET_MIN=${VLLM_DECODE_BLOCK_BUCKET_MIN} \
+# VLLM_DECODE_BLOCK_BUCKET_MAX=${VLLM_DECODE_BLOCK_BUCKET_MAX} \
+
+VLLM_SKIP_WARMUP=true \
HABANA_VISIBLE_DEVICES="ALL" \
VLLM_MOE_N_SLICE=4 \
VLLM_MLA_DISABLE_REQUANTIZATION=1 \
PT_HPU_ENABLE_LAZY_COLLECTIVES="true" \
VLLM_RAY_DISABLE_LOG_TO_DRIVER="1" \
RAY_IGNORE_UNHANDLED_ERRORS="1" \
-VLLM_PROMPT_BS_BUCKET_MIN=1 \
-VLLM_PROMPT_BS_BUCKET_MAX=${bs} \
-VLLM_PROMPT_SEQ_BUCKET_MIN=${in_len} \
-VLLM_PROMPT_SEQ_BUCKET_MAX=${total_len} \
-VLLM_DECODE_BS_BUCKET_MIN=${bs} \
-VLLM_DECODE_BS_BUCKET_MAX=${bs} \
-VLLM_DECODE_BLOCK_BUCKET_MIN=${VLLM_DECODE_BLOCK_BUCKET_MIN} \
-VLLM_DECODE_BLOCK_BUCKET_MAX=${VLLM_DECODE_BLOCK_BUCKET_MAX} \
python -m vllm.entrypoints.openai.api_server \
--port 8080 \
--model ${model} \
@@ -38,31 +42,42 @@ python -m vllm.entrypoints.openai.api_server \
--use-v2-block-manager \
--num_scheduler_steps ${multi_step}\
--max-model-len 2048 \
---distributed_executor_backend ray \
---gpu_memory_utilization 0.9 \
---trust_remote_code 2>&1 | tee benchmark_logs/serving.log &
+--distributed_executor_backend mp \
+--gpu_memory_utilization ${gpu_utils} \
+--trust_remote_code 2>&1 | tee benchmark_logs/${log_name}_serving.log &
pid=$(($!-1))

until [[ "$n" -ge 100 ]] || [[ $ready == true ]]; do
n=$((n+1))
-if grep -q "Uvicorn running on" benchmark_logs/serving.log; then
+if grep -q "Uvicorn running on" benchmark_logs/${log_name}_serving.log; then
break
fi
sleep 5s
done
sleep 5s
echo ${pid}

-num_prompts=300
-request_rate=1
+hl-smi -l > benchmark_logs/${log_name}_hlsmi.log &
+hl_pid=$(($!-1))

+
+start_time=$(date +%s)
+echo "Start to benchmark"
+python benchmarks/benchmark_serving.py --backend vllm --model ${model} --tokenizer ${tokenizer} --dataset-name sonnet --dataset-path benchmarks/sonnet.txt --request-rate ${request_rate} --num-prompts ${num_prompts} --port 8080 --sonnet-input-len ${in_len} --sonnet-output-len ${out_len} --sonnet-prefix-len 100 2>&1 | tee benchmark_logs/${log_name}_run1.log
+end_time=$(date +%s)
+echo "Time elapsed: $((end_time - start_time))s"
+
+sleep 10
+
+
start_time=$(date +%s)
echo "Start to benchmark"
-python benchmarks/benchmark_serving.py --backend vllm --model ${model} --tokenizer ${tokenizer} --dataset-name sonnet --dataset-path benchmarks/sonnet.txt --request-rate ${request_rate} --num-prompts ${num_prompts} --port 8080 --sonnet-input-len ${in_len} --sonnet-output-len ${out_len} --sonnet-prefix-len 100 \
---save-result 2>&1 | tee benchmark_logs/static-online-gaudi3-0.9util-TPparallel${tp_parrallel}-multistep${multi_step}_nprompt${num_prompts}_rrate${request_rate}_bs${bs}_i${in_len}_o${out_len}_prepad.log
+python benchmarks/benchmark_serving.py --backend vllm --model ${model} --tokenizer ${tokenizer} --dataset-name sonnet --dataset-path benchmarks/sonnet.txt --request-rate ${request_rate} --num-prompts ${num_prompts} --port 8080 --sonnet-input-len ${in_len} --sonnet-output-len ${out_len} --sonnet-prefix-len 100 2>&1 | tee benchmark_logs/${log_name}_run2.log
end_time=$(date +%s)
echo "Time elapsed: $((end_time - start_time))s"

+sleep 10

kill ${pid}
+kill ${hl_pid}
#--backend openai-chat --endpoint "v1/chat/completions"
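For reference, the decode-block bucket bounds at the top of this script follow from the batch size and sequence lengths, with 128 as the implied KV-cache block size. Note that the commit moves the bs=96 assignment below the two computations; in bash arithmetic an unset bs expands to 0 there, so the derived values only come out as intended if bs is defined first (moot for the launch itself, since the bucket env lines are now commented out). A quick check of the intended arithmetic:

    # Reproducing the bucket arithmetic from run_static-online.sh
    # with the values set in this commit.
    in_len, out_len, bs = 1024, 1024, 96
    total_len = in_len + out_len  # 2048

    decode_block_bucket_min = in_len * bs // 128           # 1024 * 96 / 128 = 768
    decode_block_bucket_max = total_len * bs // 128 + 128  # 2048 * 96 / 128 + 128 = 1664

    print(decode_block_bucket_min, decode_block_bucket_max)  # 768 1664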
scripts/run_static-online_ep4.sh (43 changes: 29 additions & 14 deletions)
@@ -1,6 +1,5 @@
#!/bin/bash
tp_parrallel=8
-bs=96
in_len=1024
out_len=1024
multi_step=1
@@ -10,13 +9,26 @@ VLLM_DECODE_BLOCK_BUCKET_MAX=$((total_len * bs / 128 + 128))
ep_size=4
moe_n_slice=1
gpu_utils=0.8
+bs=96
+num_prompts=96
+request_rate=96
+log_name="static-online-gaudi3-${gpu_utils}util-TPparallel${tp_parrallel}-EP${ep_size}-loop${moe_n_slice}moegroups-multistep${multi_step}_nprompt${num_prompts}_rrate${request_rate}_bs${bs}_i${in_len}_o${out_len}_0207"

# model="/data/models/DeepSeek-R1/"
# tokenizer="/data/models/DeepSeek-R1/"
model="/data/models/DeepSeek-R1/"
tokenizer="/data/models/DeepSeek-R1/"
model_name="DeepSeek-R1"

+# VLLM_PROMPT_BS_BUCKET_MIN=1 \
+# VLLM_PROMPT_BS_BUCKET_MAX=${bs} \
+# VLLM_PROMPT_SEQ_BUCKET_MIN=${in_len} \
+# VLLM_PROMPT_SEQ_BUCKET_MAX=${total_len} \
+# VLLM_DECODE_BS_BUCKET_MIN=${bs} \
+# VLLM_DECODE_BS_BUCKET_MAX=${bs} \
+# VLLM_DECODE_BLOCK_BUCKET_MIN=${VLLM_DECODE_BLOCK_BUCKET_MIN} \
+# VLLM_DECODE_BLOCK_BUCKET_MAX=${VLLM_DECODE_BLOCK_BUCKET_MAX} \
VLLM_SKIP_WARMUP=true \
HABANA_VISIBLE_DEVICES="ALL" \
VLLM_MOE_N_SLICE=${moe_n_slice} \
+VLLM_EP_SIZE=${ep_size} \
@@ -25,14 +37,6 @@ PT_HPU_ENABLE_LAZY_COLLECTIVES="true" \
VLLM_RAY_DISABLE_LOG_TO_DRIVER="1" \
RAY_IGNORE_UNHANDLED_ERRORS="1" \
PT_HPU_WEIGHT_SHARING=0 \
-VLLM_PROMPT_BS_BUCKET_MIN=1 \
-VLLM_PROMPT_BS_BUCKET_MAX=${bs} \
-VLLM_PROMPT_SEQ_BUCKET_MIN=${in_len} \
-VLLM_PROMPT_SEQ_BUCKET_MAX=${total_len} \
-VLLM_DECODE_BS_BUCKET_MIN=${bs} \
-VLLM_DECODE_BS_BUCKET_MAX=${bs} \
-VLLM_DECODE_BLOCK_BUCKET_MIN=${VLLM_DECODE_BLOCK_BUCKET_MIN} \
-VLLM_DECODE_BLOCK_BUCKET_MAX=${VLLM_DECODE_BLOCK_BUCKET_MAX} \
python -m vllm.entrypoints.openai.api_server \
--port 8080 \
--model ${model} \
@@ -45,28 +49,39 @@ python -m vllm.entrypoints.openai.api_server \
--max-model-len 2048 \
--distributed_executor_backend mp \
--gpu_memory_utilization ${gpu_utils} \
---trust_remote_code 2>&1 | tee benchmark_logs/serving.log &
+--trust_remote_code 2>&1 | tee benchmark_logs/${log_name}_serving.log &
pid=$(($!-1))

until [[ "$n" -ge 100 ]] || [[ $ready == true ]]; do
n=$((n+1))
-if grep -q "Uvicorn running on" benchmark_logs/serving.log; then
+if grep -q "Uvicorn running on" benchmark_logs/${log_name}_serving.log; then
break
fi
sleep 5s
done
sleep 10s
echo ${pid}

-num_prompts=96
-request_rate=96
+hl-smi -l > benchmark_logs/${log_name}_hlsmi.log &
+hl_pid=$(($!-1))

+
+start_time=$(date +%s)
+echo "Start to benchmark"
+python benchmarks/benchmark_serving.py --backend vllm --model ${model} --tokenizer ${tokenizer} --dataset-name sonnet --dataset-path benchmarks/sonnet.txt --request-rate ${request_rate} --num-prompts ${num_prompts} --port 8080 --sonnet-input-len ${in_len} --sonnet-output-len ${out_len} --sonnet-prefix-len 100 2>&1 | tee benchmark_logs/${log_name}_run1.log
+end_time=$(date +%s)
+echo "Time elapsed: $((end_time - start_time))s"
+
+sleep 10
+
start_time=$(date +%s)
echo "Start to benchmark"
-python benchmarks/benchmark_serving.py --backend vllm --model ${model} --tokenizer ${tokenizer} --dataset-name sonnet --dataset-path benchmarks/sonnet.txt --request-rate ${request_rate} --num-prompts ${num_prompts} --port 8080 --sonnet-input-len ${in_len} --sonnet-output-len ${out_len} --sonnet-prefix-len 100 2>&1 | tee benchmark_logs/static-online-gaudi3-${gpu_utils}util-TPparallel${tp_parrallel}-EP${ep_size}-loop${moe_n_slice}moegroups-multistep${multi_step}_nprompt${num_prompts}_rrate${request_rate}_bs${bs}_i${in_len}_o${out_len}_2.log
+python benchmarks/benchmark_serving.py --backend vllm --model ${model} --tokenizer ${tokenizer} --dataset-name sonnet --dataset-path benchmarks/sonnet.txt --request-rate ${request_rate} --num-prompts ${num_prompts} --port 8080 --sonnet-input-len ${in_len} --sonnet-output-len ${out_len} --sonnet-prefix-len 100 2>&1 | tee benchmark_logs/${log_name}_run2.log
end_time=$(date +%s)
echo "Time elapsed: $((end_time - start_time))s"

+sleep 10

kill ${pid}
+kill ${hl_pid}
#--backend openai-chat --endpoint "v1/chat/completions"
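Both scripts gate the benchmark on the serving log rather than probing the port: they grep for Uvicorn's startup line up to 100 times with a short sleep between attempts. The same readiness check, re-expressed as a standalone sketch (the function name and structure are ours; the marker string and retry budget come from the scripts):

    # Re-expression of the shell readiness loop used by both scripts:
    # poll the serving log until Uvicorn reports that it is running.
    import time

    def wait_for_server(log_path, marker="Uvicorn running on",
                        retries=100, delay_s=5.0):
        """Return True once `marker` appears in `log_path`, else False."""
        for _ in range(retries):
            try:
                with open(log_path) as f:
                    if marker in f.read():
                        return True
            except FileNotFoundError:
                pass  # the server may not have created the log yet
            time.sleep(delay_s)
        return False

    # e.g. wait_for_server("benchmark_logs/" + log_name + "_serving.log")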
