Commit e083282

jiangpeng36 and Ronald1995 committed
[Perf][V1] Fully overlap model execution
Signed-off-by: jiangpeng36 <[email protected]>
Signed-off-by: Ronald1995 <[email protected]>
Co-authored-by: Ronald1995 <[email protected]>
1 parent dd087ef commit e083282
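
This change lets the scheduler prepare the next step while the model is still executing the current one (vLLM V1 async scheduling), and the new tests exercise that path by passing async_scheduling=True through VllmRunner. Below is a minimal offline-inference sketch of the same setting, assuming the vllm.LLM entrypoint forwards async_scheduling to the engine arguments the same way VllmRunner does in the tests that follow:

# Sketch only (not part of this commit): enable async scheduling so that
# scheduling for step N+1 overlaps with model execution of step N.
# `async_scheduling` is assumed to be accepted as an engine argument here,
# mirroring how the new tests pass it through VllmRunner.
from vllm import LLM, SamplingParams

prompts = ["Hello, my name is", "The capital of France is"]
sampling_params = SamplingParams(temperature=0.2, max_tokens=10)

llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct",
          max_model_len=4096,
          async_scheduling=True)

for output in llm.generate(prompts, sampling_params):
    print(output.outputs[0].text)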

File tree

5 files changed: +418 -31 lines changed


tests/e2e/singlecard/test_ascend_scheduler.py

Lines changed: 23 additions & 1 deletion
@@ -1,13 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
+from vllm import SamplingParams
 
 from tests.e2e.conftest import VllmRunner
 from tests.e2e.model_utils import check_outputs_equal
 
 MODEL = "Qwen/Qwen3-0.6B"
 
-
 def test_concurrent_partial_prefill():
     with VllmRunner(MODEL,
                     additional_config={
@@ -86,3 +86,25 @@ def test_chunked_prefill_with_ascend_scheduler(
             name_0="vllm_output",
             name_1="chunked_prefill_output",
         )
+
+
+def test_async_scheduling() -> None:
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ] * 10
+    sampling_params = SamplingParams(temperature=0.2,
+                                     max_tokens=10,
+                                     stop_token_ids=None)
+
+    with VllmRunner(
+            "Qwen/Qwen2.5-0.5B-Instruct",
+            max_model_len=4096,
+            max_num_seqs=50,
+            dtype="bfloat16",
+            gpu_memory_utilization=0.9,
+            async_scheduling=True,
+    ) as vllm_model:
+        vllm_model.generate(prompts, sampling_params=sampling_params)
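
The new test_async_scheduling above is a smoke test: it only checks that generation completes with async_scheduling=True. Below is a hedged sketch of a possible follow-up check (not part of this commit) that compares greedy outputs with and without async scheduling, reusing MODEL, VllmRunner, and check_outputs_equal already imported in this file; the outputs_0_lst/outputs_1_lst keyword names are assumed from vLLM's check_outputs_equal helper.

def test_async_scheduling_matches_default() -> None:
    # Hypothetical follow-up test, not part of this commit: greedy outputs
    # should not change when async scheduling is enabled.
    prompts = [
        "Hello, my name is",
        "The capital of France is",
    ]

    with VllmRunner(MODEL, max_model_len=4096,
                    async_scheduling=True) as vllm_model:
        async_outputs = vllm_model.generate_greedy(prompts, 10)

    with VllmRunner(MODEL, max_model_len=4096) as vllm_model:
        sync_outputs = vllm_model.generate_greedy(prompts, 10)

    check_outputs_equal(outputs_0_lst=sync_outputs,
                        outputs_1_lst=async_outputs,
                        name_0="sync_output",
                        name_1="async_output")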
Lines changed: 189 additions & 0 deletions
@@ -0,0 +1,189 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
+#
+"""Compare the short outputs of HF and vLLM when using greedy sampling.
+
+Run `pytest tests/test_offline_inference.py`.
+"""
+import os
+from unittest.mock import patch
+
+import pytest
+import vllm  # noqa: F401
+from vllm import SamplingParams
+from vllm.assets.audio import AudioAsset
+from vllm.assets.image import ImageAsset
+
+import vllm_ascend  # noqa: F401
+from tests.e2e.conftest import VllmRunner
+
+MODELS = [
+    "Qwen/Qwen2.5-0.5B-Instruct",
+    "Qwen/Qwen3-0.6B-Base",
+]
+MULTIMODALITY_VL_MODELS = ["Qwen/Qwen2.5-VL-3B-Instruct"]
+MULTIMODALITY_AUDIO_MODELS = ["Qwen/Qwen2-Audio-7B-Instruct"]
+
+os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
+AUDIO_ASSETS = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
+AUDIO_PROMPT_TEMPLATES = {
+    1: "What is recited in the audio?",
+    2: "What sport and what nursery rhyme are referenced?"
+}
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half", "float16"])
+@pytest.mark.parametrize("max_tokens", [5])
+def test_models(model: str, dtype: str, max_tokens: int) -> None:
+    # 5042 tokens for gemma2
+    # gemma2 has alternating sliding window size of 4096
+    # we need a prompt with more than 4096 tokens to test the sliding window
+    prompt = "The following numbers of the sequence " + ", ".join(
+        str(i) for i in range(1024)) + " are:"
+    example_prompts = [prompt]
+
+    with VllmRunner(model,
+                    max_model_len=8192,
+                    dtype=dtype,
+                    enforce_eager=True,
+                    gpu_memory_utilization=0.7) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
+
+
+@pytest.mark.parametrize("model", MULTIMODALITY_VL_MODELS)
+def test_multimodal_vl(model, prompt_template, vllm_runner):
+    image = ImageAsset("cherry_blossom") \
+        .pil_image.convert("RGB")
+    img_questions = [
+        "What is the content of this image?",
+        "Describe the content of this image in detail.",
+        "What's in the image?",
+        "Where is this image taken?",
+    ]
+    images = [image] * len(img_questions)
+    prompts = prompt_template(img_questions)
+    with vllm_runner(model,
+                     max_model_len=4096,
+                     mm_processor_kwargs={
+                         "min_pixels": 28 * 28,
+                         "max_pixels": 1280 * 28 * 28,
+                         "fps": 1,
+                     }) as vllm_model:
+        vllm_model.generate_greedy(prompts=prompts,
+                                   images=images,
+                                   max_tokens=64)
+
+
+def prepare_audio_inputs(audio_count: int):
+    audio_prompt = "".join([
+        f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
+        for idx in range(audio_count)
+    ])
+    question = AUDIO_PROMPT_TEMPLATES[audio_count]
+    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+              "<|im_start|>user\n"
+              f"{audio_prompt}{question}<|im_end|>\n"
+              "<|im_start|>assistant\n")
+    mm_data = {
+        "audio":
+        [asset.audio_and_sample_rate for asset in AUDIO_ASSETS[:audio_count]]
+    }
+    inputs = {"prompt": prompt, "multi_modal_data": mm_data}
+    return inputs
+
+
+@pytest.mark.parametrize("model", MULTIMODALITY_AUDIO_MODELS)
+@pytest.mark.parametrize("audio_count", [2])
+@pytest.mark.parametrize("max_tokens", [10])
+def test_multimodal_audio(model: str, audio_count: int,
+                          max_tokens: int) -> None:
+    inputs = prepare_audio_inputs(audio_count)
+
+    sampling_params = SamplingParams(temperature=0.2,
+                                     max_tokens=max_tokens,
+                                     stop_token_ids=None)
+
+    with VllmRunner(model,
+                    max_model_len=4096,
+                    max_num_seqs=5,
+                    enforce_eager=False,
+                    dtype="bfloat16",
+                    limit_mm_per_prompt={"audio": audio_count},
+                    gpu_memory_utilization=0.9) as vllm_model:
+        vllm_model.generate(inputs, sampling_params=sampling_params)
+
+
+@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION": "1"})
+def test_models_topk() -> None:
+    example_prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    sampling_params = SamplingParams(max_tokens=5,
+                                     temperature=0.0,
+                                     top_k=50,
+                                     top_p=0.9)
+
+    with VllmRunner("Qwen/Qwen2.5-0.5B-Instruct",
+                    max_model_len=8192,
+                    dtype="float16",
+                    enforce_eager=True,
+                    gpu_memory_utilization=0.7) as vllm_model:
+        vllm_model.generate(example_prompts, sampling_params)
+
+
+def test_models_prompt_logprobs() -> None:
+
+    example_prompts = [
+        "Hello, my name is",
+    ]
+
+    with VllmRunner("/home/jp/model/Qwen2.5-0.5B-Instruct",
+                    max_model_len=8192,
+                    dtype="float16",
+                    enforce_eager=True,
+                    gpu_memory_utilization=0.7) as vllm_model:
+        vllm_model.generate_greedy_logprobs(example_prompts,
+                                            max_tokens=5,
+                                            num_logprobs=1)
+
+
+def test_async_scheduling() -> None:
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ] * 10
+    sampling_params = SamplingParams(temperature=0.2,
+                                     max_tokens=10,
+                                     stop_token_ids=None)
+
+    with VllmRunner(
+            # "Qwen/Qwen2.5-0.5B-Instruct"
+            "/home/jp/model/Qwen2.5-0.5B-Instruct",
+            max_model_len=4096,
+            max_num_seqs=50,
+            dtype="bfloat16",
+            gpu_memory_utilization=0.9,
+            async_scheduling=True,
+    ) as vllm_model:
+        vllm_model.generate(prompts, sampling_params=sampling_params)
