Skip to content

Commit 098e3d1

Browse files
authored
Enable All Tests on TPUv7 (#1279)
Signed-off-by: Qiliang Cui <[email protected]>
1 parent 2a10dab commit 098e3d1

File tree

10 files changed

+335
-21
lines changed

10 files changed

+335
-21
lines changed

.buildkite/pipeline_jax.yml

Lines changed: 1 addition & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -124,22 +124,6 @@ steps:
124124
--ignore=/workspace/tpu_inference/tests/layers/vllm/test_compressed_tensors_moe.py \
125125
--cov-config=/workspace/tpu_inference/.coveragerc --cov tpu_inference --cov-report term-missing --cov-fail-under=69
126126
127-
- label: "JAX unit tests - tpuv7x"
128-
key: test_7_tpu7x
129-
soft_fail: true
130-
agents:
131-
queue: tpu_v7x_2_queue
132-
commands:
133-
- |
134-
IS_FOR_V7X=true .buildkite/scripts/run_in_docker.sh \
135-
python3 -m pytest -s -v -x /workspace/tpu_inference/tests/ \
136-
--ignore=/workspace/tpu_inference/tests/kernels \
137-
--ignore=/workspace/tpu_inference/tests/lora \
138-
--ignore=/workspace/tpu_inference/tests/e2e \
139-
--ignore=/workspace/tpu_inference/tpu_inference/mock \
140-
--ignore=/workspace/tpu_inference/tests/layers/vllm/test_compressed_tensors_moe.py \
141-
--cov-config=/workspace/tpu_inference/.coveragerc --cov tpu_inference --cov-report term-missing --cov-fail-under=67
142-
143127
- label: "JAX unit tests - kernels"
144128
key: test_8
145129
soft_fail: true
@@ -285,7 +269,6 @@ steps:
285269
- test_5
286270
- test_6
287271
- test_7
288-
- test_7_tpu7x
289272
- test_8
290273
- test_9
291274
- test_10
@@ -299,4 +282,4 @@ steps:
299282
commands:
300283
- |
301284
.buildkite/scripts/check_results.sh \
302-
"TPU JAX Tests Failed" test_0 test_1 test_2 test_3 test_4 test_5 test_6 test_7 test_7_tpu7x test_8 test_9 test_10 test_11 test_12 test_13 test_15 test_16
285+
"TPU JAX Tests Failed" test_0 test_1 test_2 test_3 test_4 test_5 test_6 test_7 test_8 test_9 test_10 test_11 test_12 test_13 test_15 test_16

.buildkite/pipeline_jax_tpu7x.yml

Lines changed: 311 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,311 @@
1+
# Build-level alerts. Both targets fire only when a build on the main branch
# ends in the "failed" state.
notify:
  - if: build.state == "failed" && build.branch == "main"
    email: "[email protected]"
  - if: build.state == "failed" && build.branch == "main"
    slack: "vllm#tpu-ci-notifications"
6+
7+
steps:
8+
# -----------------------------------------------------------------
9+
# TEST STEPS - Calling wrapper
10+
# -----------------------------------------------------------------
11+
# MLPerf end-to-end benchmark for the JAX models. This step has no NIGHTLY
# gate, so it runs on every build that uploads this pipeline.
- label: "TPU7x E2E MLPerf tests for JAX models"
  key: tpu7x_test_0
  agents:
    queue: tpu_v7x_2_queue
  env:
    IS_FOR_V7X: "true"
  # A failure here is reported but does not fail the whole build.
  soft_fail: true
  commands:
    - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_inference/tests/e2e/benchmarking/mlperf.sh
20+
21+
# MLPerf end-to-end benchmark with QUANTIZATION enabled. Nightly only.
- label: "TPU7x E2E MLPerf tests for JAX models with quantization"
  key: tpu7x_test_1
  agents:
    queue: tpu_v7x_2_queue
  env:
    IS_FOR_V7X: "true"
    QUANTIZATION: "True"
  soft_fail: true
  # The doubled dollar sign defers expansion of NIGHTLY to run time instead of
  # pipeline-upload time.
  commands:
    - |
      if [[ "$$NIGHTLY" != "1" ]]; then
        echo "Skipping: NIGHTLY environment variable not set"
        exit 0
      fi
      .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_inference/tests/e2e/benchmarking/mlperf.sh
37+
38+
# MLPerf end-to-end benchmark with NEW_MODEL_DESIGN switched on. Nightly only.
- label: "TPU7x E2E MLPerf tests for JAX new models"
  key: tpu7x_test_2
  agents:
    queue: tpu_v7x_2_queue
  env:
    IS_FOR_V7X: "true"
    NEW_MODEL_DESIGN: "1"
  soft_fail: true
  # The doubled dollar sign defers expansion of NIGHTLY to run time instead of
  # pipeline-upload time.
  commands:
    - |
      if [[ "$$NIGHTLY" != "1" ]]; then
        echo "Skipping: NIGHTLY environment variable not set"
        exit 0
      fi
      .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_inference/tests/e2e/benchmarking/mlperf.sh
54+
55+
# MLPerf end-to-end benchmark with MODEL_IMPL_TYPE=vllm on the 2-chip queue.
# This step has no NIGHTLY gate, so it runs on every build.
- label: "TPU7x E2E MLPerf tests for JAX + vLLM models on single chip"
  key: tpu7x_test_3
  agents:
    queue: tpu_v7x_2_queue
  env:
    IS_FOR_V7X: "true"
    MODEL_IMPL_TYPE: "vllm"
  soft_fail: true
  commands:
    - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_inference/tests/e2e/benchmarking/mlperf.sh
65+
66+
# MLPerf end-to-end benchmark for Llama4 on the 8-chip queue. Nightly only.
- label: "TPU7x E2E MLperf tests for Llama4 models"
  key: tpu7x_test_4
  agents:
    queue: tpu_v7x_8_queue
  env:
    IS_FOR_V7X: "true"
    NEW_MODEL_DESIGN: "1"
    # NOTE(review): USE_V6E8_QUEUE is set although the agent queue is
    # tpu_v7x_8_queue - confirm the consuming script treats it as a generic
    # multi-chip switch.
    USE_V6E8_QUEUE: "True"
  soft_fail: true
  # The doubled dollar sign defers expansion of NIGHTLY to run time instead of
  # pipeline-upload time.
  commands:
    - |
      if [[ "$$NIGHTLY" != "1" ]]; then
        echo "Skipping: NIGHTLY environment variable not set"
        exit 0
      fi
      .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_inference/tests/e2e/benchmarking/mlperf.sh
83+
84+
85+
# Multi-modal end-to-end check: the pytest module runs first and, only if it
# passes, the mm_bench.sh benchmark follows. Nightly only.
- label: "TPU7x E2E multi modality test"
  key: tpu7x_test_5
  agents:
    queue: tpu_v7x_2_queue
  env:
    IS_FOR_V7X: "true"
  soft_fail: true
  # The doubled dollar sign defers expansion of NIGHTLY to run time instead of
  # pipeline-upload time.
  commands:
    - |
      if [[ "$$NIGHTLY" != "1" ]]; then
        echo "Skipping: NIGHTLY environment variable not set"
        exit 0
      fi
      .buildkite/scripts/run_in_docker.sh \
        bash -c 'python3 -m pytest -s -v -x /workspace/tpu_inference/tests/e2e/test_multi_modal_inference.py && \
        bash /workspace/tpu_inference/tests/e2e/benchmarking/mm_bench.sh'
102+
103+
# Speculative-decoding end-to-end pytest module. Nightly only.
- label: "TPU7x E2E speculative decoding test"
  key: tpu7x_test_6
  agents:
    queue: tpu_v7x_2_queue
  env:
    IS_FOR_V7X: "true"
  soft_fail: true
  # The doubled dollar sign defers expansion of NIGHTLY to run time instead of
  # pipeline-upload time.
  commands:
    - |
      if [[ "$$NIGHTLY" != "1" ]]; then
        echo "Skipping: NIGHTLY environment variable not set"
        exit 0
      fi
      .buildkite/scripts/run_in_docker.sh \
        bash -c 'python3 -m pytest -s -v -x /workspace/tpu_inference/tests/e2e/test_speculative_decoding.py'
119+
120+
# Unit-test sweep over tests/, excluding the kernel, lora and e2e suites (each
# has its own step), the mock package and the compressed-tensors MoE test.
# Runs on every build and enforces a 67% coverage floor.
- label: "TPU7x JAX unit tests"
  key: tpu7x_test_7
  soft_fail: true
  env:
    # IS_FOR_V7X is supplied here for the whole job, so the command below no
    # longer repeats it as an inline prefix (matching every other step).
    IS_FOR_V7X: "true"
  agents:
    queue: tpu_v7x_2_queue
  commands:
    - |
      .buildkite/scripts/run_in_docker.sh \
        python3 -m pytest -s -v -x /workspace/tpu_inference/tests/ \
        --ignore=/workspace/tpu_inference/tests/kernels \
        --ignore=/workspace/tpu_inference/tests/lora \
        --ignore=/workspace/tpu_inference/tests/e2e \
        --ignore=/workspace/tpu_inference/tpu_inference/mock \
        --ignore=/workspace/tpu_inference/tests/layers/vllm/test_compressed_tensors_moe.py \
        --cov-config=/workspace/tpu_inference/.coveragerc --cov tpu_inference --cov-report term-missing --cov-fail-under=67
137+
138+
# Kernel unit tests. Runs on nightly builds, or when the most recent commit
# touches tpu_inference/kernels, tests/kernels or requirements.txt; otherwise
# the step prints a skip message and exits successfully.
- label: "TPU7x JAX unit tests - kernels"
  key: tpu7x_test_8
  agents:
    queue: tpu_v7x_2_queue
  env:
    IS_FOR_V7X: "true"
  soft_fail: true
  # The doubled dollar sign defers expansion of NIGHTLY to run time instead of
  # pipeline-upload time. The collectives directory is ignored here because a
  # separate step on the 8-chip queue runs it.
  commands:
    - |
      if [[ "$$NIGHTLY" == "1" ]] || git diff --name-only HEAD~1 | grep -qE '^(tpu_inference/kernels|tests/kernels|requirements\.txt)'; then
        .buildkite/scripts/run_in_docker.sh \
          python3 -m pytest -s -v -x /workspace/tpu_inference/tests/kernels \
          --ignore=/workspace/tpu_inference/tests/kernels/ragged_paged_attention_kernel_v2_test.py \
          --ignore=/workspace/tpu_inference/tests/kernels/ragged_kv_cache_update_v2_test.py \
          --ignore=/workspace/tpu_inference/tests/kernels/collectives \
          --ignore=/workspace/tpu_inference/tests/kernels/fused_moe_v1_test.py
      else
        echo "Skipping: no changes detected in kernels, tests/kernels, or requirements.txt"
        exit 0
      fi
158+
159+
# Collective-kernel unit tests on the 8-chip queue. Runs on nightly builds, or
# when the most recent commit touches the collectives sources, their tests or
# requirements.txt; otherwise the step prints a skip message and exits 0.
- label: "TPU7x JAX unit tests - collective kernels"
  key: tpu7x_test_9
  agents:
    queue: tpu_v7x_8_queue
  env:
    IS_FOR_V7X: "true"
  soft_fail: true
  # The doubled dollar sign defers expansion of NIGHTLY to run time instead of
  # pipeline-upload time.
  commands:
    - |
      if [[ "$$NIGHTLY" == "1" ]] || git diff --name-only HEAD~1 | grep -qE '^(tpu_inference/kernels/collectives|tests/kernels/collectives|requirements\.txt)'; then
        .buildkite/scripts/run_in_docker.sh \
          python3 -m pytest -s -v -x /workspace/tpu_inference/tests/kernels/collectives
      else
        echo "Skipping: no changes detected in kernels/collectives, tests/kernels/collectives, or requirements.txt"
        exit 0
      fi
175+
176+
# LoRA end-to-end test on the 2-chip queue, run with MODEL_IMPL_TYPE=vllm and
# TPU_BACKEND_TYPE=jax set for the pytest process. Nightly only.
- label: "TPU7x lora e2e tests for JAX + vLLM models single chip"
  key: tpu7x_test_10
  agents:
    queue: tpu_v7x_2_queue
  env:
    IS_FOR_V7X: "true"
  soft_fail: true
  # The doubled dollar sign defers expansion of NIGHTLY to run time instead of
  # pipeline-upload time.
  commands:
    - |
      if [[ "$$NIGHTLY" != "1" ]]; then
        echo "Skipping: NIGHTLY environment variable not set"
        exit 0
      fi
      .buildkite/scripts/run_in_docker.sh \
        bash -c 'MODEL_IMPL_TYPE=vllm TPU_BACKEND_TYPE=jax python3 -m pytest -s -v -x /workspace/tpu_inference/tests/lora/test_lora.py'
192+
193+
# MLPerf end-to-end benchmark with MODEL_IMPL_TYPE=vllm on the 8-chip queue.
# Nightly only.
- label: "TPU7x E2E MLPerf tests for JAX + vLLM models on multiple chips"
  key: tpu7x_test_11
  agents:
    queue: tpu_v7x_8_queue
  env:
    IS_FOR_V7X: "true"
    MODEL_IMPL_TYPE: "vllm"
  soft_fail: true
  # The doubled dollar sign defers expansion of NIGHTLY to run time instead of
  # pipeline-upload time.
  commands:
    - |
      if [[ "$$NIGHTLY" != "1" ]]; then
        echo "Skipping: NIGHTLY environment variable not set"
        exit 0
      fi
      .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_inference/tests/e2e/benchmarking/mlperf.sh
210+
211+
# MLPerf run for DeepSeek-R1 using dummy weights, with accuracy tests skipped
# and VLLM_MLA_DISABLE set. Runs on the 8-chip queue. Nightly only.
- label: "TPU7x E2E MLperf tests for DeepSeek-R1 (no accuracy, 12-decoder layers only)"
  key: tpu7x_test_12
  agents:
    queue: tpu_v7x_8_queue
  env:
    IS_FOR_V7X: "true"
    NEW_MODEL_DESIGN: "1"
    SKIP_ACCURACY_TESTS: "True"
    # NOTE(review): USE_V6E8_QUEUE is set although the agent queue is
    # tpu_v7x_8_queue - confirm the consuming script treats it as a generic
    # multi-chip switch.
    USE_V6E8_QUEUE: "True"
    VLLM_MLA_DISABLE: "1"
  soft_fail: true
  # The doubled dollar sign defers expansion of NIGHTLY to run time instead of
  # pipeline-upload time.
  commands:
    - |
      if [[ "$$NIGHTLY" != "1" ]]; then
        echo "Skipping: NIGHTLY environment variable not set"
        exit 0
      fi
      .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_inference/tests/e2e/benchmarking/mlperf.sh -m deepseek-ai/DeepSeek-R1-0528 --use-dummy-weights
230+
231+
# LoRA end-to-end test on the 8-chip queue, run with MODEL_IMPL_TYPE=vllm and
# TPU_BACKEND_TYPE=jax set for the pytest process. Nightly only.
- label: "TPU7x lora e2e tests for JAX + vLLM models multi chips"
  key: tpu7x_test_13
  agents:
    queue: tpu_v7x_8_queue
  env:
    IS_FOR_V7X: "true"
    # NOTE(review): USE_V6E8_QUEUE is set although the agent queue is
    # tpu_v7x_8_queue - confirm the consuming script treats it as a generic
    # multi-chip switch.
    USE_V6E8_QUEUE: "True"
    VLLM_LOG_LEVEL: "INFO"
  soft_fail: true
  # The doubled dollar sign defers expansion of NIGHTLY to run time instead of
  # pipeline-upload time.
  commands:
    - |
      if [[ "$$NIGHTLY" != "1" ]]; then
        echo "Skipping: NIGHTLY environment variable not set"
        exit 0
      fi
      .buildkite/scripts/run_in_docker.sh \
        bash -c 'MODEL_IMPL_TYPE=vllm TPU_BACKEND_TYPE=jax python3 -m pytest -s -v -x /workspace/tpu_inference/tests/lora/test_lora.py'
249+
250+
251+
# LoRA unit tests on the 2-chip queue: test_bgmv.py runs first and
# test_layers.py runs only if it passes. This step has no NIGHTLY gate, so it
# runs on every build.
- label: "TPU7x lora unit tests on single chip"
  key: tpu7x_test_15
  agents:
    queue: tpu_v7x_2_queue
  env:
    IS_FOR_V7X: "true"
  soft_fail: true
  commands:
    - |
      .buildkite/scripts/run_in_docker.sh \
        bash -c ' python3 -m pytest -s -v -x /workspace/tpu_inference/tests/lora/test_bgmv.py && \
        python3 -m pytest -s -v -x /workspace/tpu_inference/tests/lora/test_layers.py'
263+
264+
# LoRA layer unit tests on the 8-chip queue. Nightly only.
- label: "TPU7x lora unit tests on multi chips"
  key: tpu7x_test_16
  soft_fail: true
  env:
    # NOTE(review): USE_V6E8_QUEUE is set although the agent queue is
    # tpu_v7x_8_queue - confirm the consuming script treats it as a generic
    # multi-chip switch.
    USE_V6E8_QUEUE: "True"
    VLLM_LOG_LEVEL: "INFO"
    # Added for consistency: every other TPU7x step exports this flag to the
    # docker wrapper, and this step targets a v7x queue as well.
    IS_FOR_V7X: "true"
  agents:
    queue: tpu_v7x_8_queue
  # The doubled dollar sign defers expansion of NIGHTLY to run time instead of
  # pipeline-upload time.
  commands:
    - |
      if [[ "$$NIGHTLY" == "1" ]]; then
        .buildkite/scripts/run_in_docker.sh \
          bash -c 'python3 -m pytest -s -v -x /workspace/tpu_inference/tests/lora/test_layers.py'
      else
        echo "Skipping: NIGHTLY environment variable not set"
        exit 0
      fi
281+
# -----------------------------------------------------------------
282+
# NOTIFICATION STEP
283+
# -----------------------------------------------------------------
284+
# Aggregation step: waits for every TPU7x test step, then hands their keys to
# check_results.sh on a CPU agent. There is no tpu7x_test_14 step, so that key
# is absent from both lists below.
- label: "TPU7x Test Notification"
  key: tpu7x_test_notification
  depends_on:
    - tpu7x_test_0
    - tpu7x_test_1
    - tpu7x_test_2
    - tpu7x_test_3
    - tpu7x_test_4
    - tpu7x_test_5
    - tpu7x_test_6
    - tpu7x_test_7
    - tpu7x_test_8
    - tpu7x_test_9
    - tpu7x_test_10
    - tpu7x_test_11
    - tpu7x_test_12
    - tpu7x_test_13
    - tpu7x_test_15
    - tpu7x_test_16
  agents:
    queue: cpu
  commands:
    # The first argument is now TPU7x-specific, so it no longer duplicates the
    # text passed by the non-v7x pipeline's notification step.
    # NOTE(review): presumably check_results.sh uses this argument as the
    # failure message - confirm nothing matches on the previous wording.
    - |
      .buildkite/scripts/check_results.sh \
        "TPU7x JAX Tests Failed" tpu7x_test_0 tpu7x_test_1 tpu7x_test_2 \
        tpu7x_test_3 tpu7x_test_4 tpu7x_test_5 tpu7x_test_6 tpu7x_test_7 \
        tpu7x_test_8 tpu7x_test_9 tpu7x_test_10 tpu7x_test_11 tpu7x_test_12 \
        tpu7x_test_13 tpu7x_test_15 tpu7x_test_16

.buildkite/scripts/bootstrap.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ upload_pipeline() {
2525
buildkite-agent meta-data set "VLLM_COMMIT_HASH" "${VLLM_COMMIT_HASH}"
2626
echo "Using vllm commit hash: $(buildkite-agent meta-data get "VLLM_COMMIT_HASH")"
2727
buildkite-agent pipeline upload .buildkite/pipeline_jax.yml
28+
buildkite-agent pipeline upload .buildkite/pipeline_jax_tpu7x.yml
2829
# buildkite-agent pipeline upload .buildkite/pipeline_torch.yml
2930
buildkite-agent pipeline upload .buildkite/main.yml
3031
buildkite-agent pipeline upload .buildkite/nightly_releases.yml

tests/layers/jax/sample/test_rejection_sampler.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1181,6 +1181,7 @@ def test_rejection_sampling_approximates_target_distribution(self):
11811181
We expect that as sample size increases, the distance to the target
11821182
distribution decreases much more than the distance to random distributions.
11831183
"""
1184+
# TODO(Qiliang Cui): Remove when issue is resolved.
11841185
if 'TPU7x' in jax.devices()[0].device_kind:
11851186
pytest.skip("Skipping test on TPU TPU7x.")
11861187

tests/layers/vllm/test_compressed_tensors_w8a8_int8.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,8 @@ def test_loading_model(model, mesh):
129129
])
130130
@pytest.mark.parametrize("enable_sp", [False, True])
131131
def test_row_parallel_linear(model, bias, mesh, enable_sp):
132+
133+
# TODO(Qiliang Cui): Remove when issue is resolved.
132134
if 'TPU7x' in jax.devices()[0].device_kind:
133135
pytest.skip("Skipping test on TPU TPU7x.")
134136

tests/layers/vllm/test_mxfp4.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,8 @@ def test_quant_override(model, mesh):
116116
@pytest.mark.parametrize("topk", [2])
117117
def test_mxfp4_fused_moe(mesh, num_tokens, intermediate_size, hidden_size,
118118
num_experts, topk):
119+
120+
# TODO(Qiliang Cui): Remove when issue is resolved.
119121
if 'TPU7x' in jax.devices()[0].device_kind:
120122
pytest.skip("Skipping test on TPU TPU7x.")
121123

@@ -209,6 +211,7 @@ def test_mxfp4_fused_moe(mesh, num_tokens, intermediate_size, hidden_size,
209211
def test_mxfp4_fused_moe_use_kernel(mesh, num_tokens, intermediate_size,
210212
hidden_size, num_experts, topk):
211213

214+
# TODO(Qiliang Cui): Remove when issue is resolved.
212215
if 'TPU7x' in jax.devices()[0].device_kind:
213216
pytest.skip("Skipping test on TPU TPU7x.")
214217

0 commit comments

Comments
 (0)