
Commit 08c2915

Merge main
Signed-off-by: Jacob Platin <[email protected]>
2 parents: f2d0670 + 142e6e2


54 files changed: +2215 −2718 lines
Lines changed: 45 additions & 0 deletions

@@ -0,0 +1,45 @@
# Optimized Runtime Sampling (top k, top p, temperature, logit output)
# feature support matrix
steps:
- label: "Correctness tests for Optimized Runtime Sampling"
  key: "optimized_runtime_sampling_CorrectnessTest"
  soft_fail: true
  agents:
    queue: tpu_v6e_queue
  commands:
  - |
    buildkite-agent meta-data set "optimized_runtime_sampling_CorrectnessTest" "to be added"
- label: "Record correctness test result for Optimized Runtime Sampling"
  key: "record_optimized_runtime_sampling_CorrectnessTest"
  depends_on: "optimized_runtime_sampling_CorrectnessTest"
  env:
    CI_TARGET: "Optimized Runtime Sampling (top k, top p, temperature, logit output)"
    CI_STAGE: "CorrectnessTest"
    CI_CATEGORY: "feature support matrix"
  agents:
    queue: cpu
  commands:
  - |
    .buildkite/scripts/record_step_result.sh optimized_runtime_sampling_CorrectnessTest

- label: "Performance tests for Optimized Runtime Sampling"
  key: "optimized_runtime_sampling_PerformanceTest"
  depends_on: "record_optimized_runtime_sampling_CorrectnessTest"
  soft_fail: true
  agents:
    queue: tpu_v6e_queue
  commands:
  - |
    buildkite-agent meta-data set "optimized_runtime_sampling_PerformanceTest" "to be added"
- label: "Record performance test result for Optimized Runtime Sampling"
  key: "record_optimized_runtime_sampling_PerformanceTest"
  depends_on: "optimized_runtime_sampling_PerformanceTest"
  env:
    CI_TARGET: "Optimized Runtime Sampling (top k, top p, temperature, logit output)"
    CI_STAGE: "PerformanceTest"
    CI_CATEGORY: "feature support matrix"
  agents:
    queue: cpu
  commands:
  - |
    .buildkite/scripts/record_step_result.sh optimized_runtime_sampling_PerformanceTest
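
Each feature in the matrix follows the same two-step pattern: the test step stores its status with `buildkite-agent meta-data set`, and a follow-up record step on the cpu queue reads that status back and writes it into the feature support matrix. The real recorder is .buildkite/scripts/record_step_result.sh, which is not part of this diff; the Python sketch below is purely illustrative of that flow, and the CSV path and update logic are assumptions rather than the script's actual behaviour.

    # Hypothetical sketch only -- NOT the real .buildkite/scripts/record_step_result.sh.
    # Assumes the test step already ran `buildkite-agent meta-data set <key> <value>`.
    import csv
    import os
    import subprocess
    import sys

    def record_step_result(step_key: str,
                           matrix_path: str = "support_matrices/feature_support_matrix.csv"):
        feature = os.environ["CI_TARGET"]  # row label in the matrix, set in the step's env
        stage = os.environ["CI_STAGE"]     # "CorrectnessTest" or "PerformanceTest"

        # `buildkite-agent meta-data get` reads back the value stored by the test step.
        value = subprocess.run(["buildkite-agent", "meta-data", "get", step_key],
                               capture_output=True, text=True, check=True).stdout.strip()

        with open(matrix_path, newline="") as f:
            rows = list(csv.DictReader(f))
        for row in rows:
            if row["Feature"] == feature:
                row[stage] = value  # update the matching cell for this stage

        with open(matrix_path, "w", newline="") as f:
            writer = csv.DictWriter(
                f, fieldnames=["Feature", "CorrectnessTest", "PerformanceTest"])
            writer.writeheader()
            writer.writerows(rows)

    if __name__ == "__main__":
        record_step_result(sys.argv[1])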
Lines changed: 45 additions & 0 deletions

@@ -0,0 +1,45 @@
# Out-of-tree model support
# feature support matrix
steps:
- label: "Correctness tests for Out-of-tree model support"
  key: "out_of_tree_model_support_CorrectnessTest"
  soft_fail: true
  agents:
    queue: tpu_v6e_queue
  commands:
  - |
    buildkite-agent meta-data set "out_of_tree_model_support_CorrectnessTest" "to be added"
- label: "Record correctness test result for Out-of-tree model support"
  key: "record_out_of_tree_model_support_CorrectnessTest"
  depends_on: "out_of_tree_model_support_CorrectnessTest"
  env:
    CI_TARGET: "Out-of-tree model support"
    CI_STAGE: "CorrectnessTest"
    CI_CATEGORY: "feature support matrix"
  agents:
    queue: cpu
  commands:
  - |
    .buildkite/scripts/record_step_result.sh out_of_tree_model_support_CorrectnessTest

- label: "Performance tests for Out-of-tree model support"
  key: "out_of_tree_model_support_PerformanceTest"
  depends_on: "record_out_of_tree_model_support_CorrectnessTest"
  soft_fail: true
  agents:
    queue: tpu_v6e_queue
  commands:
  - |
    buildkite-agent meta-data set "out_of_tree_model_support_PerformanceTest" "to be added"
- label: "Record performance test result for Out-of-tree model support"
  key: "record_out_of_tree_model_support_PerformanceTest"
  depends_on: "out_of_tree_model_support_PerformanceTest"
  env:
    CI_TARGET: "Out-of-tree model support"
    CI_STAGE: "PerformanceTest"
    CI_CATEGORY: "feature support matrix"
  agents:
    queue: cpu
  commands:
  - |
    .buildkite/scripts/record_step_result.sh out_of_tree_model_support_PerformanceTest
Lines changed: 26 additions & 0 deletions

@@ -0,0 +1,26 @@
# runai_model_streamer_loader
# The RunAI Model Streamer is a high-performance model loader that serves as an
# alternative to the default Hugging Face loader. Instead of downloading a model
# to local disk, it streams the weights from object storage (like GCS) into
# GPU memory. This streaming process is significantly faster than the traditional
# disk-based loading method.
steps:
- label: "Correctness tests for runai_model_streamer_loader"
  key: "runai_model_streamer_loader_CorrectnessTest"
  soft_fail: true
  agents:
    queue: tpu_v6e_queue
  commands:
  - .buildkite/scripts/run_in_docker.sh python3 -m pytest -s -v /workspace/tpu_inference/tests/e2e/test_runai_model_streamer_loader.py::test_correctness
- label: "Record correctness test result for runai_model_streamer_loader"
  key: "record_runai_model_streamer_loader_CorrectnessTest"
  depends_on: "runai_model_streamer_loader_CorrectnessTest"
  env:
    CI_TARGET: "runai_model_streamer_loader"
    CI_STAGE: "CorrectnessTest"
    CI_CATEGORY: "feature support matrix"
  agents:
    queue: cpu
  commands:
  - |
    .buildkite/scripts/record_step_result.sh runai_model_streamer_loader_CorrectnessTest
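
For context on how the streamer is exercised, the e2e test added later in this commit drives it through vLLM's `load_format="runai_streamer"` option. A minimal usage sketch, with the GCS path and generation settings taken from that test rather than from separate documentation:

    from vllm import LLM, SamplingParams

    # Stream weights straight from object storage instead of downloading to local disk.
    llm = LLM(model="gs://vertex-model-garden-public-us/llama3/llama3-8b-hf",
              load_format="runai_streamer",
              max_model_len=128)
    out = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0, max_tokens=10))
    print(out[0].outputs[0].text)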

docs/recommended_models_features.md

Lines changed: 2 additions & 0 deletions

@@ -22,3 +22,5 @@ These tables show the models currently tested for accuracy and performance.
 ## Recommended Features
 
 This table shows the features currently tested for accuracy and performance.
+
+{{ read_csv('../support_matrices/feature_support_matrix.csv', keep_default_na=False) }}

requirements.txt

Lines changed: 1 addition & 0 deletions

@@ -15,3 +15,4 @@ torchvision==0.24.0
 pathwaysutils
 parameterized
 numba==0.62.1
+runai-model-streamer[s3,gcs]==0.15.0
Lines changed: 5 additions & 2 deletions

@@ -1,8 +1,11 @@
 Feature,CorrectnessTest,PerformanceTest
 "Chunked Prefill",✅,✅
 "Multimodal Inputs",✅,✅
+"Optimized Runtime Sampling (top k, top p, temperature, logit output)",to be added,to be added
+"Out-of-tree model support",to be added,to be added
 "Prefix Caching",✅,✅
 "Single Program Multi Data",✅,✅
-"Speculative Decoding: Ngram",✅,
+"Speculative Decoding: Ngram",✅,
 "Structured Decoding",✅,to be added
-"async scheduler",❌,N/A
+"async scheduler",✅,✅
+"runai_model_streamer_loader",❌,N/A
Lines changed: 1 addition & 1 deletion

@@ -1,2 +1,2 @@
 Model,UnitTest,IntegrationTest,Benchmark
-"Qwen/Qwen2.5-VL-7B-Instruct",✅,✅,
+"Qwen/Qwen2.5-VL-7B-Instruct",✅,✅,

support_matrices/nightly/text_only_model_support_matrix.csv

Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 Model,UnitTest,IntegrationTest,Benchmark
 "meta-llama/Llama-3.3-70B-Instruct",✅,✅,✅
 "Qwen/Qwen3-4B",✅,✅,✅
-"google/gemma-3-27b-it",✅,❌,N/A
+"google/gemma-3-27b-it",✅,✅,❌
 "Qwen/Qwen3-32B",✅,✅,✅
 "meta-llama/Llama-Guard-4-12B",N/A,N/A,N/A
 "meta-llama/Llama-3.1-8B-Instruct",✅,✅,❌

tests/e2e/test_data_parallel.py

Lines changed: 4 additions & 0 deletions

@@ -81,9 +81,11 @@ def _run_inference_with_config(model_name: str,
     time.sleep(5)
 
 
+@pytest.mark.parametrize("model_impl_type", ["vllm", "flax_nnx"])
 def test_model_data_parallelism(
     test_prompts: list,
     sampling_params: SamplingParams,
+    model_impl_type: str,
 ):
     """
     Test model-wise data parallelism where data=2 in the mesh axis.
@@ -95,6 +97,7 @@ def test_model_data_parallelism(
     """
     # Use Llama 1B for this test
     test_model = "meta-llama/Llama-3.2-1B-Instruct"
+    os.environ['MODEL_IMPL_TYPE'] = model_impl_type
 
     # Test with data parallelism enabled
     outputs = _run_inference_with_config(
@@ -103,6 +106,7 @@ def test_model_data_parallelism(
         sampling_params=sampling_params,
         tensor_parallel_size=1,
         data_parallel_size=2,
+        async_scheduling=True,
     )
 
     # Verify we got outputs for all prompts
Lines changed: 90 additions & 0 deletions

@@ -0,0 +1,90 @@
# This file contains end-to-end tests for the RunAI Model Streamer loader.
#
# The RunAI Model Streamer is a high-performance model loader that serves as an
# alternative to the default Hugging Face loader. Instead of downloading a model
# to local disk, it streams the weights from object storage (like GCS) into
# GPU memory. This streaming process is significantly faster than the
# traditional disk-based loading method.

# The tests in this file verify that loading model weights using the
# streamer produces the same results as loading the same model using the
# standard Hugging Face loader. This ensures the correctness of the streamer
# integration.

# The tests are performed by:
# 1. Loading a model from Google Cloud Storage using the `runai_streamer` format.
# 2. Generating output with this model.
# 3. Loading the same model from Hugging Face using the default loader.
# 4. Generating output with this second model.
# 5. Asserting that the outputs from both models are identical.

from __future__ import annotations

import time

import pytest
from vllm import LLM, SamplingParams


@pytest.fixture
def sampling_config():
    return SamplingParams(temperature=0, max_tokens=10, ignore_eos=True)


@pytest.fixture
# TODO(amacaskill): Replace with GKE owned GCS bucket.
def gcs_model_name():
    return "gs://vertex-model-garden-public-us/llama3/llama3-8b-hf"


@pytest.fixture
def hf_model_name():
    return "meta-llama/Meta-Llama-3-8B"


@pytest.fixture
def prompt():
    return "Hello, my name is"


def test_correctness(
    sampling_config: SamplingParams,
    gcs_model_name: str,
    hf_model_name: str,
    prompt: str,
    monkeypatch: pytest.MonkeyPatch,
):
    '''
    Compare the outputs of a model loaded from GCS via runai_model_streamer
    and a model loaded from Hugging Face. The outputs should be the same.
    These tests attempt to use tensor_parallel_size=1. The model is 16GB,
    and v6e has 32GB of HBM, so it will fit.
    '''
    # Set ENV variables so that runai_model_streamer uses anonymous GCS access.
    monkeypatch.setenv("GOOGLE_CLOUD_PROJECT", "fake-project")
    monkeypatch.setenv("RUNAI_STREAMER_GCS_USE_ANONYMOUS_CREDENTIALS", "true")
    monkeypatch.setenv("CLOUD_STORAGE_EMULATOR_ENDPOINT",
                       "https://storage.googleapis.com")
    gcs_llm = LLM(model=gcs_model_name,
                  load_format="runai_streamer",
                  max_model_len=128,
                  max_num_seqs=16,
                  max_num_batched_tokens=256)
    gcs_outputs = gcs_llm.generate([prompt], sampling_config)
    gcs_output_text = gcs_outputs[0].outputs[0].text
    del gcs_llm
    time.sleep(10)  # Wait for TPUs to be released

    # Test with Hugging Face model
    hf_llm = LLM(model=hf_model_name,
                 max_model_len=128,
                 max_num_seqs=16,
                 max_num_batched_tokens=256)
    hf_outputs = hf_llm.generate([prompt], sampling_config)
    hf_output_text = hf_outputs[0].outputs[0].text
    del hf_llm
    time.sleep(10)  # Wait for TPUs to be released

    assert gcs_output_text == hf_output_text, (
        f"Outputs do not match! "
        f"GCS output: {gcs_output_text}, HF output: {hf_output_text}")
