vllm-project
diff --git a/‎.buildkite/features/Optimized_Runtime_Sampling.yml‎
Lines changed: 45 additions & 0 deletions b/‎.buildkite/features/Optimized_Runtime_Sampling.yml‎
Lines changed: 45 additions & 0 deletions
diff --git a/‎.buildkite/features/Out_Of_Tree_Model_Support.yml‎
Lines changed: 45 additions & 0 deletions b/‎.buildkite/features/Out_Of_Tree_Model_Support.yml‎
Lines changed: 45 additions & 0 deletions
diff --git a/‎.buildkite/features/runai_model_streamer_loader.yml‎
Lines changed: 26 additions & 0 deletions b/‎.buildkite/features/runai_model_streamer_loader.yml‎
Lines changed: 26 additions & 0 deletions
diff --git a/‎docs/recommended_models_features.md‎
Lines changed: 2 additions & 0 deletions b/‎docs/recommended_models_features.md‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎requirements.txt‎
Lines changed: 1 addition & 0 deletions b/‎requirements.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎support_matrices/nightly/feature_support_matrix.csv‎
Lines changed: 5 additions & 2 deletions b/‎support_matrices/nightly/feature_support_matrix.csv‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎support_matrices/nightly/multimodal_model_support_matrix.csv‎
Lines changed: 1 addition & 1 deletion b/‎support_matrices/nightly/multimodal_model_support_matrix.csv‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎support_matrices/nightly/text_only_model_support_matrix.csv‎
Lines changed: 1 addition & 1 deletion b/‎support_matrices/nightly/text_only_model_support_matrix.csv‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎tests/e2e/test_data_parallel.py‎
Lines changed: 4 additions & 0 deletions b/‎tests/e2e/test_data_parallel.py‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎tests/e2e/test_runai_model_streamer_loader.py‎
Lines changed: 90 additions & 0 deletions b/‎tests/e2e/test_runai_model_streamer_loader.py‎
Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,45 @@
+# Optimized Runtime Sampling (top k, top p, temperature, logit output)
+# feature support matrix
+steps:
+  - label: "Correctness tests for Optimized Runtime Sampling"
+    key: "optimized_runtime_sampling_CorrectnessTest"
+    soft_fail: true  # a failure of this step does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: no test is run yet, the step only stores "to be added" as meta-data
+      - |
+        buildkite-agent meta-data set "optimized_runtime_sampling_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for Optimized Runtime Sampling"
+    key: "record_optimized_runtime_sampling_CorrectnessTest"
+    depends_on: "optimized_runtime_sampling_CorrectnessTest"  # waits for the correctness step
+    env:  # presumably read by record_step_result.sh to pick the matrix row/column - TODO confirm
+      CI_TARGET: "Optimized Runtime Sampling (top k, top p, temperature, logit output)"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "feature support matrix"
+    agents:
+      queue: cpu
+    commands:  # the argument is the key of the step whose result is recorded
+      - |
+        .buildkite/scripts/record_step_result.sh optimized_runtime_sampling_CorrectnessTest
+
+  - label: "Performance tests for Optimized Runtime Sampling"
+    key: "optimized_runtime_sampling_PerformanceTest"
+    depends_on: "record_optimized_runtime_sampling_CorrectnessTest"  # starts after the correctness result is recorded
+    soft_fail: true  # a failure of this step does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: no test is run yet, the step only stores "to be added" as meta-data
+      - |
+        buildkite-agent meta-data set "optimized_runtime_sampling_PerformanceTest" "to be added"
+  - label: "Record performance test result for Optimized Runtime Sampling"
+    key: "record_optimized_runtime_sampling_PerformanceTest"
+    depends_on: "optimized_runtime_sampling_PerformanceTest"  # waits for the performance step
+    env:  # presumably read by record_step_result.sh to pick the matrix row/column - TODO confirm
+      CI_TARGET: "Optimized Runtime Sampling (top k, top p, temperature, logit output)"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "feature support matrix"
+    agents:
+      queue: cpu
+    commands:  # the argument is the key of the step whose result is recorded
+      - |
+        .buildkite/scripts/record_step_result.sh optimized_runtime_sampling_PerformanceTest
@@ -0,0 +1,45 @@
+# Out-of-tree model support
+# feature support matrix
+steps:
+  - label: "Correctness tests for Out-of-tree model support"
+    key: "out_of_tree_model_support_CorrectnessTest"
+    soft_fail: true  # a failure of this step does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: no test is run yet, the step only stores "to be added" as meta-data
+      - |
+        buildkite-agent meta-data set "out_of_tree_model_support_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for Out-of-tree model support"
+    key: "record_out_of_tree_model_support_CorrectnessTest"
+    depends_on: "out_of_tree_model_support_CorrectnessTest"  # waits for the correctness step
+    env:  # presumably read by record_step_result.sh to pick the matrix row/column - TODO confirm
+      CI_TARGET: "Out-of-tree model support"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "feature support matrix"
+    agents:
+      queue: cpu
+    commands:  # the argument is the key of the step whose result is recorded
+      - |
+        .buildkite/scripts/record_step_result.sh out_of_tree_model_support_CorrectnessTest
+
+  - label: "Performance tests for Out-of-tree model support"
+    key: "out_of_tree_model_support_PerformanceTest"
+    depends_on: "record_out_of_tree_model_support_CorrectnessTest"  # starts after the correctness result is recorded
+    soft_fail: true  # a failure of this step does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: no test is run yet, the step only stores "to be added" as meta-data
+      - |
+        buildkite-agent meta-data set "out_of_tree_model_support_PerformanceTest" "to be added"
+  - label: "Record performance test result for Out-of-tree model support"
+    key: "record_out_of_tree_model_support_PerformanceTest"
+    depends_on: "out_of_tree_model_support_PerformanceTest"  # waits for the performance step
+    env:  # presumably read by record_step_result.sh to pick the matrix row/column - TODO confirm
+      CI_TARGET: "Out-of-tree model support"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "feature support matrix"
+    agents:
+      queue: cpu
+    commands:  # the argument is the key of the step whose result is recorded
+      - |
+        .buildkite/scripts/record_step_result.sh out_of_tree_model_support_PerformanceTest
@@ -0,0 +1,26 @@
+# runai_model_streamer_loader
+# The RunAI Model Streamer is a high-performance model loader that serves as an
+# alternative to the default Hugging Face loader. Instead of downloading a model
+# to local disk, it streams the weights from object storage (like GCS) into
+# accelerator memory. This streaming process is significantly faster than the
+# traditional disk-based loading method.
+steps:
+  - label: "Correctness tests for runai_model_streamer_loader"
+    key: "runai_model_streamer_loader_CorrectnessTest"
+    soft_fail: true  # a failure of this step does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # runs only test_correctness from the e2e test file, via run_in_docker.sh
+      - .buildkite/scripts/run_in_docker.sh python3 -m pytest -s -v /workspace/tpu_inference/tests/e2e/test_runai_model_streamer_loader.py::test_correctness
+  - label: "Record correctness test result for runai_model_streamer_loader"
+    key: "record_runai_model_streamer_loader_CorrectnessTest"
+    depends_on: "runai_model_streamer_loader_CorrectnessTest"  # waits for the correctness step
+    env:  # presumably read by record_step_result.sh to pick the matrix row/column - TODO confirm
+      CI_TARGET: "runai_model_streamer_loader"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "feature support matrix"
+    agents:
+      queue: cpu
+    commands:  # the argument is the key of the step whose result is recorded
+      - |
+        .buildkite/scripts/record_step_result.sh runai_model_streamer_loader_CorrectnessTest
@@ -22,3 +22,5 @@ These tables show the models currently tested for accuracy and performance.
 ## Recommended Features
 
 This table shows the features currently tested for accuracy and performance.
+
+{{ read_csv('../support_matrices/feature_support_matrix.csv', keep_default_na=False) }}
@@ -15,3 +15,4 @@ torchvision==0.24.0
 pathwaysutils
 parameterized
 numba==0.62.1
+runai-model-streamer[s3,gcs]==0.15.0
@@ -1,8 +1,11 @@
 Feature,CorrectnessTest,PerformanceTest
 "Chunked Prefill",✅,✅
 "Multimodal Inputs",✅,✅
+"Optimized Runtime Sampling (top k, top p, temperature, logit output)",to be added,to be added
+"Out-of-tree model support",to be added,to be added
 "Prefix Caching",✅,✅
 "Single Program Multi Data",✅,✅
-"Speculative Decoding: Ngram",✅,❌
+"Speculative Decoding: Ngram",✅,✅
 "Structured Decoding",✅,to be added
-"async scheduler",❌,N/A
+"async scheduler",✅,✅
+"runai_model_streamer_loader",❌,N/A
@@ -1,2 +1,2 @@
 Model,UnitTest,IntegrationTest,Benchmark
-"Qwen/Qwen2.5-VL-7B-Instruct",✅,✅,✅
+"Qwen/Qwen2.5-VL-7B-Instruct",✅,✅,❌
@@ -1,7 +1,7 @@
 Model,UnitTest,IntegrationTest,Benchmark
 "meta-llama/Llama-3.3-70B-Instruct",✅,✅,✅
 "Qwen/Qwen3-4B",✅,✅,✅
-"google/gemma-3-27b-it",✅,❌,N/A
+"google/gemma-3-27b-it",✅,✅,❌
 "Qwen/Qwen3-32B",✅,✅,✅
 "meta-llama/Llama-Guard-4-12B",N/A,N/A,N/A
 "meta-llama/Llama-3.1-8B-Instruct",✅,✅,❌
 
@@ -81,9 +81,11 @@ def _run_inference_with_config(model_name: str,
         time.sleep(5)
 
 
+@pytest.mark.parametrize("model_impl_type", ["vllm", "flax_nnx"])
 def test_model_data_parallelism(
     test_prompts: list,
     sampling_params: SamplingParams,
+    model_impl_type: str,
 ):
     """
     Test model-wise data parallelism where data=2 in the mesh axis.
@@ -95,6 +97,7 @@ def test_model_data_parallelism(
     """
     # Use Llama 1B for this test
     test_model = "meta-llama/Llama-3.2-1B-Instruct"
+    os.environ['MODEL_IMPL_TYPE'] = model_impl_type
 
     # Test with data parallelism enabled
     outputs = _run_inference_with_config(
@@ -103,6 +106,7 @@ def test_model_data_parallelism(
         sampling_params=sampling_params,
         tensor_parallel_size=1,
         data_parallel_size=2,
+        async_scheduling=True,
     )
 
     # Verify we got outputs for all prompts
 
@@ -0,0 +1,90 @@
+# This file contains end-to-end tests for the RunAI Model Streamer loader.
+#
+# The RunAI Model Streamer is a high-performance model loader that serves as an
+# alternative to the default Hugging Face loader. Instead of downloading a model
+# to local disk, it streams the weights from object storage (like GCS) into
+# accelerator memory. This streaming process is significantly faster than the
+# traditional disk-based loading method.
+
+# The tests in this file verify that loading model weights using the
+# streamer produces the same results as loading the same model using the
+# standard Hugging Face loader. This ensures the correctness of the streamer
+# integration.
+
+# The tests are performed by:
+# 1. Loading a model from Google Cloud Storage using the `runai_streamer` format.
+# 2. Generating output with this model.
+# 3. Loading the same model from Hugging Face using the default loader.
+# 4. Generating output with this second model.
+# 5. Asserting that the outputs from both models are identical.
+
+from __future__ import annotations
+
+import time
+
+import pytest
+from vllm import LLM, SamplingParams
+
+
+@pytest.fixture
+def sampling_config():
+    return SamplingParams(temperature=0, max_tokens=10, ignore_eos=True)
+
+
+@pytest.fixture
+# TODO(amacaskill): Replace with GKE owned GCS bucket.
+def gcs_model_name():
+    return "gs://vertex-model-garden-public-us/llama3/llama3-8b-hf"
+
+
+@pytest.fixture
+def hf_model_name():
+    return "meta-llama/Meta-Llama-3-8B"
+
+
+@pytest.fixture
+def prompt():
+    return "Hello, my name is"
+
+
+def test_correctness(
+    sampling_config: SamplingParams,
+    gcs_model_name: str,
+    hf_model_name: str,
+    prompt: str,
+    monkeypatch: pytest.MonkeyPatch,
+):
+    '''
+    Compare the outputs of a model loaded from GCS via runai_model_streamer
+    and a model loaded from Hugging Face. The outputs should be the same.
+    These tests attempt to use tensor_parallel_size=1. The model is 16GB,
+    # and v6e has 32GB of HBM, so it will fit.
+    '''
+    # Set ENV variables so that runai_model_streamer uses anonymous GCS access.
+    monkeypatch.setenv("GOOGLE_CLOUD_PROJECT", "fake-project")
+    monkeypatch.setenv("RUNAI_STREAMER_GCS_USE_ANONYMOUS_CREDENTIALS", "true")
+    monkeypatch.setenv("CLOUD_STORAGE_EMULATOR_ENDPOINT",
+                       "https://storage.googleapis.com")
+    gcs_llm = LLM(model=gcs_model_name,
+                  load_format="runai_streamer",
+                  max_model_len=128,
+                  max_num_seqs=16,
+                  max_num_batched_tokens=256)
+    gcs_outputs = gcs_llm.generate([prompt], sampling_config)
+    gcs_output_text = gcs_outputs[0].outputs[0].text
+    del gcs_llm
+    time.sleep(10)  # Wait for TPUs to be released
+
+    # Test with Hugging Face model
+    hf_llm = LLM(model=hf_model_name,
+                 max_model_len=128,
+                 max_num_seqs=16,
+                 max_num_batched_tokens=256)
+    hf_outputs = hf_llm.generate([prompt], sampling_config)
+    hf_output_text = hf_outputs[0].outputs[0].text
+    del hf_llm
+    time.sleep(10)  # Wait for TPUs to be released
+
+    assert gcs_output_text == hf_output_text, (
+        f"Outputs do not match! "
+        f"GCS output: {gcs_output_text}, HF output: {hf_output_text}")
Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`	`1`	`Model,UnitTest,IntegrationTest,Benchmark`
`2`		`-"Qwen/Qwen2.5-VL-7B-Instruct",✅,✅,✅`
	`2`	`+"Qwen/Qwen2.5-VL-7B-Instruct",✅,✅,❌`