
Commit 90dc758

chore: bump vllm to 0.11.0 (#3422)
Signed-off-by: alec-flowers <[email protected]>
1 parent 60975b5 commit 90dc758

File tree

10 files changed (+38, -18 lines)


container/Dockerfile.vllm

Lines changed: 2 additions & 2 deletions
@@ -15,9 +15,9 @@ ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
 ARG CUDA_VERSION="12.8"
 
 # Make sure to update the dependency version in pyproject.toml when updating this
-ARG VLLM_REF="v0.10.2"
+ARG VLLM_REF="v0.11.0"
 # FlashInfer only respected when building vLLM from source, ie when VLLM_REF does not start with 'v' or for arm64 builds
-ARG FLASHINF_REF="v0.3.0"
+ARG FLASHINF_REF="v0.3.1"
 ARG TORCH_BACKEND="cu128"
 
 # If left blank, then we will fallback to vLLM defaults

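The two ARGs above are the build's single source of truth for the engine and FlashInfer versions (the matching dependency pin lives in pyproject.toml, per the comment in the Dockerfile). As a hedged illustration only, not part of this commit, a small Python smoke check could confirm inside the built image that the installed vLLM wheel matches the pinned ref; the expected version string below is an assumption copied from VLLM_REF.

import importlib.metadata

# Assumed pin, mirrored from `ARG VLLM_REF="v0.11.0"` above (hypothetical check).
EXPECTED_VLLM = "0.11.0"

def check_vllm_version(expected: str = EXPECTED_VLLM) -> None:
    """Raise if the image's installed vLLM does not match the Dockerfile pin."""
    installed = importlib.metadata.version("vllm")
    if not installed.startswith(expected):
        raise RuntimeError(
            f"vLLM version mismatch: image has {installed}, Dockerfile pins {expected}"
        )

if __name__ == "__main__":
    check_vllm_version()
    print("vLLM version matches the pinned ref")
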
container/deps/vllm/install_vllm.sh

Lines changed: 5 additions & 7 deletions
@@ -13,7 +13,7 @@
 
 set -euo pipefail
 
-VLLM_REF="v0.10.2"
+VLLM_REF="v0.11.0"
 
 # Basic Configurations
 ARCH=$(uname -m)
@@ -29,7 +29,7 @@ CUDA_VERSION="12.8" # For DEEPGEMM
 # These flags are applicable when installing vLLM from source code
 EDITABLE=true
 VLLM_GIT_URL="https://github.com/vllm-project/vllm.git"
-FLASHINF_REF="v0.3.0"
+FLASHINF_REF="v0.3.1"
 
 while [[ $# -gt 0 ]]; do
     case $1 in
@@ -131,10 +131,8 @@ git clone $VLLM_GIT_URL vllm
 cd vllm
 git checkout $VLLM_REF
 
-# TODO remove in future vLLM release, re-instate ignore torch script
-# https://github.com/vllm-project/vllm/pull/24729
-GIT_COMMITTER_NAME="Container Build" GIT_COMMITTER_EMAIL="[email protected]" git cherry-pick 740f064
-
+# TODO leave this here in case we need to do cherry-picks in future
+# GIT_COMMITTER_NAME="Container Build" GIT_COMMITTER_EMAIL="[email protected]" git cherry-pick 740f064
 
 echo "\n=== Installing vLLM & FlashInfer ==="
 
@@ -243,4 +241,4 @@ echo "\n=== Installing EP Kernels (PPLX and DeepEP) ==="
 cd ep_kernels/
 TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh
 
-echo "\n✅ All installations completed successfully!"
+echo "\n✅ All installations completed successfully!"

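For readers following the script, the source-build step it performs is: clone VLLM_GIT_URL, check out VLLM_REF, then optionally cherry-pick fixes (currently none, hence the commented-out line). A hedged Python sketch of that flow is shown below; the values are taken from the script, but the fetch_vllm_source helper itself is hypothetical and not part of this commit.

import subprocess

# Values mirrored from install_vllm.sh; the helper itself is hypothetical.
VLLM_GIT_URL = "https://github.com/vllm-project/vllm.git"
VLLM_REF = "v0.11.0"
CHERRY_PICKS: list[str] = []  # empty for v0.11.0; the script keeps its cherry-pick line commented out

def fetch_vllm_source(workdir: str = "vllm") -> None:
    """Clone vLLM, check out the pinned ref, and apply any queued cherry-picks."""
    subprocess.run(["git", "clone", VLLM_GIT_URL, workdir], check=True)
    subprocess.run(["git", "checkout", VLLM_REF], cwd=workdir, check=True)
    for commit in CHERRY_PICKS:
        # The shell script sets GIT_COMMITTER_NAME/EMAIL when it cherry-picks.
        subprocess.run(["git", "cherry-pick", commit], cwd=workdir, check=True)
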
examples/multimodal/utils/protocol.py

Lines changed: 2 additions & 1 deletion
@@ -22,10 +22,11 @@
 from pydantic_core import core_schema
 from typing_extensions import NotRequired
 from vllm.inputs.data import TokensPrompt
+from vllm.logprobs import PromptLogprobs
 from vllm.multimodal.inputs import MultiModalUUIDDict  # noqa: F401
 from vllm.outputs import CompletionOutput
 from vllm.sampling_params import SamplingParams
-from vllm.sequence import PromptLogprobs, RequestMetrics
+from vllm.sequence import RequestMetrics
 
 import dynamo.nixl_connect as connect

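This import move tracks vLLM 0.11.0, which relocates PromptLogprobs out of vllm.sequence into vllm.logprobs. For code that must import against both 0.10.x and 0.11.0, a hedged compatibility shim (not part of this commit) could fall back between the two paths:

# Hypothetical compatibility shim; it assumes only the two import paths shown in the diff.
try:
    from vllm.logprobs import PromptLogprobs  # vLLM >= 0.11.0
except ImportError:
    from vllm.sequence import PromptLogprobs  # older vLLM releases

from vllm.sequence import RequestMetrics  # unchanged location in both versions
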
lib/bindings/python/src/dynamo/llm/vllm_integration/connector_leader.py

Lines changed: 3 additions & 1 deletion
@@ -192,7 +192,9 @@ def _create_slot(self, request: Request) -> None:
         if self._connector.has_slot(request.request_id):
             return None
 
-        if bool(request.mm_positions):
+        if bool(getattr(request, "mm_features", None)) or bool(
+            getattr(request, "mm_positions", None)
+        ):
             raise ValueError("Unsupported request - requires mm extra keys")
 
         all_token_ids = request.all_token_ids

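The widened check keeps the connector working across vLLM versions: newer Request objects expose mm_features, older ones expose mm_positions, and either signals multimodal input that the slot path does not support. A hedged refactor of the same probe into a reusable helper (hypothetical, not in this commit) could look like:

def request_has_multimodal_inputs(request) -> bool:
    """Return True if a vLLM Request carries multimodal data.

    Hypothetical helper mirroring the check in _create_slot: it probes the
    newer `mm_features` attribute and falls back to the older `mm_positions`.
    """
    return bool(getattr(request, "mm_features", None)) or bool(
        getattr(request, "mm_positions", None)
    )
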
tests/conftest.py

Lines changed: 15 additions & 0 deletions
@@ -40,6 +40,21 @@ def pytest_configure(config):
     )
 
 
+@pytest.fixture()
+def set_ucx_tls_no_mm():
+    """Set UCX env defaults for all tests."""
+    mp = pytest.MonkeyPatch()
+    # CI note:
+    # - Affected test: tests/fault_tolerance/cancellation/test_vllm.py::test_request_cancellation_vllm_decode_cancel
+    # - Symptom on L40 CI: UCX/NIXL mm transport assertion during worker init
+    #   (uct_mem.c:482: mem.memh != UCT_MEM_HANDLE_NULL) when two workers
+    #   start on the same node (maybe a shared-memory segment collision/limits).
+    # - Mitigation: disable UCX "mm" shared-memory transport globally for tests
+    mp.setenv("UCX_TLS", "^mm")
+    yield
+    mp.undo()
+
+
 def download_models(model_list=None, ignore_weights=False):
     """Download models - can be called directly or via fixture

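Tests opt into the workaround by requesting the fixture by name; pytest.MonkeyPatch.undo() restores the prior environment on teardown. A minimal, hypothetical usage sketch:

import os

def test_workers_avoid_mm_transport(set_ucx_tls_no_mm):
    # While the fixture is active, UCX_TLS="^mm" tells UCX to use every
    # transport except the "mm" shared-memory one.
    assert os.environ["UCX_TLS"] == "^mm"
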
tests/fault_tolerance/cancellation/test_vllm.py

Lines changed: 2 additions & 2 deletions
@@ -193,7 +193,7 @@ def test_request_cancellation_vllm_aggregated(
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
 def test_request_cancellation_vllm_decode_cancel(
-    request, runtime_services, predownload_models
+    request, runtime_services, predownload_models, set_ucx_tls_no_mm
 ):
     """
     End-to-end test for request cancellation during decode phase.
@@ -266,7 +266,7 @@ def test_request_cancellation_vllm_decode_cancel(
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
 def test_request_cancellation_vllm_remote_prefill_cancel(
-    request, runtime_services, predownload_models
+    request, runtime_services, predownload_models, set_ucx_tls_no_mm
 ):
     """
     End-to-end test for request cancellation during remote prefill phase.

tests/fault_tolerance/test_request_migration.py

Lines changed: 3 additions & 1 deletion
@@ -290,7 +290,9 @@ def verify_migration_occurred(frontend_process: DynamoFrontendProcess) -> None:
 @pytest.mark.gpu_1
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
-def test_request_migration_vllm(request, runtime_services, predownload_models):
+def test_request_migration_vllm(
+    request, runtime_services, predownload_models, set_ucx_tls_no_mm
+):
     """
     End-to-end test for worker fault tolerance with migration support.

tests/frontend/reasoning_effort/test_reasoning_effort.py

Lines changed: 3 additions & 1 deletion
@@ -58,6 +58,8 @@ def __init__(self, request, worker_id: str = "reasoning-worker"):
             "dynamo.vllm",
             "--model",
             REASONING_TEST_MODEL,
+            "--connector",
+            "none",  # skip nixl registration, noticing long startup times in CI. Potentially a bug...
             "--enforce-eager",
             "--dyn-tool-call-parser",
             "harmony",
@@ -85,7 +87,7 @@ def __init__(self, request, worker_id: str = "reasoning-worker"):
                 ("http://localhost:8000/v1/models", check_models_api),
                 ("http://localhost:8083/health", self.is_ready),
             ],
-            timeout=300,
+            timeout=500,
             display_output=True,
             terminate_existing=False,
             stragglers=["VLLM::EngineCore"],

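The --connector none flag skips NIXL registration for this test (the inline comment notes the long CI startup times that motivated it), and the readiness timeout grows from 300 s to 500 s to absorb slower worker startups. As a hedged illustration of the readiness pattern only, not the project's actual helper, a poll loop with a configurable deadline looks roughly like this:

import time
import urllib.request

def wait_until_ready(url: str, timeout: float = 500.0, interval: float = 2.0) -> None:
    """Poll a health endpoint until it returns HTTP 200 or the deadline passes."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                if resp.status == 200:
                    return
        except OSError:
            pass
        time.sleep(interval)
    raise TimeoutError(f"{url} did not become ready within {timeout}s")

For example, wait_until_ready("http://localhost:8083/health") would mirror one of the URLs the test already polls.
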
tests/kvbm/test_determinism_agg.py

Lines changed: 1 addition & 1 deletion
@@ -111,7 +111,7 @@ def _set_up_vllm_config(self, gpu_cache_blocks):
             "--kv-transfer-config",
             '{"kv_connector":"DynamoConnector","kv_role":"kv_both", "kv_connector_module_path": "dynamo.llm.vllm_integration.connector"}',
             os.environ.get("KVBM_MODEL_ID", "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"),
-            "--max-seq-len",
+            "--max-model-len",
             "8000",  # required to fit on L4 GPU when using 8b model
         ]

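vLLM's engine flag for maximum context length is --max-model-len, so the test command line is corrected away from the stale --max-seq-len spelling. A hedged sketch of assembling those worker arguments with the corrected flag (the build_vllm_worker_args helper is hypothetical; the model id and length come from the diff above):

import os

def build_vllm_worker_args(max_model_len: int = 8000) -> list[str]:
    """Hypothetical helper mirroring _set_up_vllm_config's argument list."""
    return [
        os.environ.get("KVBM_MODEL_ID", "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"),
        "--max-model-len",  # vLLM's flag for maximum context length
        str(max_model_len),  # 8000 keeps the 8B model within an L4 GPU's memory
    ]
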
tests/kvbm/test_determinism_disagg.py

Lines changed: 2 additions & 2 deletions
@@ -132,7 +132,7 @@ def _set_up_vllm_config(self, gpu_cache_blocks):
             os.environ.get("KVBM_MODEL_ID", "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"),
             "--block-size",
             "16",
-            "--max-seq-len",
+            "--max-model-len",
             "8000",  # required to fit on L4 GPU when using 8b model
             "--connector",
             "nixl",
@@ -148,7 +148,7 @@ def _set_up_vllm_config(self, gpu_cache_blocks):
             "--is-prefill-worker",
             "--block-size",
             "16",
-            "--max-seq-len",
+            "--max-model-len",
             "8000",  # required to fit on L4 GPU when using 8b model
             "--connector",
             "kvbm",
