
Commit 90dc758

chore: bump vllm to 0.11.0 (#3422)
Signed-off-by: alec-flowers <[email protected]>
1 parent 60975b5 commit 90dc758

File tree

10 files changed (+38, -18 lines)


container/Dockerfile.vllm

Lines changed: 2 additions & 2 deletions
@@ -15,9 +15,9 @@ ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
 ARG CUDA_VERSION="12.8"
 
 # Make sure to update the dependency version in pyproject.toml when updating this
-ARG VLLM_REF="v0.10.2"
+ARG VLLM_REF="v0.11.0"
 # FlashInfer only respected when building vLLM from source, ie when VLLM_REF does not start with 'v' or for arm64 builds
-ARG FLASHINF_REF="v0.3.0"
+ARG FLASHINF_REF="v0.3.1"
 ARG TORCH_BACKEND="cu128"
 
 # If left blank, then we will fallback to vLLM defaults

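The two ARGs above are the build's single source of truth for the engine and FlashInfer versions (the matching dependency pin lives in pyproject.toml, per the comment in the Dockerfile). As a hedged illustration only, not part of this commit, a small Python smoke check could confirm inside the built image that the installed vLLM wheel matches the pinned ref; the expected version string below is an assumption copied from VLLM_REF.

import importlib.metadata

# Assumed pin, mirrored from `ARG VLLM_REF="v0.11.0"` above (hypothetical check).
EXPECTED_VLLM = "0.11.0"

def check_vllm_version(expected: str = EXPECTED_VLLM) -> None:
    """Raise if the image's installed vLLM does not match the Dockerfile pin."""
    installed = importlib.metadata.version("vllm")
    if not installed.startswith(expected):
        raise RuntimeError(
            f"vLLM version mismatch: image has {installed}, Dockerfile pins {expected}"
        )

if __name__ == "__main__":
    check_vllm_version()
    print("vLLM version matches the pinned ref")
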
container/deps/vllm/install_vllm.sh

Lines changed: 5 additions & 7 deletions
@@ -13,7 +13,7 @@
 
 set -euo pipefail
 
-VLLM_REF="v0.10.2"
+VLLM_REF="v0.11.0"
 
 # Basic Configurations
 ARCH=$(uname -m)
@@ -29,7 +29,7 @@ CUDA_VERSION="12.8" # For DEEPGEMM
 # These flags are applicable when installing vLLM from source code
 EDITABLE=true
 VLLM_GIT_URL="https://github.com/vllm-project/vllm.git"
-FLASHINF_REF="v0.3.0"
+FLASHINF_REF="v0.3.1"
 
 while [[ $# -gt 0 ]]; do
     case $1 in
@@ -131,10 +131,8 @@ git clone $VLLM_GIT_URL vllm
 cd vllm
 git checkout $VLLM_REF
 
-# TODO remove in future vLLM release, re-instate ignore torch script
-# https://github.com/vllm-project/vllm/pull/24729
-GIT_COMMITTER_NAME="Container Build" GIT_COMMITTER_EMAIL="[email protected]" git cherry-pick 740f064
-
+# TODO leave this here in case we need to do cherry-picks in future
+# GIT_COMMITTER_NAME="Container Build" GIT_COMMITTER_EMAIL="[email protected]" git cherry-pick 740f064
 
 echo "\n=== Installing vLLM & FlashInfer ==="
 
@@ -243,4 +241,4 @@ echo "\n=== Installing EP Kernels (PPLX and DeepEP) ==="
 cd ep_kernels/
 TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh
 
-echo "\n✅ All installations completed successfully!"
+echo "\n✅ All installations completed successfully!"

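For readers following the script, the source-build step it performs is: clone VLLM_GIT_URL, check out VLLM_REF, then optionally cherry-pick fixes (currently none, hence the commented-out line). A hedged Python sketch of that flow is shown below; the values are taken from the script, but the fetch_vllm_source helper itself is hypothetical and not part of this commit.

import subprocess

# Values mirrored from install_vllm.sh; the helper itself is hypothetical.
VLLM_GIT_URL = "https://github.com/vllm-project/vllm.git"
VLLM_REF = "v0.11.0"
CHERRY_PICKS: list[str] = []  # empty for v0.11.0; the script keeps its cherry-pick line commented out

def fetch_vllm_source(workdir: str = "vllm") -> None:
    """Clone vLLM, check out the pinned ref, and apply any queued cherry-picks."""
    subprocess.run(["git", "clone", VLLM_GIT_URL, workdir], check=True)
    subprocess.run(["git", "checkout", VLLM_REF], cwd=workdir, check=True)
    for commit in CHERRY_PICKS:
        # The shell script sets GIT_COMMITTER_NAME/EMAIL when it cherry-picks.
        subprocess.run(["git", "cherry-pick", commit], cwd=workdir, check=True)
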
examples/multimodal/utils/protocol.py

Lines changed: 2 additions & 1 deletion
@@ -22,10 +22,11 @@
 from pydantic_core import core_schema
 from typing_extensions import NotRequired
 from vllm.inputs.data import TokensPrompt
+from vllm.logprobs import PromptLogprobs
 from vllm.multimodal.inputs import MultiModalUUIDDict  # noqa: F401
 from vllm.outputs import CompletionOutput
 from vllm.sampling_params import SamplingParams
-from vllm.sequence import PromptLogprobs, RequestMetrics
+from vllm.sequence import RequestMetrics
 
 import dynamo.nixl_connect as connect

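This import move tracks vLLM 0.11.0, which relocates PromptLogprobs out of vllm.sequence into vllm.logprobs. For code that must import against both 0.10.x and 0.11.0, a hedged compatibility shim (not part of this commit) could fall back between the two paths:

# Hypothetical compatibility shim; it assumes only the two import paths shown in the diff.
try:
    from vllm.logprobs import PromptLogprobs  # vLLM >= 0.11.0
except ImportError:
    from vllm.sequence import PromptLogprobs  # older vLLM releases

from vllm.sequence import RequestMetrics  # unchanged location in both versions
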
lib/bindings/python/src/dynamo/llm/vllm_integration/connector_leader.py

Lines changed: 3 additions & 1 deletion
@@ -192,7 +192,9 @@ def _create_slot(self, request: Request) -> None:
         if self._connector.has_slot(request.request_id):
             return None
 
-        if bool(request.mm_positions):
+        if bool(getattr(request, "mm_features", None)) or bool(
+            getattr(request, "mm_positions", None)
+        ):
             raise ValueError("Unsupported request - requires mm extra keys")
 
         all_token_ids = request.all_token_ids

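The widened check keeps the connector working across vLLM versions: newer Request objects expose mm_features, older ones expose mm_positions, and either signals multimodal input that the slot path does not support. A hedged refactor of the same probe into a reusable helper (hypothetical, not in this commit) could look like:

def request_has_multimodal_inputs(request) -> bool:
    """Return True if a vLLM Request carries multimodal data.

    Hypothetical helper mirroring the check in _create_slot: it probes the
    newer `mm_features` attribute and falls back to the older `mm_positions`.
    """
    return bool(getattr(request, "mm_features", None)) or bool(
        getattr(request, "mm_positions", None)
    )
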
tests/conftest.py

Lines changed: 15 additions & 0 deletions
@@ -40,6 +40,21 @@ def pytest_configure(config):
     )
 
 
+@pytest.fixture()
+def set_ucx_tls_no_mm():
+    """Set UCX env defaults for all tests."""
+    mp = pytest.MonkeyPatch()
+    # CI note:
+    # - Affected test: tests/fault_tolerance/cancellation/test_vllm.py::test_request_cancellation_vllm_decode_cancel
+    # - Symptom on L40 CI: UCX/NIXL mm transport assertion during worker init
+    #   (uct_mem.c:482: mem.memh != UCT_MEM_HANDLE_NULL) when two workers
+    #   start on the same node (maybe a shared-memory segment collision/limits).
+    # - Mitigation: disable UCX "mm" shared-memory transport globally for tests
+    mp.setenv("UCX_TLS", "^mm")
+    yield
+    mp.undo()
+
+
 def download_models(model_list=None, ignore_weights=False):
     """Download models - can be called directly or via fixture

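Tests opt into the workaround by requesting the fixture by name; pytest.MonkeyPatch.undo() restores the prior environment on teardown. A minimal, hypothetical usage sketch:

import os

def test_workers_avoid_mm_transport(set_ucx_tls_no_mm):
    # While the fixture is active, UCX_TLS="^mm" tells UCX to use every
    # transport except the "mm" shared-memory one.
    assert os.environ["UCX_TLS"] == "^mm"
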
tests/fault_tolerance/cancellation/test_vllm.py

Lines changed: 2 additions & 2 deletions
@@ -193,7 +193,7 @@ def test_request_cancellation_vllm_aggregated(
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
 def test_request_cancellation_vllm_decode_cancel(
-    request, runtime_services, predownload_models
+    request, runtime_services, predownload_models, set_ucx_tls_no_mm
 ):
     """
     End-to-end test for request cancellation during decode phase.
@@ -266,7 +266,7 @@ def test_request_cancellation_vllm_decode_cancel(
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
 def test_request_cancellation_vllm_remote_prefill_cancel(
-    request, runtime_services, predownload_models
+    request, runtime_services, predownload_models, set_ucx_tls_no_mm
 ):
     """
     End-to-end test for request cancellation during remote prefill phase.

tests/fault_tolerance/test_request_migration.py

Lines changed: 3 additions & 1 deletion
@@ -290,7 +290,9 @@ def verify_migration_occurred(frontend_process: DynamoFrontendProcess) -> None:
 @pytest.mark.gpu_1
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
-def test_request_migration_vllm(request, runtime_services, predownload_models):
+def test_request_migration_vllm(
+    request, runtime_services, predownload_models, set_ucx_tls_no_mm
+):
     """
     End-to-end test for worker fault tolerance with migration support.

tests/frontend/reasoning_effort/test_reasoning_effort.py

Lines changed: 3 additions & 1 deletion
@@ -58,6 +58,8 @@ def __init__(self, request, worker_id: str = "reasoning-worker"):
             "dynamo.vllm",
             "--model",
             REASONING_TEST_MODEL,
+            "--connector",
+            "none",  # skip nixl registration, noticing long startup times in CI. Potentially a bug...
             "--enforce-eager",
             "--dyn-tool-call-parser",
             "harmony",
@@ -85,7 +87,7 @@ def __init__(self, request, worker_id: str = "reasoning-worker"):
                 ("http://localhost:8000/v1/models", check_models_api),
                 ("http://localhost:8083/health", self.is_ready),
             ],
-            timeout=300,
+            timeout=500,
             display_output=True,
             terminate_existing=False,
             stragglers=["VLLM::EngineCore"],

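The --connector none flag skips NIXL registration for this test (the inline comment notes the long CI startup times that motivated it), and the readiness timeout grows from 300 s to 500 s to absorb slower worker startups. As a hedged illustration of the readiness pattern only, not the project's actual helper, a poll loop with a configurable deadline looks roughly like this:

import time
import urllib.request

def wait_until_ready(url: str, timeout: float = 500.0, interval: float = 2.0) -> None:
    """Poll a health endpoint until it returns HTTP 200 or the deadline passes."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                if resp.status == 200:
                    return
        except OSError:
            pass
        time.sleep(interval)
    raise TimeoutError(f"{url} did not become ready within {timeout}s")

For example, wait_until_ready("http://localhost:8083/health") would mirror one of the URLs the test already polls.
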
tests/kvbm/test_determinism_agg.py

Lines changed: 1 addition & 1 deletion
@@ -111,7 +111,7 @@ def _set_up_vllm_config(self, gpu_cache_blocks):
             "--kv-transfer-config",
             '{"kv_connector":"DynamoConnector","kv_role":"kv_both", "kv_connector_module_path": "dynamo.llm.vllm_integration.connector"}',
             os.environ.get("KVBM_MODEL_ID", "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"),
-            "--max-seq-len",
+            "--max-model-len",
             "8000",  # required to fit on L4 GPU when using 8b model
         ]

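vLLM's engine flag for maximum context length is --max-model-len, so the test command line is corrected away from the stale --max-seq-len spelling. A hedged sketch of assembling those worker arguments with the corrected flag (the build_vllm_worker_args helper is hypothetical; the model id and length come from the diff above):

import os

def build_vllm_worker_args(max_model_len: int = 8000) -> list[str]:
    """Hypothetical helper mirroring _set_up_vllm_config's argument list."""
    return [
        os.environ.get("KVBM_MODEL_ID", "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"),
        "--max-model-len",  # vLLM's flag for maximum context length
        str(max_model_len),  # 8000 keeps the 8B model within an L4 GPU's memory
    ]
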
tests/kvbm/test_determinism_disagg.py

Lines changed: 2 additions & 2 deletions
@@ -132,7 +132,7 @@ def _set_up_vllm_config(self, gpu_cache_blocks):
             os.environ.get("KVBM_MODEL_ID", "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"),
             "--block-size",
             "16",
-            "--max-seq-len",
+            "--max-model-len",
             "8000",  # required to fit on L4 GPU when using 8b model
             "--connector",
             "nixl",
@@ -148,7 +148,7 @@ def _set_up_vllm_config(self, gpu_cache_blocks):
             "--is-prefill-worker",
             "--block-size",
             "16",
-            "--max-seq-len",
+            "--max-model-len",
             "8000",  # required to fit on L4 GPU when using 8b model
             "--connector",
             "kvbm",
