diff --git a/container/Dockerfile.vllm b/container/Dockerfile.vllm
index 030cc65fe4..ff16461692 100644
--- a/container/Dockerfile.vllm
+++ b/container/Dockerfile.vllm
@@ -15,9 +15,9 @@ ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
 ARG CUDA_VERSION="12.8"
 
 # Make sure to update the dependency version in pyproject.toml when updating this
-ARG VLLM_REF="v0.10.2"
+ARG VLLM_REF="v0.11.0"
 # FlashInfer only respected when building vLLM from source, ie when VLLM_REF does not start with 'v' or for arm64 builds
-ARG FLASHINF_REF="v0.3.0"
+ARG FLASHINF_REF="v0.3.1"
 ARG TORCH_BACKEND="cu128"
 
 # If left blank, then we will fallback to vLLM defaults
diff --git a/container/deps/vllm/install_vllm.sh b/container/deps/vllm/install_vllm.sh
index dff85a94a3..1959811949 100755
--- a/container/deps/vllm/install_vllm.sh
+++ b/container/deps/vllm/install_vllm.sh
@@ -13,7 +13,7 @@
 
 set -euo pipefail
 
-VLLM_REF="v0.10.2"
+VLLM_REF="v0.11.0"
 
 # Basic Configurations
 ARCH=$(uname -m)
@@ -29,7 +29,7 @@ CUDA_VERSION="12.8" # For DEEPGEMM
 # These flags are applicable when installing vLLM from source code
 EDITABLE=true
 VLLM_GIT_URL="https://github.com/vllm-project/vllm.git"
-FLASHINF_REF="v0.3.0"
+FLASHINF_REF="v0.3.1"
 
 while [[ $# -gt 0 ]]; do
     case $1 in
@@ -131,10 +131,8 @@ git clone $VLLM_GIT_URL vllm
 cd vllm
 git checkout $VLLM_REF
 
-# TODO remove in future vLLM release, re-instate ignore torch script
-# https://github.com/vllm-project/vllm/pull/24729
-GIT_COMMITTER_NAME="Container Build" GIT_COMMITTER_EMAIL="container@buildkitsandbox.local" git cherry-pick 740f064
-
+# TODO leave this here in case we need to do cherry-picks in future
+# GIT_COMMITTER_NAME="Container Build" GIT_COMMITTER_EMAIL="container@buildkitsandbox.local" git cherry-pick 740f064
 
 echo "\n=== Installing vLLM & FlashInfer ==="
 
@@ -243,4 +241,4 @@ echo "\n=== Installing EP Kernels (PPLX and DeepEP) ==="
 cd ep_kernels/
 TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh
 
-echo "\n✅ All installations completed successfully!"
\ No newline at end of file
+echo "\n✅ All installations completed successfully!"
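
Note: the Dockerfile comment above asks that the vLLM pin in pyproject.toml be kept in sync with VLLM_REF whenever it is bumped. Below is a hedged, illustrative sketch (Python 3.11+, not repo code) of a consistency check one could run in CI; the file paths and the "vllm==X.Y.Z" pin format are assumptions, not taken from this repository.

import re
import tomllib
from pathlib import Path


def dockerfile_vllm_ref(path: str = "container/Dockerfile.vllm") -> str:
    """Extract the version from ARG VLLM_REF="vX.Y.Z", dropping the leading 'v'."""
    match = re.search(r'^ARG VLLM_REF="v?([^"]+)"', Path(path).read_text(), re.MULTILINE)
    if match is None:
        raise ValueError(f"VLLM_REF not found in {path}")
    return match.group(1)


def pyproject_vllm_pin(path: str = "pyproject.toml") -> str | None:
    """Return the version from a vllm==X.Y.Z style dependency pin, if one exists."""
    data = tomllib.loads(Path(path).read_text())
    for dep in data.get("project", {}).get("dependencies", []):
        if dep.replace(" ", "").startswith("vllm=="):
            return dep.split("==", 1)[1].strip()
    return None


if __name__ == "__main__":
    ref, pin = dockerfile_vllm_ref(), pyproject_vllm_pin()
    print(f"Dockerfile VLLM_REF: {ref}, pyproject pin: {pin}")
    if pin is not None and pin != ref:
        raise SystemExit("Dockerfile and pyproject.toml disagree on the vLLM version")
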
diff --git a/examples/multimodal/utils/protocol.py b/examples/multimodal/utils/protocol.py
index a2caee1efc..aeb682f023 100644
--- a/examples/multimodal/utils/protocol.py
+++ b/examples/multimodal/utils/protocol.py
@@ -22,10 +22,11 @@
 from pydantic_core import core_schema
 from typing_extensions import NotRequired
 from vllm.inputs.data import TokensPrompt
+from vllm.logprobs import PromptLogprobs
 from vllm.multimodal.inputs import MultiModalUUIDDict  # noqa: F401
 from vllm.outputs import CompletionOutput
 from vllm.sampling_params import SamplingParams
-from vllm.sequence import PromptLogprobs, RequestMetrics
+from vllm.sequence import RequestMetrics
 
 import dynamo.nixl_connect as connect
 
diff --git a/lib/bindings/python/src/dynamo/llm/vllm_integration/connector_leader.py b/lib/bindings/python/src/dynamo/llm/vllm_integration/connector_leader.py
index 8c69909c48..e081203088 100644
--- a/lib/bindings/python/src/dynamo/llm/vllm_integration/connector_leader.py
+++ b/lib/bindings/python/src/dynamo/llm/vllm_integration/connector_leader.py
@@ -192,7 +192,9 @@ def _create_slot(self, request: Request) -> None:
         if self._connector.has_slot(request.request_id):
             return None
 
-        if bool(request.mm_positions):
+        if bool(getattr(request, "mm_features", None)) or bool(
+            getattr(request, "mm_positions", None)
+        ):
             raise ValueError("Unsupported request - requires mm extra keys")
 
         all_token_ids = request.all_token_ids
diff --git a/tests/conftest.py b/tests/conftest.py
index 23e5b630b7..a22c33d5c3 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -40,6 +40,21 @@ def pytest_configure(config):
     )
 
 
+@pytest.fixture()
+def set_ucx_tls_no_mm():
+    """Set UCX env defaults for all tests."""
+    mp = pytest.MonkeyPatch()
+    # CI note:
+    # - Affected test: tests/fault_tolerance/cancellation/test_vllm.py::test_request_cancellation_vllm_decode_cancel
+    # - Symptom on L40 CI: UCX/NIXL mm transport assertion during worker init
+    #   (uct_mem.c:482: mem.memh != UCT_MEM_HANDLE_NULL) when two workers
+    #   start on the same node (maybe a shared-memory segment collision/limits).
+    # - Mitigation: disable UCX "mm" shared-memory transport globally for tests
+    mp.setenv("UCX_TLS", "^mm")
+    yield
+    mp.undo()
+
+
 def download_models(model_list=None, ignore_weights=False):
     """Download models - can be called directly or via fixture
 
diff --git a/tests/fault_tolerance/cancellation/test_vllm.py b/tests/fault_tolerance/cancellation/test_vllm.py
index c8841fe020..374c26ec04 100644
--- a/tests/fault_tolerance/cancellation/test_vllm.py
+++ b/tests/fault_tolerance/cancellation/test_vllm.py
@@ -193,7 +193,7 @@
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
 def test_request_cancellation_vllm_decode_cancel(
-    request, runtime_services, predownload_models
+    request, runtime_services, predownload_models, set_ucx_tls_no_mm
 ):
     """
     End-to-end test for request cancellation during decode phase.
@@ -266,7 +266,7 @@
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
 def test_request_cancellation_vllm_remote_prefill_cancel(
-    request, runtime_services, predownload_models
+    request, runtime_services, predownload_models, set_ucx_tls_no_mm
 ):
     """
     End-to-end test for request cancellation during remote prefill phase.
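
Note: the connector_leader.py hunk above makes the multimodal check tolerant of both vLLM Request shapes by probing `mm_features` and falling back to `mm_positions` via getattr. Below is a hypothetical standalone sketch of that same pattern; the helper name and the SimpleNamespace stand-ins are illustrative, not part of the repo.

from types import SimpleNamespace


def has_multimodal_inputs(request) -> bool:
    """Return True if the request carries multimodal inputs under either attribute name."""
    return bool(getattr(request, "mm_features", None)) or bool(
        getattr(request, "mm_positions", None)
    )


# Text-only requests pass; multimodal requests would be rejected by the caller.
assert not has_multimodal_inputs(SimpleNamespace())
assert has_multimodal_inputs(SimpleNamespace(mm_features=[object()]))
assert has_multimodal_inputs(SimpleNamespace(mm_positions=[(0, 4)]))
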
diff --git a/tests/fault_tolerance/test_request_migration.py b/tests/fault_tolerance/test_request_migration.py
index 970de159de..54b41466b5 100644
--- a/tests/fault_tolerance/test_request_migration.py
+++ b/tests/fault_tolerance/test_request_migration.py
@@ -290,7 +290,9 @@ def verify_migration_occurred(frontend_process: DynamoFrontendProcess) -> None:
 @pytest.mark.gpu_1
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
-def test_request_migration_vllm(request, runtime_services, predownload_models):
+def test_request_migration_vllm(
+    request, runtime_services, predownload_models, set_ucx_tls_no_mm
+):
     """
     End-to-end test for worker fault tolerance with migration support.
 
diff --git a/tests/frontend/reasoning_effort/test_reasoning_effort.py b/tests/frontend/reasoning_effort/test_reasoning_effort.py
index 4402fcc7a4..0c53760b7e 100644
--- a/tests/frontend/reasoning_effort/test_reasoning_effort.py
+++ b/tests/frontend/reasoning_effort/test_reasoning_effort.py
@@ -58,6 +58,8 @@ def __init__(self, request, worker_id: str = "reasoning-worker"):
             "dynamo.vllm",
             "--model",
             REASONING_TEST_MODEL,
+            "--connector",
+            "none",  # skip nixl registration, noticing long startup times in CI. Potentially a bug...
             "--enforce-eager",
             "--dyn-tool-call-parser",
             "harmony",
@@ -85,7 +87,7 @@ def __init__(self, request, worker_id: str = "reasoning-worker"):
                 ("http://localhost:8000/v1/models", check_models_api),
                 ("http://localhost:8083/health", self.is_ready),
             ],
-            timeout=300,
+            timeout=500,
             display_output=True,
             terminate_existing=False,
             stragglers=["VLLM::EngineCore"],
diff --git a/tests/kvbm/test_determinism_agg.py b/tests/kvbm/test_determinism_agg.py
index b8da0ff6e0..fa8b6ba364 100755
--- a/tests/kvbm/test_determinism_agg.py
+++ b/tests/kvbm/test_determinism_agg.py
@@ -111,7 +111,7 @@ def _set_up_vllm_config(self, gpu_cache_blocks):
             "--kv-transfer-config",
             '{"kv_connector":"DynamoConnector","kv_role":"kv_both", "kv_connector_module_path": "dynamo.llm.vllm_integration.connector"}',
             os.environ.get("KVBM_MODEL_ID", "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"),
-            "--max-seq-len",
+            "--max-model-len",
             "8000",  # required to fit on L4 GPU when using 8b model
         ]
 
diff --git a/tests/kvbm/test_determinism_disagg.py b/tests/kvbm/test_determinism_disagg.py
index 8c0f4523d0..6a9f9205e2 100755
--- a/tests/kvbm/test_determinism_disagg.py
+++ b/tests/kvbm/test_determinism_disagg.py
@@ -132,7 +132,7 @@ def _set_up_vllm_config(self, gpu_cache_blocks):
             os.environ.get("KVBM_MODEL_ID", "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"),
             "--block-size",
             "16",
-            "--max-seq-len",
+            "--max-model-len",
             "8000",  # required to fit on L4 GPU when using 8b model
             "--connector",
             "nixl",
@@ -148,7 +148,7 @@
             "--is-prefill-worker",
             "--block-size",
             "16",
-            "--max-seq-len",
+            "--max-model-len",
             "8000",  # required to fit on L4 GPU when using 8b model
             "--connector",
             "kvbm",
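
Note: the fault-tolerance tests above opt into the UCX mitigation simply by naming set_ucx_tls_no_mm in their signatures. A minimal sketch (not repo code) of that MonkeyPatch-based opt-in pattern is shown below; the test name is illustrative.

import os

import pytest


@pytest.fixture()
def set_ucx_tls_no_mm():
    mp = pytest.MonkeyPatch()
    mp.setenv("UCX_TLS", "^mm")  # "^mm" excludes UCX's shared-memory ("mm") transport
    yield
    mp.undo()  # restore the previous environment once the test finishes


def test_ucx_tls_is_scoped(set_ucx_tls_no_mm):
    # Worker subprocesses spawned while the fixture is active inherit this value.
    assert os.environ["UCX_TLS"] == "^mm"
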