Support ray/dynamo nightly + vLLM 0.22 (cu129) across all extras

praateekmahajan · praateekmahajan · commit aaa42ad81fd7 · 2026-06-10T22:04:05.000Z
Enable the Ray 3.0 nightly + ai-dynamo nightly + vLLM 0.22 inference stack
on the CUDA-12.9 image while keeping the full Curator dependency set
(`uv sync --all-extras --all-groups`) resolvable and buildable.

pyproject.toml:
- ray: track the 3.0.0.dev0 nightly wheel (rolling /latest/ URL)
- ai-dynamo and ai-dynamo-runtime >=1.3.0.dev0, both first-party so
  prerelease="if-necessary-or-explicit" enables the newest nightly without
  blanket prereleases (runtime is a transitive dependency with stable
  releases, so it needs an explicit marker or uv backtracks to an older
  dynamo dev)
- vLLM 0.22.0+cu129 via a dedicated cu129 wheel index + tool.uv.sources
  (default vLLM is now cu130; keep torch/vllm on CUDA 12.9)
- drop nixl-cu13: ray[llm]/nixl hard-pin the CUDA-13 NIXL backend, whose
  eager `import nixl_ep` dlopens the absent libcudart.so.13 on cu12.9; keep
  the nixl meta + nixl-cu12 backend
- opencv-python -> opencv-python-headless (no libGL/GPL GUI/FFmpeg bundling;
  matches vllm/mistral_common/albumentations)
- bump torch/torchvision/torchaudio/torchcodec to the 2.11 cu129 line

dynamo actor venv runtime_env (vllm.py): Ray builds it via a bare
`uv pip install ai-dynamo[vllm]` that ignores pyproject, so force cu129 the
way uv/vLLM document: --torch-backend cu129, unsafe-best-match (needed for
nixl's split index resolution), and a per-version cu129 vllm index derived
from ai-dynamo's own pin; the --override file pins ray== and drops nixl-cu13.

Signed-off-by: Praateek <praateekm@gmail.com>
diff --git a/nemo_curator/core/serve/dynamo/vllm.py b/nemo_curator/core/serve/dynamo/vllm.py
@@ -16,6 +16,7 @@
 
 from __future__ import annotations
 
+import importlib.metadata
 import json
 import tempfile
 from functools import reduce
@@ -24,6 +25,7 @@
 
 import ray
 from loguru import logger
+from packaging.requirements import Requirement
 
 from nemo_curator.core.serve.base import BaseModelConfig
 from nemo_curator.core.serve.dynamo.infra import (
@@ -50,19 +52,80 @@
     from nemo_curator.core.serve.placement import ReplicaBundleSpec
 
 
-# ai-dynamo[vllm]'s [vllm] extra carries a hard ray pin, but Ray refuses
-# actor venvs whose ray version differs from the cluster head's. uv has no
-# inline override syntax — only ``--override <file>`` — so we materialize a
-# tiny constraints file at a fixed path on every node via
-# ``ensure_actor_overrides_on_all_nodes``; the content is derived from the
-# driver's ``ray.__version__`` at fan-out time so a future Curator ray bump
-# doesn't need a code change here.
+# The actor venv ``uv pip install`` needs overrides that pyproject's ``[tool.uv]``
+# can't reach (Ray runs it in an empty cwd). uv has no inline override syntax —
+# only ``--override <file>`` — so we materialize a constraints file at a fixed path
+# on every node via ``ensure_actor_overrides_on_all_nodes``. It carries:
+#   * ``ray==<driver version>`` — ai-dynamo[vllm]'s [vllm] extra has a hard ray pin,
+#     but Ray refuses actor venvs whose ray differs from the cluster head's. Derived
+#     from the driver's ``ray.__version__`` so a future Curator ray bump needs no edit.
+#   * ``nixl-cu13`` dropped — ai-dynamo[vllm] pulls the CUDA-13 NIXL backend, whose
+#     eagerly-imported ``nixl_ep_cpp.so`` dlopens libcudart.so.13 (absent on this
+#     CUDA-12.9 image). The base image excludes it via pyproject, but that override
+#     doesn't reach this standalone install; re-apply it here so the cu12 backend wins.
 _ACTOR_VENV_OVERRIDES_PATH = Path(tempfile.gettempdir()) / "nemo_curator_dynamo_actor_overrides.txt"
+_ACTOR_VENV_NIXL_CU13_EXCLUSION = "nixl-cu13 ; sys_platform == 'never'"
+# The CUDA build the actor venv must match (torch ecosystem + vllm wheel variant).
+_ACTOR_VENV_CUDA_TAG = "cu129"
+
+
+def _vllm_cu129_index_url() -> str | None:
+    """The vLLM cu129 wheel index for the exact version ai-dynamo[vllm] pins.
+
+    ai-dynamo's [vllm] extra pins an exact vllm (e.g. ``==0.22.1``) that may
+    differ from Curator's base vllm — the base installs ai-dynamo WITHOUT its
+    [vllm] extra, so its vllm comes from Curator's own pin, while the actor
+    venv installs ``ai-dynamo[vllm]`` and must honor ai-dynamo's pin. vLLM
+    publishes a per-version cu129 wheel index at ``wheels.vllm.ai/<v>/cu129``;
+    pointing at the pinned version means its ``+cu129`` local build sorts above
+    the default cu130 wheel under unsafe-best-match. Derived from ai-dynamo's
+    own metadata so a nightly bump (which changes the vllm pin) needs no edit.
+
+    Returns None if ai-dynamo (or its vllm pin) can't be found — only happens
+    when the dynamo backend isn't actually installed, where this is unused.
+    """
+    try:
+        requirements = importlib.metadata.requires("ai-dynamo") or []
+    except importlib.metadata.PackageNotFoundError:
+        return None
+    for raw in requirements:
+        req = Requirement(raw)
+        if req.name != "vllm":
+            continue
+        pinned = next((spec.version for spec in req.specifier if spec.operator in ("==", "===")), None)
+        if pinned:
+            return f"https://wheels.vllm.ai/{pinned}/{_ACTOR_VENV_CUDA_TAG}"
+    return None
+
+
+# Ray builds the actor venv with a bare ``uv pip install`` in an empty cwd, so it
+# inherits none of the project's ``[tool.uv]`` index/source/prerelease config — only
+# what we pass here. Force CUDA 12.9 the way vLLM documents for uv: --torch-backend
+# routes the torch ecosystem to the cu129 index, and the per-version cu129 vllm index
+# (see ``_vllm_cu129_index_url``) keeps vllm on cu129. ``unsafe-best-match`` is REQUIRED
+# so nixl resolves (its version is split across pypi.nvidia.com and PyPI, which the
+# default first-match strategy can't combine).
+_ACTOR_VENV_UV_OPTIONS = [
+    "--override",
+    str(_ACTOR_VENV_OVERRIDES_PATH),
+    "--torch-backend",
+    _ACTOR_VENV_CUDA_TAG,
+    "--index-strategy",
+    "unsafe-best-match",
+    "--prerelease",
+    "if-necessary-or-explicit",
+    *(
+        arg
+        for url in ("https://pypi.nvidia.com", _vllm_cu129_index_url())
+        if url is not None
+        for arg in ("--extra-index-url", url)
+    ),
+]
 
 DYNAMO_VLLM_RUNTIME_ENV: dict[str, Any] = {
     "uv": {
         "packages": ["ai-dynamo[vllm]"],
-        "uv_pip_install_options": ["--override", str(_ACTOR_VENV_OVERRIDES_PATH)],
+        "uv_pip_install_options": _ACTOR_VENV_UV_OPTIONS,
     },
     "config": {"setup_timeout_seconds": 600},
 }
@@ -78,7 +141,8 @@ def ensure_actor_overrides_on_all_nodes(*, ignore_head_node: bool = False) -> No
 
     The file pins ``ray=={ray.__version__}`` (read from the driver) so the
     actor venv keeps the same ray patch as the cluster head — Ray rejects
-    any mismatch.
+    any mismatch — and drops ``nixl-cu13`` so the cu12 NIXL backend is used
+    (see module comment on :data:`_ACTOR_VENV_OVERRIDES_PATH`).
 
     Must run inside an active Ray context, before any worker spawned with
     :data:`DYNAMO_VLLM_RUNTIME_ENV` lands. The runtime_env_agent on each
@@ -91,7 +155,7 @@ def ensure_actor_overrides_on_all_nodes(*, ignore_head_node: bool = False) -> No
     run_on_each_node(
         _write_actor_overrides_file,
         str(_ACTOR_VENV_OVERRIDES_PATH),
-        f"ray=={ray.__version__}\n",
+        f"ray=={ray.__version__}\n{_ACTOR_VENV_NIXL_CU13_EXCLUSION}\n",
         ignore_head_node=ignore_head_node,
     )
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -66,7 +66,7 @@ dependencies = [
     "openai>=1.0.0",
     "pandas>=2.1.0",
     "pyarrow",
-    "ray[default,data]>=2.55.1",
+    "ray[default,data] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp313-cp313-manylinux2014_x86_64.whl ; python_version == '3.13' and platform_machine == 'x86_64' and platform_system != 'Darwin'",
     "torch",
     "transformers",
 ]
@@ -76,14 +76,18 @@ cuda12 = [
     "gpustat",
     "nvidia-ml-py",
 ]
-vllm = ["vllm>=0.14.1; (platform_machine == 'x86_64' and platform_system != 'Darwin')"]
+vllm = ["vllm[flashinfer,runai,otel]==0.22.0+cu129; (platform_machine == 'x86_64' and platform_system != 'Darwin')"]
 
 # Inference Server (Ray Serve + vLLM) - for serving LLMs alongside Curator pipelines
 inference_server = [
     "nemo_curator[cuda12]",
     "nemo_curator[vllm]",
-    "vllm<0.19; (platform_machine == 'x86_64' and platform_system != 'Darwin')", # Ray Serve LLM 2.55.1 isn't compatible with vllm 0.19+
-    "ai-dynamo==1.1.0; (platform_machine == 'x86_64' and platform_system != 'Darwin')", # pin so the Dynamo actor venv resolves to the same release we test against; gated to x86_64 since vllm wheels are x86_64-only
+    "ai-dynamo>=1.3.0.dev0; (platform_machine == 'x86_64' and platform_system != 'Darwin')",
+    # First-party + explicit .dev0 marker so prerelease="if-necessary-or-explicit" enables
+    # nightlies for ai-dynamo-runtime too. ai-dynamo pins it (==<its dev>), but it's a
+    # transitive with stable releases, so without this the newest dynamo nightly can't
+    # resolve (its runtime pin is a disallowed prerelease) and uv falls back to an older dev.
+    "ai-dynamo-runtime>=1.3.0.dev0; (platform_machine == 'x86_64' and platform_system != 'Darwin')",
     "boto3>=1.35", # Get rid once https://github.com/ray-project/ray/issues/61269 is fixed
     "nixl-cu12>=0.10.0; (platform_machine == 'x86_64' and platform_system != 'Darwin')",
     "ray[serve,llm]>=2.55.1",
@@ -216,7 +220,7 @@ text_cuda12 = [
 # Video Curation Dependencies
 video_cpu = [
     "av==13.1.0",
-    "opencv-python",
+    "opencv-python-headless",  # headless: no GUI/FFmpeg (GPL) bundling or libGL system dep; identical for pipeline use and matches vllm/mistral_common/albumentations
     "torchvision",
     "einops",
     "easydict",
@@ -230,7 +234,7 @@ video_cuda12 = [
     "flash-attn<=2.8.3; (platform_machine == 'x86_64' and platform_system != 'Darwin')",
     "pycuda",
     "PyNvVideoCodec==2.0.2; (platform_machine == 'x86_64' and platform_system != 'Darwin')",
-    "torch<=2.10.0",
+    "torch<=2.11.0",
     "torchaudio",
 ]
 
@@ -252,7 +256,7 @@ interleaved_cpu = [
     "albumentations",
     "matplotlib",
     "open_clip_torch",
-    "opencv-python",
+    "opencv-python-headless",  # headless: no GUI/FFmpeg (GPL) bundling or libGL system dep; identical for pipeline use and matches vllm/mistral_common/albumentations
     "Pillow",
     "pypdfium2",
     "s3fs>=2024.12.0",
@@ -290,7 +294,7 @@ all = [
 ]
 
 [dependency-groups]
-build = ["setuptools", "torch<=2.10.0", "Cython", "packaging"]
+build = ["setuptools", "torch<=2.11.0", "Cython", "packaging"]
 dev = ["jupyter"]
 linting = ["pre-commit", "ruff==0.14.10"]
 test = [
@@ -317,6 +321,10 @@ package = true
 managed = true
 default-groups = ["dev", "test"]
 index-strategy = "unsafe-best-match"
+# Default mode: only pick a prerelease when a requirement carries an explicit
+# prerelease marker (ai-dynamo>=1.3.0.dev0) or when every version in range is a
+# prerelease (ray nightly's otel transitives). Avoids blanket prereleases elsewhere.
+prerelease = "if-necessary-or-explicit"
 no-build-isolation-package = ["flash-attn"]
 constraint-dependencies = [
     "aiohttp>=3.13.3", # Addresses CVE GHSA-6mq8-rvhq-8wgg
@@ -340,13 +348,15 @@ override-dependencies = [
     "kaldiio;  sys_platform == 'never'",
     "levenshtein;  sys_platform == 'never'",
     "numpy>=2.0.0,<=2.2.0", # Override nemo-toolkits constraint of <2.0.0, upperbounds for Numba compatibility
+    "numba==0.65.0", # Override RAPIDS/legacy caps for the inference image; vLLM 0.22 requires numba 0.65.0
     "protobuf>=5.29.5,<7.0",  # Override nemo-toolkits constraint of ~=5.29.5; <7.0 due to ray serve FieldDescriptor API breakage
     "setuptools>=80.10.1", # Override setuptools range in other dependencies to address CVE GHSA-58pv-8j8x-9vj2
-    "torch==2.10.0", # Override whisperx's <2.9 cap to match cu129 / vllm 0.18.x
-    "torchaudio==2.10.0", # Override whisperx's <2.9 cap to match cu129 / vllm 0.18.x
-    "torchvision==0.25.0", # Match torch==2.10.0
-    "torchcodec~=0.10.0; platform_machine == 'x86_64' and platform_system != 'Darwin'", # pin to torchcodec 0.10.x for torch 2.10 ABI compatibility — torchcodec doesn't declare a torch dep, so the resolver can't enforce the match; satisfies pyannote-audio's >=0.7.0 floor; x86_64-only since aarch64 lacks wheels
+    "torch==2.11.0; sys_platform == 'linux' and (platform_machine == 'x86_64' or platform_machine == 'aarch64')", # Match vLLM's CUDA requirements; Linux resolves to cu129 via tool.uv.sources
+    "torchaudio==2.11.0; sys_platform == 'linux' and (platform_machine == 'x86_64' or platform_machine == 'aarch64')", # Match torch==2.11.0
+    "torchvision==0.26.0; sys_platform == 'linux' and (platform_machine == 'x86_64' or platform_machine == 'aarch64')", # Match torch==2.11.0
+    "torchcodec~=0.11.0; platform_machine == 'x86_64' and platform_system != 'Darwin'", # pin to torchcodec 0.11.x for torch 2.11 ABI compatibility; torchcodec does not declare a torch dep, so the resolver cannot enforce the match; satisfies pyannote-audio's >=0.7.0 floor; x86_64-only since aarch64 lacks wheels
     "nixl-cu12>=0.10.0; (platform_machine == 'x86_64' and platform_system != 'Darwin')",  # Override ray[llm]'s unconditional nixl dep for ARM
+    "nixl-cu13; sys_platform == 'never'", # ray[llm]/nixl hard-pin the CUDA-13 NIXL backend. On this CUDA-12.9 image vLLM's eager `import nixl_ep` would load cu13's nixl_ep_cpp.so and dlopen the absent libcudart.so.13. Drop it; the nixl meta + nixl-cu12 backend (nixl's own default) remain.
     "xgrammar>=0.1.32", # Override vllm's ==0.1.29 pin to address CVE GHSA-7rgv-gqhr-fxg3 (DoS via multi-layer nesting)
 ]
 
@@ -365,6 +375,11 @@ name = "pytorch"
 url = "https://download.pytorch.org/whl/cu129"
 explicit = true
 
+[[tool.uv.index]]
+name = "vllm-cu129"
+url = "https://wheels.vllm.ai/0.22.0/cu129"
+explicit = true
+
 [tool.uv.sources]
 torch = [
     { index = "pytorch", marker = "sys_platform == 'linux' and (platform_machine == 'x86_64' or platform_machine == 'aarch64')" },
@@ -382,6 +397,12 @@ torchcodec = [
     { index = "pytorch", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" },
     { index = "pypi", marker = "platform_machine != 'x86_64' or sys_platform == 'darwin'" },
 ]
+ai-dynamo = { index = "nvidia" }
+ai-dynamo-runtime = { index = "nvidia" }
+vllm = [
+    { index = "vllm-cu129", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" },
+    { index = "pypi", marker = "platform_machine != 'x86_64' or sys_platform == 'darwin'" },
+]
 nixl = { index = "pypi" }
 nixl-cu12 = { index = "pypi" }
 
diff --git a/tests/core/serve/dynamo/test_vllm.py b/tests/core/serve/dynamo/test_vllm.py
@@ -346,12 +346,13 @@ class TestEnsureActorOverridesOnAllNodes:
     ``--override`` constraints file before workers are spawned."""
 
     def test_writes_current_ray_version_at_path(self, shared_ray_client: None, tmp_path: Path) -> None:
-        """The fan-out writes ``ray=={ray.__version__}`` at the configured
-        path on every alive node. Catches regressions where the content is
-        hardcoded and silently drifts after a Curator ray bump.
+        """The fan-out writes ``ray=={ray.__version__}`` plus the nixl-cu13
+        exclusion at the configured path on every alive node. Catches
+        regressions where the content is hardcoded and silently drifts after
+        a Curator ray bump.
         """
         override_path = tmp_path / "override.txt"
         with mock.patch.object(dynamo_vllm, "_ACTOR_VENV_OVERRIDES_PATH", override_path):
             dynamo_vllm.ensure_actor_overrides_on_all_nodes()
 
-        assert override_path.read_text() == f"ray=={ray.__version__}\n"
+        assert override_path.read_text() == f"ray=={ray.__version__}\n{dynamo_vllm._ACTOR_VENV_NIXL_CU13_EXCLUSION}\n"
diff --git a/uv.lock b/uv.lock