Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions benchmarking/nightly-benchmark.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ entries:
--input-path={dataset:tinystories,parquet}
--dataset-size-gb=10
--model-identifier=google/embeddinggemma-300m
--model-variation=vllm_text
--model-variation=vllm_text_pretokenized
--cache-dir={dataset:text_models_hf_cache,files}
timeout_s: 800
sink_data:
Expand All @@ -278,7 +278,7 @@ entries:
--input-path={dataset:tinystories,parquet}
--dataset-size-gb=10
--model-identifier=google/embeddinggemma-300m
--model-variation=vllm_text
--model-variation=vllm_text_pretokenized
--cache-dir={dataset:text_models_hf_cache,files}
timeout_s: 800
sink_data:
Expand Down Expand Up @@ -875,7 +875,7 @@ entries:
--video-dir={dataset:videos,mp4}
--model-dir={dataset:videos_model_weights,files}
--video-limit=1000
timeout_s: 400
timeout_s: 900
sink_data:
- name: slack
ping_on_failure:
Expand Down
8 changes: 3 additions & 5 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,9 @@ COPY docker/common/install_etcd_nats.sh .
RUN bash install_etcd_nats.sh && \
rm install_etcd_nats.sh

# Install HAProxy so Ray Serve's HAProxy ingress mode (RAY_SERVE_ENABLE_HA_PROXY=1) has the binary on PATH
COPY docker/common/install_haproxy.sh .
RUN bash install_haproxy.sh && \
rm install_haproxy.sh
ENV RAY_SERVE_HAPROXY_BINARY_PATH=/usr/local/bin/haproxy
# Use the ray-haproxy wheel so Ray Serve's HAProxy ingress mode can discover
# the packaged binary without compiling HAProxy in the image.
ENV RAY_SERVE_EXPERIMENTAL_PIP_HAPROXY=1


FROM nemo_curator_dep AS nemo_curator
Expand Down
69 changes: 0 additions & 69 deletions docker/common/install_haproxy.sh

This file was deleted.

29 changes: 29 additions & 0 deletions nemo_curator/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,35 @@
os.environ["RAY_MAX_LIMIT_FROM_API_SERVER"] = str(API_LIMIT)
os.environ["RAY_MAX_LIMIT_FROM_DATA_SOURCE"] = str(API_LIMIT)


def _ensure_ray_dashboard_frontend() -> None:
"""Stub Ray's dashboard frontend dir once (nightly ray only), before any cluster starts.

Ray *nightly* wheels omit the prebuilt dashboard frontend (``dashboard/client/build``
is an npm artifact built only for releases), so the dashboard process dies with
``FrontendNotFoundError`` and its state API server never registers — which breaks
every ``ray.util.state`` call (Xenna drives pipelines through it) with "Could not
read 'dashboard' from GCS". Creating the dir (relative to the installed ``ray``, so
it works in any venv) lets the dashboard start; the web UI itself is unused.

Gated to dev/nightly builds so published releases (which ship the frontend) are
untouched. Best-effort: a read-only install must not break ``import``.
"""
import contextlib
from pathlib import Path

import ray
from packaging.version import Version

if not Version(ray.__version__).is_devrelease:
return
# Best-effort: a read-only install must not break ``import nemo_curator``.
with contextlib.suppress(OSError):
(Path(ray.__file__).parent / "dashboard" / "client" / "build" / "static").mkdir(parents=True, exist_ok=True)


_ensure_ray_dashboard_frontend()

# Raise an informative error early to users on unsupported systems
if sys.platform != "linux":
_msg = (
Expand Down
97 changes: 86 additions & 11 deletions nemo_curator/core/serve/dynamo/vllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from __future__ import annotations

import importlib.metadata
import json
import tempfile
from functools import reduce
Expand All @@ -24,6 +25,7 @@

import ray
from loguru import logger
from packaging.requirements import InvalidRequirement, Requirement

from nemo_curator.core.serve.base import BaseModelConfig
from nemo_curator.core.serve.dynamo.infra import (
Expand All @@ -50,19 +52,91 @@
from nemo_curator.core.serve.placement import ReplicaBundleSpec


# ai-dynamo[vllm]'s [vllm] extra carries a hard ray pin, but Ray refuses
# actor venvs whose ray version differs from the cluster head's. uv has no
# inline override syntax — only ``--override <file>`` — so we materialize a
# tiny constraints file at a fixed path on every node via
# ``ensure_actor_overrides_on_all_nodes``; the content is derived from the
# driver's ``ray.__version__`` at fan-out time so a future Curator ray bump
# doesn't need a code change here.
# The actor venv ``uv pip install`` needs overrides that pyproject's ``[tool.uv]``
# can't reach (Ray runs it in an empty cwd). uv has no inline override syntax —
# only ``--override <file>`` — so we materialize a constraints file at a fixed path
# on every node via ``ensure_actor_overrides_on_all_nodes``. It carries:
# * ``ray==<driver version>`` — ai-dynamo[vllm]'s [vllm] extra has a hard ray pin,
# but Ray refuses actor venvs whose ray differs from the cluster head's. Derived
# from the driver's ``ray.__version__`` so a future Curator ray bump needs no edit.
# * ``nixl-cu13`` dropped — ai-dynamo[vllm] pulls the CUDA-13 NIXL backend, whose
# eagerly-imported ``nixl_ep_cpp.so`` dlopens libcudart.so.13 (absent on this
# CUDA-12.9 image). The base image excludes it via pyproject, but that override
# doesn't reach this standalone install; re-apply it here so the cu12 backend wins.
_ACTOR_VENV_OVERRIDES_PATH = Path(tempfile.gettempdir()) / "nemo_curator_dynamo_actor_overrides.txt"
_ACTOR_VENV_NIXL_CU13_EXCLUSION = "nixl-cu13 ; sys_platform == 'never'"
# The CUDA build the actor venv must match (torch ecosystem + vllm wheel variant).
_ACTOR_VENV_CUDA_TAG = "cu129"
# Latest known nightly that includes ai-dynamo/dynamo#10380 while ai-dynamo[vllm]
# still pins vLLM 0.22.x. Newer 1.3.0 nightlies moved to vLLM 0.23.0.
_DYNAMO_NIGHTLY_VERSION = "1.3.0.dev20260615"


def _vllm_cu129_index_url() -> str | None:
"""The vLLM cu129 wheel index for the exact version ai-dynamo[vllm] pins.

ai-dynamo's [vllm] extra pins an exact vllm (e.g. ``==0.22.1``) that may
differ from Curator's base vllm — the base installs ai-dynamo WITHOUT its
[vllm] extra, so its vllm comes from Curator's own pin, while the actor
venv installs ``ai-dynamo[vllm]`` and must honor ai-dynamo's pin. vLLM
publishes a per-version cu129 wheel index at ``wheels.vllm.ai/<v>/cu129``;
pointing at the pinned version means its ``+cu129`` local build sorts above
the default cu130 wheel under unsafe-best-match. Derived from ai-dynamo's
own metadata so a nightly bump (which changes the vllm pin) needs no edit.

Returns None if ai-dynamo (or its vllm pin) can't be found — only happens
when the dynamo backend isn't actually installed, where this is unused.
"""
try:
requirements = importlib.metadata.requires("ai-dynamo") or []
except importlib.metadata.PackageNotFoundError:
return None
for raw in requirements:
try:
req = Requirement(raw)
except InvalidRequirement:
continue # a malformed Requires-Dist line must not break module import
# Match vllm only as it applies under the [vllm] extra we install (skip a vllm
# pin that some other ai-dynamo extra might add under a different marker).
if req.name != "vllm" or (req.marker is not None and not req.marker.evaluate({"extra": "vllm"})):
continue
pinned = next((spec.version for spec in req.specifier if spec.operator in ("==", "===")), None)
if pinned:
return f"https://wheels.vllm.ai/{pinned}/{_ACTOR_VENV_CUDA_TAG}"
Comment thread
praateekmahajan marked this conversation as resolved.
return None
Comment thread
praateekmahajan marked this conversation as resolved.


# Ray builds the actor venv with a bare ``uv pip install`` in an empty cwd, so it
# inherits none of the project's ``[tool.uv]`` index/source/prerelease config — only
# what we pass here. Force CUDA 12.9 the way vLLM documents for uv: --torch-backend
# routes the torch ecosystem to the cu129 index, and the per-version cu129 vllm index
# (see ``_vllm_cu129_index_url``) keeps vllm on cu129. ``unsafe-best-match`` is REQUIRED
# so nixl resolves (its version is split across pypi.nvidia.com and PyPI, which the
# default first-match strategy can't combine).
_ACTOR_VENV_UV_OPTIONS = [
"--override",
str(_ACTOR_VENV_OVERRIDES_PATH),
"--torch-backend",
_ACTOR_VENV_CUDA_TAG,
"--index-strategy",
"unsafe-best-match",
"--prerelease",
"if-necessary-or-explicit",
*(
arg
for url in ("https://pypi.nvidia.com", _vllm_cu129_index_url())
if url is not None
for arg in ("--extra-index-url", url)
),
]

DYNAMO_VLLM_RUNTIME_ENV: dict[str, Any] = {
"uv": {
"packages": ["ai-dynamo[vllm]"],
"uv_pip_install_options": ["--override", str(_ACTOR_VENV_OVERRIDES_PATH)],
"packages": [
f"ai-dynamo[vllm]=={_DYNAMO_NIGHTLY_VERSION}",
f"ai-dynamo-runtime=={_DYNAMO_NIGHTLY_VERSION}",
],
"uv_pip_install_options": _ACTOR_VENV_UV_OPTIONS,
},
"config": {"setup_timeout_seconds": 600},
}
Expand All @@ -78,7 +152,8 @@ def ensure_actor_overrides_on_all_nodes(*, ignore_head_node: bool = False) -> No

The file pins ``ray=={ray.__version__}`` (read from the driver) so the
actor venv keeps the same ray patch as the cluster head — Ray rejects
any mismatch.
any mismatch — and drops ``nixl-cu13`` so the cu12 NIXL backend is used
(see module comment on :data:`_ACTOR_VENV_OVERRIDES_PATH`).

Must run inside an active Ray context, before any worker spawned with
:data:`DYNAMO_VLLM_RUNTIME_ENV` lands. The runtime_env_agent on each
Expand All @@ -91,7 +166,7 @@ def ensure_actor_overrides_on_all_nodes(*, ignore_head_node: bool = False) -> No
run_on_each_node(
_write_actor_overrides_file,
str(_ACTOR_VENV_OVERRIDES_PATH),
f"ray=={ray.__version__}\n",
f"ray=={ray.__version__}\n{_ACTOR_VENV_NIXL_CU13_EXCLUSION}\n",
ignore_head_node=ignore_head_node,
)

Expand Down
2 changes: 1 addition & 1 deletion nemo_curator/stages/text/embedders/vllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def __init__( # noqa: PLR0913
model_identifier: str,
vllm_init_kwargs: dict[str, Any] | None = None,
text_field: str = "text",
pretokenize: bool = False,
pretokenize: bool = True,
embedding_field: str = "embeddings",
max_chars: int | None = None,
cache_dir: str | None = None,
Expand Down
Loading
Loading