Changes from all commits (43 commits)
3bf9f87
remove requirements from vllm-base
afeldman-nm Oct 2, 2025
7f9093a
wip
afeldman-nm Oct 2, 2025
b2f73ee
enable requirements install with cuda, common
Oct 8, 2025
7bc6dbe
dummy change
Oct 9, 2025
cb90aa5
docker file
Oct 9, 2025
e021716
fix
Oct 9, 2025
82650f9
added back in common reqs
Oct 9, 2025
b0cdb79
add dev reqs
Oct 9, 2025
ce13e2a
add dev reqs
Oct 9, 2025
0577068
add dev reqs
Oct 9, 2025
09ccbb7
Merge branch 'main' into ci_build_dockerfile
Oct 10, 2025
a970e28
move add
Oct 10, 2025
b11fbc4
Merge branch 'main' into ci_build_dockerfile
Oct 10, 2025
ef39117
dummy change
Oct 10, 2025
d279ff4
Merge branch 'main' into ci_build_dockerfile
Oct 10, 2025
c23a175
mount
Oct 20, 2025
67ab819
Merge branch 'main' into ci_build_dockerfile
Oct 20, 2025
3172cc6
mount
Oct 20, 2025
de665da
fix
Oct 20, 2025
f15ffb2
Merge branch 'main' into ci_build_dockerfile
Oct 20, 2025
4eb634d
fix
Oct 20, 2025
b7ad2b9
tweak
Oct 21, 2025
93026fa
mount
Oct 21, 2025
b0bcab7
Merge branch 'main' into ci_build_dockerfile
Oct 21, 2025
e51463a
dummy change
Oct 21, 2025
59411b6
Merge branch 'main' into ci_build_dockerfile
Oct 21, 2025
861ff0c
sccache
Oct 21, 2025
7a16bee
Merge branch 'main' into ci_build_dockerfile
Oct 21, 2025
344ec0d
sccache
Oct 21, 2025
0c3d51c
Merge branch 'main' into ci_build_dockerfile
Oct 21, 2025
49edd97
merge; apache issue
abf149 Oct 21, 2025
c37fa4c
Merge branch 'main' into ci_build_dockerfile
abf149 Oct 21, 2025
c1e266c
threads
abf149 Oct 21, 2025
523fd30
prerelease=allow
abf149 Oct 21, 2025
82b70b3
again
abf149 Oct 21, 2025
e02731b
again
abf149 Oct 21, 2025
fd479ad
tweek
abf149 Oct 21, 2025
dc048ac
Merge branch 'main' into ci_build_dockerfile
abf149 Oct 22, 2025
95d33a7
tweak
abf149 Oct 22, 2025
e4c0e7c
debug
Oct 23, 2025
d43045d
Merge branch 'ci_build_dockerfile' of https://github.com/neuralmagic/…
Oct 23, 2025
a506e8b
tweak
Oct 24, 2025
43a7d8c
Merge branch 'main' into ci_build_dockerfile
Oct 24, 2025
44 changes: 33 additions & 11 deletions docker/Dockerfile
@@ -112,6 +112,8 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy

ENV SCCACHE_LOG="debug"

# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
RUN apt-get install -y gcc-10 g++-10
@@ -134,7 +136,9 @@ COPY requirements/cuda.txt requirements/cuda.txt
RUN --mount=type=cache,target=/root/.cache/uv \
# TODO: remove apache-tvm-ffi once FlashInfer is fixed https://github.com/flashinfer-ai/flashinfer/issues/1962
uv pip install --python /opt/venv/bin/python3 --pre apache-tvm-ffi==0.1.0b15 \
--prerelease=allow \
&& uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
--prerelease=allow \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
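
The cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') expression above derives the PyTorch extra-index suffix from the CUDA version and recurs in several later hunks. A minimal standalone sketch of what it evaluates to, with an assumed CUDA_VERSION of 12.8.1 (value illustrative, not taken from this Dockerfile):

# Sketch only; the CUDA_VERSION value is assumed for illustration.
RUN CUDA_VERSION=12.8.1; \
    echo "cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')"   # prints cu128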

# cuda arch list used by torch
@@ -153,6 +157,12 @@ ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL

# Ensure build caching is enabled
ENV CMAKE_C_COMPILER_LAUNCHER=sccache
ENV CMAKE_CXX_COMPILER_LAUNCHER=sccache
ENV CMAKE_CUDA_COMPILER_LAUNCHER=sccache
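
With these launcher variables exported, CMake prefixes every C, C++, and CUDA compiler invocation with sccache, so repeated kernel builds can hit the compilation cache. A minimal sketch of the equivalent explicit-flag invocation (build directory, job count fallback, and the stats call are assumptions, not part of this PR):

# Sketch only; explicit-flag form of the ENV-based launcher setup above.
RUN cmake -S . -B build \
        -DCMAKE_C_COMPILER_LAUNCHER=sccache \
        -DCMAKE_CXX_COMPILER_LAUNCHER=sccache \
        -DCMAKE_CUDA_COMPILER_LAUNCHER=sccache \
    && cmake --build build -j "${MAX_JOBS:-4}" \
    && sccache --show-stats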


# install build dependencies
COPY requirements/build.txt requirements/build.txt

@@ -173,10 +183,10 @@ RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi

# max jobs used by Ninja to build extensions
ARG max_jobs=2
ARG max_jobs=16
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ARG nvcc_threads=32
ENV NVCC_THREADS=$nvcc_threads
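
Both knobs are plain build args, so they can be dialed back for smaller CI runners without editing the Dockerfile. A hypothetical host-side invocation (argument values are illustrative; the vllm-base stage name comes from a later hunk in this file):

docker build -f docker/Dockerfile \
    --build-arg max_jobs=4 \
    --build-arg nvcc_threads=8 \
    --target vllm-base .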

ARG USE_SCCACHE
@@ -353,13 +363,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--pre pytorch_triton==3.3.0+gitab727c40 ; \
fi

# Install vllm wheel first, so that torch etc will be installed.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/uv \
# TODO: remove apache-tvm-ffi once FlashInfer is fixed https://github.com/flashinfer-ai/flashinfer/issues/1962
uv pip install --system --pre apache-tvm-ffi==0.1.0b15 \
&& uv pip install --system dist/*.whl --verbose \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# install build and runtime dependencies
COPY requirements/common.txt requirements/common.txt
COPY requirements/cuda.txt requirements/cuda.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/cuda.txt \
--prerelease=allow \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

# TODO (huydhn): Remove this once xformers is released for 2.9.0
RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
@@ -428,8 +438,6 @@ ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}
# note that this uses vllm installed by `pip`
FROM vllm-base AS test

ADD . /vllm-workspace/

ARG PYTHON_VERSION

ARG PIP_INDEX_URL UV_INDEX_URL
@@ -444,13 +452,19 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
ENV UV_LINK_MODE=copy

# install development dependencies (for testing)
COPY requirements/lint.txt requirements/lint.txt
COPY requirements/test.txt requirements/test.txt
COPY requirements/dev.txt requirements/dev.txt
RUN --mount=type=cache,target=/root/.cache/uv \
CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
if [ "$CUDA_MAJOR" -ge 12 ]; then \
uv pip install --system -r requirements/dev.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
fi
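
The ${CUDA_VERSION%%.*} expansion strips everything from the first dot onward, so the guard only installs the dev requirements on CUDA 12 or newer bases. A quick shell sketch with an assumed version string:

# Sketch only; the version value is illustrative.
RUN CUDA_VERSION=12.8.1; \
    echo "${CUDA_VERSION%%.*}"   # prints 12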

# Move host vLLM workspace into image
ADD . /vllm-workspace/

# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -e tests/vllm_test_utils
@@ -463,6 +477,14 @@ ENV HF_HUB_ENABLE_HF_TRANSFER 1
# Copy in the v1 package for testing (it isn't distributed yet)
COPY vllm/v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1

# Install vllm wheel
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/uv \
uv pip install --system --pre apache-tvm-ffi==0.1.0b15 \
--prerelease=allow \
dist/*.whl --verbose \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
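
The bind mount exposes the wheels built in the build stage at /vllm-workspace/dist only for the duration of this RUN, so the final layer keeps the installed package but not the wheel files themselves. A hedged sanity check one could append after this step (not part of this PR):

# Sketch only; verifies that the freshly installed wheel imports cleanly.
RUN python3 -c "import vllm; print(vllm.__version__)"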

# Source code is used in the `python_only_compile.sh` test
# We hide it inside `src/` so that this source code
# will not be imported by other tests
2 changes: 1 addition & 1 deletion vllm/v1/attention/backends/flash_attn.py
@@ -303,7 +303,7 @@ def build(
max_num_splits = 0 # 0 means use FA3's heuristics, not CG compatible
if self.use_full_cuda_graph and num_actual_tokens <= self.max_cudagraph_size:
# NOTE(woosuk): Setting num_splits > 1 may increase the memory
# usage, because the intermediate buffers of size [num_splits,
# usage, because the intermediate buffers of size [num_splits,
# num_heads, num_tokens, head_size] are allocated. Therefore,
# we only set num_splits when using cuda graphs.
max_num_splits = self.max_num_splits
4 changes: 2 additions & 2 deletions vllm/v1/cudagraph_dispatcher.py
@@ -11,9 +11,9 @@ class CudagraphDispatcher:
Runtime cudagraph dispatcher to dispatch keys for multiple set of
cudagraphs.

The dispatcher stores two sets of dispatch keys, one for PIECEWISE and one
The dispatcher stores two sets of dispatch keys, one for PIECEWISE and one
for FULL cudagraph runtime mode. The keys are initialized depending on
attention support and what cudagraph mode is set in CompilationConfig. The
attention support and what cudagraph mode is set in CompilationConfig. The
keys stored in dispatcher are the only source of truth for valid
cudagraphs that can be dispatched at runtime.
