145 changes: 145 additions & 0 deletions Dockerfile-cuda-blackwell
@@ -0,0 +1,145 @@
FROM nvidia/cuda:12.9.0-devel-ubuntu22.04 AS base-builder

ENV SCCACHE=0.10.0
ENV RUSTC_WRAPPER=/usr/local/bin/sccache
ENV PATH="/root/.cargo/bin:${PATH}"
# aligned with `cargo-chef` version in `lukemathwalker/cargo-chef:latest-rust-1.85-bookworm`
ENV CARGO_CHEF=0.1.71

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
curl \
libssl-dev \
pkg-config \
&& rm -rf /var/lib/apt/lists/*

# Download and install sccache
RUN curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-x86_64-unknown-linux-musl.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-x86_64-unknown-linux-musl/sccache && \
chmod +x /usr/local/bin/sccache

RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
RUN cargo install cargo-chef --version $CARGO_CHEF --locked

FROM base-builder AS planner

WORKDIR /usr/src

COPY backends backends
COPY core core
COPY router router
COPY Cargo.toml ./
COPY Cargo.lock ./

RUN cargo chef prepare --recipe-path recipe.json

FROM base-builder AS builder

ARG CUDA_COMPUTE_CAP=80
ARG GIT_SHA
ARG DOCKER_LABEL

# Limit parallelism
ARG RAYON_NUM_THREADS
ARG CARGO_BUILD_JOBS
ARG CARGO_BUILD_INCREMENTAL

# sccache specific variables
ARG SCCACHE_GHA_ENABLED

WORKDIR /usr/src

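# Prune libcublas_static.a down to only the SM architectures the target
# compute cap needs, so the static link below is smaller and faster.
# Blackwell (cap 120) keeps only sm_120 device code.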
RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
--mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
if [ ${CUDA_COMPUTE_CAP} -ge 75 -a ${CUDA_COMPUTE_CAP} -lt 80 ]; \
then \
nvprune --generate-code code=sm_${CUDA_COMPUTE_CAP} /usr/local/cuda/lib64/libcublas_static.a -o /usr/local/cuda/lib64/libcublas_static.a; \
elif [ ${CUDA_COMPUTE_CAP} -ge 80 -a ${CUDA_COMPUTE_CAP} -lt 90 ]; \
then \
nvprune --generate-code code=sm_80 --generate-code code=sm_${CUDA_COMPUTE_CAP} /usr/local/cuda/lib64/libcublas_static.a -o /usr/local/cuda/lib64/libcublas_static.a; \
elif [ ${CUDA_COMPUTE_CAP} -eq 90 ]; \
then \
nvprune --generate-code code=sm_90 /usr/local/cuda/lib64/libcublas_static.a -o /usr/local/cuda/lib64/libcublas_static.a; \
elif [ ${CUDA_COMPUTE_CAP} -eq 120 ]; \
then \
nvprune --generate-code code=sm_120 /usr/local/cuda/lib64/libcublas_static.a -o /usr/local/cuda/lib64/libcublas_static.a; \
else \
echo "cuda compute cap ${CUDA_COMPUTE_CAP} is not supported"; exit 1; \
fi;

COPY --from=planner /usr/src/recipe.json recipe.json

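# Build only the dependency layers described by recipe.json so that later
# source changes do not invalidate the compiled-dependency cache. Turing
# (cap 75-79) needs the candle-cuda-turing feature instead of candle-cuda.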
RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
--mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
if [ ${CUDA_COMPUTE_CAP} -ge 75 -a ${CUDA_COMPUTE_CAP} -lt 80 ]; \
then \
cargo chef cook --release --features candle-cuda-turing --features static-linking --no-default-features --recipe-path recipe.json && sccache -s; \
else \
cargo chef cook --release --features candle-cuda --features static-linking --no-default-features --recipe-path recipe.json && sccache -s; \
fi;

COPY backends backends
COPY core core
COPY router router
COPY Cargo.toml ./
COPY Cargo.lock ./

FROM builder AS http-builder

RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
--mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
if [ ${CUDA_COMPUTE_CAP} -ge 75 -a ${CUDA_COMPUTE_CAP} -lt 80 ]; \
then \
cargo build --release --bin text-embeddings-router -F candle-cuda-turing -F static-linking -F http --no-default-features && sccache -s; \
else \
cargo build --release --bin text-embeddings-router -F candle-cuda -F static-linking -F http --no-default-features && sccache -s; \
fi;

FROM builder AS grpc-builder

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
unzip \
&& rm -rf /var/lib/apt/lists/*

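# protoc is required to generate the gRPC stubs from the proto definitions
# copied in below.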
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
rm -f $PROTOC_ZIP

COPY proto proto

RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
--mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
if [ ${CUDA_COMPUTE_CAP} -ge 75 -a ${CUDA_COMPUTE_CAP} -lt 80 ]; \
then \
cargo build --release --bin text-embeddings-router -F candle-cuda-turing -F static-linking -F grpc --no-default-features && sccache -s; \
else \
cargo build --release --bin text-embeddings-router -F candle-cuda -F static-linking -F grpc --no-default-features && sccache -s; \
fi;

FROM nvidia/cuda:12.9.0-base-ubuntu22.04 AS base

ARG DEFAULT_USE_FLASH_ATTENTION=True

ENV HUGGINGFACE_HUB_CACHE=/data \
PORT=80 \
USE_FLASH_ATTENTION=$DEFAULT_USE_FLASH_ATTENTION

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
ca-certificates \
libssl-dev \
curl \
&& rm -rf /var/lib/apt/lists/*

FROM base AS grpc

COPY --from=grpc-builder /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router

ENTRYPOINT ["text-embeddings-router"]
CMD ["--json-output"]

FROM base

COPY --from=http-builder /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router

ENTRYPOINT ["text-embeddings-router"]
CMD ["--json-output"]
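For a quick end-to-end check of the new Dockerfile, something like the following works; this is a sketch rather than part of the change, and the image tag, model id, and host port are illustrative assumptions:

```shell
# Build for Blackwell (compute cap 120) and tag the image
docker build . -f Dockerfile-cuda-blackwell --build-arg CUDA_COMPUTE_CAP=120 -t tei-cuda-blackwell

# Run on a Blackwell GPU; /data matches HUGGINGFACE_HUB_CACHE inside the image
docker run --rm --gpus all -p 8080:80 -v $PWD/data:/data \
    tei-cuda-blackwell --model-id BAAI/bge-small-en-v1.5
```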
35 changes: 35 additions & 0 deletions README.md
@@ -557,6 +557,8 @@ You can build the CPU container with:
docker build .
```

### CUDA - Pre-Blackwell architectures

To build the CUDA containers, you need to know the compute cap of the GPU you will be using
at runtime.

@@ -584,6 +586,39 @@ runtime_compute_cap=90
docker build . -f Dockerfile-cuda --build-arg CUDA_COMPUTE_CAP=$runtime_compute_cap
```

### CUDA - Blackwell architecture

To build the CUDA containers for the Blackwell architecture, CUDA 12.9 is required: use the dedicated
`Dockerfile-cuda-blackwell` and set the compute cap to 120.
This Dockerfile can also be used to build for the earlier architectures, as the examples below show.

Commands to build the container:

```shell
# Get submodule dependencies
git submodule update --init

# Example for Turing (T4, RTX 2000 series, ...)
runtime_compute_cap=75

# Example for A100
runtime_compute_cap=80

# Example for A10
runtime_compute_cap=86

# Example for Ada Lovelace (RTX 4000 series, ...)
runtime_compute_cap=89

# Example for H100
runtime_compute_cap=90

# Example for Blackwell (RTX 5000 series, ...)
runtime_compute_cap=120

docker build . -f Dockerfile-cuda-blackwell --build-arg CUDA_COMPUTE_CAP=$runtime_compute_cap
```
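
If you are unsure which compute cap a machine's GPU reports, recent NVIDIA drivers can print it directly; a quick check, assuming the installed driver is new enough to expose the `compute_cap` query field:

```shell
# Prints e.g. "12.0" on a Blackwell card, which maps to CUDA_COMPUTE_CAP=120
nvidia-smi --query-gpu=compute_cap --format=csv,noheader
```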

### Apple M1/M2 arm64 architectures

#### DISCLAIMER
2 changes: 2 additions & 0 deletions backends/candle/src/compute_cap.rs
@@ -30,6 +30,7 @@ fn compute_cap_matching(runtime_compute_cap: usize, compile_compute_cap: usize)
(86..=89, 80..=86) => true,
(89, 89) => true,
(90, 90) => true,
(120, 120) => true,
(_, _) => false,
}
}
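
The new arm keeps Blackwell strict: a runtime cap of 120 only accepts binaries compiled for cap 120, with none of the cross-generation fallback the Ampere arms allow. A self-contained sketch of the semantics (a condensed restatement of the hunk above, not the crate's actual code; the arms elided by the diff are omitted):

```rust
fn compute_cap_matching(runtime: usize, compiled: usize) -> bool {
    matches!(
        (runtime, compiled),
        (86..=89, 80..=86) | (89, 89) | (90, 90) | (120, 120)
    )
}

fn main() {
    assert!(compute_cap_matching(120, 120)); // Blackwell binary on a Blackwell GPU
    assert!(!compute_cap_matching(120, 90)); // no fallback from Hopper builds
    assert!(compute_cap_matching(89, 80));   // Ada still runs Ampere builds
}
```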
@@ -54,6 +55,7 @@ mod tests {
assert!(compute_cap_matching(86, 86));
assert!(compute_cap_matching(89, 89));
assert!(compute_cap_matching(90, 90));
assert!(compute_cap_matching(120, 120));

assert!(compute_cap_matching(86, 80));
assert!(compute_cap_matching(89, 80));
2 changes: 1 addition & 1 deletion backends/candle/src/flash_attn.rs
@@ -61,7 +61,7 @@ pub(crate) fn flash_attn_varlen(
}
#[cfg(not(feature = "flash-attn-v1"))]
candle::bail!("Flash attention v1 is not installed. Use `flash-attn-v1` feature.")
-    } else if (80..90).contains(&runtime_compute_cap) || runtime_compute_cap == 90 {
+    } else if (80..90).contains(&runtime_compute_cap) || runtime_compute_cap == 90 || runtime_compute_cap == 120 {
#[cfg(feature = "flash-attn")]
{
use candle_flash_attn::{flash_attn_varlen_alibi_windowed, flash_attn_varlen_windowed};
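
The widened predicate now routes Ampere, Hopper, and Blackwell to flash attention v2. Since `== 90` only extends the half-open `(80..90)` range by one value, the whole check collapses to a single pattern; a hedged refactor suggestion, not something the PR does:

```rust
fn main() {
    // Same truth table as:
    // (80..90).contains(&cap) || cap == 90 || cap == 120
    for cap in [75usize, 80, 89, 90, 100, 120] {
        let use_v2 = matches!(cap, 80..=90 | 120);
        println!("compute cap {cap}: flash-attn v2 = {use_v2}");
    }
}
```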
6 changes: 4 additions & 2 deletions docs/source/en/custom_container.md
@@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.

# Build a custom container for TEI

You can build your own CPU or CUDA TEI container using Docker. To build a CPU container, run the following command in the
directory containing your custom Dockerfile:

```shell
@@ -32,9 +32,11 @@ the examples of runtime compute capabilities for various GPU types:
- A10 - `runtime_compute_cap=86`
- Ada Lovelace (RTX 4000 series, ...) - `runtime_compute_cap=89`
- H100 - `runtime_compute_cap=90`
- Blackwell (RTX 5000 series, ...) - `runtime_compute_cap=120`

Once the compute capability is determined, set it as the `runtime_compute_cap` variable and build
-the container as shown in the example below:
+the container using `Dockerfile-cuda` if the runtime compute cap is lower than 120, otherwise use
+`Dockerfile-cuda-blackwell`, as shown in the example below:

```shell
# Get submodule dependencies
git submodule update --init

# Set the compute cap determined above, e.g. for H100
runtime_compute_cap=90

# Use Dockerfile-cuda-blackwell instead when the compute cap is 120
docker build . -f Dockerfile-cuda --build-arg CUDA_COMPUTE_CAP=$runtime_compute_cap
```