
Commit 065aabb

doc: Update TRTLLM deployment doc. (#2960)
* doc: Update TRTLLM deployment doc. Update TRTLLM CI to allow release builds when tagging TGI.
* fix: PR comments
Parent: cb747b3

File tree: 3 files changed, +155 -47 lines changed

.github/workflows/build.yaml

Lines changed: 12 additions & 6 deletions
```diff
@@ -64,7 +64,7 @@ jobs:
           export runs_on="aws-g6-12xl-plus-priv-cache"
           export platform=""
           export extra_pytest=""
-          export target="nil"
+          export target=""
           ;;
         cuda-trtllm)
           export dockerfile="Dockerfile_trtllm"
@@ -74,7 +74,13 @@ jobs:
           export runs_on="ubuntu-latest"
           export platform=""
           export extra_pytest=""
-          export build_type="dev"
+          if [[ "${GITHUB_REF}" == "refs/tags/*" ]]; then
+            export build_type="release";
+            export target="";
+          else
+            export build_type="dev";
+            export target="ci-runtime";
+          fi
           ;;
         rocm)
           export dockerfile="Dockerfile_amd"
@@ -85,7 +91,7 @@ jobs:
           export runs_on="ubuntu-latest"
           export platform=""
           export extra_pytest="-k test_flash_gemma_gptq_load"
-          export target="nil"
+          export target=""
           ;;
         intel-xpu)
           export dockerfile="Dockerfile_intel"
@@ -95,7 +101,7 @@ jobs:
           export runs_on="ubuntu-latest"
           export platform="xpu"
           export extra_pytest=""
-          export target="nil"
+          export target=""
           ;;
         intel-cpu)
           export dockerfile="Dockerfile_intel"
@@ -106,7 +112,7 @@ jobs:
           export runs_on="aws-highmemory-32-plus-priv"
           export platform="cpu"
           export extra_pytest="-k test_flash_gemma_simple"
-          export target="nil"
+          export target=""
           ;;
         esac
         echo $dockerfile
@@ -193,7 +199,7 @@ jobs:
           sccache_gha_enabled=on
           actions_cache_url=${{ env.ACTIONS_CACHE_URL }}
           actions_runtime_token=${{ env.ACTIONS_RUNTIME_TOKEN }}
-
+          target: ${{ env.TARGET }}
           tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }}
           cache-from: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=min,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=max
```
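
As a quick aside, the snippet below is a standalone sketch (not taken from the workflow file) of how a tag-gated build-type switch like the one above behaves in plain bash; the variable names mirror the ones in the diff. Note that inside `[[ ]]` a glob only matches as a pattern when it is left unquoted.

```bash
# Standalone sketch (not part of the workflow) of a tag-gated build switch.
# GITHUB_REF is set by GitHub Actions; a default is provided here for local runs.
GITHUB_REF="${GITHUB_REF:-refs/heads/main}"

# Inside [[ ]], the glob must be unquoted to match as a pattern;
# a quoted "refs/tags/*" would compare against that literal string instead.
if [[ "${GITHUB_REF}" == refs/tags/* ]]; then
    export build_type="release"   # tagged ref: release build of the final image
    export target=""              # empty target -> Docker builds the last stage
else
    export build_type="dev"       # ordinary CI run: debug build
    export target="ci-runtime"    # stop at the CI-only stage of Dockerfile_trtllm
fi

echo "build_type=${build_type} target=${target}"
```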

Dockerfile_trtllm

Lines changed: 10 additions & 9 deletions
```diff
@@ -123,15 +123,6 @@ COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt
 COPY --from=tgi-builder /usr/local/tgi /usr/local/tgi
 COPY --from=tgi-builder /usr/src/text-generation-inference/target/release/text-generation-backends-trtllm /usr/local/tgi/bin/text-generation-launcher
 
-FROM runtime
-
-LABEL co.huggingface.vendor="Hugging Face Inc."
-LABEL org.opencontainers.image.authors="[email protected]"
-LABEL org.opencontainers.title="Text-Generation-Inference TensorRT-LLM Backend"
-
-ENTRYPOINT ["./text-generation-launcher"]
-CMD ["--executor-worker", "/usr/local/tgi/bin/executorWorker"]
-
 # This is used only for the CI/CD
 FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 AS ci-runtime
 RUN apt update && apt install -y libasan8 libubsan1 libucx0 pipx python3-minimal python3-dev python3-pip python3-venv && \
@@ -152,3 +143,13 @@ COPY --from=tgi-builder /usr/local/tgi /usr/local/tgi
 
 # Basically we copy from target/debug instead of target/release
 COPY --from=tgi-builder /usr/src/text-generation-inference/target/debug/text-generation-backends-trtllm /usr/local/tgi/bin/text-generation-launcher
+
+# This is the final image
+FROM runtime
+
+LABEL co.huggingface.vendor="Hugging Face Inc."
+LABEL org.opencontainers.image.authors="[email protected]"
+LABEL org.opencontainers.title="Text-Generation-Inference TensorRT-LLM Backend"
+
+ENTRYPOINT ["./text-generation-launcher"]
+CMD ["--executor-worker", "/usr/local/tgi/bin/executorWorker"]
```

docs/source/backends/trtllm.md

Lines changed: 133 additions & 32 deletions
````diff
@@ -4,8 +4,13 @@ The NVIDIA TensorRT-LLM (TRTLLM) backend is a high-performance backend for LLMs
 that uses NVIDIA's TensorRT library for inference acceleration.
 It makes use of specific optimizations for NVIDIA GPUs, such as custom kernels.
 
-To use the TRTLLM backend you need to compile `engines` for the models you want to use.
-Each `engine` must be compiled on the same GPU architecture that you will use for inference.
+To use the TRTLLM backend **you need to compile** `engines` for the models you want to use.
+Each `engine` must be compiled for a given set of:
+- GPU architecture that you will use for inference (e.g. A100, L40, etc.)
+- Maximum batch size
+- Maximum input length
+- Maximum output length
+- Maximum beams width
 
 ## Supported models
 
@@ -19,63 +24,159 @@ want to use.
 
 ```bash
 MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct"
-
-# Install huggingface_cli
-python -m pip install huggingface-cli[hf_transfer]
-
-# Login to the Hugging Face Hub
-huggingface-cli login
-
-# Create a directory to store the model
-mkdir -p /tmp/models/$MODEL_NAME
-
-# Create a directory to store the compiled engine
-mkdir -p /tmp/engines/$MODEL_NAME
-
-# Download the model
-HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download --local-dir /tmp/models/$MODEL_NAME $MODEL_NAME
-
+DESTINATION="/tmp/engines/$MODEL_NAME"
+HF_TOKEN="hf_xxx"
 # Compile the engine using Optimum-NVIDIA
+# This will create a compiled engine in the /tmp/engines/meta-llama/Llama-3.1-8B-Instruct
+# directory for 1 GPU
 docker run \
     --rm \
     -it \
     --gpus=1 \
-    -v /tmp/models/$MODEL_NAME:/model \
-    -v /tmp/engines/$MODEL_NAME:/engine \
-    huggingface/optimum-nvidia \
-    optimum-cli export trtllm \
+    --shm-size=1g \
+    -v "$DESTINATION":/engine \
+    -e HF_TOKEN=$HF_TOKEN \
+    -e HF_HUB_ENABLE_HF_TRANSFER=1 \
+    huggingface/optimum-nvidia:v0.1.0b9-py310 \
+    bash -c "optimum-cli export trtllm \
     --tp=1 \
     --pp=1 \
-    --max-batch-size=128 \
+    --max-batch-size=64 \
     --max-input-length 4096 \
     --max-output-length 8192 \
    --max-beams-width=1 \
-    --destination /engine \
-    $MODEL_NAME
+    --destination /tmp/engine \
+    $MODEL_NAME && cp -rL /tmp/engine/* /engine/"
 ```
 
-Your compiled engine will be saved in the `/tmp/engines/$MODEL_NAME` directory.
+Your compiled engine will be saved in the `/tmp/engines/$MODEL_NAME` directory, in a subfolder named after the GPU used to compile the model.
 
 ## Using the TRTLLM backend
 
 Run TGI-TRTLLM Docker image with the compiled engine:
 
 ```bash
+MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct"
+DESTINATION="/tmp/engines/$MODEL_NAME"
+HF_TOKEN="hf_xxx"
 docker run \
     --gpus 1 \
+    --shm-size=1g \
     -it \
     --rm \
     -p 3000:3000 \
     -e MODEL=$MODEL_NAME \
     -e PORT=3000 \
-    -e HF_TOKEN='hf_XXX' \
-    -v /tmp/engines/$MODEL_NAME:/data \
+    -e HF_TOKEN=$HF_TOKEN \
+    -v "$DESTINATION"/<YOUR_GPU_ARCHITECTURE>/engines:/data \
     ghcr.io/huggingface/text-generation-inference:latest-trtllm \
-    --executor-worker executorWorker \
-    --model-id /data/$MODEL_NAME
+    --model-id /data/ \
+    --tokenizer-name $MODEL_NAME
 ```
 
 ## Development
 
-To develop TRTLLM backend, you can use [dev containers](https://containers.dev/) located in
-`.devcontainer` directory.
+To develop TRTLLM backend, you can use [dev containers](https://containers.dev/) with the following `.devcontainer.json` file:
+```json
+{
+    "name": "CUDA",
+    "build": {
+        "dockerfile": "Dockerfile_trtllm",
+        "context": ".."
+    },
+    "remoteEnv": {
+        "PATH": "${containerEnv:PATH}:/usr/local/cuda/bin",
+        "LD_LIBRARY_PATH": "$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64",
+        "XLA_FLAGS": "--xla_gpu_cuda_data_dir=/usr/local/cuda"
+    },
+    "customizations" : {
+        "jetbrains" : {
+            "backend" : "CLion"
+        }
+    }
+}
+```
+
+and `Dockerfile_trtllm`:
+
+```Dockerfile
+ARG cuda_arch_list="75-real;80-real;86-real;89-real;90-real"
+ARG build_type=release
+ARG ompi_version=4.1.7
+
+# CUDA dependent dependencies resolver stage
+FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS cuda-builder
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+    build-essential \
+    cmake \
+    curl \
+    gcc-14 \
+    g++-14 \
+    git \
+    git-lfs \
+    lld \
+    libssl-dev \
+    libucx-dev \
+    libasan8 \
+    libubsan1 \
+    ninja-build \
+    pkg-config \
+    pipx \
+    python3 \
+    python3-dev \
+    python3-setuptools \
+    tar \
+    wget --no-install-recommends && \
+    pipx ensurepath
+
+ENV TGI_INSTALL_PREFIX=/usr/local/tgi
+ENV TENSORRT_INSTALL_PREFIX=/usr/local/tensorrt
+
+# Install OpenMPI
+FROM cuda-builder AS mpi-builder
+WORKDIR /opt/src/mpi
+
+ARG ompi_version
+ENV OMPI_VERSION=${ompi_version}
+ENV OMPI_TARBALL_FILENAME=openmpi-${OMPI_VERSION}.tar.bz2
+ADD --checksum=sha256:54a33cb7ad81ff0976f15a6cc8003c3922f0f3d8ceed14e1813ef3603f22cd34 \
+    https://download.open-mpi.org/release/open-mpi/v4.1/${OMPI_TARBALL_FILENAME} .
+
+RUN tar --strip-components=1 -xf ${OMPI_TARBALL_FILENAME} &&\
+    ./configure --prefix=/usr/local/mpi --with-cuda=/usr/local/cuda --with-slurm && \
+    make -j all && \
+    make install && \
+    rm -rf ${OMPI_TARBALL_FILENAME}/..
+
+# Install TensorRT
+FROM cuda-builder AS trt-builder
+COPY backends/trtllm/scripts/install_tensorrt.sh /opt/install_tensorrt.sh
+RUN chmod +x /opt/install_tensorrt.sh && \
+    /opt/install_tensorrt.sh
+
+# Build Backend
+FROM cuda-builder AS tgi-builder
+WORKDIR /usr/src/text-generation-inference
+
+# Scoped global args reuse
+ARG cuda_arch_list
+ARG build_type
+ARG sccache_gha_enabled
+ARG actions_cache_url
+ARG actions_runtime_token
+
+# Install Rust
+ENV PATH="/root/.cargo/bin:$PATH"
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | bash -s -- -y && \
+    chmod -R a+w /root/.rustup && \
+    chmod -R a+w /root/.cargo && \
+    cargo install sccache --locked
+
+ENV LD_LIBRARY_PATH="/usr/local/mpi/lib:$LD_LIBRARY_PATH"
+ENV PKG_CONFIG_PATH="/usr/local/mpi/lib/pkgconfig"
+ENV CMAKE_PREFIX_PATH="/usr/local/mpi:/usr/local/tensorrt"
+
+ENV USE_LLD_LINKER=ON
+ENV CUDA_ARCH_LIST=${cuda_arch_list}
+```
````
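
Not part of the diff above, but as a quick sanity check of the documented `docker run` command: once the TGI-TRTLLM container is listening on port 3000, TGI's standard `/generate` endpoint can be queried. The prompt and generation parameters below are illustrative.

```bash
# Illustrative smoke test against the container started per the docs above.
curl http://localhost:3000/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"inputs": "What is TensorRT-LLM?", "parameters": {"max_new_tokens": 64}}'
```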
