doc: Update TRTLLM deployment doc. (#2960)
* doc: Update TRTLLM deployment doc. Update TRTLLM CI to allow release builds when tagging TGI.

* doc: Update TRTLLM deployment doc. Update TRTLLM CI to allow release builds when tagging TGI.

* fix: PR comments
Hugoch authored Jan 30, 2025
1 parent cb747b3 commit 065aabb
Showing 3 changed files with 155 additions and 47 deletions.
18 changes: 12 additions & 6 deletions .github/workflows/build.yaml
@@ -64,7 +64,7 @@ jobs:
export runs_on="aws-g6-12xl-plus-priv-cache"
export platform=""
export extra_pytest=""
export target="nil"
export target=""
;;
cuda-trtllm)
export dockerfile="Dockerfile_trtllm"
@@ -74,7 +74,13 @@
export runs_on="ubuntu-latest"
export platform=""
export extra_pytest=""
export build_type="dev"
if [[ "${GITHUB_REF}" == "refs/tags/*" ]]; then
export build_type="release";
export target="";
else
export build_type="dev";
export target="ci-runtime";
fi
;;
rocm)
export dockerfile="Dockerfile_amd"
@@ -85,7 +91,7 @@
export runs_on="ubuntu-latest"
export platform=""
export extra_pytest="-k test_flash_gemma_gptq_load"
export target="nil"
export target=""
;;
intel-xpu)
export dockerfile="Dockerfile_intel"
@@ -95,7 +101,7 @@
export runs_on="ubuntu-latest"
export platform="xpu"
export extra_pytest=""
export target="nil"
export target=""
;;
intel-cpu)
export dockerfile="Dockerfile_intel"
@@ -106,7 +112,7 @@
export runs_on="aws-highmemory-32-plus-priv"
export platform="cpu"
export extra_pytest="-k test_flash_gemma_simple"
export target="nil"
export target=""
;;
esac
echo $dockerfile
@@ -193,7 +199,7 @@ jobs:
sccache_gha_enabled=on
actions_cache_url=${{ env.ACTIONS_CACHE_URL }}
actions_runtime_token=${{ env.ACTIONS_RUNTIME_TOKEN }}
target: ${{ env.TARGET }}
tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }}
labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }}
cache-from: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=min,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=max
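For reference, here is a minimal standalone sketch of the tag-gated selection the hunk above introduces; the `GITHUB_REF` value is illustrative only, and the glob on the right-hand side of `==` must stay unquoted for bash pattern matching to apply:

```bash
GITHUB_REF="refs/tags/v3.0.2"  # illustrative value; GitHub Actions sets this automatically

if [[ "${GITHUB_REF}" == refs/tags/* ]]; then
    build_type="release"   # tagged builds produce the release image
    target=""              # default (final) Dockerfile stage
else
    build_type="dev"       # untagged builds stay in dev mode
    target="ci-runtime"    # lightweight CI stage that ships the debug binary
fi

echo "build_type=${build_type} target=${target}"
```
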
19 changes: 10 additions & 9 deletions Dockerfile_trtllm
@@ -123,15 +123,6 @@ COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt
COPY --from=tgi-builder /usr/local/tgi /usr/local/tgi
COPY --from=tgi-builder /usr/src/text-generation-inference/target/release/text-generation-backends-trtllm /usr/local/tgi/bin/text-generation-launcher

FROM runtime

LABEL co.huggingface.vendor="Hugging Face Inc."
LABEL org.opencontainers.image.authors="[email protected]"
LABEL org.opencontainers.title="Text-Generation-Inference TensorRT-LLM Backend"

ENTRYPOINT ["./text-generation-launcher"]
CMD ["--executor-worker", "/usr/local/tgi/bin/executorWorker"]

# This is used only for the CI/CD
FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 AS ci-runtime
RUN apt update && apt install -y libasan8 libubsan1 libucx0 pipx python3-minimal python3-dev python3-pip python3-venv && \
@@ -152,3 +143,13 @@ COPY --from=tgi-builder /usr/local/tgi /usr/local/tgi

# Basically we copy from target/debug instead of target/release
COPY --from=tgi-builder /usr/src/text-generation-inference/target/debug/text-generation-backends-trtllm /usr/local/tgi/bin/text-generation-launcher

# This is the final image
FROM runtime

LABEL co.huggingface.vendor="Hugging Face Inc."
LABEL org.opencontainers.image.authors="[email protected]"
LABEL org.opencontainers.title="Text-Generation-Inference TensorRT-LLM Backend"

ENTRYPOINT ["./text-generation-launcher"]
CMD ["--executor-worker", "/usr/local/tgi/bin/executorWorker"]
165 changes: 133 additions & 32 deletions docs/source/backends/trtllm.md
@@ -4,8 +4,13 @@ The NVIDIA TensorRT-LLM (TRTLLM) backend is a high-performance backend for LLMs
that uses NVIDIA's TensorRT library for inference acceleration.
It makes use of specific optimizations for NVIDIA GPUs, such as custom kernels.

To use the TRTLLM backend you need to compile `engines` for the models you want to use.
Each `engine` must be compiled on the same GPU architecture that you will use for inference.
To use the TRTLLM backend, **you need to compile** `engines` for the models you want to use.
Each `engine` must be compiled for a given set of:
- GPU architecture that you will use for inference (e.g. A100, L40, etc.; see the query snippet below)
- Maximum batch size
- Maximum input length
- Maximum output length
- Maximum beam width
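
To see which architecture you will be compiling for, recent NVIDIA drivers can report the device name and compute capability directly (a quick sketch; older drivers may not expose the `compute_cap` field):

```bash
# Print the GPU name and compute capability (e.g. 8.0 for A100, 8.9 for L40)
nvidia-smi --query-gpu=name,compute_cap --format=csv,noheader
```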

## Supported models

@@ -19,63 +24,159 @@ want to use.

```bash
MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct"

# Install the Hugging Face Hub CLI (huggingface-cli) with hf_transfer support
python -m pip install "huggingface_hub[hf_transfer]"

# Login to the Hugging Face Hub
huggingface-cli login

# Create a directory to store the model
mkdir -p /tmp/models/$MODEL_NAME

# Create a directory to store the compiled engine
mkdir -p /tmp/engines/$MODEL_NAME

# Download the model
HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download --local-dir /tmp/models/$MODEL_NAME $MODEL_NAME

DESTINATION="/tmp/engines/$MODEL_NAME"
HF_TOKEN="hf_xxx"
# Compile the engine using Optimum-NVIDIA
# This will create a compiled engine in the /tmp/engines/meta-llama/Llama-3.1-8B-Instruct
# directory for 1 GPU
docker run \
--rm \
-it \
--gpus=1 \
-v /tmp/models/$MODEL_NAME:/model \
-v /tmp/engines/$MODEL_NAME:/engine \
huggingface/optimum-nvidia \
optimum-cli export trtllm \
--shm-size=1g \
-v "$DESTINATION":/engine \
-e HF_TOKEN=$HF_TOKEN \
-e HF_HUB_ENABLE_HF_TRANSFER=1 \
huggingface/optimum-nvidia:v0.1.0b9-py310 \
bash -c "optimum-cli export trtllm \
--tp=1 \
--pp=1 \
--max-batch-size=128 \
--max-batch-size=64 \
--max-input-length 4096 \
--max-output-length 8192 \
--max-beams-width=1 \
--destination /engine \
$MODEL_NAME
--destination /tmp/engine \
$MODEL_NAME && cp -rL /tmp/engine/* /engine/"
```

Your compiled engine will be saved in the `/tmp/engines/$MODEL_NAME` directory.
Your compiled engine will be saved in the `/tmp/engines/$MODEL_NAME` directory, in a subfolder named after the GPU used to compile the model.
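
Because the subfolder is named after the GPU used for compilation, a small sketch like this can help locate it before starting the server (the `GPU_ARCH_DIR` variable name is just for illustration and assumes a single engine was compiled):

```bash
# List the per-GPU subfolder(s) created by the export step.
ls /tmp/engines/$MODEL_NAME

# Resolve the first (and usually only) one into a variable for later use.
GPU_ARCH_DIR=$(find /tmp/engines/$MODEL_NAME -mindepth 1 -maxdepth 1 -type d | head -n 1)
echo "Engines directory: $GPU_ARCH_DIR/engines"
```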

## Using the TRTLLM backend

Run TGI-TRTLLM Docker image with the compiled engine:

```bash
MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct"
DESTINATION="/tmp/engines/$MODEL_NAME"
HF_TOKEN="hf_xxx"
docker run \
--gpus 1 \
--shm-size=1g \
-it \
--rm \
-p 3000:3000 \
-e MODEL=$MODEL_NAME \
-e PORT=3000 \
-e HF_TOKEN='hf_XXX' \
-v /tmp/engines/$MODEL_NAME:/data \
-e HF_TOKEN=$HF_TOKEN \
-v "$DESTINATION"/<YOUR_GPU_ARCHITECTURE>/engines:/data \
ghcr.io/huggingface/text-generation-inference:latest-trtllm \
--executor-worker executorWorker \
--model-id /data/$MODEL_NAME
--model-id /data/ \
--tokenizer-name $MODEL_NAME
```
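
Once the container is running, the backend exposes the standard TGI HTTP API, so a quick smoke test could look like the following (prompt and parameters are arbitrary):

```bash
curl http://localhost:3000/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"inputs": "What is deep learning?", "parameters": {"max_new_tokens": 32}}'
```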

## Development

To develop TRTLLM backend, you can use [dev containers](https://containers.dev/) located in
`.devcontainer` directory.
To develop the TRTLLM backend, you can use [dev containers](https://containers.dev/) with the following `.devcontainer.json` file:
```json
{
"name": "CUDA",
"build": {
"dockerfile": "Dockerfile_trtllm",
"context": ".."
},
"remoteEnv": {
"PATH": "${containerEnv:PATH}:/usr/local/cuda/bin",
"LD_LIBRARY_PATH": "$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64",
"XLA_FLAGS": "--xla_gpu_cuda_data_dir=/usr/local/cuda"
},
"customizations" : {
"jetbrains" : {
"backend" : "CLion"
}
}
}
```

and `Dockerfile_trtllm`:

```Dockerfile
ARG cuda_arch_list="75-real;80-real;86-real;89-real;90-real"
ARG build_type=release
ARG ompi_version=4.1.7

# CUDA dependent dependencies resolver stage
FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS cuda-builder

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
build-essential \
cmake \
curl \
gcc-14 \
g++-14 \
git \
git-lfs \
lld \
libssl-dev \
libucx-dev \
libasan8 \
libubsan1 \
ninja-build \
pkg-config \
pipx \
python3 \
python3-dev \
python3-setuptools \
tar \
wget --no-install-recommends && \
pipx ensurepath

ENV TGI_INSTALL_PREFIX=/usr/local/tgi
ENV TENSORRT_INSTALL_PREFIX=/usr/local/tensorrt

# Install OpenMPI
FROM cuda-builder AS mpi-builder
WORKDIR /opt/src/mpi

ARG ompi_version
ENV OMPI_VERSION=${ompi_version}
ENV OMPI_TARBALL_FILENAME=openmpi-${OMPI_VERSION}.tar.bz2
ADD --checksum=sha256:54a33cb7ad81ff0976f15a6cc8003c3922f0f3d8ceed14e1813ef3603f22cd34 \
https://download.open-mpi.org/release/open-mpi/v4.1/${OMPI_TARBALL_FILENAME} .

RUN tar --strip-components=1 -xf ${OMPI_TARBALL_FILENAME} &&\
./configure --prefix=/usr/local/mpi --with-cuda=/usr/local/cuda --with-slurm && \
make -j all && \
make install && \
rm -rf ${OMPI_TARBALL_FILENAME}/..

# Install TensorRT
FROM cuda-builder AS trt-builder
COPY backends/trtllm/scripts/install_tensorrt.sh /opt/install_tensorrt.sh
RUN chmod +x /opt/install_tensorrt.sh && \
/opt/install_tensorrt.sh

# Build Backend
FROM cuda-builder AS tgi-builder
WORKDIR /usr/src/text-generation-inference

# Scoped global args reuse
ARG cuda_arch_list
ARG build_type
ARG sccache_gha_enabled
ARG actions_cache_url
ARG actions_runtime_token

# Install Rust
ENV PATH="/root/.cargo/bin:$PATH"
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | bash -s -- -y && \
chmod -R a+w /root/.rustup && \
chmod -R a+w /root/.cargo && \
cargo install sccache --locked

ENV LD_LIBRARY_PATH="/usr/local/mpi/lib:$LD_LIBRARY_PATH"
ENV PKG_CONFIG_PATH="/usr/local/mpi/lib/pkgconfig"
ENV CMAKE_PREFIX_PATH="/usr/local/mpi:/usr/local/tensorrt"

ENV USE_LLD_LINKER=ON
ENV CUDA_ARCH_LIST=${cuda_arch_list}
```
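
Inside the dev container, the backend builds as a regular Cargo workspace member. A debug build roughly matching what the `ci-runtime` image copies might look like this; the binary name is taken from the Dockerfile above, and the sketch assumes the CUDA, TensorRT and MPI environment variables from that Dockerfile are already set:

```bash
# Debug build of the TRTLLM launcher; the artifact lands in
# target/debug/text-generation-backends-trtllm, as referenced by Dockerfile_trtllm.
cargo build --bin text-generation-backends-trtllm
```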
