# Dockerfile (forked from toverainc/willow-inference-server)
FROM nvcr.io/nvidia/tensorrt:22.12-py3
WORKDIR /app
# Set the CUDA architecture list in the environment in case we need to build any CUDA extensions
ENV TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.2;7.5;8.0;8.6;8.9;9.0+PTX"
# Install zstd and git-lfs for model compression and distribution
RUN apt-get update && apt-get install -y zstd git-lfs && rm -rf /var/lib/apt/lists/*
# Install our torch version matching the CUDA version (11.8) of the base image
RUN --mount=type=cache,target=/root/.cache pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118
COPY requirements.txt .
# Run pip install with a cache mount so we speed up subsequent rebuilds
RUN --mount=type=cache,target=/root/.cache pip install -r requirements.txt
# Install auto-gptq with the Triton backend, skipping the CUDA extension build
RUN --mount=type=cache,target=/root/.cache BUILD_CUDA_EXT=0 pip install auto-gptq[triton]==0.2.2
COPY . .
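# Start the inference server via the repository's entrypoint script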
CMD ./entrypoint.sh
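# Expose the service ports (19000/19001; assumed to be the endpoints started by entrypoint.sh)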
EXPOSE 19000
EXPOSE 19001
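# Example build/run (not part of the original file; the image name "wis" is arbitrary):
#   docker build -t wis .
#   docker run --rm --gpus all -p 19000:19000 -p 19001:19001 wis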