Merge branch 'main' into feature/usage-stats
ErikKaum committed Jul 15, 2024
2 parents af661fd + dbb23fb commit 81c9ad7
Showing 109 changed files with 4,275 additions and 3,708 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/autodocs.yaml
@@ -30,11 +30,16 @@ jobs:
id: install-router
run: cargo install --path router/

- uses: actions/setup-node@v4
with:
node-version: 22

- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.x'

- name: Check that documentation is up-to-date
run: |
npm install -g swagger-cli
python update_doc.py --check
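The `--check` flag here follows a common docs-freshness pattern: regenerate the artifact and fail the job if it differs from what is committed, instead of writing it out. A minimal sketch of that pattern, under stated assumptions (the spec generator and target path below are illustrative stand-ins, not the repository's actual `update_doc.py` internals):

```python
# Hypothetical sketch of a "--check" docs-freshness guard; the generator and
# path are stand-ins, not TGI's actual update_doc.py code.
import argparse
import sys
from pathlib import Path

DOCS_PATH = Path("docs/openapi.json")  # illustrative target file

def generate_spec() -> str:
    # Stand-in for the real regeneration step; any deterministic
    # generator fits the pattern.
    return '{"openapi": "3.0.0"}\n'

def main() -> int:
    parser = argparse.ArgumentParser()
    parser.add_argument("--check", action="store_true",
                        help="fail instead of writing when the committed file is stale")
    args = parser.parse_args()
    fresh = generate_spec()
    if args.check:
        if not DOCS_PATH.exists() or DOCS_PATH.read_text() != fresh:
            print("documentation is out of date; re-run without --check", file=sys.stderr)
            return 1
        return 0
    DOCS_PATH.write_text(fresh)
    return 0

if __name__ == "__main__":
    sys.exit(main())
```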
13 changes: 9 additions & 4 deletions .github/workflows/build.yaml
@@ -11,6 +11,11 @@ on:
# - rocm
# - intel
required: true
release-tests:
description: "Run release integration tests"
required: true
default: false
type: boolean

jobs:
build-and-push:
@@ -23,7 +28,7 @@ jobs:
group: ${{ github.workflow }}-build-and-push-image-${{ inputs.hardware }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
# TODO see with @Glegendre to get CPU runner here instead
runs-on: [self-hosted, nvidia-gpu , multi-gpu, 4-a10, ci]
runs-on: [self-hosted, intel-cpu, 32-cpu, 256-ram, ci]
permissions:
contents: write
packages: write
@@ -131,8 +136,8 @@ jobs:
DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }}
labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }}
cache-from: type=registry,ref=registry-push.github-runners.huggingface.tech/api-inference/community/text-generation-inference:cache${{ env.LABEL }},mode=min
cache-to: type=registry,ref=registry-push.github-runners.huggingface.tech/api-inference/community/text-generation-inference:cache${{ env.LABEL }},mode=min
cache-from: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=min,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=min
cache-to: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=min,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=min
- name: Final
id: final
run: |
@@ -148,7 +153,7 @@ jobs:
runs-on: ["self-hosted", "${{ needs.build-and-push.outputs.runs_on }}", "multi-gpu"]
if: needs.build-and-push.outputs.runs_on != 'ubuntu-latest'
env:
PYTEST_FLAGS: ${{ (startsWith(github.ref, 'refs/tags/') || github.ref == 'refs/heads/main') && '--release' || '' }}
PYTEST_FLAGS: ${{ (startsWith(github.ref, 'refs/tags/') || github.ref == 'refs/heads/main' || inputs.release-tests == true) && '--release' || '' }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
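A note on the `PYTEST_FLAGS` expression above: GitHub Actions expressions have no ternary operator, so `cond && '--release' || ''` emulates one. The idiom is safe here because the "true" branch (`'--release'`) is truthy; a Python analogue of the same logic:

```python
# Python analogue of:
#   ${{ (startsWith(github.ref, 'refs/tags/') || github.ref == 'refs/heads/main'
#        || inputs.release-tests == true) && '--release' || '' }}
# Actions' `cond && a || b` idiom only behaves like a ternary when `a` is truthy.
def pytest_flags(ref: str, release_tests: bool) -> str:
    cond = ref.startswith("refs/tags/") or ref == "refs/heads/main" or release_tests
    return "--release" if cond else ""

assert pytest_flags("refs/tags/v2.1.0", release_tests=False) == "--release"
assert pytest_flags("refs/heads/feature/x", release_tests=False) == ""
assert pytest_flags("refs/heads/feature/x", release_tests=True) == "--release"
```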
11 changes: 10 additions & 1 deletion .github/workflows/ci_build.yaml
@@ -20,7 +20,14 @@ on:
- "Dockerfile_amd"
- "Dockerfile_intel"
branches:
- 'main'
- "main"
workflow_dispatch:
inputs:
release-tests:
description: "Run release integration tests"
required: true
default: false
type: boolean

jobs:
build:
@@ -33,4 +40,6 @@ jobs:
uses: ./.github/workflows/build.yaml # calls the one above ^
with:
hardware: ${{ matrix.hardware }}
# https://github.com/actions/runner/issues/2206
release-tests: ${{ inputs.release-tests == true }}
secrets: inherit
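The `== true` comparison (and the linked actions/runner#2206) exists because inputs forwarded into a reusable workflow can lose their boolean type and arrive as strings, and the non-empty string "false" is truthy. A small Python illustration of the pitfall being avoided (the helper is illustrative, not part of the workflow):

```python
# Why `inputs.release-tests == true` is compared explicitly: a boolean input
# forwarded between workflows may degrade to a string, and any non-empty
# string is truthy.
raw_input = "false"     # what a forwarded "boolean" can look like
assert bool(raw_input)  # truthy! naive truthiness would enable release tests

def as_bool(value) -> bool:
    # Explicit comparison, mirroring `== true` in the Actions expression.
    if isinstance(value, bool):
        return value
    return str(value).strip().lower() == "true"

assert as_bool("false") is False
assert as_bool("true") is True
assert as_bool(True) is True
```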
28 changes: 3 additions & 25 deletions Cargo.lock

Some generated files are not rendered by default.

7 changes: 6 additions & 1 deletion Dockerfile
@@ -40,7 +40,9 @@ RUN cargo build --profile release-opt
# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS pytorch-install

# NOTE: When updating PyTorch version, beware to remove `pip install nvidia-nccl-cu12==2.22.3` below in the Dockerfile. Context: https://github.com/huggingface/text-generation-inference/pull/2099
ARG PYTORCH_VERSION=2.3.0

ARG PYTHON_VERSION=3.10
# Keep in sync with `server/pyproject.toml`
ARG CUDA_VERSION=12.1
@@ -241,7 +243,10 @@ COPY server/Makefile server/Makefile
RUN cd server && \
make gen-server && \
pip install -r requirements_cuda.txt && \
pip install ".[bnb, accelerate, quantize, peft, outlines]" --no-cache-dir
pip install ".[bnb, accelerate, quantize, peft, outlines]" --no-cache-dir && \
pip install nvidia-nccl-cu12==2.22.3

ENV LD_PRELOAD=/opt/conda/lib/python3.10/site-packages/nvidia/nccl/lib/libnccl.so.2

# Deps before the binaries
# The binaries change on every build given we burn the SHA into them
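The NCCL pin above installs `nvidia-nccl-cu12==2.22.3` over the version bundled with PyTorch 2.3 and forces it in via `LD_PRELOAD` (context in huggingface/text-generation-inference#2099). A quick way to confirm at runtime which NCCL PyTorch actually picked up, assuming a CUDA build of PyTorch:

```python
# Sanity check that the preloaded NCCL is the pinned one. In recent PyTorch
# releases, torch.cuda.nccl.version() returns the (major, minor, patch) of the
# NCCL library PyTorch is running against; with LD_PRELOAD pointing at the
# nvidia-nccl-cu12 wheel's libnccl.so.2, this should report 2.22.3.
import torch

major, minor, patch = torch.cuda.nccl.version()
print(f"NCCL {major}.{minor}.{patch}")
assert (major, minor) == (2, 22), "unexpected NCCL loaded; check LD_PRELOAD"
```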
27 changes: 14 additions & 13 deletions README.md
@@ -20,19 +20,20 @@ to power Hugging Chat, the Inference API and Inference Endpoint.

## Table of contents

- [Get Started](#get-started)
- [API Documentation](#api-documentation)
- [Using a private or gated model](#using-a-private-or-gated-model)
- [A note on Shared Memory](#a-note-on-shared-memory-shm)
- [Distributed Tracing](#distributed-tracing)
- [Local Install](#local-install)
- [CUDA Kernels](#cuda-kernels)
- [Optimized architectures](#optimized-architectures)
- [Run Mistral](#run-a-model)
- [Run](#run)
- [Quantization](#quantization)
- [Develop](#develop)
- [Testing](#testing)
- [Get Started](#get-started)
- [Docker](#docker)
- [API documentation](#api-documentation)
- [Using a private or gated model](#using-a-private-or-gated-model)
- [A note on Shared Memory (shm)](#a-note-on-shared-memory-shm)
- [Distributed Tracing](#distributed-tracing)
- [Architecture](#architecture)
- [Local install](#local-install)
- [Optimized architectures](#optimized-architectures)
- [Run locally](#run-locally)
- [Run](#run)
- [Quantization](#quantization)
- [Develop](#develop)
- [Testing](#testing)

Text Generation Inference (TGI) is a toolkit for deploying and serving Large Language Models (LLMs). TGI enables high-performance text generation for the most popular open-source LLMs, including Llama, Falcon, StarCoder, BLOOM, GPT-NeoX, and [more](https://huggingface.co/docs/text-generation-inference/supported_models). TGI implements many features, such as:

2 changes: 1 addition & 1 deletion clients/python/text_generation/types.py
@@ -61,7 +61,7 @@ class ChoiceDeltaToolCall(BaseModel):
class ChoiceDelta(BaseModel):
role: str
content: Optional[str] = None
tool_calls: Optional[ChoiceDeltaToolCall]
tool_calls: Optional[ChoiceDeltaToolCall] = None


class Choice(BaseModel):
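The one-character `= None` fix above matters because, under Pydantic v2, an `Optional[...]` annotation no longer implies a default: without `= None` the field is required, so streaming deltas that omit `tool_calls` would fail validation. A minimal reproduction of that behavior (using `dict` in place of `ChoiceDeltaToolCall` to stay self-contained):

```python
# Pydantic v2: Optional[...] no longer means "defaults to None".
from typing import Optional
from pydantic import BaseModel, ValidationError

class Before(BaseModel):
    content: Optional[str] = None
    tool_calls: Optional[dict]          # required, despite being Optional

class After(BaseModel):
    content: Optional[str] = None
    tool_calls: Optional[dict] = None   # genuinely optional

try:
    Before(content="hi")                # omits tool_calls
except ValidationError as e:
    print("rejected:", e.errors()[0]["type"])  # "missing"

print(After(content="hi").tool_calls)   # None
```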
2 changes: 2 additions & 0 deletions docs/source/_toctree.yml
@@ -11,6 +11,8 @@
title: Using TGI with Intel Gaudi
- local: installation_inferentia
title: Using TGI with AWS Inferentia
- local: installation_intel
title: Using TGI with Intel GPUs
- local: installation
title: Installation from source
- local: supported_models
1 change: 1 addition & 0 deletions docs/source/architecture.md
@@ -103,6 +103,7 @@ Several variants of the model server exist that are actively supported by Hugging Face:

- By default, the model server will attempt building [a server optimized for Nvidia GPUs with CUDA](https://huggingface.co/docs/text-generation-inference/installation_nvidia). The code for this version is hosted in the [main TGI repository](https://github.com/huggingface/text-generation-inference).
- A [version optimized for AMD with ROCm](https://huggingface.co/docs/text-generation-inference/installation_amd) is hosted in the main TGI repository. Some model features differ.
- A [version optimized for Intel GPUs](https://huggingface.co/docs/text-generation-inference/installation_intel) is hosted in the main TGI repository. Some model features differ.
- The [version for Intel Gaudi](https://huggingface.co/docs/text-generation-inference/installation_gaudi) is maintained on a forked repository, often resynchronized with the main [TGI repository](https://github.com/huggingface/tgi-gaudi).
- A [version for Neuron (AWS Inferentia2)](https://huggingface.co/docs/text-generation-inference/installation_inferentia) is maintained as part of [Optimum Neuron](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference).
- A version for Google TPUs is maintained as part of [Optimum TPU](https://github.com/huggingface/optimum-tpu/tree/main/text-generation-inference).
19 changes: 19 additions & 0 deletions docs/source/installation_intel.md
@@ -0,0 +1,19 @@
# Using TGI with Intel GPUs

TGI-optimized models are supported on the Intel Data Center GPU [Max1100](https://www.intel.com/content/www/us/en/products/sku/232876/intel-data-center-gpu-max-1100/specifications.html) and [Max1550](https://www.intel.com/content/www/us/en/products/sku/232873/intel-data-center-gpu-max-1550/specifications.html); the recommended usage is through Docker.


On a server powered by Intel GPUs, TGI can be launched with the following command:

```bash
model=teknium/OpenHermes-2.5-Mistral-7B
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run --rm --privileged --cap-add=sys_nice \
--device=/dev/dri \
--ipc=host --shm-size 1g --net host -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:latest-intel \
--model-id $model --cuda-graphs 0
```

The launched TGI server can then be queried from clients; make sure to check out the [Consuming TGI](./basic_tutorials/consuming_tgi) guide.
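For example, a minimal client for the server launched above might look like the following; it assumes the host networking and default port 80 implied by the `docker run` command, so adjust the URL for other setups:

```python
# Minimal sketch of querying the TGI server started above via its standard
# /generate endpoint. Assumes --net host and TGI's default port 80.
import requests

resp = requests.post(
    "http://localhost:80/generate",
    json={
        "inputs": "What is Deep Learning?",
        "parameters": {"max_new_tokens": 32},
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["generated_text"])
```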
2 changes: 1 addition & 1 deletion docs/source/quicktour.md
@@ -17,7 +17,7 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \

### Supported hardware

TGI supports various hardware. Make sure to check the [Using TGI with Nvidia GPUs](./installation_nvidia), [Using TGI with AMD GPUs](./installation_amd), [Using TGI with Gaudi](./installation_gaudi), [Using TGI with Inferentia](./installation_inferentia) guides depending on which hardware you would like to deploy TGI on.
TGI supports various hardware. Make sure to check the [Using TGI with Nvidia GPUs](./installation_nvidia), [Using TGI with AMD GPUs](./installation_amd), [Using TGI with Intel GPUs](./installation_intel), [Using TGI with Gaudi](./installation_gaudi), [Using TGI with Inferentia](./installation_inferentia) guides depending on which hardware you would like to deploy TGI on.

## Consuming TGI

1 change: 1 addition & 0 deletions docs/source/supported_models.md
@@ -10,6 +10,7 @@ Text Generation Inference enables serving optimized models on specific hardware
- [Llama](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)
- [Phi 3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct)
- [Gemma](https://huggingface.co/google/gemma-7b)
- [PaliGemma](https://huggingface.co/google/paligemma-3b-pt-224)
- [Gemma2](https://huggingface.co/google/gemma2-9b)
- [Cohere](https://huggingface.co/CohereForAI/c4ai-command-r-plus)
- [Dbrx](https://huggingface.co/databricks/dbrx-instruct)
@@ -5,85 +5,80 @@
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"id": 2323,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -9.7890625,
"text": "Test"
},
{
"id": 2009,
"logprob": -9.625,
"text": "request"
"id": 1715,
"logprob": -11.34375,
"text": " request"
}
],
"seed": null,
"tokens": [
{
"id": 13,
"logprob": -2.3359375,
"id": 198,
"logprob": -2.5742188,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -1.8779297,
"id": 262,
"logprob": -1.6230469,
"special": false,
"text": "Test"
"text": " "
},
{
"id": 2009,
"logprob": -1.2744141,
"id": 3270,
"logprob": -2.046875,
"special": false,
"text": " request"
"text": " \"\"\"\n"
},
{
"id": 13,
"logprob": -1.6933594,
"id": 262,
"logprob": -0.015281677,
"special": false,
"text": "\n"
"text": " "
},
{
"id": 3057,
"logprob": -1.4648438,
"id": 422,
"logprob": -2.1425781,
"special": false,
"text": "Test"
"text": " if"
},
{
"id": 2009,
"logprob": -0.15600586,
"id": 1715,
"logprob": -0.9238281,
"special": false,
"text": " request"
},
{
"id": 13,
"logprob": -0.8027344,
"id": 13204,
"logprob": -0.076660156,
"special": false,
"text": "\n"
"text": ".method"
},
{
"id": 3057,
"logprob": -0.23022461,
"id": 624,
"logprob": -0.021987915,
"special": false,
"text": "Test"
"text": " =="
},
{
"id": 2009,
"logprob": -0.0069885254,
"id": 364,
"logprob": -0.39208984,
"special": false,
"text": " request"
"text": " '"
},
{
"id": 13,
"logprob": -0.02218628,
"id": 3019,
"logprob": -0.10821533,
"special": false,
"text": "\n"
"text": "POST"
}
],
"top_tokens": null
},
"generated_text": "\nTest request\nTest request\nTest request\n"
"generated_text": "\n \"\"\"\n if request.method == 'POST"
}
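This hunk appears to update an integration-test snapshot: the expected prefill tokens, generated tokens, and logprobs all change together, which is what a model or tokenizer swap in the test would produce. Snapshot checks of this shape typically match token ids and text exactly but logprobs only approximately, since kernel and hardware differences make bit-exact floats brittle; a hedged sketch of such a comparison (the tolerance and helper are illustrative, not TGI's actual test harness):

```python
# Illustrative snapshot comparison for generation outputs: ids and text must
# match exactly, logprobs only within a tolerance. Not TGI's actual test code.
import math

def tokens_match(expected: list[dict], actual: list[dict], rel_tol: float = 5e-2) -> bool:
    if len(expected) != len(actual):
        return False
    for exp, act in zip(expected, actual):
        if exp["id"] != act["id"] or exp["text"] != act["text"]:
            return False
        if exp["logprob"] is None or act["logprob"] is None:
            if exp["logprob"] != act["logprob"]:
                return False
        elif not math.isclose(exp["logprob"], act["logprob"], rel_tol=rel_tol):
            return False
    return True

expected = [{"id": 198, "logprob": -2.5742188, "text": "\n"}]
assert tokens_match(expected, [{"id": 198, "logprob": -2.58, "text": "\n"}])
assert not tokens_match(expected, [{"id": 199, "logprob": -2.58, "text": "\n"}])
```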