Benchmark profiles and multi-turn conversations #1

Merged · 16 commits · Jan 28, 2025
27 changes: 9 additions & 18 deletions .github/workflows/build.yaml
@@ -1,17 +1,9 @@
name: Build and push docker image to internal registry
name: Build and push docker image to registry

on:
  workflow_call:
  push:
    branches:
      - 'main'
    tags:
      - 'v*'
  pull_request:
    branches:
      - "main"
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  group: ${{ github.workflow }}-${{ github.ref }}-build
  cancel-in-progress: true
jobs:
  build-and-push:
@@ -31,7 +23,6 @@ jobs:
          install: true
          buildkitd-config: /tmp/buildkitd.toml
      - name: Login to GitHub Container Registry
        if: github.event_name != 'pull_request'
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
@@ -44,9 +35,9 @@ jobs:
        uses: docker/metadata-action@v5
        with:
          images: |
            registry.internal.huggingface.tech/api-inference/inference-benchmarker
            ghcr.io/huggingface/inference-benchmarker
          tags: |
            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}
      # If main, release or tag
      - name: Extract metadata (tags, labels) for Docker
        if: ${{ github.event_name != 'pull_request' }}
@@ -58,10 +49,10 @@ jobs:
          images: |
            ghcr.io/huggingface/inference-benchmarker
          tags: |
            type=semver,pattern={{version}}${{ env.LABEL }}
            type=semver,pattern={{major}}.{{minor}}${{ env.LABEL }}
            type=raw,value=latest${{ env.LABEL }},enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
            type=semver,pattern={{version}}
            type=semver,pattern={{major}}.{{minor}}
            type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}
      - name: Build and push Docker image
        id: build-and-push
        uses: docker/build-push-action@v4
@@ -72,7 +63,7 @@ jobs:
          platforms: 'linux/amd64'
          build-args: |
            GIT_SHA=${{ env.GITHUB_SHA }}
            DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
            DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}
            PLATFORM=${{ env.PLATFORM }}
          tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }}
32 changes: 32 additions & 0 deletions .github/workflows/build_ci.yaml
@@ -0,0 +1,32 @@
name: Build inference-benchmarker

on:
  workflow_dispatch:
  workflow_call:
  push:
    branches:
      - 'main'
    tags:
      - 'v*'
  pull_request:
    branches:
      - "main"

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  rust:
    name: Rust checks
    permissions:
      pull-requests: write
      contents: read
    uses: ./.github/workflows/rust.yaml
  build:
    permissions:
      packages: write
      contents: read
    name: Build and push docker image
    uses: ./.github/workflows/build.yaml
    needs: rust
14 changes: 3 additions & 11 deletions .github/workflows/rust.yml → .github/workflows/rust.yaml
@@ -1,16 +1,8 @@
name: Rust checks

on:
  workflow_dispatch:
  push:
    branches:
      - 'main'
    tags:
      - 'v*'
  pull_request:
    paths:
      - 'src/**'
      - .github/workflows/rust.yml
  workflow_call:

name: Rust checks
permissions:
  pull-requests: write
  contents: read
@@ -1,8 +1,8 @@
name: Secret Leaks

on:
  push:

name: Secret Leaks

jobs:
  trufflehog:
    runs-on:
3 changes: 2 additions & 1 deletion Cargo.toml
@@ -34,6 +34,7 @@ serde_with = "3.9.0"
sysinfo = "0.31.4"
mockito = "1.5.0"
tabled = "=0.14"
uuid = { version = "1.11.0", features = ["v4", "fast-rng"] }

[build-dependencies]
vergen-gitcl = { version = "1.0.1" }
vergen-gitcl = { version = "1.0.1" }
27 changes: 22 additions & 5 deletions README.md
@@ -18,6 +18,7 @@ It can be used to benchmark any text generation server that exposes an OpenAI-co
* Broad Compatibility: Benchmarks any text generation server with an OpenAPI-compliant chat API.
* Automatic Sweep Mode: Detects maximum throughput and sweeps in-between.
* Open-Loop Benchmarking: Uses constant arrival rates to simulate real-world workloads (see the sketch after this list).
* Benchmark profiles: Presets for benchmarking different model use cases (e.g. chat, summarization, code completion...).
* High-Performance: Built with Rust 🦀 for high-performance benchmarking.
* JSON Output: Delivers performance results in a structured, easy-to-analyze format.
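
To make the open-loop bullet concrete, here is a minimal, self-contained sketch (not the crate's executor code) of the idea: requests are issued at a fixed arrival rate regardless of how long each response takes, whereas a closed-loop client would wait for each response before sending the next request.

```rust
use std::thread;
use std::time::{Duration, Instant};

// Open-loop scheduling sketch: fire request i at its scheduled arrival time,
// independent of how long earlier requests take. Purely illustrative.
fn main() {
    let rate_per_sec = 5.0; // target arrival rate
    let interval = Duration::from_secs_f64(1.0 / rate_per_sec);
    let start = Instant::now();
    let mut handles = Vec::new();
    for i in 0..20u32 {
        let target = interval * i; // arrival time of request i
        if let Some(wait) = target.checked_sub(start.elapsed()) {
            thread::sleep(wait);
        }
        handles.push(thread::spawn(move || {
            // stand-in for an HTTP request to the inference server
            thread::sleep(Duration::from_millis(300));
        }));
    }
    for h in handles {
        let _ = h.join();
    }
}
```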

@@ -34,6 +35,7 @@ It can be used to benchmark any text generation server that exposes an OpenAI-co
* [1. Start an inference server](#1-start-an-inference-server)
* [2. Run a benchmark using Docker image](#2-run-a-benchmark-using-docker-image)
* [Configure your benchmark](#configure-your-benchmark)
* [Profiles](#profiles)
* [Benchmark mode](#benchmark-mode)
* [Dataset configuration](#dataset-configuration)
* [Prompt configuration](#prompt-configuration)
@@ -79,6 +81,7 @@ docker run --runtime nvidia --gpus all \
```shell
MODEL=meta-llama/Llama-3.1-8B-Instruct
HF_TOKEN=<your HF READ token>
# run a benchmark to evaluate the performance of the model for the chat use case
# we mount results to the current directory
$ docker run \
--rm \
@@ -89,18 +92,32 @@ $ docker run \
ghcr.io/huggingface/inference-benchmarker:latest \
inference-benchmarker \
--tokenizer-name "$MODEL" \
--max-vus 800 \
--url http://localhost:8080 \
--warmup 20s \
--num-rates 10 \
--prompt-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10" \
--decode-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10"
--profile chat
```

Results will be saved in JSON format in the current directory.

### Configure your benchmark

#### Profiles

Profiles are presets for benchmarking different model use cases; a sketch of how a preset could expand into concrete options follows the list. Available profiles:
- `chat`
Simulates a multi-turn chat scenario in which the model answers successive user prompts.
The model is prompted with the whole conversation history at each turn. Prefix caching will have a significant impact
on the performance of this benchmark.
- `code-generation`
Simulates code-completion scenarios. The model is given large code snippets and asked to complete them with a few tokens
(e.g. a function name or a few lines of code).
- `classification`
Simulates cases where the model is repeatedly fed large chunks of business data or documents and users
ask simple questions about the content (summarization, classification...).
These use cases benefit greatly from prefix caching and chunked prefill.
- `fixed-length`
The model is sent fixed-length prompts to avoid the impact of variable-length tokenization on the benchmark.
This is a technical benchmark to evaluate the raw throughput of the model.
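
To make the presets concrete, here is an illustrative sketch of how a profile name could expand into prompt/decode token settings. The struct, field names, and every number below are assumptions for illustration; the real presets live in `src/profiles.rs` and are not shown in this diff.

```rust
// Illustrative only: field names mirror the CLI's prompt/decode options
// (num_tokens, min_tokens, max_tokens, variance); the values are made up.
#[derive(Debug, Clone, Copy)]
struct TokenSpec {
    num_tokens: u64,
    min_tokens: u64,
    max_tokens: u64,
    variance: u64,
}

/// Returns hypothetical (prompt, decode) settings for a known profile.
fn preset(profile: &str) -> Option<(TokenSpec, TokenSpec)> {
    match profile {
        // chat: mid-sized prompts that grow with the conversation, mid-sized answers
        "chat" => Some((
            TokenSpec { num_tokens: 200, min_tokens: 180, max_tokens: 220, variance: 10 },
            TokenSpec { num_tokens: 200, min_tokens: 180, max_tokens: 220, variance: 10 },
        )),
        // code-generation: long code context, short completion
        "code-generation" => Some((
            TokenSpec { num_tokens: 4000, min_tokens: 3000, max_tokens: 5000, variance: 200 },
            TokenSpec { num_tokens: 40, min_tokens: 20, max_tokens: 60, variance: 10 },
        )),
        // classification: long document, very short answer
        "classification" => Some((
            TokenSpec { num_tokens: 8000, min_tokens: 6000, max_tokens: 10000, variance: 500 },
            TokenSpec { num_tokens: 20, min_tokens: 10, max_tokens: 30, variance: 5 },
        )),
        // fixed-length: no variance, to isolate raw throughput
        "fixed-length" => Some((
            TokenSpec { num_tokens: 512, min_tokens: 512, max_tokens: 512, variance: 0 },
            TokenSpec { num_tokens: 128, min_tokens: 128, max_tokens: 128, variance: 0 },
        )),
        _ => None,
    }
}

fn main() {
    println!("{:?}", preset("chat"));
}
```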

#### Benchmark mode

In the default mode, the tool runs a `sweep` benchmark. It first runs a throughput test to find the maximum throughput, then
3 changes: 2 additions & 1 deletion src/app.rs
@@ -328,7 +328,8 @@ impl Widget for &App {
            Some(_) => "Manual".to_string(),
        };
        let config_text = Text::from(vec![Line::from(vec![
            format!("Benchmark: {kind} | Max VUs: {max_vus} | Duration: {duration} sec | Rates: {rates} | Warmup: {warmup} sec",
            format!("Profile: {profile} | Benchmark: {kind} | Max VUs: {max_vus} | Duration: {duration} sec | Rates: {rates} | Warmup: {warmup} sec",
                profile = self.benchmark_config.profile.clone().unwrap_or("N/A".to_string()),
                kind = self.benchmark_config.benchmark_kind,
                max_vus = self.benchmark_config.max_vus,
                duration = self.benchmark_config.duration.as_secs_f64(),
2 changes: 2 additions & 0 deletions src/benchmark.rs
@@ -71,6 +71,7 @@ pub struct BenchmarkConfig {
    pub prompt_options: Option<TokenizeOptions>,
    pub decode_options: Option<TokenizeOptions>,
    pub tokenizer: String,
    pub profile: Option<String>,
    #[serde(rename = "meta")]
    pub extra_metadata: Option<HashMap<String, String>>,
}
@@ -439,6 +440,7 @@ mod tests {
                prompt_options: None,
                decode_options: None,
                tokenizer: "gpt2".to_string(),
                profile: None,
                extra_metadata: None,
            },
            backend,
20 changes: 19 additions & 1 deletion src/lib.rs
@@ -8,6 +8,7 @@ use std::sync::Arc;
pub use crate::app::run_console;
pub use crate::benchmark::{BenchmarkConfig, BenchmarkKind};
use crate::benchmark::{Event, MessageEvent};
pub use crate::profiles::apply_profile;
use crate::requests::OpenAITextGenerationBackend;
pub use crate::requests::TokenizeOptions;
use chrono::Local;
@@ -23,6 +24,7 @@ mod benchmark;
mod event;
mod executors;
mod flux;
mod profiles;
mod requests;
mod results;
mod scheduler;
@@ -32,6 +34,7 @@ mod writers;
pub struct RunConfiguration {
    pub url: String,
    pub tokenizer_name: String,
    pub profile: Option<String>,
    pub max_vus: u64,
    pub duration: std::time::Duration,
    pub rates: Option<Vec<f64>>,
@@ -48,10 +51,24 @@ pub struct RunConfiguration {
    pub model_name: String,
}

pub async fn run(run_config: RunConfiguration, stop_sender: Sender<()>) -> anyhow::Result<()> {
pub async fn run(mut run_config: RunConfiguration, stop_sender: Sender<()>) -> anyhow::Result<()> {
    info!("Starting benchmark");
    // set process system limits
    sysinfo::set_open_files_limit(0);
    // apply profile if needed
    run_config = match run_config.profile.clone() {
        None => run_config,
        Some(profile) => match apply_profile(profile.as_str(), run_config) {
            Ok(config) => {
                info!("Profile applied: {}", profile);
                config
            }
            Err(e) => {
                error!("Failed to apply profile: {:?}", e);
                return Err(e);
            }
        },
    };
    // initialize tokenizer
    let params = FromPretrainedParameters {
        token: run_config.hf_token.clone(),
@@ -88,6 +105,7 @@ pub async fn run(run_config: RunConfiguration, stop_sender: Sender<()>) -> anyho
        prompt_options: run_config.prompt_options.clone(),
        decode_options: run_config.decode_options.clone(),
        tokenizer: run_config.tokenizer_name.clone(),
        profile: run_config.profile.clone(),
        extra_metadata: run_config.extra_metadata.clone(),
    };
    config.validate()?;
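
The hunk above only shows the call site of `apply_profile`; the new `src/profiles.rs` module itself is collapsed in this view. As a rough sketch, a function matching that call site could look like the following; the signature is inferred from the usage above, and the body is illustrative, not the actual implementation:

```rust
// Sketch matching the call site in run(): consume the run configuration and
// return an updated one, or an error for an unknown profile. A real preset
// would populate fields such as prompt_options and decode_options here.
pub fn apply_profile(
    profile: &str,
    run_config: RunConfiguration,
) -> anyhow::Result<RunConfiguration> {
    match profile {
        "chat" | "code-generation" | "classification" | "fixed-length" => {
            // preset values omitted; see src/profiles.rs in the PR for the real ones
            Ok(run_config)
        }
        other => anyhow::bail!("unknown profile: {other}"),
    }
}
```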