Commits (29)
e8d4c5c
feat: TPU host offload for KV cache
juncgu-google Nov 24, 2025
cd5cce2
tweaks
juncgu-google Nov 24, 2025
21ae0de
offload envs
juncgu-google Nov 24, 2025
a5ec87d
rm saving behavior
juncgu-google Nov 24, 2025
0fc7dad
tweaks
juncgu-google Nov 24, 2025
df3b091
staging_tokens --> staging_blocks
juncgu-google Nov 24, 2025
aca95f1
updte gke yaml
juncgu-google Nov 24, 2025
ace918a
tweaks
juncgu-google Nov 25, 2025
894c747
fix imports in kv_cache tests
juncgu-google Nov 25, 2025
a24e4bb
tweaks
juncgu-google Nov 25, 2025
616ac13
tweaks
juncgu-google Nov 25, 2025
6f8ae20
multi-request worker test
juncgu-google Nov 26, 2025
ff4d31f
debug: add jax block
juncgu-google Nov 26, 2025
43f8f1e
worker_test: multi requests; acc_test: precompile
juncgu-google Dec 1, 2025
97153b9
add feature test
juncgu-google Dec 1, 2025
560caf8
follow up changes in the upstream; and update test scripts
juncgu-google Dec 1, 2025
12c4885
update ci tests
juncgu-google Dec 3, 2025
2901e56
update unit-test yml
juncgu-google Dec 3, 2025
7b0a20a
Update test
dannawang0221 Dec 4, 2025
63b0c0b
fix gke kv cache verification with sampling_param.temperature=0
juncgu-google Dec 4, 2025
8b79f68
Change sampling params to configrable
dannawang0221 Dec 4, 2025
a3ff52b
config pre-mapped buffer of tpu
juncgu-google Dec 5, 2025
0249329
Update benchmark pods
dannawang0221 Dec 6, 2025
9ac152e
tweaks
juncgu-google Dec 6, 2025
b0ddb8c
fix load_spec for unscheduled requests; fix cached request with both …
juncgu-google Dec 10, 2025
43e2fb4
cpu chunk: ready_to_evict: ref_cnt==0
juncgu-google Dec 11, 2025
0d39925
update unit tests
juncgu-google Dec 12, 2025
7d81d90
put offload folder under tpu_inference
juncgu-google Dec 12, 2025
b659166
Update banchmark pods
dannawang0221 Dec 16, 2025
45 changes: 0 additions & 45 deletions .buildkite/features/KV_Cache_Host_Offloading.yml

This file was deleted.

49 changes: 49 additions & 0 deletions .buildkite/features/KV_Cache_Offload.yml
@@ -0,0 +1,49 @@
# KV Cache Offload
# feature support matrix
steps:
  - label: "Correctness tests for KV Cache Offload"
    key: "KV_Cache_Offload_CorrectnessTest"
    soft_fail: true
    env:
      USE_V6E8_QUEUE: "True"
      VLLM_LOG_LEVEL: "INFO"
    agents:
      queue: tpu_v6e_8_queue
    commands:
      - |
        .buildkite/scripts/run_in_docker.sh \
          python3 -m pytest -s -v /workspace/tpu_inference/tests/distributed/offload/tpu_offload_accuracy_test.py
  - label: "Record correctness test result for KV Cache Offload"
    key: "record_KV_Cache_Offload_CorrectnessTest"
    depends_on: "KV_Cache_Offload_CorrectnessTest"
    env:
      CI_TARGET: "KV Cache Offload"
      CI_STAGE: "CorrectnessTest"
      CI_CATEGORY: "feature support matrix"
    agents:
      queue: cpu
    commands:
      - |
        .buildkite/scripts/record_step_result.sh KV_Cache_Offload_CorrectnessTest

  - label: "Performance tests for KV Cache Offload"
    key: "KV_Cache_Offload_PerformanceTest"
    depends_on: "record_KV_Cache_Offload_CorrectnessTest"
    soft_fail: true
    agents:
      queue: tpu_v6e_queue
    commands:
      - |
        buildkite-agent meta-data set "KV_Cache_Offload_PerformanceTest" "to be added"
  - label: "Record performance test result for KV Cache Offload"
    key: "record_KV_Cache_Offload_PerformanceTest"
    depends_on: "KV_Cache_Offload_PerformanceTest"
    env:
      CI_TARGET: "KV Cache Offload"
      CI_STAGE: "PerformanceTest"
      CI_CATEGORY: "feature support matrix"
    agents:
      queue: cpu
    commands:
      - |
        .buildkite/scripts/record_step_result.sh KV_Cache_Offload_PerformanceTest
20 changes: 19 additions & 1 deletion .buildkite/pipeline_jax.yml
@@ -122,6 +122,7 @@ steps:
--ignore=/workspace/tpu_inference/tests/e2e \
--ignore=/workspace/tpu_inference/tpu_inference/mock \
--ignore=/workspace/tpu_inference/tests/layers/vllm/test_compressed_tensors_moe.py \
--ignore=/workspace/tpu_inference/tests/distributed/offload \
--cov-config=/workspace/tpu_inference/.coveragerc --cov tpu_inference --cov-report term-missing --cov-fail-under=69

- label: "JAX unit tests - kernels"
@@ -137,6 +138,7 @@
--ignore=/workspace/tpu_inference/tests/kernels/ragged_paged_attention_kernel_v2_test.py \
--ignore=/workspace/tpu_inference/tests/kernels/ragged_kv_cache_update_v2_test.py \
--ignore=/workspace/tpu_inference/tests/kernels/collectives \
--ignore=/workspace/tpu_inference/tests/kernels/host_dma_test.py \
--ignore=/workspace/tpu_inference/tests/kernels/fused_moe_v1_test.py
else
echo "Skipping: no changes detected in kernels, tests/kernels, or requirements.txt"
@@ -255,6 +257,21 @@ steps:
echo "Skipping: NIGHTLY environment variable not set"
exit 0
fi

- label: "kv cache offload tests on multi chips"
key: test_17
soft_fail: true
env:
USE_V6E8_QUEUE: "True"
VLLM_LOG_LEVEL: "INFO"
agents:
queue: tpu_v6e_8_queue
commands:
- |
.buildkite/scripts/run_in_docker.sh \
python3 -m pytest -s -v -x /workspace/tpu_inference/tests/distributed/offload/ \
/workspace/tpu_inference/tests/kernels/host_dma_test.py \
--ignore=/workspace/tpu_inference/tests/distributed/offload/tpu_offload_accuracy_test.py
# -----------------------------------------------------------------
# NOTIFICATION STEP
# -----------------------------------------------------------------
@@ -277,9 +294,10 @@ steps:
- test_13
- test_15
- test_16
- test_17
agents:
queue: cpu
commands:
- |
.buildkite/scripts/check_results.sh \
"TPU JAX Tests Failed" test_0 test_1 test_2 test_3 test_4 test_5 test_6 test_7 test_8 test_9 test_10 test_11 test_12 test_13 test_15 test_16
"TPU JAX Tests Failed" test_0 test_1 test_2 test_3 test_4 test_5 test_6 test_7 test_8 test_9 test_10 test_11 test_12 test_13 test_15 test_16 test_17
117 changes: 117 additions & 0 deletions examples/offload/gke/benchmarks/README.md
@@ -0,0 +1,117 @@
# Benchmarks using SGLang bench_serving tool

This guide outlines the steps to deploy a vLLM serving instance on Google Kubernetes Engine (GKE) with TPUs, create a service to expose it, and then run the SGLang `bench_serving.py` benchmark against it. Two deployment options for vLLM are provided: a baseline without host offload and one with TPU host offload for KV cache.

## Prerequisites

* `kubectl` configured to connect to your GKE cluster.
* `gcloud` CLI installed and authenticated.
* A GKE cluster with TPU nodes (the steps below have been verified on a `ct6e-standard-8t` GKE node)
* Access to the Llama-3.3-70B model on Hugging Face

## 1. Create Hugging Face Token Secret

A Hugging Face token is required to pull the model. Create a Kubernetes secret with your token:

```bash
kubectl create secret generic hf-token-secret --from-literal=token='<YOUR_HF_TOKEN>'
```

Replace `<YOUR_HF_TOKEN>` with your actual Hugging Face token.

## 2. Deploy vLLM Pod (Choose One)

Choose one of the following deployment options for your vLLM pod. Make sure the pod spec references the correct container image.

### Option A: Baseline vLLM (No Host Offload)

This deployment uses a standard vLLM setup without a TPU host offload connector. The KV cache resides entirely in TPU HBM.

```bash
kubectl apply -f deploy-baseline.yaml
```

### Option B: vLLM with TPU Host Offload

This deployment configures vLLM to use a `TPUOffloadConnector`, which offloads KV cache from TPU HBM to host CPU memory. The connector is specified via the `--kv-transfer-config` argument.

```bash
kubectl apply -f deploy-cpu-offload.yaml
```
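
The exact connector settings live in `deploy-cpu-offload.yaml`. As a rough illustration only (the connector name comes from this example; the other fields are placeholders, so defer to the YAML for the values actually used), the serve command gains a JSON-valued flag along these lines:

```bash
# Illustrative sketch only -- see deploy-cpu-offload.yaml for the exact arguments used in this example.
vllm serve meta-llama/Llama-3.3-70B-Instruct \
  --tensor-parallel-size 8 \
  --kv-transfer-config '{"kv_connector": "TPUOffloadConnector", "kv_role": "kv_both"}'
```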

## 3. Deploy Service

Deploy a LoadBalancer service to expose your vLLM deployment. This will provide an external IP address to send benchmark requests to.

```bash
kubectl apply -f service.yaml
```

After deployment, get the external IP of the service:

```bash
kubectl get service tpu-offline-inference -o jsonpath='{.status.loadBalancer.ingress[0].ip}'
```

This command will directly output the external IP address. It might take a few minutes for the IP to be provisioned.

## 4. Run Benchmark

Instead of installing SGLang locally, we can run the benchmark from within the Kubernetes cluster using a dedicated pod. This approach avoids local dependency management and ensures the benchmark runs in a consistent environment.

### a. Configure the Benchmark Pod

A sample pod specification is provided in `benchmark-pod.yaml`. Before deploying it, you need to configure the environment variables within the file, especially the `IP` of the vLLM service.

Open `benchmark-pod.yaml` and replace `<Your service EXTERNAL-IP>` with the actual external IP address of your `tpu-offline-inference` service obtained in step 3.

You can also adjust the following benchmark parameters via environment variables in the `benchmark-pod.yaml` file:

* `GSP_NUM_GROUPS`: The number of unique system prompts.
* `GSP_PROMPTS_PER_GROUP`: The number of questions per system prompt.
* `GSP_SYSTEM_PROMPT_LEN`: The token length of the system prompt.
* `GSP_QUESTION_LEN`: The token length of the question.
* `GSP_OUTPUT_LEN`: The desired output token length.
* `MODEL`: The model to benchmark.
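
For reference, these parameters are plain container environment variables in `benchmark-pod.yaml`; the values below are the defaults shipped with this example, with `IP` shown as the placeholder you are meant to replace:

```yaml
env:
  - name: IP
    value: "<Your service EXTERNAL-IP>"
  - name: GSP_NUM_GROUPS
    value: "2"
  - name: GSP_PROMPTS_PER_GROUP
    value: "16"
  - name: GSP_SYSTEM_PROMPT_LEN
    value: "2048"
```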

### b. Deploy the Benchmark Pod

Once configured, deploy the benchmark pod:

```bash
kubectl apply -f benchmark-pod.yaml
```

The pod will start, clone the SGLang repository, install dependencies, and run the benchmark.

### c. Monitor the Benchmark

You can monitor the progress of the benchmark by checking the logs of the pod:

```bash
kubectl logs -f sglang-benchmark
```

The pod is configured with `restartPolicy: Never`, so it will run the benchmark once and then complete.

## 5. Understanding `generated-shared-prefix` Dataset

The `generated-shared-prefix` dataset is designed to benchmark serving performance for workloads where multiple requests share a common, long prefix. This is common in applications using system prompts or few-shot examples.

**How it works:**

1. **System Prompt Generation:** A specified number of unique "system prompts" are generated. Each is a long sequence of random tokens.
2. **Question Generation:** Shorter "questions" (random tokens) are generated.
3. **Prompt Combination:** Each system prompt is combined with multiple unique questions to form final prompts. This creates groups of prompts where each prompt in a group shares the exact same system prompt as a prefix.
4. **Request Creation:** Each final prompt is packaged with its desired output length.
5. **Shuffling:** The entire set of generated requests is randomly shuffled. This interleaves requests from different groups, simulating realistic traffic where shared prefixes are not necessarily processed sequentially.
6. **Caching:** The generated dataset is cached locally for faster subsequent runs with the same parameters.

**Key Parameters for `generated-shared-prefix`:**

* `--gsp-num-groups`: The number of unique system prompts to generate. Each system prompt forms a "group" of requests.
* `--gsp-prompts-per-group`: The number of unique questions that will be appended to each system prompt. This determines how many requests will share a given system prompt.
* `--gsp-system-prompt-len`: The length (in tokens) of each generated system prompt.
* `--gsp-question-len`: The length (in tokens) of each generated question.
* `--gsp-output-len`: The desired length (in tokens) of the generated output for each request.
* `--seed`: (Optional) An integer seed for random number generation, ensuring reproducible prompt generation and request shuffling across runs.
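
For example, with the defaults used in `benchmark-pod.yaml` (`--gsp-num-groups=2`, `--gsp-prompts-per-group=16`), the dataset contains 2 × 16 = 32 requests, each with a ~2048-token shared prefix plus a ~256-token question and a 512-token output budget. Assuming a local clone of the SGLang repository at tag `v0.5.2`, the equivalent standalone invocation is roughly:

```bash
python3 python/sglang/bench_serving.py \
  --backend=vllm \
  --host=<EXTERNAL-IP> \
  --port=80 \
  --model=meta-llama/Llama-3.3-70B-Instruct \
  --tokenizer=meta-llama/Llama-3.3-70B-Instruct \
  --dataset-name='generated-shared-prefix' \
  --gsp-num-groups=2 \
  --gsp-prompts-per-group=16 \
  --gsp-system-prompt-len=2048 \
  --gsp-question-len=256 \
  --gsp-output-len=512 \
  --seed 42
```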
55 changes: 55 additions & 0 deletions examples/offload/gke/benchmarks/benchmark-pod.yaml
@@ -0,0 +1,55 @@
apiVersion: v1
kind: Pod
metadata:
  name: sglang-benchmark
spec:
  containers:
  - name: sglang-benchmark-container
    image: python:3.9-slim
    command: ["/bin/bash", "-c"]
    args:
      - |
        set -ex
        apt-get update && apt-get install -y git
        git clone -b v0.5.2 https://github.com/sgl-project/sglang.git
        cd sglang
        pip install --upgrade pip
        pip install protobuf aiohttp numpy requests tqdm transformers
        python3 python/sglang/bench_serving.py \
          --host=$(IP) \
          --port=$(PORT) \
          --dataset-name='generated-shared-prefix' \
          --model=$(MODEL) \
          --tokenizer=$(MODEL) \
          --backend=vllm \
          --gsp-num-groups=$(GSP_NUM_GROUPS) \
          --gsp-prompts-per-group=$(GSP_PROMPTS_PER_GROUP) \
          --gsp-system-prompt-len=$(GSP_SYSTEM_PROMPT_LEN) \
          --gsp-question-len=$(GSP_QUESTION_LEN) \
          --gsp-output-len=$(GSP_OUTPUT_LEN) \
          --request-rate=800 \
          --max-concurrency=300 \
          --seed 42
    env:
    - name: IP
      value: "34.162.66.198" # Replace with the external IP of your deployed service
    - name: PORT
      value: "80"
    - name: MODEL
      value: "meta-llama/Llama-3.3-70B-Instruct"
    - name: HF_TOKEN
      valueFrom:
        secretKeyRef:
          name: hf-token-secret
          key: token
    - name: GSP_NUM_GROUPS
      value: "2"
    - name: GSP_PROMPTS_PER_GROUP
      value: "16"
    - name: GSP_SYSTEM_PROMPT_LEN
      value: "2048"
    - name: GSP_QUESTION_LEN
      value: "256"
    - name: GSP_OUTPUT_LEN
      value: "512"
  restartPolicy: Never
39 changes: 39 additions & 0 deletions examples/offload/gke/benchmarks/deploy-baseline.yaml
@@ -0,0 +1,39 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: tpu-offline-inference
spec:
  replicas: 1
  selector:
    matchLabels:
      app: tpu-offline-inference
  template:
    metadata:
      labels:
        app: tpu-offline-inference
    spec:
      nodeSelector:
        cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice
        cloud.google.com/gke-tpu-topology: 2x4 # Specify the physical topology for the TPU slice.
      containers:
      - name: tpu-job
        image: <your-tpu-inference-container-image>
        imagePullPolicy: Always
        command: ["/bin/sh", "-c"]
        args:
          - "vllm serve meta-llama/Llama-3.3-70B-Instruct --port 8000 --enable-chunked-prefill --tensor-parallel-size 8 --seed 42 --enable_prefix_caching --gpu-memory-utilization 0.9"
        env:
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-token-secret
              key: token
        - name: SKIP_JAX_PRECOMPILE
          value: "1"
        ports:
        - containerPort: 8000
        resources:
          requests:
            google.com/tpu: 8
          limits:
            google.com/tpu: 8