Commits (29)
e8d4c5c
feat: TPU host offload for KV cache
juncgu-google Nov 24, 2025
cd5cce2
tweaks
juncgu-google Nov 24, 2025
21ae0de
offload envs
juncgu-google Nov 24, 2025
a5ec87d
rm saving behavior
juncgu-google Nov 24, 2025
0fc7dad
tweaks
juncgu-google Nov 24, 2025
df3b091
staging_tokens --> staging_blocks
juncgu-google Nov 24, 2025
aca95f1
updte gke yaml
juncgu-google Nov 24, 2025
ace918a
tweaks
juncgu-google Nov 25, 2025
894c747
fix imports in kv_cache tests
juncgu-google Nov 25, 2025
a24e4bb
tweaks
juncgu-google Nov 25, 2025
616ac13
tweaks
juncgu-google Nov 25, 2025
6f8ae20
multi-request worker test
juncgu-google Nov 26, 2025
ff4d31f
debug: add jax block
juncgu-google Nov 26, 2025
43f8f1e
worker_test: multi requests; acc_test: precompile
juncgu-google Dec 1, 2025
97153b9
add feature test
juncgu-google Dec 1, 2025
560caf8
follow up changes in the upstream; and update test scripts
juncgu-google Dec 1, 2025
12c4885
update ci tests
juncgu-google Dec 3, 2025
2901e56
update unit-test yml
juncgu-google Dec 3, 2025
7b0a20a
Update test
dannawang0221 Dec 4, 2025
63b0c0b
fix gke kv cache verification with sampling_param.temperature=0
juncgu-google Dec 4, 2025
8b79f68
Change sampling params to configrable
dannawang0221 Dec 4, 2025
a3ff52b
config pre-mapped buffer of tpu
juncgu-google Dec 5, 2025
0249329
Update benchmark pods
dannawang0221 Dec 6, 2025
9ac152e
tweaks
juncgu-google Dec 6, 2025
b0ddb8c
fix load_spec for unscheduled requests; fix cached request with both …
juncgu-google Dec 10, 2025
43e2fb4
cpu chunk: ready_to_evict: ref_cnt==0
juncgu-google Dec 11, 2025
0d39925
update unit tests
juncgu-google Dec 12, 2025
7d81d90
put offload folder under tpu_inference
juncgu-google Dec 12, 2025
b659166
Update banchmark pods
dannawang0221 Dec 16, 2025
45 changes: 0 additions & 45 deletions .buildkite/features/KV_Cache_Host_Offloading.yml

This file was deleted.

49 changes: 49 additions & 0 deletions .buildkite/features/KV_Cache_Offload.yml
@@ -0,0 +1,49 @@
# KV Cache Offload
# feature support matrix
steps:
  - label: "Correctness tests for KV Cache Offload"
    key: "KV_Cache_Offload_CorrectnessTest"
    soft_fail: true
    env:
      USE_V6E8_QUEUE: "True"
      VLLM_LOG_LEVEL: "INFO"
    agents:
      queue: tpu_v6e_8_queue
    commands:
      - |
        .buildkite/scripts/run_in_docker.sh \
          python3 -m pytest -s -v /workspace/tpu_inference/tests/distributed/offload/tpu_offload_accuracy_test.py
  - label: "Record correctness test result for KV Cache Offload"
    key: "record_KV_Cache_Offload_CorrectnessTest"
    depends_on: "KV_Cache_Offload_CorrectnessTest"
    env:
      CI_TARGET: "KV Cache Offload"
      CI_STAGE: "CorrectnessTest"
      CI_CATEGORY: "feature support matrix"
    agents:
      queue: cpu
    commands:
      - |
        .buildkite/scripts/record_step_result.sh KV_Cache_Offload_CorrectnessTest

  - label: "Performance tests for KV Cache Offload"
    key: "KV_Cache_Offload_PerformanceTest"
    depends_on: "record_KV_Cache_Offload_CorrectnessTest"
    soft_fail: true
    agents:
      queue: tpu_v6e_queue
    commands:
      - |
        buildkite-agent meta-data set "KV_Cache_Offload_PerformanceTest" "to be added"
  - label: "Record performance test result for KV Cache Offload"
    key: "record_KV_Cache_Offload_PerformanceTest"
    depends_on: "KV_Cache_Offload_PerformanceTest"
    env:
      CI_TARGET: "KV Cache Offload"
      CI_STAGE: "PerformanceTest"
      CI_CATEGORY: "feature support matrix"
    agents:
      queue: cpu
    commands:
      - |
        .buildkite/scripts/record_step_result.sh KV_Cache_Offload_PerformanceTest
20 changes: 19 additions & 1 deletion .buildkite/pipeline_jax.yml
@@ -122,6 +122,7 @@ steps:
--ignore=/workspace/tpu_inference/tests/e2e \
--ignore=/workspace/tpu_inference/tpu_inference/mock \
--ignore=/workspace/tpu_inference/tests/layers/vllm/test_compressed_tensors_moe.py \
--ignore=/workspace/tpu_inference/tests/distributed/offload \
--cov-config=/workspace/tpu_inference/.coveragerc --cov tpu_inference --cov-report term-missing --cov-fail-under=69

- label: "JAX unit tests - kernels"
@@ -137,6 +138,7 @@
--ignore=/workspace/tpu_inference/tests/kernels/ragged_paged_attention_kernel_v2_test.py \
--ignore=/workspace/tpu_inference/tests/kernels/ragged_kv_cache_update_v2_test.py \
--ignore=/workspace/tpu_inference/tests/kernels/collectives \
--ignore=/workspace/tpu_inference/tests/kernels/host_dma_test.py \
--ignore=/workspace/tpu_inference/tests/kernels/fused_moe_v1_test.py
else
echo "Skipping: no changes detected in kernels, tests/kernels, or requirements.txt"
@@ -255,6 +257,21 @@ steps:
echo "Skipping: NIGHTLY environment variable not set"
exit 0
fi

- label: "kv cache offload tests on multi chips"
key: test_17
soft_fail: true
env:
USE_V6E8_QUEUE: "True"
VLLM_LOG_LEVEL: "INFO"
agents:
queue: tpu_v6e_8_queue
commands:
- |
.buildkite/scripts/run_in_docker.sh \
python3 -m pytest -s -v -x /workspace/tpu_inference/tests/distributed/offload/ \
/workspace/tpu_inference/tests/kernels/host_dma_test.py \
--ignore=/workspace/tpu_inference/tests/distributed/offload/tpu_offload_accuracy_test.py
# -----------------------------------------------------------------
# NOTIFICATION STEP
# -----------------------------------------------------------------
@@ -277,9 +294,10 @@ steps:
- test_13
- test_15
- test_16
- test_17
agents:
queue: cpu
commands:
- |
.buildkite/scripts/check_results.sh \
"TPU JAX Tests Failed" test_0 test_1 test_2 test_3 test_4 test_5 test_6 test_7 test_8 test_9 test_10 test_11 test_12 test_13 test_15 test_16
"TPU JAX Tests Failed" test_0 test_1 test_2 test_3 test_4 test_5 test_6 test_7 test_8 test_9 test_10 test_11 test_12 test_13 test_15 test_16 test_17
117 changes: 117 additions & 0 deletions examples/offload/gke/benchmarks/README.md
@@ -0,0 +1,117 @@
# Benchmarks using SGLang bench_serving tool

This guide outlines the steps to deploy a vLLM serving instance on Google Kubernetes Engine (GKE) with TPUs, create a service to expose it, and then run the SGLang `bench_serving.py` benchmark against it. Two deployment options for vLLM are provided: a baseline without host offload and one with TPU host offload for KV cache.

## Prerequisites

* `kubectl` configured to connect to your GKE cluster.
* `gcloud` CLI installed and authenticated.
* A GKE cluster with TPU nodes (the steps below have been verified on a `ct6e-standard-8t` GKE node)
* Access to the Llama-3.3-70B model on Hugging Face

## 1. Create Hugging Face Token Secret

A Hugging Face token is required to pull the model. Create a Kubernetes secret with your token:

```bash
kubectl create secret generic hf-token-secret --from-literal=token='<YOUR_HF_TOKEN>'
```

Replace `<YOUR_HF_TOKEN>` with your actual Hugging Face token.

## 2. Deploy vLLM Pod (Choose One)

Choose one of the following deployment options for your vLLM pod. Make sure the pod spec references the correct container image.

### Option A: Baseline vLLM (No Host Offload)

This deployment uses a standard vLLM setup without a TPU host offload connector. The KV cache resides entirely in TPU HBM.

```bash
kubectl apply -f deploy-baseline.yaml
```

### Option B: vLLM with TPU Host Offload

This deployment configures vLLM to use a `TPUOffloadConnector`, which offloads KV cache from TPU HBM to host CPU memory. The connector is specified via the `--kv-transfer-config` argument.

```bash
kubectl apply -f deploy-cpu-offload.yaml
```
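
The exact connector settings live in `deploy-cpu-offload.yaml`. As a rough illustration only (the connector name comes from this example; the other fields are placeholders, so defer to the YAML for the values actually used), the serve command gains a JSON-valued flag along these lines:

```bash
# Illustrative sketch only -- see deploy-cpu-offload.yaml for the exact arguments used in this example.
vllm serve meta-llama/Llama-3.3-70B-Instruct \
  --tensor-parallel-size 8 \
  --kv-transfer-config '{"kv_connector": "TPUOffloadConnector", "kv_role": "kv_both"}'
```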

## 3. Deploy Service

Deploy a LoadBalancer service to expose your vLLM deployment. This will provide an external IP address to send benchmark requests to.

```bash
kubectl apply -f service.yaml
```

After deployment, get the external IP of the service:

```bash
kubectl get service tpu-offline-inference -o jsonpath='{.status.loadBalancer.ingress[0].ip}'
```

This command will directly output the external IP address. It might take a few minutes for the IP to be provisioned.

## 4. Run Benchmark

Instead of installing SGLang locally, we can run the benchmark from within the Kubernetes cluster using a dedicated pod. This approach avoids local dependency management and ensures the benchmark runs in a consistent environment.

### a. Configure the Benchmark Pod

A sample pod specification is provided in `benchmark-pod.yaml`. Before deploying it, you need to configure the environment variables within the file, especially the `IP` of the vLLM service.

Open `benchmark-pod.yaml` and replace `<Your service EXTERNAL-IP>` with the actual external IP address of your `tpu-offline-inference` service obtained in step 3.

You can also adjust the following benchmark parameters via environment variables in the `benchmark-pod.yaml` file:

* `GSP_NUM_GROUPS`: The number of unique system prompts.
* `GSP_PROMPTS_PER_GROUP`: The number of questions per system prompt.
* `GSP_SYSTEM_PROMPT_LEN`: The token length of the system prompt.
* `GSP_QUESTION_LEN`: The token length of the question.
* `GSP_OUTPUT_LEN`: The desired output token length.
* `MODEL`: The model to benchmark.
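
For reference, these parameters are plain container environment variables in `benchmark-pod.yaml`; the values below are the defaults shipped with this example, with `IP` shown as the placeholder you are meant to replace:

```yaml
env:
  - name: IP
    value: "<Your service EXTERNAL-IP>"
  - name: GSP_NUM_GROUPS
    value: "2"
  - name: GSP_PROMPTS_PER_GROUP
    value: "16"
  - name: GSP_SYSTEM_PROMPT_LEN
    value: "2048"
```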

### b. Deploy the Benchmark Pod

Once configured, deploy the benchmark pod:

```bash
kubectl apply -f benchmark-pod.yaml
```

The pod will start, clone the SGLang repository, install dependencies, and run the benchmark.

### c. Monitor the Benchmark

You can monitor the progress of the benchmark by checking the logs of the pod:

```bash
kubectl logs -f sglang-benchmark
```

The pod is configured with `restartPolicy: Never`, so it will run the benchmark once and then complete.

## 5. Understanding `generated-shared-prefix` Dataset

The `generated-shared-prefix` dataset is designed to benchmark serving performance for workloads where multiple requests share a common, long prefix. This is common in applications using system prompts or few-shot examples.

**How it works:**

1. **System Prompt Generation:** A specified number of unique "system prompts" are generated. Each is a long sequence of random tokens.
2. **Question Generation:** Shorter "questions" (random tokens) are generated.
3. **Prompt Combination:** Each system prompt is combined with multiple unique questions to form final prompts. This creates groups of prompts where each prompt in a group shares the exact same system prompt as a prefix.
4. **Request Creation:** Each final prompt is packaged with its desired output length.
5. **Shuffling:** The entire set of generated requests is randomly shuffled. This interleaves requests from different groups, simulating realistic traffic where shared prefixes are not necessarily processed sequentially.
6. **Caching:** The generated dataset is cached locally for faster subsequent runs with the same parameters.

**Key Parameters for `generated-shared-prefix`:**

* `--gsp-num-groups`: The number of unique system prompts to generate. Each system prompt forms a "group" of requests.
* `--gsp-prompts-per-group`: The number of unique questions that will be appended to each system prompt. This determines how many requests will share a given system prompt.
* `--gsp-system-prompt-len`: The length (in tokens) of each generated system prompt.
* `--gsp-question-len`: The length (in tokens) of each generated question.
* `--gsp-output-len`: The desired length (in tokens) of the generated output for each request.
* `--seed`: (Optional) An integer seed for random number generation, ensuring reproducible prompt generation and request shuffling across runs.
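
For example, with the defaults used in `benchmark-pod.yaml` (`--gsp-num-groups=2`, `--gsp-prompts-per-group=16`), the dataset contains 2 × 16 = 32 requests, each with a ~2048-token shared prefix plus a ~256-token question and a 512-token output budget. Assuming a local clone of the SGLang repository at tag `v0.5.2`, the equivalent standalone invocation is roughly:

```bash
python3 python/sglang/bench_serving.py \
  --backend=vllm \
  --host=<EXTERNAL-IP> \
  --port=80 \
  --model=meta-llama/Llama-3.3-70B-Instruct \
  --tokenizer=meta-llama/Llama-3.3-70B-Instruct \
  --dataset-name='generated-shared-prefix' \
  --gsp-num-groups=2 \
  --gsp-prompts-per-group=16 \
  --gsp-system-prompt-len=2048 \
  --gsp-question-len=256 \
  --gsp-output-len=512 \
  --seed 42
```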
55 changes: 55 additions & 0 deletions examples/offload/gke/benchmarks/benchmark-pod.yaml
@@ -0,0 +1,55 @@
apiVersion: v1
kind: Pod
metadata:
  name: sglang-benchmark
spec:
  containers:
  - name: sglang-benchmark-container
    image: python:3.9-slim
    command: ["/bin/bash", "-c"]
    args:
      - |
        set -ex
        apt-get update && apt-get install -y git
        git clone -b v0.5.2 https://github.com/sgl-project/sglang.git
        cd sglang
        pip install --upgrade pip
        pip install protobuf aiohttp numpy requests tqdm transformers
        python3 python/sglang/bench_serving.py \
          --host=$(IP) \
          --port=$(PORT) \
          --dataset-name='generated-shared-prefix' \
          --model=$(MODEL) \
          --tokenizer=$(MODEL) \
          --backend=vllm \
          --gsp-num-groups=$(GSP_NUM_GROUPS) \
          --gsp-prompts-per-group=$(GSP_PROMPTS_PER_GROUP) \
          --gsp-system-prompt-len=$(GSP_SYSTEM_PROMPT_LEN) \
          --gsp-question-len=$(GSP_QUESTION_LEN) \
          --gsp-output-len=$(GSP_OUTPUT_LEN) \
          --request-rate=800 \
          --max-concurrency=300 \
          --seed 42
    env:
    - name: IP
      value: "34.162.66.198" # Replace with the external IP of your deployed service
    - name: PORT
      value: "80"
    - name: MODEL
      value: "meta-llama/Llama-3.3-70B-Instruct"
    - name: HF_TOKEN
      valueFrom:
        secretKeyRef:
          name: hf-token-secret
          key: token
    - name: GSP_NUM_GROUPS
      value: "2"
    - name: GSP_PROMPTS_PER_GROUP
      value: "16"
    - name: GSP_SYSTEM_PROMPT_LEN
      value: "2048"
    - name: GSP_QUESTION_LEN
      value: "256"
    - name: GSP_OUTPUT_LEN
      value: "512"
  restartPolicy: Never
39 changes: 39 additions & 0 deletions examples/offload/gke/benchmarks/deploy-baseline.yaml
@@ -0,0 +1,39 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: tpu-offline-inference
spec:
  replicas: 1
  selector:
    matchLabels:
      app: tpu-offline-inference
  template:
    metadata:
      labels:
        app: tpu-offline-inference
    spec:
      nodeSelector:
        cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice
        cloud.google.com/gke-tpu-topology: 2x4 # Specify the physical topology for the TPU slice.
      containers:
      - name: tpu-job
        image: <your-tpu-inference-container-image>
        imagePullPolicy: Always
        command: ["/bin/sh", "-c"]
        args:
          - "vllm serve meta-llama/Llama-3.3-70B-Instruct --port 8000 --enable-chunked-prefill --tensor-parallel-size 8 --seed 42 --enable_prefix_caching --gpu-memory-utilization 0.9"
        env:
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-token-secret
              key: token
        - name: SKIP_JAX_PRECOMPILE
          value: "1"
        ports:
        - containerPort: 8000
        resources:
          requests:
            google.com/tpu: 8
          limits:
            google.com/tpu: 8