From 1bed7e4ef377950e3e60053b7d6e7a86d285a750 Mon Sep 17 00:00:00 2001
From: Ylang Tsou <ylangt@google.com>
Date: Thu, 11 Dec 2025 16:34:15 +0800
Subject: [PATCH 1/2] Add workflow to build vLLM-TPU wheel using PyPI
 tpu-inference

Signed-off-by: Ylang Tsou <ylangt@google.com>
---
 .buildkite/pipeline_pypi.yml           | 54 +++++++++++++++
 .buildkite/scripts/bootstrap.sh        |  1 +
 .buildkite/scripts/build_vllm_tpu.sh   | 93 ++++++++++++++++++++++++++
 .buildkite/scripts/run_with_pypi.sh    | 10 +++
 .buildkite/scripts/setup_docker_env.sh | 12 +++-
 docker/Dockerfile.pypi                 | 33 +++++++++
 6 files changed, 202 insertions(+), 1 deletion(-)
 create mode 100644 .buildkite/pipeline_pypi.yml
 create mode 100755 .buildkite/scripts/build_vllm_tpu.sh
 create mode 100755 .buildkite/scripts/run_with_pypi.sh
 create mode 100644 docker/Dockerfile.pypi
diff --git a/.buildkite/pipeline_pypi.yml b/.buildkite/pipeline_pypi.yml
new file mode 100644
index 000000000..b78cabf85
--- /dev/null
+++ b/.buildkite/pipeline_pypi.yml
@@ -0,0 +1,54 @@
+steps:
+  # -----------------------------------------------------------------
+  # TEST STEPS - Calling wrapper
+  # -----------------------------------------------------------------
+   - label: "Wait for 20 mins"
+     if: build.env("NIGHTLY") == "1"
+     key: "wait_20_minutes"
+     depends_on: "record_verified_commit_hashes"
+     agents:
+      queue: cpu
+     commands:
+      - "echo 'Starting 20 minute delay...'"
+      - "sleep 1200"
+      - "echo 'Delay finished, starting benchmarks.'"
+
+   - label: "Performance benchmarks for meta-llama/Llama-3.1-8B-Instruct"
+     key: "meta-llama_Llama-3_1-8B-Instruct_Benchmark"
+     if: build.env("NIGHTLY") == "1"
+     depends_on: "wait_20_minutes"
+     agents:
+      queue: tpu_v6e_queue
+     env:
+      TEST_MODEL: meta-llama/Llama-3.1-8B-Instruct
+      TENSOR_PARALLEL_SIZE: 1
+      MINIMUM_THROUGHPUT_THRESHOLD: 10.77
+      INPUT_LEN: 1800
+      OUTPUT_LEN: 128
+      PREFIX_LEN: 0
+      MAX_MODEL_LEN: 2048
+      MAX_NUM_SEQS: 256
+      MAX_NUM_BATCHED_TOKENS: 1024
+     commands:
+      - |
+        .buildkite/scripts/run_with_pypi.sh bash /workspace/tpu_inference/tests/e2e/benchmarking/benchmark.sh
+
+   - label: "Performance benchmarks for Qwen/Qwen3-4B"
+     if: build.env("NIGHTLY") == "1"
+     key: "Qwen_Qwen3-4B_Benchmark"
+     depends_on: "wait_20_minutes"
+     agents:
+      queue: tpu_v6e_queue
+     env:
+      TEST_MODEL: Qwen/Qwen3-4B
+      TENSOR_PARALLEL_SIZE: 1
+      MINIMUM_THROUGHPUT_THRESHOLD: 11.00
+      INPUT_LEN: 1800
+      OUTPUT_LEN: 128
+      PREFIX_LEN: 0
+      MAX_MODEL_LEN: 2048
+      MAX_NUM_SEQS: 94
+      MAX_NUM_BATCHED_TOKENS: 4096
+     commands:
+      - |
+       .buildkite/scripts/run_with_pypi.sh bash /workspace/tpu_inference/tests/e2e/benchmarking/benchmark.sh
diff --git a/.buildkite/scripts/bootstrap.sh b/.buildkite/scripts/bootstrap.sh
index d567b3004..927b62341 100755
--- a/.buildkite/scripts/bootstrap.sh
+++ b/.buildkite/scripts/bootstrap.sh
@@ -29,6 +29,7 @@ upload_pipeline() {
     # buildkite-agent pipeline upload .buildkite/pipeline_torch.yml
     buildkite-agent pipeline upload .buildkite/main.yml
     buildkite-agent pipeline upload .buildkite/nightly_releases.yml
+    buildkite-agent pipeline upload .buildkite/pipeline_pypi.yml
 }
 
 echo "--- Starting Buildkite Bootstrap ---"
diff --git a/.buildkite/scripts/build_vllm_tpu.sh b/.buildkite/scripts/build_vllm_tpu.sh
new file mode 100755
index 000000000..73ac29a7a
--- /dev/null
+++ b/.buildkite/scripts/build_vllm_tpu.sh
@@ -0,0 +1,93 @@
+#!/bin/bash
+# Builds a vllm-tpu wheel from a vLLM checkout pinned to one tpu-inference release.
+# Usage: build_vllm_tpu.sh <tpu-inference-version> <vllm-tpu-version> [vllm-branch-or-tag]
+#
+# The vLLM repository is cloned into ./vllm unless that directory already exists.
+# requirements/tpu.txt is edited only for the duration of the build: a backup is
+# taken before the first modification and restored by an EXIT trap, so the
+# checkout's requirements file is put back even when the build fails.
+
+set -euo pipefail
+
+# --- Argument Validation ---
+# Validate before reading $1/$2 so a missing argument prints the usage text.
+if [ "$#" -lt 2 ]; then
+    echo "Usage: $0 <tpu-inference-version> <vllm-tpu-version> [vllm-branch-or-tag]"
+    echo "  [vllm-branch-or-tag] is optional, defaults to 'main'."
+    exit 1
+fi
+
+# --- Script Configuration ---
+TPU_INFERENCE_VERSION=$1
+VLLM_TPU_VERSION=$2
+VLLM_BRANCH=${3:-"main"}
+VLLM_REPO="https://github.com/vllm-project/vllm.git"
+REPO_DIR="vllm"
+REQUIRED_LINE="tpu-inference==${TPU_INFERENCE_VERSION}"
+REQUIREMENTS_FILE="requirements/tpu.txt"
+BACKUP_FILE="${REQUIREMENTS_FILE}.bak"
+
+# Puts the original requirements file back; a no-op when no backup exists.
+restore_requirements() {
+    if [ -f "$BACKUP_FILE" ]; then
+        echo "Reverting $REQUIREMENTS_FILE from backup."
+        mv -f "$BACKUP_FILE" "$REQUIREMENTS_FILE"
+    fi
+}
+
+echo "--- Starting vLLM-TPU wheel build ---"
+echo "TPU Inference Version: ${TPU_INFERENCE_VERSION}"
+echo "vLLM-TPU Version: ${VLLM_TPU_VERSION}"
+echo "vLLM Branch/Tag: ${VLLM_BRANCH}"
+
+# --- Step 1: Clone vLLM repository ---
+if [ -d "$REPO_DIR" ]; then
+    echo "Repository '$REPO_DIR' already exists. Skipping clone."
+else
+    echo "Cloning vLLM repository..."
+    git clone "${VLLM_REPO}" "${REPO_DIR}"
+fi
+cd "${REPO_DIR}"
+
+# --- Step 1.5: Checkout the specified vLLM branch/tag ---
+echo "Checking out vLLM branch/tag: ${VLLM_BRANCH}..."
+if ! git checkout "${VLLM_BRANCH}"; then
+    echo "ERROR: Failed to checkout branch/tag '${VLLM_BRANCH}'. Please check the branch/tag name."
+    exit 1
+fi
+echo "Successfully checked out ${VLLM_BRANCH}."
+# git pull fails on a tag or detached HEAD, so a failure here is only a warning.
+git pull || echo "Warning: Failed to pull updates (may be on a tag)."
+
+# --- Step 2: Update tpu-inference version in requirements ---
+echo "Updating tpu-inference version in $REQUIREMENTS_FILE..."
+if [ ! -f "$REQUIREMENTS_FILE" ]; then
+    echo "ERROR: $REQUIREMENTS_FILE not found in the vLLM checkout."
+    exit 1
+fi
+
+# Back up the untouched file first; from here on every exit path restores it.
+cp -p "$REQUIREMENTS_FILE" "$BACKUP_FILE"
+trap restore_requirements EXIT
+
+if grep -q "^tpu-inference==" "$BACKUP_FILE"; then
+    echo "(Action: Existing version found. Replacing.)"
+    sed "s/^tpu-inference==.*/$REQUIRED_LINE/" "$BACKUP_FILE" > "$REQUIREMENTS_FILE"
+else
+    echo "(Action: Line not found. Appending new dependency.)"
+    # Start a new line first when the file does not end with a newline.
+    if [ -n "$(tail -c 1 "$REQUIREMENTS_FILE")" ]; then
+        echo "" >> "$REQUIREMENTS_FILE"
+    fi
+    echo "$REQUIRED_LINE" >> "$REQUIREMENTS_FILE"
+fi
+
+# --- Step 3: Execute the vLLM TPU build script ---
+echo "Ensuring 'build' package is installed..."
+pip install build
+echo "Executing the vLLM TPU build script..."
+bash tools/vllm-tpu/build.sh "${VLLM_TPU_VERSION}"
+
+echo "--- Build complete! ---"
+echo "The wheel file can be found in the 'vllm/dist' directory."
+echo "--- Cleaning up local changes ---"
diff --git a/.buildkite/scripts/run_with_pypi.sh b/.buildkite/scripts/run_with_pypi.sh
new file mode 100755
index 000000000..a9ea777fb
--- /dev/null
+++ b/.buildkite/scripts/run_with_pypi.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# Exit on error, exit on unset variable, fail on pipe errors.
+set -euo pipefail
+
+# Build vllm-tpu with nightly tpu-inference from PyPI (using docker/Dockerfile.pypi instead of docker/Dockerfile).
+export RUN_WITH_PYPI="true"
+SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
+
+# shellcheck disable=SC1091
+source "$SCRIPT_DIR/run_in_docker.sh"
diff --git a/.buildkite/scripts/setup_docker_env.sh b/.buildkite/scripts/setup_docker_env.sh
index 3aded737b..06a2acebf 100644
--- a/.buildkite/scripts/setup_docker_env.sh
+++ b/.buildkite/scripts/setup_docker_env.sh
@@ -7,6 +7,16 @@ setup_environment() {
   local image_name_param=${1:-"vllm-tpu"}
   IMAGE_NAME="$image_name_param"
 
+  local DOCKERFILE_NAME="Dockerfile"
+
+  # Select the Dockerfile: docker/Dockerfile.pypi when RUN_WITH_PYPI is "true", docker/Dockerfile otherwise.
+  if [[ "${RUN_WITH_PYPI:-false}" == "true" ]]; then
+    DOCKERFILE_NAME="Dockerfile.pypi"
+    echo "Building from PyPI packages. Using docker/${DOCKERFILE_NAME}"
+  else
+    echo "Building from source. Using docker/${DOCKERFILE_NAME}"
+  fi
+
   if ! grep -q "^HF_TOKEN=" /etc/environment; then
     gcloud secrets versions access latest --secret=bm-agent-hf-token --quiet | \
     sudo tee -a /etc/environment > /dev/null <<< "HF_TOKEN=$(cat)"
@@ -60,5 +70,5 @@ setup_environment() {
   docker build \
       --build-arg VLLM_COMMIT_HASH="${VLLM_COMMIT_HASH}" \
       --build-arg IS_FOR_V7X="${IS_FOR_V7X:-false}" \
-      --no-cache -f docker/Dockerfile -t "${IMAGE_NAME}:${BUILDKITE_COMMIT}" .
+      --no-cache -f "docker/${DOCKERFILE_NAME}" -t "${IMAGE_NAME}:${BUILDKITE_COMMIT}" .
 }
diff --git a/docker/Dockerfile.pypi b/docker/Dockerfile.pypi
new file mode 100644
index 000000000..45b1898c0
--- /dev/null
+++ b/docker/Dockerfile.pypi
@@ -0,0 +1,33 @@
+ARG NIGHTLY_DATE="20250714"
+ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.12_tpuvm_$NIGHTLY_DATE"
+# The latest main will be used if arg unspecified
+ARG VLLM_COMMIT_HASH=""
+
+FROM $BASE_IMAGE
+
+# Remove existing versions of dependencies
+RUN pip uninstall -y torch torch_xla torchvision
+
+# Install some basic utilities
+RUN apt-get update && apt-get install -y \
+    git \
+    libopenblas-base libopenmpi-dev libomp-dev
+
+# Install tpu_inference
+WORKDIR /workspace/tpu_inference
+COPY requirements_benchmarking.txt .
+# These are needed for the E2E benchmarking tests (i.e. tests/e2e/benchmarking/mlperf.sh)
+RUN pip install -r requirements_benchmarking.txt --retries 3
+COPY . .
+
+# Build vllm-tpu wheel
+WORKDIR /workspace
+ARG VLLM_COMMIT_HASH
+# Abort with a clear error if PyPI lists no tpu-inference dev release. An empty VLLM_COMMIT_HASH makes the build script fall back to vLLM 'main'.
+RUN TPU_INFERENCE_VERSION=$(pip index versions tpu-inference --pre 2>/dev/null | grep -oE "[0-9]+\.[0-9]+\.[0-9]+\.dev[0-9]+" | head -n 1) && VLLM_TPU_VERSION=${TPU_INFERENCE_VERSION} && \
+    if [ -z "${TPU_INFERENCE_VERSION}" ]; then echo "ERROR: no tpu-inference dev release found on PyPI" >&2; exit 1; fi && \
+    bash tpu_inference/.buildkite/scripts/build_vllm_tpu.sh "${TPU_INFERENCE_VERSION}" "${VLLM_TPU_VERSION}" "${VLLM_COMMIT_HASH}"
+
+# Install vllm-tpu wheel
+RUN pip install --no-cache-dir vllm/dist/*.whl
+CMD ["/bin/bash"]

From 981ba3bda33090a3ffb3817997bbbe9337a2bb3c Mon Sep 17 00:00:00 2001
From: Ylang Tsou <ylangt@google.com>
Date: Fri, 12 Dec 2025 10:59:09 +0800
Subject: [PATCH 2/2] Add check for the nightly tpu-inference version on PyPI

Signed-off-by: Ylang Tsou <ylangt@google.com>
---
 .buildkite/pipeline_pypi.yml        | 19 ++++-----------
 .buildkite/scripts/bootstrap.sh     |  6 ++---
 .buildkite/scripts/run_with_pypi.sh | 36 +++++++++++++++++++++++++++++
 docker/Dockerfile.pypi              |  8 ++++++-
 4 files changed, 50 insertions(+), 19 deletions(-)

diff --git a/.buildkite/pipeline_pypi.yml b/.buildkite/pipeline_pypi.yml
index b78cabf85..482fe7d85 100644
--- a/.buildkite/pipeline_pypi.yml
+++ b/.buildkite/pipeline_pypi.yml
@@ -2,21 +2,10 @@ steps:
   # -----------------------------------------------------------------
   # TEST STEPS - Calling wrapper
   # -----------------------------------------------------------------
-   - label: "Wait for 20 mins"
-     if: build.env("NIGHTLY") == "1"
-     key: "wait_20_minutes"
-     depends_on: "record_verified_commit_hashes"
-     agents:
-      queue: cpu
-     commands:
-      - "echo 'Starting 20 minute delay...'"
-      - "sleep 1200"
-      - "echo 'Delay finished, starting benchmarks.'"
-
    - label: "Performance benchmarks for meta-llama/Llama-3.1-8B-Instruct"
      key: "meta-llama_Llama-3_1-8B-Instruct_Benchmark"
-     if: build.env("NIGHTLY") == "1"
-     depends_on: "wait_20_minutes"
+     # NOTE(review): NIGHTLY gate commented out, so this step is no longer limited to nightly builds -- looks temporary; confirm before merging: if: build.env("NIGHTLY") == "1"
+     depends_on: "record_verified_commit_hashes"
      agents:
       queue: tpu_v6e_queue
      env:
@@ -34,9 +23,9 @@ steps:
         .buildkite/scripts/run_with_pypi.sh bash /workspace/tpu_inference/tests/e2e/benchmarking/benchmark.sh
 
    - label: "Performance benchmarks for Qwen/Qwen3-4B"
-     if: build.env("NIGHTLY") == "1"
+     # NOTE(review): NIGHTLY gate commented out, so this step is no longer limited to nightly builds -- looks temporary; confirm before merging: if: build.env("NIGHTLY") == "1"
      key: "Qwen_Qwen3-4B_Benchmark"
-     depends_on: "wait_20_minutes"
+     depends_on: "record_verified_commit_hashes"
      agents:
       queue: tpu_v6e_queue
      env:
diff --git a/.buildkite/scripts/bootstrap.sh b/.buildkite/scripts/bootstrap.sh
index 927b62341..9a9ce4ff8 100755
--- a/.buildkite/scripts/bootstrap.sh
+++ b/.buildkite/scripts/bootstrap.sh
@@ -24,11 +24,11 @@ upload_pipeline() {
     VLLM_COMMIT_HASH=$(git ls-remote https://github.com/vllm-project/vllm.git HEAD | awk '{ print $1}')
     buildkite-agent meta-data set "VLLM_COMMIT_HASH" "${VLLM_COMMIT_HASH}"
     echo "Using vllm commit hash: $(buildkite-agent meta-data get "VLLM_COMMIT_HASH")"
-    buildkite-agent pipeline upload .buildkite/pipeline_jax.yml
-    buildkite-agent pipeline upload .buildkite/pipeline_jax_tpu7x.yml
+    # NOTE(review): upload disabled by this patch, presumably only for testing -- confirm and restore before merging: buildkite-agent pipeline upload .buildkite/pipeline_jax.yml
+    # NOTE(review): upload disabled by this patch, presumably only for testing -- confirm and restore before merging: buildkite-agent pipeline upload .buildkite/pipeline_jax_tpu7x.yml
     # buildkite-agent pipeline upload .buildkite/pipeline_torch.yml
     buildkite-agent pipeline upload .buildkite/main.yml
-    buildkite-agent pipeline upload .buildkite/nightly_releases.yml
+    # NOTE(review): upload disabled by this patch, presumably only for testing -- confirm and restore before merging: buildkite-agent pipeline upload .buildkite/nightly_releases.yml
     buildkite-agent pipeline upload .buildkite/pipeline_pypi.yml
 }
 
diff --git a/.buildkite/scripts/run_with_pypi.sh b/.buildkite/scripts/run_with_pypi.sh
index a9ea777fb..724fcdedb 100755
--- a/.buildkite/scripts/run_with_pypi.sh
+++ b/.buildkite/scripts/run_with_pypi.sh
@@ -2,6 +2,42 @@
 # Exit on error, exit on unset variable, fail on pipe errors.
 set -euo pipefail
 
+# Get the nightly TPU_INFERENCE_VERSION based on the latest stable tag and current date.
+# '|| true' keeps a no-match or SIGPIPE from aborting under 'set -eo pipefail'; the ':?' check reports it instead.
+LATEST_STABLE_TAG=$(git tag --sort=-v:refname | grep -m 1 -E '^v[0-9]+\.[0-9]+\.[0-9]+$' || true)
+: "${LATEST_STABLE_TAG:?No stable vX.Y.Z git tag found; make sure tags are fetched in this checkout}"
+BASE_VERSION=${LATEST_STABLE_TAG#v}
+# TODO: Temporary logic for testing. Remove 'yesterday' before merging.
+DATETIME_STR=$(date -d 'yesterday' +%Y%m%d)
+TPU_INFERENCE_VERSION="${BASE_VERSION}.dev${DATETIME_STR}"
+
+echo "Target Nightly Version: ${TPU_INFERENCE_VERSION}"
+
+# Configuration
+PACKAGE_NAME="tpu-inference"
+MAX_RETRIES=20
+SLEEP_SEC=60
+FOUND_VERSION=false
+
+echo "Checking PyPI for ${PACKAGE_NAME} == ${TPU_INFERENCE_VERSION}..."
+# Poll PyPI. The listing is captured first, then matched as a fixed-string whole word: dots stay
+# literal, a longer version cannot match by prefix, and pip never writes into a pipe grep -q closed.
+for ((i=1; i<=MAX_RETRIES; i++)); do
+    AVAILABLE_VERSIONS=$(pip index versions "${PACKAGE_NAME}" --pre 2>/dev/null || true)
+    if grep -qwF -- "${TPU_INFERENCE_VERSION}" <<< "${AVAILABLE_VERSIONS}"; then
+        echo "Success! Found version ${TPU_INFERENCE_VERSION} on PyPI."
+        FOUND_VERSION=true
+        break
+    fi
+    echo "[Attempt $i/$MAX_RETRIES] Version not found yet. Waiting ${SLEEP_SEC} seconds..."
+    [ "$i" -ge "$MAX_RETRIES" ] || sleep "$SLEEP_SEC"
+done
+
+if [ "$FOUND_VERSION" = "false" ]; then
+    echo "The version ${TPU_INFERENCE_VERSION} was not found on PyPI."
+    exit 1
+fi
+
 # Build vllm-tpu with nightly tpu-inference from PyPI (using docker/Dockerfile.pypi instead of docker/Dockerfile).
 export RUN_WITH_PYPI="true"
 SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
diff --git a/docker/Dockerfile.pypi b/docker/Dockerfile.pypi
index 45b1898c0..032f30ac4 100644
--- a/docker/Dockerfile.pypi
+++ b/docker/Dockerfile.pypi
@@ -5,6 +5,8 @@ ARG VLLM_COMMIT_HASH=""
 
 FROM $BASE_IMAGE
 
+ARG IS_FOR_V7X="false"
+
 # Remove existing versions of dependencies
 RUN pip uninstall -y torch torch_xla torchvision
 
@@ -17,7 +19,11 @@ RUN apt-get update && apt-get install -y \
 WORKDIR /workspace/tpu_inference
 COPY requirements_benchmarking.txt .
 # These are needed for the E2E benchmarking tests (i.e. tests/e2e/benchmarking/mlperf.sh)
-RUN pip install -r requirements_benchmarking.txt --retries 3
+RUN --mount=type=cache,target=/root/.cache/pip pip install -r requirements_benchmarking.txt --retries 3
+COPY requirements_v7x.txt .
+RUN --mount=type=cache,target=/root/.cache/pip if [ "$IS_FOR_V7X" = "true" ]; then \
+        pip install -r requirements_v7x.txt; \
+    fi
 COPY . .
 
 # Build vllm-tpu wheel