Commit 87a31c2

Author: Ylang Tsou (committed)
Add Dockerfile to verify vllm-tpu wheel
Signed-off-by: Ylang Tsou <[email protected]>
1 parent cf4f845 commit 87a31c2

File tree: 5 files changed, +288 -3 lines changed


.buildkite/pipeline_test_pypi.yml

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
+steps:
+  # -----------------------------------------------------------------
+  # TEST STEPS - Calling wrapper
+  # -----------------------------------------------------------------
+  - label: "Performance benchmarks for meta-llama/Llama-3.1-8B-Instruct"
+    key: "meta-llama_Llama-3_1-8B-Instruct_Benchmark"
+    agents:
+      queue: tpu_v6e_queue
+    env:
+      TEST_MODEL: meta-llama/Llama-3.1-8B-Instruct
+      TENSOR_PARALLEL_SIZE: 1
+      MINIMUM_THROUGHPUT_THRESHOLD: 10.77
+      INPUT_LEN: 1800
+      OUTPUT_LEN: 128
+      PREFIX_LEN: 0
+      MAX_MODEL_LEN: 2048
+      MAX_NUM_SEQS: 256
+      MAX_NUM_BATCHED_TOKENS: 1024
+    commands:
+      - |
+        .buildkite/scripts/run_with_pypi.sh bash /workspace/tpu_inference/tests/e2e/benchmarking/benchmark.sh
+
+  - label: "Performance benchmarks for Qwen/Qwen3-4B"
+    key: "Qwen_Qwen3-4B_Benchmark"
+    agents:
+      queue: tpu_v6e_queue
+    env:
+      TEST_MODEL: Qwen/Qwen3-4B
+      TENSOR_PARALLEL_SIZE: 1
+      MINIMUM_THROUGHPUT_THRESHOLD: 11.00
+      INPUT_LEN: 1800
+      OUTPUT_LEN: 128
+      PREFIX_LEN: 0
+      MAX_MODEL_LEN: 2048
+      MAX_NUM_SEQS: 94
+      MAX_NUM_BATCHED_TOKENS: 4096
+    commands:
+      - |
+        .buildkite/scripts/run_with_pypi.sh bash /workspace/tpu_inference/tests/e2e/benchmarking/benchmark.sh
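The two steps above only differ in the model and its tuning values. For local debugging, either one can be reproduced outside Buildkite by exporting the same variables and calling the wrapper directly. A minimal sketch, assuming a TPU v6e host with Docker and this repository checked out at its root (BUILDKITE_COMMIT is normally injected by the agent, so it is faked here for a manual run):

    # Values copied from the first benchmark step above.
    export TEST_MODEL=meta-llama/Llama-3.1-8B-Instruct
    export TENSOR_PARALLEL_SIZE=1
    export MINIMUM_THROUGHPUT_THRESHOLD=10.77
    export INPUT_LEN=1800 OUTPUT_LEN=128 PREFIX_LEN=0
    export MAX_MODEL_LEN=2048 MAX_NUM_SEQS=256 MAX_NUM_BATCHED_TOKENS=1024
    # Any unique tag works here; the wrapper only uses it to tag the Docker image.
    export BUILDKITE_COMMIT=$(git rev-parse HEAD)
    .buildkite/scripts/run_with_pypi.sh bash /workspace/tpu_inference/tests/e2e/benchmarking/benchmark.sh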

.buildkite/scripts/bootstrap.sh

Lines changed: 3 additions & 3 deletions
@@ -24,10 +24,10 @@ upload_pipeline() {
   VLLM_COMMIT_HASH=$(git ls-remote https://github.com/vllm-project/vllm.git HEAD | awk '{ print $1}')
   buildkite-agent meta-data set "VLLM_COMMIT_HASH" "${VLLM_COMMIT_HASH}"
   echo "Using vllm commit hash: $(buildkite-agent meta-data get "VLLM_COMMIT_HASH")"
-  buildkite-agent pipeline upload .buildkite/pipeline_jax.yml
+  buildkite-agent pipeline upload .buildkite/pipeline_test_pypi.yml
   # buildkite-agent pipeline upload .buildkite/pipeline_torch.yml
-  buildkite-agent pipeline upload .buildkite/main.yml
-  buildkite-agent pipeline upload .buildkite/nightly_releases.yml
+  # buildkite-agent pipeline upload .buildkite/main.yml
+  # buildkite-agent pipeline upload .buildkite/nightly_releases.yml
 }

 echo "--- Starting Buildkite Bootstrap ---"
.buildkite/scripts/build_vllm_tpu.sh

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
+#!/bin/bash
+
+set -e
+
+# --- Script Configuration ---
+TPU_INFERENCE_VERSION=$1
+VLLM_TPU_VERSION=$2
+VLLM_BRANCH=${3:-"main"}
+VLLM_REPO="https://github.com/vllm-project/vllm.git"
+REPO_DIR="vllm"
+
+# --- Argument Validation ---
+if [ "$#" -lt 2 ]; then
+  echo "Usage: $0 <tpu-inference-version> <vllm-tpu-version> [vllm-branch-or-tag]"
+  echo "       [vllm-branch-or-tag] is optional, defaults to 'main'."
+  exit 1
+fi
+
+echo "--- Starting vLLM-TPU wheel build ---"
+echo "TPU Inference Version: ${TPU_INFERENCE_VERSION}"
+echo "vLLM-TPU Version: ${VLLM_TPU_VERSION}"
+echo "vLLM Branch/Tag: ${VLLM_BRANCH}"
+
+# --- Step 1: Clone vLLM repository ---
+if [ -d "$REPO_DIR" ]; then
+  echo "Repository '$REPO_DIR' already exists. Skipping clone."
+else
+  echo "Cloning vLLM repository..."
+  git clone ${VLLM_REPO}
+fi
+cd ${REPO_DIR}
+
+# --- Step 1.5: Checkout the specified vLLM branch/tag ---
+echo "Checking out vLLM branch/tag: ${VLLM_BRANCH}..."
+if ! git checkout "${VLLM_BRANCH}"; then
+  echo "ERROR: Failed to checkout branch/tag '${VLLM_BRANCH}'. Please check the branch/tag name."
+  exit 1
+fi
+echo "Successfully checked out ${VLLM_BRANCH}."
+git pull || echo "Warning: Failed to pull updates (may be on a tag)."
+
+# --- Step 2: Update tpu-inference version in requirements ---
+REQUIRED_LINE="tpu-inference==${TPU_INFERENCE_VERSION}"
+REQUIREMENTS_FILE="requirements/tpu.txt"
+BACKUP_FILE="${REQUIREMENTS_FILE}.bak"
+
+echo "Updating tpu-inference version in $REQUIREMENTS_FILE..."
+
+if [ -f "$REQUIREMENTS_FILE" ]; then
+  # Check if the last character is NOT a newline. If not, append one.
+  if [ "$(tail -c 1 "$REQUIREMENTS_FILE")" != "" ]; then
+    echo "" >> "$REQUIREMENTS_FILE"
+    echo "(Action: Added missing newline to the end of $REQUIREMENTS_FILE for safety.)"
+  fi
+fi
+
+if grep -q "^tpu-inference==" "$REQUIREMENTS_FILE"; then
+  # Replace the existing version using sed, which creates the .bak file
+  echo "(Action: Existing version found. Replacing.)"
+  sed -i.bak "s/^tpu-inference==.*/$REQUIRED_LINE/" "$REQUIREMENTS_FILE"
+
+else
+  # Line not found -> Append the new line to the file end, and manually create .bak
+  echo "(Action: Line not found. Appending new dependency.)"
+  echo "$REQUIRED_LINE" >> "$REQUIREMENTS_FILE"
+
+  # Create an empty .bak file for consistency, so cleanup works later.
+  touch "$BACKUP_FILE"
+fi
+
+# --- Step 3: Execute the vLLM TPU build script ---
+echo "Ensuring 'build' package is installed..."
+pip install build
+echo "Executing the vLLM TPU build script..."
+bash tools/vllm-tpu/build.sh "${VLLM_TPU_VERSION}"
+
+echo "--- Build complete! ---"
+echo "The wheel file can be found in the 'vllm/dist' directory."
+
+# --- Step 4: Cleanup and Revert Requirements File ---
+echo "--- Cleaning up local changes ---"
+
+if [ -f "$BACKUP_FILE" ]; then
+  echo "Reverting $REQUIREMENTS_FILE from backup."
+  # Remove the modified file
+  rm -f "$REQUIREMENTS_FILE"
+  # Rename the backup file back to the original name
+  mv "$BACKUP_FILE" "$REQUIREMENTS_FILE"
+else
+  echo "Warning: Backup file $BACKUP_FILE not found. Skipping revert."
+fi
+
+echo "Cleanup complete. Script finished."
.buildkite/scripts/run_with_pypi.sh

Lines changed: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
+#!/bin/bash
+#
+# .buildkite/run_with_pypi.sh
+# ---------------------------
+
+# Exit on error, exit on unset variable, fail on pipe errors.
+set -euo pipefail
+
+if [ "$#" -eq 0 ]; then
+  echo "ERROR: Usage: $0 <command_and_args_to_run_with_pypi...>"
+  exit 1
+fi
+
+ENV_VARS=(
+  -e TEST_MODEL="${TEST_MODEL:-}"
+  -e MINIMUM_ACCURACY_THRESHOLD="${MINIMUM_ACCURACY_THRESHOLD:-}"
+  -e MINIMUM_THROUGHPUT_THRESHOLD="${MINIMUM_THROUGHPUT_THRESHOLD:-}"
+  -e TENSOR_PARALLEL_SIZE="${TENSOR_PARALLEL_SIZE:-}"
+  -e INPUT_LEN="${INPUT_LEN:-}"
+  -e OUTPUT_LEN="${OUTPUT_LEN:-}"
+  -e PREFIX_LEN="${PREFIX_LEN:-}"
+  -e MAX_MODEL_LEN="${MAX_MODEL_LEN:-}"
+  -e MAX_NUM_SEQS="${MAX_NUM_SEQS:-}"
+  -e MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-}"
+)
+
+if ! grep -q "^HF_TOKEN=" /etc/environment; then
+  gcloud secrets versions access latest --secret=bm-agent-hf-token --quiet | \
+    sudo tee -a /etc/environment > /dev/null <<< "HF_TOKEN=$(cat)"
+  echo "Added HF_TOKEN to /etc/environment."
+else
+  echo "HF_TOKEN already exists in /etc/environment."
+fi
+
+# shellcheck disable=1091
+source /etc/environment
+
+if [ -z "${BUILDKITE_COMMIT:-}" ]; then
+  echo "ERROR: BUILDKITE_COMMIT environment variable is not set." >&2
+  echo "This script expects BUILDKITE_COMMIT to tag the Docker image." >&2
+  exit 1
+fi
+
+if [ -z "${MODEL_IMPL_TYPE:-}" ]; then
+  MODEL_IMPL_TYPE=flax_nnx
+fi
+
+# Try to cache HF models
+persist_cache_dir="/mnt/disks/persist/models"
+
+if ( mkdir -p "$persist_cache_dir" ); then
+  LOCAL_HF_HOME="$persist_cache_dir"
+else
+  echo "Error: Failed to create $persist_cache_dir"
+  exit 1
+fi
+DOCKER_HF_HOME="/tmp/hf_home"
+
+# (TODO): Consider creating a remote registry to cache and share between agents.
+# Subsequent builds on the same host should be cached.
+
+# Cleanup of existing containers and images.
+echo "Starting cleanup for vllm-tpu..."
+# Get all unique image IDs for the repository 'vllm-tpu'
+old_images=$(docker images vllm-tpu -q | uniq)
+total_containers=""
+
+if [ -n "$old_images" ]; then
+  echo "Found old vllm-tpu images. Checking for dependent containers..."
+  # Loop through each image ID and find any containers (running or not) using it.
+  for img_id in $old_images; do
+    total_containers="$total_containers $(docker ps -a -q --filter "ancestor=$img_id")"
+  done
+
+  # Remove any found containers
+  if [ -n "$total_containers" ]; then
+    echo "Removing leftover containers using vllm-tpu image(s)..."
+    echo "$total_containers" | xargs -n1 | sort -u | xargs -r docker rm -f
+  fi
+
+  echo "Removing old vllm-tpu image(s)..."
+  docker rmi -f "$old_images"
+else
+  echo "No vllm-tpu images found to clean up."
+fi
+
+echo "Pruning old Docker build cache..."
+docker builder prune -f
+
+echo "Cleanup complete."
+
+echo "Installing Python dependencies"
+python3 -m pip install --progress-bar off buildkite-test-collector==0.1.9
+echo "Python dependencies installed"
+
+
+echo "--- Displaying current disk usage (df -h) ---"
+df -h
+echo "-----------------------------------------------"
+
+IMAGE_NAME="vllm-tpu"
+docker build --no-cache -f docker/Dockerfile.pypi -t "${IMAGE_NAME}:${BUILDKITE_COMMIT}" .
+
+exec docker run \
+  --privileged \
+  --net host \
+  --shm-size=16G \
+  --rm \
+  -v "$LOCAL_HF_HOME":"$DOCKER_HF_HOME" \
+  "${ENV_VARS[@]}" \
+  -e HF_HOME="$DOCKER_HF_HOME" \
+  -e MODEL_IMPL_TYPE="$MODEL_IMPL_TYPE" \
+  -e HF_TOKEN="$HF_TOKEN" \
+  -e VLLM_XLA_CACHE_PATH="$DOCKER_HF_HOME/.cache/jax_cache" \
+  -e VLLM_XLA_CHECK_RECOMPILATION=1 \
+  ${QUANTIZATION:+-e QUANTIZATION="$QUANTIZATION"} \
+  ${NEW_MODEL_DESIGN:+-e NEW_MODEL_DESIGN="$NEW_MODEL_DESIGN"} \
+  ${USE_V6E8_QUEUE:+-e USE_V6E8_QUEUE="$USE_V6E8_QUEUE"} \
+  ${SKIP_ACCURACY_TESTS:+-e SKIP_ACCURACY_TESTS="$SKIP_ACCURACY_TESTS"} \
+  ${VLLM_MLA_DISABLE:+-e VLLM_MLA_DISABLE="$VLLM_MLA_DISABLE"} \
+  "${IMAGE_NAME}:${BUILDKITE_COMMIT}" \
+  "$@" # Pass all script arguments as the command to run in the container
echo "docker run complete"

docker/Dockerfile.pypi

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
+ARG NIGHTLY_DATE="20250714"
+ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.12_tpuvm_$NIGHTLY_DATE"
+
+FROM $BASE_IMAGE
+
+# Remove existing versions of dependencies
+RUN pip uninstall -y torch torch_xla torchvision
+
+# Install some basic utilities
+RUN apt-get update && apt-get install -y \
+    git \
+    libopenblas-base libopenmpi-dev libomp-dev
+
+# Install tpu_inference
+WORKDIR /workspace/tpu_inference
+COPY . .
+RUN export TPU_INFERENCE_VERSION=$(pip index versions tpu-inference --pre 2>/dev/null | grep -oE "[0-9]+\.[0-9]+\.[0-9]+\.dev[0-9]+" | head -n 1) && \
+    echo -n "${TPU_INFERENCE_VERSION}" > /tmp/tpu_inference_version
+
+# Build vllm-tpu wheel
+WORKDIR /workspace
+RUN export VLLM_TPU_VERSION=$(cat /tmp/tpu_inference_version) && \
+    bash tpu_inference/.buildkite/scripts/build_vllm_tpu.sh ${VLLM_TPU_VERSION} ${VLLM_TPU_VERSION}
+
+# Install vllm-tpu wheel
+WORKDIR /workspace/vllm
+RUN pip install --no-cache-dir dist/*.whl
+
+
+CMD ["/bin/bash"]
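This image is what run_with_pypi.sh builds and runs. A manual sketch of the same flow from the repository root on a TPU VM (the tag is arbitrary, and the smoke test assumes the installed wheel exposes vllm.__version__):

    # Build the verification image; the build pulls the newest tpu-inference pre-release
    # from PyPI and compiles a matching vllm-tpu wheel inside the image.
    docker build -f docker/Dockerfile.pypi -t vllm-tpu:local .
    # Quick smoke test of the installed wheel.
    docker run --rm --privileged --net host vllm-tpu:local \
      python3 -c "import vllm; print(vllm.__version__)"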
