From 1bed7e4ef377950e3e60053b7d6e7a86d285a750 Mon Sep 17 00:00:00 2001
From: Ylang Tsou
Date: Thu, 11 Dec 2025 16:34:15 +0800
Subject: [PATCH 1/2] Add workflow to build vLLM-TPU wheel using PyPI tpu-inference

Signed-off-by: Ylang Tsou
---
Reviewer notes on this revision:
  * build_vllm_tpu.sh: back up requirements/tpu.txt with a real copy before
    editing it and restore it from an EXIT trap. The previous version created
    an empty .bak when the tpu-inference line was missing and then moved that
    empty file over requirements/tpu.txt during cleanup.
  * build_vllm_tpu.sh: quote clone/cd arguments, name the required arguments
    in the usage text, reject empty version arguments.
  * setup_docker_env.sh / Dockerfile.pypi: quote expansions.
  * Hunk headers and the diffstat were recomputed by hand; blob ids on the
    index lines were NOT regenerated. Leading whitespace of context lines was
    reconstructed by convention - verify against the target tree.

 .buildkite/pipeline_pypi.yml           |  54 ++++++++++++++
 .buildkite/scripts/bootstrap.sh        |   1 +
 .buildkite/scripts/build_vllm_tpu.sh   | 105 ++++++++++++++++++++++++++
 .buildkite/scripts/run_with_pypi.sh    |  10 +++
 .buildkite/scripts/setup_docker_env.sh |  12 ++-
 docker/Dockerfile.pypi                 |  33 ++++++++
 6 files changed, 214 insertions(+), 1 deletion(-)
 create mode 100644 .buildkite/pipeline_pypi.yml
 create mode 100755 .buildkite/scripts/build_vllm_tpu.sh
 create mode 100755 .buildkite/scripts/run_with_pypi.sh
 create mode 100644 docker/Dockerfile.pypi

diff --git a/.buildkite/pipeline_pypi.yml b/.buildkite/pipeline_pypi.yml
new file mode 100644
index 000000000..b78cabf85
--- /dev/null
+++ b/.buildkite/pipeline_pypi.yml
@@ -0,0 +1,54 @@
+steps:
+  # -----------------------------------------------------------------
+  # TEST STEPS - Calling wrapper
+  # -----------------------------------------------------------------
+  - label: "Wait for 20 mins"
+    if: build.env("NIGHTLY") == "1"
+    key: "wait_20_minutes"
+    depends_on: "record_verified_commit_hashes"
+    agents:
+      queue: cpu
+    commands:
+      - "echo 'Starting 20 minute delay...'"
+      - "sleep 1200"
+      - "echo 'Delay finished, starting benchmarks.'"
+
+  - label: "Performance benchmarks for meta-llama/Llama-3.1-8B-Instruct"
+    key: "meta-llama_Llama-3_1-8B-Instruct_Benchmark"
+    if: build.env("NIGHTLY") == "1"
+    depends_on: "wait_20_minutes"
+    agents:
+      queue: tpu_v6e_queue
+    env:
+      TEST_MODEL: meta-llama/Llama-3.1-8B-Instruct
+      TENSOR_PARALLEL_SIZE: 1
+      MINIMUM_THROUGHPUT_THRESHOLD: 10.77
+      INPUT_LEN: 1800
+      OUTPUT_LEN: 128
+      PREFIX_LEN: 0
+      MAX_MODEL_LEN: 2048
+      MAX_NUM_SEQS: 256
+      MAX_NUM_BATCHED_TOKENS: 1024
+    commands:
+      - |
+        .buildkite/scripts/run_with_pypi.sh bash /workspace/tpu_inference/tests/e2e/benchmarking/benchmark.sh
+
+  - label: "Performance benchmarks for Qwen/Qwen3-4B"
+    if: build.env("NIGHTLY") == "1"
+    key: "Qwen_Qwen3-4B_Benchmark"
+    depends_on: "wait_20_minutes"
+    agents:
+      queue: tpu_v6e_queue
+    env:
+      TEST_MODEL: Qwen/Qwen3-4B
+      TENSOR_PARALLEL_SIZE: 1
+      MINIMUM_THROUGHPUT_THRESHOLD: 11.00
+      INPUT_LEN: 1800
+      OUTPUT_LEN: 128
+      PREFIX_LEN: 0
+      MAX_MODEL_LEN: 2048
+      MAX_NUM_SEQS: 94
+      MAX_NUM_BATCHED_TOKENS: 4096
+    commands:
+      - |
+        .buildkite/scripts/run_with_pypi.sh bash /workspace/tpu_inference/tests/e2e/benchmarking/benchmark.sh
diff --git a/.buildkite/scripts/bootstrap.sh b/.buildkite/scripts/bootstrap.sh
index d567b3004..927b62341 100755
--- a/.buildkite/scripts/bootstrap.sh
+++ b/.buildkite/scripts/bootstrap.sh
@@ -29,6 +29,7 @@ upload_pipeline() {
     # buildkite-agent pipeline upload .buildkite/pipeline_torch.yml
     buildkite-agent pipeline upload .buildkite/main.yml
     buildkite-agent pipeline upload .buildkite/nightly_releases.yml
+    buildkite-agent pipeline upload .buildkite/pipeline_pypi.yml
 }
 
 echo "--- Starting Buildkite Bootstrap ---"
diff --git a/.buildkite/scripts/build_vllm_tpu.sh b/.buildkite/scripts/build_vllm_tpu.sh
new file mode 100755
index 000000000..73ac29a7a
--- /dev/null
+++ b/.buildkite/scripts/build_vllm_tpu.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+#
+# Build a vLLM-TPU wheel pinned to a specific tpu-inference release.
+#
+# Usage:
+#   build_vllm_tpu.sh <tpu-inference-version> <vllm-tpu-version> [vllm-branch-or-tag]
+#
+# Clones vLLM into ./vllm (unless that directory already exists), checks out
+# the requested ref, pins tpu-inference in requirements/tpu.txt, runs
+# tools/vllm-tpu/build.sh and restores requirements/tpu.txt on exit.
+
+set -e
+
+# --- Argument Validation ---
+# Empty strings are rejected too, so a failed version lookup in the caller
+# cannot slip through as a quoted "" argument.
+if [ "$#" -lt 2 ] || [ -z "$1" ] || [ -z "$2" ]; then
+  echo "Usage: $0 <tpu-inference-version> <vllm-tpu-version> [vllm-branch-or-tag]"
+  echo "  [vllm-branch-or-tag] is optional, defaults to 'main'."
+  exit 1
+fi
+
+# --- Script Configuration ---
+TPU_INFERENCE_VERSION=$1
+VLLM_TPU_VERSION=$2
+VLLM_BRANCH=${3:-"main"}
+VLLM_REPO="https://github.com/vllm-project/vllm.git"
+REPO_DIR="vllm"
+
+echo "--- Starting vLLM-TPU wheel build ---"
+echo "TPU Inference Version: ${TPU_INFERENCE_VERSION}"
+echo "vLLM-TPU Version: ${VLLM_TPU_VERSION}"
+echo "vLLM Branch/Tag: ${VLLM_BRANCH}"
+
+# --- Step 1: Clone vLLM repository ---
+if [ -d "$REPO_DIR" ]; then
+  echo "Repository '$REPO_DIR' already exists. Skipping clone."
+else
+  echo "Cloning vLLM repository..."
+  git clone "$VLLM_REPO" "$REPO_DIR"
+fi
+cd "$REPO_DIR"
+
+# --- Step 1.5: Checkout the specified vLLM branch/tag ---
+echo "Checking out vLLM branch/tag: ${VLLM_BRANCH}..."
+if ! git checkout "${VLLM_BRANCH}"; then
+  echo "ERROR: Failed to checkout branch/tag '${VLLM_BRANCH}'. Please check the branch/tag name."
+  exit 1
+fi
+echo "Successfully checked out ${VLLM_BRANCH}."
+git pull || echo "Warning: Failed to pull updates (may be on a tag)."
+
+# --- Step 2: Update tpu-inference version in requirements ---
+REQUIRED_LINE="tpu-inference==${TPU_INFERENCE_VERSION}"
+REQUIREMENTS_FILE="requirements/tpu.txt"
+BACKUP_FILE="${REQUIREMENTS_FILE}.bak"
+
+# --- Step 4 (registered early): Cleanup and Revert Requirements File ---
+# Runs from an EXIT trap so the checkout is restored even when the build
+# aborts under `set -e`.
+restore_requirements() {
+  echo "--- Cleaning up local changes ---"
+  if [ -f "$BACKUP_FILE" ]; then
+    echo "Reverting $REQUIREMENTS_FILE from backup."
+    mv -f "$BACKUP_FILE" "$REQUIREMENTS_FILE"
+  else
+    echo "Warning: Backup file $BACKUP_FILE not found. Skipping revert."
+  fi
+  echo "Cleanup complete. Script finished."
+}
+trap restore_requirements EXIT
+
+echo "Updating tpu-inference version in $REQUIREMENTS_FILE..."
+
+if [ -f "$REQUIREMENTS_FILE" ]; then
+  # Snapshot the pristine file before touching it, so the revert restores the
+  # real original content rather than an empty placeholder.
+  cp -p "$REQUIREMENTS_FILE" "$BACKUP_FILE"
+
+  # Check if the last character is NOT a newline. If not, append one.
+  if [ "$(tail -c 1 "$REQUIREMENTS_FILE")" != "" ]; then
+    echo "" >> "$REQUIREMENTS_FILE"
+    echo "(Action: Added missing newline to the end of $REQUIREMENTS_FILE for safety.)"
+  fi
+fi
+
+if grep -q "^tpu-inference==" "$REQUIREMENTS_FILE"; then
+  echo "(Action: Existing version found. Replacing.)"
+  # Go through a temp file instead of `sed -i.bak`, which would overwrite the
+  # pristine backup taken above.
+  sed "s/^tpu-inference==.*/$REQUIRED_LINE/" "$REQUIREMENTS_FILE" > "${REQUIREMENTS_FILE}.tmp"
+  mv -f "${REQUIREMENTS_FILE}.tmp" "$REQUIREMENTS_FILE"
+else
+  echo "(Action: Line not found. Appending new dependency.)"
+  echo "$REQUIRED_LINE" >> "$REQUIREMENTS_FILE"
+fi
+
+# --- Step 3: Execute the vLLM TPU build script ---
+echo "Ensuring 'build' package is installed..."
+pip install build
+echo "Executing the vLLM TPU build script..."
+bash tools/vllm-tpu/build.sh "${VLLM_TPU_VERSION}"
+
+echo "--- Build complete! ---"
+echo "The wheel file can be found in the 'vllm/dist' directory."
diff --git a/.buildkite/scripts/run_with_pypi.sh b/.buildkite/scripts/run_with_pypi.sh
new file mode 100755
index 000000000..a9ea777fb
--- /dev/null
+++ b/.buildkite/scripts/run_with_pypi.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# Exit on error, exit on unset variable, fail on pipe errors.
+set -euo pipefail
+
+# Build vllm-tpu with nightly tpu-inference from PyPI (using docker/Dockerfile.pypi instead of docker/Dockerfile).
+export RUN_WITH_PYPI="true"
+SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
+
+# shellcheck disable=SC1091
+source "$SCRIPT_DIR/run_in_docker.sh"
diff --git a/.buildkite/scripts/setup_docker_env.sh b/.buildkite/scripts/setup_docker_env.sh
index 3aded737b..06a2acebf 100644
--- a/.buildkite/scripts/setup_docker_env.sh
+++ b/.buildkite/scripts/setup_docker_env.sh
@@ -7,6 +7,16 @@ setup_environment() {
   local image_name_param=${1:-"vllm-tpu"}
   IMAGE_NAME="$image_name_param"
 
+  local DOCKERFILE_NAME="Dockerfile"
+
+  # Determine whether to build from PyPI packages or source.
+  if [[ "${RUN_WITH_PYPI:-false}" == "true" ]]; then
+    DOCKERFILE_NAME="Dockerfile.pypi"
+    echo "Building from PyPI packages. Using docker/${DOCKERFILE_NAME}"
+  else
+    echo "Building from source. Using docker/${DOCKERFILE_NAME}"
+  fi
+
   if ! grep -q "^HF_TOKEN=" /etc/environment; then
     gcloud secrets versions access latest --secret=bm-agent-hf-token --quiet | \
     sudo tee -a /etc/environment > /dev/null <<< "HF_TOKEN=$(cat)"
@@ -60,5 +70,5 @@ setup_environment() {
   docker build \
       --build-arg VLLM_COMMIT_HASH="${VLLM_COMMIT_HASH}" \
       --build-arg IS_FOR_V7X="${IS_FOR_V7X:-false}" \
-      --no-cache -f docker/Dockerfile -t "${IMAGE_NAME}:${BUILDKITE_COMMIT}" .
+      --no-cache -f "docker/${DOCKERFILE_NAME}" -t "${IMAGE_NAME}:${BUILDKITE_COMMIT}" .
 }
diff --git a/docker/Dockerfile.pypi b/docker/Dockerfile.pypi
new file mode 100644
index 000000000..45b1898c0
--- /dev/null
+++ b/docker/Dockerfile.pypi
@@ -0,0 +1,33 @@
+ARG NIGHTLY_DATE="20250714"
+ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.12_tpuvm_$NIGHTLY_DATE"
+# The latest main will be used if arg unspecified
+ARG VLLM_COMMIT_HASH=""
+
+FROM $BASE_IMAGE
+
+# Remove existing versions of dependencies
+RUN pip uninstall -y torch torch_xla torchvision
+
+# Install some basic utilities
+RUN apt-get update && apt-get install -y \
+    git \
+    libopenblas-base libopenmpi-dev libomp-dev
+
+# Install tpu_inference
+WORKDIR /workspace/tpu_inference
+COPY requirements_benchmarking.txt .
+# These are needed for the E2E benchmarking tests (i.e. tests/e2e/benchmarking/mlperf.sh)
+RUN pip install -r requirements_benchmarking.txt --retries 3
+COPY . .
+
+# Build vllm-tpu wheel
+WORKDIR /workspace
+ARG VLLM_COMMIT_HASH
+RUN TPU_INFERENCE_VERSION=$(pip index versions tpu-inference --pre 2>/dev/null | grep -oE "[0-9]+\.[0-9]+\.[0-9]+\.dev[0-9]+" | head -n 1) && VLLM_TPU_VERSION=${TPU_INFERENCE_VERSION} && \
+    bash tpu_inference/.buildkite/scripts/build_vllm_tpu.sh "${TPU_INFERENCE_VERSION}" "${VLLM_TPU_VERSION}" "${VLLM_COMMIT_HASH}"
+
+# Install vllm-tpu wheel
+RUN pip install --no-cache-dir vllm/dist/*.whl
+
+
+CMD ["/bin/bash"]

From 981ba3bda33090a3ffb3817997bbbe9337a2bb3c Mon Sep 17 00:00:00 2001
From: Ylang Tsou
Date: Fri, 12 Dec 2025 10:59:09 +0800
Subject: [PATCH 2/2] add check pypi version

Signed-off-by: Ylang Tsou
---
Reviewer notes on this revision:
  * run_with_pypi.sh: capture the `pip index versions` output before matching
    (a direct `pip ... | grep -q` can fail under `set -o pipefail` when grep
    exits early) and match the version as a fixed, whole-word string.
  * The commented-out pipeline uploads / `#if:` lines and the 'yesterday'
    TODO are left exactly as submitted; they still need to be reverted before
    merging.
  * Hunk headers and the diffstat were recomputed by hand; blob ids on the
    index lines were NOT regenerated.

 .buildkite/pipeline_pypi.yml        | 19 +++---------
 .buildkite/scripts/bootstrap.sh     |  6 ++--
 .buildkite/scripts/run_with_pypi.sh | 40 ++++++++++++++++++++++++++
 docker/Dockerfile.pypi              |  8 +++++-
 4 files changed, 54 insertions(+), 19 deletions(-)

diff --git a/.buildkite/pipeline_pypi.yml b/.buildkite/pipeline_pypi.yml
index b78cabf85..482fe7d85 100644
--- a/.buildkite/pipeline_pypi.yml
+++ b/.buildkite/pipeline_pypi.yml
@@ -2,21 +2,10 @@ steps:
   # -----------------------------------------------------------------
   # TEST STEPS - Calling wrapper
   # -----------------------------------------------------------------
-  - label: "Wait for 20 mins"
-    if: build.env("NIGHTLY") == "1"
-    key: "wait_20_minutes"
-    depends_on: "record_verified_commit_hashes"
-    agents:
-      queue: cpu
-    commands:
-      - "echo 'Starting 20 minute delay...'"
-      - "sleep 1200"
-      - "echo 'Delay finished, starting benchmarks.'"
-
   - label: "Performance benchmarks for meta-llama/Llama-3.1-8B-Instruct"
     key: "meta-llama_Llama-3_1-8B-Instruct_Benchmark"
-    if: build.env("NIGHTLY") == "1"
-    depends_on: "wait_20_minutes"
+    #if: build.env("NIGHTLY") == "1"
+    depends_on: "record_verified_commit_hashes"
     agents:
       queue: tpu_v6e_queue
     env:
@@ -34,9 +23,9 @@ steps:
         .buildkite/scripts/run_with_pypi.sh bash /workspace/tpu_inference/tests/e2e/benchmarking/benchmark.sh
 
   - label: "Performance benchmarks for Qwen/Qwen3-4B"
-    if: build.env("NIGHTLY") == "1"
+    #if: build.env("NIGHTLY") == "1"
     key: "Qwen_Qwen3-4B_Benchmark"
-    depends_on: "wait_20_minutes"
+    depends_on: "record_verified_commit_hashes"
     agents:
       queue: tpu_v6e_queue
     env:
diff --git a/.buildkite/scripts/bootstrap.sh b/.buildkite/scripts/bootstrap.sh
index 927b62341..9a9ce4ff8 100755
--- a/.buildkite/scripts/bootstrap.sh
+++ b/.buildkite/scripts/bootstrap.sh
@@ -24,11 +24,11 @@ upload_pipeline() {
     VLLM_COMMIT_HASH=$(git ls-remote https://github.com/vllm-project/vllm.git HEAD | awk '{ print $1}')
     buildkite-agent meta-data set "VLLM_COMMIT_HASH" "${VLLM_COMMIT_HASH}"
     echo "Using vllm commit hash: $(buildkite-agent meta-data get "VLLM_COMMIT_HASH")"
-    buildkite-agent pipeline upload .buildkite/pipeline_jax.yml
-    buildkite-agent pipeline upload .buildkite/pipeline_jax_tpu7x.yml
+    #buildkite-agent pipeline upload .buildkite/pipeline_jax.yml
+    #buildkite-agent pipeline upload .buildkite/pipeline_jax_tpu7x.yml
     # buildkite-agent pipeline upload .buildkite/pipeline_torch.yml
     buildkite-agent pipeline upload .buildkite/main.yml
-    buildkite-agent pipeline upload .buildkite/nightly_releases.yml
+    #buildkite-agent pipeline upload .buildkite/nightly_releases.yml
     buildkite-agent pipeline upload .buildkite/pipeline_pypi.yml
 }
 
diff --git a/.buildkite/scripts/run_with_pypi.sh b/.buildkite/scripts/run_with_pypi.sh
index a9ea777fb..724fcdedb 100755
--- a/.buildkite/scripts/run_with_pypi.sh
+++ b/.buildkite/scripts/run_with_pypi.sh
@@ -2,6 +2,46 @@
 # Exit on error, exit on unset variable, fail on pipe errors.
 set -euo pipefail
 
+# Get the nightly TPU_INFERENCE_VERSION based on the latest stable tag and current date.
+LATEST_STABLE_TAG=$(git tag --sort=-v:refname | grep -E '^v[0-9]+\.[0-9]+\.[0-9]+$' | head -n 1)
+BASE_VERSION=${LATEST_STABLE_TAG#v}
+# TODO: Temporary logic for testing. Remove 'yesterday' before merging.
+DATETIME_STR=$(date -d 'yesterday' +%Y%m%d)
+TPU_INFERENCE_VERSION="${BASE_VERSION}.dev${DATETIME_STR}"
+
+echo "Target Nightly Version: ${TPU_INFERENCE_VERSION}"
+
+# Configuration
+PACKAGE_NAME="tpu-inference"
+MAX_RETRIES=20
+SLEEP_SEC=60
+FOUND_VERSION=false
+
+echo "Checking PyPI for ${PACKAGE_NAME} == ${TPU_INFERENCE_VERSION}..."
+
+# Retry logic to check if the version is available on PyPI
+for ((i=1; i<=MAX_RETRIES; i++)); do
+  # Capture pip's output first: piping straight into `grep -q` under
+  # `set -o pipefail` can report failure when grep exits early (SIGPIPE).
+  AVAILABLE_VERSIONS=$(pip index versions "${PACKAGE_NAME}" --pre 2>/dev/null || true)
+  # -F: dots in the version are literal; -w: do not match a longer version.
+  if grep -qFw -- "${TPU_INFERENCE_VERSION}" <<<"${AVAILABLE_VERSIONS}"; then
+    echo "Success! Found version ${TPU_INFERENCE_VERSION} on PyPI."
+    FOUND_VERSION=true
+    break
+  fi
+
+  echo "[Attempt $i/$MAX_RETRIES] Version not found yet. Waiting ${SLEEP_SEC} seconds..."
+  if [ "$i" -lt "$MAX_RETRIES" ]; then
+    sleep "$SLEEP_SEC"
+  fi
+done
+
+if [ "$FOUND_VERSION" = "false" ]; then
+  echo "The version ${TPU_INFERENCE_VERSION} was not found on PyPI."
+  exit 1
+fi
+
 # Build vllm-tpu with nightly tpu-inference from PyPI (using docker/Dockerfile.pypi instead of docker/Dockerfile).
 export RUN_WITH_PYPI="true"
 SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
diff --git a/docker/Dockerfile.pypi b/docker/Dockerfile.pypi
index 45b1898c0..032f30ac4 100644
--- a/docker/Dockerfile.pypi
+++ b/docker/Dockerfile.pypi
@@ -5,6 +5,8 @@ ARG VLLM_COMMIT_HASH=""
 
 FROM $BASE_IMAGE
 
+ARG IS_FOR_V7X="false"
+
 # Remove existing versions of dependencies
 RUN pip uninstall -y torch torch_xla torchvision
 
@@ -17,7 +19,11 @@ RUN apt-get update && apt-get install -y \
 WORKDIR /workspace/tpu_inference
 COPY requirements_benchmarking.txt .
 # These are needed for the E2E benchmarking tests (i.e. tests/e2e/benchmarking/mlperf.sh)
-RUN pip install -r requirements_benchmarking.txt --retries 3
+RUN --mount=type=cache,target=/root/.cache/pip pip install -r requirements_benchmarking.txt --retries 3
+COPY requirements_v7x.txt .
+RUN --mount=type=cache,target=/root/.cache/pip if [ "$IS_FOR_V7X" = "true" ]; then \
+        pip install -r requirements_v7x.txt; \
+    fi
 COPY . .
 
 # Build vllm-tpu wheel