diff --git a/gcp_vm_scripts/cc_benchmarks/0_start_vm_h100.sh b/gcp_vm_scripts/cc_benchmarks/0_start_vm_h100.sh
new file mode 100755
index 0000000..f3c9772
--- /dev/null
+++ b/gcp_vm_scripts/cc_benchmarks/0_start_vm_h100.sh
@@ -0,0 +1,142 @@
+#!/bin/bash
+
+# ==============================================================================
+# Script: 0_start_vm_h100.sh
+# Description: Provisions an H100 GPU VM (a3-highgpu-1g) for CC benchmarks.
+#              Supports multi-zone fallback and Confidential Compute (TDX).
+#
+# Usage:
+#   ./0_start_vm_h100.sh [--confidential] [--secure-boot] [--zone ]
+#
+# Flags:
+#   --confidential : Enable Confidential Compute (TDX) for the VM.
+#   --secure-boot  : Enable Shielded Secure Boot for the VM.
+#   --zone         : Target a specific zone (overrides fallback list).
+# ==============================================================================
+
+# Configuration
+PROJECT_ID="fx-gen-ai-sandbox"
+VM_NAME="h100-test-vm"
+MACHINE_TYPE="a3-highgpu-1g"
+GPU_COUNT=1
+DISK_SIZE=250
+SERVICE_ACCOUNT="18209811701-compute@developer.gserviceaccount.com"
+IMAGE="projects/ubuntu-os-accelerator-images/global/images/ubuntu-accelerator-2404-amd64-with-nvidia-580-v20251021"
+
+# List of zones to try in sequential order (fallback mechanism)
+ZONES=("us-central1-a" "us-central1-c" "europe-west4-b")
+
+# Default Flags
+CONFIDENTIAL_FLAG=""
+SECURE_BOOT_FLAG="--no-shielded-secure-boot"
+
+# Parse arguments
+while [[ "$#" -gt 0 ]]; do
+    case $1 in
+        --confidential)
+            # Enable Confidential Computing with Intel TDX
+            CONFIDENTIAL_FLAG="--confidential-compute-type=TDX"
+            echo "Enabling Confidential Computing (TDX)..."
+            ;;
+        --secure-boot)
+            # Enable Shielded Secure Boot
+            SECURE_BOOT_FLAG="--shielded-secure-boot"
+            echo "Enabling Shielded Secure Boot..."
+            ;;
+        --zone)
+            # Override zones list with a specific user-provided zone
+            if [[ -n "$2" && "$2" != --* ]]; then
+                ZONES=("$2")
+                echo "Targeting specific zone: $2"
+                shift
+            else
+                echo "Error: --zone requires a value."
+                exit 1
+            fi
+            ;;
+        *)
+            # NOTE(review): unknown flags are only reported, not fatal — confirm
+            # this leniency is intended.
+            echo "Unknown argument: $1"
+            ;;
+    esac
+    shift
+done
+
+# Iterate through zones until a VM is successfully created
+for ZONE in "${ZONES[@]}"; do
+    # Region is the zone minus its trailing "-<letter>" suffix.
+    REGION="${ZONE%-*}"
+    echo "--------------------------------------------------------"
+    echo "Attempting to start VM $VM_NAME in zone $ZONE..."
+    echo "--------------------------------------------------------"
+
+    # 1. Ensure Snapshot Schedule exists in the region
+    # Resource policies are regional, so we ensure it exists for the target region.
+    echo "Checking/Creating snapshot schedule 'default-schedule-1' in region $REGION..."
+    gcloud compute resource-policies create snapshot-schedule default-schedule-1 \
+        --project=$PROJECT_ID \
+        --region=$REGION \
+        --max-retention-days=14 \
+        --on-source-disk-delete=keep-auto-snapshots \
+        --daily-schedule \
+        --start-time=00:00 \
+        2>/dev/null || echo "Snapshot schedule already exists or could not be created."
+
+    # 2. Configure Disk based on Zone
+    if [[ "$ZONE" == europe-west4* ]]; then
+        DISK_TYPE="hyperdisk-balanced"
+        DISK_EXTRAS=",provisioned-iops=6000,provisioned-throughput=890"
+    else
+        DISK_TYPE="pd-balanced"
+        DISK_EXTRAS=""
+    fi
+
+    # 3. Construct and Execute the gcloud create command
+    # Confidential Compute (TDX) and Secure Boot flags are applied here if set via arguments.
+    if gcloud compute instances create "$VM_NAME" \
+        --project="$PROJECT_ID" \
+        --zone="$ZONE" \
+        --machine-type="$MACHINE_TYPE" \
+        --network-interface="network-tier=PREMIUM,nic-type=GVNIC,stack-type=IPV4_ONLY,subnet=sandbox-vpc-default" \
+        --metadata="enable-osconfig=TRUE" \
+        --no-restart-on-failure \
+        --maintenance-policy="TERMINATE" \
+        --provisioning-model="SPOT" \
+        --instance-termination-action="STOP" \
+        --discard-local-ssds-at-termination-timestamp=true \
+        --service-account="$SERVICE_ACCOUNT" \
+        --scopes="https://www.googleapis.com/auth/devstorage.read_only,https://www.googleapis.com/auth/logging.write,https://www.googleapis.com/auth/monitoring.write,https://www.googleapis.com/auth/service.management.readonly,https://www.googleapis.com/auth/servicecontrol,https://www.googleapis.com/auth/trace.append" \
+        --accelerator="count=$GPU_COUNT,type=nvidia-h100-80gb" \
+        --create-disk="auto-delete=yes,boot=yes,device-name=$VM_NAME,disk-resource-policy=projects/$PROJECT_ID/regions/$REGION/resourcePolicies/default-schedule-1,image=$IMAGE,mode=rw,size=$DISK_SIZE,type=$DISK_TYPE$DISK_EXTRAS" \
+        $SECURE_BOOT_FLAG \
+        $CONFIDENTIAL_FLAG \
+        --shielded-vtpm \
+        --shielded-integrity-monitoring \
+        --labels="goog-ops-agent-policy=v2-x86-template-1-4-0,goog-ec-src=vm_add-gcloud" \
+        --reservation-affinity="none"; then
+
+        echo "Successfully created VM in $ZONE."
+
+        # 4. Post-creation: Configure Ops Agent
+        echo "Configuring Ops Agent..."
+        # NOTE(review): inner indentation of this literal was lost in the
+        # corrupted source; restored to match the config.yaml shipped later
+        # in this same patch — confirm against VCS.
+        printf 'agentsRule:\n  packageState: installed\n  version: latest\ninstanceFilter:\n  inclusionLabels:\n  - labels:\n      goog-ops-agent-policy: v2-x86-template-1-4-0\n' > config.yaml
+
+        POLICY_NAME="goog-ops-agent-v2-x86-template-1-4-0-${ZONE}"
+        echo "Applying Ops Agent policy: $POLICY_NAME"
+        gcloud compute instances ops-agents policies create "$POLICY_NAME" \
+            --project="$PROJECT_ID" \
+            --zone="$ZONE" \
+            --file=config.yaml || \
+        gcloud compute instances ops-agents policies update "$POLICY_NAME" \
+            --project="$PROJECT_ID" \
+            --zone="$ZONE" \
+            --file=config.yaml || \
+        echo "Warning: Failed to create or update Ops Agent policy, but VM is up."
+
+        echo "All set!"
+        exit 0
+    else
+        echo "Failed to create VM in $ZONE. Trying next zone..."
+    fi
+done
+
+echo "Error: Could not start VM in any of the specified zones."
+exit 1
diff --git a/gcp_vm_scripts/cc_benchmarks/1_setup_environment.sh b/gcp_vm_scripts/cc_benchmarks/1_setup_environment.sh
new file mode 100755
index 0000000..01127c3
--- /dev/null
+++ b/gcp_vm_scripts/cc_benchmarks/1_setup_environment.sh
@@ -0,0 +1,116 @@
+#!/bin/bash
+
+# ==============================================================================
+# Script 1: Setup Environment
+#
+# Purpose:
+#   This script performs a one-time setup for a new machine by installing
+#   Docker and the NVIDIA Container Toolkit. It is idempotent, meaning it
+#   can be safely re-run without causing issues.
+#
+# Usage:
+#   ./1_setup_environment.sh
+# ==============================================================================
+
+# Exit immediately if a command exits with a non-zero status.
+set -e
+
+# Parse command-line arguments
+CONFIDENTIAL=false
+while [[ "$#" -gt 0 ]]; do
+    case $1 in
+        --confidential) CONFIDENTIAL=true ;;
+        *) echo "Unknown parameter passed: $1"; exit 1 ;;
+    esac
+    shift
+done
+
+echo "--- [Step 1/1] Setting up environment ---"
+
+# --- Install Docker ---
+if command -v docker &> /dev/null; then
+    echo "Docker is already installed. Skipping installation."
+else
+    echo "Installing Docker..."
+    sudo apt-get update
+    sudo apt-get install -y ca-certificates curl
+    sudo install -m 0755 -d /etc/apt/keyrings
+    sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
+    sudo chmod a+r /etc/apt/keyrings/docker.asc
+    echo \
+      "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
+      $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
+      sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
+    sudo apt-get update
+    sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+    echo "Docker installation complete."
+fi
+
+# Enable Confidential Computing before verifying CUDA Toolkit.
+if [ "$CONFIDENTIAL" = true ]; then
+    echo "Enabling Confidential Computing..."
+    # Enable Linux Kernel Crypto API
+    echo "install nvidia /sbin/modprobe ecdsa_generic; /sbin/modprobe ecdh; /sbin/modprobe --ignore-install nvidia" | sudo tee /etc/modprobe.d/nvidia-lkca.conf
+    sudo update-initramfs -u
+
+    # Enable Confidential Compute GPUs Ready state
+    sudo nvidia-smi conf-compute -srs 1
+
+    # Set startup unit to enable Confidential Compute GPUs Ready state on each boot
+    sudo tee /etc/systemd/system/cc-gpu-ready.service > /dev/null << 'EOF'
+[Unit]
+Description=Set Confidential Compute GPU to Ready mode
+After=multi-user.target
+Wants=nvidia-persistenced.service
+
+[Service]
+Type=oneshot
+ExecStartPre=/bin/sleep 2
+ExecStart=/usr/bin/nvidia-smi conf-compute -srs 1
+ExecStartPost=/usr/bin/nvidia-smi conf-compute -grs
+RemainAfterExit=true
+
+[Install]
+WantedBy=multi-user.target
+EOF
+
+    sudo systemctl daemon-reload
+    sudo systemctl enable cc-gpu-ready.service
+
+    nvidia-smi conf-compute -f # should say CC status: ON
+    nvidia-smi conf-compute -grs # should say ready
+fi
+
+# --- Install NVIDIA Container Toolkit ---
+if dpkg -l | grep -q nvidia-container-toolkit; then
+    echo "NVIDIA Container Toolkit is already installed. Skipping installation."
+else
+    echo "Installing NVIDIA Container Toolkit..."
+    curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
+      && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
+      sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
+      sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+    sudo apt-get update
+    sudo apt-get install -y nvidia-container-toolkit
+    sudo nvidia-ctk runtime configure --runtime=docker
+    sudo systemctl restart docker
+    echo "NVIDIA Container Toolkit installation complete."
+fi
+
+echo "--- Environment setup is complete. ---"
+
+sleep 3
+echo "Enabling persistence mode..."
+
+# Enable persistence mode to establish a secure Security Protocol and Data Model (SPDM) connection
+sudo mkdir -p /etc/systemd/system/nvidia-persistenced.service.d
+# NOTE(review): the source is corrupted from here — the here-doc that writes
+# the nvidia-persistenced drop-in, the tail of this script, and the header of
+# 2_start_server.sh were lost (angle-bracket spans appear to have been
+# stripped). The residue below is kept verbatim; recover the original lines
+# from version control before applying this patch.
+cat < --model
+#
+# Example:
+#   ./2_start_server.sh --hardware H100 --model Qwen
+# ==============================================================================
+
+# Exit immediately if a command exits with a non-zero status.
+set -e
+
+# --- Configuration ---
+DOCKER_IMAGE="nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc1"
+CONTAINER_NAME="trtllm_server"
+
+# Associative arrays for configurations, keyed "<Model>_<Hardware>".
+declare -A MODEL_PATHS
+MODEL_PATHS["Qwen_H100"]="Qwen/Qwen3-30B-A3B"
+MODEL_PATHS["Mistral_H100"]="mistralai/Mistral-7B-v0.1"
+
+declare -A MAX_BATCH_SIZES
+MAX_BATCH_SIZES["Qwen_H100"]=512
+# NOTE(review): 2056 looks like a typo for 2048 — confirm the intended size.
+MAX_BATCH_SIZES["Mistral_H100"]=2056
+
+declare -A TP_SIZES
+TP_SIZES["Qwen_H100"]=1
+TP_SIZES["Mistral_H100"]=1
+
+# --- Helper Functions ---
+usage() {
+    # FIX: restored the model placeholder lost to source corruption.
+    echo "Usage: $0 --hardware H100 --model <Qwen|Mistral>"
+    exit 1
+}
+
+# --- Main Execution ---
+main() {
+    HARDWARE=""
+    MODEL=""
+
+    while [[ "$#" -gt 0 ]]; do
+        case $1 in
+            --hardware) HARDWARE="$2"; shift ;;
+            --model) MODEL="$2"; shift ;;
+            *) usage ;;
+        esac
+        shift
+    done
+
+    if [ -z "$HARDWARE" ] || [ -z "$MODEL" ]; then
+        echo "Error: --hardware and --model are required arguments."
+        usage
+    fi
+
+    CONFIG_KEY="${MODEL}_${HARDWARE}"
+    MODEL_PATH=${MODEL_PATHS[$CONFIG_KEY]}
+    MAX_BATCH_SIZE=${MAX_BATCH_SIZES[$CONFIG_KEY]}
+    TP_SIZE=${TP_SIZES[$CONFIG_KEY]}
+
+    if [ -z "$MODEL_PATH" ]; then
+        echo "Error: Invalid hardware/model combination. Only H100 with Qwen or Mistral is supported."
+        exit 1
+    fi
+
+    echo "--- [Step 1/2] Starting server with configuration ---"
+    echo "Hardware: $HARDWARE"
+    echo "Model: $MODEL"
+    echo "Model Path: $MODEL_PATH"
+    echo "Max Batch Size: $MAX_BATCH_SIZE"
+    echo "TP Size: $TP_SIZE"
+    echo "----------------------------------------------------"
+
+    # Refuse to clobber a live server.
+    if [ "$(sudo docker ps -q -f name=$CONTAINER_NAME)" ]; then
+        echo "Error: A container with the name '$CONTAINER_NAME' is already running."
+        echo "Please stop it first by running ./4_stop_server.sh"
+        exit 1
+    fi
+
+    if [ "$(sudo docker ps -aq -f status=exited -f name=$CONTAINER_NAME)" ]; then
+        echo "Removing existing stopped container..."
+        sudo docker rm $CONTAINER_NAME
+    fi
+
+    if [ "$(sudo docker ps -aq -f name=$CONTAINER_NAME)" ]; then
+        echo "Removing existing container..."
+        sudo docker rm -f $CONTAINER_NAME
+    fi
+
+    echo "Creating local directory for artifacts..."
+    mkdir -p ~/llm_benchmarks/artifacts
+    echo "Creating local directory for scripts..."
+    mkdir -p ~/scripts
+    echo "Creating local directory for genai-bench output..."
+    mkdir -p ~/genai-bench-output
+
+    echo "Starting Docker container '$CONTAINER_NAME' in detached mode..."
+    if [ -z "$HF_TOKEN" ]; then
+        echo "HF_TOKEN is not set. Please enter your Hugging Face token:"
+        # -r: keep backslashes literal; -s: do not echo the secret.
+        read -rs HF_TOKEN
+        # SECURITY FIX: never print the token value — the original echoed
+        # "$HF_TOKEN", leaking the secret into terminal scrollback and logs.
+        echo "HF_TOKEN received."
+    fi
+
+    sudo docker run --ipc host --gpus all -p 8000:8000 -v ~/llm_benchmarks/artifacts:/app/tensorrt_llm/artifacts -v ~/scripts:/app/scripts -v ~/genai-bench-output:/genai-bench -e HF_TOKEN=$HF_TOKEN -d --name $CONTAINER_NAME $DOCKER_IMAGE sleep infinity
+
+    echo "--- [Step 2/2] Launching trtllm-serve inside the container ---"
+
+    EXEC_CMD="trtllm-serve \"$MODEL_PATH\" \
+        --host 0.0.0.0 \
+        --max_batch_size $MAX_BATCH_SIZE \
+        --max_num_tokens 16384 \
+        --max_seq_len 16384 \
+        --tp_size $TP_SIZE"
+
+    sudo docker exec -d $CONTAINER_NAME bash -c "$EXEC_CMD > /var/log/trtllm_server.log 2>&1 &"
+
+    echo "Server is starting in the background. It may take a few minutes to become ready."
+    echo "You can check the logs with: sudo docker exec -it $CONTAINER_NAME tail -f /var/log/trtllm_server.log"
+    echo "To get an interactive shell inside the container, run: sudo docker exec -it $CONTAINER_NAME bash"
+}
+
+main "$@"
diff --git a/gcp_vm_scripts/cc_benchmarks/3_run_benchmark.sh b/gcp_vm_scripts/cc_benchmarks/3_run_benchmark.sh
new file mode 100755
index 0000000..861afdd
--- /dev/null
+++ b/gcp_vm_scripts/cc_benchmarks/3_run_benchmark.sh
@@ -0,0 +1,110 @@
+#!/bin/bash
+
+# ==============================================================================
+# Script 3: Run Benchmark Client (CC Benchmarks)
+#
+# Purpose:
+#   Runs the genai-perf benchmark client against the running TRT-LLM server.
+#   It uses the optimal request-rate for CC Benchmarks (H100).
+# +# Usage: +# ./3_run_benchmark.sh --hardware --model +# +# Example: +# ./3_run_benchmark.sh --hardware H100 --model Qwen +# ============================================================================== + +# Exit immediately if a command exits with a non-zero status. +set -e + +# --- Configuration --- +CONTAINER_NAME="trtllm_server" + +# Associative arrays for configurations +declare -A MODEL_PATHS +MODEL_PATHS["Qwen_H100"]="Qwen/Qwen3-30B-A3B" +MODEL_PATHS["Mistral_H100"]="mistralai/Mistral-7B-v0.1" + +declare -A REQUEST_RATES +REQUEST_RATES["Qwen_H100"]=5 +REQUEST_RATES["Mistral_H100"]=10 + +# --- Helper Functions --- +usage() { + echo "Usage: $0 --hardware H100 --model " + exit 1 +} + +# --- Main Execution --- +main() { + HARDWARE="" + MODEL="" + + while [[ "$#" -gt 0 ]]; do + case $1 in + --hardware) HARDWARE="$2"; shift ;; + --model) MODEL="$2"; shift ;; + *) usage ;; + esac + shift + done + + if [ -z "$HARDWARE" ] || [ -z "$MODEL" ]; then + echo "Error: --hardware and --model are required arguments." + usage + fi + + CONFIG_KEY="${MODEL}_${HARDWARE}" + MODEL_PATH=${MODEL_PATHS[$CONFIG_KEY]} + REQUEST_RATE=${REQUEST_RATES[$CONFIG_KEY]} + + # Determine endpoint type (Mistral base model needs 'completions' endpoint) + ENDPOINT_TYPE="chat" + if [[ "$MODEL" == "Mistral" ]]; then + ENDPOINT_TYPE="completions" + fi + + if [ -z "$MODEL_PATH" ]; then + echo "Error: Invalid hardware/model combination. Only H100 with Qwen or Mistral is supported." + exit 1 + fi + + echo "--- [Step 1/2] Running benchmark with configuration ---" + echo "Hardware: $HARDWARE" + echo "Model: $MODEL" + echo "Model Path: $MODEL_PATH" + echo "Request Rate: $REQUEST_RATE" + echo "-------------------------------------------------------" + + if ! [ "$(sudo docker ps -q -f name=$CONTAINER_NAME)" ]; then + echo "Error: The server container '$CONTAINER_NAME' is not running." 
+        echo "Please start it first by running ./2_start_server.sh"
+        exit 1
+    fi
+
+    echo "--- [Step 2/2] Executing genai-perf inside the container ---"
+
+    # First, ensure genai-perf is installed inside the container
+    sudo docker exec $CONTAINER_NAME pip install genai-perf
+
+    # Execute the benchmark command
+    sudo docker exec $CONTAINER_NAME genai-perf profile \
+        -m "$MODEL_PATH" \
+        --tokenizer "$MODEL_PATH" \
+        --endpoint-type $ENDPOINT_TYPE \
+        --random-seed 123 \
+        --prefix-prompt-length 2500 \
+        --synthetic-input-tokens-mean 7500 \
+        --synthetic-input-tokens-stddev 0 \
+        --output-tokens-mean 1000 \
+        --output-tokens-stddev 0 \
+        --request-count 1000 \
+        --request-rate $REQUEST_RATE \
+        --url localhost:8000 \
+        --streaming \
+        --extra-inputs ignore_eos:true
+
+    echo "--- Benchmark finished ---"
+}
+
+main "$@"
diff --git a/gcp_vm_scripts/cc_benchmarks/4_stop_server.sh b/gcp_vm_scripts/cc_benchmarks/4_stop_server.sh
new file mode 100755
index 0000000..0005911
--- /dev/null
+++ b/gcp_vm_scripts/cc_benchmarks/4_stop_server.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# ==============================================================================
+# Script 4: Stop TRT-LLM Server
+#
+# Purpose:
+#   Stops and removes the running TRT-LLM server container.
+#
+# Usage:
+#   ./4_stop_server.sh
+# ==============================================================================
+
+# Exit immediately if a command exits with a non-zero status.
+set -e
+
+# --- Configuration ---
+CONTAINER_NAME="trtllm_server"
+
+# --- Main Execution ---
+echo "--- Stopping and removing the server container ---"
+
+if [ "$(sudo docker ps -q -f name=$CONTAINER_NAME)" ]; then
+    echo "Stopping container '$CONTAINER_NAME'..."
+    sudo docker stop $CONTAINER_NAME
+    echo "Container stopped."
+else
+    echo "Container '$CONTAINER_NAME' is not running."
+fi
+
+if [ "$(sudo docker ps -aq -f status=exited -f name=$CONTAINER_NAME)" ]; then
+    echo "Removing container '$CONTAINER_NAME'..."
+    sudo docker rm $CONTAINER_NAME
+    echo "Container removed."
+elif ! [ "$(sudo docker ps -q -f name=$CONTAINER_NAME)" ]; then
+    echo "No stopped container named '$CONTAINER_NAME' to remove."
+fi
+
+echo "--- Cleanup complete ---"
diff --git a/gcp_vm_scripts/cc_benchmarks/5_package_and_download_artifacts.sh b/gcp_vm_scripts/cc_benchmarks/5_package_and_download_artifacts.sh
new file mode 100755
index 0000000..ec2c9b7
--- /dev/null
+++ b/gcp_vm_scripts/cc_benchmarks/5_package_and_download_artifacts.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+
+# ==============================================================================
+# Script: Package and Download Artifacts
+#
+# Purpose:
+#   Packages benchmark artifacts and logs into a zip file and provides
+#   the command to download it to your local machine. This script handles
+#   root-owned files by using sudo and changing ownership.
+#
+# Usage:
+#   Run this script on the remote VM after SSH'ing in:
+#     ./5_package_and_download_artifacts.sh
+#
+#   Then copy and run the provided gcloud compute scp command on your laptop.
+# ==============================================================================
+
+# Exit immediately if a command exits with a non-zero status.
+set -e
+
+echo "===================================================================="
+echo "Package and Download Artifacts Script"
+echo "===================================================================="
+echo ""
+
+# --- Step 1: Install zip if not already installed ---
+echo "[Step 1/4] Checking if zip is installed..."
+if ! command -v zip &> /dev/null; then
+    echo "zip is not installed. Installing..."
+    sudo apt-get update
+    sudo apt-get install -y zip
+    echo "zip installed successfully."
+else
+    echo "zip is already installed."
+fi
+echo ""
+
+# --- Step 2: Create timestamped zip archive ---
+TIMESTAMP=$(date +%Y%m%d_%H%M%S)
+ZIP_FILENAME="llm_benchmarks_${TIMESTAMP}.zip"
+ZIP_PATH="/tmp/${ZIP_FILENAME}"
+
+echo "[Step 2/4] Creating zip archive: ${ZIP_FILENAME}"
+echo "This may take a moment..."
+
+mkdir -p ~/llm_benchmarks/log
+sudo docker cp trtllm_server:/var/log/trtllm_server.log ~/llm_benchmarks/log/trtllm_server.log
+
+# Use sudo to zip the root-owned files
+sudo zip -r "${ZIP_PATH}" \
+    ~/llm_benchmarks/artifacts \
+    ~/llm_benchmarks/log/trtllm_server.log \
+    2>/dev/null || true
+
+if [ ! -f "${ZIP_PATH}" ]; then
+    echo "Error: Failed to create zip archive."
+    exit 1
+fi
+
+echo "Archive created successfully at: ${ZIP_PATH}"
+echo ""
+
+# --- Step 3: Change ownership to current user ---
+echo "[Step 3/4] Changing ownership of zip file to current user..."
+sudo chown $(whoami):$(whoami) "${ZIP_PATH}"
+echo "Ownership changed successfully."
+echo ""
+
+# --- Step 4: Provide download instructions ---
+echo "[Step 4/4] Archive is ready for download!"
+echo ""
+echo "===================================================================="
+echo "DOWNLOAD INSTRUCTIONS"
+echo "===================================================================="
+echo ""
+echo "The archive is ready at: ${ZIP_PATH}"
+echo ""
+echo "On your LOCAL LAPTOP, run the download script:"
+echo ""
+echo "  ./gcp_vm_scripts/6_download_artifacts.sh"
+echo ""
+echo "Or manually download with:"
+echo ""
+echo "  gcloud compute scp [INSTANCE_NAME]:${ZIP_PATH} ~/Downloads/"
+echo ""
+echo "===================================================================="
+echo ""
diff --git a/gcp_vm_scripts/cc_benchmarks/6_download_artifacts.sh b/gcp_vm_scripts/cc_benchmarks/6_download_artifacts.sh
new file mode 100755
index 0000000..94c6943
--- /dev/null
+++ b/gcp_vm_scripts/cc_benchmarks/6_download_artifacts.sh
@@ -0,0 +1,185 @@
+#!/bin/bash
+
+# ==============================================================================
+# Script: Download Artifacts from VM
+#
+# Purpose:
+#   Downloads the packaged artifacts zip file from the remote VM to your
+#   local Downloads folder. Run this script on your LOCAL LAPTOP after
+#   running 5_package_and_download_artifacts.sh on the VM.
+#
+# Usage:
+#   ./6_download_artifacts.sh [INSTANCE_NAME] [--zone ZONE] [--project PROJECT]
+#
+# Examples:
+#   ./6_download_artifacts.sh my-vm-instance
+#   ./6_download_artifacts.sh h200-euro --zone europe-west1-b --project fx-gen-ai-sandbox
+#   ./6_download_artifacts.sh --zone us-central1-a --project my-project
+#
+# If you don't provide an instance name, the script will try to auto-detect it.
+# Zone and project are optional; if not provided, gcloud will use defaults.
+# ==============================================================================
+
+# Exit immediately if a command exits with a non-zero status.
+set -e
+
+echo "===================================================================="
+echo "Download Artifacts from VM"
+echo "===================================================================="
+echo ""
+
+# Parse command-line arguments
+INSTANCE_NAME=""
+ZONE=""
+PROJECT=""
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --zone)
+            ZONE="$2"
+            shift 2
+            ;;
+        --project)
+            PROJECT="$2"
+            shift 2
+            ;;
+        *)
+            # First bare argument is the instance name; extras are ignored.
+            if [ -z "$INSTANCE_NAME" ]; then
+                INSTANCE_NAME="$1"
+            fi
+            shift
+            ;;
+    esac
+done
+
+# If no instance name provided, try to auto-detect
+if [ -z "$INSTANCE_NAME" ]; then
+    echo "No instance name provided. Attempting to auto-detect..."
+    echo ""
+
+    # List all running instances
+    echo "Available GCP instances:"
+    gcloud compute instances list --format="table(name,zone,status)"
+    echo ""
+
+    # Try to find instances with common naming patterns
+    INSTANCES=$(gcloud compute instances list --filter="status=RUNNING" --format="value(name)")
+    # BUG FIX: 'echo | wc -l' reports 1 even when $INSTANCES is empty, because
+    # echo always emits a trailing newline — so zero running instances was
+    # miscounted as one and an empty name got selected. 'grep -c' on the
+    # newline-stripped value counts actual lines (0 when none); '|| true'
+    # keeps set -e happy when the count is zero.
+    INSTANCE_COUNT=$(printf '%s' "$INSTANCES" | grep -c '^' || true)
+
+    if [ "$INSTANCE_COUNT" -eq 1 ]; then
+        INSTANCE_NAME=$(echo "$INSTANCES" | head -n 1)
+        echo "Auto-detected instance: $INSTANCE_NAME"
+        echo ""
+    else
+        echo "Error: Multiple or no running instances found."
+        echo "Please specify the instance name:"
+        echo ""
+        echo "Usage: $0 [INSTANCE_NAME]"
+        echo ""
+        exit 1
+    fi
+fi
+
+echo "Target instance: $INSTANCE_NAME"
+
+# Auto-detect zone and project if not provided
+if [ -z "$ZONE" ] || [ -z "$PROJECT" ]; then
+    echo "Auto-detecting zone and project for instance..."
+
+    # Get instance details
+    INSTANCE_INFO=$(gcloud compute instances list --filter="name=$INSTANCE_NAME" --format="value(zone,selfLink)" 2>/dev/null | head -n 1)
+
+    if [ -n "$INSTANCE_INFO" ]; then
+        # Extract zone if not provided
+        if [ -z "$ZONE" ]; then
+            DETECTED_ZONE=$(echo "$INSTANCE_INFO" | awk '{print $1}')
+            if [ -n "$DETECTED_ZONE" ]; then
+                # Extract just the zone name from the full path (e.g., "us-central1-a" from "projects/.../zones/us-central1-a")
+                ZONE=$(basename "$DETECTED_ZONE")
+                echo "Auto-detected zone: $ZONE"
+            fi
+        fi
+
+        # Extract project if not provided
+        if [ -z "$PROJECT" ]; then
+            SELF_LINK=$(echo "$INSTANCE_INFO" | awk '{print $2}')
+            if [[ "$SELF_LINK" =~ projects/([^/]+)/ ]]; then
+                PROJECT="${BASH_REMATCH[1]}"
+                echo "Auto-detected project: $PROJECT"
+            fi
+        fi
+    else
+        echo "Warning: Could not auto-detect zone/project. Using gcloud defaults."
+    fi
+fi
+
+if [ -n "$ZONE" ]; then
+    echo "Using zone: $ZONE"
+fi
+if [ -n "$PROJECT" ]; then
+    echo "Using project: $PROJECT"
+fi
+echo ""
+
+# Build gcloud command with optional zone and project
+GCLOUD_SSH_CMD="gcloud compute ssh"
+GCLOUD_SCP_CMD="gcloud compute scp"
+
+if [ -n "$ZONE" ]; then
+    GCLOUD_SSH_CMD="$GCLOUD_SSH_CMD --zone $ZONE"
+    GCLOUD_SCP_CMD="$GCLOUD_SCP_CMD --zone $ZONE"
+fi
+
+if [ -n "$PROJECT" ]; then
+    GCLOUD_SSH_CMD="$GCLOUD_SSH_CMD --project $PROJECT"
+    GCLOUD_SCP_CMD="$GCLOUD_SCP_CMD --project $PROJECT"
+fi
+
+# Find the most recent zip file in /tmp/ on the VM
+echo "Looking for the most recent artifacts zip file on the VM..."
+REMOTE_ZIP=$($GCLOUD_SSH_CMD "$INSTANCE_NAME" --command="ls -t /tmp/llm_benchmarks_*.zip 2>/dev/null | head -n 1" 2>/dev/null || echo "")
+
+if [ -z "$REMOTE_ZIP" ]; then
+    echo "Error: No artifacts zip file found on the VM."
+    echo ""
+    echo "Please run the following on the VM first:"
+    echo "  ./5_package_and_download_artifacts.sh"
+    echo ""
+    exit 1
+fi
+
+ZIP_FILENAME=$(basename "$REMOTE_ZIP")
+echo "Found: $ZIP_FILENAME"
+echo ""
+
+# Download the file
+echo "Downloading to ~/Downloads/$ZIP_FILENAME..."
+$GCLOUD_SCP_CMD "${INSTANCE_NAME}:${REMOTE_ZIP}" ~/Downloads/
+
+# NOTE(review): under 'set -e' a failed scp aborts before this check, so the
+# else branch below is effectively unreachable — confirm intended.
+if [ $? -eq 0 ]; then
+    echo ""
+    echo "===================================================================="
+    echo "SUCCESS!"
+    echo "===================================================================="
+    echo ""
+    echo "File downloaded to: ~/Downloads/$ZIP_FILENAME"
+    echo ""
+    echo "You can now extract it with:"
+    echo "  unzip ~/Downloads/$ZIP_FILENAME -d ~/Downloads/"
+    echo ""
+    echo "To clean up the temporary file on the VM, run:"
+    if [ -n "$ZONE" ] && [ -n "$PROJECT" ]; then
+        echo "  gcloud compute ssh $INSTANCE_NAME --zone $ZONE --project $PROJECT --command=\"rm $REMOTE_ZIP\""
+    elif [ -n "$ZONE" ]; then
+        echo "  gcloud compute ssh $INSTANCE_NAME --zone $ZONE --command=\"rm $REMOTE_ZIP\""
+    elif [ -n "$PROJECT" ]; then
+        echo "  gcloud compute ssh $INSTANCE_NAME --project $PROJECT --command=\"rm $REMOTE_ZIP\""
+    else
+        echo "  gcloud compute ssh $INSTANCE_NAME --command=\"rm $REMOTE_ZIP\""
+    fi
+    echo ""
+else
+    echo ""
+    echo "Error: Download failed."
+    exit 1
+fi
diff --git a/gcp_vm_scripts/cc_benchmarks/README.md b/gcp_vm_scripts/cc_benchmarks/README.md
new file mode 100644
index 0000000..e4ae74d
--- /dev/null
+++ b/gcp_vm_scripts/cc_benchmarks/README.md
@@ -0,0 +1,114 @@
+# CC Benchmarks Suite (H100)
+
+This directory contains scripts to run benchmarks for Qwen (30B) and Mistral (7B) models on NVIDIA H100 hardware.
+
+## Workflow
+
+### 1. Preparation (from Local Machine)
+Ensure all local scripts are executable:
+```bash
+chmod +x gcp_vm_scripts/cc_benchmarks/*.sh
+```
+
+### 2. Provision VM (from Local Machine)
+Use the unified start script to create an H100 VM. By default, the script targets `a3-highgpu-1g` and automatically tries multiple zones (`us-central1-a`, `us-central1-c`, `europe-west4-b`) if the initial attempt fails.
+
+```bash
+# Standard VM (Non-confidential)
+./gcp_vm_scripts/cc_benchmarks/0_start_vm_h100.sh
+
+# VM with Confidential Compute (TDX) enabled
+./gcp_vm_scripts/cc_benchmarks/0_start_vm_h100.sh --confidential
+
+# VM with Confidential Compute (TDX) AND Secure Boot enabled
+./gcp_vm_scripts/cc_benchmarks/0_start_vm_h100.sh --confidential --secure-boot
+
+# Target a specific zone
+./gcp_vm_scripts/cc_benchmarks/0_start_vm_h100.sh --zone us-central1-a
+```
+
+### 3. Upload Scripts to VM (from Local Machine)
+Once the VM is running, upload the benchmark scripts to the VM's home directory.
+```bash
+./gcp_vm_scripts/cc_benchmarks/upload_scripts.sh h100-test-vm --zone "$gcp_zone" --project "$gcp_project_id"
+```
+*Replace `h100-test-vm` with your actual instance name if different.*
+
+### 4. Connect to VM
+SSH into the VM.
+```bash
+gcloud compute ssh --zone "$gcp_zone" "h100-test-vm" --project "$gcp_project_id"
+```
+
+### 5. Setup Environment and Permissions
+You can run the setup script directly on the VM (Option A) or trigger it from your local machine (Option B).
+
+**Option A: Run on VM**
+SSH into the VM (as shown in Step 4) and run:
+```bash
+chmod +x *.sh
+./1_setup_environment.sh
+```
+
+**Option B: Run from Local Machine**
+Run the following command from your local machine to execute the script remotely. This allows you to see the output locally, ensuring you don't lose the logs when the VM reboots.
+
+First, export your GCP configuration variables:
+```bash
+export gcp_zone=us-central1-a
+export gcp_project_id=fx-gen-ai-sandbox
+```
+
+Then run the setup script:
+```bash
+gcloud compute ssh --zone "$gcp_zone" "h100-test-vm" --project "$gcp_project_id" --command "chmod +x 1_setup_environment.sh && ./1_setup_environment.sh"
+```
+*Note: The script reboots the VM upon completion, which will automatically close the SSH connection. This is expected.*
+
+### 6. Start Server (on VM)
+Start the TRT-LLM server with the desired model.
+Supported models: `Qwen` (Qwen3-30B-A3B), `Mistral` (Mistral-7B-v0.1).
+Supported hardware: `H100` (optimized).
+
+```bash
+./2_start_server.sh --hardware H100 --model Qwen
+```
+or
+```bash
+./2_start_server.sh --hardware H100 --model Mistral
+```
+
+The server will start in the background. You can check logs with:
+```bash
+sudo docker exec -it trtllm_server tail -f /var/log/trtllm_server.log
+```
+
+### 7. Run Benchmark (on VM)
+Run the benchmark client.
+```bash
+./3_run_benchmark.sh --hardware H100 --model Qwen
+```
+Make sure to match the model you started the server with.
+
+### 8. Package Artifacts (on VM)
+After the benchmark completes, package the results on the VM.
+```bash
+./5_package_and_download_artifacts.sh
+```
+This will create a zip file in `/tmp/` and provide instructions.
+
+### 9. Download Artifacts (from Local Machine)
+On your **local machine**, download the artifacts.
+```bash
+./gcp_vm_scripts/cc_benchmarks/6_download_artifacts.sh h100-test-vm --zone "$gcp_zone" --project "$gcp_project_id"
+```
+
+### 10. Cleanup (on VM/Local Machine)
+Stop the server on the VM.
+```bash
+./4_stop_server.sh
+```
+Don't forget to delete the VM when done to avoid costs (from local machine).
+```bash
+gcloud compute instances delete h100-test-vm --zone "$gcp_zone" --project "$gcp_project_id"
+```
diff --git a/gcp_vm_scripts/cc_benchmarks/config.yaml b/gcp_vm_scripts/cc_benchmarks/config.yaml
new file mode 100644
index 0000000..07047b3
--- /dev/null
+++ b/gcp_vm_scripts/cc_benchmarks/config.yaml
@@ -0,0 +1,7 @@
+agentsRule:
+  packageState: installed
+  version: latest
+instanceFilter:
+  inclusionLabels:
+  - labels:
+      goog-ops-agent-policy: v2-x86-template-1-4-0
diff --git a/gcp_vm_scripts/cc_benchmarks/upload_scripts.sh b/gcp_vm_scripts/cc_benchmarks/upload_scripts.sh
new file mode 100755
index 0000000..b41fc09
--- /dev/null
+++ b/gcp_vm_scripts/cc_benchmarks/upload_scripts.sh
@@ -0,0 +1,163 @@
+#!/bin/bash
+
+# ==============================================================================
+# Script: Upload CC Benchmark Scripts to VM
+#
+# Purpose:
+#   Uploads the necessary CC benchmark scripts to the remote VM's
+#   home directory.
+#
+# Usage (comment corrected to match this file's actual name):
+#   ./upload_scripts.sh [INSTANCE_NAME] [--zone ZONE] [--project PROJECT]
+#
+# Examples:
+#   ./upload_scripts.sh h100-test-vm
+#   ./upload_scripts.sh h100-test-vm --zone us-central1-c --project fx-gen-ai-sandbox
+#
+# If you don't provide an instance name, the script will try to auto-detect it.
+# Zone and project are optional and will be auto-detected if not provided.
+# ==============================================================================
+
+# Exit immediately if a command exits with a non-zero status.
+set -e + +echo "====================================================================" +echo "Upload CC Benchmark Scripts to VM" +echo "====================================================================" +echo "" + +# Parse command-line arguments +INSTANCE_NAME="" +ZONE="" +PROJECT="" + +while [[ $# -gt 0 ]]; do + case $1 in + --zone) + ZONE="$2" + shift 2 + ;; + --project) + PROJECT="$2" + shift 2 + ;; + *) + if [ -z "$INSTANCE_NAME" ]; then + INSTANCE_NAME="$1" + fi + shift + ;; + esac +done + +# If no instance name provided, try to auto-detect +if [ -z "$INSTANCE_NAME" ]; then + echo "No instance name provided. Attempting to auto-detect..." + echo "" + + # List all running instances + echo "Available GCP instances:" + gcloud compute instances list --format="table(name,zone,status)" + echo "" + + # Try to find instances with common naming patterns + INSTANCES=$(gcloud compute instances list --filter="status=RUNNING" --format="value(name)") + INSTANCE_COUNT=$(echo "$INSTANCES" | wc -l | tr -d ' ') + + if [ "$INSTANCE_COUNT" -eq 1 ]; then + INSTANCE_NAME=$(echo "$INSTANCES" | head -n 1) + echo "Auto-detected instance: $INSTANCE_NAME" + echo "" + else + echo "Error: Multiple or no running instances found." + echo "Please specify the instance name:" + echo "" + echo "Usage: $0 [INSTANCE_NAME]" + echo "" + exit 1 + fi +fi + +echo "Target instance: $INSTANCE_NAME" + +# Auto-detect zone and project if not provided +if [ -z "$ZONE" ] || [ -z "$PROJECT" ]; then + echo "Auto-detecting zone and project for instance..." 
+ + # Get instance details + INSTANCE_INFO=$(gcloud compute instances list --filter="name=$INSTANCE_NAME" --format="value(zone,selfLink)" 2>/dev/null | head -n 1) + + if [ -n "$INSTANCE_INFO" ]; then + # Extract zone if not provided + if [ -z "$ZONE" ]; then + DETECTED_ZONE=$(echo "$INSTANCE_INFO" | awk '{print $1}') + if [ -n "$DETECTED_ZONE" ]; then + ZONE=$(basename "$DETECTED_ZONE") + echo "Auto-detected zone: $ZONE" + fi + fi + + # Extract project if not provided + if [ -z "$PROJECT" ]; then + SELF_LINK=$(echo "$INSTANCE_INFO" | awk '{print $2}') + if [[ "$SELF_LINK" =~ projects/([^/]+)/ ]]; then + PROJECT="${BASH_REMATCH[1]}" + echo "Auto-detected project: $PROJECT" + fi + fi + else + echo "Warning: Could not auto-detect zone/project. Using gcloud defaults." + fi +fi + +if [ -n "$ZONE" ]; then + echo "Using zone: $ZONE" +fi +if [ -n "$PROJECT" ]; then + echo "Using project: $PROJECT" +fi +echo "" + +# Build gcloud command with optional zone and project +GCLOUD_SCP_CMD="gcloud compute scp" + +if [ -n "$ZONE" ]; then + GCLOUD_SCP_CMD="$GCLOUD_SCP_CMD --zone $ZONE" +fi + +if [ -n "$PROJECT" ]; then + GCLOUD_SCP_CMD="$GCLOUD_SCP_CMD --project $PROJECT" +fi + +# List of files to upload from the cc_benchmarks directory +SCRIPT_DIR=$(dirname "$0") +CC_DIR="${SCRIPT_DIR}" + +FILES_TO_UPLOAD=( + "${CC_DIR}/1_setup_environment.sh" + "${CC_DIR}/2_start_server.sh" + "${CC_DIR}/3_run_benchmark.sh" + "${CC_DIR}/4_stop_server.sh" + "${CC_DIR}/5_package_and_download_artifacts.sh" +) + +# Upload the files +echo "Uploading scripts to ~ on $INSTANCE_NAME..." +$GCLOUD_SCP_CMD "${FILES_TO_UPLOAD[@]}" "${INSTANCE_NAME}:~/" + +if [ $? -eq 0 ]; then + echo "" + echo "====================================================================" + echo "SUCCESS!" 
+ echo "====================================================================" + echo "" + echo "The following files were uploaded to the home directory on $INSTANCE_NAME:" + for file in "${FILES_TO_UPLOAD[@]}"; do + echo " - $(basename "$file")" + done + echo "" +else + echo "" + echo "Error: Upload failed." + exit 1 +fi