Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 142 additions & 0 deletions gcp_vm_scripts/cc_benchmarks/0_start_vm_h100.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
#!/bin/bash

# ==============================================================================
# Script: 0_start_vm_h100.sh
# Description: Provisions an H100 GPU VM (a3-highgpu-1g) for CC benchmarks.
# Supports multi-zone fallback and Confidential Compute (TDX).
#
# Usage:
# ./0_start_vm_h100.sh [--confidential] [--secure-boot] [--zone <zone>]
#
# Flags:
# --confidential : Enable Confidential Compute (TDX) for the VM.
# --secure-boot : Enable Shielded Secure Boot for the VM.
# --zone : Target a specific zone (overrides fallback list).
# ==============================================================================

# Configuration
PROJECT_ID="fx-gen-ai-sandbox"
VM_NAME="h100-test-vm"
MACHINE_TYPE="a3-highgpu-1g"
GPU_COUNT=1
DISK_SIZE=250
SERVICE_ACCOUNT="18209811701-compute@developer.gserviceaccount.com"
IMAGE="projects/ubuntu-os-accelerator-images/global/images/ubuntu-accelerator-2404-amd64-with-nvidia-580-v20251021"

# List of zones to try in sequential order (fallback mechanism)
ZONES=("us-central1-a" "us-central1-c" "europe-west4-b")

# Default Flags
CONFIDENTIAL_FLAG=""
SECURE_BOOT_FLAG="--no-shielded-secure-boot"

# --- Argument parsing --------------------------------------------------------
# Recognized flags: --confidential, --secure-boot, --zone <zone>.
while [[ "$#" -gt 0 ]]; do
    case "$1" in
        --confidential)
            # Request Intel TDX Confidential Computing on the instance.
            CONFIDENTIAL_FLAG="--confidential-compute-type=TDX"
            echo "Enabling Confidential Computing (TDX)..."
            ;;
        --secure-boot)
            # Turn on Shielded VM Secure Boot.
            SECURE_BOOT_FLAG="--shielded-secure-boot"
            echo "Enabling Shielded Secure Boot..."
            ;;
        --zone)
            # Replace the fallback zone list with a single user-supplied zone.
            # ${2:-} keeps this safe if --zone is the last argument.
            if [[ -n "${2:-}" && "$2" != --* ]]; then
                ZONES=("$2")
                echo "Targeting specific zone: $2"
                shift
            else
                echo "Error: --zone requires a value." >&2
                exit 1
            fi
            ;;
        *)
            # Fail fast on typos instead of silently ignoring them, matching
            # the error handling of the companion setup scripts.
            echo "Unknown argument: $1" >&2
            exit 1
            ;;
    esac
    shift
done

# --- VM creation with zone fallback ------------------------------------------
# Try each candidate zone in order; stop at the first successful create.
for ZONE in "${ZONES[@]}"; do
  # Derive the region by dropping the zone letter: "us-central1-a" -> "us-central1".
  REGION="${ZONE%-*}"
  echo "--------------------------------------------------------"
  echo "Attempting to start VM $VM_NAME in zone $ZONE..."
  echo "--------------------------------------------------------"

  # 1. Ensure the snapshot schedule exists in the target region.
  # Resource policies are regional; a failure here (typically "already
  # exists") is non-fatal by design.
  echo "Checking/Creating snapshot schedule 'default-schedule-1' in region $REGION..."
  gcloud compute resource-policies create snapshot-schedule default-schedule-1 \
    --project="$PROJECT_ID" \
    --region="$REGION" \
    --max-retention-days=14 \
    --on-source-disk-delete=keep-auto-snapshots \
    --daily-schedule \
    --start-time=00:00 \
    2>/dev/null || echo "Snapshot schedule already exists or could not be created."

  # 2. Pick the boot-disk type per zone; europe-west4 uses Hyperdisk with
  # explicit provisioned IOPS/throughput, other zones use pd-balanced.
  if [[ "$ZONE" == europe-west4* ]]; then
    DISK_TYPE="hyperdisk-balanced"
    DISK_EXTRAS=",provisioned-iops=6000,provisioned-throughput=890"
  else
    DISK_TYPE="pd-balanced"
    DISK_EXTRAS=""
  fi

  # 3. Create the instance. TDX / Secure Boot flags are applied if set.
  # NOTE: $SECURE_BOOT_FLAG and $CONFIDENTIAL_FLAG are intentionally left
  # unquoted so an empty value expands to no argument at all.
  # shellcheck disable=SC2086
  if gcloud compute instances create "$VM_NAME" \
    --project="$PROJECT_ID" \
    --zone="$ZONE" \
    --machine-type="$MACHINE_TYPE" \
    --network-interface="network-tier=PREMIUM,nic-type=GVNIC,stack-type=IPV4_ONLY,subnet=sandbox-vpc-default" \
    --metadata="enable-osconfig=TRUE" \
    --no-restart-on-failure \
    --maintenance-policy="TERMINATE" \
    --provisioning-model="SPOT" \
    --instance-termination-action="STOP" \
    --discard-local-ssds-at-termination-timestamp=true \
    --service-account="$SERVICE_ACCOUNT" \
    --scopes="https://www.googleapis.com/auth/devstorage.read_only,https://www.googleapis.com/auth/logging.write,https://www.googleapis.com/auth/monitoring.write,https://www.googleapis.com/auth/service.management.readonly,https://www.googleapis.com/auth/servicecontrol,https://www.googleapis.com/auth/trace.append" \
    --accelerator="count=$GPU_COUNT,type=nvidia-h100-80gb" \
    --create-disk="auto-delete=yes,boot=yes,device-name=$VM_NAME,disk-resource-policy=projects/$PROJECT_ID/regions/$REGION/resourcePolicies/default-schedule-1,image=$IMAGE,mode=rw,size=$DISK_SIZE,type=$DISK_TYPE$DISK_EXTRAS" \
    $SECURE_BOOT_FLAG \
    $CONFIDENTIAL_FLAG \
    --shielded-vtpm \
    --shielded-integrity-monitoring \
    --labels="goog-ops-agent-policy=v2-x86-template-1-4-0,goog-ec-src=vm_add-gcloud" \
    --reservation-affinity="none"; then

    echo "Successfully created VM in $ZONE."

    # 4. Post-creation: install the Ops Agent via a zonal OS policy.
    echo "Configuring Ops Agent..."
    # NOTE(review): verify the YAML nesting produced by this printf — the
    # indentation may have been collapsed in transit; compare against the
    # documented ops-agent policy file format.
    printf 'agentsRule:\n packageState: installed\n version: latest\ninstanceFilter:\n inclusionLabels:\n - labels:\n goog-ops-agent-policy: v2-x86-template-1-4-0\n' > config.yaml

    POLICY_NAME="goog-ops-agent-v2-x86-template-1-4-0-${ZONE}"
    echo "Applying Ops Agent policy: $POLICY_NAME"
    # Create the policy; if it already exists, fall back to updating it.
    gcloud compute instances ops-agents policies create "$POLICY_NAME" \
      --project="$PROJECT_ID" \
      --zone="$ZONE" \
      --file=config.yaml || \
    gcloud compute instances ops-agents policies update "$POLICY_NAME" \
      --project="$PROJECT_ID" \
      --zone="$ZONE" \
      --file=config.yaml || \
    echo "Warning: Failed to create or update Ops Agent policy, but VM is up." >&2

    echo "All set!"
    exit 0
  else
    echo "Failed to create VM in $ZONE. Trying next zone..."
  fi
done

echo "Error: Could not start VM in any of the specified zones." >&2
exit 1
116 changes: 116 additions & 0 deletions gcp_vm_scripts/cc_benchmarks/1_setup_environment.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#!/bin/bash

# ==============================================================================
# Script 1: Setup Environment
#
# Purpose:
# This script performs a one-time setup for a new machine by installing
# Docker and the NVIDIA Container Toolkit. It is idempotent, meaning it
# can be safely re-run without causing issues.
#
# Usage:
# ./1_setup_environment.sh
# ==============================================================================

# Exit immediately if a command exits with a non-zero status.
set -e

# --- Argument parsing --------------------------------------------------------
# --confidential : also run the Confidential Computing (TDX) GPU setup steps.
# (Removed GitHub review-page text that had been pasted into the script here;
# it was not shell and broke execution.)
CONFIDENTIAL=false
while [[ "$#" -gt 0 ]]; do
    case "$1" in
        --confidential) CONFIDENTIAL=true ;;
        *) echo "Unknown parameter passed: $1"; exit 1 ;;
    esac
    shift
done

echo "--- [Step 1/1] Setting up environment ---"

# --- Install Docker (idempotent) ---------------------------------------------
# Skips cleanly when docker is already on PATH; otherwise adds Docker's
# signed APT repository and installs the engine + plugins.
if command -v docker &> /dev/null; then
    echo "Docker is already installed. Skipping installation."
else
    echo "Installing Docker..."
    sudo apt-get update
    sudo apt-get install -y ca-certificates curl
    sudo install -m 0755 -d /etc/apt/keyrings
    sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
    sudo chmod a+r /etc/apt/keyrings/docker.asc
    # Resolve the Ubuntu codename first to keep the repo line on one clean line.
    codename="$(. /etc/os-release && echo "$VERSION_CODENAME")"
    echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu ${codename} stable" | \
        sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
    # This second apt-get update IS required: the Docker repository was only
    # just added above, so its package lists must be fetched.
    sudo apt-get update
    sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
    echo "Docker installation complete."
fi

# Enable Confidential Computing before verifying CUDA Toolkit.
if [ "$CONFIDENTIAL" = true ]; then
echo "Enabling Confidential Computing..."
# Enable Linux Kernel Crypto API
# The modprobe "install" rule below loads ecdsa_generic and ecdh before the
# nvidia module. NOTE(review): presumably these crypto modules are required
# for the driver's SPDM/attestation path — confirm against NVIDIA's
# Confidential Computing deployment guide.
echo "install nvidia /sbin/modprobe ecdsa_generic; /sbin/modprobe ecdh; /sbin/modprobe --ignore-install nvidia" | sudo tee /etc/modprobe.d/nvidia-lkca.conf
# Rebuild the initramfs so the modprobe rule is honored at the next boot.
sudo update-initramfs -u

# Enable Confidential Compute GPUs Ready state
sudo nvidia-smi conf-compute -srs 1

# Set startup unit to enable Confidential Compute GPUs Ready state on each boot
sudo tee /etc/systemd/system/cc-gpu-ready.service > /dev/null << 'EOF'
[Unit]
Description=Set Confidential Compute GPU to Ready mode
After=multi-user.target
Wants=nvidia-persistenced.service

[Service]
Type=oneshot
ExecStartPre=/bin/sleep 2
ExecStart=/usr/bin/nvidia-smi conf-compute -srs 1
ExecStartPost=/usr/bin/nvidia-smi conf-compute -grs
RemainAfterExit=true

[Install]
WantedBy=multi-user.target
EOF

# Register and enable the unit so the Ready state is restored on every boot.
sudo systemctl daemon-reload
sudo systemctl enable cc-gpu-ready.service

# Print current CC state so the operator can verify it by eye.
nvidia-smi conf-compute -f # should say CC status: ON
nvidia-smi conf-compute -grs # should say ready
fi

# --- Install NVIDIA Container Toolkit (idempotent) ---------------------------
# Query dpkg's status database instead of `dpkg -l | grep -q`, which can
# match unrelated packages by substring and also matches packages that were
# removed but not purged.
if dpkg -s nvidia-container-toolkit 2>/dev/null | grep -q '^Status: install ok installed'; then
    echo "NVIDIA Container Toolkit is already installed. Skipping installation."
else
    echo "Installing NVIDIA Container Toolkit..."
    # Add NVIDIA's signed APT repository, then install the toolkit and wire
    # the NVIDIA runtime into Docker.
    curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
        && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
        sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
        sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
    sudo apt-get update
    sudo apt-get install -y nvidia-container-toolkit
    sudo nvidia-ctk runtime configure --runtime=docker
    sudo systemctl restart docker
    echo "NVIDIA Container Toolkit installation complete."
fi

echo "--- Environment setup is complete. ---"

# Brief pause before reconfiguring services. NOTE(review): presumably this
# gives the docker restart above time to settle — confirm it is needed.
sleep 3
echo "Enabling persistence mode..."

# Enable persistence mode to establish a secure Security Protocol and Data Model (SPDM) connection
# Drop-in override for the stock nvidia-persistenced unit: clear its
# ExecStart and relaunch it with UVM persistence mode enabled.
sudo mkdir -p /etc/systemd/system/nvidia-persistenced.service.d
cat <<EOF | sudo tee /etc/systemd/system/nvidia-persistenced.service.d/override.conf
[Service]
# Clear the original ExecStart then provide our desired command:
ExecStart=
ExecStart=/usr/bin/nvidia-persistenced --user nvidia-persistenced --uvm-persistence-mode --verbose
EOF

# Reload unit definitions, enable the service, then reboot so the override
# (and, on the confidential path, the rebuilt initramfs) takes effect.
sudo systemctl daemon-reload
sudo systemctl enable nvidia-persistenced.service
sudo reboot
128 changes: 128 additions & 0 deletions gcp_vm_scripts/cc_benchmarks/2_start_server.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
#!/bin/bash

# ==============================================================================
# Script 2: Start TRT-LLM Server (CC Benchmarks)
#
# Purpose:
# Starts the TRT-LLM server in a detached Docker container with the
# optimal configuration for CC Benchmarks (H100).
#
# Usage:
# ./2_start_server.sh --hardware <HW> --model <MODEL>
#
# Example:
# ./2_start_server.sh --hardware H100 --model Qwen
# ==============================================================================

# Abort on the first failing command.
set -e

# --- Configuration ---
DOCKER_IMAGE="nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc1"
CONTAINER_NAME="trtllm_server"

# Per-(model, hardware) serving configuration, keyed as "<Model>_<HW>".
declare -A MODEL_PATHS=(
    ["Qwen_H100"]="Qwen/Qwen3-30B-A3B"
    ["Mistral_H100"]="mistralai/Mistral-7B-v0.1"
)

# Maximum concurrent batch size passed to trtllm-serve.
declare -A MAX_BATCH_SIZES=(
    ["Qwen_H100"]=512
    ["Mistral_H100"]=2056
)

# Tensor-parallel degree per configuration.
declare -A TP_SIZES=(
    ["Qwen_H100"]=1
    ["Mistral_H100"]=1
)

# --- Helper Functions ---

# Print invocation help on stdout and terminate the script with status 1.
usage() {
    printf 'Usage: %s --hardware H100 --model <Qwen|Mistral>\n' "$0"
    exit 1
}

# --- Main Execution ---
# Parses CLI flags, validates the hardware/model combination, starts a
# detached container, and launches trtllm-serve inside it.
main() {
    local HARDWARE=""
    local MODEL=""

    # Both flags take a value; anything else prints usage and exits.
    while [[ "$#" -gt 0 ]]; do
        case "$1" in
            --hardware) HARDWARE="$2"; shift ;;
            --model) MODEL="$2"; shift ;;
            *) usage ;;
        esac
        shift
    done

    if [ -z "$HARDWARE" ] || [ -z "$MODEL" ]; then
        echo "Error: --hardware and --model are required arguments."
        usage
    fi

    # Look up the serving configuration for this combination.
    local CONFIG_KEY="${MODEL}_${HARDWARE}"
    local MODEL_PATH="${MODEL_PATHS[$CONFIG_KEY]}"
    local MAX_BATCH_SIZE="${MAX_BATCH_SIZES[$CONFIG_KEY]}"
    local TP_SIZE="${TP_SIZES[$CONFIG_KEY]}"

    if [ -z "$MODEL_PATH" ]; then
        echo "Error: Invalid hardware/model combination. Only H100 with Qwen or Mistral is supported."
        exit 1
    fi

    echo "--- [Step 1/2] Starting server with configuration ---"
    echo "Hardware: $HARDWARE"
    echo "Model: $MODEL"
    echo "Model Path: $MODEL_PATH"
    echo "Max Batch Size: $MAX_BATCH_SIZE"
    echo "TP Size: $TP_SIZE"
    echo "----------------------------------------------------"

    # Refuse to clobber a live server.
    if [ "$(sudo docker ps -q -f name="$CONTAINER_NAME")" ]; then
        echo "Error: A container with the name '$CONTAINER_NAME' is already running."
        echo "Please stop it first by running ./4_stop_server.sh"
        exit 1
    fi

    # One forced removal covers both stopped and any other existing
    # containers (the separate "status=exited" pass was redundant).
    if [ "$(sudo docker ps -aq -f name="$CONTAINER_NAME")" ]; then
        echo "Removing existing container..."
        sudo docker rm -f "$CONTAINER_NAME"
    fi

    echo "Creating local directory for artifacts..."
    mkdir -p ~/llm_benchmarks/artifacts
    echo "Creating local directory for scripts..."
    mkdir -p ~/scripts
    echo "Creating local directory for genai-bench output..."
    mkdir -p ~/genai-bench-output

    echo "Starting Docker container '$CONTAINER_NAME' in detached mode..."
    if [ -z "${HF_TOKEN:-}" ]; then
        echo "HF_TOKEN is not set. Please enter your Hugging Face token:"
        read -r -s HF_TOKEN
        # SECURITY: never echo the token itself; confirm receipt only.
        echo "HF_TOKEN received (${#HF_TOKEN} characters)."
    fi

    # The container idles on "sleep infinity"; the server is exec'd below.
    # HF_TOKEN is quoted so tokens with special characters survive intact.
    sudo docker run --ipc host --gpus all -p 8000:8000 \
        -v ~/llm_benchmarks/artifacts:/app/tensorrt_llm/artifacts \
        -v ~/scripts:/app/scripts \
        -v ~/genai-bench-output:/genai-bench \
        -e HF_TOKEN="$HF_TOKEN" \
        -d --name "$CONTAINER_NAME" "$DOCKER_IMAGE" sleep infinity

    echo "--- [Step 2/2] Launching trtllm-serve inside the container ---"

    local EXEC_CMD="trtllm-serve \"$MODEL_PATH\" \
        --host 0.0.0.0 \
        --max_batch_size $MAX_BATCH_SIZE \
        --max_num_tokens 16384 \
        --max_seq_len 16384 \
        --tp_size $TP_SIZE"

    # Detached exec so the server keeps running after this script exits;
    # server output goes to a log file inside the container.
    sudo docker exec -d "$CONTAINER_NAME" bash -c "$EXEC_CMD > /var/log/trtllm_server.log 2>&1 &"

    echo "Server is starting in the background. It may take a few minutes to become ready."
    echo "You can check the logs with: sudo docker exec -it $CONTAINER_NAME tail -f /var/log/trtllm_server.log"
    echo "To get an interactive shell inside the container, run: sudo docker exec -it $CONTAINER_NAME bash"
}

# Entry point: forward this script's arguments to main().
main "$@"
Loading
Loading