# NOTE(review): the following lines are GitHub web-UI chrome captured when this
# workflow file was copied from the run page; kept as comments so the file can
# parse as YAML without losing the original capture context.
# Skip to content
# trampoline.ipynb
# trampoline.ipynb #11
# Workflow file for this run
# File: trampoline.yml
# Code: Claude Code and Codex
# Review: Ryoichi Ando (ryoichi.ando@zozo.com)
# License: Apache v2.0
#
# Manually-dispatched workflow: provisions a temporary GPU EC2 instance,
# runs the trampoline.ipynb example on it 10 times, collects logs, and
# tears everything down.
name: trampoline.ipynb
on:
  workflow_dispatch:
    inputs:
      instance_type:
        description: 'EC2 instance type'
        required: true
        default: 'g6e.2xlarge'
        type: choice
        options:
          - g6.2xlarge
          - g6e.2xlarge
      region:
        description: 'AWS Region'
        required: true
        default: 'us-east-2'
        type: choice
        options:
          - us-east-1
          - us-east-2
          - ap-northeast-1
jobs:
  run-on-gpu:
    name: Run on GPU Instance
    runs-on: ubuntu-latest
    permissions:
      id-token: write  # required to assume the AWS role via OIDC
      contents: read
    env:
      AWS_REGION: ${{ github.event.inputs.region }}
      INSTANCE_TYPE: ${{ github.event.inputs.instance_type }}
      BRANCH: ${{ github.ref_name }}
      EXAMPLE: trampoline
      # Remote login account on the EC2 instance. Note: USER deliberately
      # shadows the runner's own USER variable in every step below.
      WORKDIR: /home/ubuntu
      USER: ubuntu
steps:
- name: Show input parameters
run: |
echo "## Input Parameters"
echo "Branch: ${{ github.ref_name }}"
echo "Instance Type: ${{ github.event.inputs.instance_type }}"
echo "Region: ${{ github.event.inputs.region }}"
- name: Checkout repository
uses: actions/checkout@v4
- name: Configure AWS credentials via OIDC
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
aws-region: ${{ env.AWS_REGION }}
- name: Verify AWS authentication
run: |
echo "Testing AWS authentication..."
aws sts get-caller-identity
echo "AWS Region: $AWS_REGION"
echo "Instance Type: $INSTANCE_TYPE"
echo "Branch: $BRANCH"
echo "Example: $EXAMPLE"
- name: Get GitHub Actions runner public IP
id: runner-ip
run: |
echo "Fetching GitHub Actions runner public IP..."
RUNNER_IP=$(curl -s --max-time 10 https://checkip.amazonaws.com | tr -d '\n')
if [ -z "$RUNNER_IP" ]; then
echo "ERROR: Failed to get IP from checkip.amazonaws.com"
exit 1
fi
echo "::add-mask::$RUNNER_IP"
echo "RUNNER_IP=$RUNNER_IP" >> $GITHUB_OUTPUT
echo "GitHub Actions Runner IP: $RUNNER_IP"
- name: Find Deep Learning AMI
id: ami
run: |
echo "Finding latest Deep Learning AMI with GPU support..."
AMI_ID=$(aws ec2 describe-images \
--owners amazon \
--filters \
"Name=name,Values=Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 24.04)*" \
"Name=state,Values=available" \
"Name=architecture,Values=x86_64" \
--query 'sort_by(Images, &CreationDate)[-1].ImageId' \
--region "$AWS_REGION" \
--output text)
if [ "$AMI_ID" = "None" ] || [ -z "$AMI_ID" ]; then
echo "ERROR: Deep Learning AMI not found in region $AWS_REGION"
echo "This workflow requires the Deep Learning AMI with pre-installed NVIDIA drivers"
echo "Please check if the AMI is available in your selected region"
exit 1
fi
echo "AMI_ID=$AMI_ID" >> $GITHUB_OUTPUT
echo "Found AMI: $AMI_ID"
- name: Get default VPC ID
id: vpc
run: |
echo "Getting default VPC ID..."
VPC_ID=$(aws ec2 describe-vpcs \
--filters "Name=isDefault,Values=true" \
--query 'Vpcs[0].VpcId' \
--region "$AWS_REGION" \
--output text)
if [ "$VPC_ID" = "None" ] || [ -z "$VPC_ID" ]; then
echo "ERROR: Default VPC not found in region $AWS_REGION"
exit 1
fi
echo "VPC_ID=$VPC_ID" >> $GITHUB_OUTPUT
echo "Default VPC: $VPC_ID"
- name: Generate unique identifiers
id: ids
run: |
TIMESTAMP=$(date +%Y%m%d%H%M%S)
RANDOM_SUFFIX=$(head /dev/urandom | tr -dc a-z0-9 | head -c 6)
TEMP_INSTANCE_ID="temp-${TIMESTAMP}-${RANDOM_SUFFIX}"
# Generate random SSH port (10001-65535)
SSH_PORT=$((10001 + RANDOM % 55535))
echo "::add-mask::$SSH_PORT"
echo "TIMESTAMP=$TIMESTAMP" >> $GITHUB_OUTPUT
echo "TEMP_INSTANCE_ID=$TEMP_INSTANCE_ID" >> $GITHUB_OUTPUT
echo "SSH_PORT=$SSH_PORT" >> $GITHUB_OUTPUT
echo "Temporary Instance ID: $TEMP_INSTANCE_ID"
echo "SSH Port: $SSH_PORT"
- name: Setup persistent security group
id: security-group
run: |
echo "Setting up persistent security group 'github-actions-persistent'..."
SG_NAME="github-actions-persistent"
SG_DESCRIPTION="Persistent security group for GitHub Actions with dynamic rules"
# Check if security group already exists
SG_ID=$(aws ec2 describe-security-groups \
--filters "Name=group-name,Values=$SG_NAME" \
--query 'SecurityGroups[0].GroupId' \
--region "$AWS_REGION" \
--output text || echo "")
if [ "$SG_ID" = "None" ] || [ -z "$SG_ID" ]; then
echo "Security group does not exist. Creating new one..."
# Create security group
SG_ID=$(aws ec2 create-security-group \
--group-name "$SG_NAME" \
--description "$SG_DESCRIPTION" \
--vpc-id "${{ steps.vpc.outputs.VPC_ID }}" \
--query 'GroupId' \
--region "$AWS_REGION" \
--output text)
echo "Security Group created: $SG_ID"
# Tag the security group
aws ec2 create-tags \
--resources "$SG_ID" \
--tags \
"Key=Name,Value=$SG_NAME" \
"Key=ManagedBy,Value=GitHubActions" \
"Key=Purpose,Value=PersistentDynamicRules" \
"Key=CreatedAt,Value=$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
--region "$AWS_REGION"
echo "Security Group tagged successfully"
else
echo "Using existing security group: $SG_ID"
fi
echo "SG_ID=$SG_ID" >> $GITHUB_OUTPUT
# Add only custom SSH port (no port 22)
echo "Adding ingress rule for runner IP on port ${{ steps.ids.outputs.SSH_PORT }}"
aws ec2 authorize-security-group-ingress \
--group-id "$SG_ID" \
--ip-permissions \
"IpProtocol=tcp,FromPort=${{ steps.ids.outputs.SSH_PORT }},ToPort=${{ steps.ids.outputs.SSH_PORT }},IpRanges=[{CidrIp=${{ steps.runner-ip.outputs.RUNNER_IP }}/32,Description='GHA Run ${{ github.run_id }} Port ${{ steps.ids.outputs.SSH_PORT }}'}]" \
--region "$AWS_REGION" 2>&1 || echo "Note: Rule may already exist"
echo "RUNNER_IP_CIDR=${{ steps.runner-ip.outputs.RUNNER_IP }}/32" >> $GITHUB_OUTPUT
echo "SSH_PORT=${{ steps.ids.outputs.SSH_PORT }}" >> $GITHUB_OUTPUT
echo "SSH ingress rule added successfully (custom port only)"
RULE_COUNT=$(aws ec2 describe-security-groups \
--group-ids "$SG_ID" \
--query 'length(SecurityGroups[0].IpPermissions)' \
--region "$AWS_REGION" \
--output text)
echo "Security group has $RULE_COUNT active ingress rule(s)"
- name: Retrieve SSH key from Parameter Store
id: keypair
run: |
echo "Retrieving SSH private key from AWS Systems Manager..."
KEY_NAME="${{ secrets.AWS_KEY_PAIR_NAME }}"
# Retrieve the SSH private key from Parameter Store
aws ssm get-parameter \
--name "/github-actions/ec2/ssh-key" \
--with-decryption \
--query 'Parameter.Value' \
--region "$AWS_REGION" \
--output text > /tmp/github-actions-ec2.pem
chmod 600 /tmp/github-actions-ec2.pem
echo "SSH key retrieved successfully"
echo "KEY_PATH=/tmp/github-actions-ec2.pem" >> $GITHUB_OUTPUT
- name: Create user data script
run: |
echo '#!/bin/bash' > /tmp/user-data.sh
echo 'set -x' >> /tmp/user-data.sh
echo 'exec > >(tee /var/log/user-data.log) 2>&1' >> /tmp/user-data.sh
echo '' >> /tmp/user-data.sh
echo 'echo "=== User Data Script Started ==="' >> /tmp/user-data.sh
echo '' >> /tmp/user-data.sh
echo '# Wait for system to be ready' >> /tmp/user-data.sh
echo 'sleep 5' >> /tmp/user-data.sh
echo '' >> /tmp/user-data.sh
echo '# Create SSH privilege separation directory' >> /tmp/user-data.sh
echo 'echo "Creating /run/sshd directory"' >> /tmp/user-data.sh
echo 'mkdir -p /run/sshd' >> /tmp/user-data.sh
echo 'chmod 0755 /run/sshd' >> /tmp/user-data.sh
echo '' >> /tmp/user-data.sh
echo '# Configure custom SSH port' >> /tmp/user-data.sh
echo 'echo "Configuring SSH port to '"${{ steps.ids.outputs.SSH_PORT }}"'"' >> /tmp/user-data.sh
echo 'perl -pi -e "s/^#?Port 22$/Port '"${{ steps.ids.outputs.SSH_PORT }}"'/" /etc/ssh/sshd_config' >> /tmp/user-data.sh
echo '' >> /tmp/user-data.sh
echo '# Ensure Port directive exists' >> /tmp/user-data.sh
echo 'if ! grep -q "^Port '"${{ steps.ids.outputs.SSH_PORT }}"'" /etc/ssh/sshd_config; then' >> /tmp/user-data.sh
echo ' echo "Port '"${{ steps.ids.outputs.SSH_PORT }}"'" >> /etc/ssh/sshd_config' >> /tmp/user-data.sh
echo 'fi' >> /tmp/user-data.sh
echo '' >> /tmp/user-data.sh
echo 'echo "SSH config after modification:"' >> /tmp/user-data.sh
echo 'grep "^Port" /etc/ssh/sshd_config' >> /tmp/user-data.sh
echo '' >> /tmp/user-data.sh
echo '# Disable systemd socket activation' >> /tmp/user-data.sh
echo 'echo "Disabling socket activation"' >> /tmp/user-data.sh
echo 'systemctl stop ssh.socket' >> /tmp/user-data.sh
echo 'systemctl disable ssh.socket' >> /tmp/user-data.sh
echo '' >> /tmp/user-data.sh
echo '# Test SSH configuration' >> /tmp/user-data.sh
echo 'echo "Testing SSH configuration"' >> /tmp/user-data.sh
echo 'sshd -t' >> /tmp/user-data.sh
echo 'if [ $? -eq 0 ]; then' >> /tmp/user-data.sh
echo ' echo "SSH config valid, restarting SSH service"' >> /tmp/user-data.sh
echo ' systemctl restart ssh.service' >> /tmp/user-data.sh
echo ' sleep 2' >> /tmp/user-data.sh
echo ' systemctl status ssh.service' >> /tmp/user-data.sh
echo ' echo "Checking listening ports:"' >> /tmp/user-data.sh
echo ' ss -tlnp | grep sshd || netstat -tlnp | grep sshd' >> /tmp/user-data.sh
echo ' echo "SSH reconfiguration successful"' >> /tmp/user-data.sh
echo 'else' >> /tmp/user-data.sh
echo ' echo "ERROR: SSH config invalid"' >> /tmp/user-data.sh
echo ' exit 1' >> /tmp/user-data.sh
echo 'fi' >> /tmp/user-data.sh
echo '' >> /tmp/user-data.sh
echo '# Install Rust (needed for cargo build)' >> /tmp/user-data.sh
echo 'curl --proto '"'"'=https'"'"' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y' >> /tmp/user-data.sh
echo 'source "$HOME/.cargo/env"' >> /tmp/user-data.sh
echo '' >> /tmp/user-data.sh
echo '# Verify nvidia-smi is available' >> /tmp/user-data.sh
echo 'if command -v nvidia-smi &> /dev/null; then' >> /tmp/user-data.sh
echo ' echo "NVIDIA drivers confirmed"' >> /tmp/user-data.sh
echo ' nvidia-smi' >> /tmp/user-data.sh
echo 'else' >> /tmp/user-data.sh
echo ' echo "Warning: nvidia-smi not found"' >> /tmp/user-data.sh
echo 'fi' >> /tmp/user-data.sh
echo '' >> /tmp/user-data.sh
echo '# Create workspace directory' >> /tmp/user-data.sh
echo 'mkdir -p ${WORKDIR}/workspace' >> /tmp/user-data.sh
echo 'chown -R ${USER}:${USER} ${WORKDIR}/workspace' >> /tmp/user-data.sh
echo '' >> /tmp/user-data.sh
echo 'nvidia-smi | tee /tmp/nvidia-smi-output.txt' >> /tmp/user-data.sh
echo 'touch /tmp/setup-complete' >> /tmp/user-data.sh
echo 'echo "=== User Data Script Complete ==="' >> /tmp/user-data.sh
- name: Launch EC2 instance
id: instance
run: |
echo "Launching EC2 instance with SSH configured on port ${{ steps.ids.outputs.SSH_PORT }}..."
# Base64 encode for AWS
USER_DATA=$(base64 -w 0 /tmp/user-data.sh)
INSTANCE_ID=$(aws ec2 run-instances \
--image-id "${{ steps.ami.outputs.AMI_ID }}" \
--instance-type "$INSTANCE_TYPE" \
--key-name "${{ secrets.AWS_KEY_PAIR_NAME }}" \
--security-group-ids "${{ steps.security-group.outputs.SG_ID }}" \
--user-data "$USER_DATA" \
--block-device-mappings "DeviceName=/dev/sda1,Ebs={VolumeSize=256,VolumeType=gp3,DeleteOnTermination=true}" \
--tag-specifications \
"ResourceType=instance,Tags=[\
{Key=Name,Value=gpu-runner-${{ steps.ids.outputs.TIMESTAMP }}},\
{Key=ManagedBy,Value=GitHubActions},\
{Key=Purpose,Value=GPURunner},\
{Key=Workflow,Value=${{ github.workflow }}},\
{Key=RunId,Value=${{ github.run_id }}},\
{Key=Branch,Value=${{ env.BRANCH }}},\
{Key=Example,Value=${{ env.EXAMPLE }}}\
]" \
"ResourceType=volume,Tags=[\
{Key=Name,Value=gpu-runner-${{ steps.ids.outputs.TIMESTAMP }}-volume},\
{Key=ManagedBy,Value=GitHubActions},\
{Key=Purpose,Value=GPURunner},\
{Key=Workflow,Value=${{ github.workflow }}},\
{Key=Example,Value=${{ env.EXAMPLE }}}\
]" \
--instance-initiated-shutdown-behavior terminate \
--query 'Instances[0].InstanceId' \
--region "$AWS_REGION" \
--output text)
echo "INSTANCE_ID=$INSTANCE_ID" >> $GITHUB_OUTPUT
echo "Instance launched: $INSTANCE_ID"
- name: Wait for instance to be running
run: |
echo "Waiting for instance to be running..."
aws ec2 wait instance-running \
--instance-ids "${{ steps.instance.outputs.INSTANCE_ID }}" \
--region "$AWS_REGION"
PUBLIC_IP=$(aws ec2 describe-instances \
--instance-ids "${{ steps.instance.outputs.INSTANCE_ID }}" \
--query 'Reservations[0].Instances[0].PublicIpAddress' \
--region "$AWS_REGION" \
--output text)
echo "::add-mask::$PUBLIC_IP"
echo "PUBLIC_IP=$PUBLIC_IP" >> $GITHUB_ENV
echo "Instance is running at: $PUBLIC_IP"
- name: Wait for cloud-init and SSH on custom port
run: |
echo "Waiting for cloud-init to complete and SSH to be available on port ${{ steps.ids.outputs.SSH_PORT }}..."
# Wait longer initially to allow cloud-init to run
echo "Waiting 60 seconds for cloud-init to start..."
sleep 60
MAX_ATTEMPTS=40
ATTEMPT=0
while [ $ATTEMPT -lt $MAX_ATTEMPTS ]; do
if ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
-i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} "echo 'SSH ready on custom port'" 2>/dev/null; then
echo "SSH connection established on port ${{ steps.ids.outputs.SSH_PORT }}"
break
else
ATTEMPT=$((ATTEMPT + 1))
if [ $ATTEMPT -eq $MAX_ATTEMPTS ]; then
echo "Failed to establish SSH connection on port ${{ steps.ids.outputs.SSH_PORT }} after $MAX_ATTEMPTS attempts"
echo "Attempting to fetch console output for debugging..."
aws ec2 get-console-output \
--instance-id "${{ steps.instance.outputs.INSTANCE_ID }}" \
--region "$AWS_REGION" \
--output text || echo "Could not fetch console output"
exit 1
fi
echo "Attempt $ATTEMPT/$MAX_ATTEMPTS failed, retrying in 10 seconds..."
sleep 10
fi
done
- name: Wait for instance setup
run: |
echo "Waiting for instance setup to complete..."
MAX_WAIT=300
ELAPSED=0
while [ $ELAPSED -lt $MAX_WAIT ]; do
if ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
-i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
"test -f /tmp/setup-complete" 2>/dev/null; then
echo "Instance setup completed"
break
else
sleep 10
ELAPSED=$((ELAPSED + 10))
if [ $ELAPSED -ge $MAX_WAIT ]; then
echo "Setup timeout, continuing anyway..."
break
fi
fi
done
- name: Create archive of repository
run: |
echo "Creating repository archive..."
git archive --format=tar.gz --output=/tmp/repo.tar.gz HEAD
- name: Transfer repository to instance
run: |
echo "Transferring repository to instance..."
scp -P ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
-o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
-i "${{ steps.keypair.outputs.KEY_PATH }}" \
/tmp/repo.tar.gz ${USER}@${PUBLIC_IP}:${WORKDIR}/
echo "Extracting repository on instance..."
ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
-o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
-i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
"cd ${WORKDIR} && tar -xzf repo.tar.gz && rm repo.tar.gz"
- name: Setup Python environment and run warmup
run: |
echo "Setting up Python environment and running warmup.py..."
ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
-o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
-i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} << 'ENDSSH'
set -e
cd ${WORKDIR}
# Run warmup.py
echo "Running warmup.py..."
python3 warmup.py --skip-confirmation
echo "Warmup completed"
ENDSSH
- name: Build Rust project
run: |
echo "Building Rust project with cargo..."
ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
-o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
-i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} << 'ENDSSH'
set -e
cd ${WORKDIR}
# Setup Rust environment
source "$HOME/.cargo/env"
# Build the project
echo "Running cargo build --release..."
cargo build --release
echo "Cargo build completed"
ENDSSH
- name: Convert assertion notebook to Python script
run: |
echo "Converting assertion notebook: examples/fail-examples/assertion.ipynb"
cat > /tmp/convert_assertion.sh << 'SCRIPTEOF'
#!/bin/bash
set -e
cd $WORKDIR
source ~/.local/share/ppf-cts/venv/bin/activate
jupyter nbconvert --to python "examples/fail-examples/assertion.ipynb" --output "/tmp/assertion_base.py"
cat > /tmp/assertion.py << 'PYEOF'
import sys
import os
sys.path.insert(0, '$WORKDIR')
sys.path.insert(0, '$WORKDIR/frontend')
os.environ['PYTHONPATH'] = '$WORKDIR:$WORKDIR/frontend:' + os.environ.get('PYTHONPATH', '')
PYEOF
cat "/tmp/assertion_base.py" >> /tmp/assertion.py
echo "Assertion script prepared at /tmp/assertion.py"
SCRIPTEOF
scp -P ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
-o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
-i "${{ steps.keypair.outputs.KEY_PATH }}" \
/tmp/convert_assertion.sh ${USER}@${PUBLIC_IP}:/tmp/
ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
-o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
-i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
"chmod +x /tmp/convert_assertion.sh && WORKDIR='${WORKDIR}' /tmp/convert_assertion.sh"
- name: Run assertion test (expect failure)
run: |
echo "Running assertion test to verify error propagation via SSH..."
echo "This test uses the same execution pattern as main examples"
echo "Expected result: FAILURE (AssertionError)"
# Run using the exact same pattern as the main example iterations
if ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
-o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
-i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
"set -o pipefail && echo 'assertion' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/assertion.py 2>&1 | tee /tmp/ci/assertion.log"; then
echo "ERROR: Assertion test should have failed but succeeded"
echo "This means errors are NOT being propagated correctly!"
exit 1
else
echo "SUCCESS: Assertion test failed as expected"
echo "Error propagation via SSH is working correctly"
echo "Main example tests can now proceed with confidence"
fi
- name: Convert notebook to Python script
run: |
echo "Converting notebook example: ${EXAMPLE}.ipynb to Python script"
cat > /tmp/convert_notebook.sh << SCRIPT
#!/bin/bash
set -e
cd ${WORKDIR}
# Create CI directory structure for collecting results
mkdir -p /tmp/ci
echo "CI directory created for collecting results"
# Activate Python environment
source ~/.local/share/ppf-cts/venv/bin/activate
# Convert notebook to Python script
echo "Converting notebook examples/\${EXAMPLE}.ipynb to Python script..."
jupyter nbconvert --to python "examples/\${EXAMPLE}.ipynb" --output "/tmp/\${EXAMPLE}_base.py"
# Create the runnable script with proper imports
cat > /tmp/\${EXAMPLE}.py << 'EOF'
import sys
import os
# Add the repository root to Python path so frontend can be imported
sys.path.insert(0, '${WORKDIR}')
sys.path.insert(0, '${WORKDIR}/frontend')
# Set environment variables if needed
os.environ['PYTHONPATH'] = '${WORKDIR}:${WORKDIR}/frontend:' + os.environ.get('PYTHONPATH', '')
# Now run the converted notebook
EOF
# Append the converted notebook content
cat "/tmp/\${EXAMPLE}_base.py" >> /tmp/\${EXAMPLE}.py
echo "Script prepared at /tmp/\${EXAMPLE}.py"
SCRIPT
scp -P ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
-o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
-i "${{ steps.keypair.outputs.KEY_PATH }}" \
/tmp/convert_notebook.sh ${USER}@${PUBLIC_IP}:/tmp/
ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
-o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
-i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
"chmod +x /tmp/convert_notebook.sh && EXAMPLE='${EXAMPLE}' WORKDIR='${WORKDIR}' /tmp/convert_notebook.sh"
- name: Run 1st iteration
run: |
echo "Running 1st iteration of ${EXAMPLE}"
ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
-o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
-i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
"set -o pipefail && echo '1st' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/${EXAMPLE}.py 2>&1 | tee /tmp/ci/run_1.log"
- name: Run 2nd iteration
run: |
echo "Running 2nd iteration of ${EXAMPLE}"
ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
-o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
-i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
"set -o pipefail && echo '2nd' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/${EXAMPLE}.py 2>&1 | tee /tmp/ci/run_2.log"
- name: Run 3rd iteration
run: |
echo "Running 3rd iteration of ${EXAMPLE}"
ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
-o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
-i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
"set -o pipefail && echo '3rd' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/${EXAMPLE}.py 2>&1 | tee /tmp/ci/run_3.log"
- name: Run 4th iteration
run: |
echo "Running 4th iteration of ${EXAMPLE}"
ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
-o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
-i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
"set -o pipefail && echo '4th' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/${EXAMPLE}.py 2>&1 | tee /tmp/ci/run_4.log"
- name: Run 5th iteration
run: |
echo "Running 5th iteration of ${EXAMPLE}"
ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
-o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
-i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
"set -o pipefail && echo '5th' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/${EXAMPLE}.py 2>&1 | tee /tmp/ci/run_5.log"
- name: Run 6th iteration
run: |
echo "Running 6th iteration of ${EXAMPLE}"
ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
-o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
-i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
"set -o pipefail && echo '6th' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/${EXAMPLE}.py 2>&1 | tee /tmp/ci/run_6.log"
- name: Run 7th iteration
run: |
echo "Running 7th iteration of ${EXAMPLE}"
ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
-o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
-i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
"set -o pipefail && echo '7th' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/${EXAMPLE}.py 2>&1 | tee /tmp/ci/run_7.log"
- name: Run 8th iteration
run: |
echo "Running 8th iteration of ${EXAMPLE}"
ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
-o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
-i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
"set -o pipefail && echo '8th' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/${EXAMPLE}.py 2>&1 | tee /tmp/ci/run_8.log"
- name: Run 9th iteration
run: |
echo "Running 9th iteration of ${EXAMPLE}"
ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
-o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
-i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
"set -o pipefail && echo '9th' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/${EXAMPLE}.py 2>&1 | tee /tmp/ci/run_9.log"
- name: Run 10th iteration
run: |
echo "Running 10th iteration of ${EXAMPLE}"
ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
-o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
-i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
"set -o pipefail && echo '10th' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/${EXAMPLE}.py 2>&1 | tee /tmp/ci/run_10.log"
- name: Collect results
if: success() || failure()
run: |
echo "Collecting results from all runs..."
mkdir -p ci
# Delete large binary files on remote before copying to save bandwidth
# CI output is in ppf-cts cache directory: ~/.cache/ppf-cts/ci
ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
-o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
-i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
"find ~/.cache/ppf-cts/ci -type f \( -name '*.bin' -o -name '*.pickle' -o -name '*.ply' -o -name '*.gz' \) -delete 2>/dev/null" || true
# Copy CI output from ppf-cts cache directory (session data, previews, etc.)
rsync -avz -e "ssh -p ${{ steps.ids.outputs.SSH_PORT }} -i ${{ steps.keypair.outputs.KEY_PATH }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ServerAliveInterval=60 -o ServerAliveCountMax=10" \
${USER}@${PUBLIC_IP}:~/.cache/ppf-cts/ci/ ./ci/ || echo "No ppf-cts CI files found"
# Also copy logs from /tmp/ci
rsync -avz -e "ssh -p ${{ steps.ids.outputs.SSH_PORT }} -i ${{ steps.keypair.outputs.KEY_PATH }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ServerAliveInterval=60 -o ServerAliveCountMax=10" \
${USER}@${PUBLIC_IP}:/tmp/ci/ ./ci/ || echo "No log files found"
echo "## Collected Files:"
ls -laR ci/ | head -100
echo "## Run Summary:"
for i in {1..10}; do
if [ -f ci/run_${i}.log ]; then
echo "Run ${i}: Completed"
else
echo "Run ${i}: No log found"
fi
done
- name: Upload artifact
if: success() || failure()
uses: actions/upload-artifact@v4
with:
name: ci-${{ env.EXAMPLE }}
path: ci
retention-days: 3
- name: GPU information
if: success() || failure()
run: |
echo "Getting GPU information..."
ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
-o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
-i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
"nvidia-smi" || echo "Failed to get GPU info"
- name: Re-authenticate for cleanup
if: always()
continue-on-error: true
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
aws-region: ${{ env.AWS_REGION }}
- name: Cleanup - Terminate Instance
if: always()
continue-on-error: true
run: |
if [ -n "${{ steps.instance.outputs.INSTANCE_ID }}" ]; then
echo "Initiating instance termination: ${{ steps.instance.outputs.INSTANCE_ID }}"
aws ec2 terminate-instances \
--instance-ids "${{ steps.instance.outputs.INSTANCE_ID }}" \
--region "$AWS_REGION" || true
echo "Termination initiated. Instance will terminate in the background."
else
echo "No instance to terminate"
fi
- name: Cleanup - Remove Ingress Rules
if: always()
continue-on-error: true
run: |
if [ -n "${{ steps.security-group.outputs.SG_ID }}" ] && [ -n "${{ steps.security-group.outputs.RUNNER_IP_CIDR }}" ]; then
echo "Removing ingress rules from security group ${{ steps.security-group.outputs.SG_ID }}"
# Remove custom port rule
if [ -n "${{ steps.security-group.outputs.SSH_PORT }}" ]; then
echo "Removing port ${{ steps.security-group.outputs.SSH_PORT }} rule..."
aws ec2 revoke-security-group-ingress \
--group-id "${{ steps.security-group.outputs.SG_ID }}" \
--ip-permissions \
"IpProtocol=tcp,FromPort=${{ steps.security-group.outputs.SSH_PORT }},ToPort=${{ steps.security-group.outputs.SSH_PORT }},IpRanges=[{CidrIp=${{ steps.security-group.outputs.RUNNER_IP_CIDR }}}]" \
--region "$AWS_REGION" 2>&1 || echo "Note: Custom port rule may have already been removed"
fi
echo "Ingress rule removed successfully"
echo "Security group ${{ steps.security-group.outputs.SG_ID }} remains for future use"
else
echo "No ingress rules to remove"
fi
- name: Cleanup - Remove Local SSH Key
if: always()
continue-on-error: true
run: |
if [ -n "${{ steps.keypair.outputs.KEY_PATH }}" ] && [ -f "${{ steps.keypair.outputs.KEY_PATH }}" ]; then
rm -f "${{ steps.keypair.outputs.KEY_PATH }}"
echo "Local SSH key file removed"
fi
- name: Summary
if: always()
run: |
echo "## Workflow Summary"
echo "- Region: $AWS_REGION"
echo "- Instance Type: $INSTANCE_TYPE"
echo "- Branch: $BRANCH"
echo "- Example: $EXAMPLE"
echo "- Instance ID: ${{ steps.instance.outputs.INSTANCE_ID || 'Not launched' }}"
echo "- Run Status: ${{ steps.run_example.outcome || 'Not run' }}"