yarn.ipynb #20
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# File: yarn.yml
# Code: Claude Code and Codex
# Review: Ryoichi Ando ([email protected])
# License: Apache v2.0
name: yarn.ipynb
# Manually-triggered workflow: launches a temporary EC2 GPU instance,
# runs the "yarn" notebook example on it ten times, then tears it down.
on:
  workflow_dispatch:
    inputs:
      instance_type:
        description: 'EC2 instance type'
        required: true
        default: 'g6e.2xlarge'
        type: choice
        options:
          - g6.2xlarge
          - g6e.2xlarge
      region:
        description: 'AWS Region'
        required: true
        default: 'us-east-2'
        type: choice
        options:
          - us-east-1
          - us-east-2
          - ap-northeast-1
jobs:
  run-on-gpu:
    name: Run on GPU Instance
    runs-on: ubuntu-latest
    permissions:
      # id-token: write is required for OIDC federation with AWS below.
      id-token: write
      contents: read
    env:
      AWS_REGION: ${{ github.event.inputs.region }}
      INSTANCE_TYPE: ${{ github.event.inputs.instance_type }}
      BRANCH: ${{ github.ref_name }}
      # Name of the notebook under examples/ to run.
      EXAMPLE: yarn
      # Home directory and login user on the Ubuntu AMI; used by most
      # ssh/scp steps below.
      WORKDIR: /home/ubuntu
      USER: ubuntu
    steps:
      # Echo the dispatch inputs so they appear at the top of the job log.
      - name: Show input parameters
        run: |
          echo "## Input Parameters"
          echo "Branch: ${{ github.ref_name }}"
          echo "Instance Type: ${{ github.event.inputs.instance_type }}"
          echo "Region: ${{ github.event.inputs.region }}"
      # Check out the repository on the runner (needed later for `git archive`).
      - name: Checkout repository
        uses: actions/checkout@v4
      # Authenticate to AWS via OIDC federation — no long-lived access keys.
      - name: Configure AWS credentials via OIDC
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
          aws-region: ${{ env.AWS_REGION }}
      # Fail fast if the assumed role is not usable.
      - name: Verify AWS authentication
        run: |
          echo "Testing AWS authentication..."
          aws sts get-caller-identity
          echo "AWS Region: $AWS_REGION"
          echo "Instance Type: $INSTANCE_TYPE"
          echo "Branch: $BRANCH"
          echo "Example: $EXAMPLE"
      # Discover the runner's public IP so the security-group ingress rule
      # can be scoped to exactly this machine. The IP is masked in logs.
      - name: Get GitHub Actions runner public IP
        id: runner-ip
        run: |
          echo "Fetching GitHub Actions runner public IP..."
          RUNNER_IP=$(curl -s --max-time 10 https://checkip.amazonaws.com | tr -d '\n')
          if [ -z "$RUNNER_IP" ]; then
            echo "ERROR: Failed to get IP from checkip.amazonaws.com"
            exit 1
          fi
          echo "::add-mask::$RUNNER_IP"
          echo "RUNNER_IP=$RUNNER_IP" >> $GITHUB_OUTPUT
          echo "GitHub Actions Runner IP: $RUNNER_IP"
      # Resolve the newest Ubuntu 24.04 Deep Learning base AMI (NVIDIA
      # drivers pre-installed) available in the selected region.
      - name: Find Deep Learning AMI
        id: ami
        run: |
          echo "Finding latest Deep Learning AMI with GPU support..."
          # sort_by CreationDate then take the last element = newest image.
          AMI_ID=$(aws ec2 describe-images \
            --owners amazon \
            --filters \
              "Name=name,Values=Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 24.04)*" \
              "Name=state,Values=available" \
              "Name=architecture,Values=x86_64" \
            --query 'sort_by(Images, &CreationDate)[-1].ImageId' \
            --region "$AWS_REGION" \
            --output text)
          if [ "$AMI_ID" = "None" ] || [ -z "$AMI_ID" ]; then
            echo "ERROR: Deep Learning AMI not found in region $AWS_REGION"
            echo "This workflow requires the Deep Learning AMI with pre-installed NVIDIA drivers"
            echo "Please check if the AMI is available in your selected region"
            exit 1
          fi
          echo "AMI_ID=$AMI_ID" >> $GITHUB_OUTPUT
          echo "Found AMI: $AMI_ID"
      # The temporary instance is launched into the account's default VPC.
      - name: Get default VPC ID
        id: vpc
        run: |
          echo "Getting default VPC ID..."
          VPC_ID=$(aws ec2 describe-vpcs \
            --filters "Name=isDefault,Values=true" \
            --query 'Vpcs[0].VpcId' \
            --region "$AWS_REGION" \
            --output text)
          if [ "$VPC_ID" = "None" ] || [ -z "$VPC_ID" ]; then
            echo "ERROR: Default VPC not found in region $AWS_REGION"
            exit 1
          fi
          echo "VPC_ID=$VPC_ID" >> $GITHUB_OUTPUT
          echo "Default VPC: $VPC_ID"
| - name: Generate unique identifiers | |
| id: ids | |
| run: | | |
| TIMESTAMP=$(date +%Y%m%d%H%M%S) | |
| RANDOM_SUFFIX=$(head /dev/urandom | tr -dc a-z0-9 | head -c 6) | |
| TEMP_INSTANCE_ID="temp-${TIMESTAMP}-${RANDOM_SUFFIX}" | |
| # Generate random SSH port (10001-65535) | |
| SSH_PORT=$((10001 + RANDOM % 55535)) | |
| echo "::add-mask::$SSH_PORT" | |
| echo "TIMESTAMP=$TIMESTAMP" >> $GITHUB_OUTPUT | |
| echo "TEMP_INSTANCE_ID=$TEMP_INSTANCE_ID" >> $GITHUB_OUTPUT | |
| echo "SSH_PORT=$SSH_PORT" >> $GITHUB_OUTPUT | |
| echo "Temporary Instance ID: $TEMP_INSTANCE_ID" | |
| echo "SSH Port: $SSH_PORT" | |
      # Reuse (or create once) a long-lived security group; only a per-run
      # ingress rule for the runner IP / random port is added here, and it
      # is revoked again by the cleanup step at the end of the job.
      - name: Setup persistent security group
        id: security-group
        run: |
          echo "Setting up persistent security group 'github-actions-persistent'..."
          SG_NAME="github-actions-persistent"
          SG_DESCRIPTION="Persistent security group for GitHub Actions with dynamic rules"
          # Check if security group already exists
          SG_ID=$(aws ec2 describe-security-groups \
            --filters "Name=group-name,Values=$SG_NAME" \
            --query 'SecurityGroups[0].GroupId' \
            --region "$AWS_REGION" \
            --output text || echo "")
          if [ "$SG_ID" = "None" ] || [ -z "$SG_ID" ]; then
            echo "Security group does not exist. Creating new one..."
            # Create security group
            SG_ID=$(aws ec2 create-security-group \
              --group-name "$SG_NAME" \
              --description "$SG_DESCRIPTION" \
              --vpc-id "${{ steps.vpc.outputs.VPC_ID }}" \
              --query 'GroupId' \
              --region "$AWS_REGION" \
              --output text)
            echo "Security Group created: $SG_ID"
            # Tag the security group
            aws ec2 create-tags \
              --resources "$SG_ID" \
              --tags \
                "Key=Name,Value=$SG_NAME" \
                "Key=ManagedBy,Value=GitHubActions" \
                "Key=Purpose,Value=PersistentDynamicRules" \
                "Key=CreatedAt,Value=$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
              --region "$AWS_REGION"
            echo "Security Group tagged successfully"
          else
            echo "Using existing security group: $SG_ID"
          fi
          echo "SG_ID=$SG_ID" >> $GITHUB_OUTPUT
          # Add only custom SSH port (no port 22)
          echo "Adding ingress rule for runner IP on port ${{ steps.ids.outputs.SSH_PORT }}"
          aws ec2 authorize-security-group-ingress \
            --group-id "$SG_ID" \
            --ip-permissions \
              "IpProtocol=tcp,FromPort=${{ steps.ids.outputs.SSH_PORT }},ToPort=${{ steps.ids.outputs.SSH_PORT }},IpRanges=[{CidrIp=${{ steps.runner-ip.outputs.RUNNER_IP }}/32,Description='GHA Run ${{ github.run_id }} Port ${{ steps.ids.outputs.SSH_PORT }}'}]" \
            --region "$AWS_REGION" 2>&1 || echo "Note: Rule may already exist"
          # Re-export the CIDR and port so the cleanup step can revoke the
          # exact rule without depending on the earlier steps' outputs.
          echo "RUNNER_IP_CIDR=${{ steps.runner-ip.outputs.RUNNER_IP }}/32" >> $GITHUB_OUTPUT
          echo "SSH_PORT=${{ steps.ids.outputs.SSH_PORT }}" >> $GITHUB_OUTPUT
          echo "SSH ingress rule added successfully (custom port only)"
          RULE_COUNT=$(aws ec2 describe-security-groups \
            --group-ids "$SG_ID" \
            --query 'length(SecurityGroups[0].IpPermissions)' \
            --region "$AWS_REGION" \
            --output text)
          echo "Security group has $RULE_COUNT active ingress rule(s)"
| - name: Retrieve SSH key from Parameter Store | |
| id: keypair | |
| run: | | |
| echo "Retrieving SSH private key from AWS Systems Manager..." | |
| KEY_NAME="${{ secrets.AWS_KEY_PAIR_NAME }}" | |
| # Retrieve the SSH private key from Parameter Store | |
| aws ssm get-parameter \ | |
| --name "/github-actions/ec2/ssh-key" \ | |
| --with-decryption \ | |
| --query 'Parameter.Value' \ | |
| --region "$AWS_REGION" \ | |
| --output text > /tmp/github-actions-ec2.pem | |
| chmod 600 /tmp/github-actions-ec2.pem | |
| echo "SSH key retrieved successfully" | |
| echo "KEY_PATH=/tmp/github-actions-ec2.pem" >> $GITHUB_OUTPUT | |
| - name: Create user data script | |
| run: | | |
| echo '#!/bin/bash' > /tmp/user-data.sh | |
| echo 'set -x' >> /tmp/user-data.sh | |
| echo 'exec > >(tee /var/log/user-data.log) 2>&1' >> /tmp/user-data.sh | |
| echo '' >> /tmp/user-data.sh | |
| echo 'echo "=== User Data Script Started ==="' >> /tmp/user-data.sh | |
| echo '' >> /tmp/user-data.sh | |
| echo '# Wait for system to be ready' >> /tmp/user-data.sh | |
| echo 'sleep 5' >> /tmp/user-data.sh | |
| echo '' >> /tmp/user-data.sh | |
| echo '# Create SSH privilege separation directory' >> /tmp/user-data.sh | |
| echo 'echo "Creating /run/sshd directory"' >> /tmp/user-data.sh | |
| echo 'mkdir -p /run/sshd' >> /tmp/user-data.sh | |
| echo 'chmod 0755 /run/sshd' >> /tmp/user-data.sh | |
| echo '' >> /tmp/user-data.sh | |
| echo '# Configure custom SSH port' >> /tmp/user-data.sh | |
| echo 'echo "Configuring SSH port to '"${{ steps.ids.outputs.SSH_PORT }}"'"' >> /tmp/user-data.sh | |
| echo 'perl -pi -e "s/^#?Port 22$/Port '"${{ steps.ids.outputs.SSH_PORT }}"'/" /etc/ssh/sshd_config' >> /tmp/user-data.sh | |
| echo '' >> /tmp/user-data.sh | |
| echo '# Ensure Port directive exists' >> /tmp/user-data.sh | |
| echo 'if ! grep -q "^Port '"${{ steps.ids.outputs.SSH_PORT }}"'" /etc/ssh/sshd_config; then' >> /tmp/user-data.sh | |
| echo ' echo "Port '"${{ steps.ids.outputs.SSH_PORT }}"'" >> /etc/ssh/sshd_config' >> /tmp/user-data.sh | |
| echo 'fi' >> /tmp/user-data.sh | |
| echo '' >> /tmp/user-data.sh | |
| echo 'echo "SSH config after modification:"' >> /tmp/user-data.sh | |
| echo 'grep "^Port" /etc/ssh/sshd_config' >> /tmp/user-data.sh | |
| echo '' >> /tmp/user-data.sh | |
| echo '# Disable systemd socket activation' >> /tmp/user-data.sh | |
| echo 'echo "Disabling socket activation"' >> /tmp/user-data.sh | |
| echo 'systemctl stop ssh.socket' >> /tmp/user-data.sh | |
| echo 'systemctl disable ssh.socket' >> /tmp/user-data.sh | |
| echo '' >> /tmp/user-data.sh | |
| echo '# Test SSH configuration' >> /tmp/user-data.sh | |
| echo 'echo "Testing SSH configuration"' >> /tmp/user-data.sh | |
| echo 'sshd -t' >> /tmp/user-data.sh | |
| echo 'if [ $? -eq 0 ]; then' >> /tmp/user-data.sh | |
| echo ' echo "SSH config valid, restarting SSH service"' >> /tmp/user-data.sh | |
| echo ' systemctl restart ssh.service' >> /tmp/user-data.sh | |
| echo ' sleep 2' >> /tmp/user-data.sh | |
| echo ' systemctl status ssh.service' >> /tmp/user-data.sh | |
| echo ' echo "Checking listening ports:"' >> /tmp/user-data.sh | |
| echo ' ss -tlnp | grep sshd || netstat -tlnp | grep sshd' >> /tmp/user-data.sh | |
| echo ' echo "SSH reconfiguration successful"' >> /tmp/user-data.sh | |
| echo 'else' >> /tmp/user-data.sh | |
| echo ' echo "ERROR: SSH config invalid"' >> /tmp/user-data.sh | |
| echo ' exit 1' >> /tmp/user-data.sh | |
| echo 'fi' >> /tmp/user-data.sh | |
| echo '' >> /tmp/user-data.sh | |
| echo '# Install Rust (needed for cargo build)' >> /tmp/user-data.sh | |
| echo 'curl --proto '"'"'=https'"'"' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y' >> /tmp/user-data.sh | |
| echo 'source "$HOME/.cargo/env"' >> /tmp/user-data.sh | |
| echo '' >> /tmp/user-data.sh | |
| echo '# Verify nvidia-smi is available' >> /tmp/user-data.sh | |
| echo 'if command -v nvidia-smi &> /dev/null; then' >> /tmp/user-data.sh | |
| echo ' echo "NVIDIA drivers confirmed"' >> /tmp/user-data.sh | |
| echo ' nvidia-smi' >> /tmp/user-data.sh | |
| echo 'else' >> /tmp/user-data.sh | |
| echo ' echo "Warning: nvidia-smi not found"' >> /tmp/user-data.sh | |
| echo 'fi' >> /tmp/user-data.sh | |
| echo '' >> /tmp/user-data.sh | |
| echo '# Create workspace directory' >> /tmp/user-data.sh | |
| echo 'mkdir -p ${WORKDIR}/workspace' >> /tmp/user-data.sh | |
| echo 'chown -R ${USER}:${USER} ${WORKDIR}/workspace' >> /tmp/user-data.sh | |
| echo '' >> /tmp/user-data.sh | |
| echo 'nvidia-smi | tee /tmp/nvidia-smi-output.txt' >> /tmp/user-data.sh | |
| echo 'touch /tmp/setup-complete' >> /tmp/user-data.sh | |
| echo 'echo "=== User Data Script Complete ==="' >> /tmp/user-data.sh | |
      # Launch the GPU instance: base64-encoded user-data, a 256 GB gp3 root
      # volume, and run-scoped tags on both instance and volume so stray
      # resources can be traced back to a workflow run.
      - name: Launch EC2 instance
        id: instance
        run: |
          echo "Launching EC2 instance with SSH configured on port ${{ steps.ids.outputs.SSH_PORT }}..."
          # Base64 encode for AWS
          USER_DATA=$(base64 -w 0 /tmp/user-data.sh)
          INSTANCE_ID=$(aws ec2 run-instances \
            --image-id "${{ steps.ami.outputs.AMI_ID }}" \
            --instance-type "$INSTANCE_TYPE" \
            --key-name "${{ secrets.AWS_KEY_PAIR_NAME }}" \
            --security-group-ids "${{ steps.security-group.outputs.SG_ID }}" \
            --user-data "$USER_DATA" \
            --block-device-mappings "DeviceName=/dev/sda1,Ebs={VolumeSize=256,VolumeType=gp3,DeleteOnTermination=true}" \
            --tag-specifications \
              "ResourceType=instance,Tags=[\
              {Key=Name,Value=gpu-runner-${{ steps.ids.outputs.TIMESTAMP }}},\
              {Key=ManagedBy,Value=GitHubActions},\
              {Key=Purpose,Value=GPURunner},\
              {Key=Workflow,Value=${{ github.workflow }}},\
              {Key=RunId,Value=${{ github.run_id }}},\
              {Key=Branch,Value=${{ env.BRANCH }}},\
              {Key=Example,Value=${{ env.EXAMPLE }}}\
              ]" \
              "ResourceType=volume,Tags=[\
              {Key=Name,Value=gpu-runner-${{ steps.ids.outputs.TIMESTAMP }}-volume},\
              {Key=ManagedBy,Value=GitHubActions},\
              {Key=Purpose,Value=GPURunner},\
              {Key=Workflow,Value=${{ github.workflow }}},\
              {Key=Example,Value=${{ env.EXAMPLE }}}\
              ]" \
            --instance-initiated-shutdown-behavior terminate \
            --query 'Instances[0].InstanceId' \
            --region "$AWS_REGION" \
            --output text)
          echo "INSTANCE_ID=$INSTANCE_ID" >> $GITHUB_OUTPUT
          echo "Instance launched: $INSTANCE_ID"
      - name: Wait for instance to be running
        run: |
          echo "Waiting for instance to be running..."
          aws ec2 wait instance-running \
            --instance-ids "${{ steps.instance.outputs.INSTANCE_ID }}" \
            --region "$AWS_REGION"
          PUBLIC_IP=$(aws ec2 describe-instances \
            --instance-ids "${{ steps.instance.outputs.INSTANCE_ID }}" \
            --query 'Reservations[0].Instances[0].PublicIpAddress' \
            --region "$AWS_REGION" \
            --output text)
          echo "::add-mask::$PUBLIC_IP"
          # Exported via GITHUB_ENV so every later step can read $PUBLIC_IP.
          echo "PUBLIC_IP=$PUBLIC_IP" >> $GITHUB_ENV
          echo "Instance is running at: $PUBLIC_IP"
      # Poll SSH on the randomized port (60s head start + up to 40 x 10s);
      # on total failure, dump the EC2 console output for debugging.
      - name: Wait for cloud-init and SSH on custom port
        run: |
          echo "Waiting for cloud-init to complete and SSH to be available on port ${{ steps.ids.outputs.SSH_PORT }}..."
          # Wait longer initially to allow cloud-init to run
          echo "Waiting 60 seconds for cloud-init to start..."
          sleep 60
          MAX_ATTEMPTS=40
          ATTEMPT=0
          while [ $ATTEMPT -lt $MAX_ATTEMPTS ]; do
            if ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
              -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} "echo 'SSH ready on custom port'" 2>/dev/null; then
              echo "SSH connection established on port ${{ steps.ids.outputs.SSH_PORT }}"
              break
            else
              ATTEMPT=$((ATTEMPT + 1))
              if [ $ATTEMPT -eq $MAX_ATTEMPTS ]; then
                echo "Failed to establish SSH connection on port ${{ steps.ids.outputs.SSH_PORT }} after $MAX_ATTEMPTS attempts"
                echo "Attempting to fetch console output for debugging..."
                aws ec2 get-console-output \
                  --instance-id "${{ steps.instance.outputs.INSTANCE_ID }}" \
                  --region "$AWS_REGION" \
                  --output text || echo "Could not fetch console output"
                exit 1
              fi
              echo "Attempt $ATTEMPT/$MAX_ATTEMPTS failed, retrying in 10 seconds..."
              sleep 10
            fi
          done
      # Poll for the /tmp/setup-complete marker written at the end of the
      # user-data script (up to 5 minutes). Deliberately non-fatal: on
      # timeout the workflow continues and later steps surface any problem.
      - name: Wait for instance setup
        run: |
          echo "Waiting for instance setup to complete..."
          MAX_WAIT=300
          ELAPSED=0
          while [ $ELAPSED -lt $MAX_WAIT ]; do
            if ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
              -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
              "test -f /tmp/setup-complete" 2>/dev/null; then
              echo "Instance setup completed"
              break
            else
              sleep 10
              ELAPSED=$((ELAPSED + 10))
              if [ $ELAPSED -ge $MAX_WAIT ]; then
                echo "Setup timeout, continuing anyway..."
                break
              fi
            fi
          done
      # Ship the checked-out tree (committed files only — git archive of
      # HEAD) to the instance and unpack it into ${WORKDIR}.
      - name: Create archive of repository
        run: |
          echo "Creating repository archive..."
          git archive --format=tar.gz --output=/tmp/repo.tar.gz HEAD
      - name: Transfer repository to instance
        run: |
          echo "Transferring repository to instance..."
          scp -P ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" \
            /tmp/repo.tar.gz ${USER}@${PUBLIC_IP}:${WORKDIR}/
          echo "Extracting repository on instance..."
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
            "cd ${WORKDIR} && tar -xzf repo.tar.gz && rm repo.tar.gz"
      # Run the repository's warmup.py on the instance. NOTE(review): the
      # quoted 'ENDSSH' heredoc means ${WORKDIR} is expanded remotely from
      # the instance's login environment — presumably /home/ubuntu; confirm
      # the remote shell defines it (otherwise `cd` targets $HOME anyway).
      - name: Setup Python environment and run warmup
        run: |
          echo "Setting up Python environment and running warmup.py..."
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} << 'ENDSSH'
          set -e
          cd ${WORKDIR}
          # Run warmup.py
          echo "Running warmup.py..."
          python3 warmup.py --skip-confirmation
          echo "Warmup completed"
          ENDSSH
      # NOTE(review): this sources ~/.cargo/env as the ubuntu login user;
      # that file only exists if rustup was installed for this user — the
      # user-data script runs as root, so verify cargo is actually available
      # for ubuntu on this AMI.
      - name: Build Rust project
        run: |
          echo "Building Rust project with cargo..."
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} << 'ENDSSH'
          set -e
          cd ${WORKDIR}
          # Setup Rust environment
          source "$HOME/.cargo/env"
          # Build the project
          echo "Running cargo build --release..."
          cargo build --release
          echo "Cargo build completed"
          ENDSSH
| - name: Convert assertion notebook to Python script | |
| run: | | |
| echo "Converting assertion notebook: examples/fail-examples/assertion.ipynb" | |
| cat > /tmp/convert_assertion.sh << 'SCRIPTEOF' | |
| #!/bin/bash | |
| set -e | |
| cd $WORKDIR | |
| source ~/.local/share/ppf-cts/venv/bin/activate | |
| jupyter nbconvert --to python "examples/fail-examples/assertion.ipynb" --output "/tmp/assertion_base.py" | |
| cat > /tmp/assertion.py << 'PYEOF' | |
| import sys | |
| import os | |
| sys.path.insert(0, '$WORKDIR') | |
| sys.path.insert(0, '$WORKDIR/frontend') | |
| os.environ['PYTHONPATH'] = '$WORKDIR:$WORKDIR/frontend:' + os.environ.get('PYTHONPATH', '') | |
| PYEOF | |
| cat "/tmp/assertion_base.py" >> /tmp/assertion.py | |
| echo "Assertion script prepared at /tmp/assertion.py" | |
| SCRIPTEOF | |
| scp -P ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ | |
| -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \ | |
| -i "${{ steps.keypair.outputs.KEY_PATH }}" \ | |
| /tmp/convert_assertion.sh ${USER}@${PUBLIC_IP}:/tmp/ | |
| ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ | |
| -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \ | |
| -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \ | |
| "chmod +x /tmp/convert_assertion.sh && WORKDIR='${WORKDIR}' /tmp/convert_assertion.sh" | |
| - name: Run assertion test (expect failure) | |
| run: | | |
| echo "Running assertion test to verify error propagation via SSH..." | |
| echo "This test uses the same execution pattern as main examples" | |
| echo "Expected result: FAILURE (AssertionError)" | |
| # Run using the exact same pattern as the main example iterations | |
| if ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ | |
| -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \ | |
| -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \ | |
| "set -o pipefail && echo 'assertion' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/assertion.py 2>&1 | tee /tmp/ci/assertion.log"; then | |
| echo "ERROR: Assertion test should have failed but succeeded" | |
| echo "This means errors are NOT being propagated correctly!" | |
| exit 1 | |
| else | |
| echo "SUCCESS: Assertion test failed as expected" | |
| echo "Error propagation via SSH is working correctly" | |
| echo "Main example tests can now proceed with confidence" | |
| fi | |
      # Convert examples/${EXAMPLE}.ipynb into a runnable /tmp/${EXAMPLE}.py.
      # Expansion timing: the OUTER heredoc (SCRIPT) is unquoted, so ${WORKDIR}
      # expands on the runner as the helper script is written, while the
      # backslash-escaped \${EXAMPLE} survives to expand on the instance
      # (passed via the env prefix on the final ssh command).
      - name: Convert notebook to Python script
        run: |
          echo "Converting notebook example: ${EXAMPLE}.ipynb to Python script"
          cat > /tmp/convert_notebook.sh << SCRIPT
          #!/bin/bash
          set -e
          cd ${WORKDIR}
          # Create CI directory structure for collecting results
          mkdir -p /tmp/ci
          echo "CI directory created for collecting results"
          # Activate Python environment
          source ~/.local/share/ppf-cts/venv/bin/activate
          # Convert notebook to Python script
          echo "Converting notebook examples/\${EXAMPLE}.ipynb to Python script..."
          jupyter nbconvert --to python "examples/\${EXAMPLE}.ipynb" --output "/tmp/\${EXAMPLE}_base.py"
          # Create the runnable script with proper imports
          cat > /tmp/\${EXAMPLE}.py << 'EOF'
          import sys
          import os
          # Add the repository root to Python path so frontend can be imported
          sys.path.insert(0, '${WORKDIR}')
          sys.path.insert(0, '${WORKDIR}/frontend')
          # Set environment variables if needed
          os.environ['PYTHONPATH'] = '${WORKDIR}:${WORKDIR}/frontend:' + os.environ.get('PYTHONPATH', '')
          # Now run the converted notebook
          EOF
          # Append the converted notebook content
          cat "/tmp/\${EXAMPLE}_base.py" >> /tmp/\${EXAMPLE}.py
          echo "Script prepared at /tmp/\${EXAMPLE}.py"
          SCRIPT
          scp -P ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" \
            /tmp/convert_notebook.sh ${USER}@${PUBLIC_IP}:/tmp/
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
            "chmod +x /tmp/convert_notebook.sh && EXAMPLE='${EXAMPLE}' WORKDIR='${WORKDIR}' /tmp/convert_notebook.sh"
      # The ten iteration steps below are intentionally identical except for
      # the ordinal written to frontend/.CI and the log file name, so each
      # run appears as its own step (own status/log) in the Actions UI.
      # pipefail makes a remote python failure fail the step despite `tee`.
      - name: Run 1st iteration
        run: |
          echo "Running 1st iteration of ${EXAMPLE}"
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
            "set -o pipefail && echo '1st' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/${EXAMPLE}.py 2>&1 | tee /tmp/ci/run_1.log"
      - name: Run 2nd iteration
        run: |
          echo "Running 2nd iteration of ${EXAMPLE}"
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
            "set -o pipefail && echo '2nd' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/${EXAMPLE}.py 2>&1 | tee /tmp/ci/run_2.log"
      - name: Run 3rd iteration
        run: |
          echo "Running 3rd iteration of ${EXAMPLE}"
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
            "set -o pipefail && echo '3rd' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/${EXAMPLE}.py 2>&1 | tee /tmp/ci/run_3.log"
      - name: Run 4th iteration
        run: |
          echo "Running 4th iteration of ${EXAMPLE}"
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
            "set -o pipefail && echo '4th' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/${EXAMPLE}.py 2>&1 | tee /tmp/ci/run_4.log"
      - name: Run 5th iteration
        run: |
          echo "Running 5th iteration of ${EXAMPLE}"
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
            "set -o pipefail && echo '5th' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/${EXAMPLE}.py 2>&1 | tee /tmp/ci/run_5.log"
      - name: Run 6th iteration
        run: |
          echo "Running 6th iteration of ${EXAMPLE}"
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
            "set -o pipefail && echo '6th' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/${EXAMPLE}.py 2>&1 | tee /tmp/ci/run_6.log"
      - name: Run 7th iteration
        run: |
          echo "Running 7th iteration of ${EXAMPLE}"
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
            "set -o pipefail && echo '7th' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/${EXAMPLE}.py 2>&1 | tee /tmp/ci/run_7.log"
      - name: Run 8th iteration
        run: |
          echo "Running 8th iteration of ${EXAMPLE}"
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
            "set -o pipefail && echo '8th' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/${EXAMPLE}.py 2>&1 | tee /tmp/ci/run_8.log"
      - name: Run 9th iteration
        run: |
          echo "Running 9th iteration of ${EXAMPLE}"
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
            "set -o pipefail && echo '9th' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/${EXAMPLE}.py 2>&1 | tee /tmp/ci/run_9.log"
      - name: Run 10th iteration
        run: |
          echo "Running 10th iteration of ${EXAMPLE}"
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
            "set -o pipefail && echo '10th' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/${EXAMPLE}.py 2>&1 | tee /tmp/ci/run_10.log"
      # Pull all outputs back even if a run failed (success() || failure())
      # so partial results are still uploaded as an artifact.
      - name: Collect results
        if: success() || failure()
        run: |
          echo "Collecting results from all runs..."
          mkdir -p ci
          # Delete large binary files on remote before copying to save bandwidth
          # CI output is in ppf-cts cache directory: ~/.cache/ppf-cts/ci
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
            "find ~/.cache/ppf-cts/ci -type f \( -name '*.bin' -o -name '*.pickle' -o -name '*.ply' -o -name '*.gz' \) -delete 2>/dev/null" || true
          # Copy CI output from ppf-cts cache directory (session data, previews, etc.)
          rsync -avz -e "ssh -p ${{ steps.ids.outputs.SSH_PORT }} -i ${{ steps.keypair.outputs.KEY_PATH }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ServerAliveInterval=60 -o ServerAliveCountMax=10" \
            ${USER}@${PUBLIC_IP}:~/.cache/ppf-cts/ci/ ./ci/ || echo "No ppf-cts CI files found"
          # Also copy logs from /tmp/ci
          rsync -avz -e "ssh -p ${{ steps.ids.outputs.SSH_PORT }} -i ${{ steps.keypair.outputs.KEY_PATH }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ServerAliveInterval=60 -o ServerAliveCountMax=10" \
            ${USER}@${PUBLIC_IP}:/tmp/ci/ ./ci/ || echo "No log files found"
          echo "## Collected Files:"
          ls -laR ci/ | head -100
          echo "## Run Summary:"
          # A present run_N.log only shows the iteration step started tee-ing
          # output, not that the run passed — step status carries pass/fail.
          for i in {1..10}; do
            if [ -f ci/run_${i}.log ]; then
              echo "Run ${i}: Completed"
            else
              echo "Run ${i}: No log found"
            fi
          done
      - name: Upload artifact
        if: success() || failure()
        uses: actions/upload-artifact@v4
        with:
          name: ci-${{ env.EXAMPLE }}
          path: ci
          retention-days: 3
      # Best-effort GPU snapshot for the log; never fails the job.
      - name: GPU information
        if: success() || failure()
        run: |
          echo "Getting GPU information..."
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
            "nvidia-smi" || echo "Failed to get GPU info"
      # Cleanup block: always runs, each step tolerates partial failure so
      # one broken cleanup does not block the others.
      # Re-assume the role first — OIDC credentials may have expired during
      # a long run.
      - name: Re-authenticate for cleanup
        if: always()
        continue-on-error: true
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
          aws-region: ${{ env.AWS_REGION }}
      - name: Cleanup - Terminate Instance
        if: always()
        continue-on-error: true
        run: |
          if [ -n "${{ steps.instance.outputs.INSTANCE_ID }}" ]; then
            echo "Initiating instance termination: ${{ steps.instance.outputs.INSTANCE_ID }}"
            # Fire-and-forget: no wait, termination proceeds in background.
            aws ec2 terminate-instances \
              --instance-ids "${{ steps.instance.outputs.INSTANCE_ID }}" \
              --region "$AWS_REGION" || true
            echo "Termination initiated. Instance will terminate in the background."
          else
            echo "No instance to terminate"
          fi
      # Revoke only this run's ingress rule; the persistent security group
      # itself is kept for reuse by future runs.
      - name: Cleanup - Remove Ingress Rules
        if: always()
        continue-on-error: true
        run: |
          if [ -n "${{ steps.security-group.outputs.SG_ID }}" ] && [ -n "${{ steps.security-group.outputs.RUNNER_IP_CIDR }}" ]; then
            echo "Removing ingress rules from security group ${{ steps.security-group.outputs.SG_ID }}"
            # Remove custom port rule
            if [ -n "${{ steps.security-group.outputs.SSH_PORT }}" ]; then
              echo "Removing port ${{ steps.security-group.outputs.SSH_PORT }} rule..."
              aws ec2 revoke-security-group-ingress \
                --group-id "${{ steps.security-group.outputs.SG_ID }}" \
                --ip-permissions \
                  "IpProtocol=tcp,FromPort=${{ steps.security-group.outputs.SSH_PORT }},ToPort=${{ steps.security-group.outputs.SSH_PORT }},IpRanges=[{CidrIp=${{ steps.security-group.outputs.RUNNER_IP_CIDR }}}]" \
                --region "$AWS_REGION" 2>&1 || echo "Note: Custom port rule may have already been removed"
            fi
            echo "Ingress rule removed successfully"
            echo "Security group ${{ steps.security-group.outputs.SG_ID }} remains for future use"
          else
            echo "No ingress rules to remove"
          fi
      # Remove the decrypted private key from the runner's disk.
      - name: Cleanup - Remove Local SSH Key
        if: always()
        continue-on-error: true
        run: |
          if [ -n "${{ steps.keypair.outputs.KEY_PATH }}" ] && [ -f "${{ steps.keypair.outputs.KEY_PATH }}" ]; then
            rm -f "${{ steps.keypair.outputs.KEY_PATH }}"
            echo "Local SSH key file removed"
          fi
| - name: Summary | |
| if: always() | |
| run: | | |
| echo "## Workflow Summary" | |
| echo "- Region: $AWS_REGION" | |
| echo "- Instance Type: $INSTANCE_TYPE" | |
| echo "- Branch: $BRANCH" | |
| echo "- Example: $EXAMPLE" | |
| echo "- Instance ID: ${{ steps.instance.outputs.INSTANCE_ID || 'Not launched' }}" | |
| echo "- Run Status: ${{ steps.run_example.outcome || 'Not run' }}" |