yarn.ipynb #20
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# File: yarn.yml
# Code: Claude Code and Codex
# Review: Ryoichi Ando ([email protected])
# License: Apache v2.0
name: yarn.ipynb
# Manually-triggered workflow: launches a temporary EC2 GPU instance,
# runs the "yarn" notebook example on it ten times, then tears it down.
on:
  workflow_dispatch:
    inputs:
      instance_type:
        description: 'EC2 instance type'
        required: true
        default: 'g6e.2xlarge'
        type: choice
        options:
          - g6.2xlarge
          - g6e.2xlarge
      region:
        description: 'AWS Region'
        required: true
        default: 'us-east-2'
        type: choice
        options:
          - us-east-1
          - us-east-2
          - ap-northeast-1
jobs:
  run-on-gpu:
    name: Run on GPU Instance
    runs-on: ubuntu-latest
    permissions:
      # id-token: write is required for OIDC federation with AWS below.
      id-token: write
      contents: read
    env:
      AWS_REGION: ${{ github.event.inputs.region }}
      INSTANCE_TYPE: ${{ github.event.inputs.instance_type }}
      BRANCH: ${{ github.ref_name }}
      # Name of the notebook under examples/ to run.
      EXAMPLE: yarn
      # Home directory and login user on the Ubuntu AMI; used by most
      # ssh/scp steps below.
      WORKDIR: /home/ubuntu
      USER: ubuntu
    steps:
      # Echo the dispatch inputs so they appear at the top of the job log.
      - name: Show input parameters
        run: |
          echo "## Input Parameters"
          echo "Branch: ${{ github.ref_name }}"
          echo "Instance Type: ${{ github.event.inputs.instance_type }}"
          echo "Region: ${{ github.event.inputs.region }}"
      # Check out the repository on the runner (needed later for `git archive`).
      - name: Checkout repository
        uses: actions/checkout@v4
      # Authenticate to AWS via OIDC federation — no long-lived access keys.
      - name: Configure AWS credentials via OIDC
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
          aws-region: ${{ env.AWS_REGION }}
      # Fail fast if the assumed role is not usable.
      - name: Verify AWS authentication
        run: |
          echo "Testing AWS authentication..."
          aws sts get-caller-identity
          echo "AWS Region: $AWS_REGION"
          echo "Instance Type: $INSTANCE_TYPE"
          echo "Branch: $BRANCH"
          echo "Example: $EXAMPLE"
      # Discover the runner's public IP so the security-group ingress rule
      # can be scoped to exactly this machine. The IP is masked in logs.
      - name: Get GitHub Actions runner public IP
        id: runner-ip
        run: |
          echo "Fetching GitHub Actions runner public IP..."
          RUNNER_IP=$(curl -s --max-time 10 https://checkip.amazonaws.com | tr -d '\n')
          if [ -z "$RUNNER_IP" ]; then
            echo "ERROR: Failed to get IP from checkip.amazonaws.com"
            exit 1
          fi
          echo "::add-mask::$RUNNER_IP"
          echo "RUNNER_IP=$RUNNER_IP" >> $GITHUB_OUTPUT
          echo "GitHub Actions Runner IP: $RUNNER_IP"
      # Resolve the newest Ubuntu 24.04 Deep Learning base AMI (NVIDIA
      # drivers pre-installed) available in the selected region.
      - name: Find Deep Learning AMI
        id: ami
        run: |
          echo "Finding latest Deep Learning AMI with GPU support..."
          # sort_by CreationDate then take the last element = newest image.
          AMI_ID=$(aws ec2 describe-images \
            --owners amazon \
            --filters \
              "Name=name,Values=Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 24.04)*" \
              "Name=state,Values=available" \
              "Name=architecture,Values=x86_64" \
            --query 'sort_by(Images, &CreationDate)[-1].ImageId' \
            --region "$AWS_REGION" \
            --output text)
          if [ "$AMI_ID" = "None" ] || [ -z "$AMI_ID" ]; then
            echo "ERROR: Deep Learning AMI not found in region $AWS_REGION"
            echo "This workflow requires the Deep Learning AMI with pre-installed NVIDIA drivers"
            echo "Please check if the AMI is available in your selected region"
            exit 1
          fi
          echo "AMI_ID=$AMI_ID" >> $GITHUB_OUTPUT
          echo "Found AMI: $AMI_ID"
      # The temporary instance is launched into the account's default VPC.
      - name: Get default VPC ID
        id: vpc
        run: |
          echo "Getting default VPC ID..."
          VPC_ID=$(aws ec2 describe-vpcs \
            --filters "Name=isDefault,Values=true" \
            --query 'Vpcs[0].VpcId' \
            --region "$AWS_REGION" \
            --output text)
          if [ "$VPC_ID" = "None" ] || [ -z "$VPC_ID" ]; then
            echo "ERROR: Default VPC not found in region $AWS_REGION"
            exit 1
          fi
          echo "VPC_ID=$VPC_ID" >> $GITHUB_OUTPUT
          echo "Default VPC: $VPC_ID"
| - name: Generate unique identifiers | |
| id: ids | |
| run: | | |
| TIMESTAMP=$(date +%Y%m%d%H%M%S) | |
| RANDOM_SUFFIX=$(head /dev/urandom | tr -dc a-z0-9 | head -c 6) | |
| TEMP_INSTANCE_ID="temp-${TIMESTAMP}-${RANDOM_SUFFIX}" | |
| # Generate random SSH port (10001-65535) | |
| SSH_PORT=$((10001 + RANDOM % 55535)) | |
| echo "::add-mask::$SSH_PORT" | |
| echo "TIMESTAMP=$TIMESTAMP" >> $GITHUB_OUTPUT | |
| echo "TEMP_INSTANCE_ID=$TEMP_INSTANCE_ID" >> $GITHUB_OUTPUT | |
| echo "SSH_PORT=$SSH_PORT" >> $GITHUB_OUTPUT | |
| echo "Temporary Instance ID: $TEMP_INSTANCE_ID" | |
| echo "SSH Port: $SSH_PORT" | |
      # Reuse (or create once) a long-lived security group; only a per-run
      # ingress rule for the runner IP / random port is added here, and it
      # is revoked again by the cleanup step at the end of the job.
      - name: Setup persistent security group
        id: security-group
        run: |
          echo "Setting up persistent security group 'github-actions-persistent'..."
          SG_NAME="github-actions-persistent"
          SG_DESCRIPTION="Persistent security group for GitHub Actions with dynamic rules"
          # Check if security group already exists
          SG_ID=$(aws ec2 describe-security-groups \
            --filters "Name=group-name,Values=$SG_NAME" \
            --query 'SecurityGroups[0].GroupId' \
            --region "$AWS_REGION" \
            --output text || echo "")
          if [ "$SG_ID" = "None" ] || [ -z "$SG_ID" ]; then
            echo "Security group does not exist. Creating new one..."
            # Create security group
            SG_ID=$(aws ec2 create-security-group \
              --group-name "$SG_NAME" \
              --description "$SG_DESCRIPTION" \
              --vpc-id "${{ steps.vpc.outputs.VPC_ID }}" \
              --query 'GroupId' \
              --region "$AWS_REGION" \
              --output text)
            echo "Security Group created: $SG_ID"
            # Tag the security group
            aws ec2 create-tags \
              --resources "$SG_ID" \
              --tags \
                "Key=Name,Value=$SG_NAME" \
                "Key=ManagedBy,Value=GitHubActions" \
                "Key=Purpose,Value=PersistentDynamicRules" \
                "Key=CreatedAt,Value=$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
              --region "$AWS_REGION"
            echo "Security Group tagged successfully"
          else
            echo "Using existing security group: $SG_ID"
          fi
          echo "SG_ID=$SG_ID" >> $GITHUB_OUTPUT
          # Add only custom SSH port (no port 22)
          echo "Adding ingress rule for runner IP on port ${{ steps.ids.outputs.SSH_PORT }}"
          aws ec2 authorize-security-group-ingress \
            --group-id "$SG_ID" \
            --ip-permissions \
              "IpProtocol=tcp,FromPort=${{ steps.ids.outputs.SSH_PORT }},ToPort=${{ steps.ids.outputs.SSH_PORT }},IpRanges=[{CidrIp=${{ steps.runner-ip.outputs.RUNNER_IP }}/32,Description='GHA Run ${{ github.run_id }} Port ${{ steps.ids.outputs.SSH_PORT }}'}]" \
            --region "$AWS_REGION" 2>&1 || echo "Note: Rule may already exist"
          # Re-export the CIDR and port so the cleanup step can revoke the
          # exact rule without depending on the earlier steps' outputs.
          echo "RUNNER_IP_CIDR=${{ steps.runner-ip.outputs.RUNNER_IP }}/32" >> $GITHUB_OUTPUT
          echo "SSH_PORT=${{ steps.ids.outputs.SSH_PORT }}" >> $GITHUB_OUTPUT
          echo "SSH ingress rule added successfully (custom port only)"
          RULE_COUNT=$(aws ec2 describe-security-groups \
            --group-ids "$SG_ID" \
            --query 'length(SecurityGroups[0].IpPermissions)' \
            --region "$AWS_REGION" \
            --output text)
          echo "Security group has $RULE_COUNT active ingress rule(s)"
| - name: Retrieve SSH key from Parameter Store | |
| id: keypair | |
| run: | | |
| echo "Retrieving SSH private key from AWS Systems Manager..." | |
| KEY_NAME="${{ secrets.AWS_KEY_PAIR_NAME }}" | |
| # Retrieve the SSH private key from Parameter Store | |
| aws ssm get-parameter \ | |
| --name "/github-actions/ec2/ssh-key" \ | |
| --with-decryption \ | |
| --query 'Parameter.Value' \ | |
| --region "$AWS_REGION" \ | |
| --output text > /tmp/github-actions-ec2.pem | |
| chmod 600 /tmp/github-actions-ec2.pem | |
| echo "SSH key retrieved successfully" | |
| echo "KEY_PATH=/tmp/github-actions-ec2.pem" >> $GITHUB_OUTPUT | |
| - name: Create user data script | |
| run: | | |
| echo '#!/bin/bash' > /tmp/user-data.sh | |
| echo 'set -x' >> /tmp/user-data.sh | |
| echo 'exec > >(tee /var/log/user-data.log) 2>&1' >> /tmp/user-data.sh | |
| echo '' >> /tmp/user-data.sh | |
| echo 'echo "=== User Data Script Started ==="' >> /tmp/user-data.sh | |
| echo '' >> /tmp/user-data.sh | |
| echo '# Wait for system to be ready' >> /tmp/user-data.sh | |
| echo 'sleep 5' >> /tmp/user-data.sh | |
| echo '' >> /tmp/user-data.sh | |
| echo '# Create SSH privilege separation directory' >> /tmp/user-data.sh | |
| echo 'echo "Creating /run/sshd directory"' >> /tmp/user-data.sh | |
| echo 'mkdir -p /run/sshd' >> /tmp/user-data.sh | |
| echo 'chmod 0755 /run/sshd' >> /tmp/user-data.sh | |
| echo '' >> /tmp/user-data.sh | |
| echo '# Configure custom SSH port' >> /tmp/user-data.sh | |
| echo 'echo "Configuring SSH port to '"${{ steps.ids.outputs.SSH_PORT }}"'"' >> /tmp/user-data.sh | |
| echo 'perl -pi -e "s/^#?Port 22$/Port '"${{ steps.ids.outputs.SSH_PORT }}"'/" /etc/ssh/sshd_config' >> /tmp/user-data.sh | |
| echo '' >> /tmp/user-data.sh | |
| echo '# Ensure Port directive exists' >> /tmp/user-data.sh | |
| echo 'if ! grep -q "^Port '"${{ steps.ids.outputs.SSH_PORT }}"'" /etc/ssh/sshd_config; then' >> /tmp/user-data.sh | |
| echo ' echo "Port '"${{ steps.ids.outputs.SSH_PORT }}"'" >> /etc/ssh/sshd_config' >> /tmp/user-data.sh | |
| echo 'fi' >> /tmp/user-data.sh | |
| echo '' >> /tmp/user-data.sh | |
| echo 'echo "SSH config after modification:"' >> /tmp/user-data.sh | |
| echo 'grep "^Port" /etc/ssh/sshd_config' >> /tmp/user-data.sh | |
| echo '' >> /tmp/user-data.sh | |
| echo '# Disable systemd socket activation' >> /tmp/user-data.sh | |
| echo 'echo "Disabling socket activation"' >> /tmp/user-data.sh | |
| echo 'systemctl stop ssh.socket' >> /tmp/user-data.sh | |
| echo 'systemctl disable ssh.socket' >> /tmp/user-data.sh | |
| echo '' >> /tmp/user-data.sh | |
| echo '# Test SSH configuration' >> /tmp/user-data.sh | |
| echo 'echo "Testing SSH configuration"' >> /tmp/user-data.sh | |
| echo 'sshd -t' >> /tmp/user-data.sh | |
| echo 'if [ $? -eq 0 ]; then' >> /tmp/user-data.sh | |
| echo ' echo "SSH config valid, restarting SSH service"' >> /tmp/user-data.sh | |
| echo ' systemctl restart ssh.service' >> /tmp/user-data.sh | |
| echo ' sleep 2' >> /tmp/user-data.sh | |
| echo ' systemctl status ssh.service' >> /tmp/user-data.sh | |
| echo ' echo "Checking listening ports:"' >> /tmp/user-data.sh | |
| echo ' ss -tlnp | grep sshd || netstat -tlnp | grep sshd' >> /tmp/user-data.sh | |
| echo ' echo "SSH reconfiguration successful"' >> /tmp/user-data.sh | |
| echo 'else' >> /tmp/user-data.sh | |
| echo ' echo "ERROR: SSH config invalid"' >> /tmp/user-data.sh | |
| echo ' exit 1' >> /tmp/user-data.sh | |
| echo 'fi' >> /tmp/user-data.sh | |
| echo '' >> /tmp/user-data.sh | |
| echo '# Install Rust (needed for cargo build)' >> /tmp/user-data.sh | |
| echo 'curl --proto '"'"'=https'"'"' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y' >> /tmp/user-data.sh | |
| echo 'source "$HOME/.cargo/env"' >> /tmp/user-data.sh | |
| echo '' >> /tmp/user-data.sh | |
| echo '# Verify nvidia-smi is available' >> /tmp/user-data.sh | |
| echo 'if command -v nvidia-smi &> /dev/null; then' >> /tmp/user-data.sh | |
| echo ' echo "NVIDIA drivers confirmed"' >> /tmp/user-data.sh | |
| echo ' nvidia-smi' >> /tmp/user-data.sh | |
| echo 'else' >> /tmp/user-data.sh | |
| echo ' echo "Warning: nvidia-smi not found"' >> /tmp/user-data.sh | |
| echo 'fi' >> /tmp/user-data.sh | |
| echo '' >> /tmp/user-data.sh | |
| echo '# Create workspace directory' >> /tmp/user-data.sh | |
| echo 'mkdir -p ${WORKDIR}/workspace' >> /tmp/user-data.sh | |
| echo 'chown -R ${USER}:${USER} ${WORKDIR}/workspace' >> /tmp/user-data.sh | |
| echo '' >> /tmp/user-data.sh | |
| echo 'nvidia-smi | tee /tmp/nvidia-smi-output.txt' >> /tmp/user-data.sh | |
| echo 'touch /tmp/setup-complete' >> /tmp/user-data.sh | |
| echo 'echo "=== User Data Script Complete ==="' >> /tmp/user-data.sh | |
      # Launch the GPU instance: base64-encoded user-data, a 256 GB gp3 root
      # volume, and run-scoped tags on both instance and volume so stray
      # resources can be traced back to a workflow run.
      - name: Launch EC2 instance
        id: instance
        run: |
          echo "Launching EC2 instance with SSH configured on port ${{ steps.ids.outputs.SSH_PORT }}..."
          # Base64 encode for AWS
          USER_DATA=$(base64 -w 0 /tmp/user-data.sh)
          INSTANCE_ID=$(aws ec2 run-instances \
            --image-id "${{ steps.ami.outputs.AMI_ID }}" \
            --instance-type "$INSTANCE_TYPE" \
            --key-name "${{ secrets.AWS_KEY_PAIR_NAME }}" \
            --security-group-ids "${{ steps.security-group.outputs.SG_ID }}" \
            --user-data "$USER_DATA" \
            --block-device-mappings "DeviceName=/dev/sda1,Ebs={VolumeSize=256,VolumeType=gp3,DeleteOnTermination=true}" \
            --tag-specifications \
              "ResourceType=instance,Tags=[\
              {Key=Name,Value=gpu-runner-${{ steps.ids.outputs.TIMESTAMP }}},\
              {Key=ManagedBy,Value=GitHubActions},\
              {Key=Purpose,Value=GPURunner},\
              {Key=Workflow,Value=${{ github.workflow }}},\
              {Key=RunId,Value=${{ github.run_id }}},\
              {Key=Branch,Value=${{ env.BRANCH }}},\
              {Key=Example,Value=${{ env.EXAMPLE }}}\
              ]" \
              "ResourceType=volume,Tags=[\
              {Key=Name,Value=gpu-runner-${{ steps.ids.outputs.TIMESTAMP }}-volume},\
              {Key=ManagedBy,Value=GitHubActions},\
              {Key=Purpose,Value=GPURunner},\
              {Key=Workflow,Value=${{ github.workflow }}},\
              {Key=Example,Value=${{ env.EXAMPLE }}}\
              ]" \
            --instance-initiated-shutdown-behavior terminate \
            --query 'Instances[0].InstanceId' \
            --region "$AWS_REGION" \
            --output text)
          echo "INSTANCE_ID=$INSTANCE_ID" >> $GITHUB_OUTPUT
          echo "Instance launched: $INSTANCE_ID"
      - name: Wait for instance to be running
        run: |
          echo "Waiting for instance to be running..."
          aws ec2 wait instance-running \
            --instance-ids "${{ steps.instance.outputs.INSTANCE_ID }}" \
            --region "$AWS_REGION"
          PUBLIC_IP=$(aws ec2 describe-instances \
            --instance-ids "${{ steps.instance.outputs.INSTANCE_ID }}" \
            --query 'Reservations[0].Instances[0].PublicIpAddress' \
            --region "$AWS_REGION" \
            --output text)
          echo "::add-mask::$PUBLIC_IP"
          # Exported via GITHUB_ENV so every later step can read $PUBLIC_IP.
          echo "PUBLIC_IP=$PUBLIC_IP" >> $GITHUB_ENV
          echo "Instance is running at: $PUBLIC_IP"
      # Poll SSH on the randomized port (60s head start + up to 40 x 10s);
      # on total failure, dump the EC2 console output for debugging.
      - name: Wait for cloud-init and SSH on custom port
        run: |
          echo "Waiting for cloud-init to complete and SSH to be available on port ${{ steps.ids.outputs.SSH_PORT }}..."
          # Wait longer initially to allow cloud-init to run
          echo "Waiting 60 seconds for cloud-init to start..."
          sleep 60
          MAX_ATTEMPTS=40
          ATTEMPT=0
          while [ $ATTEMPT -lt $MAX_ATTEMPTS ]; do
            if ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
              -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} "echo 'SSH ready on custom port'" 2>/dev/null; then
              echo "SSH connection established on port ${{ steps.ids.outputs.SSH_PORT }}"
              break
            else
              ATTEMPT=$((ATTEMPT + 1))
              if [ $ATTEMPT -eq $MAX_ATTEMPTS ]; then
                echo "Failed to establish SSH connection on port ${{ steps.ids.outputs.SSH_PORT }} after $MAX_ATTEMPTS attempts"
                echo "Attempting to fetch console output for debugging..."
                aws ec2 get-console-output \
                  --instance-id "${{ steps.instance.outputs.INSTANCE_ID }}" \
                  --region "$AWS_REGION" \
                  --output text || echo "Could not fetch console output"
                exit 1
              fi
              echo "Attempt $ATTEMPT/$MAX_ATTEMPTS failed, retrying in 10 seconds..."
              sleep 10
            fi
          done
      # Poll for the /tmp/setup-complete marker written at the end of the
      # user-data script (up to 5 minutes). Deliberately non-fatal: on
      # timeout the workflow continues and later steps surface any problem.
      - name: Wait for instance setup
        run: |
          echo "Waiting for instance setup to complete..."
          MAX_WAIT=300
          ELAPSED=0
          while [ $ELAPSED -lt $MAX_WAIT ]; do
            if ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
              -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
              "test -f /tmp/setup-complete" 2>/dev/null; then
              echo "Instance setup completed"
              break
            else
              sleep 10
              ELAPSED=$((ELAPSED + 10))
              if [ $ELAPSED -ge $MAX_WAIT ]; then
                echo "Setup timeout, continuing anyway..."
                break
              fi
            fi
          done
      # Ship the checked-out tree (committed files only — git archive of
      # HEAD) to the instance and unpack it into ${WORKDIR}.
      - name: Create archive of repository
        run: |
          echo "Creating repository archive..."
          git archive --format=tar.gz --output=/tmp/repo.tar.gz HEAD
      - name: Transfer repository to instance
        run: |
          echo "Transferring repository to instance..."
          scp -P ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" \
            /tmp/repo.tar.gz ${USER}@${PUBLIC_IP}:${WORKDIR}/
          echo "Extracting repository on instance..."
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
            "cd ${WORKDIR} && tar -xzf repo.tar.gz && rm repo.tar.gz"
      # Run the repository's warmup.py on the instance. NOTE(review): the
      # quoted 'ENDSSH' heredoc means ${WORKDIR} is expanded remotely from
      # the instance's login environment — presumably /home/ubuntu; confirm
      # the remote shell defines it (otherwise `cd` targets $HOME anyway).
      - name: Setup Python environment and run warmup
        run: |
          echo "Setting up Python environment and running warmup.py..."
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} << 'ENDSSH'
          set -e
          cd ${WORKDIR}
          # Run warmup.py
          echo "Running warmup.py..."
          python3 warmup.py --skip-confirmation
          echo "Warmup completed"
          ENDSSH
      # NOTE(review): this sources ~/.cargo/env as the ubuntu login user;
      # that file only exists if rustup was installed for this user — the
      # user-data script runs as root, so verify cargo is actually available
      # for ubuntu on this AMI.
      - name: Build Rust project
        run: |
          echo "Building Rust project with cargo..."
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} << 'ENDSSH'
          set -e
          cd ${WORKDIR}
          # Setup Rust environment
          source "$HOME/.cargo/env"
          # Build the project
          echo "Running cargo build --release..."
          cargo build --release
          echo "Cargo build completed"
          ENDSSH
| - name: Convert assertion notebook to Python script | |
| run: | | |
| echo "Converting assertion notebook: examples/fail-examples/assertion.ipynb" | |
| cat > /tmp/convert_assertion.sh << 'SCRIPTEOF' | |
| #!/bin/bash | |
| set -e | |
| cd $WORKDIR | |
| source ~/.local/share/ppf-cts/venv/bin/activate | |
| jupyter nbconvert --to python "examples/fail-examples/assertion.ipynb" --output "/tmp/assertion_base.py" | |
| cat > /tmp/assertion.py << 'PYEOF' | |
| import sys | |
| import os | |
| sys.path.insert(0, '$WORKDIR') | |
| sys.path.insert(0, '$WORKDIR/frontend') | |
| os.environ['PYTHONPATH'] = '$WORKDIR:$WORKDIR/frontend:' + os.environ.get('PYTHONPATH', '') | |
| PYEOF | |
| cat "/tmp/assertion_base.py" >> /tmp/assertion.py | |
| echo "Assertion script prepared at /tmp/assertion.py" | |
| SCRIPTEOF | |
| scp -P ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ | |
| -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \ | |
| -i "${{ steps.keypair.outputs.KEY_PATH }}" \ | |
| /tmp/convert_assertion.sh ${USER}@${PUBLIC_IP}:/tmp/ | |
| ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ | |
| -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \ | |
| -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \ | |
| "chmod +x /tmp/convert_assertion.sh && WORKDIR='${WORKDIR}' /tmp/convert_assertion.sh" | |
| - name: Run assertion test (expect failure) | |
| run: | | |
| echo "Running assertion test to verify error propagation via SSH..." | |
| echo "This test uses the same execution pattern as main examples" | |
| echo "Expected result: FAILURE (AssertionError)" | |
| # Run using the exact same pattern as the main example iterations | |
| if ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ | |
| -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \ | |
| -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \ | |
| "set -o pipefail && echo 'assertion' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/assertion.py 2>&1 | tee /tmp/ci/assertion.log"; then | |
| echo "ERROR: Assertion test should have failed but succeeded" | |
| echo "This means errors are NOT being propagated correctly!" | |
| exit 1 | |
| else | |
| echo "SUCCESS: Assertion test failed as expected" | |
| echo "Error propagation via SSH is working correctly" | |
| echo "Main example tests can now proceed with confidence" | |
| fi | |
      # Convert examples/${EXAMPLE}.ipynb into a runnable /tmp/${EXAMPLE}.py.
      # Expansion timing: the OUTER heredoc (SCRIPT) is unquoted, so ${WORKDIR}
      # expands on the runner as the helper script is written, while the
      # backslash-escaped \${EXAMPLE} survives to expand on the instance
      # (passed via the env prefix on the final ssh command).
      - name: Convert notebook to Python script
        run: |
          echo "Converting notebook example: ${EXAMPLE}.ipynb to Python script"
          cat > /tmp/convert_notebook.sh << SCRIPT
          #!/bin/bash
          set -e
          cd ${WORKDIR}
          # Create CI directory structure for collecting results
          mkdir -p /tmp/ci
          echo "CI directory created for collecting results"
          # Activate Python environment
          source ~/.local/share/ppf-cts/venv/bin/activate
          # Convert notebook to Python script
          echo "Converting notebook examples/\${EXAMPLE}.ipynb to Python script..."
          jupyter nbconvert --to python "examples/\${EXAMPLE}.ipynb" --output "/tmp/\${EXAMPLE}_base.py"
          # Create the runnable script with proper imports
          cat > /tmp/\${EXAMPLE}.py << 'EOF'
          import sys
          import os
          # Add the repository root to Python path so frontend can be imported
          sys.path.insert(0, '${WORKDIR}')
          sys.path.insert(0, '${WORKDIR}/frontend')
          # Set environment variables if needed
          os.environ['PYTHONPATH'] = '${WORKDIR}:${WORKDIR}/frontend:' + os.environ.get('PYTHONPATH', '')
          # Now run the converted notebook
          EOF
          # Append the converted notebook content
          cat "/tmp/\${EXAMPLE}_base.py" >> /tmp/\${EXAMPLE}.py
          echo "Script prepared at /tmp/\${EXAMPLE}.py"
          SCRIPT
          scp -P ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" \
            /tmp/convert_notebook.sh ${USER}@${PUBLIC_IP}:/tmp/
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
            "chmod +x /tmp/convert_notebook.sh && EXAMPLE='${EXAMPLE}' WORKDIR='${WORKDIR}' /tmp/convert_notebook.sh"
      # The ten iteration steps below are intentionally identical except for
      # the ordinal written to frontend/.CI and the log file name, so each
      # run appears as its own step (own status/log) in the Actions UI.
      # pipefail makes a remote python failure fail the step despite `tee`.
      - name: Run 1st iteration
        run: |
          echo "Running 1st iteration of ${EXAMPLE}"
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
            "set -o pipefail && echo '1st' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/${EXAMPLE}.py 2>&1 | tee /tmp/ci/run_1.log"
      - name: Run 2nd iteration
        run: |
          echo "Running 2nd iteration of ${EXAMPLE}"
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
            "set -o pipefail && echo '2nd' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/${EXAMPLE}.py 2>&1 | tee /tmp/ci/run_2.log"
      - name: Run 3rd iteration
        run: |
          echo "Running 3rd iteration of ${EXAMPLE}"
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
            "set -o pipefail && echo '3rd' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/${EXAMPLE}.py 2>&1 | tee /tmp/ci/run_3.log"
      - name: Run 4th iteration
        run: |
          echo "Running 4th iteration of ${EXAMPLE}"
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
            "set -o pipefail && echo '4th' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/${EXAMPLE}.py 2>&1 | tee /tmp/ci/run_4.log"
      - name: Run 5th iteration
        run: |
          echo "Running 5th iteration of ${EXAMPLE}"
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
            "set -o pipefail && echo '5th' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/${EXAMPLE}.py 2>&1 | tee /tmp/ci/run_5.log"
      - name: Run 6th iteration
        run: |
          echo "Running 6th iteration of ${EXAMPLE}"
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
            "set -o pipefail && echo '6th' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/${EXAMPLE}.py 2>&1 | tee /tmp/ci/run_6.log"
      - name: Run 7th iteration
        run: |
          echo "Running 7th iteration of ${EXAMPLE}"
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
            "set -o pipefail && echo '7th' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/${EXAMPLE}.py 2>&1 | tee /tmp/ci/run_7.log"
      - name: Run 8th iteration
        run: |
          echo "Running 8th iteration of ${EXAMPLE}"
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
            "set -o pipefail && echo '8th' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/${EXAMPLE}.py 2>&1 | tee /tmp/ci/run_8.log"
      - name: Run 9th iteration
        run: |
          echo "Running 9th iteration of ${EXAMPLE}"
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
            "set -o pipefail && echo '9th' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/${EXAMPLE}.py 2>&1 | tee /tmp/ci/run_9.log"
      - name: Run 10th iteration
        run: |
          echo "Running 10th iteration of ${EXAMPLE}"
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
            "set -o pipefail && echo '10th' > ${WORKDIR}/frontend/.CI && cd ${WORKDIR} && source ~/.local/share/ppf-cts/venv/bin/activate && python3 /tmp/${EXAMPLE}.py 2>&1 | tee /tmp/ci/run_10.log"
      # Pull all outputs back even if a run failed (success() || failure())
      # so partial results are still uploaded as an artifact.
      - name: Collect results
        if: success() || failure()
        run: |
          echo "Collecting results from all runs..."
          mkdir -p ci
          # Delete large binary files on remote before copying to save bandwidth
          # CI output is in ppf-cts cache directory: ~/.cache/ppf-cts/ci
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
            "find ~/.cache/ppf-cts/ci -type f \( -name '*.bin' -o -name '*.pickle' -o -name '*.ply' -o -name '*.gz' \) -delete 2>/dev/null" || true
          # Copy CI output from ppf-cts cache directory (session data, previews, etc.)
          rsync -avz -e "ssh -p ${{ steps.ids.outputs.SSH_PORT }} -i ${{ steps.keypair.outputs.KEY_PATH }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ServerAliveInterval=60 -o ServerAliveCountMax=10" \
            ${USER}@${PUBLIC_IP}:~/.cache/ppf-cts/ci/ ./ci/ || echo "No ppf-cts CI files found"
          # Also copy logs from /tmp/ci
          rsync -avz -e "ssh -p ${{ steps.ids.outputs.SSH_PORT }} -i ${{ steps.keypair.outputs.KEY_PATH }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ServerAliveInterval=60 -o ServerAliveCountMax=10" \
            ${USER}@${PUBLIC_IP}:/tmp/ci/ ./ci/ || echo "No log files found"
          echo "## Collected Files:"
          ls -laR ci/ | head -100
          echo "## Run Summary:"
          # A present run_N.log only shows the iteration step started tee-ing
          # output, not that the run passed — step status carries pass/fail.
          for i in {1..10}; do
            if [ -f ci/run_${i}.log ]; then
              echo "Run ${i}: Completed"
            else
              echo "Run ${i}: No log found"
            fi
          done
      - name: Upload artifact
        if: success() || failure()
        uses: actions/upload-artifact@v4
        with:
          name: ci-${{ env.EXAMPLE }}
          path: ci
          retention-days: 3
      # Best-effort GPU snapshot for the log; never fails the job.
      - name: GPU information
        if: success() || failure()
        run: |
          echo "Getting GPU information..."
          ssh -p ${{ steps.ids.outputs.SSH_PORT }} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -o ServerAliveInterval=60 -o ServerAliveCountMax=10 \
            -i "${{ steps.keypair.outputs.KEY_PATH }}" ${USER}@${PUBLIC_IP} \
            "nvidia-smi" || echo "Failed to get GPU info"
      # Cleanup block: always runs, each step tolerates partial failure so
      # one broken cleanup does not block the others.
      # Re-assume the role first — OIDC credentials may have expired during
      # a long run.
      - name: Re-authenticate for cleanup
        if: always()
        continue-on-error: true
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
          aws-region: ${{ env.AWS_REGION }}
      - name: Cleanup - Terminate Instance
        if: always()
        continue-on-error: true
        run: |
          if [ -n "${{ steps.instance.outputs.INSTANCE_ID }}" ]; then
            echo "Initiating instance termination: ${{ steps.instance.outputs.INSTANCE_ID }}"
            # Fire-and-forget: no wait, termination proceeds in background.
            aws ec2 terminate-instances \
              --instance-ids "${{ steps.instance.outputs.INSTANCE_ID }}" \
              --region "$AWS_REGION" || true
            echo "Termination initiated. Instance will terminate in the background."
          else
            echo "No instance to terminate"
          fi
      # Revoke only this run's ingress rule; the persistent security group
      # itself is kept for reuse by future runs.
      - name: Cleanup - Remove Ingress Rules
        if: always()
        continue-on-error: true
        run: |
          if [ -n "${{ steps.security-group.outputs.SG_ID }}" ] && [ -n "${{ steps.security-group.outputs.RUNNER_IP_CIDR }}" ]; then
            echo "Removing ingress rules from security group ${{ steps.security-group.outputs.SG_ID }}"
            # Remove custom port rule
            if [ -n "${{ steps.security-group.outputs.SSH_PORT }}" ]; then
              echo "Removing port ${{ steps.security-group.outputs.SSH_PORT }} rule..."
              aws ec2 revoke-security-group-ingress \
                --group-id "${{ steps.security-group.outputs.SG_ID }}" \
                --ip-permissions \
                  "IpProtocol=tcp,FromPort=${{ steps.security-group.outputs.SSH_PORT }},ToPort=${{ steps.security-group.outputs.SSH_PORT }},IpRanges=[{CidrIp=${{ steps.security-group.outputs.RUNNER_IP_CIDR }}}]" \
                --region "$AWS_REGION" 2>&1 || echo "Note: Custom port rule may have already been removed"
            fi
            echo "Ingress rule removed successfully"
            echo "Security group ${{ steps.security-group.outputs.SG_ID }} remains for future use"
          else
            echo "No ingress rules to remove"
          fi
      # Remove the decrypted private key from the runner's disk.
      - name: Cleanup - Remove Local SSH Key
        if: always()
        continue-on-error: true
        run: |
          if [ -n "${{ steps.keypair.outputs.KEY_PATH }}" ] && [ -f "${{ steps.keypair.outputs.KEY_PATH }}" ]; then
            rm -f "${{ steps.keypair.outputs.KEY_PATH }}"
            echo "Local SSH key file removed"
          fi
| - name: Summary | |
| if: always() | |
| run: | | |
| echo "## Workflow Summary" | |
| echo "- Region: $AWS_REGION" | |
| echo "- Instance Type: $INSTANCE_TYPE" | |
| echo "- Branch: $BRANCH" | |
| echo "- Example: $EXAMPLE" | |
| echo "- Instance ID: ${{ steps.instance.outputs.INSTANCE_ID || 'Not launched' }}" | |
| echo "- Run Status: ${{ steps.run_example.outcome || 'Not run' }}" |