Nightly CI #5
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Nightly CI workflow: triggered by a daily cron schedule or by manual dispatch. | |
| name: Nightly CI | |
| on: | |
| schedule: | |
| # 00:00 Beijing time (UTC+8) = 16:00 UTC | |
| - cron: '0 16 * * *' | |
| # Manual runs are accepted here; the `guard` job below fails them unless the actor has admin permission. | |
| workflow_dispatch: | |
| # Every run shares the single `nightly-ci` group, and a newly triggered run cancels the one in progress. | |
| # NOTE(review): cancellation presumably happens when the new run is queued, i.e. before `guard` has | |
| # checked the actor -- confirm that a rejected manual dispatch cancelling a running nightly is intended. | |
| concurrency: | |
| group: nightly-ci | |
| cancel-in-progress: true | |
| # Values read as environment variables by the gcloud steps ($GCP_PROJECT_ID, $INSTANCE_NAME, | |
| # $INSTANCE_ZONE) and by the runner-wait script (process.env.INSTANCE_NAME). | |
| env: | |
| GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }} | |
| INSTANCE_NAME: areal-nightly | |
| INSTANCE_ZONE: asia-southeast1-c | |
| jobs: | |
| # Gate job: every other job depends on it (directly or transitively) through `needs`. | |
| guard: | |
| name: Check trigger permissions | |
| runs-on: ubuntu-latest | |
| steps: | |
| # Only evaluated for manual dispatches; scheduled runs skip this step and the job succeeds. | |
| - name: Block non-admin manual triggers | |
| if: github.event_name == 'workflow_dispatch' | |
| uses: actions/github-script@v8 | |
| with: | |
| script: | | |
| // Ask the API which permission level the triggering actor holds on this repository. | |
| const { owner, repo } = context.repo; | |
| const lookup = await github.rest.repos.getCollaboratorPermissionLevel({ | |
| owner, | |
| repo, | |
| username: context.actor, | |
| }); | |
| const level = lookup.data.permission; | |
| // Admins may proceed; nothing else to do. | |
| if (level === 'admin') { | |
| return; | |
| } | |
| // Anyone else fails the job, which keeps the dependent jobs from starting the instance. | |
| core.setFailed(`User ${context.actor} has '${level}' permission; admin required for manual dispatch.`); | |
| # Brings the self-hosted runner VM up before the tests run. Only starts after `guard` succeeds. | |
| start-runner: | |
| name: Start areal-nightly instance | |
| needs: guard | |
| runs-on: ubuntu-latest | |
| # `was_running` is 'true' when the instance was already RUNNING and 'false' when this job started it; | |
| # the `stop-runner` job reads it to decide whether to stop the instance afterwards. | |
| outputs: | |
| was_running: ${{ steps.start-instance.outputs.was_running }} | |
| steps: | |
| # Passes the GCP_SERVICE_ACCOUNT_KEY secret to the auth action as `credentials_json`. | |
| - name: Authenticate to Google Cloud | |
| uses: google-github-actions/auth@v3 | |
| with: | |
| credentials_json: ${{ secrets.GCP_SERVICE_ACCOUNT_KEY }} | |
| # Provides the `gcloud` CLI used by the following step. | |
| - name: Set up Google Cloud SDK | |
| uses: google-github-actions/setup-gcloud@v3 | |
| # Reads the instance status and brings the instance up when it is stopped or suspended. | |
| # Writes the step output `was_running` ('true' = already running, 'false' = this step had to bring it up). | |
| # Fails the step when the instance cannot be described or is in any other status. | |
| - name: Start instance if stopped | |
| id: start-instance | |
| run: | | |
| set -euo pipefail | |
| # A failing `describe` (missing instance, auth problem, ...) is mapped to the NOT_FOUND sentinel; | |
| # gcloud's own stderr is left visible in the log to show the real cause. | |
| status=$(gcloud compute instances describe "$INSTANCE_NAME" \ | |
| --project "$GCP_PROJECT_ID" \ | |
| --zone "$INSTANCE_ZONE" \ | |
| --format="get(status)" || echo "NOT_FOUND") | |
| if [ "$status" = "NOT_FOUND" ]; then | |
| echo "Error: Instance $INSTANCE_NAME not found in zone $INSTANCE_ZONE" >&2 | |
| exit 1 | |
| fi | |
| if [ "$status" = "RUNNING" ]; then | |
| echo "Instance $INSTANCE_NAME is already running." | |
| echo "was_running=true" >> "$GITHUB_OUTPUT" | |
| elif [ "$status" = "TERMINATED" ] || [ "$status" = "SUSPENDED" ]; then | |
| # A SUSPENDED instance has to be resumed; `start` only applies to a stopped (TERMINATED) one. | |
| if [ "$status" = "SUSPENDED" ]; then | |
| action="resume" | |
| else | |
| action="start" | |
| fi | |
| echo "Instance $INSTANCE_NAME is $status. Starting it..." | |
| # Record the output before the attempt so it is set even if the command below fails partway. | |
| echo "was_running=false" >> "$GITHUB_OUTPUT" | |
| gcloud compute instances "$action" "$INSTANCE_NAME" \ | |
| --project "$GCP_PROJECT_ID" \ | |
| --zone "$INSTANCE_ZONE" | |
| echo "Instance started successfully." | |
| else | |
| echo "Instance $INSTANCE_NAME has unexpected status: $status" >&2 | |
| exit 1 | |
| fi | |
| # Polls the repository's self-hosted runner list until a runner named after the instance reports | |
| # `online`. Gives up after maxAttempts polls spaced delayMs apart (60 x 15 s). | |
| - name: Wait for runner to come online | |
| uses: actions/github-script@v8 | |
| env: | |
| GH_PAT: ${{ secrets.GH_PAT }} | |
| with: | |
| github-token: ${{ secrets.GH_PAT }} | |
| script: | | |
| // `||` (not `??`) on purpose: an empty INSTANCE_NAME must also fall back to the default. | |
| const instanceName = process.env.INSTANCE_NAME || 'areal-nightly'; | |
| const maxAttempts = 60; | |
| const delayMs = 15000; | |
| const pat = process.env.GH_PAT; | |
| if (!pat) { | |
| core.setFailed('GH_PAT secret is not configured.'); | |
| return; | |
| } | |
| const wait = (ms) => new Promise((resolve) => setTimeout(resolve, ms)); | |
| for (let attempt = 1; attempt <= maxAttempts; attempt++) { | |
| let runners = []; | |
| try { | |
| const response = await github.rest.actions.listSelfHostedRunnersForRepo({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| per_page: 100, | |
| request: { | |
| headers: { | |
| authorization: `token ${pat}`, | |
| }, | |
| }, | |
| }); | |
| runners = response.data.runners; | |
| } catch (error) { | |
| // 4xx responses (bad token, missing scope, wrong repo) will not heal by retrying: fail now. | |
| const httpStatus = error.status; | |
| if (typeof httpStatus === 'number' && httpStatus >= 400 && httpStatus < 500) { | |
| throw error; | |
| } | |
| // Anything else (5xx, network hiccup) is treated as transient so one blip does not abort the wait. | |
| core.warning(`Listing runners failed (attempt ${attempt}/${maxAttempts}): ${error.message}`); | |
| } | |
| const found = runners.find((runner) => runner.name === instanceName); | |
| if (found && found.status === 'online') { | |
| core.info(`Runner ${instanceName} is online.`); | |
| return; | |
| } | |
| core.info(`Runner ${instanceName} not ready yet (attempt ${attempt}/${maxAttempts}).`); | |
| // No point sleeping after the final poll; fall through to the timeout error immediately. | |
| if (attempt < maxAttempts) { | |
| await wait(delayMs); | |
| } | |
| } | |
| throw new Error(`Timed out waiting for runner ${instanceName} to come online.`); | |
| # Runs on the self-hosted runner labelled `areal-nightly` once `start-runner` has finished. | |
| nightly-tests: | |
| name: Run nightly tests | |
| needs: start-runner | |
| runs-on: | |
| - self-hosted | |
| - areal-nightly | |
| timeout-minutes: 480 | |
| steps: | |
| - uses: actions/checkout@v6 | |
| # Prints GPU, Python, disk and memory details of the runner for later debugging. | |
| - name: System info | |
| run: | | |
| # Print a section banner in the form "=== <title> ===". | |
| banner() { | |
| echo "=== $1 ===" | |
| } | |
| banner "GPU info" | |
| nvidia-smi | |
| echo | |
| banner "Python" | |
| # Fall back to python3 when no plain `python` executable is available. | |
| python --version || python3 --version | |
| echo | |
| banner "Disk" | |
| df -h | |
| echo | |
| banner "Memory" | |
| free -h | |
| # Placeholder until real long-running tests are wired in. | |
| - name: Dummy test (placeholder) | |
| run: | | |
| echo "Nightly CI running on $(hostname) at $(date -u)" | |
| echo "TODO: Replace with actual long-running tests" | |
| sleep 10 | |
| echo "Dummy test completed successfully." | |
| # Cleanup job: stops the instance after the tests, whatever their outcome (`always()`). | |
| stop-runner: | |
| name: Stop areal-nightly instance | |
| needs: | |
| - start-runner | |
| - nightly-tests | |
| # Stop the instance only when this run is the one that started it (was_running == 'false'). | |
| # Comparing with != 'true' would also match an empty output -- e.g. when `start-runner` was skipped | |
| # because `guard` rejected the dispatch -- and would stop an instance this run never started. | |
| if: always() && needs.start-runner.outputs.was_running == 'false' | |
| runs-on: ubuntu-latest | |
| steps: | |
| # Passes the GCP_SERVICE_ACCOUNT_KEY secret to the auth action as `credentials_json`. | |
| - name: Authenticate to Google Cloud | |
| uses: google-github-actions/auth@v3 | |
| with: | |
| credentials_json: ${{ secrets.GCP_SERVICE_ACCOUNT_KEY }} | |
| - name: Set up Google Cloud SDK | |
| uses: google-github-actions/setup-gcloud@v3 | |
| # Best-effort stop: a failed status lookup is reported as a warning and does not fail the job. | |
| - name: Stop instance | |
| run: | | |
| set -euo pipefail | |
| # gcloud's stderr is intentionally left visible so an auth or API failure is not mistaken | |
| # for a missing instance. | |
| status=$(gcloud compute instances describe "$INSTANCE_NAME" \ | |
| --project "$GCP_PROJECT_ID" \ | |
| --zone "$INSTANCE_ZONE" \ | |
| --format="get(status)" || echo "NOT_FOUND") | |
| if [ "$status" = "NOT_FOUND" ]; then | |
| # Surface the problem as a workflow annotation: the instance may still be running. | |
| echo "::warning::Could not describe instance $INSTANCE_NAME (not found or gcloud error); it may still be running." | |
| exit 0 | |
| fi | |
| if [ "$status" = "RUNNING" ]; then | |
| echo "Stopping instance $INSTANCE_NAME..." | |
| gcloud compute instances stop "$INSTANCE_NAME" \ | |
| --project "$GCP_PROJECT_ID" \ | |
| --zone "$INSTANCE_ZONE" | |
| echo "Instance stopped successfully." | |
| else | |
| echo "Instance $INSTANCE_NAME is already in status: $status" | |
| fi | |