Skip to content

Nightly CI

Nightly CI #5

Workflow file for this run

# Workflow-level settings for the nightly pipeline: display name, triggers,
# run serialization, and the environment shared by every job.
name: Nightly CI

on:
  schedule:
    # 00:00 Beijing time (UTC+8) = 16:00 UTC
    - cron: "0 16 * * *"
  # Also allow runs to be started manually.
  workflow_dispatch:

# All runs share a single concurrency group; a newly triggered run cancels
# whichever run of this workflow is still in progress.
concurrency:
  group: nightly-ci
  cancel-in-progress: true

# Exposed as environment variables to the steps of every job below.
env:
  GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }}
  INSTANCE_NAME: areal-nightly
  INSTANCE_ZONE: asia-southeast1-c
# Pipeline: guard -> start-runner -> nightly-tests -> stop-runner.
jobs:
  # Gate for manual runs: only repository admins may dispatch this workflow.
  # For scheduled runs the single step is skipped and the job succeeds.
  guard:
    name: Check trigger permissions
    runs-on: ubuntu-latest
    steps:
      - name: Block non-admin manual triggers
        if: github.event_name == 'workflow_dispatch'
        uses: actions/github-script@v8
        with:
          script: |
            const { data: { permission } } = await github.rest.repos.getCollaboratorPermissionLevel({
              owner: context.repo.owner,
              repo: context.repo.repo,
              username: context.actor,
            });
            if (permission !== 'admin') {
              core.setFailed(`User ${context.actor} has '${permission}' permission; admin required for manual dispatch.`);
            }

  # Brings the GCE instance up (if it is not already) and waits until the
  # self-hosted runner named after the instance reports as online.
  start-runner:
    name: Start areal-nightly instance
    needs: guard
    runs-on: ubuntu-latest
    outputs:
      # 'true'  -> the instance was already running; it is left running.
      # 'false' -> this run started (or attempted to start) the instance, so
      #            stop-runner is responsible for shutting it down.
      # empty   -> the job was skipped or failed before making that decision.
      was_running: ${{ steps.start-instance.outputs.was_running }}
    steps:
      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v3
        with:
          credentials_json: ${{ secrets.GCP_SERVICE_ACCOUNT_KEY }}

      - name: Set up Google Cloud SDK
        uses: google-github-actions/setup-gcloud@v3

      - name: Start instance if stopped
        id: start-instance
        run: |
          set -euo pipefail

          # Any describe failure is mapped to NOT_FOUND; gcloud's own stderr is
          # left visible in the log so the real cause can be diagnosed.
          status=$(gcloud compute instances describe "$INSTANCE_NAME" \
            --project "$GCP_PROJECT_ID" \
            --zone "$INSTANCE_ZONE" \
            --format="get(status)" || echo "NOT_FOUND")

          case "$status" in
            NOT_FOUND)
              echo "Error: Instance $INSTANCE_NAME not found in zone $INSTANCE_ZONE" >&2
              exit 1
              ;;
            RUNNING)
              echo "Instance $INSTANCE_NAME is already running."
              echo "was_running=true" >> "$GITHUB_OUTPUT"
              ;;
            TERMINATED | SUSPENDED)
              echo "Instance $INSTANCE_NAME is $status. Starting it..."
              # Record ownership BEFORE the start call: if the start fails or
              # only partially succeeds, stop-runner must still clean up.
              echo "was_running=false" >> "$GITHUB_OUTPUT"
              # A suspended VM is brought back with `resume`; `start` is for
              # stopped (TERMINATED) VMs.
              if [ "$status" = "SUSPENDED" ]; then
                action="resume"
              else
                action="start"
              fi
              gcloud compute instances "$action" "$INSTANCE_NAME" \
                --project "$GCP_PROJECT_ID" \
                --zone "$INSTANCE_ZONE"
              echo "Instance started successfully."
              ;;
            *)
              # Transitional or unknown states (anything not handled above).
              echo "Instance $INSTANCE_NAME has unexpected status: $status" >&2
              exit 1
              ;;
          esac

      - name: Wait for runner to come online
        uses: actions/github-script@v8
        env:
          # Only used to produce a clear error when the secret is missing.
          GH_PAT: ${{ secrets.GH_PAT }}
        with:
          # The REST client is authenticated with the PAT instead of the
          # default token. NOTE(review): presumably because listing
          # self-hosted runners needs admin-level access - confirm PAT scopes.
          github-token: ${{ secrets.GH_PAT }}
          script: |
            const instanceName = process.env.INSTANCE_NAME || 'areal-nightly';
            const maxAttempts = 60;
            const delayMs = 15000; // 60 attempts x 15 s = up to ~15 minutes
            if (!process.env.GH_PAT) {
              core.setFailed('GH_PAT secret is not configured.');
              return;
            }
            const wait = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
            for (let attempt = 1; attempt <= maxAttempts; attempt++) {
              // `github` already carries the PAT via the github-token input,
              // so no per-request authorization header is needed.
              const response = await github.rest.actions.listSelfHostedRunnersForRepo({
                owner: context.repo.owner,
                repo: context.repo.repo,
                per_page: 100,
              });
              const found = response.data.runners.find((runner) => runner.name === instanceName);
              if (found && found.status === 'online') {
                core.info(`Runner ${instanceName} is online.`);
                return;
              }
              core.info(`Runner ${instanceName} not ready yet (attempt ${attempt}/${maxAttempts}).`);
              // No point sleeping after the final attempt; fail right away.
              if (attempt < maxAttempts) {
                await wait(delayMs);
              }
            }
            throw new Error(`Timed out waiting for runner ${instanceName} to come online.`);

  # Runs on the self-hosted runner labelled `areal-nightly`.
  nightly-tests:
    name: Run nightly tests
    needs: start-runner
    runs-on:
      - self-hosted
      - areal-nightly
    timeout-minutes: 480
    steps:
      - uses: actions/checkout@v6

      - name: System info
        run: |
          echo "=== GPU info ==="
          nvidia-smi
          echo ""
          echo "=== Python ==="
          python --version || python3 --version
          echo ""
          echo "=== Disk ==="
          df -h
          echo ""
          echo "=== Memory ==="
          free -h

      - name: Dummy test (placeholder)
        run: |
          echo "Nightly CI running on $(hostname) at $(date -u)"
          echo "TODO: Replace with actual long-running tests"
          sleep 10
          echo "Dummy test completed successfully."

  # Shuts the instance down again, but only when this run was the one that
  # started it.
  stop-runner:
    name: Stop areal-nightly instance
    needs:
      - start-runner
      - nightly-tests
    # always(): run even when the tests failed or the run was cancelled.
    # The explicit == 'false' matters: when start-runner is skipped (e.g. the
    # guard rejected a manual dispatch) its output is empty, and an empty
    # value must NOT lead to stopping an instance this run never started.
    if: always() && needs.start-runner.outputs.was_running == 'false'
    runs-on: ubuntu-latest
    steps:
      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v3
        with:
          credentials_json: ${{ secrets.GCP_SERVICE_ACCOUNT_KEY }}

      - name: Set up Google Cloud SDK
        uses: google-github-actions/setup-gcloud@v3

      - name: Stop instance
        run: |
          set -euo pipefail

          # Best-effort cleanup: a failed lookup does not fail the job, but
          # gcloud's stderr stays visible and a warning annotation is raised
          # so that a possibly still-running instance does not go unnoticed.
          if ! status=$(gcloud compute instances describe "$INSTANCE_NAME" \
            --project "$GCP_PROJECT_ID" \
            --zone "$INSTANCE_ZONE" \
            --format="get(status)"); then
            echo "::warning::Could not describe instance $INSTANCE_NAME; it may not exist or may still be running."
            exit 0
          fi

          if [ "$status" = "RUNNING" ]; then
            echo "Stopping instance $INSTANCE_NAME..."
            gcloud compute instances stop "$INSTANCE_NAME" \
              --project "$GCP_PROJECT_ID" \
              --zone "$INSTANCE_ZONE"
            echo "Instance stopped successfully."
          else
            echo "Instance $INSTANCE_NAME is already in status: $status"
          fi