From 521abb2906f43a7da004f55c8792ec9affffedc6 Mon Sep 17 00:00:00 2001 From: kunxian xia Date: Tue, 23 Jun 2026 21:12:49 +0800 Subject: [PATCH] ci: protect self-hosted runners from fork PRs --- .github/workflows/gpu-integration.yml | 17 +++++++++++++---- .github/workflows/integration.yml | 6 +++++- ci/gpu-runner/README.md | 18 +++++++++++++----- ci/gpu-runner/entrypoint.sh | 6 +++++- 4 files changed, 36 insertions(+), 11 deletions(-) diff --git a/.github/workflows/gpu-integration.yml b/.github/workflows/gpu-integration.yml index c08361e2d..6d2bdd58d 100644 --- a/.github/workflows/gpu-integration.yml +++ b/.github/workflows/gpu-integration.yml @@ -25,10 +25,14 @@ concurrency: jobs: gpu-integration: - # Manual dispatch always runs; PRs only when non-draft and labeled `gpu-ci`. + # Manual dispatch and master pushes run; PRs only run for non-draft, same-repo + # branches labeled `gpu-ci`. Fork PRs must not execute on self-hosted runners. if: | github.event_name == 'workflow_dispatch' || - (github.event.pull_request.draft == false && + github.event_name == 'push' || + (github.event_name == 'pull_request' && + github.event.pull_request.draft == false && + github.event.pull_request.head.repo.full_name == github.repository && contains(github.event.pull_request.labels.*.name, 'gpu-ci')) name: GPU integration testing @@ -50,6 +54,10 @@ jobs: - uses: actions/checkout@v4 + - name: Isolate PR cargo target + if: github.event_name == 'pull_request' + run: echo "CARGO_TARGET_DIR=${GITHUB_WORKSPACE}/target" >> "$GITHUB_ENV" + # The real GPU backend lives in the private ceno-gpu repo. 
Cargo.toml ships # with the [patch] that redirects `ceno_gpu` to ../ceno-gpu/cuda_hal # COMMENTED OUT (it's a local-dev affordance), so by default cargo would @@ -81,9 +89,10 @@ jobs: - name: Install m4 run: sudo apt-get install -y m4 - # No actions/cache here: the GPU runner persists target/ on a host volume + # No actions/cache here: trusted GPU runs persist target/ on a host volume # via CARGO_TARGET_DIR (see ci/gpu-runner/README.md), which is bigger and - # faster than GitHub's cache backend for a workspace this size. + # faster than GitHub's cache backend for a workspace this size. PR runs + # override CARGO_TARGET_DIR to the ephemeral workspace target above. # Release-only on purpose: unlike the CPU integration job we do NOT also # run debug-mode steps. Building both profiles compiles the whole workspace diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index bdb751e32..3a2c50080 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -14,6 +14,7 @@ concurrency: jobs: skip_check: + if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository runs-on: [ self-hosted, Linux, X64 ] outputs: should_skip: ${{ steps.skip_check.outputs.should_skip }} @@ -29,7 +30,10 @@ jobs: integration: needs: [ skip_check ] if: | - github.event.pull_request.draft == false && + (github.event_name != 'pull_request' || + github.event.pull_request.head.repo.full_name == github.repository) && + (github.event_name != 'pull_request' || + github.event.pull_request.draft == false) && (github.event.action == 'ready_for_review' || needs.skip_check.outputs.should_skip != 'true') name: Integration testing diff --git a/ci/gpu-runner/README.md b/ci/gpu-runner/README.md index 0558e6478..04f381710 100644 --- a/ci/gpu-runner/README.md +++ b/ci/gpu-runner/README.md @@ -103,7 +103,8 @@ prover (`--features gpu`) on this runner. 
It's opt-in (GPU time is scarce): - **Manually** — Actions tab → *GPU Integration* → *Run workflow*. Optionally set the `example` input (default `keccak_syscall`). -- **On a PR** — add the `gpu-ci` label to the PR. +- **On a PR** — add the `gpu-ci` label to a PR from a branch in this repository. + Fork PRs are skipped because they must not execute code on self-hosted runners. - **On push to master** — records the proving-time baseline (see below). It loads `secrets.CENO_GPU_DEPLOY_KEY` into ssh-agent, clones the private @@ -161,7 +162,9 @@ The watchdog checks every minute and restarts on stop or GPU-unreachable. ## Notes & gotchas - **PAT scope**: repo-level runner needs `repo` (classic) or fine-grained - "Administration: Read and write" on this repo. + "Administration: Read and write" on this repo. The entrypoint uses the PAT + only to mint a runner registration token, then unsets it before starting the + Actions runner so job steps do not inherit it. - **Warm builds across ephemeral restarts**: two named volumes persist between containers — `ceno-gpu-runner-cargo` (the cargo registry, so deps aren't re-downloaded) and `ceno-gpu-runner-target` (mounted at `/cache/target`, with @@ -169,9 +172,14 @@ The watchdog checks every minute and restarts on stop or GPU-unreachable. This keeps recompiles fast even though each job runs in a fresh container. Because of this, **GPU jobs can drop the `actions/cache` step for `target/`** — the host volume is bigger (no ~10 GB cache limit) and faster (no network - restore) than GitHub's cache backend for a workspace this size. The single - ephemeral runner serves one job at a time, so there's no concurrent writer on - the shared target dir. (To reset: `docker volume rm ceno-gpu-runner-target`.) + restore) than GitHub's cache backend for a workspace this size. PR jobs + override `CARGO_TARGET_DIR` to the ephemeral workspace target so they do not + share the trusted persistent target dir.
The single ephemeral runner serves + one job at a time, so there's no concurrent writer on the shared target dir. + (To reset: `docker volume rm ceno-gpu-runner-target`.) +- **Fork PRs**: any workflow that targets this runner must guard PR jobs with + `github.event.pull_request.head.repo.full_name == github.repository`, or use a + GitHub-hosted runner for fork code. - **Alternative to cron**: `--restart unless-stopped` in `start-runner.sh` restarts on crash instantly, but won't catch a hung container or a GPU that silently went away — that's why the watchdog also probes `nvidia-smi`. diff --git a/ci/gpu-runner/entrypoint.sh b/ci/gpu-runner/entrypoint.sh index cf04853b4..6526181da 100755 --- a/ci/gpu-runner/entrypoint.sh +++ b/ci/gpu-runner/entrypoint.sh @@ -10,7 +10,8 @@ # # Required env: # GITHUB_PAT - classic PAT with `repo` scope, or fine-grained token with -# "Administration: read/write" on the repo. +# "Administration: read/write" on the repo. Used only to mint a +# runner registration token, then removed from the job env. # REPO_URL - e.g. https://github.com/scroll-tech/ceno # Optional env: # RUNNER_NAME - defaults to gpu- @@ -41,6 +42,9 @@ if [[ -z "${REG_TOKEN}" || "${REG_TOKEN}" == "null" ]]; then exit 1 fi +# Do not pass the long-lived PAT into the Actions runner or any job steps. +unset GITHUB_PAT + cleanup() { echo "[entrypoint] de-registering runner ..." ./config.sh remove --token "${REG_TOKEN}" || true