Skip to content

Commit d819287

Browse files
committed
ci(actions): split gpu checks into dedicated workflows
1 parent ed3e12e commit d819287

File tree

4 files changed

+118
-115
lines changed

4 files changed

+118
-115
lines changed

.github/workflows/branch-e2e.yml

Lines changed: 0 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,6 @@ name: Branch E2E Checks
33
on:
44
pull_request:
55
types: [opened, synchronize, reopened, labeled]
6-
workflow_dispatch:
7-
inputs:
8-
gpu-runner:
9-
description: GPU runner label to target
10-
required: true
11-
type: choice
12-
default: linux-arm64-gpu-l4-latest-1
13-
options:
14-
- linux-arm64-gpu-l4-latest-1
15-
- linux-amd64-gpu-rtxpro6000-latest-1
166

177
permissions:
188
contents: read
@@ -36,34 +26,8 @@ jobs:
3626
runner: build-arm64
3727

3828
e2e:
39-
if: github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'test:e2e')
4029
needs: [build-gateway, build-cluster]
4130
uses: ./.github/workflows/e2e-test.yml
4231
with:
4332
image-tag: ${{ github.sha }}
4433
runner: build-arm64
45-
46-
build-gateway-gpu:
47-
if: github.event_name == 'workflow_dispatch'
48-
uses: ./.github/workflows/docker-build.yml
49-
with:
50-
component: gateway
51-
platform: ${{ inputs.gpu-runner == 'linux-arm64-gpu-l4-latest-1' && 'linux/arm64' || 'linux/amd64' }}
52-
runner: ${{ inputs.gpu-runner == 'linux-arm64-gpu-l4-latest-1' && 'build-arm64' || 'build-amd64' }}
53-
54-
build-cluster-gpu:
55-
if: github.event_name == 'workflow_dispatch'
56-
uses: ./.github/workflows/docker-build.yml
57-
with:
58-
component: cluster
59-
platform: ${{ inputs.gpu-runner == 'linux-arm64-gpu-l4-latest-1' && 'linux/arm64' || 'linux/amd64' }}
60-
runner: ${{ inputs.gpu-runner == 'linux-arm64-gpu-l4-latest-1' && 'build-arm64' || 'build-amd64' }}
61-
62-
e2e-gpu:
63-
if: github.event_name == 'workflow_dispatch'
64-
needs: [build-gateway-gpu, build-cluster-gpu]
65-
uses: ./.github/workflows/e2e-test.yml
66-
with:
67-
image-tag: ${{ github.sha }}
68-
runner: ${{ inputs.gpu-runner }}
69-
suite: python-gpu
.github/workflows/e2e-gpu-test.yaml

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
name: GPU E2E Test
2+
3+
on:
4+
workflow_call:
5+
inputs:
6+
image-tag:
7+
description: "Image tag to test (typically the commit SHA)"
8+
required: true
9+
type: string
10+
11+
permissions:
12+
contents: read
13+
packages: read
14+
15+
jobs:
16+
e2e-gpu:
17+
name: "E2E GPU (${{ matrix.name }})"
18+
runs-on: ${{ matrix.runner }}
19+
continue-on-error: ${{ matrix.experimental }}
20+
timeout-minutes: 30
21+
strategy:
22+
fail-fast: false
23+
matrix:
24+
include:
25+
- name: linux-arm64
26+
runner: linux-arm64-gpu-l4-latest-1
27+
cluster: e2e-gpu-arm64
28+
port: "8083"
29+
experimental: false
30+
- name: linux-amd64
31+
runner: linux-amd64-gpu-rtxpro6000-latest-1
32+
cluster: e2e-gpu-amd64
33+
port: "8084"
34+
experimental: false
35+
- name: wsl-amd64
36+
runner: wsl-amd64-gpu-rtxpro6000-latest-1
37+
cluster: e2e-gpu-wsl
38+
port: "8085"
39+
experimental: true
40+
container:
41+
image: ghcr.io/nvidia/openshell/ci:latest
42+
credentials:
43+
username: ${{ github.actor }}
44+
password: ${{ secrets.GITHUB_TOKEN }}
45+
options: --privileged
46+
volumes:
47+
- /var/run/docker.sock:/var/run/docker.sock
48+
env:
49+
MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
50+
IMAGE_TAG: ${{ inputs.image-tag }}
51+
OPENSHELL_REGISTRY: ghcr.io/nvidia/openshell
52+
OPENSHELL_REGISTRY_HOST: ghcr.io
53+
OPENSHELL_REGISTRY_NAMESPACE: nvidia/openshell
54+
OPENSHELL_REGISTRY_USERNAME: ${{ github.actor }}
55+
OPENSHELL_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }}
56+
OPENSHELL_GATEWAY: ${{ matrix.cluster }}
57+
steps:
58+
- uses: actions/checkout@v4
59+
60+
- name: Log in to GHCR
61+
run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin
62+
63+
- name: Pull cluster image
64+
run: docker pull ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag }}
65+
66+
- name: Install Python dependencies and generate protobuf stubs
67+
run: uv sync --frozen && mise run --no-prepare python:proto
68+
69+
- name: Bootstrap GPU cluster
70+
env:
71+
GATEWAY_HOST: host.docker.internal
72+
GATEWAY_PORT: ${{ matrix.port }}
73+
CLUSTER_NAME: ${{ matrix.cluster }}
74+
# Passes --gpu to the gateway bootstrap so the cluster comes up with GPU passthrough enabled.
75+
CLUSTER_GPU: "1"
76+
SKIP_IMAGE_PUSH: "1"
77+
SKIP_CLUSTER_IMAGE_BUILD: "1"
78+
OPENSHELL_CLUSTER_IMAGE: ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag }}
79+
run: mise run --no-prepare --skip-deps cluster
80+
81+
- name: Run tests
82+
run: mise run --no-prepare --skip-deps e2e:python:gpu

.github/workflows/e2e-test.yml

Lines changed: 5 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,6 @@
11
name: E2E Test
22

33
on:
4-
workflow_dispatch:
5-
inputs:
6-
image-tag:
7-
description: "Image tag to test (typically the commit SHA)"
8-
required: false
9-
type: string
10-
default: ""
11-
runner:
12-
description: "GitHub Actions runner label"
13-
required: false
14-
type: string
15-
default: "build-amd64"
16-
suite:
17-
description: "E2E suite to run"
18-
required: false
19-
type: choice
20-
default: all
21-
options:
22-
- all
23-
- python-gpu
244
workflow_call:
255
inputs:
266
image-tag:
@@ -32,11 +12,6 @@ on:
3212
required: false
3313
type: string
3414
default: "build-amd64"
35-
suite:
36-
description: "E2E suite to run"
37-
required: false
38-
type: string
39-
default: "all"
4015

4116
permissions:
4217
contents: read
@@ -45,7 +20,6 @@ permissions:
4520
jobs:
4621
e2e:
4722
name: "E2E (${{ matrix.suite }})"
48-
if: inputs.suite != 'python-gpu'
4923
runs-on: ${{ inputs.runner }}
5024
timeout-minutes: 30
5125
strategy:
@@ -74,7 +48,7 @@ jobs:
7448
- /var/run/docker.sock:/var/run/docker.sock
7549
env:
7650
MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
77-
IMAGE_TAG: ${{ inputs.image-tag || github.sha }}
51+
IMAGE_TAG: ${{ inputs.image-tag }}
7852
OPENSHELL_REGISTRY: ghcr.io/nvidia/openshell
7953
OPENSHELL_REGISTRY_HOST: ghcr.io
8054
OPENSHELL_REGISTRY_NAMESPACE: nvidia/openshell
@@ -88,18 +62,18 @@ jobs:
8862
run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin
8963

9064
- name: Pull cluster image
91-
run: docker pull ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag || github.sha }}
65+
run: docker pull ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag }}
9266

9367
- name: Install Python dependencies and generate protobuf stubs
9468
if: matrix.suite == 'python'
9569
run: uv sync --frozen && mise run --no-prepare python:proto
9670

9771
- name: Build Rust CLI
98-
if: matrix.suite == 'rust' || matrix.suite == 'gateway-resume'
72+
if: matrix.suite != 'python'
9973
run: cargo build -p openshell-cli --features openshell-core/dev-settings
10074

10175
- name: Install SSH client
102-
if: matrix.suite == 'rust' || matrix.suite == 'gateway-resume'
76+
if: matrix.suite != 'python'
10377
run: apt-get update && apt-get install -y --no-install-recommends openssh-client && rm -rf /var/lib/apt/lists/*
10478

10579
- name: Bootstrap cluster
@@ -109,56 +83,8 @@ jobs:
10983
CLUSTER_NAME: ${{ matrix.cluster }}
11084
SKIP_IMAGE_PUSH: "1"
11185
SKIP_CLUSTER_IMAGE_BUILD: "1"
112-
OPENSHELL_CLUSTER_IMAGE: ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag || github.sha }}
86+
OPENSHELL_CLUSTER_IMAGE: ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag }}
11387
run: mise run --no-prepare --skip-deps cluster
11488

11589
- name: Run tests
11690
run: ${{ matrix.cmd }}
117-
118-
e2e-gpu:
119-
name: "E2E (python-gpu)"
120-
if: inputs.suite == 'python-gpu'
121-
runs-on: ${{ inputs.runner }}
122-
timeout-minutes: 30
123-
container:
124-
image: ghcr.io/nvidia/openshell/ci:latest
125-
credentials:
126-
username: ${{ github.actor }}
127-
password: ${{ secrets.GITHUB_TOKEN }}
128-
options: --privileged
129-
volumes:
130-
- /var/run/docker.sock:/var/run/docker.sock
131-
env:
132-
MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
133-
IMAGE_TAG: ${{ inputs.image-tag || github.sha }}
134-
OPENSHELL_REGISTRY: ghcr.io/nvidia/openshell
135-
OPENSHELL_REGISTRY_HOST: ghcr.io
136-
OPENSHELL_REGISTRY_NAMESPACE: nvidia/openshell
137-
OPENSHELL_REGISTRY_USERNAME: ${{ github.actor }}
138-
OPENSHELL_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }}
139-
OPENSHELL_GATEWAY: e2e-gpu
140-
steps:
141-
- uses: actions/checkout@v4
142-
143-
- name: Log in to GHCR
144-
run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin
145-
146-
- name: Pull cluster image
147-
run: docker pull ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag || github.sha }}
148-
149-
- name: Install Python dependencies and generate protobuf stubs
150-
run: uv sync --frozen && mise run --no-prepare python:proto
151-
152-
- name: Bootstrap GPU cluster
153-
env:
154-
GATEWAY_HOST: host.docker.internal
155-
GATEWAY_PORT: "8083"
156-
CLUSTER_NAME: e2e-gpu
157-
CLUSTER_GPU: "1"
158-
SKIP_IMAGE_PUSH: "1"
159-
SKIP_CLUSTER_IMAGE_BUILD: "1"
160-
OPENSHELL_CLUSTER_IMAGE: ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag || github.sha }}
161-
run: mise run --no-prepare --skip-deps cluster
162-
163-
- name: Run tests
164-
run: mise run --no-prepare --skip-deps e2e:python:gpu

.github/workflows/test-gpu.yml

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
name: GPU Test
2+
3+
on:
4+
pull_request:
5+
types: [opened, synchronize, reopened, labeled]
6+
workflow_dispatch: {}
7+
# Add `schedule:` here when we want nightly coverage from the same workflow.
8+
9+
permissions:
10+
contents: read
11+
packages: write
12+
13+
jobs:
14+
build-gateway:
15+
if: github.event_name == 'workflow_dispatch' || contains(github.event.pull_request.labels.*.name, 'test:e2e-gpu')
16+
uses: ./.github/workflows/docker-build.yml
17+
with:
18+
component: gateway
19+
20+
build-cluster:
21+
if: github.event_name == 'workflow_dispatch' || contains(github.event.pull_request.labels.*.name, 'test:e2e-gpu')
22+
uses: ./.github/workflows/docker-build.yml
23+
with:
24+
component: cluster
25+
26+
e2e-gpu:
27+
if: github.event_name == 'workflow_dispatch' || contains(github.event.pull_request.labels.*.name, 'test:e2e-gpu')
28+
needs: [build-gateway, build-cluster]
29+
uses: ./.github/workflows/e2e-gpu-test.yaml
30+
with:
31+
image-tag: ${{ github.sha }}

0 commit comments

Comments
 (0)