KAI Scheduler - Pull Request #5769
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright 2025 NVIDIA CORPORATION
# SPDX-License-Identifier: Apache-2.0

# Pull-request / merge-queue pipeline: detects whether code changed, then
# validates, tests, builds and runs the e2e suites defined under `jobs:`.
name: KAI Scheduler - Pull Request

on:
  pull_request:
    types: [opened, reopened, synchronize]
  merge_group:
    types: [checks_requested]

# At most one active run per workflow and ref; a newer run cancels the older.
# - Concurrency groups are shared by every workflow in the repository, so the
#   key is namespaced with the workflow name.
# - `github.ref` is `refs/pull/<number>/merge` for pull_request events and the
#   merge-queue branch for merge_group events. Unlike `github.head_ref`, it is
#   unique per pull request, so PRs from different forks that happen to use
#   the same source branch name no longer cancel each other.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
# Job (entry of the workflow's `jobs:` map): gate for every other job.
# Exposes `code`, which the downstream jobs compare against 'true' to decide
# whether to run.
check-build-and-test-required:
  name: Check if build and test are required
  runs-on: ubuntu-latest
  outputs:
    code: ${{ steps.filter.outputs.code }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v6

    - name: Check changed files
      id: filter
      uses: dorny/paths-filter@v3
      with:
        # 'every': a changed file must satisfy ALL patterns of a filter, which
        # is what makes the negated patterns of `code` act as exclusions.
        predicate-quantifier: 'every'
        # NOTE(review): only `code` is exported as a job output; the `docs`
        # filter is not consumed anywhere in this workflow - confirm whether
        # it is still needed.
        filters: |
          docs:
            - '**/*.md'
            - 'docs/**'
          code:
            - '**'
            - '!**/*.md'
            - '!docs/**'
# Job (entry of the workflow's `jobs:` map): runs `make validate` and
# `make test`, then publishes the coverage profile and its total percentage
# as artifacts (`code-coverage`, `total-coverage`).
validate-and-test:
  name: Validate & Test
  needs: [check-build-and-test-required]
  if: needs.check-build-and-test-required.outputs.code == 'true'
  runs-on: ubuntu-latest
  steps:
    - name: Checkout code
      uses: actions/checkout@v6

    - name: Cache Go build cache (Docker-mounted)
      uses: actions/cache@v5
      with:
        path: |
          ~/.cache/go-build-docker-gocache
          ~/.cache/go-build-docker-gopath
        key: go-docker-${{ runner.os }}-${{ hashFiles('go.sum') }}
        restore-keys: |
          go-docker-${{ runner.os }}-

    - name: Set up Go
      uses: actions/setup-go@v6
      with:
        go-version: '1.26.3'
        cache: true

    - name: Run validation
      run: make validate

    - name: Run tests
      run: make test

    - name: Archive code coverage results
      uses: actions/upload-artifact@v7
      with:
        name: code-coverage
        path: coverage/coverage.out

    - name: Calculate total coverage
      run: |
        # The default `bash -e` shell does not enable pipefail; turn it on so
        # a failing `go tool cover` is not hidden by the rest of the pipeline.
        set -euo pipefail

        # No (or empty) profile: nothing to summarise, and the archive step
        # below is skipped because the summary file is never created.
        if [ ! -s coverage/coverage.out ]; then
          exit 0
        fi

        # Take the percentage from the summary line only. Its first field is
        # exactly "total:", whereas per-function lines start with
        # "<file>:<line>:", so names containing "total" cannot match.
        COVERAGE=$(go tool cover -func=coverage/coverage.out \
          | awk '$1 == "total:" { sub(/%$/, "", $NF); print $NF }')

        if [ -z "$COVERAGE" ]; then
          echo "::error::Could not extract total coverage from coverage/coverage.out"
          exit 1
        fi

        echo "$COVERAGE" > coverage/total-coverage.txt

    - name: Archive total coverage
      # hashFiles() returns an empty string when the file does not exist.
      if: hashFiles('coverage/total-coverage.txt') != ''
      uses: actions/upload-artifact@v7
      with:
        name: total-coverage
        path: coverage/total-coverage.txt
# Job (entry of the workflow's `jobs:` map): builds the coverage report text
# for pull requests targeting `main` and uploads it, together with the PR
# number, as artifacts (`coverage-report-for-comment`, `pr-number-for-comment`).
# The report action itself does not comment (`skip-comment: true`).
code-coverage-report:
  name: Code Coverage Report
  runs-on: ubuntu-latest
  needs: [validate-and-test, check-build-and-test-required]
  if: >-
    github.event_name != 'merge_group'
    && needs.check-build-and-test-required.outputs.code == 'true'
    && github.base_ref == 'main'
  steps:
    - uses: fgrosse/go-coverage-report@cbeb2ab2e32591d690337146ba02a911cc566f3f
      id: coverage_reporter
      with:
        coverage-artifact-name: "code-coverage"
        coverage-file-name: "coverage.out"
        root-package: "github.com/kai-scheduler/KAI-scheduler"
        github-baseline-workflow-ref: update-coverage-badge.yaml
        skip-comment: true

    - name: Download total coverage artifact
      uses: actions/download-artifact@v8
      with:
        name: total-coverage
        path: coverage-summary

    - name: Download coverage badge branch
      uses: actions/checkout@v6
      with:
        ref: coverage-badge
        path: coverage-badge

    - name: Calculate coverage totals
      id: coverage_totals
      run: |
        # Only the first line is used: a `name=value` entry in GITHUB_OUTPUT
        # must be a single line.
        PR_COVERAGE=$(head -n 1 coverage-summary/total-coverage.txt)
        echo "pr=$PR_COVERAGE" >> "$GITHUB_OUTPUT"

        # Baseline = first "<number>.<number>%" found in the badge SVG.
        # It is left empty when the badge contains no such value.
        BASELINE_COVERAGE=$(grep -oE '[0-9]+\.[0-9]+%' coverage-badge/badges/coverage.svg | head -1 | tr -d '%')
        echo "baseline=$BASELINE_COVERAGE" >> "$GITHUB_OUTPUT"

    - name: Save coverage report to file
      env:
        REPORT_BODY: ${{ steps.coverage_reporter.outputs.coverage_report }}
        BASELINE_COVERAGE: ${{ steps.coverage_totals.outputs.baseline }}
        PR_COVERAGE: ${{ steps.coverage_totals.outputs.pr }}
      run: |
        # An empty report still produces an (empty) file so that the upload
        # step below always has something to upload.
        if [ -z "$REPORT_BODY" ]; then
          : > coverage-report.txt
          exit 0
        fi
        {
          if [ -n "$BASELINE_COVERAGE" ]; then
            echo "**Total coverage:** ${BASELINE_COVERAGE}% -> ${PR_COVERAGE}%"
          else
            # Avoid rendering a dangling "% -> ..." when no baseline was found.
            echo "**Total coverage:** ${PR_COVERAGE}% (baseline unavailable)"
          fi
          echo ""
          echo "$REPORT_BODY"
        } > coverage-report.txt

    - name: Upload coverage report
      uses: actions/upload-artifact@v7
      with:
        name: coverage-report-for-comment
        path: coverage-report.txt

    - name: Save PR number
      # Passed through the environment instead of being expanded into the
      # script text, in line with the other steps of this job.
      env:
        PR_NUMBER: ${{ github.event.number }}
      run: echo "$PR_NUMBER" > pr_number.txt

    - name: Upload PR number
      uses: actions/upload-artifact@v7
      with:
        name: pr-number-for-comment
        path: pr_number.txt
# Job (entry of the workflow's `jobs:` map): builds the docker images and the
# helm chart, and stores both under /mnt/images, which is saved in the
# `images-<sha>` cache. Exposes `package_version` to dependent jobs.
build:
  name: Build
  needs: [check-build-and-test-required]
  if: needs.check-build-and-test-required.outputs.code == 'true'
  runs-on: ubuntu-latest
  outputs:
    package_version: ${{ steps.package_version.outputs.PACKAGE_VERSION }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v6

    - name: Extract package version
      id: package_version
      run: |
        # Version is 0.0.0-<short sha>, with leading zeros of the sha removed.
        GIT_REV=$(git rev-parse --short HEAD | sed 's/^0*//')
        PACKAGE_VERSION="0.0.0-$GIT_REV"
        # GITHUB_ENV makes it available to later steps of this job,
        # GITHUB_OUTPUT to the job-level `outputs` above.
        echo "PACKAGE_VERSION=$PACKAGE_VERSION" >> "$GITHUB_ENV"
        echo "PACKAGE_VERSION=$PACKAGE_VERSION" >> "$GITHUB_OUTPUT"
        echo "$PACKAGE_VERSION"

    - name: Cache Go build cache (Docker-mounted)
      uses: actions/cache@v5
      with:
        path: |
          ~/.cache/go-build-docker-gocache
          ~/.cache/go-build-docker-gopath
        key: go-docker-${{ runner.os }}-${{ hashFiles('go.sum') }}
        restore-keys: |
          go-docker-${{ runner.os }}-

    - name: Set up Go
      uses: actions/setup-go@v6
      with:
        go-version: '1.26.3'
        cache: true

    - name: Move Docker Data to /mnt
      # Points the Docker daemon's data-root at /mnt/docker-data
      # (presumably for disk space - confirm).
      run: |
        sudo systemctl stop docker
        sudo mkdir -p /mnt/docker-data
        echo '{"data-root": "/mnt/docker-data"}' | sudo tee /etc/docker/daemon.json
        sudo systemctl start docker
        docker info | grep "Docker Root Dir"

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v4

    - name: Create image cache directory
      run: |
        sudo mkdir -p /mnt/images
        sudo chown -R "$USER:$USER" /mnt/images

    - name: Cache for docker images and helm chart
      uses: actions/cache@v5
      with:
        path: /mnt/images
        key: images-${{ github.sha }}

    - name: Build docker images
      run: |
        # The default `bash -e` shell does not enable pipefail. Without it a
        # failing `docker save` would still leave a gzip file behind, and that
        # broken archive would be cached for the e2e jobs.
        set -euo pipefail

        make build DOCKER_BUILDX_ADDITIONAL_ARGS="--load --cache-from type=gha --cache-to type=gha,mode=max" VERSION="$PACKAGE_VERSION"

        # -F: match the version literally (its dots are not regex wildcards).
        # grep exits non-zero when no image carries the version, which fails
        # the step instead of saving nothing.
        IMAGES=$(docker images --format '{{.Repository}}:{{.Tag}}' | grep -F -- "$PACKAGE_VERSION")

        # $IMAGES is intentionally unquoted: one argument per image reference.
        # shellcheck disable=SC2086
        docker save $IMAGES | gzip > /mnt/images/docker_images.tgz

    - name: Build helm chart
      run: |
        helm package ./deployments/kai-scheduler -d ./charts --app-version "$PACKAGE_VERSION" --version "$PACKAGE_VERSION"
        cp "charts/kai-scheduler-$PACKAGE_VERSION.tgz" /mnt/images/
# Job (entry of the workflow's `jobs:` map): runs only when the change filter
# did NOT report code changes, and just logs why the other jobs were skipped.
skip-build-and-test-message:
  name: Skip Build and Test Message
  runs-on: ubuntu-latest
  needs: [check-build-and-test-required]
  if: needs.check-build-and-test-required.outputs.code != 'true'
  steps:
    - name: Skip message
      run: |
        echo "Skipping build and test since only documentation files (.md or docs/) were changed."
# Job (entry of the workflow's `jobs:` map): installs the freshly built chart
# into an e2e cluster, runs the e2e suites (everything except the autoscale,
# scale and upgrade labels) and finally checks that uninstalling cleans up.
e2e-tests:
  name: Run E2E Tests
  runs-on: ubuntu-latest
  needs: [build, check-build-and-test-required]
  if: needs.check-build-and-test-required.outputs.code == 'true'
  permissions:
    # Needed by `gh cache delete` below.
    actions: write
  steps:
    - name: Checkout code
      uses: actions/checkout@v6

    - name: Setup e2e cluster
      uses: ./.github/actions/setup-e2e-cluster
      with:
        package_version: ${{ needs.build.outputs.package_version }}

    # Best effort: a failure here does not fail the job.
    # NOTE(review): `e2e-upgrade-tests` runs in parallel and reads the chart
    # from /mnt/images as well; if its cluster setup restores this same cache
    # entry, deleting it here could race with that restore - confirm.
    - name: Delete restored cache
      continue-on-error: true
      env:
        GH_TOKEN: ${{ github.token }}
        CACHE_KEY: images-${{ github.sha }}
        REPOSITORY: ${{ github.repository }}
      run: gh cache delete "$CACHE_KEY" --repo "$REPOSITORY"

    - name: Install KAI-scheduler
      env:
        PACKAGE_VERSION: ${{ needs.build.outputs.package_version }}
      run: |
        helm upgrade -i kai-scheduler "/mnt/images/kai-scheduler-${PACKAGE_VERSION}.tgz" \
          -n kai-scheduler \
          --create-namespace \
          --set "global.gpuSharing=true" \
          --set "global.registry=localhost:30100" \
          --debug \
          --wait
        kubectl create clusterrole pods-patcher --verb=patch --resource=pods
        kubectl create rolebinding fake-status-updater --clusterrole=pods-patcher --serviceaccount=gpu-operator:status-updater -n kai-resource-reservation

    - name: Run e2e tests
      run: |
        ginkgo -r --keep-going --randomize-all --randomize-suites --trace -vv \
          --label-filter '!autoscale && !scale && !upgrade' \
          --output-dir=. --json-report=e2e-report.json \
          ./test/e2e/suites
        echo ""
        echo "=== Skipped Tests ==="
        # The fallback message is printed only when jq itself fails.
        jq -r '.[].SpecReports[] | select(.State == "skipped") | ([.ContainerHierarchyTexts[], .LeafNodeText] | join(" > "))' e2e-report.json 2>/dev/null || echo "No skipped tests found"

    - name: Uninstall KAI-scheduler
      run: |
        helm uninstall kai-scheduler -n kai-scheduler
        echo "Waiting up to 60 seconds for pods to terminate..."

        # Space-separated pod name fragments that are ignored by the check,
        # turned into an alternation for grep -E.
        ignored_pods="prometheus"
        ignore_regex=$(echo "$ignored_pods" | tr ' ' '|')
        max_attempts=12

        # 12 polls, 5 seconds apart.
        for ((attempt = 1; attempt <= max_attempts; attempt++)); do
          remaining=$(kubectl get pods -n kai-scheduler --no-headers 2>/dev/null | grep -v Terminating | grep -vE "$ignore_regex" | wc -l)
          if [ "$remaining" -ne 0 ]; then
            echo "Found $remaining non-terminating pods (excluding: $ignored_pods)... waiting (attempt $attempt/$max_attempts)"
            sleep 5
            continue
          fi
          echo "Only Terminating pods remain or no pods left (excluding: $ignored_pods). Safe to proceed."
          exit 0
        done

        echo "Pods did not terminate within 60 seconds. Uninstall incomplete."
        exit 1
# Job (entry of the workflow's `jobs:` map): installs a previously released
# chart from the OCI registry and runs the `upgrade` e2e suite against the
# chart built by this run (passed via UPGRADE_CHART_PATH).
e2e-upgrade-tests:
  name: Run E2E Upgrade Tests
  needs: [build, check-build-and-test-required]
  if: needs.check-build-and-test-required.outputs.code == 'true'
  runs-on: ubuntu-latest
  permissions:
    actions: write
  steps:
    - name: Checkout code
      uses: actions/checkout@v6

    # Outputs:
    #   skip         - 'true' when there is no release to upgrade from
    #   upgrade_from - release tag to install first (only when skip=false)
    - name: Resolve upgrade-from version
      id: resolve_version
      env:
        GH_TOKEN: ${{ github.token }}
        # Passed through the environment rather than expanded into the script
        # text, so a branch name can never be interpreted as shell code.
        # `github.base_ref` is only set for pull_request events; merge-queue
        # runs carry the target as `refs/heads/<branch>` in the event payload.
        TARGET_BRANCH: ${{ github.base_ref || github.event.merge_group.base_ref }}
      run: |
        # The default `bash -e` shell does not enable pipefail; with it, a
        # failing `gh` call fails the step instead of looking like
        # "no release found" and silently skipping the upgrade tests.
        set -euo pipefail

        # Prints one release tag per line.
        list_release_tags() {
          gh release list --limit 200 --json tagName -q '.[].tagName'
        }

        # Prints the highest tag (version sort) among $1 that matches the
        # extended regex $2, or nothing when none matches.
        latest_matching_tag() {
          printf '%s\n' "$1" | grep -E "$2" | sort -V | tail -1 || true
        }

        skip_upgrade_tests() {
          echo "$1 Skipping upgrade tests."
          echo "skip=true" >> "$GITHUB_OUTPUT"
          exit 0
        }

        if [[ "$TARGET_BRANCH" =~ v([0-9]+)\.([0-9]+) ]]; then
          # Version branch: upgrade from the latest release of the previous minor
          MAJOR="${BASH_REMATCH[1]}"
          # 10# forces base 10 so a zero-padded minor (e.g. "08") is not
          # rejected as an invalid octal number.
          MINOR=$((10#${BASH_REMATCH[2]}))
          if [ "$MINOR" -eq 0 ]; then
            skip_upgrade_tests "No previous minor version exists."
          fi
          PREV_MINOR="${MAJOR}.$((MINOR - 1))"
          TAGS=$(list_release_tags)
          UPGRADE_FROM=$(latest_matching_tag "$TAGS" "^v${PREV_MINOR}\.[0-9]+$")
          if [ -z "$UPGRADE_FROM" ]; then
            skip_upgrade_tests "No release found for v${PREV_MINOR}.x."
          fi
        else
          # Main branch: upgrade from the latest release
          TAGS=$(list_release_tags)
          UPGRADE_FROM=$(latest_matching_tag "$TAGS" '^v[0-9]+\.[0-9]+\.[0-9]+$')
          if [ -z "$UPGRADE_FROM" ]; then
            skip_upgrade_tests "No releases found."
          fi
        fi

        echo "Upgrading from $UPGRADE_FROM"
        echo "upgrade_from=$UPGRADE_FROM" >> "$GITHUB_OUTPUT"
        echo "skip=false" >> "$GITHUB_OUTPUT"

    - name: Setup e2e cluster
      if: steps.resolve_version.outputs.skip != 'true'
      uses: ./.github/actions/setup-e2e-cluster
      with:
        package_version: ${{ needs.build.outputs.package_version }}

    - name: Install previous version of KAI-scheduler
      if: steps.resolve_version.outputs.skip != 'true'
      env:
        UPGRADE_FROM_VERSION: ${{ steps.resolve_version.outputs.upgrade_from }}
      run: |
        echo "Installing kai-scheduler $UPGRADE_FROM_VERSION from OCI registry..."
        helm upgrade -i kai-scheduler oci://ghcr.io/kai-scheduler/kai-scheduler/kai-scheduler -n kai-scheduler --create-namespace \
          --set "global.gpuSharing=true" --wait --version "$UPGRADE_FROM_VERSION"
        kubectl create clusterrole pods-patcher --verb=patch --resource=pods
        kubectl create rolebinding fake-status-updater --clusterrole=pods-patcher --serviceaccount=gpu-operator:status-updater -n kai-resource-reservation

    - name: Run upgrade e2e tests
      if: steps.resolve_version.outputs.skip != 'true'
      env:
        UPGRADE_CHART_PATH: /mnt/images/kai-scheduler-${{ needs.build.outputs.package_version }}.tgz
      run: |
        ginkgo -r --keep-going --trace -vv --label-filter 'upgrade' --output-dir=. --json-report=e2e-upgrade-report.json ./test/e2e/suites/upgrade
# Job (entry of the workflow's `jobs:` map): FOSSA license analysis.
# `continue-on-error: true` keeps a failure of this job from failing the
# workflow run.
fossa-validate:
  name: FOSSA license check
  runs-on: ubuntu-latest
  continue-on-error: true
  needs: [build, check-build-and-test-required]
  if: needs.check-build-and-test-required.outputs.code == 'true'
  env:
    # Push-only token, intentionally committed and safe to be exposed; see
    # https://github.com/fossa-contrib/fossa-action?tab=readme-ov-file#push-only-api-token
    FOSSA_API_KEY: '577e3d21c48454822ae8ea496209a505'
  steps:
    - name: Checkout code
      uses: actions/checkout@v6

    - name: Run FOSSA analysis and validate status
      uses: fossa-contrib/fossa-action@v3.0.1
      with:
        fossa-api-key: ${{ env.FOSSA_API_KEY }}
        skip-test: false