KAI Scheduler - Pull Request #5769

Workflow file for this run

	# Copyright 2025 NVIDIA CORPORATION
	# SPDX-License-Identifier: Apache-2.0

	# CI pipeline that runs for pull requests and for merge-queue checks.
	name: KAI Scheduler - Pull Request
	on:
	  pull_request:
	    types: [opened, reopened, synchronize]
	  merge_group:
	    types: [checks_requested]

	# At most one active run per concurrency group; a newer run cancels the older one.
	concurrency:
	  # merge_group events key on github.ref; otherwise the PR head branch is used,
	  # falling back to the run id when head_ref is empty.
	  group: ${{ github.event_name == 'merge_group' && github.ref || github.head_ref || github.run_id }}
	  cancel-in-progress: true

	jobs:
	# Gate job: reports whether the change touches anything other than documentation.
	check-build-and-test-required:
	  name: Check if build and test are required
	  runs-on: ubuntu-latest
	  outputs:
	    # Result of the `code` filter below; downstream jobs compare it to 'true'.
	    code: ${{ steps.filter.outputs.code }}
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v6

	    - name: Check changed files
	      uses: dorny/paths-filter@v3
	      id: filter
	      with:
	        # "every": a file counts for a filter only when it satisfies all of
	        # that filter's patterns, so the negations in `code` act as exclusions.
	        predicate-quantifier: "every"
	        filters: |
	          docs:
	            - '**/*.md'
	            - 'docs/**'
	          code:
	            - '**'
	            - '!**/*.md'
	            - '!docs/**'

	# Runs `make validate` and `make test`, then publishes the coverage profile
	# and a one-line total-coverage summary as artifacts.
	validate-and-test:
	  needs: [ check-build-and-test-required ]
	  # Skipped when the gate job did not report code changes.
	  if: needs.check-build-and-test-required.outputs.code == 'true'
	  name: Validate & Test
	  runs-on: ubuntu-latest
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v6

	    - name: Cache Go build cache (Docker-mounted)
	      uses: actions/cache@v5
	      with:
	        path: |
	          ~/.cache/go-build-docker-gocache
	          ~/.cache/go-build-docker-gopath
	        key: go-docker-${{ runner.os }}-${{ hashFiles('go.sum') }}
	        # Prefix fallback used when no cache exists for the current go.sum hash.
	        restore-keys: |
	          go-docker-${{ runner.os }}-

	    - name: Set up Go
	      uses: actions/setup-go@v6
	      with:
	        go-version: '1.26.3'
	        cache: true

	    - name: Run validation
	      run: make validate

	    - name: Run tests
	      run: make test

	    - name: Archive code coverage results
	      uses: actions/upload-artifact@v7
	      with:
	        name: code-coverage
	        path: coverage/coverage.out

	    - name: Calculate total coverage
	      run: |
	        # Nothing to summarise when the profile is missing or empty.
	        if [ ! -s coverage/coverage.out ]; then
	          exit 0
	        fi

	        # Keep only the decimal number from the line(s) containing "total".
	        COVERAGE=$(go tool cover -func=coverage/coverage.out | grep total | grep -Eo '[0-9]+\.[0-9]+')
	        echo "$COVERAGE" > coverage/total-coverage.txt

	    - name: Archive total coverage
	      # Only upload when the previous step actually produced the summary file.
	      if: hashFiles('coverage/total-coverage.txt') != ''
	      uses: actions/upload-artifact@v7
	      with:
	        name: total-coverage
	        path: coverage/total-coverage.txt

	# Builds a coverage report for pull requests targeting main and uploads it,
	# together with the PR number, as artifacts.
	# NOTE(review): the "-for-comment" artifact names suggest another workflow posts
	# the comment; that consumer is not visible here - confirm.
	code-coverage-report:
	  name: Code Coverage Report
	  runs-on: ubuntu-latest
	  needs: [ validate-and-test, check-build-and-test-required ]
	  if: github.event_name != 'merge_group' && needs.check-build-and-test-required.outputs.code == 'true' && github.base_ref == 'main'
	  steps:
	    - uses: fgrosse/go-coverage-report@cbeb2ab2e32591d690337146ba02a911cc566f3f
	      id: coverage_reporter
	      with:
	        coverage-artifact-name: "code-coverage"
	        coverage-file-name: "coverage.out"
	        root-package: "github.com/kai-scheduler/KAI-scheduler"
	        github-baseline-workflow-ref: update-coverage-badge.yaml
	        # The report is not posted by this action; it is saved to a file below.
	        skip-comment: true

	    - name: Download total coverage artifact
	      uses: actions/download-artifact@v8
	      with:
	        name: total-coverage
	        path: coverage-summary

	    - name: Download coverage badge branch
	      uses: actions/checkout@v6
	      with:
	        ref: coverage-badge
	        path: coverage-badge

	    - name: Calculate coverage totals
	      id: coverage_totals
	      run: |
	        PR_COVERAGE=$(cat coverage-summary/total-coverage.txt)
	        echo "pr=$PR_COVERAGE" >> "$GITHUB_OUTPUT"

	        # First "NN.N%" occurrence in the badge SVG, with the percent sign removed.
	        BASELINE_COVERAGE=$(grep -oE '[0-9]+\.[0-9]+%' coverage-badge/badges/coverage.svg | head -1 | tr -d '%')
	        echo "baseline=$BASELINE_COVERAGE" >> "$GITHUB_OUTPUT"

	    - name: Save coverage report to file
	      env:
	        REPORT_BODY: ${{ steps.coverage_reporter.outputs.coverage_report }}
	        BASELINE_COVERAGE: ${{ steps.coverage_totals.outputs.baseline }}
	        PR_COVERAGE: ${{ steps.coverage_totals.outputs.pr }}
	      run: |
	        # An empty report still yields an (empty) file so the upload step has input.
	        if [ -z "$REPORT_BODY" ]; then
	          : > coverage-report.txt
	          exit 0
	        fi

	        {
	          echo "Total coverage: ${BASELINE_COVERAGE}% -> ${PR_COVERAGE}%"
	          echo ""
	          echo "$REPORT_BODY"
	        } > coverage-report.txt

	    - name: Upload coverage report
	      uses: actions/upload-artifact@v7
	      with:
	        name: coverage-report-for-comment
	        path: coverage-report.txt

	    - name: Save PR number
	      # The context value is handed over via env rather than expanded inside the script text.
	      env:
	        PR_NUMBER: ${{ github.event.number }}
	      run: echo "$PR_NUMBER" > pr_number.txt

	    - name: Upload PR number
	      uses: actions/upload-artifact@v7
	      with:
	        name: pr-number-for-comment
	        path: pr_number.txt

	# Builds the docker images and the helm chart for this commit and stores both
	# under /mnt/images, which is cached under the key images-<sha>.
	build:
	  needs: [ check-build-and-test-required ]
	  if: needs.check-build-and-test-required.outputs.code == 'true'
	  name: Build
	  runs-on: ubuntu-latest
	  outputs:
	    package_version: ${{ steps.package_version.outputs.PACKAGE_VERSION }}
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v6

	    - name: Extract package version
	      id: package_version
	      run: |
	        # Short commit hash with leading zeros stripped, appended to 0.0.0-.
	        GIT_REV=$(git rev-parse --short HEAD | sed 's/^0*//')
	        PACKAGE_VERSION=0.0.0-$GIT_REV
	        # Exported both to later steps of this job (env) and to dependent jobs (output).
	        echo "PACKAGE_VERSION=$PACKAGE_VERSION" >> "$GITHUB_ENV"
	        echo "PACKAGE_VERSION=$PACKAGE_VERSION" >> "$GITHUB_OUTPUT"
	        echo "$PACKAGE_VERSION"

	    - name: Cache Go build cache (Docker-mounted)
	      uses: actions/cache@v5
	      with:
	        path: |
	          ~/.cache/go-build-docker-gocache
	          ~/.cache/go-build-docker-gopath
	        key: go-docker-${{ runner.os }}-${{ hashFiles('go.sum') }}
	        restore-keys: |
	          go-docker-${{ runner.os }}-

	    - name: Set up Go
	      uses: actions/setup-go@v6
	      with:
	        go-version: '1.26.3'
	        cache: true

	    - name: Move Docker Data to /mnt
	      # Restarts the daemon with its data root at /mnt/docker-data.
	      # NOTE(review): presumably done for disk space on the runner - confirm.
	      run: |
	        sudo systemctl stop docker
	        sudo mkdir -p /mnt/docker-data
	        echo '{"data-root": "/mnt/docker-data"}' | sudo tee /etc/docker/daemon.json
	        sudo systemctl start docker
	        docker info | grep "Docker Root Dir"

	    - name: Set up Docker Buildx
	      uses: docker/setup-buildx-action@v4

	    - name: Create image cache directory
	      run: |
	        sudo mkdir -p /mnt/images
	        sudo chown -R "$USER:$USER" /mnt/images

	    - name: Cache for docker images and helm chart
	      uses: actions/cache@v5
	      with:
	        path: /mnt/images
	        key: images-${{ github.sha }}

	    - name: Build docker images
	      run: |
	        make build DOCKER_BUILDX_ADDITIONAL_ARGS="--load --cache-from type=gha --cache-to type=gha,mode=max" VERSION="$PACKAGE_VERSION"
	        # The command substitution is left unquoted on purpose: each image
	        # reference must become its own argument to `docker save`.
	        # grep -F matches the version literally (its dots are not wildcards).
	        docker save $(docker images --format '{{.Repository}}:{{.Tag}}' | grep -F -- "$PACKAGE_VERSION") | gzip > /mnt/images/docker_images.tgz

	    - name: Build helm chart
	      run: |
	        helm package ./deployments/kai-scheduler -d ./charts --app-version "$PACKAGE_VERSION" --version "$PACKAGE_VERSION"
	        cp "charts/kai-scheduler-$PACKAGE_VERSION.tgz" /mnt/images/

	# Runs only when the gate job did not report code changes; prints why the
	# build and test jobs were skipped.
	skip-build-and-test-message:
	  name: Skip Build and Test Message
	  needs: [ check-build-and-test-required ]
	  if: needs.check-build-and-test-required.outputs.code != 'true'
	  runs-on: ubuntu-latest
	  steps:
	    - name: Skip message
	      run: |
	        echo "Skipping build and test since only documentation files (.md or docs/) were changed."

	# Installs the freshly built chart into an e2e cluster, runs the ginkgo e2e
	# suites (excluding autoscale, scale and upgrade labels) and uninstalls again.
	e2e-tests:
	  name: Run E2E Tests
	  needs: [ build, check-build-and-test-required ]
	  if: needs.check-build-and-test-required.outputs.code == 'true'
	  runs-on: ubuntu-latest
	  permissions:
	    # Used by the `gh cache delete` call below.
	    actions: write
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v6

	    - name: Setup e2e cluster
	      uses: ./.github/actions/setup-e2e-cluster
	      with:
	        package_version: ${{ needs.build.outputs.package_version }}

	    - name: Delete restored cache
	      # Best effort: a failed deletion must not fail the job.
	      # NOTE(review): e2e-upgrade-tests depends on the same build job and uses the
	      # same setup action; confirm it cannot race with this deletion.
	      continue-on-error: true
	      env:
	        GH_TOKEN: ${{ github.token }}
	      run: |
	        gh cache delete "images-${{ github.sha }}" --repo ${{ github.repository }}

	    - name: Install KAI-scheduler
	      env:
	        PACKAGE_VERSION: ${{ needs.build.outputs.package_version }}
	      run: |
	        helm upgrade -i kai-scheduler "/mnt/images/kai-scheduler-$PACKAGE_VERSION.tgz" -n kai-scheduler --create-namespace \
	          --set "global.gpuSharing=true" --set "global.registry=localhost:30100" --debug --wait
	        kubectl create clusterrole pods-patcher --verb=patch --resource=pods
	        kubectl create rolebinding fake-status-updater --clusterrole=pods-patcher --serviceaccount=gpu-operator:status-updater -n kai-resource-reservation

	    - name: Run e2e tests
	      run: |
	        ginkgo -r --keep-going --randomize-all --randomize-suites --trace -vv --label-filter '!autoscale && !scale && !upgrade' --output-dir=. --json-report=e2e-report.json ./test/e2e/suites
	        echo ""
	        echo "=== Skipped Tests ==="
	        # Print "container > ... > leaf" for every spec the JSON report marks as skipped.
	        jq -r '.[].SpecReports[] | select(.State == "skipped") | ([.ContainerHierarchyTexts[], .LeafNodeText] | join(" > "))' e2e-report.json 2>/dev/null || echo "No skipped tests found"

	    - name: Uninstall KAI-scheduler
	      run: |
	        helm uninstall kai-scheduler -n kai-scheduler
	        echo "Waiting up to 60 seconds for pods to terminate..."
	        EXCLUDED_PODS="prometheus"
	        # Space-separated names become a grep -E alternation; computed once, before the loop.
	        EXCLUDE_PATTERN=$(echo "$EXCLUDED_PODS" | tr ' ' '|')
	        # 12 attempts with 5 seconds between them.
	        for i in {1..12}; do
	          # Count pods that are neither Terminating nor on the exclusion list.
	          NON_TERM=$(kubectl get pods -n kai-scheduler --no-headers 2>/dev/null | grep -v Terminating | grep -vE "$EXCLUDE_PATTERN" | wc -l)
	          if [ "$NON_TERM" -eq 0 ]; then
	            echo "Only Terminating pods remain or no pods left (excluding: $EXCLUDED_PODS). Safe to proceed."
	            exit 0
	          fi
	          echo "Found $NON_TERM non-terminating pods (excluding: $EXCLUDED_PODS)... waiting (attempt $i/12)"
	          sleep 5
	        done
	        echo "Pods did not terminate within 60 seconds. Uninstall incomplete."
	        exit 1

	# Installs a previously released chart from the OCI registry, then runs the
	# upgrade e2e suite with UPGRADE_CHART_PATH pointing at the chart built for this commit.
	e2e-upgrade-tests:
	  name: Run E2E Upgrade Tests
	  needs: [ build, check-build-and-test-required ]
	  if: needs.check-build-and-test-required.outputs.code == 'true'
	  runs-on: ubuntu-latest
	  permissions:
	    actions: write
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v6

	    - name: Resolve upgrade-from version
	      id: resolve_version
	      env:
	        GH_TOKEN: ${{ github.token }}
	        # Passed through env so the branch name is never expanded inside the script text.
	        TARGET_BRANCH: ${{ github.base_ref }}
	      run: |
	        if [[ "$TARGET_BRANCH" =~ v([0-9]+)\.([0-9]+) ]]; then
	          # Version branch: upgrade from the latest release of the previous minor
	          MAJOR="${BASH_REMATCH[1]}"
	          MINOR="${BASH_REMATCH[2]}"
	          if [ "$MINOR" -eq 0 ]; then
	            echo "No previous minor version exists. Skipping upgrade tests."
	            echo "skip=true" >> "$GITHUB_OUTPUT"
	            exit 0
	          fi
	          PREV_MINOR="${MAJOR}.$((MINOR - 1))"
	          # Highest vMAJOR.(MINOR-1).x tag by version sort; the dots are escaped so they match literally.
	          UPGRADE_FROM=$(gh release list --limit 200 --json tagName -q '.[].tagName' | grep -E "^v${MAJOR}\.$((MINOR - 1))\.[0-9]+$" | sort -V | tail -1)
	          if [ -z "$UPGRADE_FROM" ]; then
	            echo "No release found for v${PREV_MINOR}.x. Skipping upgrade tests."
	            echo "skip=true" >> "$GITHUB_OUTPUT"
	            exit 0
	          fi
	        else
	          # Main branch: upgrade from the latest release
	          UPGRADE_FROM=$(gh release list --limit 200 --json tagName -q '.[].tagName' | grep -E '^v[0-9]+\.[0-9]+\.[0-9]+$' | sort -V | tail -1)
	          if [ -z "$UPGRADE_FROM" ]; then
	            echo "No releases found. Skipping upgrade tests."
	            echo "skip=true" >> "$GITHUB_OUTPUT"
	            exit 0
	          fi
	        fi

	        echo "Upgrading from $UPGRADE_FROM"
	        echo "upgrade_from=$UPGRADE_FROM" >> "$GITHUB_OUTPUT"
	        echo "skip=false" >> "$GITHUB_OUTPUT"

	    # Every remaining step is skipped when no upgrade-from version was resolved.
	    - name: Setup e2e cluster
	      if: steps.resolve_version.outputs.skip != 'true'
	      uses: ./.github/actions/setup-e2e-cluster
	      with:
	        package_version: ${{ needs.build.outputs.package_version }}

	    - name: Install previous version of KAI-scheduler
	      if: steps.resolve_version.outputs.skip != 'true'
	      env:
	        UPGRADE_FROM_VERSION: ${{ steps.resolve_version.outputs.upgrade_from }}
	      run: |
	        echo "Installing kai-scheduler $UPGRADE_FROM_VERSION from OCI registry..."
	        helm upgrade -i kai-scheduler oci://ghcr.io/kai-scheduler/kai-scheduler/kai-scheduler -n kai-scheduler --create-namespace \
	          --set "global.gpuSharing=true" --wait --version "$UPGRADE_FROM_VERSION"
	        kubectl create clusterrole pods-patcher --verb=patch --resource=pods
	        kubectl create rolebinding fake-status-updater --clusterrole=pods-patcher --serviceaccount=gpu-operator:status-updater -n kai-resource-reservation

	    - name: Run upgrade e2e tests
	      if: steps.resolve_version.outputs.skip != 'true'
	      env:
	        # Chart produced by the build job for this commit.
	        UPGRADE_CHART_PATH: /mnt/images/kai-scheduler-${{ needs.build.outputs.package_version }}.tgz
	      run: |
	        ginkgo -r --keep-going --trace -vv --label-filter 'upgrade' --output-dir=. --json-report=e2e-upgrade-report.json ./test/e2e/suites/upgrade

	# FOSSA license analysis. The job is marked continue-on-error, so a failing
	# scan does not fail the workflow run.
	fossa-validate:
	  name: FOSSA license check
	  runs-on: ubuntu-latest
	  needs:
	    - build
	    - check-build-and-test-required
	  if: needs.check-build-and-test-required.outputs.code == 'true'
	  continue-on-error: true
	  env:
	    # Push-only API token, committed on purpose and safe to expose; see
	    # https://github.com/fossa-contrib/fossa-action?tab=readme-ov-file#push-only-api-token
	    FOSSA_API_KEY: '577e3d21c48454822ae8ea496209a505'
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v6

	    - name: Run FOSSA analysis and validate status
	      uses: fossa-contrib/fossa-action@v3.0.1
	      with:
	        skip-test: false
	        fossa-api-key: ${{ env.FOSSA_API_KEY }}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

KAI Scheduler - Pull Request #5769

Workflow file

KAI Scheduler - Pull Request #5769

Uh oh!

Workflow file for this run