# Nightly CI Pipeline — run #158
# Workflow file for this run:

# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
name: Nightly CI Pipeline

on:
  schedule:
    # Fires daily at 08:00 UTC, which is 12:00 AM PST.
    - cron: '0 8 * * *'
  # Manual trigger, kept so the pipeline can be exercised for testing.
  workflow_dispatch:
    inputs:
      release:
        type: boolean
        required: false
        default: true
        description: 'Stage NGC images, Artifactory wheels, and trigger the GitLab release pipeline. Schedule always sets release=true.'
      run_tests:
        type: boolean
        required: false
        default: true
        description: 'Run vllm/sglang/trtllm tests + dynamo-pipeline checks. Schedule always runs them.'
      skip_gitlab_pipeline:
        type: boolean
        required: false
        default: false
        description: 'Skip the GitLab release automation pipeline trigger. Emergency use only.'

# Workflow-wide default token scope: read-only access to repository contents.
permissions:
  contents: read

env:
  # Builder name derived from the run id and the run attempt number.
  BUILDER_NAME: b-${{ github.run_id }}-${{ github.run_attempt }}
jobs:
# ============================================================================
# PRE-WARM K8S BUILDER
# ============================================================================
# Publishes the run-scoped builder name as a job output, then bootstraps the
# builder through the repository-local bootstrap-buildkit action.
create-fresh-builder:
  name: Create fresh K8s builder
  runs-on: prod-default-small-v2
  permissions:
    contents: read
  outputs:
    builder_name: ${{ steps.export-builder-name.outputs.builder_name }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
    - name: Export builder name
      id: export-builder-name
      run: |
        # Quote the output file path, as the other jobs in this workflow do.
        echo "builder_name=${{ env.BUILDER_NAME }}" >> "$GITHUB_OUTPUT"
    - name: Create and bootstrap fresh K8s builder
      uses: ./.github/actions/bootstrap-buildkit
      with:
        builder_name: ${{ steps.export-builder-name.outputs.builder_name }}
        buildkit_worker_addresses: ''
        suppress_fallback_warning: 'true'
# ============================================================================
# RESOLVE SOURCE SHA
# ============================================================================
# Pins the one commit that shared-build-image, dynamo-pipeline, shared-test
# and release.yml all build, tag, and release against. For schedule runs and
# for workflow_dispatch from main that commit is simply github.sha.
resolve-source-sha:
  name: Resolve source SHA
  runs-on: prod-default-v2
  permissions:
    contents: read
  outputs:
    source_sha: ${{ steps.resolve.outputs.source_sha }}
  steps:
    - id: resolve
      shell: bash
      run: |
        set -euo pipefail
        # Log the SHA, then expose it unchanged as the `source_sha` output.
        echo "Using caller SHA: ${GITHUB_SHA}"
        echo "source_sha=${GITHUB_SHA}" >> "$GITHUB_OUTPUT"
# ============================================================================
# COMPUTE NIGHTLY DEV VERSION
# ============================================================================
# Emits a PEP 440 dev suffix (e.g. .dev20260423) forwarded to every
# pipeline below. At the leaf, the suffix is stamped into pyproject / Cargo
# versions on the runner before docker build, so wheels produced by the
# wheel_builder stage carry the dev version.
compute-dev-version:
  name: Compute dev version suffix
  runs-on: prod-default-v2
  permissions:
    contents: read
  outputs:
    dev_suffix: ${{ steps.compute.outputs.dev_suffix }}
  steps:
    - id: compute
      shell: bash
      run: |
        # Fail fast (same options as the resolve-source-sha step) so a broken
        # `date` call cannot publish a malformed suffix.
        set -euo pipefail
        # The date is taken in UTC so the suffix does not depend on the
        # runner's timezone.
        DATE=$(date -u +%Y%m%d)
        echo "dev_suffix=.dev${DATE}" >> "$GITHUB_OUTPUT"
# ============================================================================
# COMPUTE RELEASE MODE
# ============================================================================
# Decides two flags from the triggering event:
#   schedule          -> release=true,  run_tests=true
#   workflow_dispatch -> each flag is false only when its input is "false"
#   anything else     -> release=false, run_tests=true
# `release` gates the workflow_call into release.yml (see the `release` job);
# `run_tests` gates the test jobs.
compute-release-mode:
  name: Compute release mode
  runs-on: prod-default-v2
  permissions:
    contents: read
  outputs:
    release: ${{ steps.compute.outputs.release }}
    run_tests: ${{ steps.compute.outputs.run_tests }}
  steps:
    - id: compute
      shell: bash
      env:
        DISPATCH_RELEASE: ${{ inputs.release }}
        DISPATCH_RUN_TESTS: ${{ inputs.run_tests }}
      run: |
        # Start from the schedule values and only override where needed.
        release=true
        run_tests=true
        case "${GITHUB_EVENT_NAME}" in
          schedule)
            # cron must keep tests on so a failing nightly blocks the release.
            ;;
          workflow_dispatch)
            if [ "${DISPATCH_RELEASE}" = "false" ]; then
              release=false
            fi
            if [ "${DISPATCH_RUN_TESTS}" = "false" ]; then
              run_tests=false
            fi
            ;;
          *)
            release=false
            ;;
        esac
        {
          echo "release=${release}"
          echo "run_tests=${run_tests}"
        } >> "$GITHUB_OUTPUT"
# ============================================================================
# BUILD JOBS
# ============================================================================
# Calls shared-build-image.yml for the vllm runtime target on the builder
# created by create-fresh-builder, at the SHA from resolve-source-sha.
vllm-build:
  # Shares its display name with the other vllm jobs so the UI groups them.
  name: vllm-runtime
  needs: [create-fresh-builder, resolve-source-sha]
  uses: ./.github/workflows/shared-build-image.yml
  secrets: inherit
  with:
    framework: vllm
    target: runtime
    source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
    builder_name: ${{ needs.create-fresh-builder.outputs.builder_name }}
    cuda_version: '["12.9", "13.0"]'
    platform: 'linux/amd64,linux/arm64'
    build_timeout_minutes: 120
# Calls shared-build-image.yml for the sglang runtime target on the builder
# created by create-fresh-builder, at the SHA from resolve-source-sha.
sglang-build:
  # Shares its display name with the other sglang jobs so the UI groups them.
  name: sglang-runtime
  needs: [create-fresh-builder, resolve-source-sha]
  uses: ./.github/workflows/shared-build-image.yml
  secrets: inherit
  with:
    framework: sglang
    target: runtime
    source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
    builder_name: ${{ needs.create-fresh-builder.outputs.builder_name }}
    cuda_version: '["12.9", "13.0"]'
    platform: 'linux/amd64,linux/arm64'
    build_timeout_minutes: 120
# Calls shared-build-image.yml for the trtllm runtime target on the builder
# created by create-fresh-builder, at the SHA from resolve-source-sha.
trtllm-build:
  # Shares its display name with the other trtllm jobs so the UI groups them.
  name: trtllm-runtime
  needs: [create-fresh-builder, resolve-source-sha]
  uses: ./.github/workflows/shared-build-image.yml
  secrets: inherit
  with:
    framework: trtllm
    target: runtime
    source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
    builder_name: ${{ needs.create-fresh-builder.outputs.builder_name }}
    cuda_version: '["13.1"]'
    platform: 'linux/amd64,linux/arm64'
    build_timeout_minutes: 120
# ============================================================================
# TEST JOBS
# ============================================================================
# CPU-only and single-GPU vLLM tests via shared-test.yml, using the tag
# produced by `vllm-build`. Runs only when compute-release-mode reports
# run_tests == 'true'.
vllm-test:
  # Shares its display name with the other vllm jobs so the UI groups them.
  name: vllm-runtime
  needs: [vllm-build, resolve-source-sha, compute-release-mode]
  if: ${{ needs.compute-release-mode.outputs.run_tests == 'true' }}
  uses: ./.github/workflows/shared-test.yml
  secrets: inherit
  with:
    test_suite_name: vllm
    test_type: Test
    source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
    target_tag_plain: ${{ needs.vllm-build.outputs.target_tag_plain }}
    cuda_version: '["12.9", "13.0"]'
    # arm64 for CPU tests, single GPU tests are skipped
    platform: '["amd64", "arm64"]'
    # This runner is overridden for ARM platform
    amd_runner: prod-tester-amd-gpu-v2
    enable_coverage: true
    run_cpu_only_tests: true
    cpu_only_test_markers: vllm and gpu_0
    gpu_test_markers: vllm and gpu_1
    gpu_test_timeout_minutes: 240
    # Profiled tests go to the parallel stage; unprofiled ones fall through to
    # the sequential stage. A 24 GiB cap admits every currently profiled vLLM
    # test (largest is ~20.4 GiB) on a 48 GiB GPU.
    # NOTE(review): the sglang/trtllm test jobs describe the same runner label
    # as 24 GiB — confirm which figure is current.
    run_gpu_parallel_tests: true
    gpu_parallel_max_vram_gib: '24'
# Multi-GPU (gpu_2 / gpu_4 marker) vLLM tests via shared-test.yml, using the
# tag produced by `vllm-build`. Runs only when run_tests == 'true'.
vllm-multi-gpu-test:
  # Shares its display name with the other vllm jobs so the UI groups them.
  name: vllm-runtime
  needs: [vllm-build, resolve-source-sha, compute-release-mode]
  if: ${{ needs.compute-release-mode.outputs.run_tests == 'true' }}
  uses: ./.github/workflows/shared-test.yml
  secrets: inherit
  with:
    test_suite_name: vllm
    test_type: Multi-GPU Test
    source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
    target_tag_plain: ${{ needs.vllm-build.outputs.target_tag_plain }}
    cuda_version: '["12.9", "13.0"]'
    # No ARM GPUs available
    platform: '["amd64"]'
    amd_runner: prod-tester-amd-gpu-4-v2
    enable_coverage: true
    run_sanity_check: false
    gpu_test_markers: vllm and (gpu_2 or gpu_4)
    gpu_test_timeout_minutes: 45
# CPU-only and single-GPU SGLang tests via shared-test.yml, using the tag
# produced by `sglang-build`. Runs only when run_tests == 'true'.
sglang-test:
  # Shares its display name with the other sglang jobs so the UI groups them.
  name: sglang-runtime
  needs: [sglang-build, resolve-source-sha, compute-release-mode]
  if: ${{ needs.compute-release-mode.outputs.run_tests == 'true' }}
  uses: ./.github/workflows/shared-test.yml
  secrets: inherit
  with:
    test_suite_name: sglang
    test_type: Test
    source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
    target_tag_plain: ${{ needs.sglang-build.outputs.target_tag_plain }}
    cuda_version: '["12.9", "13.0"]'
    # arm64 for CPU tests, single GPU tests are skipped
    platform: '["amd64", "arm64"]'
    # This runner is overridden for ARM platform
    amd_runner: prod-tester-amd-gpu-v2
    enable_coverage: true
    run_cpu_only_tests: true
    cpu_only_test_markers: sglang and gpu_0
    gpu_test_markers: sglang and gpu_1
    gpu_test_timeout_minutes: 240
    # Profiled tests go to the VRAM-aware GPU stage; unprofiled ones fall
    # through to the sequential stage. Single-GPU runners are currently
    # 24 GiB, so this cap admits the profiled SGLang pool and yields one auto
    # slot today; larger runners get more slots from the same markers.
    run_gpu_parallel_tests: true
    gpu_parallel_max_vram_gib: '24'
# Multi-GPU (gpu_2 / gpu_4 marker) SGLang tests via shared-test.yml, using the
# tag produced by `sglang-build`. Runs only when run_tests == 'true'.
sglang-multi-gpu-test:
  # Shares its display name with the other sglang jobs so the UI groups them.
  name: sglang-runtime
  needs: [sglang-build, resolve-source-sha, compute-release-mode]
  if: ${{ needs.compute-release-mode.outputs.run_tests == 'true' }}
  uses: ./.github/workflows/shared-test.yml
  secrets: inherit
  with:
    test_suite_name: sglang
    test_type: Multi-GPU Test
    source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
    target_tag_plain: ${{ needs.sglang-build.outputs.target_tag_plain }}
    cuda_version: '["12.9", "13.0"]'
    # No ARM GPUs available
    platform: '["amd64"]'
    amd_runner: prod-tester-amd-gpu-4-v2
    enable_coverage: true
    run_sanity_check: false
    gpu_test_markers: sglang and (gpu_2 or gpu_4)
    gpu_test_timeout_minutes: 45
# CPU-only and single-GPU TRT-LLM tests via shared-test.yml, using the tag
# produced by `trtllm-build`. Runs only when run_tests == 'true'.
trtllm-test:
  # Shares its display name with the other trtllm jobs so the UI groups them.
  name: trtllm-runtime
  needs: [trtllm-build, resolve-source-sha, compute-release-mode]
  if: ${{ needs.compute-release-mode.outputs.run_tests == 'true' }}
  uses: ./.github/workflows/shared-test.yml
  secrets: inherit
  with:
    test_suite_name: trtllm
    test_type: Test
    source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
    target_tag_plain: ${{ needs.trtllm-build.outputs.target_tag_plain }}
    cuda_version: '["13.1"]'
    # arm64 for CPU tests, single GPU tests are skipped
    platform: '["amd64", "arm64"]'
    # This runner is overridden for ARM platform
    amd_runner: prod-tester-amd-gpu-v2
    enable_coverage: true
    run_cpu_only_tests: true
    cpu_only_test_markers: trtllm and gpu_0
    gpu_test_markers: trtllm and gpu_1
    gpu_test_timeout_minutes: 240
    # Profiled tests go to the VRAM-aware GPU stage; unprofiled ones fall
    # through to the sequential stage. Single-GPU runners are currently
    # 24 GiB, so this cap admits the profiled TRT-LLM pool and yields one auto
    # slot today; larger runners get more slots from the same markers.
    run_gpu_parallel_tests: true
    gpu_parallel_max_vram_gib: '24'
# Multi-GPU (gpu_2 / gpu_4 marker) TRT-LLM tests via shared-test.yml, using
# the tag produced by `trtllm-build`. Runs only when run_tests == 'true'.
trtllm-multi-gpu-test:
  # Shares its display name with the other trtllm jobs so the UI groups them.
  name: trtllm-runtime
  needs: [trtllm-build, resolve-source-sha, compute-release-mode]
  if: ${{ needs.compute-release-mode.outputs.run_tests == 'true' }}
  uses: ./.github/workflows/shared-test.yml
  secrets: inherit
  with:
    test_suite_name: trtllm
    test_type: Multi-GPU Test
    source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
    target_tag_plain: ${{ needs.trtllm-build.outputs.target_tag_plain }}
    cuda_version: '["13.1"]'
    # No ARM GPUs available
    platform: '["amd64"]'
    amd_runner: prod-tester-amd-gpu-4-v2
    enable_coverage: true
    run_sanity_check: false
    gpu_test_markers: trtllm and (gpu_2 or gpu_4)
    gpu_test_timeout_minutes: 45
# ============================================================================
# DYNAMO RUNTIME PIPELINE
# ============================================================================
# Calls dynamo-pipeline.yml on the fresh builder with caching disabled,
# forwarding the nightly dev version suffix and the resolved source SHA.
dynamo-pipeline:
  name: dynamo-runtime
  needs: [create-fresh-builder, compute-dev-version, resolve-source-sha]
  uses: ./.github/workflows/dynamo-pipeline.yml
  secrets: inherit
  with:
    source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
    dev_version_suffix: ${{ needs.compute-dev-version.outputs.dev_suffix }}
    builder_name: ${{ needs.create-fresh-builder.outputs.builder_name }}
    fresh_builder: true
    no_cache: true
    build_timeout_minutes: 120
    # TODO: widen the marker expressions beyond `pre_merge`. Today that would
    # pick up tests (e.g. fault_tolerance/deploy/*) that fail in this
    # container-only context. The current selection matches the coverage of
    # the old container-validation-dynamo workflow.
    cpu_parallel_test_markers: 'pre_merge and parallel and not (vllm or sglang or trtllm) and (gpu_0)'
    cpu_sequential_test_markers: 'pre_merge and not parallel and not (vllm or sglang or trtllm) and (gpu_0)'
    gpu_test_markers: 'pre_merge and none and gpu_1'
# ============================================================================
# RUST COVERAGE
# ============================================================================
# Runs `cargo llvm-cov` once per matrix directory and uploads the text and
# LCOV reports as one artifact per directory. Runs only when
# compute-release-mode reports run_tests == 'true'.
rust-tests:
  name: rust-${{ matrix.dir == '.' && 'root' || matrix.dir }}-coverage
  needs: [compute-release-mode]
  if: ${{ needs.compute-release-mode.outputs.run_tests == 'true' }}
  runs-on:
    group: Fastchecker
  strategy:
    fail-fast: false
    matrix:
      dir: ['.', 'lib/bindings/python', 'lib/bindings/kvbm']
  permissions:
    contents: read
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
      with:
        lfs: true
    # Computed up front so the `always()` upload step at the end still has a
    # unique, non-empty artifact name when compilation or the tests fail.
    - name: Compute coverage artifact name
      run: |
        # '.' becomes 'root'; '/' becomes '-' so the name is a single token.
        SAFE_DIR=$(echo "${{ matrix.dir }}" | sed 's|^\.$|root|' | tr '/' '-')
        echo "RUST_COV_ARTIFACT_NAME=coverage-rust-${SAFE_DIR}-${{ github.run_id }}" >> "$GITHUB_ENV"
    - name: Set up system dependencies
      run: |
        # Install protoc for Rust build dependencies (NOTE: much faster than apt install)
        PB_REL="https://github.com/protocolbuffers/protobuf/releases"
        PROTOC_VER="30.2"
        PROTOC_ZIP="protoc-${PROTOC_VER}-linux-x86_64.zip"
        PROTOC_SHA256="327e9397c6fb3ea2a542513a3221334c6f76f7aa524a7d2561142b67b312a01f"
        curl --retry 5 --retry-delay 2 -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}"
        # Verify the download against the pinned digest before unpacking.
        echo "${PROTOC_SHA256}  ${PROTOC_ZIP}" | sha256sum -c -
        # -o: overwrite without prompting, in case a previous job on the same
        # runner already unpacked protoc here.
        unzip -o "${PROTOC_ZIP}" -d "$HOME/.local"
        rm "${PROTOC_ZIP}"
        # `export` only affects this step; GITHUB_PATH makes protoc visible to
        # the cargo steps that follow.
        export PATH="$PATH:$HOME/.local/bin"
        echo "$HOME/.local/bin" >> "$GITHUB_PATH"
        protoc --version
    - name: Cache cargo artifacts
      uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830  # v4.3.0
      with:
        path: |
          ~/.cargo/bin/
          ~/.cargo/registry
        key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
        restore-keys: ${{ runner.os }}-cargo-
    - name: Set up Rust Toolchain Components
      run: rustup component add llvm-tools-preview
    - name: Install cargo-llvm-cov
      # Skip the install when a working binary is already present (e.g.
      # restored from the ~/.cargo/bin cache above).
      run: cargo-llvm-cov --version 2>/dev/null || cargo install cargo-llvm-cov --locked
    # Have an explicit step to build tests first to separate time spent on build vs execution.
    - name: Compile Tests
      working-directory: ${{ matrix.dir }}
      run: cargo test --locked --no-run
    - name: Run Unit Tests with Coverage
      working-directory: ${{ matrix.dir }}
      # NOTE: --all-targets doesn't run doc tests.
      # cargo llvm-cov is a drop-in for cargo test; --no-report defers output
      # so we can generate multiple formats without re-running the tests.
      run: |
        cargo llvm-cov --locked --all-targets --no-report
        cargo llvm-cov report --output-path coverage-rust.txt
        cargo llvm-cov report --lcov --output-path coverage-rust.lcov
        echo "Coverage summary:"
        grep "^TOTAL" coverage-rust.txt || tail -3 coverage-rust.txt
    - name: Upload Rust Coverage Data
      uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4.6.2
      if: always()
      with:
        name: ${{ env.RUST_COV_ARTIFACT_NAME }}
        path: |
          ${{ matrix.dir }}/coverage-rust.txt
          ${{ matrix.dir }}/coverage-rust.lcov
        retention-days: 7
# ============================================================================
# RELEASE (workflow_call into release.yml)
# ============================================================================
# commit_sha is the SHA `resolve-source-sha` picked (github.sha). release.yml
# requires post-merge CI to have already pushed images to ECR for that SHA;
# on schedule this is true because post-merge runs on every merge to main
# and nightly fires later.
release:
  name: Release Nightly
  needs:
    - resolve-source-sha
    - compute-release-mode
    - vllm-test
    - vllm-multi-gpu-test
    - sglang-test
    - sglang-multi-gpu-test
    - trtllm-test
    - trtllm-multi-gpu-test
    - dynamo-pipeline
    - rust-tests
  # Framework (vllm/sglang/trtllm) and rust tests do not block publishing.
  # `!cancelled()` switches off the implicit success() check for *every*
  # entry in `needs`, so the jobs that must still gate the release are
  # checked explicitly: resolve-source-sha (otherwise commit_sha is empty)
  # and dynamo-pipeline.
  if: ${{ !cancelled() && needs.compute-release-mode.outputs.release == 'true' && needs.resolve-source-sha.result == 'success' && needs.dynamo-pipeline.result == 'success' }}
  uses: ./.github/workflows/release.yml
  with:
    commit_sha: ${{ needs.resolve-source-sha.outputs.source_sha }}
    nightly: true
    # `inputs` is empty on schedule runs, so fall back to false there.
    skip_gitlab_pipeline: ${{ inputs.skip_gitlab_pipeline || false }}
  secrets: inherit
# ============================================================================
# COVERAGE REPORT
# ============================================================================
# Downloads the per-job Python and Rust coverage artifacts, merges them, and
# publishes a Markdown summary plus the raw reports. Uses `always()` so the
# job still runs when test jobs failed or were skipped.
coverage-report:
  name: Generate Coverage Report
  runs-on: ubuntu-latest
  needs: [vllm-test, vllm-multi-gpu-test, sglang-test, sglang-multi-gpu-test, trtllm-test, trtllm-multi-gpu-test, rust-tests]
  if: always()
  permissions:
    contents: read
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
    - name: Set up Python
      uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065  # v5.6.0
      with:
        python-version: '3.12'
    - name: Install Coverage Tools
      run: |
        python -m pip install "coverage[toml]==7.13.1"
        python -m coverage --version
        echo "✅ Coverage tools installed"
    - name: Download All Python Coverage Artifacts
      uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093  # v4.3.0
      with:
        pattern: coverage-python-*
        path: coverage-artifacts/
        merge-multiple: false
    - name: Download All Rust Coverage Artifacts
      uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093  # v4.3.0
      continue-on-error: true
      with:
        pattern: coverage-rust-*
        path: coverage-rust-artifacts/
        merge-multiple: false
    - name: List Downloaded Artifacts
      run: |
        # Guard against the directory being absent (e.g. tests were skipped
        # and no artifact matched): `ls`/`find` on a missing path would fail
        # this step and skip the merge/summary steps below.
        mkdir -p coverage-artifacts
        echo "📦 Downloaded coverage artifacts:"
        echo "==== Directory structure (including hidden files) ===="
        ls -Ra coverage-artifacts/
        echo ""
        echo "==== Coverage files found ===="
        find coverage-artifacts/ -type f \( -name "*.xml" -o -name ".coverage*" \) | sort
        echo ""
        echo "==== Specifically looking for .coverage files ===="
        find coverage-artifacts/ -name ".coverage" -type f
        echo ""
    - name: Merge All Coverage Data
      run: |
        set -x
        echo "📊 Merging all test coverage..."
        # Expand $GITHUB_WORKSPACE in the config
        sed -i "s|\$GITHUB_WORKSPACE|${GITHUB_WORKSPACE}|g" .coveragerc
        echo "Updated .coveragerc with path remapping:"
        cat .coveragerc
        # Find all .coverage files and copy them with unique names
        mkdir -p coverage-combined
        find coverage-artifacts/ -name ".coverage*" 2>/dev/null | while read -r file; do
          unique_name=$(echo "$file" | tr '/' '_' | sed 's/coverage-artifacts_//')
          cp "$file" "coverage-combined/.coverage.${unique_name}"
          echo "Copied: $file -> coverage-combined/.coverage.${unique_name}"
        done
        # Check if we have any coverage files
        if ls coverage-combined/.coverage* 1> /dev/null 2>&1; then
          echo "✅ Found coverage files to merge"
          echo "Files to merge:"
          ls -lh coverage-combined/
          # Combine all coverage data from the workspace root so that the
          # relative canonical path "components/src/dynamo" in .coveragerc
          # resolves to $GITHUB_WORKSPACE/components/src/dynamo (where the
          # source actually lives) rather than to the non-existent
          # coverage-combined/components/src/dynamo subdirectory.
          echo "Running coverage combine with path remapping..."
          set +e  # Don't exit on error
          COVERAGE_RCFILE=.coveragerc coverage combine --keep coverage-combined/.coverage* 2>&1 | tee combine.log
          set -e
          if [ -f .coverage ]; then
            echo "✅ Combined .coverage file created"
            # Generate reports (continue even if some fail)
            echo "📊 Generating coverage reports..."
            set +e
            COVERAGE_RCFILE=.coveragerc coverage report --show-missing --data-file=.coverage 2>&1 | tee coverage-report.txt
            # Last field of the TOTAL row is the overall percentage.
            TOTAL_COVERAGE=$(awk '/^TOTAL/ {print $NF}' coverage-report.txt | tail -1)
            echo "TOTAL_COVERAGE=${TOTAL_COVERAGE:-0%}" >> "$GITHUB_ENV"
            COVERAGE_RCFILE=.coveragerc coverage html --data-file=.coverage -d coverage-html/ 2>&1 || echo "HTML generation failed"
            COVERAGE_RCFILE=.coveragerc coverage xml --data-file=.coverage -o coverage-merged.xml 2>&1 || echo "XML generation failed"
            set -e
          else
            echo "❌ Failed to create combined .coverage file"
            echo "No coverage data available" > coverage-report.txt
            echo "TOTAL_COVERAGE=0%" >> "$GITHUB_ENV"
          fi
        else
          echo "⚠️ No coverage data found"
          echo "No coverage data available" > coverage-report.txt
          echo "TOTAL_COVERAGE=0%" >> "$GITHUB_ENV"
        fi
    - name: Process Rust Coverage
      run: |
        RUST_TOTAL_COVERAGE="N/A"
        echo "No Rust coverage data available yet." > coverage-rust-report.txt
        if find coverage-rust-artifacts/ -name "coverage-rust.txt" -type f 2>/dev/null | grep -q .; then
          echo "📊 Processing Rust coverage reports..."
          # Concatenate full per-file coverage from every workspace dir
          : > coverage-rust-report.txt
          find coverage-rust-artifacts/ -name "coverage-rust.txt" | sort | while read -r f; do
            # The parent directory is the downloaded artifact's name.
            DIR_NAME=$(basename "$(dirname "$f")")
            echo "=== ${DIR_NAME} ===" >> coverage-rust-report.txt
            cat "$f" >> coverage-rust-report.txt
            echo "" >> coverage-rust-report.txt
          done
          # Aggregate line coverage across all crates by summing total/missed lines
          # from every TOTAL row. Columns on a `cargo llvm-cov report` TOTAL line:
          #   $2 regions  $3 missed_regions  $4  region%
          #   $5 funcs    $6 missed_funcs    $7  func%
          #   $8 lines    $9 missed_lines    $10 line%
          RUST_TOTAL_COVERAGE=$(find coverage-rust-artifacts/ -name "coverage-rust.txt" -exec grep -h "^TOTAL" {} + \
            | awk '{ total += $8; missed += $9 } END { if (total > 0) printf "%.2f%%", (total - missed) * 100 / total; else print "N/A" }')
          RUST_TOTAL_COVERAGE="${RUST_TOTAL_COVERAGE:-N/A}"
          echo "Rust line coverage (aggregated across $(find coverage-rust-artifacts/ -name "coverage-rust.txt" | wc -l | tr -d ' ') crates): ${RUST_TOTAL_COVERAGE}"
        else
          echo "ℹ️ No Rust coverage artifacts found"
        fi
        echo "RUST_TOTAL_COVERAGE=${RUST_TOTAL_COVERAGE}" >> "$GITHUB_ENV"
    - name: Create Coverage Summary
      run: |
        # -u: the timestamp is labelled UTC, so take it in UTC regardless of
        # the runner's timezone.
        DATE=$(date -u +"%Y-%m-%d %H:%M:%S UTC")
        # Unquoted heredoc: ${...} and $(...) are expanded; the escaped
        # backticks stay literal. Blank lines separate Markdown blocks so the
        # table, rules, and fences render reliably.
        cat > coverage-summary.md << EOF
        # 📊 Test Coverage Report

        **Date:** ${DATE}
        **Run ID:** ${{ github.run_id }}
        **Workflow:** ${{ github.workflow }}

        | Language | Total Coverage |
        |----------|---------------|
        | Python | ${TOTAL_COVERAGE} |
        | Rust | ${RUST_TOTAL_COVERAGE} |

        ---

        ## Python Coverage Details

        \`\`\`
        $(cat coverage-report.txt 2>/dev/null || echo "No Python coverage data available")
        \`\`\`

        ---

        ## Rust Coverage Details

        \`\`\`
        $(cat coverage-rust-report.txt 2>/dev/null || echo "No Rust coverage data available")
        \`\`\`

        ---

        ## 📁 Artifacts

        - Full HTML Report: Download \`coverage-reports-${{ github.run_id }}\` artifact
        - Python Coverage XML: \`coverage-merged.xml\`
        - Rust Coverage LCov: \`coverage-rust.lcov\` (per workspace dir)
        EOF
        echo "📄 Coverage summary generated"
        cat coverage-summary.md
    - name: Post to Workflow Summary
      run: cat coverage-summary.md >> "$GITHUB_STEP_SUMMARY"
    - name: Upload Coverage Reports
      uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4.6.2
      if: always()
      with:
        name: coverage-reports-${{ github.run_id }}
        path: |
          coverage-html/
          coverage-merged.xml
          coverage-report.txt
          coverage-rust-report.txt
          coverage-rust-artifacts/
          coverage-summary.md
          .coverage
        # upload-artifact excludes dot-files by default; without this the
        # combined `.coverage` data file listed above is silently dropped.
        include-hidden-files: true
        retention-days: 30
# ============================================================================
# CLEANUP
# ============================================================================
# Removes the run's buildx builder once every job that uses it has finished,
# whether those jobs succeeded or not.
clean-k8s-builder:
  name: Clean K8s builder if exists
  runs-on: prod-default-small-v2
  if: always()
  needs: [vllm-build, sglang-build, trtllm-build, dynamo-pipeline, create-fresh-builder]
  permissions:
    contents: read
  steps:
    - name: Checkout repository
      uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
    - name: Register K8s builder context (skip bootstrap)
      uses: ./.github/actions/bootstrap-buildkit
      # Best effort: the builder may never have been created.
      continue-on-error: true
      with:
        # Prefer the name create-fresh-builder exported. env.BUILDER_NAME
        # embeds run_attempt, so on a re-run attempt it no longer matches a
        # builder created in an earlier attempt; it is only a fallback for
        # when the create job produced no output.
        builder_name: ${{ needs.create-fresh-builder.outputs.builder_name || env.BUILDER_NAME }}
        buildkit_worker_addresses: ''
        skip_bootstrap: 'true'
    - name: Remove K8s builder
      shell: bash
      env:
        # Same name resolution as the register step above, so both steps
        # always target the same builder.
        CLEANUP_BUILDER_NAME: ${{ needs.create-fresh-builder.outputs.builder_name || env.BUILDER_NAME }}
      run: |
        # `|| true`: a missing builder is not an error for a cleanup job.
        docker buildx rm "${CLEANUP_BUILDER_NAME}" || true
############################## SLACK NOTIFICATION ##############################
# Collects the names of this run's failed jobs and, when there are any, posts
# them to the nightly Slack webhook. The job itself uses `always()` so it is
# evaluated even when upstream jobs failed or were skipped.
notify-slack:
  name: Notify Slack
  runs-on: prod-default-v2
  if: always()
  needs: [vllm-test, vllm-multi-gpu-test, sglang-test, sglang-multi-gpu-test, trtllm-test, trtllm-multi-gpu-test, rust-tests]
  permissions:
    contents: read
    # Lets GITHUB_TOKEN list this run's jobs through the Actions REST API.
    actions: read
  steps:
    - name: Get Failed jobs
      shell: bash
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: |
        JOBS_URL="${GITHUB_API_URL}/repos/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/jobs"
        PAGE_JSON=$(mktemp)
        FAILED_JOBS=""
        page=1
        # The endpoint returns at most 100 jobs per page; walk every page so
        # failures beyond the first 100 jobs are not missed.
        while :; do
          curl -fsSL \
            -H "Authorization: Bearer ${GITHUB_TOKEN}" \
            -H "Accept: application/vnd.github+json" \
            "${JOBS_URL}?per_page=100&page=${page}" \
            > "$PAGE_JSON"
          # One Slack line per failed job: for nested names "A / B / C" show
          # "A > C", otherwise just the last segment. The trailing literal
          # "\n" is interpreted later, inside the double-quoted YAML payload.
          PAGE_FAILED=$(jq -r '.jobs[] | select(.conclusion == "failure") | .name | split(" / ") | if length > 2 then ":failed: " + .[0] + " > " + .[-1] else ":failed: " + .[-1] end | . + "\\n"' "$PAGE_JSON")
          if [ -n "$PAGE_FAILED" ]; then
            FAILED_JOBS="${FAILED_JOBS:+${FAILED_JOBS}$'\n'}${PAGE_FAILED}"
          fi
          # A short page is the last page.
          if [ "$(jq '.jobs | length' "$PAGE_JSON")" -lt 100 ]; then
            break
          fi
          page=$((page + 1))
        done
        rm -f "$PAGE_JSON"
        echo "$FAILED_JOBS"
        {
          echo "FAILED_JOBS<<EOF"
          echo "$FAILED_JOBS"
          echo "EOF"
        } >> "$GITHUB_ENV"
    - name: Notify Slack
      # Only alert when something actually failed; otherwise the message
      # would announce a failure with an empty job list.
      if: ${{ env.FAILED_JOBS != '' }}
      uses: slackapi/slack-github-action@91efab103c0de0a537f72a35f6b8cda0ee76bf0a  # v2.1.1
      with:
        webhook: ${{ secrets.SLACK_NOTIFY_NIGHTLY_WEBHOOK_URL }}
        webhook-type: incoming-webhook
        payload: |
          blocks:
            - type: "section"
              text:
                type: mrkdwn
                text: ":alert: *Github Nightly Pipeline Failure*"
            - type: "section"
              text:
                type: mrkdwn
                text: "<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|Workflow Summary>"
            - type: "section"
              text:
                type: mrkdwn
                text: "${{ env.FAILED_JOBS }}"
            - type: "section"
              text:
                type: mrkdwn
                text: "@ops-support Please investigate the failures above."