Nightly CI Pipeline #158
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | |
| # SPDX-License-Identifier: Apache-2.0 | |
name: Nightly CI Pipeline

# Triggers: one scheduled run per day, plus manual dispatch with toggles.
on:
  schedule:
    # 08:00 UTC every day (12:00 AM PST).
    - cron: "0 8 * * *"
  # Manual trigger, intended for testing the pipeline.
  workflow_dispatch:
    inputs:
      release:
        description: "Stage NGC images, Artifactory wheels, and trigger the GitLab release pipeline. Schedule always sets release=true."
        type: boolean
        required: false
        default: true
      run_tests:
        description: "Run vllm/sglang/trtllm tests + dynamo-pipeline checks. Schedule always runs them."
        type: boolean
        required: false
        default: true
      skip_gitlab_pipeline:
        description: "Skip the GitLab release automation pipeline trigger. Emergency use only."
        type: boolean
        required: false
        default: false

# Workflow-wide default token scope; jobs restate what they need.
permissions:
  contents: read

env:
  # Builder name derived from the run id and the run attempt.
  BUILDER_NAME: b-${{ github.run_id }}-${{ github.run_attempt }}

jobs:
# ============================================================================
# PRE-WARM K8S BUILDER
# ============================================================================
# Bootstraps the builder named by the workflow-level BUILDER_NAME and publishes
# that name as a job output so downstream jobs can reference the same builder.
create-fresh-builder:
  name: Create fresh K8s builder
  runs-on: prod-default-small-v2
  permissions:
    contents: read
  outputs:
    builder_name: ${{ steps.export-builder-name.outputs.builder_name }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
    - name: Export builder name
      id: export-builder-name
      run: |
        # BUILDER_NAME is exported to every step by the workflow-level `env`
        # block, so read it from the environment instead of splicing an
        # expression into the script text.
        echo "builder_name=${BUILDER_NAME}" >> "$GITHUB_OUTPUT"
    - name: Create and bootstrap fresh K8s builder
      uses: ./.github/actions/bootstrap-buildkit
      with:
        builder_name: ${{ steps.export-builder-name.outputs.builder_name }}
        buildkit_worker_addresses: ''
        suppress_fallback_warning: 'true'
# ============================================================================
# RESOLVE SOURCE SHA
# ============================================================================
# Publishes the one commit SHA that the downstream jobs (shared-build-image,
# dynamo-pipeline, shared-test, release.yml) build, tag, and release against.
# For schedule runs and workflow_dispatch from main this is simply github.sha.
resolve-source-sha:
  name: Resolve source SHA
  runs-on: prod-default-v2
  permissions:
    contents: read
  outputs:
    source_sha: ${{ steps.resolve.outputs.source_sha }}
  steps:
    - id: resolve
      shell: bash
      run: |
        set -euo pipefail
        resolved_sha="${GITHUB_SHA}"
        printf 'Using caller SHA: %s\n' "${resolved_sha}"
        printf 'source_sha=%s\n' "${resolved_sha}" >> "$GITHUB_OUTPUT"
# ============================================================================
# COMPUTE NIGHTLY DEV VERSION
# ============================================================================
# Emits a PEP 440 dev suffix (e.g. .dev20260423) forwarded to every
# pipeline below. At the leaf, the suffix is stamped into pyproject / Cargo
# versions on the runner before docker build, so wheels produced by the
# wheel_builder stage carry the dev version.
compute-dev-version:
  name: Compute dev version suffix
  runs-on: prod-default-v2
  permissions:
    contents: read
  outputs:
    dev_suffix: ${{ steps.compute.outputs.dev_suffix }}
  steps:
    - id: compute
      shell: bash
      run: |
        # Same strict mode as the resolve-source-sha job.
        set -euo pipefail
        # The suffix is the UTC calendar date at the time this step runs.
        DATE=$(date -u +%Y%m%d)
        echo "dev_suffix=.dev${DATE}" >> "$GITHUB_OUTPUT"
# ============================================================================
# COMPUTE RELEASE MODE
# ============================================================================
# Turns the trigger type and dispatch inputs into two string outputs:
#   release   - consumed by the `release` job to gate the call into release.yml
#   run_tests - consumed by the test jobs' `if:` conditions
compute-release-mode:
  name: Compute release mode
  runs-on: prod-default-v2
  permissions:
    contents: read
  outputs:
    release: ${{ steps.compute.outputs.release }}
    run_tests: ${{ steps.compute.outputs.run_tests }}
  steps:
    - id: compute
      shell: bash
      env:
        DISPATCH_RELEASE: ${{ inputs.release }}
        DISPATCH_RUN_TESTS: ${{ inputs.run_tests }}
      run: |
        # Defaults match a scheduled run: release and run the tests.
        release="true"
        run_tests="true"

        if [ "${GITHUB_EVENT_NAME}" = "workflow_dispatch" ]; then
          # Manual runs honor the inputs; only a literal "false" disables.
          if [ "${DISPATCH_RELEASE}" = "false" ]; then
            release="false"
          fi
          if [ "${DISPATCH_RUN_TESTS}" = "false" ]; then
            run_tests="false"
          fi
        elif [ "${GITHUB_EVENT_NAME}" != "schedule" ]; then
          # Any other trigger never releases but still runs the tests.
          release="false"
        fi

        {
          echo "release=${release}"
          echo "run_tests=${run_tests}"
        } >> "$GITHUB_OUTPUT"
# ============================================================================
# BUILD JOBS
# ============================================================================
# Builds the vLLM runtime image through the shared build workflow, on the
# builder created by create-fresh-builder and at the resolved source SHA.
vllm-build:
  # Shares its display name with the other vllm jobs so the UI groups them.
  name: vllm-runtime
  needs:
    - create-fresh-builder
    - resolve-source-sha
  uses: ./.github/workflows/shared-build-image.yml
  secrets: inherit
  with:
    framework: vllm
    target: runtime
    source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
    builder_name: ${{ needs.create-fresh-builder.outputs.builder_name }}
    platform: "linux/amd64,linux/arm64"
    cuda_version: '["12.9", "13.0"]'
    build_timeout_minutes: 120
# Builds the SGLang runtime image through the shared build workflow, on the
# builder created by create-fresh-builder and at the resolved source SHA.
sglang-build:
  # Shares its display name with the other sglang jobs so the UI groups them.
  name: sglang-runtime
  needs:
    - create-fresh-builder
    - resolve-source-sha
  uses: ./.github/workflows/shared-build-image.yml
  secrets: inherit
  with:
    framework: sglang
    target: runtime
    source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
    builder_name: ${{ needs.create-fresh-builder.outputs.builder_name }}
    platform: "linux/amd64,linux/arm64"
    cuda_version: '["12.9", "13.0"]'
    build_timeout_minutes: 120
# Builds the TensorRT-LLM runtime image through the shared build workflow, on
# the builder created by create-fresh-builder and at the resolved source SHA.
trtllm-build:
  # Shares its display name with the other trtllm jobs so the UI groups them.
  name: trtllm-runtime
  needs:
    - create-fresh-builder
    - resolve-source-sha
  uses: ./.github/workflows/shared-build-image.yml
  secrets: inherit
  with:
    framework: trtllm
    target: runtime
    source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
    builder_name: ${{ needs.create-fresh-builder.outputs.builder_name }}
    platform: "linux/amd64,linux/arm64"
    cuda_version: '["13.1"]'
    build_timeout_minutes: 120
# ============================================================================
# TEST JOBS
# ============================================================================
# CPU-only and single-GPU vLLM tests against the image from vllm-build.
# Skipped when compute-release-mode reports run_tests != 'true'.
vllm-test:
  # Shares its display name with the other vllm jobs so the UI groups them.
  name: vllm-runtime
  needs:
    - vllm-build
    - resolve-source-sha
    - compute-release-mode
  if: needs.compute-release-mode.outputs.run_tests == 'true'
  uses: ./.github/workflows/shared-test.yml
  secrets: inherit
  with:
    test_suite_name: vllm
    test_type: Test
    source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
    target_tag_plain: ${{ needs.vllm-build.outputs.target_tag_plain }}
    # This runner is overridden for the ARM platform.
    amd_runner: prod-tester-amd-gpu-v2
    cuda_version: '["12.9", "13.0"]'
    # arm64 is included for the CPU tests; single-GPU tests are skipped there.
    platform: '["amd64", "arm64"]'
    enable_coverage: true
    run_cpu_only_tests: true
    cpu_only_test_markers: 'vllm and gpu_0'
    gpu_test_markers: 'vllm and gpu_1'
    gpu_test_timeout_minutes: 240
    # Profiled tests go to the parallel stage; unprofiled ones fall through to
    # the sequential stage. A 24 GiB cap admits every currently profiled vLLM
    # test (max is ~20.4 GiB) on a 48 GiB GPU.
    run_gpu_parallel_tests: true
    gpu_parallel_max_vram_gib: "24"
# Multi-GPU (gpu_2 / gpu_4) vLLM tests against the image from vllm-build.
# Skipped when compute-release-mode reports run_tests != 'true'.
vllm-multi-gpu-test:
  # Shares its display name with the other vllm jobs so the UI groups them.
  name: vllm-runtime
  needs:
    - vllm-build
    - resolve-source-sha
    - compute-release-mode
  if: needs.compute-release-mode.outputs.run_tests == 'true'
  uses: ./.github/workflows/shared-test.yml
  secrets: inherit
  with:
    test_suite_name: vllm
    test_type: 'Multi-GPU Test'
    source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
    target_tag_plain: ${{ needs.vllm-build.outputs.target_tag_plain }}
    amd_runner: prod-tester-amd-gpu-4-v2
    cuda_version: '["12.9", "13.0"]'
    # No ARM GPUs available.
    platform: '["amd64"]'
    enable_coverage: true
    run_sanity_check: false
    gpu_test_markers: 'vllm and (gpu_2 or gpu_4)'
    gpu_test_timeout_minutes: 45
# CPU-only and single-GPU SGLang tests against the image from sglang-build.
# Skipped when compute-release-mode reports run_tests != 'true'.
sglang-test:
  # Shares its display name with the other sglang jobs so the UI groups them.
  name: sglang-runtime
  needs:
    - sglang-build
    - resolve-source-sha
    - compute-release-mode
  if: needs.compute-release-mode.outputs.run_tests == 'true'
  uses: ./.github/workflows/shared-test.yml
  secrets: inherit
  with:
    test_suite_name: sglang
    test_type: Test
    source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
    target_tag_plain: ${{ needs.sglang-build.outputs.target_tag_plain }}
    # This runner is overridden for the ARM platform.
    amd_runner: prod-tester-amd-gpu-v2
    cuda_version: '["12.9", "13.0"]'
    # arm64 is included for the CPU tests; single-GPU tests are skipped there.
    platform: '["amd64", "arm64"]'
    enable_coverage: true
    run_cpu_only_tests: true
    cpu_only_test_markers: 'sglang and gpu_0'
    gpu_test_markers: 'sglang and gpu_1'
    gpu_test_timeout_minutes: 240
    # Profiled tests go to the VRAM-aware GPU stage; unprofiled ones fall
    # through to the sequential stage. Current single-GPU runners are 24 GiB,
    # so this cap admits the profiled SGLang pool while yielding one auto slot
    # today. Larger runners will get more slots from the same markers.
    run_gpu_parallel_tests: true
    gpu_parallel_max_vram_gib: "24"
# Multi-GPU (gpu_2 / gpu_4) SGLang tests against the image from sglang-build.
# Skipped when compute-release-mode reports run_tests != 'true'.
sglang-multi-gpu-test:
  # Shares its display name with the other sglang jobs so the UI groups them.
  name: sglang-runtime
  needs:
    - sglang-build
    - resolve-source-sha
    - compute-release-mode
  if: needs.compute-release-mode.outputs.run_tests == 'true'
  uses: ./.github/workflows/shared-test.yml
  secrets: inherit
  with:
    test_suite_name: sglang
    test_type: 'Multi-GPU Test'
    source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
    target_tag_plain: ${{ needs.sglang-build.outputs.target_tag_plain }}
    amd_runner: prod-tester-amd-gpu-4-v2
    cuda_version: '["12.9", "13.0"]'
    # No ARM GPUs available.
    platform: '["amd64"]'
    enable_coverage: true
    run_sanity_check: false
    gpu_test_markers: 'sglang and (gpu_2 or gpu_4)'
    gpu_test_timeout_minutes: 45
# CPU-only and single-GPU TensorRT-LLM tests against the image from
# trtllm-build. Skipped when compute-release-mode reports run_tests != 'true'.
trtllm-test:
  # Shares its display name with the other trtllm jobs so the UI groups them.
  name: trtllm-runtime
  needs:
    - trtllm-build
    - resolve-source-sha
    - compute-release-mode
  if: needs.compute-release-mode.outputs.run_tests == 'true'
  uses: ./.github/workflows/shared-test.yml
  secrets: inherit
  with:
    test_suite_name: trtllm
    test_type: Test
    source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
    target_tag_plain: ${{ needs.trtllm-build.outputs.target_tag_plain }}
    # This runner is overridden for the ARM platform.
    amd_runner: prod-tester-amd-gpu-v2
    cuda_version: '["13.1"]'
    # arm64 is included for the CPU tests; single-GPU tests are skipped there.
    platform: '["amd64", "arm64"]'
    enable_coverage: true
    run_cpu_only_tests: true
    cpu_only_test_markers: 'trtllm and gpu_0'
    gpu_test_markers: 'trtllm and gpu_1'
    gpu_test_timeout_minutes: 240
    # Profiled tests go to the VRAM-aware GPU stage; unprofiled ones fall
    # through to the sequential stage. Current single-GPU runners are 24 GiB,
    # so this cap admits the profiled TRT-LLM pool while yielding one auto slot
    # today. Larger runners will get more slots from the same markers.
    run_gpu_parallel_tests: true
    gpu_parallel_max_vram_gib: "24"
# Multi-GPU (gpu_2 / gpu_4) TensorRT-LLM tests against the image from
# trtllm-build. Skipped when compute-release-mode reports run_tests != 'true'.
trtllm-multi-gpu-test:
  # Shares its display name with the other trtllm jobs so the UI groups them.
  name: trtllm-runtime
  needs:
    - trtllm-build
    - resolve-source-sha
    - compute-release-mode
  if: needs.compute-release-mode.outputs.run_tests == 'true'
  uses: ./.github/workflows/shared-test.yml
  secrets: inherit
  with:
    test_suite_name: trtllm
    test_type: 'Multi-GPU Test'
    source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
    target_tag_plain: ${{ needs.trtllm-build.outputs.target_tag_plain }}
    amd_runner: prod-tester-amd-gpu-4-v2
    cuda_version: '["13.1"]'
    # No ARM GPUs available.
    platform: '["amd64"]'
    enable_coverage: true
    run_sanity_check: false
    gpu_test_markers: 'trtllm and (gpu_2 or gpu_4)'
    gpu_test_timeout_minutes: 45
# ============================================================================
# DYNAMO RUNTIME PIPELINE
# ============================================================================
# Runs the dynamo-runtime pipeline on the fresh builder without cache, passing
# the nightly dev version suffix and the resolved source SHA.
dynamo-pipeline:
  name: dynamo-runtime
  needs:
    - create-fresh-builder
    - compute-dev-version
    - resolve-source-sha
  uses: ./.github/workflows/dynamo-pipeline.yml
  secrets: inherit
  with:
    source_ref: ${{ needs.resolve-source-sha.outputs.source_sha }}
    dev_version_suffix: ${{ needs.compute-dev-version.outputs.dev_suffix }}
    builder_name: ${{ needs.create-fresh-builder.outputs.builder_name }}
    fresh_builder: true
    no_cache: true
    build_timeout_minutes: 120
    # TODO: widen beyond `pre_merge` — today it picks up tests
    # (e.g. fault_tolerance/deploy/*) that fail in this container-only
    # context. Matches the coverage of the old container-validation-dynamo
    # workflow.
    cpu_parallel_test_markers: "pre_merge and parallel and not (vllm or sglang or trtllm) and (gpu_0)"
    cpu_sequential_test_markers: "pre_merge and not parallel and not (vllm or sglang or trtllm) and (gpu_0)"
    # NOTE(review): `none` presumably matches no test, leaving the GPU stage
    # empty on purpose — confirm against dynamo-pipeline.yml.
    gpu_test_markers: "pre_merge and none and gpu_1"
# ============================================================================
# RUST COVERAGE
# ============================================================================
# Runs the Rust tests under cargo-llvm-cov once per matrix directory and
# uploads a text summary plus an lcov file as a per-directory artifact.
# Skipped when compute-release-mode reports run_tests != 'true'.
rust-tests:
  name: rust-${{ matrix.dir == '.' && 'root' || matrix.dir }}-coverage
  needs: [compute-release-mode]
  if: ${{ needs.compute-release-mode.outputs.run_tests == 'true' }}
  runs-on:
    group: Fastchecker
  strategy:
    fail-fast: false
    matrix:
      dir: ['.', 'lib/bindings/python', 'lib/bindings/kvbm']
  permissions:
    contents: read
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
      with:
        lfs: true
    # Computed before anything that can fail: the upload step below runs with
    # `if: always()`, so the artifact name must already exist when the build
    # or the tests fail. Otherwise the name would be empty and the matrix legs
    # would no longer upload under distinct names.
    - name: Compute coverage artifact name
      shell: bash
      env:
        MATRIX_DIR: ${{ matrix.dir }}
      run: |
        # '.' becomes 'root'; path separators become '-'.
        SAFE_DIR=$(echo "${MATRIX_DIR}" | sed 's|^\.$|root|' | tr '/' '-')
        echo "RUST_COV_ARTIFACT_NAME=coverage-rust-${SAFE_DIR}-${GITHUB_RUN_ID}" >> "$GITHUB_ENV"
    - name: Set up system dependencies
      run: |
        # Install protoc for Rust build dependencies (NOTE: much faster than apt install)
        PB_REL="https://github.com/protocolbuffers/protobuf/releases"
        PROTOC_VER="30.2"
        PROTOC_ZIP="protoc-${PROTOC_VER}-linux-x86_64.zip"
        PROTOC_SHA256="327e9397c6fb3ea2a542513a3221334c6f76f7aa524a7d2561142b67b312a01f"
        curl --retry 5 --retry-delay 2 -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}"
        # Verify the download against the pinned digest before unpacking.
        echo "${PROTOC_SHA256}  ${PROTOC_ZIP}" | sha256sum -c -
        # -o: overwrite without prompting if a previous run left files behind.
        unzip -o "${PROTOC_ZIP}" -d "$HOME/.local"
        rm "${PROTOC_ZIP}"
        # `export PATH` only lasts for this step; GITHUB_PATH makes protoc
        # visible to the cargo steps that follow.
        echo "$HOME/.local/bin" >> "$GITHUB_PATH"
        export PATH="$PATH:$HOME/.local/bin"
        protoc --version
    - name: Cache cargo artifacts
      uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830  # v4.3.0
      with:
        path: |
          ~/.cargo/bin/
          ~/.cargo/registry
        key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
        restore-keys: ${{ runner.os }}-cargo-
    - name: Set up Rust Toolchain Components
      run: rustup component add llvm-tools-preview
    - name: Install cargo-llvm-cov
      # Reuse an installed (e.g. cached) binary when present.
      run: cargo-llvm-cov --version 2>/dev/null || cargo install cargo-llvm-cov --locked
    # Have an explicit step to build tests first to separate time spent on build vs execution.
    - name: Compile Tests
      working-directory: ${{ matrix.dir }}
      run: cargo test --locked --no-run
    - name: Run Unit Tests with Coverage
      working-directory: ${{ matrix.dir }}
      # NOTE: --all-targets doesn't run doc tests.
      # cargo llvm-cov is a drop-in for cargo test; --no-report defers output
      # so we can generate multiple formats without re-running the tests.
      run: |
        cargo llvm-cov --locked --all-targets --no-report
        cargo llvm-cov report --output-path coverage-rust.txt
        cargo llvm-cov report --lcov --output-path coverage-rust.lcov
        echo "Coverage summary:"
        grep "^TOTAL" coverage-rust.txt || tail -3 coverage-rust.txt
    - name: Upload Rust Coverage Data
      uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4.6.2
      if: always()
      with:
        name: ${{ env.RUST_COV_ARTIFACT_NAME }}
        path: |
          ${{ matrix.dir }}/coverage-rust.txt
          ${{ matrix.dir }}/coverage-rust.lcov
        retention-days: 7
# ============================================================================
# RELEASE (workflow_call into release.yml)
# ============================================================================
# commit_sha is the SHA `resolve-source-sha` picked (github.sha). release.yml
# requires post-merge CI to have already pushed images to ECR for that SHA;
# on schedule this is true because post-merge runs on every merge to main
# and nightly fires later.
release:
  name: Release Nightly
  needs:
    - resolve-source-sha
    - compute-release-mode
    - vllm-test
    - vllm-multi-gpu-test
    - sglang-test
    - sglang-multi-gpu-test
    - trtllm-test
    - trtllm-multi-gpu-test
    - dynamo-pipeline
    - rust-tests
  # Framework (vllm/sglang/trtllm) and rust tests do not block publishing:
  # `!cancelled()` lets this job start after the needed jobs finish whatever
  # their result was, as long as the run was not cancelled.
  # The explicit resolve-source-sha check keeps release.yml from being called
  # with an empty commit_sha when that job did not succeed.
  # NOTE(review): this condition also lets a failed dynamo-pipeline through,
  # and the compute-release-mode script comment says a failing nightly blocks
  # the release — confirm which behavior is intended.
  if: >-
    ${{
      !cancelled()
      && needs.resolve-source-sha.result == 'success'
      && needs.compute-release-mode.outputs.release == 'true'
    }}
  uses: ./.github/workflows/release.yml
  with:
    commit_sha: ${{ needs.resolve-source-sha.outputs.source_sha }}
    nightly: true
    # `inputs` is empty on schedule runs, so fall back to false.
    skip_gitlab_pipeline: ${{ inputs.skip_gitlab_pipeline || false }}
  secrets: inherit
# ============================================================================
# COVERAGE REPORT
# ============================================================================
# Downloads the coverage-python-* and coverage-rust-* artifacts of this run,
# merges the Python data with coverage.py, aggregates the Rust line totals,
# and publishes a Markdown summary plus a combined report artifact.
coverage-report:
  name: Generate Coverage Report
  runs-on: ubuntu-latest
  needs: [vllm-test, vllm-multi-gpu-test, sglang-test, sglang-multi-gpu-test, trtllm-test, trtllm-multi-gpu-test, rust-tests]
  # Report even when the needed test jobs failed or were skipped.
  if: always()
  permissions:
    contents: read
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
    - name: Set up Python
      uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065  # v5.6.0
      with:
        python-version: '3.12'
    - name: Install Coverage Tools
      run: |
        python -m pip install "coverage[toml]==7.13.1"
        python -m coverage --version
        echo "✅ Coverage tools installed"
    - name: Download All Python Coverage Artifacts
      uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093  # v4.3.0
      with:
        pattern: coverage-python-*
        path: coverage-artifacts/
        merge-multiple: false
    - name: Download All Rust Coverage Artifacts
      uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093  # v4.3.0
      continue-on-error: true
      with:
        pattern: coverage-rust-*
        path: coverage-rust-artifacts/
        merge-multiple: false
    - name: List Downloaded Artifacts
      run: |
        # The download directories may be missing when no artifact matched
        # (e.g. the test jobs were skipped); create them so the listing below
        # and the later steps see empty directories instead of failing.
        mkdir -p coverage-artifacts coverage-rust-artifacts
        echo "📦 Downloaded coverage artifacts:"
        echo "==== Directory structure (including hidden files) ===="
        ls -Ra coverage-artifacts/
        echo ""
        echo "==== Coverage files found ===="
        find coverage-artifacts/ -type f \( -name "*.xml" -o -name ".coverage*" \) | sort
        echo ""
        echo "==== Specifically looking for .coverage files ===="
        find coverage-artifacts/ -name ".coverage" -type f
        echo ""
    - name: Merge All Coverage Data
      run: |
        set -x
        echo "📊 Merging all test coverage..."
        # Expand $GITHUB_WORKSPACE in the config
        sed -i "s|\$GITHUB_WORKSPACE|${GITHUB_WORKSPACE}|g" .coveragerc
        echo "Updated .coveragerc with path remapping:"
        cat .coveragerc
        # Find all .coverage files and copy them with unique names
        # (the artifact path is flattened into the file name).
        mkdir -p coverage-combined
        find coverage-artifacts/ -name ".coverage*" 2>/dev/null | while read -r file; do
          unique_name=$(echo "$file" | tr '/' '_' | sed 's/coverage-artifacts_//')
          cp "$file" "coverage-combined/.coverage.${unique_name}"
          echo "Copied: $file -> coverage-combined/.coverage.${unique_name}"
        done
        # Check if we have any coverage files
        if ls coverage-combined/.coverage* 1> /dev/null 2>&1; then
          echo "✅ Found coverage files to merge"
          echo "Files to merge:"
          ls -lh coverage-combined/
          # Combine all coverage data from the workspace root so that the
          # relative canonical path "components/src/dynamo" in .coveragerc
          # resolves to $GITHUB_WORKSPACE/components/src/dynamo (where the
          # source actually lives) rather than to the non-existent
          # coverage-combined/components/src/dynamo subdirectory.
          echo "Running coverage combine with path remapping..."
          set +e  # Don't exit on error
          COVERAGE_RCFILE=.coveragerc coverage combine --keep coverage-combined/.coverage* 2>&1 | tee combine.log
          set -e
          if [ -f .coverage ]; then
            echo "✅ Combined .coverage file created"
            # Generate reports (continue even if some fail)
            echo "📊 Generating coverage reports..."
            set +e
            COVERAGE_RCFILE=.coveragerc coverage report --show-missing --data-file=.coverage 2>&1 | tee coverage-report.txt
            # Last column of the TOTAL row is the overall percentage.
            TOTAL_COVERAGE=$(awk '/^TOTAL/ {print $NF}' coverage-report.txt | tail -1)
            echo "TOTAL_COVERAGE=${TOTAL_COVERAGE:-0%}" >> "$GITHUB_ENV"
            COVERAGE_RCFILE=.coveragerc coverage html --data-file=.coverage -d coverage-html/ 2>&1 || echo "HTML generation failed"
            COVERAGE_RCFILE=.coveragerc coverage xml --data-file=.coverage -o coverage-merged.xml 2>&1 || echo "XML generation failed"
            set -e
          else
            echo "❌ Failed to create combined .coverage file"
            echo "No coverage data available" > coverage-report.txt
            echo "TOTAL_COVERAGE=0%" >> "$GITHUB_ENV"
          fi
        else
          echo "⚠️ No coverage data found"
          echo "No coverage data available" > coverage-report.txt
          echo "TOTAL_COVERAGE=0%" >> "$GITHUB_ENV"
        fi
    - name: Process Rust Coverage
      run: |
        RUST_TOTAL_COVERAGE="N/A"
        echo "No Rust coverage data available yet." > coverage-rust-report.txt
        if find coverage-rust-artifacts/ -name "coverage-rust.txt" -type f 2>/dev/null | grep -q .; then
          echo "📊 Processing Rust coverage reports..."
          # Concatenate full per-file coverage from every workspace dir
          : > coverage-rust-report.txt
          find coverage-rust-artifacts/ -name "coverage-rust.txt" | sort | while read -r f; do
            # The parent directory of each report is used as its heading.
            DIR_NAME=$(basename "$(dirname "$f")")
            echo "=== ${DIR_NAME} ===" >> coverage-rust-report.txt
            cat "$f" >> coverage-rust-report.txt
            echo "" >> coverage-rust-report.txt
          done
          # Aggregate line coverage across all crates by summing total/missed lines
          # from every TOTAL row. Columns on a `cargo llvm-cov report` TOTAL line:
          #   $2 regions  $3 missed_regions  $4 region%
          #   $5 funcs    $6 missed_funcs    $7 func%
          #   $8 lines    $9 missed_lines    $10 line%
          RUST_TOTAL_COVERAGE=$(find coverage-rust-artifacts/ -name "coverage-rust.txt" -exec grep -h "^TOTAL" {} + \
            | awk '{ total += $8; missed += $9 } END { if (total > 0) printf "%.2f%%", (total - missed) * 100 / total; else print "N/A" }')
          RUST_TOTAL_COVERAGE="${RUST_TOTAL_COVERAGE:-N/A}"
          echo "Rust line coverage (aggregated across $(find coverage-rust-artifacts/ -name "coverage-rust.txt" | wc -l | tr -d ' ') crates): ${RUST_TOTAL_COVERAGE}"
        else
          echo "ℹ️ No Rust coverage artifacts found"
        fi
        echo "RUST_TOTAL_COVERAGE=${RUST_TOTAL_COVERAGE}" >> "$GITHUB_ENV"
    - name: Create Coverage Summary
      run: |
        # -u so the timestamp really is UTC, as the label says.
        DATE=$(date -u +"%Y-%m-%d %H:%M:%S UTC")
        # Unquoted heredoc: variables and $(...) are expanded, and backticks
        # are escaped so they reach the Markdown verbatim. Blank lines separate
        # the Markdown blocks so they render as distinct elements.
        cat > coverage-summary.md << EOF
        # 📊 Test Coverage Report

        **Date:** ${DATE}

        **Run ID:** ${{ github.run_id }}

        **Workflow:** ${{ github.workflow }}

        | Language | Total Coverage |
        |----------|---------------|
        | Python | ${TOTAL_COVERAGE} |
        | Rust | ${RUST_TOTAL_COVERAGE} |

        ---

        ## Python Coverage Details

        \`\`\`
        $(cat coverage-report.txt 2>/dev/null || echo "No Python coverage data available")
        \`\`\`

        ---

        ## Rust Coverage Details

        \`\`\`
        $(cat coverage-rust-report.txt 2>/dev/null || echo "No Rust coverage data available")
        \`\`\`

        ---

        ## 📁 Artifacts

        - Full HTML Report: Download \`coverage-reports-${{ github.run_id }}\` artifact
        - Python Coverage XML: \`coverage-merged.xml\`
        - Rust Coverage LCov: \`coverage-rust.lcov\` (per workspace dir)
        EOF
        echo "📄 Coverage summary generated"
        cat coverage-summary.md
    - name: Post to Workflow Summary
      run: cat coverage-summary.md >> "$GITHUB_STEP_SUMMARY"
    - name: Upload Coverage Reports
      uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4.6.2
      if: always()
      with:
        name: coverage-reports-${{ github.run_id }}
        path: |
          coverage-html/
          coverage-merged.xml
          coverage-report.txt
          coverage-rust-report.txt
          coverage-rust-artifacts/
          coverage-summary.md
          .coverage
        retention-days: 30
# ============================================================================
# CLEANUP
# ============================================================================
# Removes the builder created by create-fresh-builder once every job that
# uses it has finished, whatever their results.
clean-k8s-builder:
  name: Clean K8s builder if exists
  runs-on: prod-default-small-v2
  if: always()
  needs: [vllm-build, sglang-build, trtllm-build, dynamo-pipeline, create-fresh-builder]
  permissions:
    contents: read
  steps:
    - name: Checkout repository
      uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
    - name: Register K8s builder context (skip bootstrap)
      uses: ./.github/actions/bootstrap-buildkit
      # Best effort: the remove step below must still run.
      continue-on-error: true
      with:
        builder_name: ${{ needs.create-fresh-builder.outputs.builder_name }}
        buildkit_worker_addresses: ''
        skip_bootstrap: 'true'
    - name: Remove K8s builder
      shell: bash
      env:
        # Remove the builder that was actually created. BUILDER_NAME embeds
        # the current run_attempt, so on a partial re-run it can differ from
        # the name create-fresh-builder published in an earlier attempt; it is
        # only used as a fallback when that output is empty.
        TARGET_BUILDER: ${{ needs.create-fresh-builder.outputs.builder_name || env.BUILDER_NAME }}
      run: |
        # `|| true`: a builder that is already gone is not an error.
        docker buildx rm "${TARGET_BUILDER}" || true
############################## SLACK NOTIFICATION ##############################
# Lists the jobs of this run whose conclusion is "failure" and, when there is
# at least one, posts them to the nightly Slack webhook.
notify-slack:
  name: Notify Slack
  runs-on: prod-default-v2
  # Run after the test jobs whatever their result; the Slack step itself is
  # gated on failures actually being found.
  if: always()
  needs: [vllm-test, vllm-multi-gpu-test, sglang-test, sglang-multi-gpu-test, trtllm-test, trtllm-multi-gpu-test, rust-tests]
  permissions:
    contents: read
    # Needed to list this run's jobs through the REST API.
    actions: read
  steps:
    - name: Get Failed jobs
      id: failed-jobs
      shell: bash
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: |
        JOBS_JSON=$(mktemp)
        # -f: fail on HTTP errors instead of feeding an error body to jq.
        # NOTE(review): only the first 100 jobs are fetched; add pagination if
        # a run can exceed that.
        curl -fsSL --retry 3 --retry-delay 2 \
          -H "Authorization: Bearer ${GITHUB_TOKEN}" \
          -H "Accept: application/vnd.github+json" \
          "${GITHUB_API_URL}/repos/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/jobs?per_page=100" \
          > "$JOBS_JSON"
        # One ":failed: ..." entry per failed job. Names nested more than two
        # levels deep are shortened to "first > last". Each entry ends with a
        # literal backslash-n, which the double-quoted YAML string in the
        # Slack payload turns into a line break.
        FAILED_JOBS=$(jq -r '.jobs[] | select(.conclusion == "failure") | .name | split(" / ") | if length > 2 then ":failed: " + .[0] + " > " + .[-1] else ":failed: " + .[-1] end | . + "\\n"' "$JOBS_JSON")
        echo "$FAILED_JOBS"
        {
          echo "FAILED_JOBS<<EOF"
          echo "$FAILED_JOBS"
          echo "EOF"
        } >> "$GITHUB_ENV"
        if [ -n "$FAILED_JOBS" ]; then
          echo "has_failures=true" >> "$GITHUB_OUTPUT"
        else
          echo "has_failures=false" >> "$GITHUB_OUTPUT"
        fi
    - name: Notify Slack
      # The message is a failure alert, so stay silent when nothing failed.
      if: ${{ steps.failed-jobs.outputs.has_failures == 'true' }}
      uses: slackapi/slack-github-action@91efab103c0de0a537f72a35f6b8cda0ee76bf0a  # v2.1.1
      with:
        webhook: ${{ secrets.SLACK_NOTIFY_NIGHTLY_WEBHOOK_URL }}
        webhook-type: incoming-webhook
        payload: |
          blocks:
            - type: "section"
              text:
                type: mrkdwn
                text: ":alert: *Github Nightly Pipeline Failure*"
            - type: "section"
              text:
                type: mrkdwn
                text: "<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|Workflow Summary>"
            - type: "section"
              text:
                type: mrkdwn
                text: "${{ env.FAILED_JOBS }}"
            - type: "section"
              text:
                type: mrkdwn
                text: "@ops-support Please investigate the failures above."