Skip to content

Merge branch 'main' into arhamm1-patch-16 #4716

Merge branch 'main' into arhamm1-patch-16

Merge branch 'main' into arhamm1-patch-16 #4716

Workflow file for this run

# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: CICD NeMo Curator

# Triggers: a daily schedule at 00:00, plus pushes to `main` and to branches
# named `pull-request/<digits>`.
on:
  schedule:
    - cron: "0 0 * * *"
  push:
    branches:
      - main
      - "pull-request/[0-9]+"

# Runs are grouped by workflow, PR number (or ref), label name and event
# name; a newer run in the same group cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }}
  cancel-in-progress: true

permissions:
  id-token: write
  contents: read
  actions: write  # Required for gh run cancel

# Workflow-level environment shared by every job.
env:
  container-registry: ${{ vars.DEFAULT_CONTAINER_REGISTRY }}
  UV_HTTP_TIMEOUT: "60"
jobs:
pre-flight:
  # Calls the shared pre-flight reusable workflow (pinned to v0.80.1). The
  # other jobs in this file read its outputs: runner_prefix, test_data_path,
  # docs_only, is_ci_workload and force_run_all.
  uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.80.1
  with:
    # Runner prefixes and test-data locations come from repository variables.
    default_runner_prefix: ${{ vars.DEFAULT_RUNNER_PREFIX }}
    non_nvidia_runner_prefix: ${{ vars.NON_NVIDIA_RUNNER_PREFIX }}
    default_test_data_path: ${{ vars.DEFAULT_TEST_DATA_PATH }}
    non_nvidia_test_data_path: ${{ vars.NON_NVIDIA_TEST_DATA_PATH }}
    sso_users_filename: ${{ vars.SSO_USERS_FILENAME }}
  secrets:
    NVIDIA_MANAGEMENT_ORG_PAT: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }}
cicd-wait-in-queue:
  # Gate job attached to the `test` environment. It only runs when pre-flight
  # reports neither a CI workload nor a docs-only change; the test and build
  # jobs list it in their `needs`.
  needs:
    - pre-flight
  runs-on: ubuntu-latest
  environment: test
  if: |
    needs.pre-flight.outputs.is_ci_workload == 'false'
    && needs.pre-flight.outputs.docs_only == 'false'
  steps:
    - name: Running CI tests
      run: echo "Running CI tests"
cicd-cpu-tests:
  # CPU unit tests: one matrix leg per test folder. Each leg invokes
  # tests/L0_Unit_Test_CPU.sh twice (arguments `3.11`, then `3.13`) and
  # uploads the resulting coverage files as a uniquely named artifact.
  name: Unit_Test_${{ matrix.folder }}_CPU
  needs:
    - pre-flight
    - cicd-wait-in-queue
  runs-on: ubuntu-latest
  environment: nemo-ci
  # Runs after a successful gate, or when pre-flight flags a CI workload or
  # a forced full run; never when the run has been cancelled.
  if: |
    (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
    )
    && !cancelled()
  strategy:
    fail-fast: false
    matrix:
      folder:
        - backends
        - config
        - core
        - models
        - pipelines
        - stages-audio
        - stages-common
        - stages-deduplication
        - stages-image
        - stages-interleaved
        - stages-math_stages
        - stages-synthetic
        - stages-text
        - stages-video
        - eval
        - tasks
        - utils
  env:
    HF_TOKEN: ${{ secrets.HF_TOKEN }}
  steps:
    - uses: actions/checkout@v6
      with:
        submodules: recursive

    - name: Free up disk space on Ubuntu
      run: |
        sudo rm -rf /usr/share/dotnet
        sudo rm -rf /opt/ghc
        sudo rm -rf /usr/local/share/boost
        sudo rm -rf "$AGENT_TOOLSDIRECTORY"

    # ffmpeg is installed only for the audio leg of the matrix.
    - name: Install system dependencies for audio tests
      if: matrix.folder == 'stages-audio'
      run: |
        sudo apt-get update
        sudo apt-get install -y --no-install-recommends ffmpeg

    - name: Install uv
      uses: astral-sh/setup-uv@v6
      with:
        enable-cache: false

    - name: Run CPU tests (Python 3.11)
      timeout-minutes: 40
      run: bash tests/L0_Unit_Test_CPU.sh "${{ matrix.folder }}" 3.11

    - name: Run CPU tests (Python 3.13)
      timeout-minutes: 40
      run: bash tests/L0_Unit_Test_CPU.sh "${{ matrix.folder }}" 3.13

    # Writes coverage.xml, prints the report, and publishes an artifact name
    # made unique per leg and per run by the uuidgen suffix.
    - name: Generate report
      id: check
      shell: bash
      run: |
        uv run coverage xml
        uv run coverage report
        coverage_report=coverage-unit-test-${{ matrix.folder }}-${{ github.run_id }}-$(uuidgen)
        echo "$coverage_report"
        echo "coverage_report=$coverage_report" >> "$GITHUB_OUTPUT"

    - name: Upload artifacts
      uses: actions/upload-artifact@v6
      if: ${{ steps.check.outputs.coverage_report != 'none' }}
      with:
        name: ${{ steps.check.outputs.coverage_report }}
        path: |
          ./coverage.xml
          ./.coverage
        # .coverage is a dotfile, so hidden files must be included.
        include-hidden-files: true
cicd-container-build:
  # Builds the Curator container from ./docker/Dockerfile through the
  # repository-local build-container action, on the runner selected by
  # pre-flight.
  # NOTE(review): cicd-gpu-tests consumes `<registry>/curator:<github.sha>`;
  # presumably this action pushes that tag — confirm in the action itself.
  needs: [pre-flight, cicd-wait-in-queue]
  # Runs after a successful gate, or when pre-flight flags a CI workload or
  # a forced full run; never when the run has been cancelled.
  if: |
    (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
    )
    && !cancelled()
  runs-on: ${{ needs.pre-flight.outputs.runner_prefix }}
  environment: nemo-ci
  steps:
    - name: Checkout
      # Kept on the same major version as every other checkout in this
      # workflow (cicd-gpu-tests already uses v6 on the same runners).
      uses: actions/checkout@v6
    - name: main
      uses: ./.github/actions/build-container
      with:
        container-registry: ${{ env.container-registry }}
        dockerfile-path: "./docker/Dockerfile"
        PAT: ${{ secrets.PAT }}
        repo-name: "Curator"
cicd-gpu-test-matrix:
  # Turns tests/gpu_test_groups.json into a job matrix and exposes it as the
  # `matrix` output, which cicd-gpu-tests expands with fromJSON.
  needs:
    - pre-flight
    - cicd-wait-in-queue
  runs-on: ubuntu-latest
  # Runs after a successful gate, or when pre-flight flags a CI workload or
  # a forced full run; never when the run has been cancelled.
  if: |
    (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
    )
    && !cancelled()
  outputs:
    matrix: ${{ steps.set-matrix.outputs.matrix }}
  steps:
    - uses: actions/checkout@v6

    - name: Verify all GPU tests are in gpu_test_groups.json
      run: python3 .github/scripts/check_gpu_test_coverage.py

    # Each group in the JSON becomes one `include` entry carrying the group
    # name plus its space-joined `extras` and `paths` lists.
    - name: Generate GPU test matrix from config
      id: set-matrix
      run: |
        matrix=$(python3 -c "
        import json
        cfg = json.load(open('tests/gpu_test_groups.json'))
        include = [{'gpu-test-group': g, 'extras': ' '.join(v['extras']), 'test-paths': ' '.join(v['paths'])} for g, v in cfg.items()]
        print(json.dumps({'include': include}))
        ")
        echo "matrix=$matrix" >> "$GITHUB_OUTPUT"
cicd-gpu-tests:
  # GPU unit tests: one leg per group emitted by cicd-gpu-test-matrix, run
  # through the repository-local test-template action against the container
  # image tagged with this commit's SHA.
  name: L0_Unit_Test_GPU-${{ matrix.gpu-test-group }}
  needs:
    - pre-flight
    - cicd-wait-in-queue
    - cicd-container-build
    - cicd-gpu-test-matrix
  runs-on: ${{ needs.pre-flight.outputs.runner_prefix }}
  environment: nemo-ci
  # Runs after its prerequisites succeed, or when pre-flight flags a CI
  # workload or a forced full run; never when the run has been cancelled.
  if: |
    (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
    )
    && !cancelled()
  strategy:
    fail-fast: false
    matrix: ${{ fromJSON(needs.cicd-gpu-test-matrix.outputs.matrix) }}
  steps:
    - name: Checkout
      uses: actions/checkout@v6
      with:
        submodules: recursive

    - name: main
      uses: ./.github/actions/test-template
      with:
        script: L0_Unit_Test_GPU
        is_unit_test: "false"
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
        PAT: ${{ secrets.PAT }}
        timeout: 40
        test-data-path: ${{ needs.pre-flight.outputs.test_data_path }}
        runner: ${{ needs.pre-flight.outputs.runner_prefix }}
        container-image: ${{ env.container-registry }}/curator:${{ github.sha }}
        # Per-group values produced by the matrix-generation job.
        gpu-test-extras: ${{ matrix.extras }}
        gpu-test-paths: ${{ matrix.test-paths }}
Nemo_CICD_Test:
  # Aggregating status job. It inspects the conclusion of every job in this
  # run, derives a single result (success / cancelled / failure), optionally
  # notifies, and finally exits non-zero unless the result is `success`.
  needs:
    - pre-flight
    - cicd-cpu-tests
    - cicd-gpu-tests
  # Run even when upstream jobs failed or were skipped, but not when the
  # run itself was cancelled.
  if: ${{ !cancelled() }}
  runs-on: ubuntu-latest
  # NOTE(review): `write-all` looks broader than the steps below need
  # (reading run jobs, editing PR labels, commenting) — consider narrowing
  # once the needs of .github/scripts/notify.py are confirmed.
  permissions: write-all
  steps:
    # A single checkout serves both the gh CLI calls and notify.py.
    - name: Checkout
      uses: actions/checkout@v6

    - name: Get workflow result
      id: result
      env:
        GH_TOKEN: ${{ github.token }}
        RUN_ID: ${{ github.run_id }}
        DOCS_ONLY: ${{ needs.pre-flight.outputs.docs_only }}
      run: |
        # Get workflow run details and check job conclusions
        NUM_FAILED=$(gh run view "$RUN_ID" --json jobs -q '[.jobs[] | select(.conclusion == "failure") | .name] | length')
        NUM_CANCELLED=$(gh run view "$RUN_ID" --json jobs -q '[.jobs[] | select(.conclusion == "cancelled") | .name] | length')
        # Docs-only changes count as success regardless of job conclusions.
        if [[ ($NUM_FAILED -eq 0 && $NUM_CANCELLED -eq 0) || $DOCS_ONLY == 'true' ]]; then
          RESULT="success"
        elif [[ $NUM_CANCELLED -gt 0 ]]; then
          RESULT="cancelled"
        else
          RESULT="failure"
        fi
        # Output the final status
        echo "code=$RESULT" | tee -a "$GITHUB_OUTPUT"

    # NOTE(review): the three notification steps below test
    # `github.event.label.name` or `github.event_name == 'pull_request'`.
    # This workflow only declares `schedule` and `push` triggers, so those
    # conditions look unreachable as written — confirm whether a
    # `pull_request` trigger is expected or the conditions should change.
    - name: Remove label if not cancelled
      if: |
        steps.result.outputs.code != 'cancelled'
        && github.event.label.name == 'Run CICD'
        && github.event.pull_request.head.repo.full_name == github.repository
      env:
        GH_TOKEN: ${{ github.token }}
        PR_NUMBER: ${{ github.event.number }}
      run: gh pr edit "$PR_NUMBER" --remove-label "Run CICD"

    - name: Pipeline successful, add PR comment
      if: |
        steps.result.outputs.code == 'success'
        && github.event_name == 'pull_request'
        && env.SLACK_WEBHOOK != ''
      uses: peter-evans/create-or-update-comment@v4
      env:
        SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
        REPOSITORY: ${{ github.repository }}
        RUN_ID: ${{ github.run_id }}
      with:
        issue-number: ${{ github.event.number }}
        body: |
          [🤖]: Hi @${{ github.event.pull_request.user.login }} 👋,
          We wanted to let you know that a [CICD pipeline](https://github.com/${{ env.REPOSITORY }}/actions/runs/${{ env.RUN_ID }}) for this PR just finished successfully.
          So it might be time to merge this PR or get some approvals.
          //cc @NVIDIA-NeMo/automation

    - name: "Pipeline not successful and not cancelled: Send Slack alert & create step summary"
      if: |
        steps.result.outputs.code == 'failure'
        && github.event.label.name == 'Run CICD'
        && env.SLACK_WEBHOOK != ''
      env:
        SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        REPOSITORY: ${{ github.repository }}
        RUN_ID: ${{ github.run_id }}
        PR_NUMBER: ${{ github.event.number }}
        SERVER_URL: ${{ github.server_url }}
      run: |
        set -x
        pip install PyGithub
        export BRANCH_NAME=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}
        python .github/scripts/notify.py

    # Always runs so the job's own conclusion mirrors the computed result.
    # RESULT is quoted: it is empty when the result step did not complete,
    # and that case must fall through to the failing branch cleanly.
    - name: Exit
      if: ${{ always() }}
      env:
        RESULT: ${{ steps.result.outputs.code }}
      run: |
        if [ "$RESULT" = "success" ]; then
          exit 0
        else
          exit 1
        fi
Coverage:
  # For each coverage flag: downloads the matching `coverage-<flag>-*`
  # artifacts, combines their `.coverage` files, reports the total, and
  # uploads the result to Codecov and as an aggregated artifact.
  runs-on: ubuntu-latest
  needs: [pre-flight, Nemo_CICD_Test]
  # Skipped for docs-only changes and cancelled runs.
  if: |
    needs.pre-flight.outputs.docs_only == 'false'
    && (
      success()
      || needs.Nemo_CICD_Test.result == 'success'
    )
    && !cancelled()
  strategy:
    fail-fast: false
    matrix:
      flag: [unit-test, e2e]
  steps:
    # Only runs on `pull-request/*` branches; its `pr-info` output supplies
    # the base SHA passed to Codecov further down.
    # NOTE(review): `@main` tracks a moving branch — consider pinning to a
    # tag or commit SHA.
    - name: Get PR info
      id: get-pr-info
      if: startsWith(github.ref, 'refs/heads/pull-request/')
      uses: nv-gha-runners/get-pr-info@main

    - name: Checkout
      uses: actions/checkout@v6

    - name: Download coverage reports of current branch
      uses: actions/download-artifact@v7
      with:
        pattern: coverage-${{ matrix.flag }}-*
        path: coverage-downloads

    # Publishes `found=true|false` so the upload steps can be skipped when
    # no artifact matched this flag, instead of uploading nothing.
    - name: Get total coverage of current branch
      id: combine
      shell: bash -x -e -u -o pipefail {0}
      if: always()
      run: |
        pip install 'coverage[toml]'
        ls -al .
        if ! [ -d coverage-downloads ] || [ -z "$(ls -A coverage-downloads)" ]; then
          echo "No coverage artifacts found for ${{ matrix.flag }}, skipping."
          echo "found=false" >> "$GITHUB_OUTPUT"
          exit 0
        fi
        ls -al coverage-downloads/
        coverage combine --keep $(find coverage-downloads -name '.coverage')
        coverage report -i
        coverage xml
        rm -rf coverage-downloads
        ls -al
        echo "found=true" >> "$GITHUB_OUTPUT"

    - name: Upload coverage reports to Codecov
      if: steps.combine.outputs.found == 'true'
      uses: codecov/codecov-action@v5
      with:
        token: ${{ secrets.CODECOV_TOKEN }}
        verbose: true
        flags: ${{ matrix.flag }}
        fail_ci_if_error: true
        # Empty when the PR-info step was skipped.
        base_sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').base.sha }}

    - name: Upload artifacts
      if: steps.combine.outputs.found == 'true'
      uses: actions/upload-artifact@v6
      with:
        name: coverage-${{ matrix.flag }}-aggregated
        path: |
          .coverage
        # .coverage is a dotfile, so hidden files must be included.
        include-hidden-files: true
codecov-placeholder:
  # Docs-only runs skip the Coverage job, so this job reports a check named
  # `codecov/patch` in its place.
  name: codecov/patch
  needs:
    - pre-flight
  if: needs.pre-flight.outputs.docs_only == 'true'
  runs-on: ubuntu-latest
  steps:
    - name: codecov_placeholder
      run: echo "This is a placeholder status check for when no tests are ran but the codecov status is expected"