# unit_test.yml
# Workflow display name.
name: Run Unit Tests

# Triggers: pushes to `main` and pull requests that target `main`.
on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  # Builds one wheel per interpreter shipped in the builder image and uploads
  # them as the `resiliency-wheels` artifact consumed by the test job below.
  build_wheels:
    runs-on: ubuntu-24.04
    container:
      image: 'ghcr.io/nvidia/nvidia-resiliency-ext/builder_amd64:cuda-12.4.1-cudnn-devel-ubuntu22.04'
    steps:
      - name: Install git
        run: apt-get update && apt-get install -y git
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # 0 = fetch the full history (all branches and tags).
          # NOTE(review): presumably required by poetry dynamic-versioning - confirm.
          fetch-depth: 0
      - name: Fix Git dubious ownership
        run: git config --global --add safe.directory "$GITHUB_WORKSPACE"
      # Use builder image venvs (Dockerfile.builder: /opt/venv3.10, 3.11, 3.12 with poetry + dynamic-versioning)
      - name: Build wheels
        run: |
          set -e
          cd "$GITHUB_WORKSPACE"
          # One command per line on purpose: under `set -e` a failure on the
          # left-hand side of `&&` does NOT abort the script, so a broken
          # `poetry env use` would silently skip that interpreter's wheel.
          for venv in /opt/venv3.10 /opt/venv3.11 /opt/venv3.12; do
            "$venv/bin/poetry" env use "$venv/bin/python"
            "$venv/bin/poetry" build -f wheel
          done
      - name: Upload the wheel artifact
        uses: actions/upload-artifact@v4
        with:
          name: resiliency-wheels
          path: dist/*.whl

  # Installs the wheel matching each container's Python and runs the unit
  # tests selected by `matrix.test_type`.
  unit_tests_cpu_subset:
    runs-on: ubuntu-24.04
    needs: build_wheels
    strategy:
      matrix:
        container:
          - 'ghcr.io/nvidia/nvidia-resiliency-ext/pytorch:2.5.1-cuda12.4-cudnn9-runtime'
          - 'ghcr.io/nvidia/nvidia-resiliency-ext/pytorch:2.4.1-cuda12.1-cudnn9-runtime'
          - 'ghcr.io/nvidia/nvidia-resiliency-ext/pytorch:2.3.1-cuda12.1-cudnn8-runtime'
        # Only 'fault_tolerance' is enabled; the 'straggler' branch in the
        # "Run unit tests" step runs only if it is added to this list.
        test_type: ['fault_tolerance']
    container:
      image: ${{ matrix.container }}
      env:
        # Fix for "MKL_THREADING_LAYER=INTEL is incompatible with libgomp.so.1 library."
        MKL_SERVICE_FORCE_INTEL: '1'
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Download wheels
        uses: actions/download-artifact@v4
        with:
          name: resiliency-wheels
          path: ./dist/
      - name: Set up environment
        run: |
          pip install pytest
          # e.g. "310" for Python 3.10; used to pick the wheel with the matching cpXY tag.
          PY_VER_NODOT=$(python -c"import sysconfig; print(sysconfig.get_config_var('py_version_nodot'))")
          pip install ./dist/nvidia_resiliency_ext-*-cp${PY_VER_NODOT}-*.whl
      - name: Run unit tests
        shell: bash
        run: |
          if [[ "${{ matrix.test_type }}" == "straggler" ]]; then
            # pytest -k expression listing the straggler tests to run here.
            STRAGGLER_DET_CPU_TESTS_PATTERN="test_all_gather_object_calls_num \
              or test_fail_if_not_initialized \
              or test_individual_gpu_scores_one_rank \
              or test_relative_gpu_scores_ \
              or test_name_mapper_"
            pytest -s -vvv tests/straggler/unit/ -k "${STRAGGLER_DET_CPU_TESTS_PATTERN}"
          elif [[ "${{ matrix.test_type }}" == "fault_tolerance" ]]; then
            pytest -s -vvv ./tests/fault_tolerance/unit/
          else
            echo "Unknown test type: ${{ matrix.test_type }}"
            exit 1
          fi