Commit 6ee7cdf

Enable CI testing (#27)
1 parent 1ca0f1d commit 6ee7cdf

File tree

4 files changed: +60, -1 lines changed

.github/workflows/test_cuda.yml

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
+name: Test CUDA
+
+on:
+  pull_request:
+  push:
+    branches:
+      - main
+      - release/*
+
+concurrency:
+  group: test-cuda-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  test-cuda:
+    name: Test CUDA (cuda12.6-py3.10)
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    strategy:
+      fail-fast: true
+      matrix:
+        include:
+          - name: 4xlargegpu
+            runs-on: linux.g5.4xlarge.nvidia.gpu
+            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126'
+            gpu-arch-type: "cuda"
+            gpu-arch-version: "12.6"
+    with:
+      timeout: 60
+      runner: ${{ matrix.runs-on }}
+      gpu-arch-type: ${{ matrix.gpu-arch-type }}
+      gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      submodules: recursive
+      script: |
+        pip install --quiet -r requirements-test.txt
+        # For some reason the spec above isn't working
+        pip uninstall -y torch
+        pip install --no-input --quiet --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
+        pip install --quiet .
+        pytest tests

autoparallel/compute_estimation.py

Lines changed: 13 additions & 1 deletion
@@ -73,6 +73,18 @@ class DeviceLimit:
             torch.int8: 330,
         },
     ),
+    DeviceLimit(
+        "A10G",
+        "https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a10/pdf/a10-datasheet.pdf",
+        sm=(8, 0),
+        gmem_bandwidth=933 * (1024**3),
+        gemm_tflops={
+            torch.float32: 31.2,
+            torch.float16: 125,
+            torch.bfloat16: 125,
+            torch.int8: 250,
+        },
+    ),
     DeviceLimit(
         "T4",
         "https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/tesla-t4/t4-tensor-core-datasheet-951643.pdf",
@@ -113,7 +125,7 @@ class DeviceLimit:
 def _get_device_tflops(dtype):
     # for some reason the function from PyTorch is giving
     # wildly different TFlops compared to the specs. I'm
-    # using had-coded values for now that I pulled from xFormers
+    # using hard-coded values for now that I pulled from xFormers
     # https://github.com/fairinternal/xformers/blob/main/xformers/profiler/device_limits.py
     # TODO: fix PyTorch's implementation
     # from torch._inductor.utils import get_device_tflops
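For context, the A10G entry above extends the hard-coded spec table that _get_device_tflops uses in place of PyTorch's own estimate: the reported CUDA device name is matched against the table and the peak GEMM TFLOPs for the requested dtype are read off. The sketch below illustrates that idea under simplifying assumptions; the trimmed-down DeviceLimit and the lookup_tflops helper are hypothetical stand-ins for the real code, which keeps more fields (spec URL, SM version, memory bandwidth).

from dataclasses import dataclass, field
from typing import Dict, Tuple

import torch


@dataclass(frozen=True)
class DeviceLimit:
    # Simplified stand-in: only the fields needed for the TFLOPs lookup.
    name: str
    gemm_tflops: Dict[torch.dtype, float] = field(default_factory=dict)


# Values taken from the A10G entry in the diff above.
DEVICE_LIMITS: Tuple[DeviceLimit, ...] = (
    DeviceLimit(
        "A10G",
        {torch.float32: 31.2, torch.float16: 125, torch.bfloat16: 125, torch.int8: 250},
    ),
)


def lookup_tflops(device_name: str, dtype: torch.dtype) -> float:
    # Hypothetical helper: match a reported name (e.g. from
    # torch.cuda.get_device_name()) against the spec table.
    for limit in DEVICE_LIMITS:
        if limit.name in device_name:
            return limit.gemm_tflops[dtype]
    raise KeyError(f"no spec entry for {device_name!r}")


# On an A10G runner this reports the 125 peak bf16 TFLOPs from the datasheet.
print(lookup_tflops("NVIDIA A10G", torch.bfloat16))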

requirements-test.txt

Lines changed: 2 additions & 0 deletions
@@ -1,5 +1,7 @@
 torch >= 2.7.0
+numpy
 pulp
+pytest

 black == 22.3.0
 flake8 == 6.1.0

tests/test_optimize_placement.py

Lines changed: 6 additions & 0 deletions
@@ -3,6 +3,8 @@
 # This source code is licensed under the BSD license found in the
 # LICENSE file in the root directory of this source tree.

+from unittest.mock import patch
+
 import pytest
 import torch
 from torch import nn
@@ -122,6 +124,8 @@ def input_fn():
     return model_fn, input_fn


+@patch("torch.cuda.device_count", lambda: 8)
+@patch("torch.cuda.get_device_name", lambda device: "H100")
 @pytest.mark.parametrize(
     "model_type", ["ffn_with_multiple_input_output", "transformer_block"]
 )
@@ -237,6 +241,8 @@ def test_optimization_finds_fsdp_and_ddp_1d(device_mesh_1d, high_mem, model_type
     ]


+@patch("torch.cuda.device_count", lambda: 8)
+@patch("torch.cuda.get_device_name", lambda device: "H100")
 @pytest.mark.parametrize(
     "model_type,expected_param_placements,expected_node_placements",
     [
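The two @patch decorators added above are what make these placement tests runnable on whichever machine CI provides: torch.cuda.device_count and torch.cuda.get_device_name are replaced with fixed callables, so the code under test always sees an 8-GPU H100 host regardless of the actual runner. A minimal sketch of the same pattern, assuming a hypothetical describe_host function as the code under test:

from unittest.mock import patch

import torch


def describe_host() -> str:
    # Hypothetical code under test: anything that sizes a device mesh or
    # picks per-device specs from the CUDA device count and name.
    return f"{torch.cuda.device_count()}x {torch.cuda.get_device_name(0)}"


# Same idea as the decorators in the diff, written as context managers.
# The replacement lambdas must accept the arguments callers pass in.
with patch("torch.cuda.device_count", lambda: 8), patch(
    "torch.cuda.get_device_name", lambda device: "H100"
):
    assert describe_host() == "8x H100"  # passes even on a CPU-only machine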
