Commit 6ee7cdf

Enable CI testing (#27)
1 parent 1ca0f1d commit 6ee7cdf

File tree

4 files changed: +60, -1 lines changed

.github/workflows/test_cuda.yml

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
+name: Test CUDA
+
+on:
+  pull_request:
+  push:
+    branches:
+      - main
+      - release/*
+
+concurrency:
+  group: test-cuda-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  test-cuda:
+    name: Test CUDA (cuda12.6-py3.10)
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    strategy:
+      fail-fast: true
+      matrix:
+        include:
+          - name: 4xlargegpu
+            runs-on: linux.g5.4xlarge.nvidia.gpu
+            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126'
+            gpu-arch-type: "cuda"
+            gpu-arch-version: "12.6"
+    with:
+      timeout: 60
+      runner: ${{ matrix.runs-on }}
+      gpu-arch-type: ${{ matrix.gpu-arch-type }}
+      gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      submodules: recursive
+      script: |
+        pip install --quiet -r requirements-test.txt
+        # For some reason the spec above isn't working
+        pip uninstall -y torch
+        pip install --no-input --quiet --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
+        pip install --quiet .
+        pytest tests

autoparallel/compute_estimation.py

Lines changed: 13 additions & 1 deletion
@@ -73,6 +73,18 @@ class DeviceLimit:
             torch.int8: 330,
         },
     ),
+    DeviceLimit(
+        "A10G",
+        "https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a10/pdf/a10-datasheet.pdf",
+        sm=(8, 0),
+        gmem_bandwidth=933 * (1024**3),
+        gemm_tflops={
+            torch.float32: 31.2,
+            torch.float16: 125,
+            torch.bfloat16: 125,
+            torch.int8: 250,
+        },
+    ),
     DeviceLimit(
         "T4",
         "https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/tesla-t4/t4-tensor-core-datasheet-951643.pdf",
@@ -113,7 +125,7 @@ class DeviceLimit:
 def _get_device_tflops(dtype):
     # for some reason the function from PyTorch is giving
     # wildly different TFlops compared to the specs. I'm
-    # using had-coded values for now that I pulled from xFormers
+    # using hard-coded values for now that I pulled from xFormers
     # https://github.com/fairinternal/xformers/blob/main/xformers/profiler/device_limits.py
     # TODO: fix PyTorch's implementation
     # from torch._inductor.utils import get_device_tflops
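For context, the A10G entry above extends the hard-coded spec table that _get_device_tflops uses in place of PyTorch's own estimate: the reported CUDA device name is matched against the table and the peak GEMM TFLOPs for the requested dtype are read off. The sketch below illustrates that idea under simplifying assumptions; the trimmed-down DeviceLimit and the lookup_tflops helper are hypothetical stand-ins for the real code, which keeps more fields (spec URL, SM version, memory bandwidth).

from dataclasses import dataclass, field
from typing import Dict, Tuple

import torch


@dataclass(frozen=True)
class DeviceLimit:
    # Simplified stand-in: only the fields needed for the TFLOPs lookup.
    name: str
    gemm_tflops: Dict[torch.dtype, float] = field(default_factory=dict)


# Values taken from the A10G entry in the diff above.
DEVICE_LIMITS: Tuple[DeviceLimit, ...] = (
    DeviceLimit(
        "A10G",
        {torch.float32: 31.2, torch.float16: 125, torch.bfloat16: 125, torch.int8: 250},
    ),
)


def lookup_tflops(device_name: str, dtype: torch.dtype) -> float:
    # Hypothetical helper: match a reported name (e.g. from
    # torch.cuda.get_device_name()) against the spec table.
    for limit in DEVICE_LIMITS:
        if limit.name in device_name:
            return limit.gemm_tflops[dtype]
    raise KeyError(f"no spec entry for {device_name!r}")


# On an A10G runner this reports the 125 peak bf16 TFLOPs from the datasheet.
print(lookup_tflops("NVIDIA A10G", torch.bfloat16))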

requirements-test.txt

Lines changed: 2 additions & 0 deletions
@@ -1,5 +1,7 @@
 torch >= 2.7.0
+numpy
 pulp
+pytest

 black == 22.3.0
 flake8 == 6.1.0

tests/test_optimize_placement.py

Lines changed: 6 additions & 0 deletions
@@ -3,6 +3,8 @@
 # This source code is licensed under the BSD license found in the
 # LICENSE file in the root directory of this source tree.

+from unittest.mock import patch
+
 import pytest
 import torch
 from torch import nn
@@ -122,6 +124,8 @@ def input_fn():
     return model_fn, input_fn


+@patch("torch.cuda.device_count", lambda: 8)
+@patch("torch.cuda.get_device_name", lambda device: "H100")
 @pytest.mark.parametrize(
     "model_type", ["ffn_with_multiple_input_output", "transformer_block"]
 )
@@ -237,6 +241,8 @@ def test_optimization_finds_fsdp_and_ddp_1d(device_mesh_1d, high_mem, model_type
     ]


+@patch("torch.cuda.device_count", lambda: 8)
+@patch("torch.cuda.get_device_name", lambda device: "H100")
 @pytest.mark.parametrize(
     "model_type,expected_param_placements,expected_node_placements",
     [
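The two @patch decorators added above are what make these placement tests runnable on whichever machine CI provides: torch.cuda.device_count and torch.cuda.get_device_name are replaced with fixed callables, so the code under test always sees an 8-GPU H100 host regardless of the actual runner. A minimal sketch of the same pattern, assuming a hypothetical describe_host function as the code under test:

from unittest.mock import patch

import torch


def describe_host() -> str:
    # Hypothetical code under test: anything that sizes a device mesh or
    # picks per-device specs from the CUDA device count and name.
    return f"{torch.cuda.device_count()}x {torch.cuda.get_device_name(0)}"


# Same idea as the decorators in the diff, written as context managers.
# The replacement lambdas must accept the arguments callers pass in.
with patch("torch.cuda.device_count", lambda: 8), patch(
    "torch.cuda.get_device_name", lambda device: "H100"
):
    assert describe_host() == "8x H100"  # passes even on a CPU-only machine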
