diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index cc6ab23722..09d24d5ba2 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -201,7 +201,13 @@ jobs:
   test-gpu:
     needs: build-gpu-image
     if: ${{ always() && needs.build-gpu-image.result == 'success' }}
-    runs-on: [self-hosted, gpu-local]
+    # Only one GPU job at a time across all workflow runs — our AWS quota
+    # allows a single GPU instance. Note: GitHub queues at most one pending
+    # job per group; a third arrival cancels the pending one.
+    concurrency:
+      group: gpu-tests
+    runs-on:
+      - "runs-on=${{ github.run_id }}/family=g4dn.xlarge/image=ubuntu24-gpu-x64/spot=true"
     container:
       image: ${{ needs.build-gpu-image.outputs.image }}
       credentials:
@@ -232,7 +238,10 @@ jobs:
   test-cudatoolkit:
     needs: build-gpu-image
     if: ${{ always() && needs.build-gpu-image.result == 'success' }}
-    runs-on: [self-hosted, gpu-local]
+    concurrency:
+      group: gpu-tests
+    runs-on:
+      - "runs-on=${{ github.run_id }}/family=g4dn.xlarge/image=ubuntu24-gpu-x64/spot=true"
     container:
       image: ${{ needs.build-gpu-image.outputs.image }}
       credentials:
@@ -245,6 +254,7 @@ jobs:
       TORCH_TEST: 1
       TORCH_TEST_CUDA: 1
       CUDA: "12.8"
+      TORCH_SKIP_SLOW_TESTS: 1
 
     steps:
       - name: Verify GPU access
diff --git a/tests/testthat/helper-tensor.R b/tests/testthat/helper-tensor.R
index d4803e17ce..2c6b788f08 100644
--- a/tests/testthat/helper-tensor.R
+++ b/tests/testthat/helper-tensor.R
@@ -13,6 +13,12 @@ skip_if_cuda_not_available <- function() {
   }
 }
 
+skip_slow_tests <- function() {
+  if (Sys.getenv("TORCH_SKIP_SLOW_TESTS", "0") == "1") {
+    skip("Skipping slow test (TORCH_SKIP_SLOW_TESTS=1)")
+  }
+}
+
 skip_if_not_m1_mac <- function() {
   if (!grepl("darwin", R.version$os)) {
     skip("Not on MacOS")
diff --git a/tests/testthat/test-autocast.R b/tests/testthat/test-autocast.R
index 05b8097e5b..ab554f845a 100644
--- a/tests/testthat/test-autocast.R
+++ b/tests/testthat/test-autocast.R
@@ -181,8 +181,9 @@ test_that("internal cpp_amp_check works", {
 
 })
 
 test_that("grad scalers work correctly", {
-
+  skip_if_cuda_not_available()
+  skip_slow_tests()
 
   make_model <- function(in_size, out_size, num_layers) {
     layers <- list()
diff --git a/tests/testthat/test-cuda.R b/tests/testthat/test-cuda.R
index a94e89bc24..c793384c6d 100644
--- a/tests/testthat/test-cuda.R
+++ b/tests/testthat/test-cuda.R
@@ -51,11 +51,13 @@ test_that("cuda is really available", {
 
 test_that("cuda memory snapshot works", {
   skip_if_cuda_not_available()
-
+  skip_slow_tests()
+
+  withr::defer(cuda_record_memory_history(enabled = NULL))
   cuda_record_memory_history(enabled = "all", max_entries = 1e3)
   x <- torch_randn(16, device="cuda")
   memory <- cuda_memory_snapshot()
-
+
   expect_true(class(memory) == "raw")
   expect_true(length(memory) > 100)
 })