diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index cc6ab23722..09d24d5ba2 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -201,7 +201,13 @@ jobs:
   test-gpu:
     needs: build-gpu-image
     if: ${{ always() && needs.build-gpu-image.result == 'success' }}
-    runs-on: [self-hosted, gpu-local]
+    # Only one GPU job at a time across all workflow runs — our AWS quota
+    # allows a single GPU instance. Note: GitHub queues at most one pending
+    # job per group; a third arrival cancels the pending one.
+    concurrency:
+      group: gpu-tests
+    runs-on:
+      - "runs-on=${{ github.run_id }}/family=g4dn.xlarge/image=ubuntu24-gpu-x64/spot=true"
     container:
       image: ${{ needs.build-gpu-image.outputs.image }}
       credentials:
@@ -232,7 +238,10 @@ jobs:
   test-cudatoolkit:
     needs: build-gpu-image
     if: ${{ always() && needs.build-gpu-image.result == 'success' }}
-    runs-on: [self-hosted, gpu-local]
+    concurrency:
+      group: gpu-tests
+    runs-on:
+      - "runs-on=${{ github.run_id }}/family=g4dn.xlarge/image=ubuntu24-gpu-x64/spot=true"
     container:
       image: ${{ needs.build-gpu-image.outputs.image }}
       credentials:
@@ -245,6 +254,7 @@ jobs:
       TORCH_TEST: 1
       TORCH_TEST_CUDA: 1
       CUDA: "12.8"
+      TORCH_SKIP_SLOW_TESTS: 1
 
     steps:
       - name: Verify GPU access
diff --git a/tests/testthat/helper-tensor.R b/tests/testthat/helper-tensor.R
index d4803e17ce..2c6b788f08 100644
--- a/tests/testthat/helper-tensor.R
+++ b/tests/testthat/helper-tensor.R
@@ -13,6 +13,12 @@ skip_if_cuda_not_available <- function() {
   }
 }
 
+skip_slow_tests <- function() {
+  if (Sys.getenv("TORCH_SKIP_SLOW_TESTS", "0") == "1") {
+    skip("Skipping slow test (TORCH_SKIP_SLOW_TESTS=1)")
+  }
+}
+
 skip_if_not_m1_mac <- function() {
   if (!grepl("darwin", R.version$os)) {
     skip("Not on MacOS")
diff --git a/tests/testthat/test-autocast.R b/tests/testthat/test-autocast.R
index 05b8097e5b..ab554f845a 100644
--- a/tests/testthat/test-autocast.R
+++ b/tests/testthat/test-autocast.R
@@ -181,8 +181,9 @@ test_that("internal cpp_amp_check works", {
 
 })
 
 test_that("grad scalers work correctly", {
-
+  skip_if_cuda_not_available()
+  skip_slow_tests()
 
   make_model <- function(in_size, out_size, num_layers) {
     layers <- list()
diff --git a/tests/testthat/test-cuda.R b/tests/testthat/test-cuda.R
index a94e89bc24..c793384c6d 100644
--- a/tests/testthat/test-cuda.R
+++ b/tests/testthat/test-cuda.R
@@ -51,11 +51,13 @@ test_that("cuda is really available", {
 
 test_that("cuda memory snapshot works", {
   skip_if_cuda_not_available()
-
+  skip_slow_tests()
+
+  withr::defer(cuda_record_memory_history(enabled = NULL))
   cuda_record_memory_history(enabled = "all", max_entries = 1e3)
   x <- torch_randn(16, device="cuda")
   memory <- cuda_memory_snapshot()
-
+
   expect_true(class(memory) == "raw")
   expect_true(length(memory) > 100)
 })