diff --git a/backends/aoti/slim/c10/cuda/Exception.h b/backends/aoti/slim/c10/cuda/Exception.h
new file mode 100644
index 00000000000..33d8414e661
--- /dev/null
+++ b/backends/aoti/slim/c10/cuda/Exception.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#ifdef CUDA_AVAILABLE
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <executorch/backends/aoti/slim/c10/macros/Macros.h>
+#include <executorch/runtime/platform/assert.h>
+#include <executorch/runtime/platform/log.h>
+
+/// Checks a CUDA expression and aborts on error.
+/// @param EXPR The CUDA expression to check.
+#define ET_CUDA_CHECK(EXPR)                                                 \
+  do {                                                                      \
+    const cudaError_t __err = EXPR;                                         \
+    ET_CHECK_MSG(                                                           \
+        __err == cudaSuccess, "CUDA error: %s", cudaGetErrorString(__err)); \
+  } while (0)
+
+/// Checks a CUDA expression and logs a warning on error (non-fatal).
+/// @param EXPR The CUDA expression to check.
+#define ET_CUDA_LOG_WARN(EXPR)                                      \
+  do {                                                              \
+    const cudaError_t __err = EXPR;                                 \
+    if (SLIMTENSOR_UNLIKELY(__err != cudaSuccess)) {                \
+      [[maybe_unused]] auto error_unused = cudaGetLastError();      \
+      ET_LOG(Error, "CUDA warning: %s", cudaGetErrorString(__err)); \
+    }                                                               \
+  } while (0)
+
+#endif // CUDA_AVAILABLE
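The two macros split error handling by severity: `ET_CUDA_CHECK` aborts for failures with no recovery path, while `ET_CUDA_LOG_WARN` logs and continues, which is why the storage code below uses it on the deallocation path. A minimal usage sketch (not part of the diff; the include path mirrors the reconstructed file above):

```cpp
#include <cstddef>

#include <executorch/backends/aoti/slim/c10/cuda/Exception.h>

void staged_copy(void* dst, const void* src, size_t nbytes, cudaStream_t stream) {
  // Fatal on failure: an allocation error leaves nothing to recover.
  void* staging = nullptr;
  ET_CUDA_CHECK(cudaMallocAsync(&staging, nbytes, stream));

  // Non-fatal: the macro logs the failure, and its internal cudaGetLastError()
  // call clears CUDA's sticky error state so later calls are not poisoned.
  ET_CUDA_LOG_WARN(
      cudaMemcpyAsync(staging, src, nbytes, cudaMemcpyDeviceToDevice, stream));
  ET_CUDA_LOG_WARN(
      cudaMemcpyAsync(dst, staging, nbytes, cudaMemcpyDeviceToDevice, stream));
  ET_CUDA_LOG_WARN(cudaFreeAsync(staging, stream));
}
```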
diff --git a/backends/aoti/slim/c10/cuda/TARGETS b/backends/aoti/slim/c10/cuda/TARGETS
new file mode 100644
index 00000000000..08e83a5f3c4
--- /dev/null
+++ b/backends/aoti/slim/c10/cuda/TARGETS
@@ -0,0 +1,6 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+load(":targets.bzl", "define_common_targets")
+
+oncall("executorch")
+
+define_common_targets()
diff --git a/backends/aoti/slim/c10/cuda/targets.bzl b/backends/aoti/slim/c10/cuda/targets.bzl
new file mode 100644
index 00000000000..1d44bd1f032
--- /dev/null
+++ b/backends/aoti/slim/c10/cuda/targets.bzl
@@ -0,0 +1,16 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def define_common_targets():
+    """Define targets for the SlimTensor CUDA exception handling module."""
+
+    runtime.cxx_library(
+        name = "exception",
+        exported_headers = [
+            "Exception.h",
+        ],
+        visibility = ["@EXECUTORCH_CLIENTS"],
+        exported_deps = [
+            "//executorch/backends/aoti/slim/c10/macros:macros",
+            "//executorch/runtime/platform:platform",
+        ],
+    )
diff --git a/backends/aoti/slim/core/Storage.h b/backends/aoti/slim/core/Storage.h
index d122e86c1d4..6718f04cb51 100644
--- a/backends/aoti/slim/core/Storage.h
+++ b/backends/aoti/slim/core/Storage.h
@@ -10,12 +10,18 @@
 
 #include
 
+#ifdef CUDA_AVAILABLE
+#include <executorch/backends/aoti/slim/c10/cuda/Exception.h>
+#include <executorch/backends/cuda/runtime/guard.h>
+#endif
+
 #include
 #include
 #include
 #include
 #include
 #include
+#include
 
 namespace executorch::backends::aoti::slim {
 
@@ -30,6 +36,10 @@ inline void noop(void*) {}
 /// Default CPU device constant.
 inline const c10::Device CPU_DEVICE = c10::Device(c10::DeviceType::CPU, 0);
 
+/// Default CUDA device constant.
+inline const c10::Device DEFAULT_CUDA_DEVICE =
+    c10::Device(c10::DeviceType::CUDA, 0);
+
 /// DeviceTraits template for device-specific operations.
 /// Device-specific implementations provide allocate(), free(), and memcpy().
 template <c10::DeviceType DeviceTypeValue>
@@ -74,6 +84,119 @@ struct DeviceTraits<c10::DeviceType::CPU> {
   }
 };
 
+#ifdef CUDA_AVAILABLE
+/// CUDA specialization of DeviceTraits.
+/// Provides CUDA memory allocation and copy operations using
+/// cudaMallocAsync/cudaFreeAsync with proper stream handling.
+///
+/// IMPORTANT: Callers are expected to set the correct CUDA device and stream
+/// using CUDAStreamGuard before calling these methods. This is consistent
+/// with PyTorch's CUDACachingAllocator design pattern, where the allocator
+/// assumes the caller has already set the correct device context.
+template <>
+struct DeviceTraits<c10::DeviceType::CUDA> {
+  /// Allocates CUDA device memory on the current stream.
+  /// Uses cudaMallocAsync for asynchronous allocation on the stream
+  /// that is currently set via CUDAStreamGuard, similar to how
+  /// PyTorch's CUDACachingAllocator works.
+  ///
+  /// NOTE: The caller must ensure the correct device is already set via
+  /// CUDAStreamGuard. This function does NOT create a device guard internally.
+  ///
+  /// @param nbytes Number of bytes to allocate.
+  /// @param device The target CUDA device (used to get the stream).
+  /// @return Pointer to allocated device memory.
+  static void* allocate(size_t nbytes, const c10::Device& device) {
+    // Get the current stream for this device (set by CUDAStreamGuard, if any).
+    // This follows PyTorch's pattern, where the allocator assumes the caller
+    // has already set the correct device via CUDAStreamGuard.
+    auto stream_result =
+        executorch::backends::cuda::getCurrentCUDAStream(device.index());
+    ET_CHECK_MSG(
+        stream_result.ok(),
+        "Failed to get current CUDA stream for device %d",
+        static_cast<int>(device.index()));
+
+    cudaStream_t stream = stream_result.get();
+    void* data = nullptr;
+    ET_CUDA_CHECK(cudaMallocAsync(&data, nbytes, stream));
+    return data;
+  }
+
+  /// Frees CUDA device memory on the current stream.
+  /// @param ptr Pointer to device memory to free.
+  static void free(void* ptr) {
+    // Get the current stream for the current device.
+    auto stream_result = executorch::backends::cuda::getCurrentCUDAStream(-1);
+    if (stream_result.ok()) {
+      ET_CUDA_LOG_WARN(cudaFreeAsync(ptr, stream_result.get()));
+    } else {
+      // Fall back to a synchronous free if we can't get the stream.
+      ET_CUDA_LOG_WARN(cudaFree(ptr));
+    }
+  }
+
+  /// Copies memory between CPU and CUDA, or CUDA to CUDA on the same device.
+  /// @param dst Destination pointer.
+  /// @param src Source pointer.
+  /// @param nbytes Number of bytes to copy.
+  /// @param dst_device Destination device.
+  /// @param src_device Source device.
+  static void memcpy(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      const c10::Device& dst_device,
+      const c10::Device& src_device) {
+    cudaMemcpyKind direction = cudaMemcpyDeviceToDevice;
+
+    if (src_device.is_cpu()) {
+      direction = cudaMemcpyHostToDevice;
+    } else if (dst_device.is_cpu()) {
+      direction = cudaMemcpyDeviceToHost;
+    } else {
+      ET_CHECK_MSG(
+          src_device.index() == dst_device.index(),
+          "CUDA memcpy across different device indices not supported: %d != %d",
+          static_cast<int>(src_device.index()),
+          static_cast<int>(dst_device.index()));
+    }
+
+    ET_CUDA_CHECK(cudaMemcpy(dst, src, nbytes, direction));
+  }
+};
+#else
+/// CUDA stub used when CUDA_AVAILABLE is not defined.
+/// All operations abort with an error message.
+template <>
+struct DeviceTraits<c10::DeviceType::CUDA> {
+  static void* allocate(size_t nbytes, const c10::Device& device) {
+    (void)nbytes;
+    (void)device;
+    ET_CHECK_MSG(false, "Build with CUDA_AVAILABLE=1 to enable CUDA support");
+  }
+
+  static void free(void* ptr) {
+    (void)ptr;
+    ET_LOG(Error, "Build with CUDA_AVAILABLE=1 to enable CUDA support");
+  }
+
+  static void memcpy(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      const c10::Device& dst_device,
+      const c10::Device& src_device) {
+    (void)dst;
+    (void)src;
+    (void)nbytes;
+    (void)dst_device;
+    (void)src_device;
+    ET_CHECK_MSG(false, "Build with CUDA_AVAILABLE=1 to enable CUDA support");
+  }
+};
+#endif // CUDA_AVAILABLE
+
 /**
  * MaybeOwningStorage - A storage class that manages tensor data memory.
  *
@@ -93,17 +216,19 @@ struct DeviceTraits<c10::DeviceType::CPU> {
 class MaybeOwningStorage {
  public:
   /// Constructs owning storage with allocated memory.
-  /// @param device The device for storage (must be CPU).
+  /// @param device The device for storage (CPU or CUDA).
   /// @param nbytes Number of bytes to allocate.
   MaybeOwningStorage(const c10::Device& device, size_t nbytes)
       : device_(device), capacity_(nbytes), is_owning_(true) {
-    ET_CHECK_MSG(
-        device.is_cpu(),
-        "Only CPU device is currently supported, got: %s",
-        device.str().c_str());
-
-    data_ = DeviceTraits<c10::DeviceType::CPU>::allocate(nbytes, device);
-    deleter_ = DeviceTraits<c10::DeviceType::CPU>::free;
+    if (device.is_cpu()) {
+      data_ = DeviceTraits<c10::DeviceType::CPU>::allocate(nbytes, device);
+      deleter_ = DeviceTraits<c10::DeviceType::CPU>::free;
+    } else if (device.is_cuda()) {
+      data_ = DeviceTraits<c10::DeviceType::CUDA>::allocate(nbytes, device);
+      deleter_ = DeviceTraits<c10::DeviceType::CUDA>::free;
+    } else {
+      ET_CHECK_MSG(false, "Unsupported device type: %s", device.str().c_str());
+    }
   }
 
   /// Default constructor is deleted - storage must have a device.
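To make the dispatch concrete, here is a small round-trip sketch built only from APIs visible in this file (illustrative, not part of the diff). It assumes the caller has already established the device/stream context, e.g. via the `CUDAStreamGuard` referenced in the comments; that guard's actual interface lives in `backends/cuda/runtime/guard.h` and is not shown here.

```cpp
#include <cstddef>
#include <vector>

using namespace executorch::backends::aoti::slim;

void cuda_storage_roundtrip() {
  constexpr size_t kNbytes = 256 * sizeof(float);
  std::vector<float> host(256, 1.5f);

  // Dispatches to DeviceTraits<c10::DeviceType::CUDA>::allocate(), which
  // calls cudaMallocAsync on the current stream of device 0.
  MaybeOwningStorage storage(DEFAULT_CUDA_DEVICE, kNbytes);

  // The CUDA traits pick cudaMemcpyHostToDevice because src_device is CPU.
  DeviceTraits<c10::DeviceType::CUDA>::memcpy(
      storage.data(), host.data(), kNbytes, DEFAULT_CUDA_DEVICE, CPU_DEVICE);
}  // owning storage frees via cudaFreeAsync (cudaFree if no stream available)
```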
diff --git a/backends/aoti/slim/core/targets.bzl b/backends/aoti/slim/core/targets.bzl
index 2056b8c6866..0fc898c5598 100644
--- a/backends/aoti/slim/core/targets.bzl
+++ b/backends/aoti/slim/core/targets.bzl
@@ -17,10 +17,12 @@ def define_common_targets():
             "//executorch/backends/aoti/slim/util:shared_ptr",
             "//executorch/backends/aoti/slim/util:size_util",
             "//executorch/runtime/platform:platform",
+            "//executorch/backends/aoti/slim/c10/cuda:exception",
+            "//executorch/backends/cuda/runtime:guard",
         ],
     )
 
-    # Header-only library for SlimTensor
+    # Header-only library for SlimTensor (CPU-only for now)
    runtime.cxx_library(
        name = "slimtensor",
        headers = [
diff --git a/backends/aoti/slim/core/test/targets.bzl b/backends/aoti/slim/core/test/targets.bzl
index c7debd46836..3a7e99dd37c 100644
--- a/backends/aoti/slim/core/test/targets.bzl
+++ b/backends/aoti/slim/core/test/targets.bzl
@@ -1,17 +1,36 @@
+load("@fbcode_macros//build_defs/lib:re_test_utils.bzl", "re_test_utils")
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 
+def get_backend_mode():
+    """Get the supported backend modes of SlimTensor."""
+    return ["cuda", "cpu"]
+
 def define_common_targets():
     """Define test targets for SlimTensor core module."""
 
-    runtime.cxx_test(
-        name = "test_storage",
-        srcs = [
-            "test_storage.cpp",
-        ],
-        deps = [
-            "//executorch/backends/aoti/slim/core:storage",
-        ],
-    )
+    # One storage test per backend mode; the CUDA variant builds with CUDA
+    # support enabled and runs on GPU remote execution.
+    for backend_mode in get_backend_mode():
+        backend_suffix = "_" + backend_mode if backend_mode == "cuda" else ""
+
+        backend_kwargs = {
+            "external_deps": [("cuda", None, "cuda-lazy")],
+            "preprocessor_flags": ["-DCUDA_AVAILABLE=1"],
+            "keep_gpu_sections": True,
+            "remote_execution": re_test_utils.remote_execution(
+                platform = "gpu-remote-execution",
+            ),
+        } if backend_mode == "cuda" else {}
+
+        runtime.cxx_test(
+            name = "test_storage" + backend_suffix,
+            srcs = [
+                "test_storage.cpp",
+            ],
+            deps = [
+                "//executorch/backends/aoti/slim/core:storage",
+            ],
+            **backend_kwargs
+        )
 
     runtime.cxx_test(
         name = "test_slimtensor_basic",
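The loop above compiles the same `test_storage.cpp` twice: as `test_storage` (CPU only) and as `test_storage_cuda`, which adds `-DCUDA_AVAILABLE=1`, links CUDA lazily, and routes execution to GPU remote execution. A minimal illustration of what that flag changes in the source, assuming nothing else differs between the two variants:

```cpp
#include <cstddef>

// What -DCUDA_AVAILABLE=1 toggles: the same source produces a longer device
// list, so the parameterized suites in test_storage.cpp gain CUDA instances.
inline size_t expectedTestDeviceCount() {
#ifdef CUDA_AVAILABLE
  return 2;  // {CPU_DEVICE, DEFAULT_CUDA_DEVICE}
#else
  return 1;  // {CPU_DEVICE}
#endif
}
```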
diff --git a/backends/aoti/slim/core/test/test_storage.cpp b/backends/aoti/slim/core/test/test_storage.cpp
index 42a8678c888..5ff3d6620be 100644
--- a/backends/aoti/slim/core/test/test_storage.cpp
+++ b/backends/aoti/slim/core/test/test_storage.cpp
@@ -10,8 +10,29 @@
 
 #include
 
+#ifdef CUDA_AVAILABLE
+#include
+#endif
+
 namespace executorch::backends::aoti::slim {
 
+// =============================================================================
+// Test Device Helpers
+// =============================================================================
+
+inline std::vector<c10::Device> getTestDevices() {
+  std::vector<c10::Device> devices = {CPU_DEVICE};
+#ifdef CUDA_AVAILABLE
+  devices.push_back(DEFAULT_CUDA_DEVICE);
+#endif
+  return devices;
+}
+
+inline std::string deviceToString(
+    const testing::TestParamInfo<c10::Device>& info) {
+  return info.param.is_cpu() ? "CPU" : "CUDA";
+}
+
 // =============================================================================
 // DeviceTraits Tests
 // =============================================================================
@@ -52,48 +73,39 @@ TEST(DeviceTraitsCPUTest, MemcpyCPUToCPU) {
 }
 
 // =============================================================================
-// MaybeOwningStorage Tests - Owning Mode
+// MaybeOwningStorage Parameterized Tests (CPU and CUDA)
 // =============================================================================
 
-TEST(MaybeOwningStorageTest, ConstructOwning) {
+class MaybeOwningStorageParamTest
+    : public testing::TestWithParam<c10::Device> {
+ protected:
+  c10::Device device() const {
+    return GetParam();
+  }
+};
+
+TEST_P(MaybeOwningStorageParamTest, ConstructOwning) {
   constexpr size_t kNbytes = 512;
-  MaybeOwningStorage storage(CPU_DEVICE, kNbytes);
+  MaybeOwningStorage storage(device(), kNbytes);
 
   EXPECT_NE(storage.data(), nullptr);
   EXPECT_EQ(storage.nbytes(), kNbytes);
-  EXPECT_TRUE(storage.device().is_cpu());
+  EXPECT_EQ(storage.device().type(), device().type());
   EXPECT_TRUE(storage.is_owning());
   EXPECT_TRUE(storage.is_resizable());
 }
 
-TEST(MaybeOwningStorageTest, ConstructOwningZeroBytes) {
-  MaybeOwningStorage storage(CPU_DEVICE, 0);
+TEST_P(MaybeOwningStorageParamTest, ConstructOwningZeroBytes) {
+  MaybeOwningStorage storage(device(), 0);
 
   EXPECT_EQ(storage.data(), nullptr);
   EXPECT_EQ(storage.nbytes(), 0);
-  EXPECT_TRUE(storage.device().is_cpu());
+  EXPECT_EQ(storage.device().type(), device().type());
   EXPECT_TRUE(storage.is_owning());
 }
 
-TEST(MaybeOwningStorageTest, DataPersistence) {
-  constexpr size_t kNumFloats = 64;
-  constexpr size_t kNbytes = kNumFloats * sizeof(float);
-  MaybeOwningStorage storage(CPU_DEVICE, kNbytes);
-
-  float* data = static_cast<float*>(storage.data());
-  for (size_t i = 0; i < kNumFloats; ++i) {
-    data[i] = static_cast<float>(i) * 2.0f;
-  }
-
-  float* read_data = static_cast<float*>(storage.data());
-  for (size_t i = 0; i < kNumFloats; ++i) {
-    EXPECT_FLOAT_EQ(read_data[i], static_cast<float>(i) * 2.0f);
-  }
-}
-
-TEST(MaybeOwningStorageTest, MoveConstruct) {
+TEST_P(MaybeOwningStorageParamTest, MoveConstruct) {
   constexpr size_t kNbytes = 256;
-  MaybeOwningStorage original(CPU_DEVICE, kNbytes);
+  MaybeOwningStorage original(device(), kNbytes);
   void* original_data = original.data();
 
   MaybeOwningStorage moved(std::move(original));
@@ -101,17 +113,18 @@
   EXPECT_EQ(moved.data(), original_data);
   EXPECT_EQ(moved.nbytes(), kNbytes);
   EXPECT_TRUE(moved.is_owning());
+  EXPECT_EQ(moved.device().type(), device().type());
 
   EXPECT_EQ(original.data(), nullptr);
   EXPECT_EQ(original.nbytes(), 0);
   EXPECT_FALSE(original.is_owning());
 }
 
-TEST(MaybeOwningStorageTest, MoveAssign) {
+TEST_P(MaybeOwningStorageParamTest, MoveAssign) {
   constexpr size_t kNbytes1 = 256;
   constexpr size_t kNbytes2 = 512;
-  MaybeOwningStorage storage1(CPU_DEVICE, kNbytes1);
-  MaybeOwningStorage storage2(CPU_DEVICE, kNbytes2);
+  MaybeOwningStorage storage1(device(), kNbytes1);
+  MaybeOwningStorage storage2(device(), kNbytes2);
   void* storage2_data = storage2.data();
 
   storage1 = std::move(storage2);
@@ -125,7 +138,33 @@ TEST(MaybeOwningStorageTest, MoveAssign) {
   EXPECT_FALSE(storage2.is_owning());
 }
 
-TEST(MaybeOwningStorageTest, Clone) {
+INSTANTIATE_TEST_SUITE_P(
+    DeviceTests,
+    MaybeOwningStorageParamTest,
+    testing::ValuesIn(getTestDevices()),
+    deviceToString);
+
+// =============================================================================
+// MaybeOwningStorage CPU-Only Tests (require direct data access)
+// =============================================================================
+
+TEST(MaybeOwningStorageCPUTest, DataPersistence) {
+  constexpr size_t kNumFloats = 64;
+  constexpr size_t kNbytes = kNumFloats * sizeof(float);
+  MaybeOwningStorage storage(CPU_DEVICE, kNbytes);
+
+  float* data = static_cast<float*>(storage.data());
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    data[i] = static_cast<float>(i) * 2.0f;
+  }
+
+  float* read_data = static_cast<float*>(storage.data());
+  for (size_t i = 0; i < kNumFloats; ++i) {
+    EXPECT_FLOAT_EQ(read_data[i], static_cast<float>(i) * 2.0f);
+  }
+}
+
+TEST(MaybeOwningStorageCPUTest, Clone) {
   constexpr size_t kNumFloats = 32;
   constexpr size_t kNbytes = kNumFloats * sizeof(float);
   MaybeOwningStorage original(CPU_DEVICE, kNbytes);
@@ -150,7 +189,7 @@ TEST(MaybeOwningStorageTest, Clone) {
   EXPECT_FLOAT_EQ(cloned_data[0], 0.0f);
 }
 
-TEST(MaybeOwningStorageTest, CopyFunction) {
+TEST(MaybeOwningStorageCPUTest, CopyFunction) {
   constexpr size_t kNumFloats = 16;
   constexpr size_t kNbytes = kNumFloats * sizeof(float);
   MaybeOwningStorage src_storage(CPU_DEVICE, kNbytes);
@@ -171,23 +210,30 @@
 }
 
 // =============================================================================
-// Storage (SharedPtr) Tests
+// Storage (SharedPtr) Parameterized Tests
 // =============================================================================
 
-TEST(StorageSharedPtrTest, BasicUsage) {
+class StorageSharedPtrParamTest : public testing::TestWithParam<c10::Device> {
+ protected:
+  c10::Device device() const {
+    return GetParam();
+  }
+};
+
+TEST_P(StorageSharedPtrParamTest, BasicUsage) {
   constexpr size_t kNbytes = 128;
-  Storage storage(new MaybeOwningStorage(CPU_DEVICE, kNbytes));
+  Storage storage(new MaybeOwningStorage(device(), kNbytes));
 
   EXPECT_NE(storage.get(), nullptr);
   EXPECT_NE(storage->data(), nullptr);
   EXPECT_EQ(storage->nbytes(), kNbytes);
-  EXPECT_TRUE(storage->device().is_cpu());
+  EXPECT_EQ(storage->device().type(), device().type());
   EXPECT_EQ(storage.use_count(), 1);
 }
 
-TEST(StorageSharedPtrTest, SharedOwnership) {
+TEST_P(StorageSharedPtrParamTest, SharedOwnership) {
   constexpr size_t kNbytes = 128;
-  Storage storage1(new MaybeOwningStorage(CPU_DEVICE, kNbytes));
+  Storage storage1(new MaybeOwningStorage(device(), kNbytes));
   void* data_ptr = storage1->data();
 
   Storage storage2 = storage1; // Copy, not reference - increments ref count
@@ -198,7 +244,52 @@ TEST(StorageSharedPtrTest, SharedOwnership) {
   EXPECT_EQ(storage2->data(), data_ptr);
 }
 
-TEST(StorageSharedPtrTest, SharedOwnershipModification) {
+TEST_P(StorageSharedPtrParamTest, ReferenceCountDecrement) {
+  constexpr size_t kNbytes = 64;
+  Storage storage1(new MaybeOwningStorage(device(), kNbytes));
+  EXPECT_EQ(storage1.use_count(), 1);
+
+  {
+    Storage storage2 = storage1;
+    EXPECT_EQ(storage1.use_count(), 2);
+  }
+
+  EXPECT_EQ(storage1.use_count(), 1);
+}
+
+TEST_P(StorageSharedPtrParamTest, MoveSemantics) {
+  constexpr size_t kNbytes = 64;
+  Storage storage1(new MaybeOwningStorage(device(), kNbytes));
+  void* data_ptr = storage1->data();
+
+  Storage storage2 = std::move(storage1);
+
+  EXPECT_EQ(storage1.get(), nullptr);
+  EXPECT_EQ(storage2->data(), data_ptr);
+  EXPECT_EQ(storage2.use_count(), 1);
+}
+
+TEST_P(StorageSharedPtrParamTest, MakeShared) {
+  constexpr size_t kNbytes = 256;
+  Storage storage = make_shared<MaybeOwningStorage>(device(), kNbytes);
+
+  EXPECT_NE(storage.get(), nullptr);
+  EXPECT_NE(storage->data(), nullptr);
+  EXPECT_EQ(storage->nbytes(), kNbytes);
+  EXPECT_EQ(storage.use_count(), 1);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    DeviceTests,
+    StorageSharedPtrParamTest,
+    testing::ValuesIn(getTestDevices()),
+    deviceToString);
+
+// =============================================================================
+// Storage CPU-Only Tests (require direct data access)
+// =============================================================================
+
+TEST(StorageSharedPtrCPUTest, SharedOwnershipModification) {
   constexpr size_t kNumFloats = 8;
   constexpr size_t kNbytes = kNumFloats * sizeof(float);
   Storage storage1(new MaybeOwningStorage(CPU_DEVICE, kNbytes));
@@ -208,7 +299,7 @@ TEST(StorageSharedPtrTest, SharedOwnershipModification) {
     data[i] = 0.0f;
   }
 
-  const Storage& storage2 = storage1;
+  Storage storage2 = storage1;
 
   float* data2 = static_cast<float*>(storage2->data());
 
   for (size_t i = 0; i < kNumFloats; ++i) {
@@ -221,39 +312,156 @@ TEST(StorageSharedPtrTest, SharedOwnershipModification) {
   }
 }
 
-TEST(StorageSharedPtrTest, ReferenceCountDecrement) {
-  constexpr size_t kNbytes = 64;
-  Storage storage1(new MaybeOwningStorage(CPU_DEVICE, kNbytes));
-  EXPECT_EQ(storage1.use_count(), 1);
+#ifdef CUDA_AVAILABLE
 
-  {
-    Storage storage2 = storage1; // Copy increments ref count
-    EXPECT_EQ(storage1.use_count(), 2);
-  } // storage2 destroyed, ref count decrements
+// =============================================================================
+// DeviceTraits Tests
+// =============================================================================
 
-  EXPECT_EQ(storage1.use_count(), 1);
+TEST(DeviceTraitsCUDATest, AllocateAndFree) {
+  constexpr size_t kSize = 1024;
+  void* ptr =
+      DeviceTraits<c10::DeviceType::CUDA>::allocate(kSize, DEFAULT_CUDA_DEVICE);
+  ASSERT_NE(ptr, nullptr);
+
+  DeviceTraits<c10::DeviceType::CUDA>::free(ptr);
 }
 
-TEST(StorageSharedPtrTest, MoveSemantics) {
-  constexpr size_t kNbytes = 64;
-  Storage storage1(new MaybeOwningStorage(CPU_DEVICE, kNbytes));
-  void* data_ptr = storage1->data();
+TEST(DeviceTraitsCUDATest, AllocateZeroBytes) {
+  void* ptr =
+      DeviceTraits<c10::DeviceType::CUDA>::allocate(0, DEFAULT_CUDA_DEVICE);
+  DeviceTraits<c10::DeviceType::CUDA>::free(ptr);
+}
 
-  Storage storage2 = std::move(storage1);
+TEST(DeviceTraitsCUDATest, MemcpyCPUToCUDA) {
+  constexpr size_t kSize = 256;
+  float* cpu_src = static_cast<float*>(
+      DeviceTraits<c10::DeviceType::CPU>::allocate(kSize * sizeof(float)));
+  float* cuda_dst =
+      static_cast<float*>(DeviceTraits<c10::DeviceType::CUDA>::allocate(
+          kSize * sizeof(float), DEFAULT_CUDA_DEVICE));
+  float* cpu_verify = static_cast<float*>(
+      DeviceTraits<c10::DeviceType::CPU>::allocate(kSize * sizeof(float)));
 
-  EXPECT_EQ(storage1.get(), nullptr);
-  EXPECT_EQ(storage2->data(), data_ptr);
-  EXPECT_EQ(storage2.use_count(), 1);
+  for (size_t i = 0; i < kSize; ++i) {
+    cpu_src[i] = static_cast<float>(i) * 2.5f;
+  }
+
+  // Copy CPU -> CUDA
+  DeviceTraits<c10::DeviceType::CUDA>::memcpy(
+      cuda_dst,
+      cpu_src,
+      kSize * sizeof(float),
+      DEFAULT_CUDA_DEVICE,
+      CPU_DEVICE);
+
+  // Copy CUDA -> CPU to verify
+  DeviceTraits<c10::DeviceType::CUDA>::memcpy(
+      cpu_verify,
+      cuda_dst,
+      kSize * sizeof(float),
+      CPU_DEVICE,
+      DEFAULT_CUDA_DEVICE);
+
+  for (size_t i = 0; i < kSize; ++i) {
+    EXPECT_FLOAT_EQ(cpu_verify[i], static_cast<float>(i) * 2.5f);
+  }
+
+  DeviceTraits<c10::DeviceType::CPU>::free(cpu_src);
+  DeviceTraits<c10::DeviceType::CUDA>::free(cuda_dst);
+  DeviceTraits<c10::DeviceType::CPU>::free(cpu_verify);
 }
 
-TEST(StorageSharedPtrTest, MakeShared) {
-  constexpr size_t kNbytes = 256;
-  Storage storage = make_shared<MaybeOwningStorage>(CPU_DEVICE, kNbytes);
+TEST(DeviceTraitsCUDATest, MemcpyCUDAToCPU) {
+  constexpr size_t kSize = 128;
+  float* cpu_src = static_cast<float*>(
+      DeviceTraits<c10::DeviceType::CPU>::allocate(kSize * sizeof(float)));
+  float* cuda_mem =
+      static_cast<float*>(DeviceTraits<c10::DeviceType::CUDA>::allocate(
+          kSize * sizeof(float), DEFAULT_CUDA_DEVICE));
+  float* cpu_dst = static_cast<float*>(
+      DeviceTraits<c10::DeviceType::CPU>::allocate(kSize * sizeof(float)));
 
-  EXPECT_NE(storage.get(), nullptr);
-  EXPECT_NE(storage->data(), nullptr);
-  EXPECT_EQ(storage->nbytes(), kNbytes);
-  EXPECT_EQ(storage.use_count(), 1);
+  for (size_t i = 0; i < kSize; ++i) {
+    cpu_src[i] = static_cast<float>(i) + 100.0f;
+  }
+
+  // Copy CPU -> CUDA
+  DeviceTraits<c10::DeviceType::CUDA>::memcpy(
+      cuda_mem,
+      cpu_src,
+      kSize * sizeof(float),
+      DEFAULT_CUDA_DEVICE,
+      CPU_DEVICE);
+
+  // Copy CUDA -> CPU
+  DeviceTraits<c10::DeviceType::CUDA>::memcpy(
+      cpu_dst,
+      cuda_mem,
+      kSize * sizeof(float),
+      CPU_DEVICE,
+      DEFAULT_CUDA_DEVICE);
+
+  for (size_t i = 0; i < kSize; ++i) {
+    EXPECT_FLOAT_EQ(cpu_dst[i], static_cast<float>(i) + 100.0f);
+  }
+
+  DeviceTraits<c10::DeviceType::CPU>::free(cpu_src);
+  DeviceTraits<c10::DeviceType::CUDA>::free(cuda_mem);
+  DeviceTraits<c10::DeviceType::CPU>::free(cpu_dst);
 }
 
+TEST(DeviceTraitsCUDATest, MemcpyCUDAToCUDA) {
+  constexpr size_t kSize = 64;
+  float* cpu_src = static_cast<float*>(
+      DeviceTraits<c10::DeviceType::CPU>::allocate(kSize * sizeof(float)));
+  float* cuda_src =
+      static_cast<float*>(DeviceTraits<c10::DeviceType::CUDA>::allocate(
+          kSize * sizeof(float), DEFAULT_CUDA_DEVICE));
+  float* cuda_dst =
+      static_cast<float*>(DeviceTraits<c10::DeviceType::CUDA>::allocate(
+          kSize * sizeof(float), DEFAULT_CUDA_DEVICE));
+  float* cpu_verify = static_cast<float*>(
+      DeviceTraits<c10::DeviceType::CPU>::allocate(kSize * sizeof(float)));
+
+  for (size_t i = 0; i < kSize; ++i) {
+    cpu_src[i] = static_cast<float>(i) * 3.0f;
+  }
+
+  // Copy CPU -> CUDA src
+  DeviceTraits<c10::DeviceType::CUDA>::memcpy(
+      cuda_src,
+      cpu_src,
+      kSize * sizeof(float),
+      DEFAULT_CUDA_DEVICE,
+      CPU_DEVICE);
+
+  // Copy CUDA src -> CUDA dst
+  DeviceTraits<c10::DeviceType::CUDA>::memcpy(
+      cuda_dst,
+      cuda_src,
+      kSize * sizeof(float),
+      DEFAULT_CUDA_DEVICE,
+      DEFAULT_CUDA_DEVICE);
+
+  // Copy CUDA dst -> CPU to verify
+  DeviceTraits<c10::DeviceType::CUDA>::memcpy(
+      cpu_verify,
+      cuda_dst,
+      kSize * sizeof(float),
+      CPU_DEVICE,
+      DEFAULT_CUDA_DEVICE);
+
+  for (size_t i = 0; i < kSize; ++i) {
+    EXPECT_FLOAT_EQ(cpu_verify[i], static_cast<float>(i) * 3.0f);
+  }
+
+  DeviceTraits<c10::DeviceType::CPU>::free(cpu_src);
+  DeviceTraits<c10::DeviceType::CUDA>::free(cuda_src);
+  DeviceTraits<c10::DeviceType::CUDA>::free(cuda_dst);
+  DeviceTraits<c10::DeviceType::CPU>::free(cpu_verify);
+}
+
+#endif // CUDA_AVAILABLE
+
 } // namespace executorch::backends::aoti::slim
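With the `INSTANTIATE_TEST_SUITE_P` calls above, each `TEST_P` expands into one instance per device returned by `getTestDevices()`, named via `deviceToString` (e.g. `DeviceTests/StorageSharedPtrParamTest.BasicUsage/CUDA`). A hypothetical extra case, using only APIs already exercised above, shows how the fixture extends:

```cpp
// Hypothetical addition (not in the diff): copy and move interact per device.
TEST_P(StorageSharedPtrParamTest, CopyThenMove) {
  constexpr size_t kNbytes = 64;
  Storage a(new MaybeOwningStorage(device(), kNbytes));
  Storage b = a;             // copy: shared ownership, use_count == 2
  Storage c = std::move(a);  // move: transfers a's reference, count stays 2

  EXPECT_EQ(a.get(), nullptr);
  EXPECT_EQ(b.use_count(), 2);
  EXPECT_EQ(c->nbytes(), kNbytes);
}
```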
"utils.h", + ], + visibility = ["PUBLIC"], + deps = [ + "//executorch/runtime/platform:platform", + ], + exported_deps = [ + "//executorch/runtime/core:core", + "//executorch/runtime/core/exec_aten:lib", + ], + external_deps = [ + ("cuda", None, "cuda-lazy"), + ], +) + runtime.cxx_library( name = "cuda_platform", srcs = [