Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions repro_uint8_bug.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""Minimal reproduction of the uint8 type closure conversion bug."""

import warp as wp

wp.init()


def create_type_closure_scalar(scalar_type):
    """Build a kernel that converts via the captured ``scalar_type`` closure.

    The kernel casts its float ``input`` to ``scalar_type``, widens the result
    back to float, and asserts equality against ``expected`` on device.
    """

    @wp.kernel
    def kernel(input: float, expected: float):
        converted = scalar_type(input)
        wp.expect_eq(float(converted), expected)

    return kernel


def _run(kernel, inputs):
    # Launch on CPU and block until completion so failures surface in order.
    wp.launch(kernel, dim=1, inputs=inputs, device="cpu")
    wp.synchronize()


# These work fine (int, float closures)
type_closure_kernel_int = create_type_closure_scalar(int)
type_closure_kernel_float = create_type_closure_scalar(float)

# This is the broken one
type_closure_kernel_uint8 = create_type_closure_scalar(wp.uint8)

print("Testing int closure...")
_run(type_closure_kernel_int, [-1.5, -1.0])
print("  PASSED")

print("Testing float closure...")
_run(type_closure_kernel_float, [-1.5, -1.5])
print("  PASSED")

print("Testing uint8 closure...")
try:
    # float -1.5 truncates to -1, which wraps to 255 as uint8
    _run(type_closure_kernel_uint8, [-1.5, 255.0])
    print("  PASSED")
except Exception as e:
    print(f"  FAILED with exception: {type(e).__name__}: {e}")
12 changes: 11 additions & 1 deletion warp/_src/builtins.py
Original file line number Diff line number Diff line change
Expand Up @@ -1007,8 +1007,17 @@ def get_diag_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str,

# scalar type constructors between all storage / compute types
scalar_types_all = [*scalar_types, bool, int, float]

unsigned_int_types = (uint8, uint16, uint32, uint64)
float_src_types = {float16: "float16", float32: "float32", float64: "float64", float: "float32"}

for t in scalar_types_all:
for u in scalar_types_all:
# Use safe cast for float -> unsigned to avoid C++ UB
safe_native = None
if t in unsigned_int_types and u in float_src_types:
safe_native = f"{float_src_types[u]}_to_{t.__name__}"

add_builtin(
t.__name__,
input_types={"a": u},
Expand All @@ -1017,7 +1026,8 @@ def get_diag_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str,
hidden=True,
group="Scalar Math",
export=False,
namespace="wp::" if t is not bool else "",
namespace="wp::" if t is not bool and not safe_native else "",
native_func=safe_native if safe_native else t.__name__,
)


Expand Down
59 changes: 59 additions & 0 deletions warp/native/builtin.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,46 @@ typedef uint64_t uint64;
typedef const char* str;


// Float-to-unsigned conversions: cast through int64 to avoid C++ UB
// ([conv.fpint]: a float-to-integer conversion is UB when the truncated
// value cannot be represented in the destination type).
//
// Clamp-convert a floating-point value to int64. NaN maps to 0; values
// outside the representable range saturate to INT64_MIN / INT64_MAX.
template <typename F> CUDA_CALLABLE inline int64 safe_float_to_int64(F x)
{
    // NaN is the only value that does not compare equal to itself.
    if (!(x == x))
        return 0;
    // Both bounds are powers of two, so they are exact in float32 and float64.
    constexpr F lower_bound = static_cast<F>(-9223372036854775808.0); // -2^63
    constexpr F overflow_bound = static_cast<F>(9223372036854775808.0); // 2^63, first value past INT64_MAX
    if (x < lower_bound)
        return -9223372036854775807LL - 1LL; // INT64_MIN, written to avoid a literal overflow
    if (x >= overflow_bound)
        return 9223372036854775807LL; // INT64_MAX
    // In [-2^63, 2^63): truncation toward zero is well-defined here.
    return static_cast<int64>(x);
}

// Clamp-convert a floating-point value to uint64 without UB. NaN maps to 0,
// negative values wrap modulo 2^64 (matching integer-to-unsigned semantics),
// and values >= 2^64 saturate to UINT64_MAX.
template <typename F> CUDA_CALLABLE inline uint64 safe_float_to_uint64(F x)
{
    // NaN check (NaN != NaN).
    if (!(x == x))
        return 0;
    if (x <= 0.0)
        // Negative: truncate via int64, then let the unsigned conversion wrap mod 2^64.
        return static_cast<uint64>(safe_float_to_int64(x));
    constexpr F two_pow_63 = static_cast<F>(9223372036854775808.0); // 2^63
    constexpr F two_pow_64 = static_cast<F>(18446744073709551616.0); // 2^64, first value past UINT64_MAX
    if (x >= two_pow_64)
        return 18446744073709551615ULL; // UINT64_MAX
    if (x >= two_pow_63)
        // x in [2^63, 2^64): x - 2^63 is exact (Sterbenz lemma), lands in int64
        // range, and the 2^63 offset is restored in unsigned arithmetic.
        return static_cast<uint64>(safe_float_to_int64(x - two_pow_63)) + 9223372036854775808ULL;
    // x in (0, 2^63): fits directly through the int64 path.
    return static_cast<uint64>(safe_float_to_int64(x));
}

// Narrow unsigned destinations convert through the clamped int64 path, then
// truncate modulo 2^N exactly like an integer static_cast would — so negative
// inputs wrap (e.g. -1.0f -> 255 as uint8) instead of invoking UB.
// NOTE(review): out-of-range positives wrap for uint8/16/32 but saturate for
// uint64 (which goes through safe_float_to_uint64) — confirm this asymmetry is intended.
CUDA_CALLABLE inline uint8 float32_to_uint8(float32 x) { return static_cast<uint8>(safe_float_to_int64(x)); }
CUDA_CALLABLE inline uint8 float64_to_uint8(float64 x) { return static_cast<uint8>(safe_float_to_int64(x)); }
CUDA_CALLABLE inline uint16 float32_to_uint16(float32 x) { return static_cast<uint16>(safe_float_to_int64(x)); }
CUDA_CALLABLE inline uint16 float64_to_uint16(float64 x) { return static_cast<uint16>(safe_float_to_int64(x)); }
CUDA_CALLABLE inline uint32 float32_to_uint32(float32 x) { return static_cast<uint32>(safe_float_to_int64(x)); }
CUDA_CALLABLE inline uint32 float64_to_uint32(float64 x) { return static_cast<uint32>(safe_float_to_int64(x)); }
CUDA_CALLABLE inline uint64 float32_to_uint64(float32 x) { return safe_float_to_uint64(x); }
CUDA_CALLABLE inline uint64 float64_to_uint64(float64 x) { return safe_float_to_uint64(x); }


struct half;

CUDA_CALLABLE half float_to_half(float x);
Expand Down Expand Up @@ -182,6 +222,12 @@ static_assert(sizeof(half) == 2, "Size of half / float16 type must be 2-bytes");

typedef half float16;

// float16 sources: widen losslessly to float32 first (defined only after the
// half type exists), then reuse the float32 safe casts above.
CUDA_CALLABLE inline uint8 float16_to_uint8(float16 x) { return float32_to_uint8(float32(x)); }
CUDA_CALLABLE inline uint16 float16_to_uint16(float16 x) { return float32_to_uint16(float32(x)); }
CUDA_CALLABLE inline uint32 float16_to_uint32(float16 x) { return float32_to_uint32(float32(x)); }
CUDA_CALLABLE inline uint64 float16_to_uint64(float16 x) { return float32_to_uint64(float32(x)); }

// Approximate division/reciprocal intrinsics
#if defined(__CUDA_ARCH__)

Expand Down Expand Up @@ -337,6 +383,19 @@ template <typename T> CUDA_CALLABLE inline void adj_float16(T x, T& adj_x, float
template <typename T> CUDA_CALLABLE inline void adj_float32(T x, T& adj_x, float32 adj_ret) { adj_x += T(adj_ret); }
template <typename T> CUDA_CALLABLE inline void adj_float64(T x, T& adj_x, float64 adj_ret) { adj_x += T(adj_ret); }

// Reverse-mode adjoint stubs for the safe float-to-unsigned casts. Integer
// truncation is piecewise constant (zero derivative almost everywhere), so no
// gradient is propagated back into adj_x — the stubs intentionally do nothing,
// mirroring how the other cast adjoints in this file treat non-differentiable ops.
template <typename T> CUDA_CALLABLE inline void adj_float32_to_uint8(T, T&, uint8) { }
template <typename T> CUDA_CALLABLE inline void adj_float64_to_uint8(T, T&, uint8) { }
template <typename T> CUDA_CALLABLE inline void adj_float16_to_uint8(T, T&, uint8) { }
template <typename T> CUDA_CALLABLE inline void adj_float32_to_uint16(T, T&, uint16) { }
template <typename T> CUDA_CALLABLE inline void adj_float64_to_uint16(T, T&, uint16) { }
template <typename T> CUDA_CALLABLE inline void adj_float16_to_uint16(T, T&, uint16) { }
template <typename T> CUDA_CALLABLE inline void adj_float32_to_uint32(T, T&, uint32) { }
template <typename T> CUDA_CALLABLE inline void adj_float64_to_uint32(T, T&, uint32) { }
template <typename T> CUDA_CALLABLE inline void adj_float16_to_uint32(T, T&, uint32) { }
template <typename T> CUDA_CALLABLE inline void adj_float32_to_uint64(T, T&, uint64) { }
template <typename T> CUDA_CALLABLE inline void adj_float64_to_uint64(T, T&, uint64) { }
template <typename T> CUDA_CALLABLE inline void adj_float16_to_uint64(T, T&, uint64) { }

#define kEps 0.0f

Expand Down
22 changes: 20 additions & 2 deletions warp/tests/test_codegen_instancing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1091,13 +1091,31 @@ def k(input: float, expected: float):
type_closure_kernel_uint8 = create_type_closure_scalar(wp.uint8)


def create_type_closure_scalar_f64(scalar_type):
    """Variant of create_type_closure_scalar using float64 arguments.

    float64 keeps enough precision to express values near the uint64 upper
    boundary, which float32 cannot represent exactly.
    """

    @wp.kernel
    def kernel(input: wp.float64, expected: wp.float64):
        converted = scalar_type(input)
        wp.expect_eq(wp.float64(converted), expected)

    return kernel


type_closure_kernel_uint64_f64 = create_type_closure_scalar_f64(wp.uint64)


def test_type_closure_scalar(test, device):
    """Exercise kernels whose closures capture a scalar type constructor."""
    with wp.ScopedDevice(device):
        wp.launch(type_closure_kernel_int, dim=1, inputs=[-1.5, -1.0])
        wp.launch(type_closure_kernel_float, dim=1, inputs=[-1.5, -1.5])

        # uint8 conversions, including negative / out-of-range inputs that
        # must wrap modulo 256 rather than hit undefined behavior.
        uint8_cases = (
            (-1.5, 255.0),
            (-0.1, 0.0),
            (255.1, 255.0),
            (128.0, 128.0),
            (-100.0, 156.0),
        )
        for value, expected in uint8_cases:
            wp.launch(type_closure_kernel_uint8, dim=1, inputs=[value, expected])

        # Test boundary cases for uint64 truncation safety with float64 precision
        wp.launch(type_closure_kernel_uint64_f64, dim=1, inputs=[9223372036854774784.0, 9223372036854774784.0])


# =======================================================================
Expand Down