diff --git a/repro_uint8_bug.py b/repro_uint8_bug.py new file mode 100644 index 0000000000..13e8c40f2c --- /dev/null +++ b/repro_uint8_bug.py @@ -0,0 +1,40 @@ +"""Minimal reproduction of the uint8 type closure conversion bug.""" + +import warp as wp + +wp.init() + + +def create_type_closure_scalar(scalar_type): + @wp.kernel + def k(input: float, expected: float): + x = scalar_type(input) + wp.expect_eq(float(x), expected) + + return k + + +# These work fine (int, float closures) +type_closure_kernel_int = create_type_closure_scalar(int) +type_closure_kernel_float = create_type_closure_scalar(float) + +# This is the broken one +type_closure_kernel_uint8 = create_type_closure_scalar(wp.uint8) + +print("Testing int closure...") +wp.launch(type_closure_kernel_int, dim=1, inputs=[-1.5, -1.0], device="cpu") +wp.synchronize() +print(" PASSED") + +print("Testing float closure...") +wp.launch(type_closure_kernel_float, dim=1, inputs=[-1.5, -1.5], device="cpu") +wp.synchronize() +print(" PASSED") + +print("Testing uint8 closure...") +try: + wp.launch(type_closure_kernel_uint8, dim=1, inputs=[-1.5, 255.0], device="cpu") + wp.synchronize() + print(" PASSED") +except Exception as e: + print(f" FAILED with exception: {type(e).__name__}: {e}") diff --git a/warp/_src/builtins.py b/warp/_src/builtins.py index a04b3c26ac..6a1e103eb0 100644 --- a/warp/_src/builtins.py +++ b/warp/_src/builtins.py @@ -1007,8 +1007,17 @@ def get_diag_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, # scalar type constructors between all storage / compute types scalar_types_all = [*scalar_types, bool, int, float] + +unsigned_int_types = (uint8, uint16, uint32, uint64) +float_src_types = {float16: "float16", float32: "float32", float64: "float64", float: "float32"} + for t in scalar_types_all: for u in scalar_types_all: + # Use safe cast for float -> unsigned to avoid C++ UB + safe_native = None + if t in unsigned_int_types and u in float_src_types: + safe_native = 
f"{float_src_types[u]}_to_{t.__name__}" + add_builtin( t.__name__, input_types={"a": u}, @@ -1017,7 +1026,8 @@ def get_diag_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, hidden=True, group="Scalar Math", export=False, - namespace="wp::" if t is not bool else "", + namespace="wp::" if t is not bool and not safe_native else "", + native_func=safe_native if safe_native else t.__name__, ) diff --git a/warp/native/builtin.h b/warp/native/builtin.h index 00fa9cfd65..560407fee2 100644 --- a/warp/native/builtin.h +++ b/warp/native/builtin.h @@ -105,6 +105,46 @@ typedef uint64_t uint64; typedef const char* str; +// Float-to-unsigned conversions: cast through int64 to avoid C++ UB +// (C++ 7.3.11: float -> unsigned is UB when truncated value is negative) +template CUDA_CALLABLE inline int64 safe_float_to_int64(F x) +{ + if (!(x == x)) + return 0; + constexpr F min_int64 = static_cast(-9223372036854775808.0); // -2^63 + constexpr F max_overflow = static_cast(9223372036854775808.0); // 2^63 + if (x < min_int64) + return -9223372036854775807LL - 1LL; + if (x >= max_overflow) + return 9223372036854775807LL; + return static_cast(x); +} + +template CUDA_CALLABLE inline uint64 safe_float_to_uint64(F x) +{ + if (!(x == x)) + return 0; + if (x <= 0.0) + return static_cast(safe_float_to_int64(x)); + constexpr F pow2_63 = static_cast(9223372036854775808.0); // 2^63 + constexpr F overflow_uint64 = static_cast(18446744073709551616.0); // 2^64 + if (x >= overflow_uint64) + return 18446744073709551615ULL; + if (x >= pow2_63) + return static_cast(safe_float_to_int64(x - pow2_63)) + 9223372036854775808ULL; + return static_cast(safe_float_to_int64(x)); +} + +CUDA_CALLABLE inline uint8 float32_to_uint8(float32 x) { return static_cast(safe_float_to_int64(x)); } +CUDA_CALLABLE inline uint8 float64_to_uint8(float64 x) { return static_cast(safe_float_to_int64(x)); } +CUDA_CALLABLE inline uint16 float32_to_uint16(float32 x) { return static_cast(safe_float_to_int64(x)); } 
+CUDA_CALLABLE inline uint16 float64_to_uint16(float64 x) { return static_cast<uint16>(safe_float_to_int64(x)); }
+CUDA_CALLABLE inline uint32 float32_to_uint32(float32 x) { return static_cast<uint32>(safe_float_to_int64(x)); }
+CUDA_CALLABLE inline uint32 float64_to_uint32(float64 x) { return static_cast<uint32>(safe_float_to_int64(x)); }
+CUDA_CALLABLE inline uint64 float32_to_uint64(float32 x) { return safe_float_to_uint64(x); }
+CUDA_CALLABLE inline uint64 float64_to_uint64(float64 x) { return safe_float_to_uint64(x); }
+
+
 struct half;
 
 CUDA_CALLABLE half float_to_half(float x);
@@ -182,6 +222,12 @@ static_assert(sizeof(half) == 2, "Size of half / float16 type must be 2-bytes");
 
 typedef half float16;
 
+// Handle float16 source
+CUDA_CALLABLE inline uint8 float16_to_uint8(float16 x) { return float32_to_uint8(float32(x)); }
+CUDA_CALLABLE inline uint16 float16_to_uint16(float16 x) { return float32_to_uint16(float32(x)); }
+CUDA_CALLABLE inline uint32 float16_to_uint32(float16 x) { return float32_to_uint32(float32(x)); }
+CUDA_CALLABLE inline uint64 float16_to_uint64(float16 x) { return float32_to_uint64(float32(x)); }
+
 // Approximate division/reciprocal intrinsics
 
 #if defined(__CUDA_ARCH__)
@@ -337,6 +383,19 @@ template <typename T> CUDA_CALLABLE inline void adj_float16(T x, T& adj_x, float
 template <typename T> CUDA_CALLABLE inline void adj_float32(T x, T& adj_x, float32 adj_ret) { adj_x += T(adj_ret); }
 template <typename T> CUDA_CALLABLE inline void adj_float64(T x, T& adj_x, float64 adj_ret) { adj_x += T(adj_ret); }
 
+// Adjoint stubs for safe float-to-unsigned casts (no-op since they are cast functions)
+template <typename T> CUDA_CALLABLE inline void adj_float32_to_uint8(T, T&, uint8) { }
+template <typename T> CUDA_CALLABLE inline void adj_float64_to_uint8(T, T&, uint8) { }
+template <typename T> CUDA_CALLABLE inline void adj_float16_to_uint8(T, T&, uint8) { }
+template <typename T> CUDA_CALLABLE inline void adj_float32_to_uint16(T, T&, uint16) { }
+template <typename T> CUDA_CALLABLE inline void adj_float64_to_uint16(T, T&, uint16) { }
+template <typename T> CUDA_CALLABLE inline void 
adj_float16_to_uint16(T, T&, uint16) { }
+template <typename T> CUDA_CALLABLE inline void adj_float32_to_uint32(T, T&, uint32) { }
+template <typename T> CUDA_CALLABLE inline void adj_float64_to_uint32(T, T&, uint32) { }
+template <typename T> CUDA_CALLABLE inline void adj_float16_to_uint32(T, T&, uint32) { }
+template <typename T> CUDA_CALLABLE inline void adj_float32_to_uint64(T, T&, uint64) { }
+template <typename T> CUDA_CALLABLE inline void adj_float64_to_uint64(T, T&, uint64) { }
+template <typename T> CUDA_CALLABLE inline void adj_float16_to_uint64(T, T&, uint64) { }
 
 #define kEps 0.0f
diff --git a/warp/tests/test_codegen_instancing.py b/warp/tests/test_codegen_instancing.py
index f09a4c4b9e..f45a2cd0f8 100644
--- a/warp/tests/test_codegen_instancing.py
+++ b/warp/tests/test_codegen_instancing.py
@@ -1091,13 +1091,31 @@ def k(input: float, expected: float):
 type_closure_kernel_uint8 = create_type_closure_scalar(wp.uint8)
 
 
+def create_type_closure_scalar_f64(scalar_type):
+    @wp.kernel
+    def k(input: wp.float64, expected: wp.float64):
+        x = scalar_type(input)
+        wp.expect_eq(wp.float64(x), expected)
+
+    return k
+
+
+type_closure_kernel_uint64_f64 = create_type_closure_scalar_f64(wp.uint64)
+
+
 def test_type_closure_scalar(test, device):
     with wp.ScopedDevice(device):
         wp.launch(type_closure_kernel_int, dim=1, inputs=[-1.5, -1.0])
         wp.launch(type_closure_kernel_float, dim=1, inputs=[-1.5, -1.5])
-        # FIXME: a problem with type conversions breaks this case
-        # wp.launch(type_closure_kernel_uint8, dim=1, inputs=[-1.5, 255.0])
+        wp.launch(type_closure_kernel_uint8, dim=1, inputs=[-1.5, 255.0])
+        wp.launch(type_closure_kernel_uint8, dim=1, inputs=[-0.1, 0.0])
+        wp.launch(type_closure_kernel_uint8, dim=1, inputs=[255.1, 255.0])
+        wp.launch(type_closure_kernel_uint8, dim=1, inputs=[128.0, 128.0])
+        wp.launch(type_closure_kernel_uint8, dim=1, inputs=[-100.0, 156.0])
+
+        # Test boundary cases for uint64 truncation safety with float64 precision
+        wp.launch(type_closure_kernel_uint64_f64, dim=1, inputs=[9223372036854774784.0, 9223372036854774784.0]) # 
=======================================================================