@@ -1592,30 +1592,42 @@ extern JIT_EXPORT void jit_memcpy_async(JIT_ENUM JitBackend backend, void *dst,
1592
1592
*/
1593
1593
extern JIT_EXPORT void jit_reduce (JIT_ENUM JitBackend backend , JIT_ENUM VarType type ,
1594
1594
JIT_ENUM ReduceOp rtype ,
1595
- const void * ptr , uint32_t size , void * out );
1595
+ const void * in , uint32_t size , void * out );
1596
1596
1597
- /**
1598
- * \brief Perform an exclusive scan / prefix sum over an unsigned 32 bit integer
1599
- * array
1597
+ /** \brief Compute a prefix sum over the given input array
1598
+ *
1599
+ * Both exclusive and inclusive variants are supported. If desired, the scan
1600
+ * can be performed in-place (i.e., <tt>out == in</tt>). The operation runs
1601
+ * asynchronously.
1600
1602
*
1601
- * If desired, the scan can be performed in-place (i.e. <tt>in == out</tt>).
1602
- * Note that the CUDA implementation will round up \c size to the maximum of
1603
- * the following three values for performance reasons:
1603
+ * The operation is currently implemented for the following numeric types:
1604
+ * ``VarType::Int32``, ``VarType::UInt32``, ``VarType::UInt64``,
1605
+ * ``VarType::Float32``, and ``VarType::Float64``.
1604
1606
*
1605
- * - the value 4,
1607
+ * Note that the CUDA implementation may round \c size to the maximum of the
1608
+ * following three values for performance and implementation-related reasons
1609
+ * (the prefix sum uses a tree-based parallelization scheme).
1610
+ *
1611
+ * - the value 4
1606
1612
* - the next highest power of two (when size <= 4096),
1607
1613
* - the next highest multiple of 2K (when size > 4096),
1608
1614
*
1609
1615
* For this reason, the supplied memory regions must be sufficiently large
1610
- * to avoid both out-of-bounds reads and writes. This is not an issue for
1611
- * memory obtained using \ref jit_malloc(), which internally rounds
1612
- * allocations to the next largest power of two and enforces a 64 byte minimum
1613
- * allocation size.
1616
+ * to avoid out-of-bounds reads and writes. This is not an issue for memory
1617
+ * obtained using \ref jit_malloc(), which internally rounds allocations to the
1618
+ * next largest power of two and enforces a 64 byte minimum allocation size.
1614
1619
*
1615
- * Runs asynchronously.
1620
+ * The CUDA backend implementation for *large* numeric types (double precision
1621
+ * floats, 64 bit integers) has the following technical limitation: when
1622
+ * reducing 64-bit integers, their values must be smaller than 2**62. When
1623
+ * reducing double precision arrays, the two least significant mantissa bits
1624
+ * are clamped to zero when forwarding the prefix from one 512-wide block to
1625
+ * the next (at a very minor loss in accuracy). See the implementation for
1626
+ * details on this.
1616
1627
*/
1617
- extern JIT_EXPORT void jit_scan_u32 (JIT_ENUM JitBackend backend , const uint32_t * in ,
1618
- uint32_t size , uint32_t * out );
1628
+ extern JIT_EXPORT void jit_prefix_sum (JIT_ENUM JitBackend backend ,
1629
+ JIT_ENUM VarType type , int exclusive ,
1630
+ const void * in , uint32_t size , void * out );
1619
1631
1620
1632
/**
1621
1633
* \brief Compress a mask into a list of nonzero indices
@@ -1625,7 +1637,7 @@ extern JIT_EXPORT void jit_scan_u32(JIT_ENUM JitBackend backend, const uint32_t
1625
1637
* indices of nonzero entries to \c out (in increasing order), and it
1626
1638
* furthermore returns the total number of nonzero mask entries.
1627
1639
*
1628
- * The internals resemble \ref jit_scan_u32 (), and the CUDA implementation may
1640
+ * The internals resemble \ref jit_prefix_sum (), and the CUDA implementation may
1629
1641
* similarly access regions beyond the end of \c in and \c out.
1630
1642
*
1631
1643
* This function internally performs a synchronization step.
0 commit comments