
Commit 1cb66e8

Added new jit_var_scatter_inc operation for stream compaction
This commit adds a new and relatively advanced Dr.Jit operation named ``jit_var_scatter_inc`` that atomically increments a value within a ``uint32``-typed Dr.Jit array. It works just like the standard ``jit_var_scatter`` operation for 32-bit unsigned integer operands, but with a fixed ``value=1`` parameter and ``reduce_op=ReduceOp::Add``.

The main difference is that this variant additionally returns the *old* value of the target array prior to the atomic update, in contrast to the more general scatter-reduction, which just returns ``None``. The operation also supports masking---the return value of masked-out entries is undefined.

This operation is a building block for stream compaction: threads can scatter-increment a global counter to request a spot in an array and then write their result there. The recipe looks as follows:

```python
ctr = UInt32(0)                        # Counter array
active = dr.ones(Bool, len(data_1))    # .. or a more complex condition

my_index = dr.scatter_inc(target=ctr, index=UInt32(0), mask=active)

dr.scatter(
    target=data_compact_1,
    value=data_1,
    index=my_index,
    mask=active
)

dr.scatter(
    target=data_compact_2,
    value=data_2,
    index=my_index,
    mask=active
)
```

When following this approach, be sure to provide the same mask value to the ``dr.scatter_inc()`` and subsequent ``dr.scatter()`` operations.

``dr.scatter_inc()`` exhibits the following unusual behavior compared to normal Dr.Jit operations: the return value references the instantaneous state during a potentially large sequence of atomic operations. This instantaneous state is not reproducible in later kernel evaluations, and Dr.Jit will therefore raise an exception when the computed index is reused across kernels:

```python
my_index = dr.scatter_inc(target=ctr, index=UInt32(0), mask=active)

dr.scatter(
    target=data_compact_1,
    value=data_1,
    index=my_index,
    mask=active
)

dr.eval(data_compact_1)  # Run Kernel #1

dr.scatter(
    target=data_compact_2,
    value=data_2,
    index=my_index,  # <-- oops, reusing my_index in another kernel.
    mask=active      #     This raises an exception.
)
```

To get the above code to work, you will need to evaluate ``my_index`` at the same time to materialize it into a stored (and therefore trivially reproducible) representation. For this to work as intended, ensure that the size of the ``active`` mask matches ``len(data_*)`` and that it is not the trivial ``True`` default mask (otherwise, the evaluated ``my_index`` will be scalar).

```python
dr.eval(data_compact_1, my_index)
```

Such multi-stage evaluation is potentially inefficient and may defeat the purpose of performing stream compaction in the first place. In general, prefer keeping all scatter operations involving the computed index in the same kernel, in which case this issue does not arise.
1 parent a8f95ab commit 1cb66e8

16 files changed (+373, -54 lines)

include/drjit-core/array.h

Lines changed: 6 additions & 1 deletion
```diff
@@ -368,7 +368,7 @@ Array empty(size_t size) {
 }
 
 template <typename Array>
-Array zero(size_t size = 1) {
+Array zeros(size_t size = 1) {
     typename Array::Value value = 0;
     return Array::steal(
         jit_var_literal(Array::Backend, Array::Type, &value, size));
@@ -409,6 +409,11 @@ void scatter_reduce(ReduceOp op, Array &target, const Array &value,
                         index.index(), mask.index(), op));
 }
 
+template <typename Array>
+Array scatter_inc(Array &target, const Array index, const JitArray<Array::Backend, bool> &mask = true) {
+    return Array::steal(jit_var_scatter_inc(target.index_ptr(), index.index(), mask.index()));
+}
+
 template <typename Array, typename Index>
 void scatter_reduce_kahan(Array &target_1, Array &target_2, const Array &value,
                           const JitArray<Array::Backend, Index> &index,
```
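For readers coming from the C++ side, the recipe from the commit message might look roughly as follows when expressed through the `scatter_inc()` helper added above. This is a minimal sketch only: the `CUDAArray<T>` aliases, the scalar-value constructor, and a masked `scatter()` helper analogous to `scatter_reduce()` are assumed to be provided by drjit-core's array.h and are not part of this diff.

```cpp
// Hypothetical C++ counterpart of the Python stream-compaction recipe.
// Type aliases, constructors, and scatter() are assumptions, not part of the diff.
#include <drjit-core/array.h>

using Float  = CUDAArray<float>;
using UInt32 = CUDAArray<uint32_t>;
using Mask   = CUDAArray<bool>;

void compact(const Float &data, const Mask &active,
             Float &data_compact, UInt32 &ctr) {
    // Each active entry atomically bumps ctr[0] and receives the counter
    // value *before* its increment, i.e. a unique slot in the output array
    UInt32 slot = scatter_inc(ctr, UInt32(0), active);

    // Write the surviving entries to their reserved slots (masked scatter,
    // assuming a scatter() helper mirroring scatter_reduce() above)
    scatter(data_compact, data, slot, active);
}
```

As in the Python version, the same mask must be passed to both `scatter_inc()` and the subsequent `scatter()`, and both operations should stay within the same kernel.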

include/drjit-core/jit.h

Lines changed: 22 additions & 3 deletions
```diff
@@ -929,9 +929,11 @@ extern JIT_EXPORT uint32_t jit_var_scatter(uint32_t target, uint32_t value,
 /**
  * \brief Schedule a Kahan-compensated floating point atomic scatter-write
  *
- * This operation is just like `jit_var(scatter, ..., ReduceOp::Add)`. The
- * difference is that it simultaneously adds to two different target buffers
- * using the Kahan summation algorithm.
+ * This operation is just like ``jit_var_scatter`` invoked with floating
+ * point operands and reduce_op=ReduceOp::Add.
+ *
+ * The difference is that it simultaneously adds to
+ * two different target buffers using the Kahan summation algorithm.
  *
  * The implementation may overwrite the 'target_1' / 'target_2' pointers
  * if a copy needs to be made (for example, if another variable elsewhere
@@ -943,6 +945,23 @@ extern JIT_EXPORT void jit_var_scatter_reduce_kahan(uint32_t *target_1,
                                                     uint32_t index,
                                                     uint32_t mask);
 
+/**
+ * \brief Atomically increment a counter and return the old value
+ *
+ * This operation is just like ``jit_var_scatter`` invoked with 32-bit unsigned
+ * integer operands, the value ``1``, and reduce_op=ReduceOp::Add.
+ *
+ * The main difference is that this variant returns the *old* value before the
+ * atomic write (in contrast to the more general scatter reduction, where doing
+ * so would be rather complicated).
+ *
+ * This operation is a building block for stream compaction: threads can
+ * scatter-increment a global counter to request a spot in an array.
+ */
+extern JIT_EXPORT uint32_t jit_var_scatter_inc(uint32_t *target,
+                                               uint32_t index,
+                                               uint32_t mask);
+
 /**
  * \brief Create an identical copy of the given variable
  *
```

src/api.cpp

Lines changed: 6 additions & 0 deletions
```diff
@@ -407,6 +407,12 @@ void jit_var_scatter_reduce_kahan(uint32_t *target_1, uint32_t *target_2,
     jitc_var_scatter_reduce_kahan(target_1, target_2, value, index, mask);
 }
 
+uint32_t jit_var_scatter_inc(uint32_t *target, uint32_t index, uint32_t mask) {
+    lock_guard guard(state.lock);
+    return jitc_var_scatter_inc(target, index, mask);
+}
+
+
 uint32_t jit_var_pointer(JitBackend backend, const void *value,
                          uint32_t dep, int write) {
     lock_guard guard(state.lock);
```

src/cuda_eval.cpp

Lines changed: 66 additions & 5 deletions
```diff
@@ -50,10 +50,12 @@
 
 // Forward declarations
 static void jitc_cuda_render_stmt(uint32_t index, const Variable *v);
-static void jitc_cuda_render_var(uint32_t index, const Variable *v);
+static void jitc_cuda_render_var(uint32_t index, Variable *v);
 static void jitc_cuda_render_scatter(const Variable *v, const Variable *ptr,
                                      const Variable *value, const Variable *index,
                                      const Variable *mask);
+static void jitc_cuda_render_scatter_inc(Variable *v, const Variable *ptr,
+                                         const Variable *index, const Variable *mask);
 static void jitc_cuda_render_scatter_kahan(const Variable *v, uint32_t index);
 static void jitc_cuda_render_printf(uint32_t index, const Variable *v,
                                     const Variable *mask);
@@ -157,7 +159,7 @@ void jitc_cuda_assemble(ThreadState *ts, ScheduledGroup group,
 
     for (uint32_t gi = group.start; gi != group.end; ++gi) {
         uint32_t index = schedule[gi].index;
-        const Variable *v = jitc_var(index);
+        Variable *v = jitc_var(index);
         const uint32_t vti = v->type,
                        size = v->size;
         const VarType vt = (VarType) vti;
@@ -311,7 +313,7 @@ void jitc_cuda_assemble_func(const char *name, uint32_t inst_id,
         name, n_regs, n_regs, n_regs, n_regs, n_regs, n_regs, n_regs);
 
     for (ScheduledVariable &sv : schedule) {
-        const Variable *v = jitc_var(sv.index);
+        Variable *v = jitc_var(sv.index);
         const uint32_t vti = v->type;
         const VarType vt = (VarType) vti;
 
@@ -401,7 +403,7 @@ static const char *reduce_op_name[(int) ReduceOp::Count] = {
     "", "add", "mul", "min", "max", "and", "or"
 };
 
-static void jitc_cuda_render_var(uint32_t index, const Variable *v) {
+static void jitc_cuda_render_var(uint32_t index, Variable *v) {
     const char *stmt = nullptr;
     Variable *a0 = v->dep[0] ? jitc_var(v->dep[0]) : nullptr,
              *a1 = v->dep[1] ? jitc_var(v->dep[1]) : nullptr,
@@ -720,6 +722,10 @@ static void jitc_cuda_render_var(uint32_t index, const Variable *v) {
             jitc_cuda_render_scatter(v, a0, a1, a2, a3);
             break;
 
+        case VarKind::ScatterInc:
+            jitc_cuda_render_scatter_inc(v, a0, a1, a2);
+            break;
+
         case VarKind::ScatterKahan:
             jitc_cuda_render_scatter_kahan(v, index);
             break;
@@ -808,7 +814,7 @@ static void jitc_cuda_render_scatter(const Variable *v,
         (jitc_flags() & (uint32_t) JitFlag::AtomicReduceLocal)) {
         fmt("    {\n"
             "        .func reduce_$s_$t(.param .u64 ptr, .param .$t value);\n"
-            "        call reduce_$s_$t, (%rd3, $v);\n"
+            "        call.uni reduce_$s_$t, (%rd3, $v);\n"
            "    }\n",
            op, value, value, op, value, value);
 
@@ -901,6 +907,61 @@ static void jitc_cuda_render_scatter(const Variable *v,
     fmt("\nl_$u_done:\n", v->reg_index);
 }
 
+static void jitc_cuda_render_scatter_inc(Variable *v,
+                                         const Variable *ptr,
+                                         const Variable *index,
+                                         const Variable *mask) {
+    bool index_zero = index->is_literal() && index->literal == 0;
+    bool unmasked = mask->is_literal() && mask->literal == 1;
+
+    fmt_intrinsic(
+        ".func (.param .u32 rv) reduce_inc_u32 (.param .u64 ptr) {\n"
+        "    .reg .pred %p<2>;\n"
+        "    .reg .b32 %r<11>;\n"
+        "    .reg .b64 %rd<2>;\n"
+        "\n"
+        "    ld.param.u64 %rd1, [ptr];\n"
+        "    activemask.b32 %r2;\n"
+        "    mov.u32 %r3, %lanemask_lt;\n"
+        "    and.b32 %r3, %r3, %r2;\n"
+        "    setp.ne.u32 %p1, %r3, 0;\n"
+        "    @%p1 bra L2;\n"
+        "\n"
+        "L1:\n"
+        "    popc.b32 %r4, %r2;\n"
+        "    atom.global.add.u32 %r5, [%rd1], %r4;\n"
+        "\n"
+        "L2:\n"
+        "    popc.b32 %r6, %r3;\n"
+        "    brev.b32 %r7, %r2;\n"
+        "    bfind.shiftamt.u32 %r8, %r7;\n"
+        "    shfl.sync.idx.b32 %r9, %r5, %r8, 31, %r2;\n"
+        "    add.u32 %r10, %r6, %r9;\n"
+        "    st.param.u32 [rv], %r10;\n"
+        "    ret;\n"
+        "}\n");
+
+    if (!unmasked)
+        fmt("    @!$v bra l_$u_done;\n", mask, v->reg_index);
+
+    if (index_zero) {
+        fmt("    mov.u64 %rd3, $v;\n", ptr);
+    } else {
+        fmt("    mad.wide.$t %rd3, $v, 4, $v;\n",
+            index, index, ptr);
+    }
+
+    fmt("    {\n"
+        "        .func (.param .u32 rv) reduce_inc_u32 (.param .u64 ptr);\n"
+        "        call.uni ($v), reduce_inc_u32, (%rd3);\n"
+        "    }\n", v);
+
+    if (!unmasked)
+        fmt("\nl_$u_done:\n", v->reg_index);
+
+    v->consumed = 1;
+}
+
 static void jitc_cuda_render_scatter_kahan(const Variable *v, uint32_t v_index) {
     const Extra &extra = state.extra[v_index];
 
```
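The PTX intrinsic emitted above follows the familiar warp-aggregated atomic pattern: the first active lane performs a single `atom.global.add` of `popc(activemask)` on behalf of the whole warp, the pre-increment counter value is broadcast back with `shfl.sync`, and each lane adds the number of active lanes below it to obtain a unique slot. The following CUDA device-side sketch shows the same idea for illustration only; it is not code from this commit, and the lane-index computation assumes a 1D thread block.

```cuda
#include <cstdint>

// Warp-aggregated atomic increment: one atomic per warp instead of one per lane.
__device__ uint32_t warp_scatter_inc(uint32_t *ctr) {
    uint32_t active = __activemask();                // lanes participating in this call
    uint32_t lane   = threadIdx.x & 31u;             // lane index (1D block assumed)
    uint32_t below  = active & ((1u << lane) - 1u);  // active lanes below this one
    uint32_t leader = __ffs(active) - 1u;            // lowest active lane
    uint32_t base   = 0;
    if (below == 0)                                  // only the leader issues the atomic
        base = atomicAdd(ctr, (uint32_t) __popc(active));
    base = __shfl_sync(active, base, leader);        // broadcast the pre-increment value
    return base + __popc(below);                     // unique slot for this lane
}
```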
src/eval.cpp

Lines changed: 1 addition & 0 deletions
```diff
@@ -709,6 +709,7 @@ void jitc_eval(ThreadState *ts) {
             v->kind = (uint32_t) VarKind::Data;
             v->data = sv.data;
             v->output_flag = false;
+            v->consumed = false;
         }
 
         if (unlikely(v->extra)) {
```

src/internal.h

Lines changed: 5 additions & 2 deletions
```diff
@@ -88,7 +88,7 @@ enum VarKind : uint32_t {
     Cast, Bitcast,
 
     // Memory-related operations
-    Gather, Scatter, ScatterKahan,
+    Gather, Scatter, ScatterInc, ScatterKahan,
 
     // Specialized nodes for vcalls
     VCallMask, VCallSelf,
@@ -231,8 +231,11 @@ struct Variable {
     /// Is this variable marked as an output?
     uint32_t output_flag : 1;
 
+    /// Consumed bit for operations that should only be executed once
+    uint32_t consumed : 1;
+
     /// Unused for now
-    uint32_t unused_2 : 6;
+    uint32_t unused_2 : 5;
 
     /// Offset of the argument in the list of kernel parameters
     uint32_t param_offset;
```

src/io.cpp

Lines changed: 4 additions & 4 deletions
```diff
@@ -63,8 +63,8 @@ void jitc_lz4_init() {
     jitc_lz4_dict_ready = true;
 }
 
-/* Computes padding to align cache file content to a multiple of sizeof(void*). 
-   This prevents undefiend behavior due to misaligned memory reads/writes. */ 
+/* Computes padding to align cache file content to a multiple of sizeof(void*).
+   This prevents undefiend behavior due to misaligned memory reads/writes. */
 static uint32_t compute_padding(const CacheFileHeader &header) {
     uint32_t padding_size = (header.source_size + header.kernel_size) % sizeof(void *);
     if (padding_size)
@@ -353,7 +353,7 @@ bool jitc_kernel_write(const char *source, uint32_t source_size,
     header.reloc_size = kernel.llvm.n_reloc * sizeof(void *);
 
     uint32_t padding_size = compute_padding(header);
-    uint32_t in_size  = header.source_size + header.kernel_size 
+    uint32_t in_size  = header.source_size + header.kernel_size
                         + padding_size + header.reloc_size,
              out_size = LZ4_compressBound(in_size);
 
@@ -365,7 +365,7 @@ bool jitc_kernel_write(const char *source, uint32_t source_size,
     memset(temp_in + header.source_size + header.kernel_size, 0, padding_size);
 
     if (backend == JitBackend::LLVM) {
-        uintptr_t *reloc_out = (uintptr_t *) (temp_in + header.source_size + 
+        uintptr_t *reloc_out = (uintptr_t *) (temp_in + header.source_size +
                                               header.kernel_size + padding_size);
         for (uint32_t i = 0; i < kernel.llvm.n_reloc; ++i)
             reloc_out[i] = (uintptr_t) kernel.llvm.reloc[i] - (uintptr_t) kernel.data;
```

src/llvm_eval.cpp

Lines changed: 58 additions & 2 deletions
```diff
@@ -86,6 +86,10 @@ static void jitc_llvm_render_scatter(const Variable *v, const Variable *ptr,
                                      const Variable *value, const Variable *index,
                                      const Variable *mask);
 static void jitc_llvm_render_scatter_kahan(const Variable *v, uint32_t index);
+static void jitc_llvm_render_scatter_inc(Variable *v,
+                                         const Variable *ptr,
+                                         const Variable *index,
+                                         const Variable *mask);
 static void jitc_llvm_render_printf(uint32_t index, const Variable *v,
                                     const Variable *mask, const Variable *target);
 static void jitc_llvm_render_trace(uint32_t index, const Variable *v,
@@ -777,6 +781,10 @@ static void jitc_llvm_render_var(uint32_t index, Variable *v) {
             jitc_llvm_render_scatter(v, a0, a1, a2, a3);
             break;
 
+        case VarKind::ScatterInc:
+            jitc_llvm_render_scatter_inc(v, a0, a1, a2);
+            break;
+
         case VarKind::ScatterKahan:
             jitc_llvm_render_scatter_kahan(v, index);
             break;
@@ -948,6 +956,54 @@ static void jitc_llvm_render_scatter(const Variable *v,
     }
 }
 
+static void jitc_llvm_render_scatter_inc(Variable *v,
+                                         const Variable *ptr,
+                                         const Variable *index,
+                                         const Variable *mask) {
+    fmt("    $v_1 = extractelement $V, i32 0\n"
+        "{    $v_2 = bitcast i8* $v to i32*\n"
+        "    $v_3 = getelementptr i32, i32* $v_2, i32 $v_1\n|"
+        "    $v_3 = getelementptr i32, ptr $v, i32 $v_1\n}"
+        "    $v = call $T @reduce_inc_u32({$t*} $v_3, $V)\n",
+        v, index,
+        v, ptr,
+        v, v, v,
+        v, ptr, v,
+        v, v, v, v, mask);
+
+    fmt_intrinsic(
+        "define internal <$w x i32> @reduce_inc_u32({i32*} %ptr, <$w x i1> %active) #0 ${\n"
+        "L0:\n"
+        "    br label %L1\n\n"
+        "L1:\n"
+        "    %index = phi i32 [ 0, %L0 ], [ %index_next, %L1 ]\n"
+        "    %sum = phi i32 [ 0, %L0 ], [ %sum_next, %L1 ]\n"
+        "    %sum_vec = phi <$w x i32> [ undef, %L0 ], [ %sum_vec_next, %L1 ]\n"
+        "    %active_i = extractelement <$w x i1> %active, i32 %index\n"
+        "    %active_u = zext i1 %active_i to i32\n"
+        "    %sum_next = add nuw i32 %sum, %active_u\n"
+        "    %sum_vec_next = insertelement <$w x i32> %sum_vec, i32 %sum, i32 %index\n"
+        "    %index_next = add nuw nsw i32 %index, 1\n"
+        "    %cond_1 = icmp eq i32 %index_next, $w\n"
+        "    br i1 %cond_1, label %L2, label %L1\n\n"
+        "L2:\n"
+        "    %cond_2 = icmp eq i32 %sum_next, 0\n"
+        "    br i1 %cond_2, label %L4, label %L3\n\n"
+        "L3:\n"
+        "    %old_1 = atomicrmw add {i32*} %ptr, i32 %sum_next monotonic\n"
+        "    %old_2 = insertelement <$w x i32> undef, i32 %old_1, i32 0\n"
+        "    %old_3 = shufflevector <$w x i32> %old_2, <$w x i32> undef, <$w x i32> $z\n"
+        "    %sum_vec_final = add <$w x i32> %sum_vec_next, %old_3\n"
+        "    br label %L4;\n\n"
+        "L4:\n"
+        "    %sum_vec_combined = phi <$w x i32> [ %sum_vec_next, %L2 ], [ %sum_vec_final, %L3 ]\n"
+        "    ret <$w x i32> %sum_vec_combined\n"
+        "$}"
+    );
+
+    v->consumed = 1;
+}
+
 static void jitc_llvm_render_scatter_kahan(const Variable *v, uint32_t v_index) {
     const Extra &extra = state.extra[v_index];
     const Variable *ptr_1 = jitc_var(extra.dep[0]),
@@ -1258,7 +1314,7 @@ void jitc_llvm_ray_trace(uint32_t func, uint32_t scene, int shadow_ray,
         jitc_var_inc_ref(id);
     }
 
-    for (uint32_t i = 0; i < (shadow_ray ? 1 : 6); ++i)
+    for (int i = 0; i < (shadow_ray ? 1 : 6); ++i)
         out[i] = jitc_var_new_node_1(JitBackend::LLVM, VarKind::Extract,
                                      i < 3 ? float_type : VarType::UInt32, size,
                                      placeholder, index, jitc_var(index),
@@ -1467,7 +1523,7 @@ static void jitc_llvm_render_trace(uint32_t index, const Variable *v,
 
     offset = (8 * float_size + 4) * width;
 
-    for (uint32_t i = 0; i < (shadow_ray ? 1 : 6); ++i) {
+    for (int i = 0; i < (shadow_ray ? 1 : 6); ++i) {
         VarType vt = (i < 3) ? float_type : VarType::UInt32;
         const char *tname = type_name_llvm[(int) vt];
         uint32_t tsize = type_size[(int) vt];
```
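The LLVM backend cannot rely on warp shuffles, so the `@reduce_inc_u32` helper emitted above instead loops over the SIMD packet once: it builds a per-lane prefix count of active lanes, performs a single `atomicrmw add` for the entire packet (skipped when every lane is masked), and adds the old counter value to each prefix. Below is a scalar C++ model of that computation, for illustration only; the helper name comes from the IR above, everything else is an assumption.

```cpp
#include <atomic>
#include <cstdint>

// Scalar model of the packet-wise counter increment used by the LLVM backend:
// each active lane receives old_counter + (number of active lanes before it).
void reduce_inc_u32_model(std::atomic<uint32_t> &ctr, const bool *active,
                          uint32_t *slot, uint32_t width) {
    uint32_t sum = 0;
    for (uint32_t i = 0; i < width; ++i) {
        slot[i] = sum;                 // prefix count of active lanes before lane i
        if (active[i])
            ++sum;
    }
    if (sum != 0) {                    // skip the atomic if the whole packet is masked
        uint32_t old = ctr.fetch_add(sum, std::memory_order_relaxed);
        for (uint32_t i = 0; i < width; ++i)
            slot[i] += old;            // values of masked-out lanes remain unspecified
    }
}
```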
