@@ -1592,30 +1592,42 @@ extern JIT_EXPORT void jit_memcpy_async(JIT_ENUM JitBackend backend, void *dst,
1592
1592
*/
1593
1593
extern JIT_EXPORT void jit_reduce (JIT_ENUM JitBackend backend , JIT_ENUM VarType type ,
1594
1594
JIT_ENUM ReduceOp rtype ,
1595
- const void * ptr , uint32_t size , void * out );
1595
+ const void * in , uint32_t size , void * out );
1596
1596
1597
- /**
1598
- * \brief Perform an exclusive scan / prefix sum over an unsigned 32 bit integer
1599
- * array
1597
+ /** \brief Compute a prefix sum over the given input array
1598
+ *
1599
+ * Both exclusive and inclusive variants are supported. If desired, the scan
1600
+ * can be performed in-place (i.e., <tt>out == in</tt>). The operation runs
1601
+ * asynchronously.
1600
1602
*
1601
- * If desired, the scan can be performed in-place (i.e. <tt>in == out</tt>).
1602
- * Note that the CUDA implementation will round up \c size to the maximum of
1603
- * the following three values for performance reasons:
1603
+ * The operation is currently implemented for the following numeric types:
1604
+ * ``VarType::Int32``, ``VarType::UInt32``, ``VarType::UInt64``,
1605
+ * ``VarType::Float32``, and ``VarType::Float64``.
1604
1606
*
1605
- * - the value 4,
1607
+ * Note that the CUDA implementation may round \c size to the maximum of the
1608
+ * following three values for performance and implementation-related reasons
1609
+ * (the prefix sum uses a tree-based parallelization scheme).
1610
+ *
1611
+ * - the value 4
1606
1612
* - the next highest power of two (when size <= 4096),
1607
1613
* - the next highest multiple of 2K (when size > 4096),
1608
1614
*
1609
1615
* For this reason, the supplied memory regions must be sufficiently large
1610
- * to avoid both out-of-bounds reads and writes. This is not an issue for
1611
- * memory obtained using \ref jit_malloc(), which internally rounds
1612
- * allocations to the next largest power of two and enforces a 64 byte minimum
1613
- * allocation size.
1616
+ * to avoid out-of-bounds reads and writes. This is not an issue for memory
1617
+ * obtained using \ref jit_malloc(), which internally rounds allocations to the
1618
+ * next largest power of two and enforces a 64 byte minimum allocation size.
1614
1619
*
1615
- * Runs asynchronously.
1620
+ * The CUDA backend implementation for *large* numeric types (double precision
1621
+ * floats, 64 bit integers) has the following technical limitation: when
1622
+ * reducing 64-bit integers, their values must be smaller than 2**62. When
1623
+ * reducing double precision arrays, the two least significant mantissa bits
1624
+ * are clamped to zero when forwarding the prefix from one 512-wide block to
1625
+ * the next (at a very minor loss in accuracy). See the implementation for
1626
+ * details on this.
1616
1627
*/
1617
- extern JIT_EXPORT void jit_scan_u32 (JIT_ENUM JitBackend backend , const uint32_t * in ,
1618
- uint32_t size , uint32_t * out );
1628
+ extern JIT_EXPORT void jit_prefix_sum (JIT_ENUM JitBackend backend ,
1629
+ JIT_ENUM VarType type , int exclusive ,
1630
+ const void * in , uint32_t size , void * out );
1619
1631
1620
1632
/**
1621
1633
* \brief Compress a mask into a list of nonzero indices
@@ -1625,7 +1637,7 @@ extern JIT_EXPORT void jit_scan_u32(JIT_ENUM JitBackend backend, const uint32_t
1625
1637
* indices of nonzero entries to \c out (in increasing order), and it
1626
1638
* furthermore returns the total number of nonzero mask entries.
1627
1639
*
1628
- * The internals resemble \ref jit_scan_u32 (), and the CUDA implementation may
1640
+ * The internals resemble \ref jit_prefix_sum (), and the CUDA implementation may
1629
1641
* similarly access regions beyond the end of \c in and \c out.
1630
1642
*
1631
1643
* This function internally performs a synchronization step.
0 commit comments