NVIDIA · bernhardmgruber · Feb 6, 2025 · Nov 4, 2024 · Feb 2, 2025 · Feb 2, 2025
@@ -206,7 +206,9 @@ def __init__(self, value_dtype, size, is_output=False):
         super().__init__(value_dtype, is_output)
 
     def __repr__(self) -> str:
-        return f"Array(dtype={self.value_dtype}, out={self.is_output})"
+        return (
+            f"Array(dtype={self.value_dtype}, size={self.size}, out={self.is_output})"
+        )
 
     def cpp_decl(self, name):
         return f"{numba_type_to_cpp(self.value_dtype)} (&{name})[{self.size}]"
@@ -780,7 +782,7 @@ def codegen(context, builder, sig, args):
                             )
                             types.append(ir.PointerType(ir.IntType(8)))
                             arguments.append(void_ptr)
-                        if isinstance(param, Reference):
+                        elif isinstance(param, Reference):
                             if param.is_output:
                                 ptr = cgutils.alloca_once(
                                     builder, context.get_value_type(dtype)

@@ -8,6 +8,7 @@
 from cuda.cooperative.experimental._types import (
     Algorithm,
     Dependency,
+    DependentArray,
     DependentOperator,
     DependentReference,
     Invocable,
@@ -18,42 +19,48 @@
 )
 
 
-def reduce(dtype, threads_in_block, binary_op, methods=None):
-    """Computes a block-wide reduction for thread\ :sub:`0` using the specified binary reduction functor.
-    Each thread contributes one input element.
+def reduce(dtype, threads_in_block, binary_op, items_per_thread=1, methods=None):
+    """Creates an operation that computes a block-wide reduction for thread\ :sub:`0` using the
+    specified binary reduction functor.
+
+    Returns a callable object that can be linked to and invoked from device code. It can be
+    invoked with the following signatures:
+
+    - `(item: dtype) -> dtype)`: Each thread contributes a single item to the reduction.
+    - `(items: numba.types.Array) -> dtype`: Each thread contributes an array of items to the
+        reduction. The array must be 1D and contain at least `items_per_thread` items; only the
+        first `items_per_thread` items will be included in the reduction.
+    - `(item: dtype, num_valid: int) -> dtype`: The first `num_valid` threads contribute a
+        single item to the reduction. The items contributed by all other threads are ignored.
+
+    Args:
+        dtype: Data type being reduced
+        threads_in_block: The number of threads in a block
+        binary_op: Binary reduction function
+        items_per_thread: The number of items each thread contributes to the reduction
+        methods: A dict of methods for user-defined types
 
     Warning:
         The return value is undefined in threads other than thread\ :sub:`0`.
 
     Example:
-        The code snippet below illustrates a max reduction of 128 integer items that
-        are partitioned across 128 threads.
+        The code snippet below illustrates a max reduction of 128 integer items that are
+        partitioned across 128 threads.
 
         .. literalinclude:: ../../python/cuda_cooperative/tests/test_block_reduce_api.py
             :language: python
             :dedent:
             :start-after: example-begin imports
             :end-before: example-end imports
 
-        Below is the code snippet that demonstrates the usage of the ``reduce`` API:
-
         .. literalinclude:: ../../python/cuda_cooperative/tests/test_block_reduce_api.py
             :language: python
             :dedent:
             :start-after: example-begin reduce
             :end-before: example-end reduce
 
-        Suppose the set of inputs across the block of threads is
-        ``{ 0, 1, 2, 3, ..., 127 }``.
+        Suppose the set of inputs across the block of threads is ``{ 0, 1, 2, 3, ..., 127 }``.
         The corresponding output in the threads thread\ :sub:`0` will be ``{ 127 }``.
-
-    Args:
-        dtype: Data type being reduced
-        threads_in_block: The number of threads in a block
-        binary_op: Binary reduction function
-
-    Returns:
-        A callable object that can be linked to and invoked from a CUDA kernel
     """
     template = Algorithm(
         "BlockReduce",
@@ -62,6 +69,30 @@ def reduce(dtype, threads_in_block, binary_op, methods=None):
         ["cub/block/block_reduce.cuh"],
         [TemplateParameter("T"), TemplateParameter("BLOCK_DIM_X")],
         [
+            # Signatures:
+            # T Reduce(T(&)[ITEMS_PER_THREAD], Op);
+            [
+                Pointer(numba.uint8),
+                DependentArray(Dependency("T"), Dependency("ITEMS_PER_THREAD")),
+                DependentOperator(
+                    Dependency("T"),
+                    [Dependency("T"), Dependency("T")],
+                    Dependency("Op"),
+                ),
+                DependentReference(Dependency("T"), True),
+            ],
+            # T Reduce(T&, Op);
+            [
+                Pointer(numba.uint8),
+                DependentReference(Dependency("T")),
+                DependentOperator(
+                    Dependency("T"),
+                    [Dependency("T"), Dependency("T")],
+                    Dependency("Op"),
+                ),
+                DependentReference(Dependency("T"), True),
+            ],
+            # T Reduce(T&, Op, int num_valid);
             [
                 Pointer(numba.uint8),
                 DependentReference(Dependency("T")),
@@ -70,13 +101,19 @@ def reduce(dtype, threads_in_block, binary_op, methods=None):
                     [Dependency("T"), Dependency("T")],
                     Dependency("Op"),
                 ),
+                Value(numba.int32),
                 DependentReference(Dependency("T"), True),
-            ]
+            ],
         ],
         type_definitions=[numba_type_to_wrapper(dtype, methods=methods)],
     )
     specialization = template.specialize(
-        {"T": dtype, "BLOCK_DIM_X": threads_in_block, "Op": binary_op}
+        {
+            "T": dtype,
+            "BLOCK_DIM_X": threads_in_block,
+            "ITEMS_PER_THREAD": items_per_thread,
+            "Op": binary_op,
+        }
     )
 
     return Invocable(
@@ -89,41 +126,47 @@ def reduce(dtype, threads_in_block, binary_op, methods=None):
     )
 
 
-def sum(dtype, threads_in_block):
-    """Computes a block-wide reduction for thread\ :sub:`0` using addition (+) as the reduction operator.
-    Each thread contributes one input element.
+def sum(dtype, threads_in_block, items_per_thread=1, methods=None):
+    """Creates an operation that computes a block-wide reduction for thread\ :sub:`0` using
+    addition (+) as the reduction operator.
+
+    Returns a callable object that can be linked to and invoked from device code. It can be
+    invoked with the following signatures:
+
+    - `(item: dtype) -> dtype)`: Each thread contributes a single item to the reduction.
+    - `(items: numba.types.Array) -> dtype`: Each thread contributes an array of items to the
+        reduction. The array must be 1D and contain at least `items_per_thread` items; only the
+        first `items_per_thread` items will be included in the reduction.
+    - `(item: dtype, num_valid: int) -> dtype`: The first `num_valid` threads contribute a
+        single item to the reduction. The items contributed by all other threads are ignored.
+
+    Args:
+        dtype: Data type being reduced
+        threads_in_block: The number of threads in a block
+        items_per_thread: The number of items each thread owns
+        methods: A dict of methods for user-defined types
 
     Warning:
         The return value is undefined in threads other than thread\ :sub:`0`.
 
     Example:
-        The code snippet below illustrates a reduction of 128 integer items that
-        are partitioned across 128 threads.
+        The code snippet below illustrates a sum of 128 integer items that are partitioned
+        across 128 threads.
 
         .. literalinclude:: ../../python/cuda_cooperative/tests/test_block_reduce_api.py
             :language: python
             :dedent:
             :start-after: example-begin imports
             :end-before: example-end imports
 
-        Below is the code snippet that demonstrates the usage of the ``reduce`` API:
-
         .. literalinclude:: ../../python/cuda_cooperative/tests/test_block_reduce_api.py
             :language: python
             :dedent:
             :start-after: example-begin sum
             :end-before: example-end sum
 
-        Suppose the set of inputs across the block of threads is
-        ``{ 1, 1, 1, 1, ..., 1 }``.
+        Suppose the set of inputs across the block of threads is ``{ 1, 1, 1, 1, ..., 1 }``.
         The corresponding output in the threads thread\ :sub:`0` will be ``{ 128 }``.
-
-    Args:
-        dtype: Data type being reduced
-        threads_in_block: The number of threads in a block
-
-    Returns:
-        A callable object that can be linked to and invoked from a CUDA kernel
     """
     template = Algorithm(
         "BlockReduce",
@@ -132,20 +175,36 @@ def sum(dtype, threads_in_block):
         ["cub/block/block_reduce.cuh"],
         [TemplateParameter("T"), TemplateParameter("BLOCK_DIM_X")],
         [
+            # Signatures:
+            # T Sum(T(&)[ITEMS_PER_THREAD]);
+            [
+                Pointer(numba.uint8),
+                DependentArray(Dependency("T"), Dependency("ITEMS_PER_THREAD")),
+                DependentReference(Dependency("T"), True),
+            ],
+            # T Sum(T&);
             [
                 Pointer(numba.uint8),
                 DependentReference(Dependency("T")),
                 DependentReference(Dependency("T"), True),
             ],
+            # T Sum(T&, int num_valid);
             [
                 Pointer(numba.uint8),
                 DependentReference(Dependency("T")),
                 Value(numba.int32),
                 DependentReference(Dependency("T"), True),
             ],
         ],
+        type_definitions=[numba_type_to_wrapper(dtype, methods=methods)],
+    )
+    specialization = template.specialize(
+        {
+            "T": dtype,
+            "BLOCK_DIM_X": threads_in_block,
+            "ITEMS_PER_THREAD": items_per_thread,
+        }
     )
-    specialization = template.specialize({"T": dtype, "BLOCK_DIM_X": threads_in_block})
     return Invocable(
         temp_files=[
             make_binary_tempfile(ltoir, ".ltoir")