Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[cuda.cooperative] Add missing overloads to block.reduce and block.sum #2691

Merged
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,9 @@ def __init__(self, value_dtype, size, is_output=False):
super().__init__(value_dtype, is_output)

def __repr__(self) -> str:
return f"Array(dtype={self.value_dtype}, out={self.is_output})"
return (
f"Array(dtype={self.value_dtype}, size={self.size}, out={self.is_output})"
)

def cpp_decl(self, name):
return f"{numba_type_to_cpp(self.value_dtype)} (&{name})[{self.size}]"
Expand Down Expand Up @@ -780,7 +782,7 @@ def codegen(context, builder, sig, args):
)
types.append(ir.PointerType(ir.IntType(8)))
arguments.append(void_ptr)
if isinstance(param, Reference):
elif isinstance(param, Reference):
if param.is_output:
ptr = cgutils.alloca_once(
builder, context.get_value_type(dtype)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from cuda.cooperative.experimental._types import (
Algorithm,
Dependency,
DependentArray,
DependentOperator,
DependentReference,
Invocable,
Expand All @@ -18,42 +19,48 @@
)


def reduce(dtype, threads_in_block, binary_op, methods=None):
"""Computes a block-wide reduction for thread\ :sub:`0` using the specified binary reduction functor.
Each thread contributes one input element.
def reduce(dtype, threads_in_block, binary_op, items_per_thread=1, methods=None):
"""Creates an operation that computes a block-wide reduction for thread\ :sub:`0` using the
specified binary reduction functor.

Returns a callable object that can be linked to and invoked from device code. It can be
invoked with the following signatures:

- `(item: dtype) -> dtype`: Each thread contributes a single item to the reduction.
- `(items: numba.types.Array) -> dtype`: Each thread contributes an array of items to the
reduction. The array must be 1D and contain at least `items_per_thread` items; only the
first `items_per_thread` items will be included in the reduction.
- `(item: dtype, num_valid: int) -> dtype`: The first `num_valid` threads contribute a
single item to the reduction. The items contributed by all other threads are ignored.

Args:
dtype: Data type being reduced
threads_in_block: The number of threads in a block
binary_op: Binary reduction function
items_per_thread: The number of items each thread contributes to the reduction
methods: A dict of methods for user-defined types

Warning:
The return value is undefined in threads other than thread\ :sub:`0`.

Example:
The code snippet below illustrates a max reduction of 128 integer items that
are partitioned across 128 threads.
The code snippet below illustrates a max reduction of 128 integer items that are
partitioned across 128 threads.

.. literalinclude:: ../../python/cuda_cooperative/tests/test_block_reduce_api.py
:language: python
:dedent:
:start-after: example-begin imports
:end-before: example-end imports

Below is the code snippet that demonstrates the usage of the ``reduce`` API:

.. literalinclude:: ../../python/cuda_cooperative/tests/test_block_reduce_api.py
:language: python
:dedent:
:start-after: example-begin reduce
:end-before: example-end reduce

Suppose the set of inputs across the block of threads is
``{ 0, 1, 2, 3, ..., 127 }``.
Suppose the set of inputs across the block of threads is ``{ 0, 1, 2, 3, ..., 127 }``.
The corresponding output in thread\ :sub:`0` will be ``{ 127 }``.

Args:
dtype: Data type being reduced
threads_in_block: The number of threads in a block
binary_op: Binary reduction function

Returns:
A callable object that can be linked to and invoked from a CUDA kernel
"""
template = Algorithm(
"BlockReduce",
Expand All @@ -62,6 +69,30 @@ def reduce(dtype, threads_in_block, binary_op, methods=None):
["cub/block/block_reduce.cuh"],
[TemplateParameter("T"), TemplateParameter("BLOCK_DIM_X")],
[
# Signatures:
# T Reduce(T(&)[ITEMS_PER_THREAD], Op);
[
Pointer(numba.uint8),
DependentArray(Dependency("T"), Dependency("ITEMS_PER_THREAD")),
DependentOperator(
Dependency("T"),
[Dependency("T"), Dependency("T")],
Dependency("Op"),
),
DependentReference(Dependency("T"), True),
],
# T Reduce(T&, Op);
[
Pointer(numba.uint8),
DependentReference(Dependency("T")),
DependentOperator(
Dependency("T"),
[Dependency("T"), Dependency("T")],
Dependency("Op"),
),
DependentReference(Dependency("T"), True),
],
# T Reduce(T&, Op, int num_valid);
[
Pointer(numba.uint8),
DependentReference(Dependency("T")),
Expand All @@ -70,13 +101,19 @@ def reduce(dtype, threads_in_block, binary_op, methods=None):
[Dependency("T"), Dependency("T")],
Dependency("Op"),
),
Value(numba.int32),
DependentReference(Dependency("T"), True),
]
],
],
type_definitions=[numba_type_to_wrapper(dtype, methods=methods)],
)
specialization = template.specialize(
{"T": dtype, "BLOCK_DIM_X": threads_in_block, "Op": binary_op}
{
"T": dtype,
"BLOCK_DIM_X": threads_in_block,
"ITEMS_PER_THREAD": items_per_thread,
"Op": binary_op,
}
)

return Invocable(
Expand All @@ -89,41 +126,47 @@ def reduce(dtype, threads_in_block, binary_op, methods=None):
)


def sum(dtype, threads_in_block):
"""Computes a block-wide reduction for thread\ :sub:`0` using addition (+) as the reduction operator.
Each thread contributes one input element.
def sum(dtype, threads_in_block, items_per_thread=1, methods=None):
"""Creates an operation that computes a block-wide reduction for thread\ :sub:`0` using
addition (+) as the reduction operator.

Returns a callable object that can be linked to and invoked from device code. It can be
invoked with the following signatures:

- `(item: dtype) -> dtype`: Each thread contributes a single item to the reduction.
- `(items: numba.types.Array) -> dtype`: Each thread contributes an array of items to the
reduction. The array must be 1D and contain at least `items_per_thread` items; only the
first `items_per_thread` items will be included in the reduction.
- `(item: dtype, num_valid: int) -> dtype`: The first `num_valid` threads contribute a
single item to the reduction. The items contributed by all other threads are ignored.

Args:
dtype: Data type being reduced
threads_in_block: The number of threads in a block
items_per_thread: The number of items each thread owns
methods: A dict of methods for user-defined types

Warning:
The return value is undefined in threads other than thread\ :sub:`0`.

Example:
The code snippet below illustrates a reduction of 128 integer items that
are partitioned across 128 threads.
The code snippet below illustrates a sum of 128 integer items that are partitioned
across 128 threads.

.. literalinclude:: ../../python/cuda_cooperative/tests/test_block_reduce_api.py
:language: python
:dedent:
:start-after: example-begin imports
:end-before: example-end imports

Below is the code snippet that demonstrates the usage of the ``sum`` API:

.. literalinclude:: ../../python/cuda_cooperative/tests/test_block_reduce_api.py
:language: python
:dedent:
:start-after: example-begin sum
:end-before: example-end sum

Suppose the set of inputs across the block of threads is
``{ 1, 1, 1, 1, ..., 1 }``.
Suppose the set of inputs across the block of threads is ``{ 1, 1, 1, 1, ..., 1 }``.
The corresponding output in thread\ :sub:`0` will be ``{ 128 }``.

Args:
dtype: Data type being reduced
threads_in_block: The number of threads in a block

Returns:
A callable object that can be linked to and invoked from a CUDA kernel
"""
template = Algorithm(
"BlockReduce",
Expand All @@ -132,20 +175,36 @@ def sum(dtype, threads_in_block):
["cub/block/block_reduce.cuh"],
[TemplateParameter("T"), TemplateParameter("BLOCK_DIM_X")],
[
# Signatures:
# T Sum(T(&)[ITEMS_PER_THREAD]);
[
Pointer(numba.uint8),
DependentArray(Dependency("T"), Dependency("ITEMS_PER_THREAD")),
DependentReference(Dependency("T"), True),
],
# T Sum(T&);
[
Pointer(numba.uint8),
DependentReference(Dependency("T")),
DependentReference(Dependency("T"), True),
],
# T Sum(T&, int num_valid);
[
Pointer(numba.uint8),
DependentReference(Dependency("T")),
Value(numba.int32),
DependentReference(Dependency("T"), True),
],
],
type_definitions=[numba_type_to_wrapper(dtype, methods=methods)],
)
specialization = template.specialize(
{
"T": dtype,
"BLOCK_DIM_X": threads_in_block,
"ITEMS_PER_THREAD": items_per_thread,
}
)
specialization = template.specialize({"T": dtype, "BLOCK_DIM_X": threads_in_block})
return Invocable(
temp_files=[
make_binary_tempfile(ltoir, ".ltoir")
Expand Down
Loading
Loading