Skip to content

feat: reimplement ak.sum() kernel using cccl #3840

Draft
maxymnaumchyk wants to merge 4 commits into scikit-hep:main from
maxymnaumchyk:maxymnaumchyk/cccl-sum
Draft

feat: reimplement ak.sum() kernel using cccl #3840
maxymnaumchyk wants to merge 4 commits into scikit-hep:main from
maxymnaumchyk:maxymnaumchyk/cccl-sum

Conversation

@maxymnaumchyk
Copy link
Collaborator

I'll postpone working on this until #3833 is resolved.

@codecov
Copy link

codecov bot commented Feb 4, 2026

Codecov Report

❌ Patch coverage is 0% with 21 lines in your changes missing coverage. Please review.
✅ Project coverage is 82.59%. Comparing base (5a1e8ae) to head (87a51f2).

Files with missing lines Patch % Lines
src/awkward/_connect/cuda/_compute.py 0.00% 19 Missing ⚠️
src/awkward/_backends/cupy.py 0.00% 2 Missing ⚠️
Additional details and impacted files
Files with missing lines Coverage Δ
src/awkward/_connect/cuda/__init__.py 0.00% <ø> (ø)
src/awkward/_backends/cupy.py 41.66% <0.00%> (-1.82%) ⬇️
src/awkward/_connect/cuda/_compute.py 0.00% <0.00%> (ø)

... and 2 files with indirect coverage changes

🚀 New features to boost your workflow:
  • ❄️ Test Analytics: Detect flaky tests, report on failures, and find test suite problems.

@maxymnaumchyk
Copy link
Collaborator Author

For now, here is a comparison of the different possible implementations:

Script code
import awkward as ak
import cupy as cp
import numpy as np
import timeit

from cuda.compute import segmented_reduce
from cuda.compute import CountingIterator, unary_transform

def cccl_sum_segmented_reduce(awkward_array):
    """Sum every list of ``awkward_array`` with ``segmented_reduce``.

    The values buffer is read from ``awkward_array.layout.content.data`` and
    the list boundaries from ``awkward_array.layout.offsets.data``.

    Returns a ``float64`` CuPy array with one entry per list.
    """
    layout = awkward_array.layout
    values = layout.content.data
    boundaries = layout.offsets.data

    # Consecutive offsets delimit one segment: starts drop the last entry,
    # ends drop the first.
    segment_starts, segment_ends = boundaries[:-1], boundaries[1:]
    segment_count = segment_starts.size

    def add(lhs, rhs):
        return lhs + rhs

    # NumPy (host) array holding the initial value of the reduction.
    initial = np.array([0], dtype=cp.float64)
    result = cp.empty(segment_count, dtype=cp.float64)

    segmented_reduce(
        values,
        result,
        segment_starts,
        segment_ends,
        add,
        initial,
        segment_count,
    )
    return result

def cccl_sum_unary_transform(awkward_array):
    """Sum every list of ``awkward_array`` with ``unary_transform``.

    A ``CountingIterator`` starting at 0 is used as the input, and the
    per-segment operator slices the matching range out of the values buffer
    and sums it with ``np.sum``.

    Returns a CuPy array with the dtype of the values buffer and one entry
    per list.
    """
    layout = awkward_array.layout
    values = layout.content.data
    boundaries = layout.offsets.data

    # Consecutive offsets delimit one segment.
    segment_starts = boundaries[:-1]
    segment_ends = boundaries[1:]
    segment_count = segment_starts.size

    def segment_reduce_op(segment_id: cp.int64):
        lo = segment_starts[segment_id]
        hi = segment_ends[segment_id]
        chunk = values[lo:hi]
        if len(chunk) == 0:
            # An empty list contributes a sum of zero.
            return 0
        return np.sum(chunk)

    result = cp.empty(segment_count, dtype=values.dtype)

    # Apply the operator to `segment_count` consecutive ids, writing to `result`.
    unary_transform(
        CountingIterator(cp.int64(0)), result, segment_reduce_op, segment_count
    )
    return result

def cccl_sum_unary_transform_alt(awkward_array):
    """Sum every list of ``awkward_array`` with ``unary_transform`` (variant).

    Differs from ``cccl_sum_unary_transform`` in two ways: the per-segment
    operator reads both boundaries straight from ``offsets`` (no separate
    start/end views), and it sums with the builtin ``sum`` instead of
    ``np.sum``.

    Returns a CuPy array with the dtype of the values buffer and one entry
    per list.
    """
    def segment_reduce_op(segment_id: cp.int64):
        start_idx = offsets[segment_id]
        end_idx = offsets[segment_id + 1]
        segment = input_data[start_idx:end_idx]
        if not len(segment):
            # An empty list contributes a sum of zero.
            return 0
        return sum(segment)

    input_data = awkward_array.layout.content.data
    offsets = awkward_array.layout.offsets.data

    # `offsets` holds one more entry than there are lists.
    n_segments = offsets.size - 1
    output = cp.empty(n_segments, dtype=input_data.dtype)

    # Apply the operator to `n_segments` consecutive ids starting at 0.
    segment_ids = CountingIterator(cp.int64(0))
    unary_transform(segment_ids, output, segment_reduce_op, n_segments)

    return output

# Alternative tiny input for quick experiments:
# awkward_array = ak.Array([[1], [2, 3], [4, 5], [6, 7, 1, 8], [], [9]], backend = 'cuda')
PARQUET_PATH = "jnotebooks/random_listoffset_small.parquet"
N_REPEATS = 10


def _average_seconds(func, array, *, on_gpu, repeats=N_REPEATS, **kwargs):
    """Return the mean wall-clock seconds of ``func(array, **kwargs)``.

    One untimed warm-up call is made first. When ``on_gpu`` is true the
    current CUDA device is synchronized after the warm-up and after every
    timed call, so that outstanding device work is included in the
    measurement instead of leaking into the next one.
    """
    func(array, **kwargs)  # warmup
    if on_gpu:
        cp.cuda.Device().synchronize()

    start_time = timeit.default_timer()
    for _ in range(repeats):
        func(array, **kwargs)
        if on_gpu:
            cp.cuda.Device().synchronize()
    end_time = timeit.default_timer()
    return (end_time - start_time) / repeats


awkward_array = ak.to_backend(ak.from_parquet(PARQUET_PATH), 'cuda')

print("Comparison of different implementations:")

# first implementation, cccl_sum_segmented_reduce:
elapsed = _average_seconds(cccl_sum_segmented_reduce, awkward_array, on_gpu=True)
print(f"Time taken for cccl_sum_segmented_reduce: \t {elapsed} seconds")

# second implementation, cccl_sum_unary_transform:
elapsed = _average_seconds(cccl_sum_unary_transform, awkward_array, on_gpu=True)
print(f"Time taken for cccl_sum_unary_transform: \t {elapsed} seconds")

# third implementation, cccl_sum_unary_transform_alt:
elapsed = _average_seconds(cccl_sum_unary_transform_alt, awkward_array, on_gpu=True)
print(f"Time taken for cccl_sum_unary_transform_alt: \t {elapsed} seconds")

# Choosing the second implementation as the fastest.
# The timed call uses axis=1 (per-list sum) so that it measures the same
# operation as the warm-up and as the three kernels above.
elapsed = _average_seconds(ak.sum, awkward_array, on_gpu=True, axis=1)
print("----------------------------------------------------")
print("Choosing the second implementation as the fastest:")
print(f"Time taken for ak.sum on gpu: \t\t\t {elapsed} seconds")

# compare with ak.sum on CPU:
awkward_array = ak.to_backend(ak.from_parquet(PARQUET_PATH), 'cpu')
elapsed = _average_seconds(ak.sum, awkward_array, on_gpu=False, axis=1)
print(f"Time taken for ak.sum on cpu: \t\t\t {elapsed} seconds")

Results:

Comparison of different implementations:
Time taken for cccl_sum_segmented_reduce:        0.08073729889001698 seconds
Time taken for cccl_sum_unary_transform:         0.0012178715085610747 seconds
Time taken for cccl_sum_unary_transform_alt:     0.010394129692576826 seconds
----------------------------------------------------
Choosing the second implementation as the fastest:
Time taken for ak.sum on gpu:                    0.0182920197956264 seconds
Time taken for ak.sum on cpu:                    0.016345835593529045 seconds

@github-actions
Copy link

github-actions bot commented Feb 4, 2026

The documentation preview is ready to be viewed at http://preview.awkward-array.org.s3-website.us-east-1.amazonaws.com/PR3840

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant