Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
64 commits
Select commit Hold shift + click to select a range
faf6ba0
initial esimd impl
danhoeflinger Jan 6, 2026
9be2bf1
refactor with shared infrastructure
danhoeflinger Jan 7, 2026
d3a21a0
minor fix
danhoeflinger Jan 8, 2026
ba8c913
filling in testing infrastructure for sycl impl
danhoeflinger Jan 8, 2026
8893215
revert erroneous change
danhoeflinger Jan 8, 2026
1f0e492
onesweep: implement SYCL histogram kernel (#2559)
mmichel11 Jan 9, 2026
af8f49f
onesweep feature branch: implement SYCL onesweep kernel (#2567)
mmichel11 Jan 28, 2026
785e40f
initial cleanup, break-up onesweep into smaller understandable functi…
mmichel11 Jan 30, 2026
1d70133
single work group with subgroup sort
mmichel11 Feb 2, 2026
5052372
extend single work group dpl sort to support out of place
mmichel11 Feb 2, 2026
56c63ce
remove in place template
mmichel11 Feb 3, 2026
eee3d7b
update checks on params
mmichel11 Feb 3, 2026
d1e3e4f
make global_histogram in sycl a struct
mmichel11 Feb 3, 2026
117a01e
Specializations of operator() for submitters instead of function disp…
mmichel11 Feb 3, 2026
fdc8e60
remove obsolete todos
mmichel11 Feb 3, 2026
9958840
unified test structure
mmichel11 Feb 3, 2026
0cc87e8
test util class
mmichel11 Feb 3, 2026
34407e8
remove duplicate tests
mmichel11 Feb 3, 2026
e7000bb
update includes
mmichel11 Feb 3, 2026
f5135a0
fix xe2 hangs with atomic fence after batches of relaxed loads
mmichel11 Feb 3, 2026
1472b10
fix scan kt include
mmichel11 Feb 3, 2026
57952e9
workaround issue with work group size 1024 on xe
mmichel11 Feb 3, 2026
4f28191
extend work group support
mmichel11 Feb 4, 2026
1d871b0
fix unskipped test cases when slm requirement is too big
mmichel11 Feb 4, 2026
a03814e
cleanup
mmichel11 Feb 4, 2026
0bf4d5e
remove unrelated changes
mmichel11 Feb 4, 2026
05ad7bf
more cleanup
mmichel11 Feb 4, 2026
83a76fe
::std -> std
mmichel11 Feb 4, 2026
0c910f1
Revert "::std -> std"
mmichel11 Feb 4, 2026
270ee24
limit wg size to 1024 due to observed crashes with 256
mmichel11 Feb 4, 2026
da457b5
::std -> std changes in diff only
mmichel11 Feb 5, 2026
9bcb8dd
restore change from main
mmichel11 Feb 5, 2026
4d605dc
remove redundant SYCL queries
mmichel11 Feb 5, 2026
868ad5d
remove auto and restructuring
mmichel11 Feb 5, 2026
3df3f7b
coop prototype
mmichel11 Feb 5, 2026
6ade124
Full development of cooperative impl with driver bug workaround
mmichel11 Feb 6, 2026
4c4ee7f
Add work group sizes 256, 512 as tested
mmichel11 Feb 6, 2026
3e4ad16
Better workaround for driver bug and fix suspected codegen issue in s…
mmichel11 Feb 6, 2026
213fecd
Formatting and work group restrictions
mmichel11 Feb 6, 2026
7ff2c75
adjust slm approximation in testing
mmichel11 Feb 6, 2026
f1b7e35
Remove duplication in tests
mmichel11 Feb 6, 2026
f1e6b18
Remove atomic id ptr allocation
mmichel11 Feb 6, 2026
29a58a6
Remove work group size of 256 for now
mmichel11 Feb 6, 2026
952db23
copyright updates
mmichel11 Feb 6, 2026
294d795
clang-format and minor cleanup
mmichel11 Feb 6, 2026
89b0b19
Remove root_sync property as we do not need it
mmichel11 Feb 7, 2026
4350725
Add version checking for required oneAPI extensions
mmichel11 Feb 10, 2026
2ae1ac6
remove unneeded slm offset query
mmichel11 Feb 10, 2026
902daa1
remove unused variables
mmichel11 Feb 10, 2026
60628ca
Add workaround for static_assert(false) pre-C++23
mmichel11 Feb 18, 2026
e658de9
Update comment on global fix and compute global fix in place
mmichel11 Feb 18, 2026
acac65d
Remove single work group sort
mmichel11 Feb 18, 2026
3b78a1d
fix typo in __always_false_v usage
mmichel11 Feb 18, 2026
6c0ab45
Add helper struct for histogram kernel params
mmichel11 Feb 18, 2026
2f1af13
Refactor __scan_input_t
mmichel11 Feb 18, 2026
9a1dc4b
::std -> std and missing sub-group barriers
mmichel11 Feb 18, 2026
a2a452b
Safer CMake string comparison
mmichel11 Feb 18, 2026
088bb09
reduce kernel bundle overhead for multi-device systems
mmichel11 Feb 23, 2026
f93c19d
__get_sycl_range changes
mmichel11 Feb 25, 2026
948d72f
Account for fixed SLM size granularities in __get_num_work_groups
mmichel11 Feb 25, 2026
1fa3931
Merge branch 'main' into dev/mmichel11/onesweep_kt_coop
mmichel11 Feb 27, 2026
761d233
Address review feedback
mmichel11 Mar 3, 2026
b9eb2fa
Address review feedback
mmichel11 Mar 4, 2026
096003e
Address minor suggestions and fix macro checks
mmichel11 Mar 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions include/oneapi/dpl/experimental/kernel_templates
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
# include "kt/esimd_radix_sort.h"
#endif

#include "kt/sycl_radix_sort.h"
#include "kt/single_pass_scan.h"

#endif // _ONEDPL_KERNEL_TEMPLATES
35 changes: 22 additions & 13 deletions include/oneapi/dpl/experimental/kt/esimd_radix_sort.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
#include "../../pstl/hetero/dpcpp/utils_ranges_sycl.h"

#include "internal/esimd_radix_sort_utils.h"
#include "internal/esimd_radix_sort_dispatchers.h"
#include "internal/radix_sort_utils.h"
#include "internal/radix_sort_dispatchers.h"
#include "../../pstl/utils.h"

namespace oneapi::dpl::experimental::kt::gpu::esimd
Expand All @@ -33,7 +34,8 @@ radix_sort(sycl::queue __q, _KeysRng&& __keys_rng, _KernelParam __param = {})
return {};

auto __pack = __impl::__rng_pack{oneapi::dpl::__ranges::views::all(::std::forward<_KeysRng>(__keys_rng))};
return __impl::__radix_sort<__is_ascending, __radix_bits, /*__in_place=*/true>(__q, __pack, __pack, __param);
return __impl::__radix_sort<__is_ascending, __radix_bits, /*__in_place=*/true>(
oneapi::dpl::experimental::kt::gpu::__impl::__esimd_tag{}, __q, __pack, __pack, __param);
}

template <bool __is_ascending = true, ::std::uint8_t __radix_bits = 8, typename _KernelParam, typename _KeysIterator>
Expand All @@ -48,7 +50,8 @@ radix_sort(sycl::queue __q, _KeysIterator __keys_first, _KeysIterator __keys_las
auto __keys_keep = oneapi::dpl::__ranges::__get_sycl_range<sycl::access_mode::read_write>();
auto __keys_rng = __keys_keep(__keys_first, __keys_last).all_view();
auto __pack = __impl::__rng_pack{::std::move(__keys_rng)};
return __impl::__radix_sort<__is_ascending, __radix_bits, /*__in_place=*/true>(__q, __pack, __pack, __param);
return __impl::__radix_sort<__is_ascending, __radix_bits, /*__in_place=*/true>(
oneapi::dpl::experimental::kt::gpu::__impl::__esimd_tag{}, __q, __pack, __pack, __param);
}

template <bool __is_ascending = true, ::std::uint8_t __radix_bits = 8, typename _KernelParam, typename _KeysRng,
Expand All @@ -63,7 +66,8 @@ radix_sort_by_key(sycl::queue __q, _KeysRng&& __keys_rng, _ValsRng&& __vals_rng,

auto __pack = __impl::__rng_pack{oneapi::dpl::__ranges::views::all(::std::forward<_KeysRng>(__keys_rng)),
oneapi::dpl::__ranges::views::all(::std::forward<_ValsRng>(__vals_rng))};
return __impl::__radix_sort<__is_ascending, __radix_bits, /*__in_place=*/true>(__q, __pack, __pack, __param);
return __impl::__radix_sort<__is_ascending, __radix_bits, /*__in_place=*/true>(
oneapi::dpl::experimental::kt::gpu::__impl::__esimd_tag{}, __q, __pack, __pack, __param);
}

template <bool __is_ascending = true, ::std::uint8_t __radix_bits = 8, typename _KernelParam, typename _KeysIterator,
Expand All @@ -83,7 +87,8 @@ radix_sort_by_key(sycl::queue __q, _KeysIterator __keys_first, _KeysIterator __k
auto __vals_keep = oneapi::dpl::__ranges::__get_sycl_range<sycl::access_mode::read_write>();
auto __vals_rng = __vals_keep(__vals_first, __vals_first + (__keys_last - __keys_first)).all_view();
auto __pack = __impl::__rng_pack{::std::move(__keys_rng), ::std::move(__vals_rng)};
return __impl::__radix_sort<__is_ascending, __radix_bits, /*__in_place=*/true>(__q, __pack, __pack, __param);
return __impl::__radix_sort<__is_ascending, __radix_bits, /*__in_place=*/true>(
oneapi::dpl::experimental::kt::gpu::__impl::__esimd_tag{}, __q, __pack, __pack, __param);
}

template <bool __is_ascending = true, ::std::uint8_t __radix_bits = 8, typename _KernelParam, typename _KeysRng1,
Expand All @@ -97,8 +102,9 @@ radix_sort(sycl::queue __q, _KeysRng1&& __keys_rng, _KeysRng2&& __keys_rng_out,

auto __pack = __impl::__rng_pack{oneapi::dpl::__ranges::views::all(::std::forward<_KeysRng1>(__keys_rng))};
auto __pack_out = __impl::__rng_pack{oneapi::dpl::__ranges::views::all(::std::forward<_KeysRng2>(__keys_rng_out))};
return __impl::__radix_sort<__is_ascending, __radix_bits, /*__in_place=*/false>(__q, ::std::move(__pack),
::std::move(__pack_out), __param);
return __impl::__radix_sort<__is_ascending, __radix_bits, /*__in_place=*/false>(
oneapi::dpl::experimental::kt::gpu::__impl::__esimd_tag{}, __q, std::move(__pack), std::move(__pack_out),
__param);
}

template <bool __is_ascending = true, ::std::uint8_t __radix_bits = 8, typename _KernelParam, typename _KeysIterator1,
Expand All @@ -119,8 +125,9 @@ radix_sort(sycl::queue __q, _KeysIterator1 __keys_first, _KeysIterator1 __keys_l
auto __keys_out_keep = oneapi::dpl::__ranges::__get_sycl_range<sycl::access_mode::read_write>();
auto __keys_out_rng = __keys_out_keep(__keys_out_first, __keys_out_first + __n).all_view();
auto __pack_out = __impl::__rng_pack{::std::move(__keys_out_rng)};
return __impl::__radix_sort<__is_ascending, __radix_bits, /*__in_place=*/false>(__q, ::std::move(__pack),
::std::move(__pack_out), __param);
return __impl::__radix_sort<__is_ascending, __radix_bits, /*__in_place=*/false>(
oneapi::dpl::experimental::kt::gpu::__impl::__esimd_tag{}, __q, std::move(__pack), std::move(__pack_out),
__param);
}

template <bool __is_ascending = true, ::std::uint8_t __radix_bits = 8, typename _KernelParam, typename _KeysRng1,
Expand All @@ -137,8 +144,9 @@ radix_sort_by_key(sycl::queue __q, _KeysRng1&& __keys_rng, _ValsRng1&& __vals_rn
oneapi::dpl::__ranges::views::all(::std::forward<_ValsRng1>(__vals_rng))};
auto __pack_out = __impl::__rng_pack{oneapi::dpl::__ranges::views::all(::std::forward<_KeysRng2>(__keys_out_rng)),
oneapi::dpl::__ranges::views::all(::std::forward<_ValsRng2>(__vals_out_rng))};
return __impl::__radix_sort<__is_ascending, __radix_bits, /*__in_place=*/false>(__q, ::std::move(__pack),
::std::move(__pack_out), __param);
return __impl::__radix_sort<__is_ascending, __radix_bits, /*__in_place=*/false>(
oneapi::dpl::experimental::kt::gpu::__impl::__esimd_tag{}, __q, std::move(__pack), std::move(__pack_out),
__param);
}

template <bool __is_ascending = true, ::std::uint8_t __radix_bits = 8, typename _KernelParam, typename _KeysIterator1,
Expand Down Expand Up @@ -166,8 +174,9 @@ radix_sort_by_key(sycl::queue __q, _KeysIterator1 __keys_first, _KeysIterator1 _
auto __vals_out_keep = oneapi::dpl::__ranges::__get_sycl_range<sycl::access_mode::read_write>();
auto __vals_out_rng = __vals_out_keep(__vals_out_first, __vals_out_first + __n).all_view();
auto __pack_out = __impl::__rng_pack{::std::move(__keys_out_rng), ::std::move(__vals_out_rng)};
return __impl::__radix_sort<__is_ascending, __radix_bits, /*__in_place=*/false>(__q, ::std::move(__pack),
::std::move(__pack_out), __param);
return __impl::__radix_sort<__is_ascending, __radix_bits, /*__in_place=*/false>(
oneapi::dpl::experimental::kt::gpu::__impl::__esimd_tag{}, __q, std::move(__pack), std::move(__pack_out),
__param);
}

} // namespace oneapi::dpl::experimental::kt::gpu::esimd
Expand Down
4 changes: 2 additions & 2 deletions include/oneapi/dpl/experimental/kt/internal/esimd_defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,13 @@

#define _ONEDPL_ESIMD_LSC_FENCE_PRESENT (_ONEDPL_LIBSYCL_VERSION >= 70200)

namespace oneapi::dpl::experimental::kt::gpu::esimd::__impl
namespace oneapi::dpl::experimental::kt::gpu::__impl
{

// TODO: rename to show the meaning clearly: default vectorization factor
constexpr int __data_per_step = 16;

} // namespace oneapi::dpl::experimental::kt::gpu::esimd::__impl
} // namespace oneapi::dpl::experimental::kt::gpu::__impl

// This namespace mostly consists of abstractions on the top of regular ESIMD functions.
// The purpose is:
Expand Down
Loading