-
Notifications
You must be signed in to change notification settings - Fork 67
bitonic_sort #940
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
bitonic_sort #940
Changes from 1 commit
5d6322a
264650c
268949e
73aa820
dcb5e6e
b84a4bd
779815e
78a307a
ad7a4c5
b80283a
7c91744
4d253f3
f03b8b2
555dcbe
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
| @@ -1,74 +1,155 @@ | ||||
| #ifndef NBL_BUILTIN_HLSL_SUBGROUP_BITONIC_SORT_INCLUDED | ||||
| #define NBL_BUILTIN_HLSL_SUBGROUP_BITONIC_SORT_INCLUDED | ||||
| #ifndef NBL_BUILTIN_HLSL_WORKGROUP_BITONIC_SORT_INCLUDED | ||||
| #define NBL_BUILTIN_HLSL_WORKGROUP_BITONIC_SORT_INCLUDED | ||||
| #include "nbl/builtin/hlsl/bitonic_sort/common.hlsl" | ||||
| #include "nbl/builtin/hlsl/memory_accessor.hlsl" | ||||
| #include "nbl/builtin/hlsl/functional.hlsl" | ||||
| #include "nbl/builtin/hlsl/subgroup/bitonic_sort.hlsl" | ||||
| #include "nbl/builtin/hlsl/bit.hlsl" | ||||
| #include "nbl/builtin/hlsl/workgroup/shuffle.hlsl" | ||||
| #include "nbl/builtin/hlsl/workgroup/basic.hlsl" | ||||
|
|
||||
| namespace nbl | ||||
| { | ||||
| namespace hlsl | ||||
| namespace hlsl | ||||
| { | ||||
| namespace workgroup | ||||
| { | ||||
| namespace bitonic_sort | ||||
| { | ||||
| // Reorder: non-type parameters FIRST, then typename parameters with defaults | ||||
| // This matches FFT's pattern and avoids DXC bugs | ||||
| template<uint16_t _ElementsPerInvocationLog2, uint16_t _WorkgroupSizeLog2, typename KeyType, typename ValueType, typename Comparator = less<KeyType> > | ||||
| struct bitonic_sort_config | ||||
| { | ||||
| using key_t = KeyType; | ||||
| using value_t = ValueType; | ||||
| using comparator_t = Comparator; | ||||
|
|
||||
| NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementsPerInvocationLog2 = _ElementsPerInvocationLog2; | ||||
| NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSizeLog2 = _WorkgroupSizeLog2; | ||||
|
|
||||
| NBL_CONSTEXPR_STATIC_INLINE uint32_t ElementsPerInvocation = 1u << ElementsPerInvocationLog2; | ||||
| NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkgroupSize = 1u << WorkgroupSizeLog2; | ||||
| }; | ||||
| } | ||||
|
|
||||
| template<typename Config, class device_capabilities = void> | ||||
| struct BitonicSort; | ||||
|
|
||||
|
|
||||
| template<uint16_t ElementsPerInvocationLog2, uint16_t WorkgroupSizeLog2, typename KeyType, typename ValueType, typename Comparator, class device_capabilities> | ||||
| struct BitonicSort<bitonic_sort::bitonic_sort_config<ElementsPerInvocationLog2, WorkgroupSizeLog2, KeyType, ValueType, Comparator>, device_capabilities> | ||||
| { | ||||
| using config_t = bitonic_sort::bitonic_sort_config<ElementsPerInvocationLog2, WorkgroupSizeLog2, KeyType, ValueType, Comparator>; | ||||
| using key_t = KeyType; | ||||
| using value_t = ValueType; | ||||
| using comparator_t = Comparator; | ||||
|
|
||||
| using SortConfig = subgroup::bitonic_sort_config<key_t, value_t, comparator_t>; | ||||
|
|
||||
| template<typename SharedMemoryAccessor> | ||||
| static void mergeStage(NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor, uint32_t stage, bool bitonicAscending, uint32_t invocationID, NBL_REF_ARG(key_t) loKey, NBL_REF_ARG(key_t) hiKey, | ||||
| NBL_REF_ARG(value_t) loVal, NBL_REF_ARG(value_t) hiVal) | ||||
| { | ||||
| namespace workgroup | ||||
| NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkgroupSize = config_t::WorkgroupSize; | ||||
| using adaptor_t = accessor_adaptors::StructureOfArrays<SharedMemoryAccessor, key_t, value_t, 1, WorkgroupSize>; | ||||
|
||||
| #define NBL_CONCEPT_NAME GenericSharedMemoryAccessor |
That concept basically states that the accessor can read and write uint32_ts. This is because shared memory (in most architectures) works with certain restrictions due to memory banking and size per transaction for each bank. It is the adaptor that is later in charge of reading/writing from/to shared memory with your actual type.
What you want here is to have a SharedMemoryAccessor accessing shared memory that is at least max(sizeof(key_t), sizeof(value_t)) * ArraySize bytes (this is unenforceable via concepts but you can make a utility for the config that returns this value so the user can allocate such an array).
Then you want TWO different adaptors: one is going to be
key_adaptor = accessor_adaptors::StructureOfArrays<SharedMemoryAccessor, key_t, uint32_t, 1, WorkgroupSize>
and the other is going to be
value_adaptor = accessor_adaptors::StructureOfArrays<SharedMemoryAccessor, value_t, uint32_t, 1, WorkgroupSize>
You would then shuffle the keys using the key adaptor, barrier, then shuffle the values using the value adaptor
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this would be a shuffle using the key_adaptor
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
and this one would be a shuffle using the value_adaptor
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Inbetween shuffles, your array is aliased. The first shuffle has the following behaviour: all threads write values, then there's a barrier (so all threads are done writing before they start reading) then they start reading. On the next shuffle, they immediately start writing again. If you don't barrier inbetween these shuffles, you risk writing before some other thread was done reading, overwriting what needed to be read. Between shuffles, therefore, you need to barrier to unalias the memory
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
might be wrong here but it feels like the formula on the right yields true even for the last stage (at that point the single 1 in the bitmask is too far to the left, so the result of the & is a 0)
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
pull this one out of the loop
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
NBL_CONSTEXPR_STATIC_INLINEresolves toconst staticwhen preprocessed. DXC is stupid and when it sees astaticit WILL initialize a variable. But if it seesconston its own it does compile it down to a constant, which is the behaviour you would expect (and what would happen in C++). So for now just replace these usages with justconst.@devshgraphicsprogramming do we just change this macro to resolve to
constin HLSL in master?