Skip to content

Commit ffa6fb4

Browse files
lwesierspszymich
authored and committed
Fix for reduction
For some work-group sizes the current implementation of reduction does not work correctly. The implementation in the repo assumes that all work-items will be executed, even if the work-group size isn't a multiple of the SIMD width. This change restores the SLM+barrier for the final calculation of the reduction (at the cost of a performance degradation). TODO: Remove the SLM+barrier and force execution of the whole reduction built-in function with NoMask at the asm level. (cherry picked from commit c147d7f)
1 parent 85661c2 commit ffa6fb4

File tree

1 file changed

+33
-25
lines changed

1 file changed

+33
-25
lines changed

IGC/BiFModule/Implementation/group.cl

+33-25
Original file line numberDiff line numberDiff line change
@@ -2484,33 +2484,41 @@ type __builtin_IB_WorkGroupReduce_##func##_##type_abbr(type X)
24842484
} \
24852485
SPIRV_BUILTIN(ControlBarrier, _i32_i32_i32, )(Workgroup, 0, AcquireRelease | WorkgroupMemory); \
24862486
\
2487-
type low_data; \
2488-
type high_data; \
2489-
type reduce; \
2490-
if (sg_size == 32) /* SIMD32 */ \
2487+
if(sg_id == 0) \
24912488
{ \
2492-
low_data = sg_lid < values_num ? scratch[sg_lid] : identity; \
2493-
high_data = sg_lid + 32 < values_num ? scratch[sg_lid + 32] : identity; \
2494-
/* 64 (from 64) elements reduces to 32 */ \
2495-
reduce = op(low_data, high_data); \
2496-
} \
2497-
else if(sg_size == 16) /* SIMD16 */ \
2498-
{ \
2499-
low_data = sg_lid < values_num ? scratch[sg_lid] : identity; \
2500-
type mid_low_data = sg_lid + 16 < values_num ? scratch[sg_lid + 16] : identity; \
2501-
type mid_high_data = sg_lid + 32 < values_num ? scratch[sg_lid + 32] : identity; \
2502-
high_data = sg_lid + 32 + 16 < values_num ? scratch[sg_lid + 32 + 16] : identity; \
2503-
/* 32 first part (from 64) elements reduces to 16 */ \
2504-
low_data = op(low_data, mid_low_data); \
2505-
/* 32 second part (from 64) elements reduces to 16 */ \
2506-
high_data = op(mid_high_data, high_data); \
2507-
/* 64 (from 64) elements reduces to 16 */ \
2508-
reduce = op(low_data, high_data); \
2509-
} \
2510-
/* SIMD8 is not available on PVC */ \
2489+
type low_data; \
2490+
type high_data; \
2491+
type reduce; \
25112492
\
2512-
sg_x = SPIRV_BUILTIN(Group##func, _i32_i32_##type_abbr, )(Subgroup, GroupOperationReduce, reduce); \
2513-
return sg_x; \
2493+
if (sg_size == 32) /* SIMD32 */ \
2494+
{ \
2495+
low_data = sg_lid < values_num ? scratch[sg_lid] : identity; \
2496+
high_data = sg_lid + 32 < values_num ? scratch[sg_lid + 32] : identity; \
2497+
/* 64 (from 64) elements reduces to 32 */ \
2498+
reduce = op(low_data, high_data); \
2499+
} \
2500+
else if(sg_size == 16) /* SIMD16 */ \
2501+
{ \
2502+
low_data = sg_lid < values_num ? scratch[sg_lid] : identity; \
2503+
type mid_low_data = sg_lid + 16 < values_num ? scratch[sg_lid + 16] : identity; \
2504+
type mid_high_data = sg_lid + 32 < values_num ? scratch[sg_lid + 32] : identity; \
2505+
high_data = sg_lid + 32 + 16 < values_num ? scratch[sg_lid + 32 + 16] : identity; \
2506+
/* 32 first part (from 64) elements reduces to 16 */ \
2507+
low_data = op(low_data, mid_low_data); \
2508+
/* 32 second part (from 64) elements reduces to 16 */ \
2509+
high_data = op(mid_high_data, high_data); \
2510+
/* 64 (from 64) elements reduces to 16 */ \
2511+
reduce = op(low_data, high_data); \
2512+
} \
2513+
/* SIMD8 is not available on PVC */ \
2514+
\
2515+
sg_x = SPIRV_BUILTIN(Group##func, _i32_i32_##type_abbr, )(Subgroup, GroupOperationReduce, reduce); \
2516+
if (sg_lid == 0) { \
2517+
scratch[0] = sg_x; \
2518+
} \
2519+
} \
2520+
SPIRV_BUILTIN(ControlBarrier, _i32_i32_i32, )(Workgroup, 0, AcquireRelease | WorkgroupMemory); \
2521+
return scratch[0]; \
25142522
} \
25152523
} \
25162524
else \

0 commit comments

Comments
 (0)