diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 74ea95bce8a47..29d1878aa8df6 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -2981,110 +2981,54 @@ llvm::Value *CGOpenMPRuntimeGPU::getXteamRedOperation( if (RedVarType->isIntegerTy()) { if (RedVarType->getPrimitiveSizeInBits() == 16) { - if (WarpSize == 32) { - return CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_s_32x32_fast_sum - : OMPRTL___kmpc_xteamr_s_32x32), - Args); - } else { - return CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_s_16x64_fast_sum - : OMPRTL___kmpc_xteamr_s_16x64), - Args); - } - } - if (RedVarType->getPrimitiveSizeInBits() == 32) { - if (WarpSize == 32) { - return CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_i_32x32_fast_sum - : OMPRTL___kmpc_xteamr_i_32x32), - Args); - } else { - return CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_i_16x64_fast_sum - : OMPRTL___kmpc_xteamr_i_16x64), - Args); - } - } - if (RedVarType->getPrimitiveSizeInBits() == 64) { - if (WarpSize == 32) { - return CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_l_32x32_fast_sum - : OMPRTL___kmpc_xteamr_l_32x32), - Args); - } else { - return CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_l_16x64_fast_sum - : OMPRTL___kmpc_xteamr_l_16x64), - Args); - } - } - } - if (RedVarType->isFloatTy()) { - if (WarpSize == 32) { return CGF.EmitRuntimeCall( OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), IsFast ? 
OMPRTL___kmpc_xteamr_f_32x32_fast_sum - : OMPRTL___kmpc_xteamr_f_32x32), - Args); - } else { - return CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_f_16x64_fast_sum - : OMPRTL___kmpc_xteamr_f_16x64), + CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_s_fast_sum + : OMPRTL___kmpc_xteamr_s), Args); } - } - if (RedVarType->isDoubleTy()) { - if (WarpSize == 32) { + if (RedVarType->getPrimitiveSizeInBits() == 32) { return CGF.EmitRuntimeCall( OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_d_32x32_fast_sum - : OMPRTL___kmpc_xteamr_d_32x32), + CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_i_fast_sum + : OMPRTL___kmpc_xteamr_i), Args); - } else { + } + if (RedVarType->getPrimitiveSizeInBits() == 64) { return CGF.EmitRuntimeCall( OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_d_16x64_fast_sum - : OMPRTL___kmpc_xteamr_d_16x64), + CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_l_fast_sum + : OMPRTL___kmpc_xteamr_l), Args); } } + if (RedVarType->isFloatTy()) { + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), + IsFast ? OMPRTL___kmpc_xteamr_f_fast_sum : OMPRTL___kmpc_xteamr_f), + Args); + } + if (RedVarType->isDoubleTy()) { + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), + IsFast ? OMPRTL___kmpc_xteamr_d_fast_sum : OMPRTL___kmpc_xteamr_d), + Args); + } if (RedVarType->isHalfTy()) { - if (WarpSize == 32) { - return CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_h_32x32_fast_sum - : OMPRTL___kmpc_xteamr_h_32x32), - Args); - } else { - return CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), IsFast ? 
OMPRTL___kmpc_xteamr_h_16x64_fast_sum - : OMPRTL___kmpc_xteamr_h_16x64), - Args); - } + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), + IsFast ? OMPRTL___kmpc_xteamr_h_fast_sum : OMPRTL___kmpc_xteamr_h), + Args); } if (RedVarType->isBFloatTy()) { - if (WarpSize == 32) { - return CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_bf_32x32_fast_sum - : OMPRTL___kmpc_xteamr_bf_32x32), - Args); - } else { - return CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_bf_16x64_fast_sum - : OMPRTL___kmpc_xteamr_bf_16x64), - Args); - } + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_bf_fast_sum + : OMPRTL___kmpc_xteamr_bf), + Args); } llvm_unreachable("No support for other types currently."); } diff --git a/clang/test/OpenMP/fast_red_codegen.cpp b/clang/test/OpenMP/fast_red_codegen.cpp index f62249249031e..d73af823ebeda 100644 --- a/clang/test/OpenMP/fast_red_codegen.cpp +++ b/clang/test/OpenMP/fast_red_codegen.cpp @@ -193,7 +193,7 @@ int main() // CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d_fast_sum(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) // CHECK-NEXT: ret void // // @@ -296,7 +296,7 @@ int main() // CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 
// CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d_fast_sum(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) // CHECK-NEXT: ret void // // @@ -415,7 +415,7 @@ int main() // CHECK-NEXT: [[TMP32:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP33:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP34:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP34]], ptr [[TMP2]], ptr [[TMP32]], ptr [[TMP33]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d_fast_sum(double [[TMP34]], ptr [[TMP2]], ptr [[TMP32]], ptr [[TMP33]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) // CHECK-NEXT: ret void // // @@ -533,7 +533,7 @@ int main() // CHECK-NEXT: [[TMP34:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP35:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8 // CHECK-NEXT: [[TMP36:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP36]], ptr [[TMP2]], ptr [[TMP34]], ptr [[TMP35]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP19]], i32 [[TMP18]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d_fast_sum(double [[TMP36]], ptr [[TMP2]], ptr [[TMP34]], ptr [[TMP35]], ptr 
@__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP19]], i32 [[TMP18]], i32 1) // CHECK-NEXT: ret void // // @@ -734,7 +734,7 @@ int main() // CHECK-NEXT: [[TMP54:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP55:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP56:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP56]], ptr [[TMP2]], ptr [[TMP54]], ptr [[TMP55]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP29]], i32 [[TMP30]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d_fast_sum(double [[TMP56]], ptr [[TMP2]], ptr [[TMP54]], ptr [[TMP55]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP29]], i32 [[TMP30]], i32 1) // CHECK-NEXT: ret void // // @@ -836,7 +836,7 @@ int main() // CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d_fast_sum(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) // CHECK-NEXT: ret void // // @@ -938,7 +938,7 @@ int main() // CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr 
@__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d_fast_sum(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) // CHECK-NEXT: ret void // // @@ -1040,7 +1040,7 @@ int main() // CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d_fast_sum(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) // CHECK-NEXT: ret void // // @@ -1142,7 +1142,7 @@ int main() // CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d_fast_sum(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) // CHECK-NEXT: ret void // // @@ -1257,7 +1257,7 @@ int main() // CHECK-NEXT: [[TMP33:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP34:%.*]] = load ptr, ptr 
[[DOTADDR3_ASCAST]], align 8 // CHECK-NEXT: [[TMP35:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 -// CHECK-NEXT: call void @__kmpc_xteamr_i_16x64_fast_sum(i32 [[TMP35]], ptr [[TMP2]], ptr [[TMP33]], ptr [[TMP34]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP19]], i32 [[TMP18]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_i_fast_sum(i32 [[TMP35]], ptr [[TMP2]], ptr [[TMP33]], ptr [[TMP34]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP19]], i32 [[TMP18]], i32 1) // CHECK-NEXT: ret void // // @@ -1372,7 +1372,7 @@ int main() // CHECK-NEXT: [[TMP33:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP34:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8 // CHECK-NEXT: [[TMP35:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 -// CHECK-NEXT: call void @__kmpc_xteamr_i_16x64_fast_sum(i32 [[TMP35]], ptr [[TMP2]], ptr [[TMP33]], ptr [[TMP34]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP19]], i32 [[TMP18]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_i_fast_sum(i32 [[TMP35]], ptr [[TMP2]], ptr [[TMP33]], ptr [[TMP34]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP19]], i32 [[TMP18]], i32 1) // CHECK-NEXT: ret void // // @@ -1488,7 +1488,7 @@ int main() // CHECK-NEXT: [[TMP34:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP35:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8 // CHECK-NEXT: [[TMP36:%.*]] = load i64, ptr addrspace(5) [[TMP7]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_l_16x64_fast_sum(i64 [[TMP36]], ptr [[TMP2]], ptr [[TMP34]], ptr [[TMP35]], ptr @__kmpc_rfun_sum_l, ptr @__kmpc_rfun_sum_lds_l, i64 0, i64 [[TMP19]], i32 [[TMP18]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_l_fast_sum(i64 [[TMP36]], ptr [[TMP2]], ptr [[TMP34]], ptr [[TMP35]], ptr @__kmpc_rfun_sum_l, ptr @__kmpc_rfun_sum_lds_l, i64 0, i64 [[TMP19]], i32 [[TMP18]], i32 1) // CHECK-NEXT: ret void // // @@ -1604,7 +1604,7 @@ int main() 
// CHECK-NEXT: [[TMP34:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP35:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8 // CHECK-NEXT: [[TMP36:%.*]] = load i64, ptr addrspace(5) [[TMP7]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_l_16x64_fast_sum(i64 [[TMP36]], ptr [[TMP2]], ptr [[TMP34]], ptr [[TMP35]], ptr @__kmpc_rfun_sum_l, ptr @__kmpc_rfun_sum_lds_l, i64 0, i64 [[TMP19]], i32 [[TMP18]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_l_fast_sum(i64 [[TMP36]], ptr [[TMP2]], ptr [[TMP34]], ptr [[TMP35]], ptr @__kmpc_rfun_sum_l, ptr @__kmpc_rfun_sum_lds_l, i64 0, i64 [[TMP19]], i32 [[TMP18]], i32 1) // CHECK-NEXT: ret void // // @@ -1776,6 +1776,6 @@ int main() // CHECK-NEXT: [[TMP42:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP43:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8 // CHECK-NEXT: [[TMP44:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP44]], ptr [[TMP2]], ptr [[TMP42]], ptr [[TMP43]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP19]], i32 [[TMP18]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d_fast_sum(double [[TMP44]], ptr [[TMP2]], ptr [[TMP42]], ptr [[TMP43]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP19]], i32 [[TMP18]], i32 1) // CHECK-NEXT: ret void // diff --git a/clang/test/OpenMP/multi_device_codegen.cpp b/clang/test/OpenMP/multi_device_codegen.cpp index 4206f84f0b894..1be257a46c313 100644 --- a/clang/test/OpenMP/multi_device_codegen.cpp +++ b/clang/test/OpenMP/multi_device_codegen.cpp @@ -207,7 +207,7 @@ int main() // CHECK-NEXT: [[TMP36:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8 // CHECK-NEXT: [[TMP37:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8 // CHECK-NEXT: [[TMP38:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP38]], ptr 
[[TMP4]], ptr [[TMP36]], ptr [[TMP37]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP24]], i32 [[TMP23]], i32 0) +// CHECK-NEXT: call void @__kmpc_xteamr_d_fast_sum(double [[TMP38]], ptr [[TMP4]], ptr [[TMP36]], ptr [[TMP37]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP24]], i32 [[TMP23]], i32 0) // CHECK-NEXT: ret void // // @@ -324,7 +324,7 @@ int main() // CHECK-NEXT: [[TMP36:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8 // CHECK-NEXT: [[TMP37:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8 // CHECK-NEXT: [[TMP38:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP38]], ptr [[TMP4]], ptr [[TMP36]], ptr [[TMP37]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP24]], i32 [[TMP23]], i32 0) +// CHECK-NEXT: call void @__kmpc_xteamr_d_fast_sum(double [[TMP38]], ptr [[TMP4]], ptr [[TMP36]], ptr [[TMP37]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP24]], i32 [[TMP23]], i32 0) // CHECK-NEXT: ret void // // @@ -457,7 +457,7 @@ int main() // CHECK-NEXT: [[TMP39:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8 // CHECK-NEXT: [[TMP40:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8 // CHECK-NEXT: [[TMP41:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP41]], ptr [[TMP4]], ptr [[TMP39]], ptr [[TMP40]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP24]], i32 [[TMP23]], i32 0) +// CHECK-NEXT: call void @__kmpc_xteamr_d_fast_sum(double [[TMP41]], ptr [[TMP4]], ptr [[TMP39]], ptr [[TMP40]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP24]], i32 [[TMP23]], i32 0) // CHECK-NEXT: ret void // // @@ -589,7 +589,7 @@ int main() // CHECK-NEXT: [[TMP41:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8 // 
CHECK-NEXT: [[TMP42:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 // CHECK-NEXT: [[TMP43:%.*]] = load double, ptr addrspace(5) [[TMP9]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP43]], ptr [[TMP4]], ptr [[TMP41]], ptr [[TMP42]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP26]], i32 [[TMP25]], i32 0) +// CHECK-NEXT: call void @__kmpc_xteamr_d_fast_sum(double [[TMP43]], ptr [[TMP4]], ptr [[TMP41]], ptr [[TMP42]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP26]], i32 [[TMP25]], i32 0) // CHECK-NEXT: ret void // // @@ -801,7 +801,7 @@ int main() // CHECK-NEXT: [[TMP58:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8 // CHECK-NEXT: [[TMP59:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8 // CHECK-NEXT: [[TMP60:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP60]], ptr [[TMP4]], ptr [[TMP58]], ptr [[TMP59]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP33]], i32 [[TMP34]], i32 0) +// CHECK-NEXT: call void @__kmpc_xteamr_d_fast_sum(double [[TMP60]], ptr [[TMP4]], ptr [[TMP58]], ptr [[TMP59]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP33]], i32 [[TMP34]], i32 0) // CHECK-NEXT: ret void // // @@ -917,7 +917,7 @@ int main() // CHECK-NEXT: [[TMP36:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8 // CHECK-NEXT: [[TMP37:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8 // CHECK-NEXT: [[TMP38:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP38]], ptr [[TMP4]], ptr [[TMP36]], ptr [[TMP37]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP24]], i32 [[TMP23]], i32 0) +// CHECK-NEXT: call void @__kmpc_xteamr_d_fast_sum(double [[TMP38]], ptr [[TMP4]], ptr [[TMP36]], ptr [[TMP37]], ptr 
@__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP24]], i32 [[TMP23]], i32 0) // CHECK-NEXT: ret void // // @@ -1033,7 +1033,7 @@ int main() // CHECK-NEXT: [[TMP36:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8 // CHECK-NEXT: [[TMP37:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8 // CHECK-NEXT: [[TMP38:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP38]], ptr [[TMP4]], ptr [[TMP36]], ptr [[TMP37]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP24]], i32 [[TMP23]], i32 0) +// CHECK-NEXT: call void @__kmpc_xteamr_d_fast_sum(double [[TMP38]], ptr [[TMP4]], ptr [[TMP36]], ptr [[TMP37]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP24]], i32 [[TMP23]], i32 0) // CHECK-NEXT: ret void // // @@ -1149,7 +1149,7 @@ int main() // CHECK-NEXT: [[TMP36:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8 // CHECK-NEXT: [[TMP37:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8 // CHECK-NEXT: [[TMP38:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP38]], ptr [[TMP4]], ptr [[TMP36]], ptr [[TMP37]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP24]], i32 [[TMP23]], i32 0) +// CHECK-NEXT: call void @__kmpc_xteamr_d_fast_sum(double [[TMP38]], ptr [[TMP4]], ptr [[TMP36]], ptr [[TMP37]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP24]], i32 [[TMP23]], i32 0) // CHECK-NEXT: ret void // // @@ -1265,7 +1265,7 @@ int main() // CHECK-NEXT: [[TMP36:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8 // CHECK-NEXT: [[TMP37:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8 // CHECK-NEXT: [[TMP38:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP38]], ptr [[TMP4]], ptr [[TMP36]], ptr 
[[TMP37]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP24]], i32 [[TMP23]], i32 0) +// CHECK-NEXT: call void @__kmpc_xteamr_d_fast_sum(double [[TMP38]], ptr [[TMP4]], ptr [[TMP36]], ptr [[TMP37]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP24]], i32 [[TMP23]], i32 0) // CHECK-NEXT: ret void // // @@ -1394,7 +1394,7 @@ int main() // CHECK-NEXT: [[TMP40:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8 // CHECK-NEXT: [[TMP41:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 // CHECK-NEXT: [[TMP42:%.*]] = load i32, ptr addrspace(5) [[TMP9]], align 4 -// CHECK-NEXT: call void @__kmpc_xteamr_i_16x64_fast_sum(i32 [[TMP42]], ptr [[TMP4]], ptr [[TMP40]], ptr [[TMP41]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP26]], i32 [[TMP25]], i32 0) +// CHECK-NEXT: call void @__kmpc_xteamr_i_fast_sum(i32 [[TMP42]], ptr [[TMP4]], ptr [[TMP40]], ptr [[TMP41]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP26]], i32 [[TMP25]], i32 0) // CHECK-NEXT: ret void // // @@ -1523,7 +1523,7 @@ int main() // CHECK-NEXT: [[TMP40:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8 // CHECK-NEXT: [[TMP41:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 // CHECK-NEXT: [[TMP42:%.*]] = load i32, ptr addrspace(5) [[TMP9]], align 4 -// CHECK-NEXT: call void @__kmpc_xteamr_i_16x64_fast_sum(i32 [[TMP42]], ptr [[TMP4]], ptr [[TMP40]], ptr [[TMP41]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP26]], i32 [[TMP25]], i32 0) +// CHECK-NEXT: call void @__kmpc_xteamr_i_fast_sum(i32 [[TMP42]], ptr [[TMP4]], ptr [[TMP40]], ptr [[TMP41]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP26]], i32 [[TMP25]], i32 0) // CHECK-NEXT: ret void // // @@ -1653,7 +1653,7 @@ int main() // CHECK-NEXT: [[TMP41:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8 // CHECK-NEXT: [[TMP42:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 // CHECK-NEXT: [[TMP43:%.*]] 
= load i64, ptr addrspace(5) [[TMP9]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_l_16x64_fast_sum(i64 [[TMP43]], ptr [[TMP4]], ptr [[TMP41]], ptr [[TMP42]], ptr @__kmpc_rfun_sum_l, ptr @__kmpc_rfun_sum_lds_l, i64 0, i64 [[TMP26]], i32 [[TMP25]], i32 0) +// CHECK-NEXT: call void @__kmpc_xteamr_l_fast_sum(i64 [[TMP43]], ptr [[TMP4]], ptr [[TMP41]], ptr [[TMP42]], ptr @__kmpc_rfun_sum_l, ptr @__kmpc_rfun_sum_lds_l, i64 0, i64 [[TMP26]], i32 [[TMP25]], i32 0) // CHECK-NEXT: ret void // // @@ -1783,7 +1783,7 @@ int main() // CHECK-NEXT: [[TMP41:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8 // CHECK-NEXT: [[TMP42:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 // CHECK-NEXT: [[TMP43:%.*]] = load i64, ptr addrspace(5) [[TMP9]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_l_16x64_fast_sum(i64 [[TMP43]], ptr [[TMP4]], ptr [[TMP41]], ptr [[TMP42]], ptr @__kmpc_rfun_sum_l, ptr @__kmpc_rfun_sum_lds_l, i64 0, i64 [[TMP26]], i32 [[TMP25]], i32 0) +// CHECK-NEXT: call void @__kmpc_xteamr_l_fast_sum(i64 [[TMP43]], ptr [[TMP4]], ptr [[TMP41]], ptr [[TMP42]], ptr @__kmpc_rfun_sum_l, ptr @__kmpc_rfun_sum_lds_l, i64 0, i64 [[TMP26]], i32 [[TMP25]], i32 0) // CHECK-NEXT: ret void // // @@ -1969,6 +1969,6 @@ int main() // CHECK-NEXT: [[TMP49:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8 // CHECK-NEXT: [[TMP50:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 // CHECK-NEXT: [[TMP51:%.*]] = load double, ptr addrspace(5) [[TMP9]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP51]], ptr [[TMP4]], ptr [[TMP49]], ptr [[TMP50]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP26]], i32 [[TMP25]], i32 0) +// CHECK-NEXT: call void @__kmpc_xteamr_d_fast_sum(double [[TMP51]], ptr [[TMP4]], ptr [[TMP49]], ptr [[TMP50]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP26]], i32 [[TMP25]], i32 0) // CHECK-NEXT: ret void // diff --git a/clang/test/OpenMP/xteam_red_callee.cpp 
b/clang/test/OpenMP/xteam_red_callee.cpp index 2c3317df7026d..386cd0876c2a2 100644 --- a/clang/test/OpenMP/xteam_red_callee.cpp +++ b/clang/test/OpenMP/xteam_red_callee.cpp @@ -903,7 +903,7 @@ int main() // CHECK-NEXT: [[TMP27:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP28:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP29:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64(double [[TMP29]], ptr [[TMP2]], ptr [[TMP27]], ptr [[TMP28]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d(double [[TMP29]], ptr [[TMP2]], ptr [[TMP27]], ptr [[TMP28]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) // CHECK-NEXT: ret void // // @@ -2497,6 +2497,6 @@ int main() // CHECK-NEXT: [[TMP27:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP28:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP29:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64(double [[TMP29]], ptr [[TMP2]], ptr [[TMP27]], ptr [[TMP28]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d(double [[TMP29]], ptr [[TMP2]], ptr [[TMP27]], ptr [[TMP28]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) // CHECK-NEXT: ret void // diff --git a/clang/test/OpenMP/xteam_red_codegen.cpp b/clang/test/OpenMP/xteam_red_codegen.cpp index 39e25b1b98db8..7ad033508a219 100644 --- a/clang/test/OpenMP/xteam_red_codegen.cpp +++ b/clang/test/OpenMP/xteam_red_codegen.cpp @@ -193,7 +193,7 @@ int main() // CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr 
[[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) // CHECK-NEXT: ret void // // @@ -296,7 +296,7 @@ int main() // CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) // CHECK-NEXT: ret void // // @@ -415,7 +415,7 @@ int main() // CHECK-NEXT: [[TMP32:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP33:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP34:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64(double [[TMP34]], ptr [[TMP2]], ptr [[TMP32]], ptr [[TMP33]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d(double [[TMP34]], ptr [[TMP2]], ptr [[TMP32]], ptr [[TMP33]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) // 
CHECK-NEXT: ret void // // @@ -533,7 +533,7 @@ int main() // CHECK-NEXT: [[TMP34:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP35:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8 // CHECK-NEXT: [[TMP36:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64(double [[TMP36]], ptr [[TMP2]], ptr [[TMP34]], ptr [[TMP35]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP19]], i32 [[TMP18]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d(double [[TMP36]], ptr [[TMP2]], ptr [[TMP34]], ptr [[TMP35]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP19]], i32 [[TMP18]], i32 1) // CHECK-NEXT: ret void // // @@ -734,7 +734,7 @@ int main() // CHECK-NEXT: [[TMP54:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP55:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP56:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64(double [[TMP56]], ptr [[TMP2]], ptr [[TMP54]], ptr [[TMP55]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP29]], i32 [[TMP30]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d(double [[TMP56]], ptr [[TMP2]], ptr [[TMP54]], ptr [[TMP55]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP29]], i32 [[TMP30]], i32 1) // CHECK-NEXT: ret void // // @@ -836,7 +836,7 @@ int main() // CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) +// CHECK-NEXT: call void 
@__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) // CHECK-NEXT: ret void // // @@ -938,7 +938,7 @@ int main() // CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) // CHECK-NEXT: ret void // // @@ -1040,7 +1040,7 @@ int main() // CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) // CHECK-NEXT: ret void // // @@ -1142,7 +1142,7 @@ int main() // CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr 
[[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) // CHECK-NEXT: ret void // // @@ -1257,7 +1257,7 @@ int main() // CHECK-NEXT: [[TMP33:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP34:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8 // CHECK-NEXT: [[TMP35:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 -// CHECK-NEXT: call void @__kmpc_xteamr_i_16x64(i32 [[TMP35]], ptr [[TMP2]], ptr [[TMP33]], ptr [[TMP34]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP19]], i32 [[TMP18]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_i(i32 [[TMP35]], ptr [[TMP2]], ptr [[TMP33]], ptr [[TMP34]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP19]], i32 [[TMP18]], i32 1) // CHECK-NEXT: ret void // // @@ -1372,7 +1372,7 @@ int main() // CHECK-NEXT: [[TMP33:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP34:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8 // CHECK-NEXT: [[TMP35:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 -// CHECK-NEXT: call void @__kmpc_xteamr_i_16x64(i32 [[TMP35]], ptr [[TMP2]], ptr [[TMP33]], ptr [[TMP34]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP19]], i32 [[TMP18]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_i(i32 [[TMP35]], ptr [[TMP2]], ptr [[TMP33]], ptr [[TMP34]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP19]], i32 [[TMP18]], i32 1) // CHECK-NEXT: ret void // // @@ -1488,7 +1488,7 @@ int main() // CHECK-NEXT: [[TMP34:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP35:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8 // CHECK-NEXT: [[TMP36:%.*]] = load i64, ptr 
addrspace(5) [[TMP7]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_l_16x64(i64 [[TMP36]], ptr [[TMP2]], ptr [[TMP34]], ptr [[TMP35]], ptr @__kmpc_rfun_sum_l, ptr @__kmpc_rfun_sum_lds_l, i64 0, i64 [[TMP19]], i32 [[TMP18]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_l(i64 [[TMP36]], ptr [[TMP2]], ptr [[TMP34]], ptr [[TMP35]], ptr @__kmpc_rfun_sum_l, ptr @__kmpc_rfun_sum_lds_l, i64 0, i64 [[TMP19]], i32 [[TMP18]], i32 1) // CHECK-NEXT: ret void // // @@ -1604,7 +1604,7 @@ int main() // CHECK-NEXT: [[TMP34:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP35:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8 // CHECK-NEXT: [[TMP36:%.*]] = load i64, ptr addrspace(5) [[TMP7]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_l_16x64(i64 [[TMP36]], ptr [[TMP2]], ptr [[TMP34]], ptr [[TMP35]], ptr @__kmpc_rfun_sum_l, ptr @__kmpc_rfun_sum_lds_l, i64 0, i64 [[TMP19]], i32 [[TMP18]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_l(i64 [[TMP36]], ptr [[TMP2]], ptr [[TMP34]], ptr [[TMP35]], ptr @__kmpc_rfun_sum_l, ptr @__kmpc_rfun_sum_lds_l, i64 0, i64 [[TMP19]], i32 [[TMP18]], i32 1) // CHECK-NEXT: ret void // // @@ -1776,6 +1776,6 @@ int main() // CHECK-NEXT: [[TMP42:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP43:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8 // CHECK-NEXT: [[TMP44:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64(double [[TMP44]], ptr [[TMP2]], ptr [[TMP42]], ptr [[TMP43]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP19]], i32 [[TMP18]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d(double [[TMP44]], ptr [[TMP2]], ptr [[TMP42]], ptr [[TMP43]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP19]], i32 [[TMP18]], i32 1) // CHECK-NEXT: ret void // diff --git a/clang/test/OpenMP/xteam_red_min_max.cpp b/clang/test/OpenMP/xteam_red_min_max.cpp index 
e86b16bb7e3ee..f037d08f680b2 100644 --- a/clang/test/OpenMP/xteam_red_min_max.cpp +++ b/clang/test/OpenMP/xteam_red_min_max.cpp @@ -118,7 +118,7 @@ int main() // CHECK-NEXT: [[TMP25:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP26:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP27:%.*]] = load i16, ptr addrspace(5) [[TMP3]], align 2 -// CHECK-NEXT: call void @__kmpc_xteamr_s_16x64(i16 [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_min_s, ptr @__kmpc_rfun_min_lds_s, i16 32767, i64 [[TMP12]], i32 [[TMP11]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_s(i16 [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_min_s, ptr @__kmpc_rfun_min_lds_s, i16 32767, i64 [[TMP12]], i32 [[TMP11]], i32 1) // CHECK-NEXT: ret void // // @@ -203,7 +203,7 @@ int main() // CHECK-NEXT: [[TMP25:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP26:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP27:%.*]] = load i16, ptr addrspace(5) [[TMP3]], align 2 -// CHECK-NEXT: call void @__kmpc_xteamr_s_16x64(i16 [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_max_s, ptr @__kmpc_rfun_max_lds_s, i16 -32768, i64 [[TMP12]], i32 [[TMP11]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_s(i16 [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_max_s, ptr @__kmpc_rfun_max_lds_s, i16 -32768, i64 [[TMP12]], i32 [[TMP11]], i32 1) // CHECK-NEXT: ret void // // @@ -1546,7 +1546,7 @@ int main() // CHECK-NEXT: [[TMP25:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP26:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr addrspace(5) [[TMP3]], align 4 -// CHECK-NEXT: call void @__kmpc_xteamr_i_16x64(i32 [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_min_i, ptr @__kmpc_rfun_min_lds_i, i32 2147483647, i64 [[TMP12]], i32 [[TMP11]], i32 1) +// 
CHECK-NEXT: call void @__kmpc_xteamr_i(i32 [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_min_i, ptr @__kmpc_rfun_min_lds_i, i32 2147483647, i64 [[TMP12]], i32 [[TMP11]], i32 1) // CHECK-NEXT: ret void // // @@ -1631,7 +1631,7 @@ int main() // CHECK-NEXT: [[TMP25:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP26:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr addrspace(5) [[TMP3]], align 4 -// CHECK-NEXT: call void @__kmpc_xteamr_i_16x64(i32 [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_max_i, ptr @__kmpc_rfun_max_lds_i, i32 -2147483648, i64 [[TMP12]], i32 [[TMP11]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_i(i32 [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_max_i, ptr @__kmpc_rfun_max_lds_i, i32 -2147483648, i64 [[TMP12]], i32 [[TMP11]], i32 1) // CHECK-NEXT: ret void // // @@ -2958,7 +2958,7 @@ int main() // CHECK-NEXT: [[TMP25:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP26:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP27:%.*]] = load i64, ptr addrspace(5) [[TMP3]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_l_16x64(i64 [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_min_l, ptr @__kmpc_rfun_min_lds_l, i64 9223372036854775807, i64 [[TMP12]], i32 [[TMP11]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_l(i64 [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_min_l, ptr @__kmpc_rfun_min_lds_l, i64 9223372036854775807, i64 [[TMP12]], i32 [[TMP11]], i32 1) // CHECK-NEXT: ret void // // @@ -3043,7 +3043,7 @@ int main() // CHECK-NEXT: [[TMP25:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP26:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP27:%.*]] = load i64, ptr addrspace(5) [[TMP3]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_l_16x64(i64 [[TMP27]], ptr [[TMP2]], 
ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_max_l, ptr @__kmpc_rfun_max_lds_l, i64 -9223372036854775808, i64 [[TMP12]], i32 [[TMP11]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_l(i64 [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_max_l, ptr @__kmpc_rfun_max_lds_l, i64 -9223372036854775808, i64 [[TMP12]], i32 [[TMP11]], i32 1) // CHECK-NEXT: ret void // // @@ -4430,7 +4430,7 @@ int main() // CHECK-NEXT: [[TMP25:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP26:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP27:%.*]] = load float, ptr addrspace(5) [[TMP3]], align 4 -// CHECK-NEXT: call void @__kmpc_xteamr_f_16x64(float [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_min_f, ptr @__kmpc_rfun_min_lds_f, float 0x7FF0000000000000, i64 [[TMP12]], i32 [[TMP11]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_f(float [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_min_f, ptr @__kmpc_rfun_min_lds_f, float 0x7FF0000000000000, i64 [[TMP12]], i32 [[TMP11]], i32 1) // CHECK-NEXT: ret void // // @@ -4515,7 +4515,7 @@ int main() // CHECK-NEXT: [[TMP25:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP26:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP27:%.*]] = load float, ptr addrspace(5) [[TMP3]], align 4 -// CHECK-NEXT: call void @__kmpc_xteamr_f_16x64(float [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_max_f, ptr @__kmpc_rfun_max_lds_f, float 0xFFF0000000000000, i64 [[TMP12]], i32 [[TMP11]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_f(float [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_max_f, ptr @__kmpc_rfun_max_lds_f, float 0xFFF0000000000000, i64 [[TMP12]], i32 [[TMP11]], i32 1) // CHECK-NEXT: ret void // // @@ -4598,7 +4598,7 @@ int main() // CHECK-NEXT: [[TMP24:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP25:%.*]] = load 
ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP26:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64(double [[TMP26]], ptr [[TMP2]], ptr [[TMP24]], ptr [[TMP25]], ptr @__kmpc_rfun_min_d, ptr @__kmpc_rfun_min_lds_d, double 0x7FF0000000000000, i64 [[TMP12]], i32 [[TMP11]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d(double [[TMP26]], ptr [[TMP2]], ptr [[TMP24]], ptr [[TMP25]], ptr @__kmpc_rfun_min_d, ptr @__kmpc_rfun_min_lds_d, double 0x7FF0000000000000, i64 [[TMP12]], i32 [[TMP11]], i32 1) // CHECK-NEXT: ret void // // @@ -4681,6 +4681,6 @@ int main() // CHECK-NEXT: [[TMP24:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP25:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP26:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64(double [[TMP26]], ptr [[TMP2]], ptr [[TMP24]], ptr [[TMP25]], ptr @__kmpc_rfun_max_d, ptr @__kmpc_rfun_max_lds_d, double 0xFFF0000000000000, i64 [[TMP12]], i32 [[TMP11]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d(double [[TMP26]], ptr [[TMP2]], ptr [[TMP24]], ptr [[TMP25]], ptr @__kmpc_rfun_max_d, ptr @__kmpc_rfun_max_lds_d, double 0xFFF0000000000000, i64 [[TMP12]], i32 [[TMP11]], i32 1) // CHECK-NEXT: ret void // diff --git a/clang/test/OpenMP/xteam_red_min_max_fast_reduction.c b/clang/test/OpenMP/xteam_red_min_max_fast_reduction.c index fa05383971510..149447bba99fc 100644 --- a/clang/test/OpenMP/xteam_red_min_max_fast_reduction.c +++ b/clang/test/OpenMP/xteam_red_min_max_fast_reduction.c @@ -1066,6 +1066,6 @@ int main() // CHECK-NEXT: [[TMP25:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP26:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP27:%.*]] = load float, ptr addrspace(5) [[TMP4]], align 4 -// CHECK-NEXT: call void @__kmpc_xteamr_f_16x64_fast_sum(float [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr 
@__kmpc_rfun_sum_f, ptr @__kmpc_rfun_sum_lds_f, float 0.000000e+00, i64 [[TMP13]], i32 [[TMP12]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_f_fast_sum(float [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_sum_f, ptr @__kmpc_rfun_sum_lds_f, float 0.000000e+00, i64 [[TMP13]], i32 [[TMP12]], i32 1) // CHECK-NEXT: ret void // diff --git a/clang/test/OpenMP/xteam_red_min_max_multi_device.c b/clang/test/OpenMP/xteam_red_min_max_multi_device.c index e8bf3d270de6f..be76a495a27ee 100644 --- a/clang/test/OpenMP/xteam_red_min_max_multi_device.c +++ b/clang/test/OpenMP/xteam_red_min_max_multi_device.c @@ -937,6 +937,6 @@ int main() // CHECK-NEXT: [[TMP32:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8 // CHECK-NEXT: [[TMP33:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8 // CHECK-NEXT: [[TMP34:%.*]] = load float, ptr addrspace(5) [[TMP6]], align 4 -// CHECK-NEXT: call void @__kmpc_xteamr_f_16x64_fast_sum(float [[TMP34]], ptr [[TMP4]], ptr [[TMP32]], ptr [[TMP33]], ptr @__kmpc_rfun_sum_f, ptr @__kmpc_rfun_sum_lds_f, float 0.000000e+00, i64 [[TMP20]], i32 [[TMP19]], i32 0) +// CHECK-NEXT: call void @__kmpc_xteamr_f_fast_sum(float [[TMP34]], ptr [[TMP4]], ptr [[TMP32]], ptr [[TMP33]], ptr @__kmpc_rfun_sum_f, ptr @__kmpc_rfun_sum_lds_f, float 0.000000e+00, i64 [[TMP20]], i32 [[TMP19]], i32 0) // CHECK-NEXT: ret void // diff --git a/clang/test/OpenMP/xteam_red_min_max_small_precision.c b/clang/test/OpenMP/xteam_red_min_max_small_precision.c index 3963eb6fb4cf3..8457cff160292 100644 --- a/clang/test/OpenMP/xteam_red_min_max_small_precision.c +++ b/clang/test/OpenMP/xteam_red_min_max_small_precision.c @@ -130,7 +130,7 @@ int main() { // CHECK-NEXT: [[TMP25:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP26:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP27:%.*]] = load half, ptr addrspace(5) [[TMP4]], align 2 -// CHECK-NEXT: call void @__kmpc_xteamr_h_16x64(half [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], 
ptr [[TMP26]], ptr @__kmpc_rfun_min_h, ptr @__kmpc_rfun_min_lds_h, half 0xH7C00, i64 [[TMP13]], i32 [[TMP12]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_h(half [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_min_h, ptr @__kmpc_rfun_min_lds_h, half 0xH7C00, i64 [[TMP13]], i32 [[TMP12]], i32 1) // CHECK-NEXT: ret void // // @@ -215,7 +215,7 @@ int main() { // CHECK-NEXT: [[TMP25:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP26:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP27:%.*]] = load bfloat, ptr addrspace(5) [[TMP4]], align 2 -// CHECK-NEXT: call void @__kmpc_xteamr_bf_16x64(bfloat [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_min_bf, ptr @__kmpc_rfun_min_lds_bf, bfloat 0xR7F80, i64 [[TMP13]], i32 [[TMP12]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_bf(bfloat [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_min_bf, ptr @__kmpc_rfun_min_lds_bf, bfloat 0xR7F80, i64 [[TMP13]], i32 [[TMP12]], i32 1) // CHECK-NEXT: ret void // // @@ -300,7 +300,7 @@ int main() { // CHECK-NEXT: [[TMP25:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP26:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP27:%.*]] = load i16, ptr addrspace(5) [[TMP4]], align 2 -// CHECK-NEXT: call void @__kmpc_xteamr_s_16x64(i16 [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_min_s, ptr @__kmpc_rfun_min_lds_s, i16 32767, i64 [[TMP13]], i32 [[TMP12]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_s(i16 [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_min_s, ptr @__kmpc_rfun_min_lds_s, i16 32767, i64 [[TMP13]], i32 [[TMP12]], i32 1) // CHECK-NEXT: ret void // // @@ -385,7 +385,7 @@ int main() { // CHECK-NEXT: [[TMP25:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP26:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP27:%.*]] = load half, ptr 
addrspace(5) [[TMP4]], align 2 -// CHECK-NEXT: call void @__kmpc_xteamr_h_16x64(half [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_max_h, ptr @__kmpc_rfun_max_lds_h, half 0xHFC00, i64 [[TMP13]], i32 [[TMP12]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_h(half [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_max_h, ptr @__kmpc_rfun_max_lds_h, half 0xHFC00, i64 [[TMP13]], i32 [[TMP12]], i32 1) // CHECK-NEXT: ret void // // @@ -470,7 +470,7 @@ int main() { // CHECK-NEXT: [[TMP25:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP26:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP27:%.*]] = load bfloat, ptr addrspace(5) [[TMP4]], align 2 -// CHECK-NEXT: call void @__kmpc_xteamr_bf_16x64(bfloat [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_max_bf, ptr @__kmpc_rfun_max_lds_bf, bfloat 0xRFF80, i64 [[TMP13]], i32 [[TMP12]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_bf(bfloat [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_max_bf, ptr @__kmpc_rfun_max_lds_bf, bfloat 0xRFF80, i64 [[TMP13]], i32 [[TMP12]], i32 1) // CHECK-NEXT: ret void // // @@ -555,6 +555,6 @@ int main() { // CHECK-NEXT: [[TMP25:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP26:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP27:%.*]] = load i16, ptr addrspace(5) [[TMP4]], align 2 -// CHECK-NEXT: call void @__kmpc_xteamr_s_16x64(i16 [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_max_s, ptr @__kmpc_rfun_max_lds_s, i16 -32768, i64 [[TMP13]], i32 [[TMP12]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_s(i16 [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_max_s, ptr @__kmpc_rfun_max_lds_s, i16 -32768, i64 [[TMP13]], i32 [[TMP12]], i32 1) // CHECK-NEXT: ret void // diff --git a/clang/test/OpenMP/xteam_red_reference.cpp b/clang/test/OpenMP/xteam_red_reference.cpp index 
46249fa1408fe..1e9437bace828 100644 --- a/clang/test/OpenMP/xteam_red_reference.cpp +++ b/clang/test/OpenMP/xteam_red_reference.cpp @@ -107,6 +107,6 @@ void compute_reduced_sum(int n, int &x) { // CHECK-NEXT: [[TMP27:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP28:%.*]] = load ptr, ptr [[TMP_ASCAST]], align 8 // CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr addrspace(5) [[TMP3]], align 4 -// CHECK-NEXT: call void @__kmpc_xteamr_i_16x64(i32 [[TMP29]], ptr [[TMP28]], ptr [[TMP26]], ptr [[TMP27]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP15]], i32 [[TMP14]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_i(i32 [[TMP29]], ptr [[TMP28]], ptr [[TMP26]], ptr [[TMP27]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP15]], i32 [[TMP14]], i32 1) // CHECK-NEXT: ret void // diff --git a/clang/test/OpenMP/xteam_red_small_precision.c b/clang/test/OpenMP/xteam_red_small_precision.c index 6324b2a2a603b..ba36c0d8043b3 100644 --- a/clang/test/OpenMP/xteam_red_small_precision.c +++ b/clang/test/OpenMP/xteam_red_small_precision.c @@ -133,7 +133,7 @@ int main() // CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP31:%.*]] = load half, ptr addrspace(5) [[TMP5]], align 2 -// CHECK-NEXT: call void @__kmpc_xteamr_h_16x64(half [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_h, ptr @__kmpc_rfun_sum_lds_h, half 0xH0000, i64 [[TMP17]], i32 [[TMP16]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_h(half [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_h, ptr @__kmpc_rfun_sum_lds_h, half 0xH0000, i64 [[TMP17]], i32 [[TMP16]], i32 1) // CHECK-NEXT: ret void // // @@ -236,7 +236,7 @@ int main() // CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: 
[[TMP31:%.*]] = load bfloat, ptr addrspace(5) [[TMP5]], align 2 -// CHECK-NEXT: call void @__kmpc_xteamr_bf_16x64(bfloat [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_bf, ptr @__kmpc_rfun_sum_lds_bf, bfloat 0xR0000, i64 [[TMP17]], i32 [[TMP16]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_bf(bfloat [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_bf, ptr @__kmpc_rfun_sum_lds_bf, bfloat 0xR0000, i64 [[TMP17]], i32 [[TMP16]], i32 1) // CHECK-NEXT: ret void // // @@ -339,6 +339,6 @@ int main() // CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP31:%.*]] = load i16, ptr addrspace(5) [[TMP5]], align 2 -// CHECK-NEXT: call void @__kmpc_xteamr_s_16x64(i16 [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_s, ptr @__kmpc_rfun_sum_lds_s, i16 0, i64 [[TMP17]], i32 [[TMP16]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_s(i16 [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_s, ptr @__kmpc_rfun_sum_lds_s, i16 0, i64 [[TMP17]], i32 [[TMP16]], i32 1) // CHECK-NEXT: ret void // diff --git a/clang/test/OpenMP/xteam_red_split_codegen.cpp b/clang/test/OpenMP/xteam_red_split_codegen.cpp index 3ee59b2b8d8a3..46f5b0089e215 100644 --- a/clang/test/OpenMP/xteam_red_split_codegen.cpp +++ b/clang/test/OpenMP/xteam_red_split_codegen.cpp @@ -198,7 +198,7 @@ int main() // CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d(double 
[[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) // CHECK-NEXT: ret void // // @@ -300,7 +300,7 @@ int main() // CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) // CHECK-NEXT: ret void // // @@ -402,7 +402,7 @@ int main() // CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) // CHECK-NEXT: ret void // // @@ -504,7 +504,7 @@ int main() // CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr 
[[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) // CHECK-NEXT: ret void // // @@ -606,7 +606,7 @@ int main() // CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) // CHECK-NEXT: ret void // // @@ -708,7 +708,7 @@ int main() // CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) // CHECK-NEXT: ret void // // @@ -810,7 +810,7 @@ int main() // CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // 
CHECK-NEXT: [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) // CHECK-NEXT: ret void // // @@ -912,7 +912,7 @@ int main() // CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) // CHECK-NEXT: ret void // // @@ -1014,7 +1014,7 @@ int main() // CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) // CHECK-NEXT: ret void // // @@ 
-1116,7 +1116,7 @@ int main() // CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) // CHECK-NEXT: ret void // // @@ -1219,7 +1219,7 @@ int main() // CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) // CHECK-NEXT: ret void // // @@ -1322,6 +1322,6 @@ int main() // CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8 // CHECK-NEXT: [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8 -// CHECK-NEXT: call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) +// CHECK-NEXT: call void @__kmpc_xteamr_d(double [[TMP31]], ptr 
[[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1) // CHECK-NEXT: ret void // diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def index a5cc3b097fffd..91eec68f2a7c9 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -663,61 +663,33 @@ __OMP_RTL(__kmpc_rfun_max_l, false, Void, Int64Ptr, Int64) __OMP_RTL(__kmpc_rfun_max_lds_l, false, Void, Int64Ptr, Int64) -__OMP_RTL(__kmpc_xteamr_d_16x64, false, Void, Double, DoublePtr, DoublePtr, Int32Ptr, VoidPtr, VoidPtr, Double, Int64, Int32, Int32) +__OMP_RTL(__kmpc_xteamr_d, false, Void, Double, DoublePtr, DoublePtr, Int32Ptr, VoidPtr, VoidPtr, Double, Int64, Int32, Int32) -__OMP_RTL(__kmpc_xteamr_d_16x64_fast_sum, false, Void, Double, DoublePtr, DoublePtr, Int32Ptr, VoidPtr, VoidPtr, Double, Int64, Int32, Int32) +__OMP_RTL(__kmpc_xteamr_d_fast_sum, false, Void, Double, DoublePtr, DoublePtr, Int32Ptr, VoidPtr, VoidPtr, Double, Int64, Int32, Int32) -__OMP_RTL(__kmpc_xteamr_f_16x64, false, Void, Float, FloatPtr, FloatPtr, Int32Ptr, VoidPtr, VoidPtr, Float, Int64, Int32, Int32) +__OMP_RTL(__kmpc_xteamr_f, false, Void, Float, FloatPtr, FloatPtr, Int32Ptr, VoidPtr, VoidPtr, Float, Int64, Int32, Int32) -__OMP_RTL(__kmpc_xteamr_f_16x64_fast_sum, false, Void, Float, FloatPtr, FloatPtr, Int32Ptr, VoidPtr, VoidPtr, Float, Int64, Int32, Int32) +__OMP_RTL(__kmpc_xteamr_f_fast_sum, false, Void, Float, FloatPtr, FloatPtr, Int32Ptr, VoidPtr, VoidPtr, Float, Int64, Int32, Int32) -__OMP_RTL(__kmpc_xteamr_h_16x64, false, Void, Half, HalfPtr, HalfPtr, Int32Ptr, VoidPtr, VoidPtr, Half, Int64, Int32, Int32) +__OMP_RTL(__kmpc_xteamr_h, false, Void, Half, HalfPtr, HalfPtr, Int32Ptr, VoidPtr, VoidPtr, Half, Int64, Int32, Int32) -__OMP_RTL(__kmpc_xteamr_h_16x64_fast_sum, false, Void, Half, HalfPtr, HalfPtr, Int32Ptr, VoidPtr, VoidPtr, 
Half, Int64, Int32, Int32) +__OMP_RTL(__kmpc_xteamr_h_fast_sum, false, Void, Half, HalfPtr, HalfPtr, Int32Ptr, VoidPtr, VoidPtr, Half, Int64, Int32, Int32) -__OMP_RTL(__kmpc_xteamr_bf_16x64, false, Void, BFloat, BFloatPtr, BFloatPtr, Int32Ptr, VoidPtr, VoidPtr, BFloat, Int64, Int32, Int32) +__OMP_RTL(__kmpc_xteamr_bf, false, Void, BFloat, BFloatPtr, BFloatPtr, Int32Ptr, VoidPtr, VoidPtr, BFloat, Int64, Int32, Int32) -__OMP_RTL(__kmpc_xteamr_bf_16x64_fast_sum, false, Void, BFloat, BFloatPtr, BFloatPtr, Int32Ptr, VoidPtr, VoidPtr, BFloat, Int64, Int32, Int32) +__OMP_RTL(__kmpc_xteamr_bf_fast_sum, false, Void, BFloat, BFloatPtr, BFloatPtr, Int32Ptr, VoidPtr, VoidPtr, BFloat, Int64, Int32, Int32) -__OMP_RTL(__kmpc_xteamr_s_16x64, false, Void, Int16, Int16Ptr, Int16Ptr, Int32Ptr, VoidPtr, VoidPtr, Int16, Int64, Int32, Int32) +__OMP_RTL(__kmpc_xteamr_s, false, Void, Int16, Int16Ptr, Int16Ptr, Int32Ptr, VoidPtr, VoidPtr, Int16, Int64, Int32, Int32) -__OMP_RTL(__kmpc_xteamr_s_16x64_fast_sum, false, Void, Int16, Int16Ptr, Int16Ptr, Int32Ptr, VoidPtr, VoidPtr, Int16, Int64, Int32, Int32) +__OMP_RTL(__kmpc_xteamr_s_fast_sum, false, Void, Int16, Int16Ptr, Int16Ptr, Int32Ptr, VoidPtr, VoidPtr, Int16, Int64, Int32, Int32) -__OMP_RTL(__kmpc_xteamr_i_16x64, false, Void, Int32, Int32Ptr, Int32Ptr, Int32Ptr, VoidPtr, VoidPtr, Int32, Int64, Int32, Int32) +__OMP_RTL(__kmpc_xteamr_i, false, Void, Int32, Int32Ptr, Int32Ptr, Int32Ptr, VoidPtr, VoidPtr, Int32, Int64, Int32, Int32) -__OMP_RTL(__kmpc_xteamr_i_16x64_fast_sum, false, Void, Int32, Int32Ptr, Int32Ptr, Int32Ptr, VoidPtr, VoidPtr, Int32, Int64, Int32, Int32) +__OMP_RTL(__kmpc_xteamr_i_fast_sum, false, Void, Int32, Int32Ptr, Int32Ptr, Int32Ptr, VoidPtr, VoidPtr, Int32, Int64, Int32, Int32) -__OMP_RTL(__kmpc_xteamr_l_16x64, false, Void, Int64, Int64Ptr, Int64Ptr, Int32Ptr, VoidPtr, VoidPtr, Int64, Int64, Int32, Int32) +__OMP_RTL(__kmpc_xteamr_l, false, Void, Int64, Int64Ptr, Int64Ptr, Int32Ptr, VoidPtr, VoidPtr, Int64, Int64, 
Int32, Int32) -__OMP_RTL(__kmpc_xteamr_l_16x64_fast_sum, false, Void, Int64, Int64Ptr, Int64Ptr, Int32Ptr, VoidPtr, VoidPtr, Int64, Int64, Int32, Int32) - -__OMP_RTL(__kmpc_xteamr_d_32x32, false, Void, Double, DoublePtr, DoublePtr, Int32Ptr, VoidPtr, VoidPtr, Double, Int64, Int32, Int32) - -__OMP_RTL(__kmpc_xteamr_d_32x32_fast_sum, false, Void, Double, DoublePtr, DoublePtr, Int32Ptr, VoidPtr, VoidPtr, Double, Int64, Int32, Int32) - -__OMP_RTL(__kmpc_xteamr_f_32x32, false, Void, Float, FloatPtr, FloatPtr, Int32Ptr, VoidPtr, VoidPtr, Float, Int64, Int32, Int32) - -__OMP_RTL(__kmpc_xteamr_f_32x32_fast_sum, false, Void, Float, FloatPtr, FloatPtr, Int32Ptr, VoidPtr, VoidPtr, Float, Int64, Int32, Int32) - -__OMP_RTL(__kmpc_xteamr_h_32x32, false, Void, Half, HalfPtr, HalfPtr, Int32Ptr, VoidPtr, VoidPtr, Half, Int64, Int32, Int32) - -__OMP_RTL(__kmpc_xteamr_h_32x32_fast_sum, false, Void, Half, HalfPtr, HalfPtr, Int32Ptr, VoidPtr, VoidPtr, Half, Int64, Int32, Int32) - -__OMP_RTL(__kmpc_xteamr_bf_32x32, false, Void, BFloat, BFloatPtr, BFloatPtr, Int32Ptr, VoidPtr, VoidPtr, BFloat, Int64, Int32, Int32) - -__OMP_RTL(__kmpc_xteamr_bf_32x32_fast_sum, false, Void, BFloat, BFloatPtr, BFloatPtr, Int32Ptr, VoidPtr, VoidPtr, BFloat, Int64, Int32, Int32) - -__OMP_RTL(__kmpc_xteamr_s_32x32, false, Void, Int16, Int16Ptr, Int16Ptr, Int32Ptr, VoidPtr, VoidPtr, Int16, Int64, Int32, Int32) - -__OMP_RTL(__kmpc_xteamr_s_32x32_fast_sum, false, Void, Int16, Int16Ptr, Int16Ptr, Int32Ptr, VoidPtr, VoidPtr, Int16, Int64, Int32, Int32) - -__OMP_RTL(__kmpc_xteamr_i_32x32, false, Void, Int32, Int32Ptr, Int32Ptr, Int32Ptr, VoidPtr, VoidPtr, Int32, Int64, Int32, Int32) - -__OMP_RTL(__kmpc_xteamr_i_32x32_fast_sum, false, Void, Int32, Int32Ptr, Int32Ptr, Int32Ptr, VoidPtr, VoidPtr, Int32, Int64, Int32, Int32) - -__OMP_RTL(__kmpc_xteamr_l_32x32, false, Void, Int64, Int64Ptr, Int64Ptr, Int32Ptr, VoidPtr, VoidPtr, Int64, Int64, Int32, Int32) - -__OMP_RTL(__kmpc_xteamr_l_32x32_fast_sum, false, Void, Int64, 
Int64Ptr, Int64Ptr, Int32Ptr, VoidPtr, VoidPtr, Int64, Int64, Int32, Int32) +__OMP_RTL(__kmpc_xteamr_l_fast_sum, false, Void, Int64, Int64Ptr, Int64Ptr, Int32Ptr, VoidPtr, VoidPtr, Int64, Int64, Int32, Int32) __OMP_RTL(__llvm_profile_register_function, false, Void, VoidPtr) __OMP_RTL(__llvm_profile_register_names_function, false, Void, VoidPtr, Int64) diff --git a/openmp/device/include/Xteamr.h b/openmp/device/include/Xteamr.h index b30a714193219..9d20b5f76a6a4 100644 --- a/openmp/device/include/Xteamr.h +++ b/openmp/device/include/Xteamr.h @@ -26,15 +26,17 @@ #define _UL unsigned long #define _INLINE_ATTR_ __attribute__((flatten, always_inline)) #define _RF_LDS volatile __gpu_local +// Maximum number of waves in a thread block +#define _MaxNumWaves 32 +// Wave size +#define _WSZ 32 extern "C" { /// External cross team reduction (xteamr) helper functions /// /// The template for name of xteamr helper function is: -/// __kmpc_xteamr_<dtype>_<waves>x<warpsize> where +/// __kmpc_xteamr_<dtype> where /// <dtype> is letter(s) representing data type, e.g. d=double. -/// <waves> maximum number of waves in thread block. -/// <warpsize> warp size, 32 or 64. /// IS_FAST There is an optional template boolean type (defaulting to false) /// that indicates if an atomic add should be used instead of the last /// reduction round. This applies to only sum reduction currently. @@ -45,509 +47,264 @@ extern "C" { /// Clang/flang code generation for C, C++, and FORTRAN instantiate a call to /// a helper function for each reduction used in an OpenMP target region. 
/// -/// \param Input thread local reduction value -/// \param Pointer to result value -/// \param Global array of team values for this reduction instance -/// \param Pointer to atomic counter of completed teams -/// \param Function pointer to reduction function (sum,min,max) -/// \param Function pointer to reduction function on LDS memory -/// \param Reduction null value -/// \param Outer loop iteration value, 0 to numteams*numthreads -/// \param Number of teams +/// \param v Input thread local reduction value +/// \param r_ptr Pointer to result value +/// \param tvs Global array of team values for this reduction instance +/// \param td Pointer to atomic counter of completed teams +/// \param _rf Function pointer to reduction function (sum,min,max) +/// \param _rf_lds Function pointer to reduction function on LDS memory +/// \param rnv Reduction null value +/// \param k Outer loop iteration value, 0 to numteams*numthreads +/// \param numteams Number of teams +/// \param Scope Memory scope /// External intra-team reduction (iteamr) helper functions /// /// The name template for intra-team helper functions is -/// __kmpc_iteamr_<dtype>_<waves>x<warpsize> where +/// __kmpc_iteamr_<dtype> where /// <dtype> is letter(s) representing data type, e.g. d=double. -/// <waves> maximum number of waves in thread block. -/// <warpsize> warp size, 32 or 64. /// All iteamr helper functions are defined in Xteamr.cpp. They each call the /// internal templated function _iteam_reduction also defined in Xteamr.cpp. /// -/// \param Input thread local reduction value -/// \param Pointer to result value -/// \param Function pointer to reduction function (sum,min,max) -/// \param Function pointer to reduction function on LDS memory -/// \param Reduction null value -/// \param Outer loop iteration value, 0 to numthreads -/// -/// Cross team reduction (xteamr) helper function, see documentation above. 
-void _INLINE_ATTR_ __kmpc_xteamr_d_16x64( - double v, double *r_ptr, double *tvs, uint32_t *td, - void (*_rf)(double *, double), - void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double rnv, - const uint64_t k, const uint32_t numteams, - ompx::atomic::MemScopeTy Scope = ompx::atomic::system); -/// Fast Cross team sum reduction (xteamr) helper function, see documentation -/// above. -void _INLINE_ATTR_ __kmpc_xteamr_d_16x64_fast_sum( - double v, double *r_ptr, double *tvs, uint32_t *td, - void (*_rf)(double *, double), - void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double rnv, - const uint64_t k, const uint32_t numteams, - ompx::atomic::MemScopeTy Scope = ompx::atomic::system); -/// Intra-team reduction (iteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_iteamr_d_16x64(double v, double *r_ptr, - void (*_rf)(double *, double), - void (*_rf_lds)(_RF_LDS double *, - _RF_LDS double *), - const double rnv, const uint64_t k); -/// Cross team reduction (xteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_xteamr_f_16x64( - float v, float *r_ptr, float *tvs, uint32_t *td, - void (*_rf)(float *, float), - void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float rnv, - const uint64_t k, const uint32_t numteams, - ompx::atomic::MemScopeTy Scope = ompx::atomic::system); -/// Fast Cross team sum reduction (xteamr) helper function, see documentation -/// above. -void _INLINE_ATTR_ __kmpc_xteamr_f_16x64_fast_sum( - float v, float *r_ptr, float *tvs, uint32_t *td, - void (*_rf)(float *, float), - void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float rnv, - const uint64_t k, const uint32_t numteams, - ompx::atomic::MemScopeTy Scope = ompx::atomic::system); -/// Intra-team reduction (iteamr) helper function, see documentation above. 
-void _INLINE_ATTR_ __kmpc_iteamr_f_16x64(float v, float *r_ptr, - void (*_rf)(float *, float), - void (*_rf_lds)(_RF_LDS float *, - _RF_LDS float *), - const float rnv, const uint64_t k); -/// Cross team reduction (xteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_xteamr_h_16x64( - _Float16 v, _Float16 *r_ptr, _Float16 *tvs, uint32_t *td, - void (*_rf)(_Float16 *, _Float16), - void (*_rf_lds)(_RF_LDS _Float16 *, _RF_LDS _Float16 *), const _Float16 rnv, - const uint64_t k, const uint32_t numteams, - ompx::atomic::MemScopeTy Scope = ompx::atomic::system); -/// Fast Cross team sum reduction (xteamr) helper function, see documentation -/// above. -void _INLINE_ATTR_ __kmpc_xteamr_h_16x64_fast_sum( - _Float16 v, _Float16 *r_ptr, _Float16 *tvs, uint32_t *td, - void (*_rf)(_Float16 *, _Float16), - void (*_rf_lds)(_RF_LDS _Float16 *, _RF_LDS _Float16 *), const _Float16 rnv, - const uint64_t k, const uint32_t numteams, - ompx::atomic::MemScopeTy Scope = ompx::atomic::system); -/// Intra-team reduction (iteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_iteamr_h_16x64(_Float16 v, _Float16 *r_ptr, - void (*_rf)(_Float16 *, _Float16), - void (*_rf_lds)(_RF_LDS _Float16 *, - _RF_LDS _Float16 *), - const _Float16 rnv, const uint64_t k); -/// Cross team reduction (xteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_xteamr_bf_16x64( - __bf16 v, __bf16 *r_ptr, __bf16 *tvs, uint32_t *td, - void (*_rf)(__bf16 *, __bf16), - void (*_rf_lds)(_RF_LDS __bf16 *, _RF_LDS __bf16 *), const __bf16 rnv, - const uint64_t k, const uint32_t numteams, - ompx::atomic::MemScopeTy Scope = ompx::atomic::system); -/// Fast Cross team sum reduction (xteamr) helper function, see documentation -/// above. 
-void _INLINE_ATTR_ __kmpc_xteamr_bf_16x64_fast_sum( - __bf16 v, __bf16 *r_ptr, __bf16 *tvs, uint32_t *td, - void (*_rf)(__bf16 *, __bf16), - void (*_rf_lds)(_RF_LDS __bf16 *, _RF_LDS __bf16 *), const __bf16 rnv, - const uint64_t k, const uint32_t numteams, - ompx::atomic::MemScopeTy Scope = ompx::atomic::system); -/// Intra-team reduction (iteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_iteamr_bf_16x64(__bf16 v, __bf16 *r_ptr, - void (*_rf)(__bf16 *, __bf16), - void (*_rf_lds)(_RF_LDS __bf16 *, - _RF_LDS __bf16 *), - const __bf16 rnv, const uint64_t k); -/// Cross team reduction (xteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_xteamr_cd_16x64( - _CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), - void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD rnv, - const uint64_t k, const uint32_t numteams, - ompx::atomic::MemScopeTy Scope = ompx::atomic::system); -/// Fast Cross team sum reduction (xteamr) helper function, see documentation -/// above. -void _INLINE_ATTR_ __kmpc_xteamr_cd_16x64_fast_sum( - _CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), - void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD rnv, - const uint64_t k, const uint32_t numteams, - ompx::atomic::MemScopeTy Scope = ompx::atomic::system); -/// Intra-team reduction (iteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_iteamr_cd_16x64(_CD v, _CD *r_ptr, - void (*_rf)(_CD *, _CD), - void (*_rf_lds)(_RF_LDS _CD *, - _RF_LDS _CD *), - const _CD rnv, const uint64_t k); -/// Cross team reduction (xteamr) helper function, see documentation above. 
-void _INLINE_ATTR_ __kmpc_xteamr_cf_16x64( - _CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), - void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF rnv, - const uint64_t k, const uint32_t numteams, - ompx::atomic::MemScopeTy Scope = ompx::atomic::system); -/// Fast Cross team sum reduction (xteamr) helper function, see documentation -/// above. -void _INLINE_ATTR_ __kmpc_xteamr_cf_16x64_fast_sum( - _CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), - void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF rnv, - const uint64_t k, const uint32_t numteams, - ompx::atomic::MemScopeTy Scope = ompx::atomic::system); -/// Intra-team reduction (iteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_iteamr_cf_16x64(_CF v, _CF *r_ptr, - void (*_rf)(_CF *, _CF), - void (*_rf_lds)(_RF_LDS _CF *, - _RF_LDS _CF *), - const _CF rnv, const uint64_t k); -/// Cross team reduction (xteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_xteamr_s_16x64( - short v, short *r_ptr, short *tvs, uint32_t *td, - void (*_rf)(short *, short), - void (*_rf_lds)(_RF_LDS short *, _RF_LDS short *), const short rnv, - const uint64_t k, const uint32_t numteams, - ompx::atomic::MemScopeTy Scope = ompx::atomic::system); -/// Fast Cross team sum reduction (xteamr) helper function, see documentation -/// above. -void _INLINE_ATTR_ __kmpc_xteamr_s_16x64_fast_sum( - short v, short *r_ptr, short *tvs, uint32_t *td, - void (*_rf)(short *, short), - void (*_rf_lds)(_RF_LDS short *, _RF_LDS short *), const short rnv, - const uint64_t k, const uint32_t numteams, - ompx::atomic::MemScopeTy Scope = ompx::atomic::system); -/// Intra-team reduction (iteamr) helper function, see documentation above. 
-void _INLINE_ATTR_ __kmpc_iteamr_s_16x64(short v, short *r_ptr, - void (*_rf)(short *, short), - void (*_rf_lds)(_RF_LDS short *, - _RF_LDS short *), - const short rnv, const uint64_t k); -/// Cross team reduction (xteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_xteamr_us_16x64( - _US v, _US *r_ptr, _US *tvs, uint32_t *td, void (*_rf)(_US *, _US), - void (*_rf_lds)(_RF_LDS _US *, _RF_LDS _US *), const _US rnv, - const uint64_t k, const uint32_t numteams, - ompx::atomic::MemScopeTy Scope = ompx::atomic::system); -/// Fast Cross team sum reduction (xteamr) helper function, see documentation -/// above. -void _INLINE_ATTR_ __kmpc_xteamr_us_16x64_fast_sum( - _US v, _US *r_ptr, _US *tvs, uint32_t *td, void (*_rf)(_US *, _US), - void (*_rf_lds)(_RF_LDS _US *, _RF_LDS _US *), const _US rnv, - const uint64_t k, const uint32_t numteams, - ompx::atomic::MemScopeTy Scope = ompx::atomic::system); -/// Intra-team reduction (iteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_iteamr_us_16x64(_US v, _US *r_ptr, - void (*_rf)(_US *, _US), - void (*_rf_lds)(_RF_LDS _US *, - _RF_LDS _US *), - const _US rnv, const uint64_t k); -/// Cross team reduction (xteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_xteamr_i_16x64( - int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), - void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int rnv, - const uint64_t k, const uint32_t numteams, - ompx::atomic::MemScopeTy Scope = ompx::atomic::system); -/// Fast Cross team sum reduction (xteamr) helper function, see documentation -/// above. -void _INLINE_ATTR_ __kmpc_xteamr_i_16x64_fast_sum( - int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), - void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int rnv, - const uint64_t k, const uint32_t numteams, - ompx::atomic::MemScopeTy Scope = ompx::atomic::system); -/// Intra-team reduction (iteamr) helper function, see documentation above. 
-void _INLINE_ATTR_ __kmpc_iteamr_i_16x64(int v, int *r_ptr, - void (*_rf)(int *, int), - void (*_rf_lds)(_RF_LDS int *, - _RF_LDS int *), - const int rnv, const uint64_t k); -/// Cross team reduction (xteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_xteamr_ui_16x64( - _UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), - void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI rnv, - const uint64_t k, const uint32_t numteams, - ompx::atomic::MemScopeTy Scope = ompx::atomic::system); -/// Fast Cross team sum reduction (xteamr) helper function, see documentation -/// above. -void _INLINE_ATTR_ __kmpc_xteamr_ui_16x64_fast_sum( - _UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), - void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI rnv, - const uint64_t k, const uint32_t numteams, - ompx::atomic::MemScopeTy Scope = ompx::atomic::system); -/// Intra-team reduction (iteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_iteamr_ui_16x64(_UI v, _UI *r_ptr, - void (*_rf)(_UI *, _UI), - void (*_rf_lds)(_RF_LDS _UI *, - _RF_LDS _UI *), - const _UI rnv, const uint64_t k); -/// Cross team reduction (xteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_xteamr_l_16x64( - long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), - void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long rnv, - const uint64_t k, const uint32_t numteams, - ompx::atomic::MemScopeTy Scope = ompx::atomic::system); -/// Fast Cross team sum reduction (xteamr) helper function, see documentation -/// above. -void _INLINE_ATTR_ __kmpc_xteamr_l_16x64_fast_sum( - long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), - void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long rnv, - const uint64_t k, const uint32_t numteams, - ompx::atomic::MemScopeTy Scope = ompx::atomic::system); -/// Intra-team reduction (iteamr) helper function, see documentation above. 
-void _INLINE_ATTR_ __kmpc_iteamr_l_16x64(long v, long *r_ptr, - void (*_rf)(long *, long), - void (*_rf_lds)(_RF_LDS long *, - _RF_LDS long *), - const long rnv, const uint64_t k); -/// Cross team reduction (xteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_xteamr_ul_16x64( - _UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), - void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL rnv, - const uint64_t k, const uint32_t numteams, - ompx::atomic::MemScopeTy Scope = ompx::atomic::system); -/// Fast Cross team sum reduction (xteamr) helper function, see documentation -/// above. -void _INLINE_ATTR_ __kmpc_xteamr_ul_16x64_fast_sum( - _UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), - void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL rnv, - const uint64_t k, const uint32_t numteams, - ompx::atomic::MemScopeTy Scope = ompx::atomic::system); -/// Intra-team reduction (iteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_iteamr_ul_16x64(_UL v, _UL *r_ptr, - void (*_rf)(_UL *, _UL), - void (*_rf_lds)(_RF_LDS _UL *, - _RF_LDS _UL *), - const _UL rnv, const uint64_t k); +/// \param v Input thread local reduction value +/// \param r_ptr Pointer to result value +/// \param _rf Function pointer to reduction function (sum,min,max) +/// \param _rf_lds Function pointer to reduction function on LDS memory +/// \param rnv Reduction null value +/// \param k Outer loop iteration value, 0 to numthreads + /// Cross team reduction (xteamr) helper function, see documentation above. 
-void _INLINE_ATTR_ __kmpc_xteamr_d_32x32( - double v, double *r_ptr, double *tvs, uint32_t *td, - void (*_rf)(double *, double), - void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double rnv, - const uint64_t k, const uint32_t numteams, - ompx::atomic::MemScopeTy Scope = ompx::atomic::system); +void _INLINE_ATTR_ +__kmpc_xteamr_d(double v, double *r_ptr, double *tvs, uint32_t *td, + void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), + const double rnv, const uint64_t k, const uint32_t numteams, + ompx::atomic::MemScopeTy Scope = ompx::atomic::system); /// Fast Cross team sum reduction (xteamr) helper function, see documentation /// above. -void _INLINE_ATTR_ __kmpc_xteamr_d_32x32_fast_sum( +void _INLINE_ATTR_ __kmpc_xteamr_d_fast_sum( double v, double *r_ptr, double *tvs, uint32_t *td, void (*_rf)(double *, double), void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double rnv, const uint64_t k, const uint32_t numteams, ompx::atomic::MemScopeTy Scope = ompx::atomic::system); /// Intra-team reduction (iteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_iteamr_d_32x32(double v, double *r_ptr, - void (*_rf)(double *, double), - void (*_rf_lds)(_RF_LDS double *, - _RF_LDS double *), - const double rnv, const uint64_t k); +void _INLINE_ATTR_ __kmpc_iteamr_d(double v, double *r_ptr, + void (*_rf)(double *, double), + void (*_rf_lds)(_RF_LDS double *, + _RF_LDS double *), + const double rnv, const uint64_t k); /// Cross team reduction (xteamr) helper function, see documentation above. 
-void _INLINE_ATTR_ __kmpc_xteamr_f_32x32( - float v, float *r_ptr, float *tvs, uint32_t *td, - void (*_rf)(float *, float), - void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float rnv, - const uint64_t k, const uint32_t numteams, - ompx::atomic::MemScopeTy Scope = ompx::atomic::system); +void _INLINE_ATTR_ +__kmpc_xteamr_f(float v, float *r_ptr, float *tvs, uint32_t *td, + void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), + const float rnv, const uint64_t k, const uint32_t numteams, + ompx::atomic::MemScopeTy Scope = ompx::atomic::system); /// Fast Cross team sum reduction (xteamr) helper function, see documentation /// above. -void _INLINE_ATTR_ __kmpc_xteamr_f_32x32_fast_sum( +void _INLINE_ATTR_ __kmpc_xteamr_f_fast_sum( float v, float *r_ptr, float *tvs, uint32_t *td, void (*_rf)(float *, float), void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float rnv, const uint64_t k, const uint32_t numteams, ompx::atomic::MemScopeTy Scope = ompx::atomic::system); /// Intra-team reduction (iteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_iteamr_f_32x32(float v, float *r_ptr, - void (*_rf)(float *, float), - void (*_rf_lds)(_RF_LDS float *, - _RF_LDS float *), - const float rnv, const uint64_t k); +void _INLINE_ATTR_ __kmpc_iteamr_f(float v, float *r_ptr, + void (*_rf)(float *, float), + void (*_rf_lds)(_RF_LDS float *, + _RF_LDS float *), + const float rnv, const uint64_t k); /// Cross team reduction (xteamr) helper function, see documentation above. 
-void _INLINE_ATTR_ __kmpc_xteamr_h_32x32( - _Float16 v, _Float16 *r_ptr, _Float16 *tvs, uint32_t *td, - void (*_rf)(_Float16 *, _Float16), - void (*_rf_lds)(_RF_LDS _Float16 *, _RF_LDS _Float16 *), const _Float16 rnv, - const uint64_t k, const uint32_t numteams, - ompx::atomic::MemScopeTy Scope = ompx::atomic::system); +void _INLINE_ATTR_ +__kmpc_xteamr_h(_Float16 v, _Float16 *r_ptr, _Float16 *tvs, uint32_t *td, + void (*_rf)(_Float16 *, _Float16), + void (*_rf_lds)(_RF_LDS _Float16 *, _RF_LDS _Float16 *), + const _Float16 rnv, const uint64_t k, const uint32_t numteams, + ompx::atomic::MemScopeTy Scope = ompx::atomic::system); /// Fast Cross team sum reduction (xteamr) helper function, see documentation /// above. -void _INLINE_ATTR_ __kmpc_xteamr_h_32x32_fast_sum( +void _INLINE_ATTR_ __kmpc_xteamr_h_fast_sum( _Float16 v, _Float16 *r_ptr, _Float16 *tvs, uint32_t *td, void (*_rf)(_Float16 *, _Float16), void (*_rf_lds)(_RF_LDS _Float16 *, _RF_LDS _Float16 *), const _Float16 rnv, const uint64_t k, const uint32_t numteams, ompx::atomic::MemScopeTy Scope = ompx::atomic::system); /// Intra-team reduction (iteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_iteamr_h_32x32(_Float16 v, _Float16 *r_ptr, - void (*_rf)(_Float16 *, _Float16), - void (*_rf_lds)(_RF_LDS _Float16 *, - _RF_LDS _Float16 *), - const _Float16 rnv, const uint64_t k); +void _INLINE_ATTR_ __kmpc_iteamr_h(_Float16 v, _Float16 *r_ptr, + void (*_rf)(_Float16 *, _Float16), + void (*_rf_lds)(_RF_LDS _Float16 *, + _RF_LDS _Float16 *), + const _Float16 rnv, const uint64_t k); /// Cross team reduction (xteamr) helper function, see documentation above. 
-void _INLINE_ATTR_ __kmpc_xteamr_bf_32x32( - __bf16 v, __bf16 *r_ptr, __bf16 *tvs, uint32_t *td, - void (*_rf)(__bf16 *, __bf16), - void (*_rf_lds)(_RF_LDS __bf16 *, _RF_LDS __bf16 *), const __bf16 rnv, - const uint64_t k, const uint32_t numteams, - ompx::atomic::MemScopeTy Scope = ompx::atomic::system); +void _INLINE_ATTR_ +__kmpc_xteamr_bf(__bf16 v, __bf16 *r_ptr, __bf16 *tvs, uint32_t *td, + void (*_rf)(__bf16 *, __bf16), + void (*_rf_lds)(_RF_LDS __bf16 *, _RF_LDS __bf16 *), + const __bf16 rnv, const uint64_t k, const uint32_t numteams, + ompx::atomic::MemScopeTy Scope = ompx::atomic::system); /// Fast Cross team sum reduction (xteamr) helper function, see documentation /// above. -void _INLINE_ATTR_ __kmpc_xteamr_bf_32x32_fast_sum( +void _INLINE_ATTR_ __kmpc_xteamr_bf_fast_sum( __bf16 v, __bf16 *r_ptr, __bf16 *tvs, uint32_t *td, void (*_rf)(__bf16 *, __bf16), void (*_rf_lds)(_RF_LDS __bf16 *, _RF_LDS __bf16 *), const __bf16 rnv, const uint64_t k, const uint32_t numteams, ompx::atomic::MemScopeTy Scope = ompx::atomic::system); /// Intra-team reduction (iteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_iteamr_bf_32x32(__bf16 v, __bf16 *r_ptr, - void (*_rf)(__bf16 *, __bf16), - void (*_rf_lds)(_RF_LDS __bf16 *, - _RF_LDS __bf16 *), - const __bf16 rnv, const uint64_t k); +void _INLINE_ATTR_ __kmpc_iteamr_bf(__bf16 v, __bf16 *r_ptr, + void (*_rf)(__bf16 *, __bf16), + void (*_rf_lds)(_RF_LDS __bf16 *, + _RF_LDS __bf16 *), + const __bf16 rnv, const uint64_t k); /// Cross team reduction (xteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_xteamr_cd_32x32( +void _INLINE_ATTR_ __kmpc_xteamr_cd( _CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD rnv, const uint64_t k, const uint32_t numteams, ompx::atomic::MemScopeTy Scope = ompx::atomic::system); /// Fast Cross team sum reduction (xteamr) helper function, see documentation /// above. 
-void _INLINE_ATTR_ __kmpc_xteamr_cd_32x32_fast_sum( +void _INLINE_ATTR_ __kmpc_xteamr_cd_fast_sum( _CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD), void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD rnv, const uint64_t k, const uint32_t numteams, ompx::atomic::MemScopeTy Scope = ompx::atomic::system); /// Intra-team reduction (iteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_iteamr_cd_32x32(_CD v, _CD *r_ptr, - void (*_rf)(_CD *, _CD), - void (*_rf_lds)(_RF_LDS _CD *, - _RF_LDS _CD *), - const _CD rnv, const uint64_t k); +void _INLINE_ATTR_ __kmpc_iteamr_cd(_CD v, _CD *r_ptr, void (*_rf)(_CD *, _CD), + void (*_rf_lds)(_RF_LDS _CD *, + _RF_LDS _CD *), + const _CD rnv, const uint64_t k); /// Cross team reduction (xteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_xteamr_cf_32x32( +void _INLINE_ATTR_ __kmpc_xteamr_cf( _CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF rnv, const uint64_t k, const uint32_t numteams, ompx::atomic::MemScopeTy Scope = ompx::atomic::system); /// Fast Cross team sum reduction (xteamr) helper function, see documentation /// above. -void _INLINE_ATTR_ __kmpc_xteamr_cf_32x32_fast_sum( +void _INLINE_ATTR_ __kmpc_xteamr_cf_fast_sum( _CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF), void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF rnv, const uint64_t k, const uint32_t numteams, ompx::atomic::MemScopeTy Scope = ompx::atomic::system); /// Intra-team reduction (iteamr) helper function, see documentation above. 
-void _INLINE_ATTR_ __kmpc_iteamr_cf_32x32(_CF v, _CF *r_ptr, - void (*_rf)(_CF *, _CF), - void (*_rf_lds)(_RF_LDS _CF *, - _RF_LDS _CF *), - const _CF rnv, const uint64_t k); +void _INLINE_ATTR_ __kmpc_iteamr_cf(_CF v, _CF *r_ptr, void (*_rf)(_CF *, _CF), + void (*_rf_lds)(_RF_LDS _CF *, + _RF_LDS _CF *), + const _CF rnv, const uint64_t k); /// Cross team reduction (xteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_xteamr_s_32x32( - short v, short *r_ptr, short *tvs, uint32_t *td, - void (*_rf)(short *, short), - void (*_rf_lds)(_RF_LDS short *, _RF_LDS short *), const short rnv, - const uint64_t k, const uint32_t numteams, - ompx::atomic::MemScopeTy Scope = ompx::atomic::system); +void _INLINE_ATTR_ +__kmpc_xteamr_s(short v, short *r_ptr, short *tvs, uint32_t *td, + void (*_rf)(short *, short), + void (*_rf_lds)(_RF_LDS short *, _RF_LDS short *), + const short rnv, const uint64_t k, const uint32_t numteams, + ompx::atomic::MemScopeTy Scope = ompx::atomic::system); /// Fast Cross team sum reduction (xteamr) helper function, see documentation /// above. -void _INLINE_ATTR_ __kmpc_xteamr_s_32x32_fast_sum( +void _INLINE_ATTR_ __kmpc_xteamr_s_fast_sum( short v, short *r_ptr, short *tvs, uint32_t *td, void (*_rf)(short *, short), void (*_rf_lds)(_RF_LDS short *, _RF_LDS short *), const short rnv, const uint64_t k, const uint32_t numteams, ompx::atomic::MemScopeTy Scope = ompx::atomic::system); /// Intra-team reduction (iteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_iteamr_s_32x32(short v, short *r_ptr, - void (*_rf)(short *, short), - void (*_rf_lds)(_RF_LDS short *, - _RF_LDS short *), - const short rnv, const uint64_t k); +void _INLINE_ATTR_ __kmpc_iteamr_s(short v, short *r_ptr, + void (*_rf)(short *, short), + void (*_rf_lds)(_RF_LDS short *, + _RF_LDS short *), + const short rnv, const uint64_t k); /// Cross team reduction (xteamr) helper function, see documentation above. 
-void _INLINE_ATTR_ __kmpc_xteamr_us_32x32( +void _INLINE_ATTR_ __kmpc_xteamr_us( _US v, _US *r_ptr, _US *tvs, uint32_t *td, void (*_rf)(_US *, _US), void (*_rf_lds)(_RF_LDS _US *, _RF_LDS _US *), const _US rnv, const uint64_t k, const uint32_t numteams, ompx::atomic::MemScopeTy Scope = ompx::atomic::system); /// Fast Cross team sum reduction (xteamr) helper function, see documentation /// above. -void _INLINE_ATTR_ __kmpc_xteamr_us_32x32_fast_sum( +void _INLINE_ATTR_ __kmpc_xteamr_us_fast_sum( _US v, _US *r_ptr, _US *tvs, uint32_t *td, void (*_rf)(_US *, _US), void (*_rf_lds)(_RF_LDS _US *, _RF_LDS _US *), const _US rnv, const uint64_t k, const uint32_t numteams, ompx::atomic::MemScopeTy Scope = ompx::atomic::system); /// Intra-team reduction (iteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_iteamr_us_32x32(_US v, _US *r_ptr, - void (*_rf)(_US *, _US), - void (*_rf_lds)(_RF_LDS _US *, - _RF_LDS _US *), - const _US rnv, const uint64_t k); +void _INLINE_ATTR_ __kmpc_iteamr_us(_US v, _US *r_ptr, void (*_rf)(_US *, _US), + void (*_rf_lds)(_RF_LDS _US *, + _RF_LDS _US *), + const _US rnv, const uint64_t k); /// Cross team reduction (xteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_xteamr_i_32x32( +void _INLINE_ATTR_ __kmpc_xteamr_i( int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int rnv, const uint64_t k, const uint32_t numteams, ompx::atomic::MemScopeTy Scope = ompx::atomic::system); /// Fast Cross team sum reduction (xteamr) helper function, see documentation /// above. 
-void _INLINE_ATTR_ __kmpc_xteamr_i_32x32_fast_sum( +void _INLINE_ATTR_ __kmpc_xteamr_i_fast_sum( int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int), void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int rnv, const uint64_t k, const uint32_t numteams, ompx::atomic::MemScopeTy Scope = ompx::atomic::system); /// Intra-team reduction (iteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_iteamr_i_32x32(int v, int *r_ptr, - void (*_rf)(int *, int), - void (*_rf_lds)(_RF_LDS int *, - _RF_LDS int *), - const int rnv, const uint64_t k); +void _INLINE_ATTR_ __kmpc_iteamr_i(int v, int *r_ptr, void (*_rf)(int *, int), + void (*_rf_lds)(_RF_LDS int *, + _RF_LDS int *), + const int rnv, const uint64_t k); /// Cross team reduction (xteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_xteamr_ui_32x32( +void _INLINE_ATTR_ __kmpc_xteamr_ui( _UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI rnv, const uint64_t k, const uint32_t numteams, ompx::atomic::MemScopeTy Scope = ompx::atomic::system); /// Fast Cross team sum reduction (xteamr) helper function, see documentation /// above. -void _INLINE_ATTR_ __kmpc_xteamr_ui_32x32_fast_sum( +void _INLINE_ATTR_ __kmpc_xteamr_ui_fast_sum( _UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI), void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI rnv, const uint64_t k, const uint32_t numteams, ompx::atomic::MemScopeTy Scope = ompx::atomic::system); /// Intra-team reduction (iteamr) helper function, see documentation above. 
-void _INLINE_ATTR_ __kmpc_iteamr_ui_32x32(_UI v, _UI *r_ptr, - void (*_rf)(_UI *, _UI), - void (*_rf_lds)(_RF_LDS _UI *, - _RF_LDS _UI *), - const _UI rnv, const uint64_t k); +void _INLINE_ATTR_ __kmpc_iteamr_ui(_UI v, _UI *r_ptr, void (*_rf)(_UI *, _UI), + void (*_rf_lds)(_RF_LDS _UI *, + _RF_LDS _UI *), + const _UI rnv, const uint64_t k); /// Cross team reduction (xteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_xteamr_l_32x32( +void _INLINE_ATTR_ __kmpc_xteamr_l( long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long rnv, const uint64_t k, const uint32_t numteams, ompx::atomic::MemScopeTy Scope = ompx::atomic::system); /// Fast Cross team sum reduction (xteamr) helper function, see documentation /// above. -void _INLINE_ATTR_ __kmpc_xteamr_l_32x32_fast_sum( +void _INLINE_ATTR_ __kmpc_xteamr_l_fast_sum( long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long), void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long rnv, const uint64_t k, const uint32_t numteams, ompx::atomic::MemScopeTy Scope = ompx::atomic::system); /// Intra-team reduction (iteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_iteamr_l_32x32(long v, long *r_ptr, - void (*_rf)(long *, long), - void (*_rf_lds)(_RF_LDS long *, - _RF_LDS long *), - const long rnv, const uint64_t k); +void _INLINE_ATTR_ __kmpc_iteamr_l(long v, long *r_ptr, + void (*_rf)(long *, long), + void (*_rf_lds)(_RF_LDS long *, + _RF_LDS long *), + const long rnv, const uint64_t k); /// Cross team reduction (xteamr) helper function, see documentation above. 
-void _INLINE_ATTR_ __kmpc_xteamr_ul_32x32( +void _INLINE_ATTR_ __kmpc_xteamr_ul( _UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL rnv, const uint64_t k, const uint32_t numteams, ompx::atomic::MemScopeTy Scope = ompx::atomic::system); /// Fast Cross team sum reduction (xteamr) helper function, see documentation /// above. -void _INLINE_ATTR_ __kmpc_xteamr_ul_32x32_fast_sum( +void _INLINE_ATTR_ __kmpc_xteamr_ul_fast_sum( _UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL), void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL rnv, const uint64_t k, const uint32_t numteams, ompx::atomic::MemScopeTy Scope = ompx::atomic::system); /// Intra-team reduction (iteamr) helper function, see documentation above. -void _INLINE_ATTR_ __kmpc_iteamr_ul_32x32(_UL v, _UL *r_ptr, - void (*_rf)(_UL *, _UL), - void (*_rf_lds)(_RF_LDS _UL *, - _RF_LDS _UL *), - const _UL rnv, const uint64_t k); +void _INLINE_ATTR_ __kmpc_iteamr_ul(_UL v, _UL *r_ptr, void (*_rf)(_UL *, _UL), + void (*_rf_lds)(_RF_LDS _UL *, + _RF_LDS _UL *), + const _UL rnv, const uint64_t k); /// Built-in pair reduction function, see documentation above. void __kmpc_rfun_sum_d(double *val, double otherval); @@ -597,6 +354,7 @@ void __kmpc_rfun_sum_lds_l(_RF_LDS long *val, _RF_LDS long *otherval); void __kmpc_rfun_sum_ul(_UL *val, _UL otherval); /// LDS Built-in pair reduction function, see documentation above. void __kmpc_rfun_sum_lds_ul(_RF_LDS _UL *val, _RF_LDS _UL *otherval); + /// Built-in pair reduction function, see documentation above. void __kmpc_rfun_max_d(double *val, double otherval); /// LDS Built-in pair reduction function, see documentation above. @@ -637,6 +395,7 @@ void __kmpc_rfun_max_lds_l(_RF_LDS long *val, _RF_LDS long *otherval); void __kmpc_rfun_max_ul(_UL *val, _UL otherval); /// LDS Built-in pair reduction function, see documentation above. 
void __kmpc_rfun_max_lds_ul(_RF_LDS _UL *val, _RF_LDS _UL *otherval); + /// Built-in pair reduction function, see documentation above. void __kmpc_rfun_min_d(double *val, double otherval); /// LDS Built-in pair reduction function, see documentation above. @@ -686,5 +445,7 @@ void __kmpc_rfun_min_lds_ul(_RF_LDS _UL *val, _RF_LDS _UL *otherval); #undef _UL #undef _INLINE_ATTR_ #undef _RF_LDS +#undef _MaxNumWaves +#undef _WSZ #endif // of ifndef OMPTARGET_DEVICERTL_XTEAMR_H diff --git a/openmp/device/src/Xteamr.cpp b/openmp/device/src/Xteamr.cpp index 8cc448dc70d96..599d323bc9290 100644 --- a/openmp/device/src/Xteamr.cpp +++ b/openmp/device/src/Xteamr.cpp @@ -12,16 +12,24 @@ #include "Xteamr.h" #include "Debug.h" +#include "DeviceUtils.h" #include "Interface.h" #include "Mapping.h" #include "State.h" -#include "Synchronization.h" -#include "DeviceTypes.h" -#include "DeviceUtils.h" -#define __XTEAM_SHARED_LDS volatile __gpu_local - -using namespace ompx::mapping; +#define _CD double _Complex +#define _CF float _Complex +#define _US unsigned short +#define _UI unsigned int +#define _UL unsigned long +#define _INLINE_ATTR_ __attribute__((flatten, always_inline)) +#define _RF_LDS volatile __gpu_local +// Wave size (will be constant-folded since it's known at compile time) +// Should probably be made into constexpr in the future. 
+#define _WSZ __gpu_num_lanes() +// Maximum number of waves in a thread block +// (1024 / _WSZ = 32 or 16 waves, depending on whether _WSZ is 32 or 64) +#define _MaxNumWaves 32 // Headers for specialized shfl_xor double xteamr_shfl_xor_d(double var, const int lane_mask, const uint32_t width); @@ -33,7 +41,6 @@ float _Complex xteamr_shfl_xor_cf(float _Complex var, const int lane_mask, const uint32_t width); // Define the arch (amdgcn vs nvptx) variants of shfl - #ifdef __AMDGPU__ int xteamr_shfl_xor_int(int var, const int lane_mask, const uint32_t width) { int self = ompx::mapping::getThreadIdInWarp(); // __lane_id(); @@ -57,10 +64,7 @@ double xteamr_shfl_xor_d(double var, const int lane_mask, __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; } -#endif - -#ifdef __NVPTX__ - +#elif defined(__NVPTX__) int xteamr_shfl_xor_int(int var, const int lane_mask, const uint32_t width) { return __nvvm_shfl_sync_bfly_i32(0xFFFFFFFF, var, lane_mask, 0x1f); } @@ -97,183 +101,68 @@ float _Complex xteamr_shfl_xor_cf(float _Complex var, const int lane_mask, return var; } -// tag dispatching of type specific shfl_xor, get_low, and get_high -struct _d_tag {}; -struct _f_tag {}; -struct _h_tag {}; -struct _bf_tag {}; -struct _cd_tag {}; -struct _cf_tag {}; -struct _s_tag {}; -struct _us_tag {}; -struct _i_tag {}; -struct _ui_tag {}; -struct _l_tag {}; -struct _ul_tag {}; -template struct __dispatch_tag; -template <> struct __dispatch_tag { - typedef _d_tag type; -}; -template <> struct __dispatch_tag { - typedef _f_tag type; -}; -template <> struct __dispatch_tag<_Float16> { typedef _h_tag type; }; -template <> struct __dispatch_tag<__bf16> { typedef _bf_tag type; }; -template <> struct __dispatch_tag { - typedef _cd_tag type; -}; -template <> struct __dispatch_tag { - typedef _cf_tag type; -}; -template <> struct __dispatch_tag { typedef _s_tag type; }; -template <> struct __dispatch_tag { typedef _us_tag type; }; -template <> struct __dispatch_tag { - typedef _i_tag type; -}; 
-template <> struct __dispatch_tag { - typedef _ui_tag type; -}; -template <> struct __dispatch_tag { - typedef _l_tag type; -}; -template <> struct __dispatch_tag { - typedef _ul_tag type; -}; -template -double xteamr_shfl_xor(_d_tag tag, double var, const int lane_mask) { +// type specific shfl_xor functions +double xteamr_shfl_xor(double var, const int lane_mask) { return xteamr_shfl_xor_d(var, lane_mask, _WSZ); } -template -float xteamr_shfl_xor(_f_tag tag, float var, const int lane_mask) { +float xteamr_shfl_xor(float var, const int lane_mask) { return xteamr_shfl_xor_f(var, lane_mask, _WSZ); } -template -float xteamr_shfl_xor(_h_tag tag, _Float16 var, const int lane_mask) { +float xteamr_shfl_xor(_Float16 var, const int lane_mask) { return xteamr_shfl_xor_f(var, lane_mask, _WSZ); } -template -float xteamr_shfl_xor(_bf_tag tag, __bf16 var, const int lane_mask) { +float xteamr_shfl_xor(__bf16 var, const int lane_mask) { return xteamr_shfl_xor_f(var, lane_mask, _WSZ); } -template -double _Complex xteamr_shfl_xor(_cd_tag tag, double _Complex var, - const int lane_mask) { +double _Complex xteamr_shfl_xor(double _Complex var, const int lane_mask) { return xteamr_shfl_xor_cd(var, lane_mask, _WSZ); } -template -float _Complex xteamr_shfl_xor(_cf_tag tag, float _Complex var, - const int lane_mask) { +float _Complex xteamr_shfl_xor(float _Complex var, const int lane_mask) { return xteamr_shfl_xor_cf(var, lane_mask, _WSZ); } -template -int xteamr_shfl_xor(_s_tag tag, short var, const int lane_mask) { +int xteamr_shfl_xor(short var, const int lane_mask) { return xteamr_shfl_xor_int(var, lane_mask, _WSZ); } -template -unsigned int xteamr_shfl_xor(_us_tag tag, unsigned short var, - const int lane_mask) { +unsigned int xteamr_shfl_xor(unsigned short var, const int lane_mask) { return xteamr_shfl_xor_int(var, lane_mask, _WSZ); } -template -int xteamr_shfl_xor(_i_tag tag, int var, const int lane_mask) { +int xteamr_shfl_xor(int var, const int lane_mask) { return 
xteamr_shfl_xor_int(var, lane_mask, _WSZ); } -template -unsigned int xteamr_shfl_xor(_ui_tag tag, unsigned int var, - const int lane_mask) { +unsigned int xteamr_shfl_xor(unsigned int var, const int lane_mask) { return xteamr_shfl_xor_int(var, lane_mask, _WSZ); } -template -long xteamr_shfl_xor(_l_tag tag, long var, const int lane_mask) { +long xteamr_shfl_xor(long var, const int lane_mask) { return xteamr_shfl_xor_d(var, lane_mask, _WSZ); } -template -unsigned long xteamr_shfl_xor(_ul_tag tag, unsigned long var, - const int lane_mask) { +unsigned long xteamr_shfl_xor(unsigned long var, const int lane_mask) { return xteamr_shfl_xor_d(var, lane_mask, _WSZ); } -template -T xteamr_shfl_xor(T var, const int lane_mask) { - typedef typename __dispatch_tag::type tag; - return xteamr_shfl_xor<_WSZ>(tag(), var, lane_mask); -} - -/// Templated internal function used by extern intra-team reductions -/// -/// \param Template typename parameter T -/// \param Template parameter for maximum number of waves in this kernel. -/// \param Template parameter for warp size, 32 or 64 -/// -/// \param Input thread local (TLS) value for warp shfl reduce -/// \param Pointer to result value, also used in final reduction -/// \param Function pointer to TLS pair reduction function -/// \param Function pointer to LDS pair reduction function -/// \param Reduction null value, used for partial waves -/// \param The iteration value from 0 to (NumTeams*_NUM_THREADS)-1 -/// -template -__attribute__((flatten, always_inline)) void _iteam_reduction( - T val, T *r_ptr, void (*_rf)(T *, T), - void (*_rf_lds)(__XTEAM_SHARED_LDS T *, __XTEAM_SHARED_LDS T *), - const T rnv, const uint64_t k) { - // Must be a power of 2. 
- const uint32_t block_size = ompx::mapping::getNumberOfThreadsInBlock(); - - const uint32_t number_of_waves = (block_size - 1) / _WSZ + 1; - const uint32_t omp_thread_num = k % block_size; - const uint32_t wave_num = omp_thread_num / _WSZ; - const uint32_t lane_num = omp_thread_num % _WSZ; - static __XTEAM_SHARED_LDS T xwave_lds[_MaxNumWaves]; - - // Binary reduce each wave, then copy to xwave_lds[wave_num] - const uint32_t start_offset = block_size < _WSZ ? block_size / 2 : _WSZ / 2; - for (unsigned int offset = start_offset; offset > 0; offset >>= 1) - (*_rf)(&val, xteamr_shfl_xor(val, offset)); - if (lane_num == 0) - xwave_lds[wave_num] = val; - - // Binary reduce all wave values into wave_lds[0] - ompx::synchronize::threadsAligned(ompx::atomic::seq_cst); - for (unsigned int offset = number_of_waves / 2; offset > 0; offset >>= 1) { - if (omp_thread_num < offset) - (*_rf_lds)(&(xwave_lds[omp_thread_num]), - &(xwave_lds[omp_thread_num + offset])); - } - - // We only need xwave_lds[0] correct on thread 0. - if (omp_thread_num == 0) - *r_ptr = xwave_lds[0]; - - ompx::synchronize::threadsAligned(ompx::atomic::seq_cst); -} - /// Templated internal function used by all extern typed reductions /// -/// \param Template typename parameter T -/// \param Template parameter for maximum number of waves in this kernel. -/// \param Template parameter for warp size, 32 or 64 -/// \param Template parameter if an atomic add should be used instead of +/// \param T Template typename parameter T +/// \param _IS_FAST Template parameter if an atomic add should be used instead +/// of /// the 1-team-reduction round. Applies to sum reduction currently. 
/// -/// \param Input thread local (TLS) value for warp shfl reduce -/// \param Pointer to result value, also used in final reduction -/// \param Global array of team values for this reduction only -/// \param Pointer to atomically accessed teams done counter -/// \param Function pointer to TLS pair reduction function -/// \param Function pointer to LDS pair reduction function -/// \param Reduction null value, used for partial waves -/// \param The iteration value from 0 to (NumTeams*_NUM_THREADS)-1 -/// \param The number of teams participating in reduction - -template -__attribute__((flatten, always_inline)) void _xteam_reduction( - T val, T *r_ptr, T *team_vals, uint32_t *teams_done_ptr, - void (*_rf)(T *, T), - void (*_rf_lds)(__XTEAM_SHARED_LDS T *, __XTEAM_SHARED_LDS T *), - const T rnv, const uint64_t k, const uint32_t NumTeams, - ompx::atomic::MemScopeTy Scope) { +/// \param val Input thread local (TLS) value for warp shfl reduce +/// \param r_ptr Pointer to result value, also used in final reduction +/// \param team_vals Global array of team values for this reduction only +/// \param teams_done_ptr Pointer to atomically accessed teams done counter +/// \param _rf Function pointer to TLS pair reduction function +/// \param _rf_lds Function pointer to LDS pair reduction function +/// \param rnv Reduction null value, used for partial waves +/// \param k The iteration value from 0 to (NumTeams*_NUM_THREADS)-1 +/// \param NumTeams The number of teams participating in reduction +/// \param Scope The scope of the atomic operation + +template +_INLINE_ATTR_ void +_xteam_reduction(T val, T *r_ptr, T *team_vals, uint32_t *teams_done_ptr, + void (*_rf)(T *, T), void (*_rf_lds)(_RF_LDS T *, _RF_LDS T *), + const T rnv, const uint64_t k, const uint32_t NumTeams, + ompx::atomic::MemScopeTy Scope) { // More efficient to derive these constants than get from mapped API @@ -286,7 +175,7 @@ __attribute__((flatten, always_inline)) void _xteam_reduction( const uint32_t wave_num 
= omp_thread_num / _WSZ; const uint32_t lane_num = omp_thread_num % _WSZ; - static __XTEAM_SHARED_LDS T xwave_lds[_MaxNumWaves]; + static _RF_LDS T xwave_lds[_MaxNumWaves]; // Cuda may restrict max threads, so clear unused wave values #ifdef __NVPTX__ @@ -301,7 +190,7 @@ __attribute__((flatten, always_inline)) void _xteam_reduction( // Binary reduce each wave, then copy to xwave_lds[wave_num] const uint32_t start_offset = block_size < _WSZ ? block_size / 2 : _WSZ / 2; for (unsigned int offset = start_offset; offset > 0; offset >>= 1) - (*_rf)(&val, xteamr_shfl_xor(val, offset)); + (*_rf)(&val, xteamr_shfl_xor(val, offset)); if (lane_num == 0) xwave_lds[wave_num] = val; @@ -313,16 +202,21 @@ __attribute__((flatten, always_inline)) void _xteam_reduction( &(xwave_lds[omp_thread_num + offset])); } - if (_IS_FAST) { + if constexpr (_IS_FAST) { if (omp_thread_num == 0) ompx::atomic::add(r_ptr, xwave_lds[0], ompx::atomic::seq_cst, Scope); + } else if (NumTeams == 1) { + // We're only doing intra-team reduction, team_vals might be nullptr. + if (omp_thread_num == 0) + *r_ptr = xwave_lds[0]; + ompx::synchronize::threadsAligned(ompx::atomic::seq_cst); } else { // No sync needed here from last reduction in LDS loop // because we only need xwave_lds[0] correct on thread 0. // Save the teams reduced value in team_vals global array // and atomically increment teams_done counter. 
- static __XTEAM_SHARED_LDS uint32_t td; + static _RF_LDS uint32_t td; if (omp_thread_num == 0) { team_vals[omp_team_num] = xwave_lds[0]; td = ompx::atomic::inc(teams_done_ptr, NumTeams - 1u, @@ -349,7 +243,7 @@ __attribute__((flatten, always_inline)) void _xteam_reduction( // Reduce each wave into xwave_lds[wave_num] for (unsigned int offset = start_offset; offset > 0; offset >>= 1) - (*_rf)(&val, xteamr_shfl_xor(val, offset)); + (*_rf)(&val, xteamr_shfl_xor(val, offset)); if (lane_num == 0) xwave_lds[wave_num] = val; @@ -383,518 +277,252 @@ __attribute__((flatten, always_inline)) void _xteam_reduction( } } +/// Internal macro used by extern intra-team reductions +/// +/// \param T Template typename parameter T +/// +/// \param val Input thread local (TLS) value for warp shfl reduce +/// \param r_ptr Pointer to result value, also used in final reduction +/// \param _rf Function pointer to TLS pair reduction function +/// \param _rf_lds Function pointer to LDS pair reduction function +/// \param rnv Reduction null value, used for partial waves +/// \param k The iteration value from 0 to (NumTeams*_NUM_THREADS)-1 +/// +#define _iteam_reduction(T, val, r_ptr, _rf, _rf_lds, rnv, k) \ + _xteam_reduction((val), (r_ptr), nullptr, nullptr, (_rf), (_rf_lds), \ + (rnv), (k), 1, ompx::atomic::MemScopeTy::single) + // Calls to these __kmpc extern C functions are created in clang codegen -// for FORTRAN, c, and C++. They may also be used for sumulation and testing. +// for FORTRAN, c, and C++. They may also be used for simulation and testing. // The headers for these extern C functions are in ../include/Interface.h -// The compiler builds the name based on data type, -// number of waves in the team,and warpsize. +// The compiler builds the name based on the data type. 
// -#define _EXT_ATTR extern "C" __attribute__((flatten, always_inline)) void -#define _CD double _Complex -#define _CF float _Complex -#define _US unsigned short -#define _UI unsigned int -#define _UL unsigned long -#define _LDS volatile __gpu_local +#define _EXT_ATTR extern "C" _INLINE_ATTR_ void _EXT_ATTR -__kmpc_xteamr_d_16x64(double v, double *r_p, double *tvs, uint32_t *td, - void (*rf)(double *, double), - void (*rflds)(_LDS double *, _LDS double *), - const double rnv, const uint64_t k, const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction(v, r_p, tvs, td, rf, rflds, rnv, k, nt, - Scope); -} -_EXT_ATTR -__kmpc_xteamr_d_16x64_fast_sum(double v, double *r_p, double *tvs, uint32_t *td, - void (*rf)(double *, double), - void (*rflds)(_LDS double *, _LDS double *), - const double rnv, const uint64_t k, - const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction(v, r_p, tvs, td, rf, rflds, rnv, k, nt, - Scope); +__kmpc_xteamr_d(double v, double *r_p, double *tvs, uint32_t *td, + void (*rf)(double *, double), + void (*rflds)(_RF_LDS double *, _RF_LDS double *), + const double rnv, const uint64_t k, const uint32_t nt, + ompx::atomic::MemScopeTy Scope) { + _xteam_reduction(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope); } _EXT_ATTR -__kmpc_iteamr_d_16x64(double v, double *r_p, void (*rf)(double *, double), - void (*rflds)(_LDS double *, _LDS double *), - const double rnv, const uint64_t k) { - _iteam_reduction(v, r_p, rf, rflds, rnv, k); +__kmpc_xteamr_d_fast_sum(double v, double *r_p, double *tvs, uint32_t *td, + void (*rf)(double *, double), + void (*rflds)(_RF_LDS double *, _RF_LDS double *), + const double rnv, const uint64_t k, const uint32_t nt, + ompx::atomic::MemScopeTy Scope) { + _xteam_reduction(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope); } _EXT_ATTR -__kmpc_xteamr_f_16x64(float v, float *r_p, float *tvs, uint32_t *td, - void (*rf)(float *, float), - void (*rflds)(_LDS float *, _LDS float *), - const float rnv, const 
uint64_t k, const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction(v, r_p, tvs, td, rf, rflds, rnv, k, nt, - Scope); +__kmpc_iteamr_d(double v, double *r_p, void (*rf)(double *, double), + void (*rflds)(_RF_LDS double *, _RF_LDS double *), + const double rnv, const uint64_t k) { + _iteam_reduction(double, v, r_p, rf, rflds, rnv, k); } _EXT_ATTR -__kmpc_xteamr_f_16x64_fast_sum(float v, float *r_p, float *tvs, uint32_t *td, - void (*rf)(float *, float), - void (*rflds)(_LDS float *, _LDS float *), - const float rnv, const uint64_t k, - const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction(v, r_p, tvs, td, rf, rflds, rnv, k, nt, - Scope); +__kmpc_xteamr_f(float v, float *r_p, float *tvs, uint32_t *td, + void (*rf)(float *, float), + void (*rflds)(_RF_LDS float *, _RF_LDS float *), + const float rnv, const uint64_t k, const uint32_t nt, + ompx::atomic::MemScopeTy Scope) { + _xteam_reduction(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope); } _EXT_ATTR -__kmpc_iteamr_f_16x64(float v, float *r_p, void (*rf)(float *, float), - void (*rflds)(_LDS float *, _LDS float *), - const float rnv, const uint64_t k) { - _iteam_reduction(v, r_p, rf, rflds, rnv, k); +__kmpc_xteamr_f_fast_sum(float v, float *r_p, float *tvs, uint32_t *td, + void (*rf)(float *, float), + void (*rflds)(_RF_LDS float *, _RF_LDS float *), + const float rnv, const uint64_t k, const uint32_t nt, + ompx::atomic::MemScopeTy Scope) { + _xteam_reduction(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope); } _EXT_ATTR -__kmpc_xteamr_h_16x64(_Float16 v, _Float16 *r_p, _Float16 *tvs, uint32_t *td, - void (*rf)(_Float16 *, _Float16), - void (*rflds)(_LDS _Float16 *, _LDS _Float16 *), - const _Float16 rnv, const uint64_t k, const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction<_Float16, 16, 64>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, - Scope); +__kmpc_iteamr_f(float v, float *r_p, void (*rf)(float *, float), + void (*rflds)(_RF_LDS float *, _RF_LDS float *), + const 
float rnv, const uint64_t k) { + _iteam_reduction(float, v, r_p, rf, rflds, rnv, k); } _EXT_ATTR -__kmpc_xteamr_h_16x64_fast_sum(_Float16 v, _Float16 *r_p, _Float16 *tvs, - uint32_t *td, void (*rf)(_Float16 *, _Float16), - void (*rflds)(_LDS _Float16 *, _LDS _Float16 *), - const _Float16 rnv, const uint64_t k, - const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction<_Float16, 16, 64, true>(v, r_p, tvs, td, rf, rflds, rnv, k, - nt, Scope); +__kmpc_xteamr_h(_Float16 v, _Float16 *r_p, _Float16 *tvs, uint32_t *td, + void (*rf)(_Float16 *, _Float16), + void (*rflds)(_RF_LDS _Float16 *, _RF_LDS _Float16 *), + const _Float16 rnv, const uint64_t k, const uint32_t nt, + ompx::atomic::MemScopeTy Scope) { + _xteam_reduction<_Float16>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope); } _EXT_ATTR -__kmpc_iteamr_h_16x64(_Float16 v, _Float16 *r_p, - void (*rf)(_Float16 *, _Float16), - void (*rflds)(_LDS _Float16 *, _LDS _Float16 *), - const _Float16 rnv, const uint64_t k) { - _iteam_reduction<_Float16, 16, 64>(v, r_p, rf, rflds, rnv, k); -} -_EXT_ATTR -__kmpc_xteamr_bf_16x64(__bf16 v, __bf16 *r_p, __bf16 *tvs, uint32_t *td, - void (*rf)(__bf16 *, __bf16), - void (*rflds)(_LDS __bf16 *, _LDS __bf16 *), - const __bf16 rnv, const uint64_t k, const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction<__bf16, 16, 64>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, +__kmpc_xteamr_h_fast_sum(_Float16 v, _Float16 *r_p, _Float16 *tvs, uint32_t *td, + void (*rf)(_Float16 *, _Float16), + void (*rflds)(_RF_LDS _Float16 *, _RF_LDS _Float16 *), + const _Float16 rnv, const uint64_t k, + const uint32_t nt, ompx::atomic::MemScopeTy Scope) { + _xteam_reduction<_Float16, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope); } _EXT_ATTR -__kmpc_xteamr_bf_16x64_fast_sum(__bf16 v, __bf16 *r_p, __bf16 *tvs, - uint32_t *td, void (*rf)(__bf16 *, __bf16), - void (*rflds)(_LDS __bf16 *, _LDS __bf16 *), - const __bf16 rnv, const uint64_t k, - const uint32_t nt, - 
ompx::atomic::MemScopeTy Scope) { - _xteam_reduction<__bf16, 16, 64, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, - Scope); +__kmpc_iteamr_h(_Float16 v, _Float16 *r_p, void (*rf)(_Float16 *, _Float16), + void (*rflds)(_RF_LDS _Float16 *, _RF_LDS _Float16 *), + const _Float16 rnv, const uint64_t k) { + _iteam_reduction(_Float16, v, r_p, rf, rflds, rnv, k); } _EXT_ATTR -__kmpc_iteamr_bf_16x64(__bf16 v, __bf16 *r_p, void (*rf)(__bf16 *, __bf16), - void (*rflds)(_LDS __bf16 *, _LDS __bf16 *), - const __bf16 rnv, const uint64_t k) { - _iteam_reduction<__bf16, 16, 64>(v, r_p, rf, rflds, rnv, k); +__kmpc_xteamr_bf(__bf16 v, __bf16 *r_p, __bf16 *tvs, uint32_t *td, + void (*rf)(__bf16 *, __bf16), + void (*rflds)(_RF_LDS __bf16 *, _RF_LDS __bf16 *), + const __bf16 rnv, const uint64_t k, const uint32_t nt, + ompx::atomic::MemScopeTy Scope) { + _xteam_reduction<__bf16>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope); } _EXT_ATTR -__kmpc_xteamr_s_16x64(short v, short *r_p, short *tvs, uint32_t *td, - void (*rf)(short *, short), - void (*rflds)(_LDS short *, _LDS short *), - const short rnv, const uint64_t k, const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction(v, r_p, tvs, td, rf, rflds, rnv, k, nt, - Scope); +__kmpc_xteamr_bf_fast_sum(__bf16 v, __bf16 *r_p, __bf16 *tvs, uint32_t *td, + void (*rf)(__bf16 *, __bf16), + void (*rflds)(_RF_LDS __bf16 *, _RF_LDS __bf16 *), + const __bf16 rnv, const uint64_t k, const uint32_t nt, + ompx::atomic::MemScopeTy Scope) { + _xteam_reduction<__bf16, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope); } _EXT_ATTR -__kmpc_xteamr_s_16x64_fast_sum(short v, short *r_p, short *tvs, uint32_t *td, - void (*rf)(short *, short), - void (*rflds)(_LDS short *, _LDS short *), - const short rnv, const uint64_t k, - const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction(v, r_p, tvs, td, rf, rflds, rnv, k, nt, - Scope); +__kmpc_iteamr_bf(__bf16 v, __bf16 *r_p, void (*rf)(__bf16 *, __bf16), + void (*rflds)(_RF_LDS 
__bf16 *, _RF_LDS __bf16 *), + const __bf16 rnv, const uint64_t k) { + _iteam_reduction(__bf16, v, r_p, rf, rflds, rnv, k); } _EXT_ATTR -__kmpc_iteamr_s_16x64(short v, short *r_p, void (*rf)(short *, short), - void (*rflds)(_LDS short *, _LDS short *), - const short rnv, const uint64_t k) { - _iteam_reduction(v, r_p, rf, rflds, rnv, k); +__kmpc_xteamr_s(short v, short *r_p, short *tvs, uint32_t *td, + void (*rf)(short *, short), + void (*rflds)(_RF_LDS short *, _RF_LDS short *), + const short rnv, const uint64_t k, const uint32_t nt, + ompx::atomic::MemScopeTy Scope) { + _xteam_reduction(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope); } _EXT_ATTR -__kmpc_xteamr_us_16x64(_US v, _US *r_p, _US *tvs, uint32_t *td, - void (*rf)(_US *, _US), - void (*rflds)(_LDS _US *, _LDS _US *), const _US rnv, - const uint64_t k, const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction<_US, 16, 64>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope); +__kmpc_xteamr_s_fast_sum(short v, short *r_p, short *tvs, uint32_t *td, + void (*rf)(short *, short), + void (*rflds)(_RF_LDS short *, _RF_LDS short *), + const short rnv, const uint64_t k, const uint32_t nt, + ompx::atomic::MemScopeTy Scope) { + _xteam_reduction(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope); } _EXT_ATTR -__kmpc_xteamr_us_16x64_fast_sum(_US v, _US *r_p, _US *tvs, uint32_t *td, - void (*rf)(_US *, _US), - void (*rflds)(_LDS _US *, _LDS _US *), - const _US rnv, const uint64_t k, - const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction<_US, 16, 64, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, - Scope); +__kmpc_iteamr_s(short v, short *r_p, void (*rf)(short *, short), + void (*rflds)(_RF_LDS short *, _RF_LDS short *), + const short rnv, const uint64_t k) { + _iteam_reduction(short, v, r_p, rf, rflds, rnv, k); } _EXT_ATTR -__kmpc_iteamr_us_16x64(_US v, _US *r_p, void (*rf)(_US *, _US), - void (*rflds)(_LDS _US *, _LDS _US *), const _US rnv, - const uint64_t k) { - _iteam_reduction<_US, 16, 
64>(v, r_p, rf, rflds, rnv, k); +__kmpc_xteamr_us(_US v, _US *r_p, _US *tvs, uint32_t *td, + void (*rf)(_US *, _US), + void (*rflds)(_RF_LDS _US *, _RF_LDS _US *), const _US rnv, + const uint64_t k, const uint32_t nt, + ompx::atomic::MemScopeTy Scope) { + _xteam_reduction<_US>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope); } _EXT_ATTR -__kmpc_xteamr_i_16x64(int v, int *r_p, int *tvs, uint32_t *td, - void (*rf)(int *, int), - void (*rflds)(_LDS int *, _LDS int *), const int rnv, - const uint64_t k, const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope); +__kmpc_xteamr_us_fast_sum(_US v, _US *r_p, _US *tvs, uint32_t *td, + void (*rf)(_US *, _US), + void (*rflds)(_RF_LDS _US *, _RF_LDS _US *), + const _US rnv, const uint64_t k, const uint32_t nt, + ompx::atomic::MemScopeTy Scope) { + _xteam_reduction<_US, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope); } _EXT_ATTR -__kmpc_xteamr_i_16x64_fast_sum(int v, int *r_p, int *tvs, uint32_t *td, - void (*rf)(int *, int), - void (*rflds)(_LDS int *, _LDS int *), - const int rnv, const uint64_t k, - const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction(v, r_p, tvs, td, rf, rflds, rnv, k, nt, - Scope); +__kmpc_iteamr_us(_US v, _US *r_p, void (*rf)(_US *, _US), + void (*rflds)(_RF_LDS _US *, _RF_LDS _US *), const _US rnv, + const uint64_t k) { + _iteam_reduction(_US, v, r_p, rf, rflds, rnv, k); } _EXT_ATTR -__kmpc_iteamr_i_16x64(int v, int *r_p, void (*rf)(int *, int), - void (*rflds)(_LDS int *, _LDS int *), const int rnv, - const uint64_t k) { - _iteam_reduction(v, r_p, rf, rflds, rnv, k); +__kmpc_xteamr_i(int v, int *r_p, int *tvs, uint32_t *td, void (*rf)(int *, int), + void (*rflds)(_RF_LDS int *, _RF_LDS int *), const int rnv, + const uint64_t k, const uint32_t nt, + ompx::atomic::MemScopeTy Scope) { + _xteam_reduction(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope); } _EXT_ATTR -__kmpc_xteamr_ui_16x64(_UI v, _UI *r_p, _UI *tvs, 
uint32_t *td, - void (*rf)(_UI *, _UI), - void (*rflds)(_LDS _UI *, _LDS _UI *), const _UI rnv, - const uint64_t k, const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction<_UI, 16, 64>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope); +__kmpc_xteamr_i_fast_sum(int v, int *r_p, int *tvs, uint32_t *td, + void (*rf)(int *, int), + void (*rflds)(_RF_LDS int *, _RF_LDS int *), + const int rnv, const uint64_t k, const uint32_t nt, + ompx::atomic::MemScopeTy Scope) { + _xteam_reduction(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope); } _EXT_ATTR -__kmpc_xteamr_ui_16x64_fast_sum(_UI v, _UI *r_p, _UI *tvs, uint32_t *td, - void (*rf)(_UI *, _UI), - void (*rflds)(_LDS _UI *, _LDS _UI *), - const _UI rnv, const uint64_t k, - const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction<_UI, 16, 64, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, - Scope); +__kmpc_iteamr_i(int v, int *r_p, void (*rf)(int *, int), + void (*rflds)(_RF_LDS int *, _RF_LDS int *), const int rnv, + const uint64_t k) { + _iteam_reduction(int, v, r_p, rf, rflds, rnv, k); } _EXT_ATTR -__kmpc_iteamr_ui_16x64(_UI v, _UI *r_p, void (*rf)(_UI *, _UI), - void (*rflds)(_LDS _UI *, _LDS _UI *), const _UI rnv, - const uint64_t k) { - _iteam_reduction<_UI, 16, 64>(v, r_p, rf, rflds, rnv, k); +__kmpc_xteamr_ui(_UI v, _UI *r_p, _UI *tvs, uint32_t *td, + void (*rf)(_UI *, _UI), + void (*rflds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI rnv, + const uint64_t k, const uint32_t nt, + ompx::atomic::MemScopeTy Scope) { + _xteam_reduction<_UI>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope); } _EXT_ATTR -__kmpc_xteamr_l_16x64(long v, long *r_p, long *tvs, uint32_t *td, - void (*rf)(long *, long), - void (*rflds)(_LDS long *, _LDS long *), const long rnv, - const uint64_t k, const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope); +__kmpc_xteamr_ui_fast_sum(_UI v, _UI *r_p, _UI *tvs, uint32_t *td, + void (*rf)(_UI *, _UI), + void 
(*rflds)(_RF_LDS _UI *, _RF_LDS _UI *), + const _UI rnv, const uint64_t k, const uint32_t nt, + ompx::atomic::MemScopeTy Scope) { + _xteam_reduction<_UI, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope); } _EXT_ATTR -__kmpc_xteamr_l_16x64_fast_sum(long v, long *r_p, long *tvs, uint32_t *td, - void (*rf)(long *, long), - void (*rflds)(_LDS long *, _LDS long *), - const long rnv, const uint64_t k, - const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction(v, r_p, tvs, td, rf, rflds, rnv, k, nt, - Scope); +__kmpc_iteamr_ui(_UI v, _UI *r_p, void (*rf)(_UI *, _UI), + void (*rflds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI rnv, + const uint64_t k) { + _iteam_reduction(_UI, v, r_p, rf, rflds, rnv, k); } _EXT_ATTR -__kmpc_iteamr_l_16x64(long v, long *r_p, void (*rf)(long *, long), - void (*rflds)(_LDS long *, _LDS long *), const long rnv, - const uint64_t k) { - _iteam_reduction(v, r_p, rf, rflds, rnv, k); +__kmpc_xteamr_l(long v, long *r_p, long *tvs, uint32_t *td, + void (*rf)(long *, long), + void (*rflds)(_RF_LDS long *, _RF_LDS long *), const long rnv, + const uint64_t k, const uint32_t nt, + ompx::atomic::MemScopeTy Scope) { + _xteam_reduction(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope); } _EXT_ATTR -__kmpc_xteamr_ul_16x64(_UL v, _UL *r_p, _UL *tvs, uint32_t *td, - void (*rf)(_UL *, _UL), - void (*rflds)(_LDS _UL *, _LDS _UL *), const _UL rnv, - const uint64_t k, const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction<_UL, 16, 64>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope); +__kmpc_xteamr_l_fast_sum(long v, long *r_p, long *tvs, uint32_t *td, + void (*rf)(long *, long), + void (*rflds)(_RF_LDS long *, _RF_LDS long *), + const long rnv, const uint64_t k, const uint32_t nt, + ompx::atomic::MemScopeTy Scope) { + _xteam_reduction(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope); } _EXT_ATTR -__kmpc_xteamr_ul_16x64_fast_sum(_UL v, _UL *r_p, _UL *tvs, uint32_t *td, - void (*rf)(_UL *, _UL), - void (*rflds)(_LDS _UL *, _LDS _UL *), 
- const _UL rnv, const uint64_t k, - const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction<_UL, 16, 64, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, - Scope); +__kmpc_iteamr_l(long v, long *r_p, void (*rf)(long *, long), + void (*rflds)(_RF_LDS long *, _RF_LDS long *), const long rnv, + const uint64_t k) { + _iteam_reduction(long, v, r_p, rf, rflds, rnv, k); } _EXT_ATTR -__kmpc_iteamr_ul_16x64(_UL v, _UL *r_p, void (*rf)(_UL *, _UL), - void (*rflds)(_LDS _UL *, _LDS _UL *), const _UL rnv, - const uint64_t k) { - _iteam_reduction<_UL, 16, 64>(v, r_p, rf, rflds, rnv, k); +__kmpc_xteamr_ul(_UL v, _UL *r_p, _UL *tvs, uint32_t *td, + void (*rf)(_UL *, _UL), + void (*rflds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL rnv, + const uint64_t k, const uint32_t nt, + ompx::atomic::MemScopeTy Scope) { + _xteam_reduction<_UL>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope); } _EXT_ATTR -__kmpc_xteamr_d_32x32(double v, double *r_p, double *tvs, uint32_t *td, - void (*rf)(double *, double), - void (*rflds)(_LDS double *, _LDS double *), - const double rnv, const uint64_t k, const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction(v, r_p, tvs, td, rf, rflds, rnv, k, nt, - Scope); +__kmpc_xteamr_ul_fast_sum(_UL v, _UL *r_p, _UL *tvs, uint32_t *td, + void (*rf)(_UL *, _UL), + void (*rflds)(_RF_LDS _UL *, _RF_LDS _UL *), + const _UL rnv, const uint64_t k, const uint32_t nt, + ompx::atomic::MemScopeTy Scope) { + _xteam_reduction<_UL, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope); } _EXT_ATTR -__kmpc_xteamr_d_32x32_fast_sum(double v, double *r_p, double *tvs, uint32_t *td, - void (*rf)(double *, double), - void (*rflds)(_LDS double *, _LDS double *), - const double rnv, const uint64_t k, - const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction(v, r_p, tvs, td, rf, rflds, rnv, k, nt, - Scope); -} -_EXT_ATTR -__kmpc_iteamr_d_32x32(double v, double *r_p, void (*rf)(double *, double), - void (*rflds)(_LDS double *, _LDS double *), 
- const double rnv, const uint64_t k) { - _iteam_reduction(v, r_p, rf, rflds, rnv, k); -} -_EXT_ATTR -__kmpc_xteamr_f_32x32(float v, float *r_p, float *tvs, uint32_t *td, - void (*rf)(float *, float), - void (*rflds)(_LDS float *, _LDS float *), - const float rnv, const uint64_t k, const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction(v, r_p, tvs, td, rf, rflds, rnv, k, nt, - Scope); -} -_EXT_ATTR -__kmpc_xteamr_f_32x32_fast_sum(float v, float *r_p, float *tvs, uint32_t *td, - void (*rf)(float *, float), - void (*rflds)(_LDS float *, _LDS float *), - const float rnv, const uint64_t k, - const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction(v, r_p, tvs, td, rf, rflds, rnv, k, nt, - Scope); -} -_EXT_ATTR -__kmpc_iteamr_f_32x32(float v, float *r_p, void (*rf)(float *, float), - void (*rflds)(_LDS float *, _LDS float *), - const float rnv, const uint64_t k) { - _iteam_reduction(v, r_p, rf, rflds, rnv, k); -} -_EXT_ATTR -__kmpc_xteamr_h_32x32(_Float16 v, _Float16 *r_p, _Float16 *tvs, uint32_t *td, - void (*rf)(_Float16 *, _Float16), - void (*rflds)(_LDS _Float16 *, _LDS _Float16 *), - const _Float16 rnv, const uint64_t k, const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction<_Float16, 32, 32>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, - Scope); -} -_EXT_ATTR -__kmpc_xteamr_h_32x32_fast_sum(_Float16 v, _Float16 *r_p, _Float16 *tvs, - uint32_t *td, void (*rf)(_Float16 *, _Float16), - void (*rflds)(_LDS _Float16 *, _LDS _Float16 *), - const _Float16 rnv, const uint64_t k, - const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction<_Float16, 32, 32, true>(v, r_p, tvs, td, rf, rflds, rnv, k, - nt, Scope); -} -_EXT_ATTR -__kmpc_iteamr_h_32x32(_Float16 v, _Float16 *r_p, - void (*rf)(_Float16 *, _Float16), - void (*rflds)(_LDS _Float16 *, _LDS _Float16 *), - const _Float16 rnv, const uint64_t k) { - _iteam_reduction<_Float16, 32, 32>(v, r_p, rf, rflds, rnv, k); -} -_EXT_ATTR -__kmpc_xteamr_bf_32x32(__bf16 
v, __bf16 *r_p, __bf16 *tvs, uint32_t *td, - void (*rf)(__bf16 *, __bf16), - void (*rflds)(_LDS __bf16 *, _LDS __bf16 *), - const __bf16 rnv, const uint64_t k, const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction<__bf16, 32, 32>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, - Scope); -} -_EXT_ATTR -__kmpc_xteamr_bf_32x32_fast_sum(__bf16 v, __bf16 *r_p, __bf16 *tvs, - uint32_t *td, void (*rf)(__bf16 *, __bf16), - void (*rflds)(_LDS __bf16 *, _LDS __bf16 *), - const __bf16 rnv, const uint64_t k, - const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction<__bf16, 32, 32, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, - Scope); -} -_EXT_ATTR -__kmpc_iteamr_bf_32x32(__bf16 v, __bf16 *r_p, void (*rf)(__bf16 *, __bf16), - void (*rflds)(_LDS __bf16 *, _LDS __bf16 *), - const __bf16 rnv, const uint64_t k) { - _iteam_reduction<__bf16, 32, 32>(v, r_p, rf, rflds, rnv, k); -} -_EXT_ATTR -__kmpc_xteamr_s_32x32(short v, short *r_p, short *tvs, uint32_t *td, - void (*rf)(short *, short), - void (*rflds)(_LDS short *, _LDS short *), - const short rnv, const uint64_t k, const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction(v, r_p, tvs, td, rf, rflds, rnv, k, nt, - Scope); -} -_EXT_ATTR -__kmpc_xteamr_s_32x32_fast_sum(short v, short *r_p, short *tvs, uint32_t *td, - void (*rf)(short *, short), - void (*rflds)(_LDS short *, _LDS short *), - const short rnv, const uint64_t k, - const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction(v, r_p, tvs, td, rf, rflds, rnv, k, nt, - Scope); -} -_EXT_ATTR -__kmpc_iteamr_s_32x32(short v, short *r_p, void (*rf)(short *, short), - void (*rflds)(_LDS short *, _LDS short *), - const short rnv, const uint64_t k) { - _iteam_reduction(v, r_p, rf, rflds, rnv, k); -} -_EXT_ATTR -__kmpc_xteamr_us_32x32(_US v, _US *r_p, _US *tvs, uint32_t *td, - void (*rf)(_US *, _US), - void (*rflds)(_LDS _US *, _LDS _US *), const _US rnv, - const uint64_t k, const uint32_t nt, - ompx::atomic::MemScopeTy 
Scope) { - _xteam_reduction<_US, 32, 32>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope); -} -_EXT_ATTR -__kmpc_xteamr_us_32x32_fast_sum(_US v, _US *r_p, _US *tvs, uint32_t *td, - void (*rf)(_US *, _US), - void (*rflds)(_LDS _US *, _LDS _US *), - const _US rnv, const uint64_t k, - const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction<_US, 32, 32, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, - Scope); -} -_EXT_ATTR -__kmpc_iteamr_us_32x32(_US v, _US *r_p, void (*rf)(_US *, _US), - void (*rflds)(_LDS _US *, _LDS _US *), const _US rnv, - const uint64_t k) { - _iteam_reduction<_US, 32, 32>(v, r_p, rf, rflds, rnv, k); -} -_EXT_ATTR -__kmpc_xteamr_i_32x32(int v, int *r_p, int *tvs, uint32_t *td, - void (*rf)(int *, int), - void (*rflds)(_LDS int *, _LDS int *), const int rnv, - const uint64_t k, const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope); -} -_EXT_ATTR -__kmpc_xteamr_i_32x32_fast_sum(int v, int *r_p, int *tvs, uint32_t *td, - void (*rf)(int *, int), - void (*rflds)(_LDS int *, _LDS int *), - const int rnv, const uint64_t k, - const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction(v, r_p, tvs, td, rf, rflds, rnv, k, nt, - Scope); -} -_EXT_ATTR -__kmpc_iteamr_i_32x32(int v, int *r_p, void (*rf)(int *, int), - void (*rflds)(_LDS int *, _LDS int *), const int rnv, - const uint64_t k) { - _iteam_reduction(v, r_p, rf, rflds, rnv, k); -} -_EXT_ATTR -__kmpc_xteamr_ui_32x32(_UI v, _UI *r_p, _UI *tvs, uint32_t *td, - void (*rf)(_UI *, _UI), - void (*rflds)(_LDS _UI *, _LDS _UI *), const _UI rnv, - const uint64_t k, const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction<_UI, 32, 32>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope); -} -_EXT_ATTR -__kmpc_xteamr_ui_32x32_fast_sum(_UI v, _UI *r_p, _UI *tvs, uint32_t *td, - void (*rf)(_UI *, _UI), - void (*rflds)(_LDS _UI *, _LDS _UI *), - const _UI rnv, const uint64_t k, - const uint32_t nt, - 
ompx::atomic::MemScopeTy Scope) { - _xteam_reduction<_UI, 32, 32, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, - Scope); -} -_EXT_ATTR -__kmpc_iteamr_ui_32x32(_UI v, _UI *r_p, void (*rf)(_UI *, _UI), - void (*rflds)(_LDS _UI *, _LDS _UI *), const _UI rnv, - const uint64_t k) { - _iteam_reduction<_UI, 32, 32>(v, r_p, rf, rflds, rnv, k); -} -_EXT_ATTR -__kmpc_xteamr_l_32x32(long v, long *r_p, long *tvs, uint32_t *td, - void (*rf)(long *, long), - void (*rflds)(_LDS long *, _LDS long *), const long rnv, - const uint64_t k, const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope); -} -_EXT_ATTR -__kmpc_xteamr_l_32x32_fast_sum(long v, long *r_p, long *tvs, uint32_t *td, - void (*rf)(long *, long), - void (*rflds)(_LDS long *, _LDS long *), - const long rnv, const uint64_t k, - const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction(v, r_p, tvs, td, rf, rflds, rnv, k, nt, - Scope); -} -_EXT_ATTR -__kmpc_iteamr_l_32x32(long v, long *r_p, void (*rf)(long *, long), - void (*rflds)(_LDS long *, _LDS long *), const long rnv, - const uint64_t k) { - _iteam_reduction(v, r_p, rf, rflds, rnv, k); -} -_EXT_ATTR -__kmpc_xteamr_ul_32x32(_UL v, _UL *r_p, _UL *tvs, uint32_t *td, - void (*rf)(_UL *, _UL), - void (*rflds)(_LDS _UL *, _LDS _UL *), const _UL rnv, - const uint64_t k, const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction<_UL, 32, 32>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope); -} -_EXT_ATTR -__kmpc_xteamr_ul_32x32_fast_sum(_UL v, _UL *r_p, _UL *tvs, uint32_t *td, - void (*rf)(_UL *, _UL), - void (*rflds)(_LDS _UL *, _LDS _UL *), - const _UL rnv, const uint64_t k, - const uint32_t nt, - ompx::atomic::MemScopeTy Scope) { - _xteam_reduction<_UL, 32, 32, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, - Scope); -} -_EXT_ATTR -__kmpc_iteamr_ul_32x32(_UL v, _UL *r_p, void (*rf)(_UL *, _UL), - void (*rflds)(_LDS _UL *, _LDS _UL *), const _UL rnv, - const uint64_t k) { - 
_iteam_reduction<_UL, 32, 32>(v, r_p, rf, rflds, rnv, k); +__kmpc_iteamr_ul(_UL v, _UL *r_p, void (*rf)(_UL *, _UL), + void (*rflds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL rnv, + const uint64_t k) { + _iteam_reduction(_UL, v, r_p, rf, rflds, rnv, k); } // Built-in pair reduction functions used as function pointers for // cross team reduction functions. -#define _RF_LDS volatile __gpu_local - _EXT_ATTR __kmpc_rfun_sum_d(double *val, double otherval) { *val += otherval; } _EXT_ATTR __kmpc_rfun_sum_lds_d(_RF_LDS double *val, _RF_LDS double *otherval) { *val += *otherval; @@ -1071,11 +699,13 @@ _EXT_ATTR __kmpc_rfun_min_ul(_UL *val, _UL otherval) { _EXT_ATTR __kmpc_rfun_min_lds_ul(_RF_LDS _UL *val, _RF_LDS _UL *otherval) { *val = (*otherval < *val) ? *otherval : *val; } -#undef _EXT_ATTR + #undef _CD #undef _CF #undef _US #undef _UI #undef _UL -#undef _LDS +#undef _INLINE_ATTR_ #undef _RF_LDS +#undef _MaxNumWaves +#undef _WSZ