diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 74ea95bce8a47..29d1878aa8df6 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -2981,110 +2981,54 @@ llvm::Value *CGOpenMPRuntimeGPU::getXteamRedOperation(
 
   if (RedVarType->isIntegerTy()) {
     if (RedVarType->getPrimitiveSizeInBits() == 16) {
-      if (WarpSize == 32) {
-        return CGF.EmitRuntimeCall(
-            OMPBuilder.getOrCreateRuntimeFunction(
-                CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_s_32x32_fast_sum
-                                        : OMPRTL___kmpc_xteamr_s_32x32),
-            Args);
-      } else {
-        return CGF.EmitRuntimeCall(
-            OMPBuilder.getOrCreateRuntimeFunction(
-                CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_s_16x64_fast_sum
-                                        : OMPRTL___kmpc_xteamr_s_16x64),
-            Args);
-      }
-    }
-    if (RedVarType->getPrimitiveSizeInBits() == 32) {
-      if (WarpSize == 32) {
-        return CGF.EmitRuntimeCall(
-            OMPBuilder.getOrCreateRuntimeFunction(
-                CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_i_32x32_fast_sum
-                                        : OMPRTL___kmpc_xteamr_i_32x32),
-            Args);
-      } else {
-        return CGF.EmitRuntimeCall(
-            OMPBuilder.getOrCreateRuntimeFunction(
-                CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_i_16x64_fast_sum
-                                        : OMPRTL___kmpc_xteamr_i_16x64),
-            Args);
-      }
-    }
-    if (RedVarType->getPrimitiveSizeInBits() == 64) {
-      if (WarpSize == 32) {
-        return CGF.EmitRuntimeCall(
-            OMPBuilder.getOrCreateRuntimeFunction(
-                CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_l_32x32_fast_sum
-                                        : OMPRTL___kmpc_xteamr_l_32x32),
-            Args);
-      } else {
-        return CGF.EmitRuntimeCall(
-            OMPBuilder.getOrCreateRuntimeFunction(
-                CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_l_16x64_fast_sum
-                                        : OMPRTL___kmpc_xteamr_l_16x64),
-            Args);
-      }
-    }
-  }
-  if (RedVarType->isFloatTy()) {
-    if (WarpSize == 32) {
       return CGF.EmitRuntimeCall(
           OMPBuilder.getOrCreateRuntimeFunction(
-              CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_f_32x32_fast_sum
-                                      : OMPRTL___kmpc_xteamr_f_32x32),
-          Args);
-    } else {
-      return CGF.EmitRuntimeCall(
-          OMPBuilder.getOrCreateRuntimeFunction(
-              CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_f_16x64_fast_sum
-                                      : OMPRTL___kmpc_xteamr_f_16x64),
+              CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_s_fast_sum
+                                      : OMPRTL___kmpc_xteamr_s),
           Args);
     }
-  }
-  if (RedVarType->isDoubleTy()) {
-    if (WarpSize == 32) {
+    if (RedVarType->getPrimitiveSizeInBits() == 32) {
       return CGF.EmitRuntimeCall(
           OMPBuilder.getOrCreateRuntimeFunction(
-              CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_d_32x32_fast_sum
-                                      : OMPRTL___kmpc_xteamr_d_32x32),
+              CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_i_fast_sum
+                                      : OMPRTL___kmpc_xteamr_i),
           Args);
-    } else {
+    }
+    if (RedVarType->getPrimitiveSizeInBits() == 64) {
       return CGF.EmitRuntimeCall(
           OMPBuilder.getOrCreateRuntimeFunction(
-              CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_d_16x64_fast_sum
-                                      : OMPRTL___kmpc_xteamr_d_16x64),
+              CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_l_fast_sum
+                                      : OMPRTL___kmpc_xteamr_l),
           Args);
     }
   }
+  if (RedVarType->isFloatTy()) {
+    return CGF.EmitRuntimeCall(
+        OMPBuilder.getOrCreateRuntimeFunction(
+            CGM.getModule(),
+            IsFast ? OMPRTL___kmpc_xteamr_f_fast_sum : OMPRTL___kmpc_xteamr_f),
+        Args);
+  }
+  if (RedVarType->isDoubleTy()) {
+    return CGF.EmitRuntimeCall(
+        OMPBuilder.getOrCreateRuntimeFunction(
+            CGM.getModule(),
+            IsFast ? OMPRTL___kmpc_xteamr_d_fast_sum : OMPRTL___kmpc_xteamr_d),
+        Args);
+  }
   if (RedVarType->isHalfTy()) {
-    if (WarpSize == 32) {
-      return CGF.EmitRuntimeCall(
-          OMPBuilder.getOrCreateRuntimeFunction(
-              CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_h_32x32_fast_sum
-                                      : OMPRTL___kmpc_xteamr_h_32x32),
-          Args);
-    } else {
-      return CGF.EmitRuntimeCall(
-          OMPBuilder.getOrCreateRuntimeFunction(
-              CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_h_16x64_fast_sum
-                                      : OMPRTL___kmpc_xteamr_h_16x64),
-          Args);
-    }
+    return CGF.EmitRuntimeCall(
+        OMPBuilder.getOrCreateRuntimeFunction(
+            CGM.getModule(),
+            IsFast ? OMPRTL___kmpc_xteamr_h_fast_sum : OMPRTL___kmpc_xteamr_h),
+        Args);
   }
   if (RedVarType->isBFloatTy()) {
-    if (WarpSize == 32) {
-      return CGF.EmitRuntimeCall(
-          OMPBuilder.getOrCreateRuntimeFunction(
-              CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_bf_32x32_fast_sum
-                                      : OMPRTL___kmpc_xteamr_bf_32x32),
-          Args);
-    } else {
-      return CGF.EmitRuntimeCall(
-          OMPBuilder.getOrCreateRuntimeFunction(
-              CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_bf_16x64_fast_sum
-                                      : OMPRTL___kmpc_xteamr_bf_16x64),
-          Args);
-    }
+    return CGF.EmitRuntimeCall(
+        OMPBuilder.getOrCreateRuntimeFunction(
+            CGM.getModule(), IsFast ? OMPRTL___kmpc_xteamr_bf_fast_sum
+                                    : OMPRTL___kmpc_xteamr_bf),
+        Args);
   }
   llvm_unreachable("No support for other types currently.");
 }
diff --git a/clang/test/OpenMP/fast_red_codegen.cpp b/clang/test/OpenMP/fast_red_codegen.cpp
index f62249249031e..d73af823ebeda 100644
--- a/clang/test/OpenMP/fast_red_codegen.cpp
+++ b/clang/test/OpenMP/fast_red_codegen.cpp
@@ -193,7 +193,7 @@ int main()
 // CHECK-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d_fast_sum(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -296,7 +296,7 @@ int main()
 // CHECK-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d_fast_sum(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -415,7 +415,7 @@ int main()
 // CHECK-NEXT:    [[TMP32:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP33:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP34:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP34]], ptr [[TMP2]], ptr [[TMP32]], ptr [[TMP33]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d_fast_sum(double [[TMP34]], ptr [[TMP2]], ptr [[TMP32]], ptr [[TMP33]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -533,7 +533,7 @@ int main()
 // CHECK-NEXT:    [[TMP34:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP35:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP36:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP36]], ptr [[TMP2]], ptr [[TMP34]], ptr [[TMP35]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP19]], i32 [[TMP18]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d_fast_sum(double [[TMP36]], ptr [[TMP2]], ptr [[TMP34]], ptr [[TMP35]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP19]], i32 [[TMP18]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -734,7 +734,7 @@ int main()
 // CHECK-NEXT:    [[TMP54:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP55:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP56:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP56]], ptr [[TMP2]], ptr [[TMP54]], ptr [[TMP55]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP29]], i32 [[TMP30]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d_fast_sum(double [[TMP56]], ptr [[TMP2]], ptr [[TMP54]], ptr [[TMP55]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP29]], i32 [[TMP30]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -836,7 +836,7 @@ int main()
 // CHECK-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d_fast_sum(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -938,7 +938,7 @@ int main()
 // CHECK-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d_fast_sum(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -1040,7 +1040,7 @@ int main()
 // CHECK-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d_fast_sum(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -1142,7 +1142,7 @@ int main()
 // CHECK-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d_fast_sum(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -1257,7 +1257,7 @@ int main()
 // CHECK-NEXT:    [[TMP33:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP34:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP35:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4
-// CHECK-NEXT:    call void @__kmpc_xteamr_i_16x64_fast_sum(i32 [[TMP35]], ptr [[TMP2]], ptr [[TMP33]], ptr [[TMP34]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP19]], i32 [[TMP18]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_i_fast_sum(i32 [[TMP35]], ptr [[TMP2]], ptr [[TMP33]], ptr [[TMP34]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP19]], i32 [[TMP18]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -1372,7 +1372,7 @@ int main()
 // CHECK-NEXT:    [[TMP33:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP34:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP35:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4
-// CHECK-NEXT:    call void @__kmpc_xteamr_i_16x64_fast_sum(i32 [[TMP35]], ptr [[TMP2]], ptr [[TMP33]], ptr [[TMP34]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP19]], i32 [[TMP18]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_i_fast_sum(i32 [[TMP35]], ptr [[TMP2]], ptr [[TMP33]], ptr [[TMP34]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP19]], i32 [[TMP18]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -1488,7 +1488,7 @@ int main()
 // CHECK-NEXT:    [[TMP34:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP35:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP36:%.*]] = load i64, ptr addrspace(5) [[TMP7]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_l_16x64_fast_sum(i64 [[TMP36]], ptr [[TMP2]], ptr [[TMP34]], ptr [[TMP35]], ptr @__kmpc_rfun_sum_l, ptr @__kmpc_rfun_sum_lds_l, i64 0, i64 [[TMP19]], i32 [[TMP18]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_l_fast_sum(i64 [[TMP36]], ptr [[TMP2]], ptr [[TMP34]], ptr [[TMP35]], ptr @__kmpc_rfun_sum_l, ptr @__kmpc_rfun_sum_lds_l, i64 0, i64 [[TMP19]], i32 [[TMP18]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -1604,7 +1604,7 @@ int main()
 // CHECK-NEXT:    [[TMP34:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP35:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP36:%.*]] = load i64, ptr addrspace(5) [[TMP7]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_l_16x64_fast_sum(i64 [[TMP36]], ptr [[TMP2]], ptr [[TMP34]], ptr [[TMP35]], ptr @__kmpc_rfun_sum_l, ptr @__kmpc_rfun_sum_lds_l, i64 0, i64 [[TMP19]], i32 [[TMP18]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_l_fast_sum(i64 [[TMP36]], ptr [[TMP2]], ptr [[TMP34]], ptr [[TMP35]], ptr @__kmpc_rfun_sum_l, ptr @__kmpc_rfun_sum_lds_l, i64 0, i64 [[TMP19]], i32 [[TMP18]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -1776,6 +1776,6 @@ int main()
 // CHECK-NEXT:    [[TMP42:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP43:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP44:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP44]], ptr [[TMP2]], ptr [[TMP42]], ptr [[TMP43]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP19]], i32 [[TMP18]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d_fast_sum(double [[TMP44]], ptr [[TMP2]], ptr [[TMP42]], ptr [[TMP43]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP19]], i32 [[TMP18]], i32 1)
 // CHECK-NEXT:    ret void
 //
diff --git a/clang/test/OpenMP/multi_device_codegen.cpp b/clang/test/OpenMP/multi_device_codegen.cpp
index 4206f84f0b894..1be257a46c313 100644
--- a/clang/test/OpenMP/multi_device_codegen.cpp
+++ b/clang/test/OpenMP/multi_device_codegen.cpp
@@ -207,7 +207,7 @@ int main()
 // CHECK-NEXT:    [[TMP36:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP37:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP38:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP38]], ptr [[TMP4]], ptr [[TMP36]], ptr [[TMP37]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP24]], i32 [[TMP23]], i32 0)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d_fast_sum(double [[TMP38]], ptr [[TMP4]], ptr [[TMP36]], ptr [[TMP37]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP24]], i32 [[TMP23]], i32 0)
 // CHECK-NEXT:    ret void
 //
 //
@@ -324,7 +324,7 @@ int main()
 // CHECK-NEXT:    [[TMP36:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP37:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP38:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP38]], ptr [[TMP4]], ptr [[TMP36]], ptr [[TMP37]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP24]], i32 [[TMP23]], i32 0)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d_fast_sum(double [[TMP38]], ptr [[TMP4]], ptr [[TMP36]], ptr [[TMP37]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP24]], i32 [[TMP23]], i32 0)
 // CHECK-NEXT:    ret void
 //
 //
@@ -457,7 +457,7 @@ int main()
 // CHECK-NEXT:    [[TMP39:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP40:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP41:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP41]], ptr [[TMP4]], ptr [[TMP39]], ptr [[TMP40]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP24]], i32 [[TMP23]], i32 0)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d_fast_sum(double [[TMP41]], ptr [[TMP4]], ptr [[TMP39]], ptr [[TMP40]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP24]], i32 [[TMP23]], i32 0)
 // CHECK-NEXT:    ret void
 //
 //
@@ -589,7 +589,7 @@ int main()
 // CHECK-NEXT:    [[TMP41:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP42:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP43:%.*]] = load double, ptr addrspace(5) [[TMP9]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP43]], ptr [[TMP4]], ptr [[TMP41]], ptr [[TMP42]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP26]], i32 [[TMP25]], i32 0)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d_fast_sum(double [[TMP43]], ptr [[TMP4]], ptr [[TMP41]], ptr [[TMP42]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP26]], i32 [[TMP25]], i32 0)
 // CHECK-NEXT:    ret void
 //
 //
@@ -801,7 +801,7 @@ int main()
 // CHECK-NEXT:    [[TMP58:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP59:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP60:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP60]], ptr [[TMP4]], ptr [[TMP58]], ptr [[TMP59]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP33]], i32 [[TMP34]], i32 0)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d_fast_sum(double [[TMP60]], ptr [[TMP4]], ptr [[TMP58]], ptr [[TMP59]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP33]], i32 [[TMP34]], i32 0)
 // CHECK-NEXT:    ret void
 //
 //
@@ -917,7 +917,7 @@ int main()
 // CHECK-NEXT:    [[TMP36:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP37:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP38:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP38]], ptr [[TMP4]], ptr [[TMP36]], ptr [[TMP37]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP24]], i32 [[TMP23]], i32 0)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d_fast_sum(double [[TMP38]], ptr [[TMP4]], ptr [[TMP36]], ptr [[TMP37]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP24]], i32 [[TMP23]], i32 0)
 // CHECK-NEXT:    ret void
 //
 //
@@ -1033,7 +1033,7 @@ int main()
 // CHECK-NEXT:    [[TMP36:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP37:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP38:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP38]], ptr [[TMP4]], ptr [[TMP36]], ptr [[TMP37]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP24]], i32 [[TMP23]], i32 0)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d_fast_sum(double [[TMP38]], ptr [[TMP4]], ptr [[TMP36]], ptr [[TMP37]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP24]], i32 [[TMP23]], i32 0)
 // CHECK-NEXT:    ret void
 //
 //
@@ -1149,7 +1149,7 @@ int main()
 // CHECK-NEXT:    [[TMP36:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP37:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP38:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP38]], ptr [[TMP4]], ptr [[TMP36]], ptr [[TMP37]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP24]], i32 [[TMP23]], i32 0)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d_fast_sum(double [[TMP38]], ptr [[TMP4]], ptr [[TMP36]], ptr [[TMP37]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP24]], i32 [[TMP23]], i32 0)
 // CHECK-NEXT:    ret void
 //
 //
@@ -1265,7 +1265,7 @@ int main()
 // CHECK-NEXT:    [[TMP36:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP37:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP38:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP38]], ptr [[TMP4]], ptr [[TMP36]], ptr [[TMP37]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP24]], i32 [[TMP23]], i32 0)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d_fast_sum(double [[TMP38]], ptr [[TMP4]], ptr [[TMP36]], ptr [[TMP37]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP24]], i32 [[TMP23]], i32 0)
 // CHECK-NEXT:    ret void
 //
 //
@@ -1394,7 +1394,7 @@ int main()
 // CHECK-NEXT:    [[TMP40:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP41:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP42:%.*]] = load i32, ptr addrspace(5) [[TMP9]], align 4
-// CHECK-NEXT:    call void @__kmpc_xteamr_i_16x64_fast_sum(i32 [[TMP42]], ptr [[TMP4]], ptr [[TMP40]], ptr [[TMP41]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP26]], i32 [[TMP25]], i32 0)
+// CHECK-NEXT:    call void @__kmpc_xteamr_i_fast_sum(i32 [[TMP42]], ptr [[TMP4]], ptr [[TMP40]], ptr [[TMP41]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP26]], i32 [[TMP25]], i32 0)
 // CHECK-NEXT:    ret void
 //
 //
@@ -1523,7 +1523,7 @@ int main()
 // CHECK-NEXT:    [[TMP40:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP41:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP42:%.*]] = load i32, ptr addrspace(5) [[TMP9]], align 4
-// CHECK-NEXT:    call void @__kmpc_xteamr_i_16x64_fast_sum(i32 [[TMP42]], ptr [[TMP4]], ptr [[TMP40]], ptr [[TMP41]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP26]], i32 [[TMP25]], i32 0)
+// CHECK-NEXT:    call void @__kmpc_xteamr_i_fast_sum(i32 [[TMP42]], ptr [[TMP4]], ptr [[TMP40]], ptr [[TMP41]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP26]], i32 [[TMP25]], i32 0)
 // CHECK-NEXT:    ret void
 //
 //
@@ -1653,7 +1653,7 @@ int main()
 // CHECK-NEXT:    [[TMP41:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP42:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP43:%.*]] = load i64, ptr addrspace(5) [[TMP9]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_l_16x64_fast_sum(i64 [[TMP43]], ptr [[TMP4]], ptr [[TMP41]], ptr [[TMP42]], ptr @__kmpc_rfun_sum_l, ptr @__kmpc_rfun_sum_lds_l, i64 0, i64 [[TMP26]], i32 [[TMP25]], i32 0)
+// CHECK-NEXT:    call void @__kmpc_xteamr_l_fast_sum(i64 [[TMP43]], ptr [[TMP4]], ptr [[TMP41]], ptr [[TMP42]], ptr @__kmpc_rfun_sum_l, ptr @__kmpc_rfun_sum_lds_l, i64 0, i64 [[TMP26]], i32 [[TMP25]], i32 0)
 // CHECK-NEXT:    ret void
 //
 //
@@ -1783,7 +1783,7 @@ int main()
 // CHECK-NEXT:    [[TMP41:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP42:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP43:%.*]] = load i64, ptr addrspace(5) [[TMP9]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_l_16x64_fast_sum(i64 [[TMP43]], ptr [[TMP4]], ptr [[TMP41]], ptr [[TMP42]], ptr @__kmpc_rfun_sum_l, ptr @__kmpc_rfun_sum_lds_l, i64 0, i64 [[TMP26]], i32 [[TMP25]], i32 0)
+// CHECK-NEXT:    call void @__kmpc_xteamr_l_fast_sum(i64 [[TMP43]], ptr [[TMP4]], ptr [[TMP41]], ptr [[TMP42]], ptr @__kmpc_rfun_sum_l, ptr @__kmpc_rfun_sum_lds_l, i64 0, i64 [[TMP26]], i32 [[TMP25]], i32 0)
 // CHECK-NEXT:    ret void
 //
 //
@@ -1969,6 +1969,6 @@ int main()
 // CHECK-NEXT:    [[TMP49:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP50:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP51:%.*]] = load double, ptr addrspace(5) [[TMP9]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64_fast_sum(double [[TMP51]], ptr [[TMP4]], ptr [[TMP49]], ptr [[TMP50]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP26]], i32 [[TMP25]], i32 0)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d_fast_sum(double [[TMP51]], ptr [[TMP4]], ptr [[TMP49]], ptr [[TMP50]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP26]], i32 [[TMP25]], i32 0)
 // CHECK-NEXT:    ret void
 //
diff --git a/clang/test/OpenMP/xteam_red_callee.cpp b/clang/test/OpenMP/xteam_red_callee.cpp
index 2c3317df7026d..386cd0876c2a2 100644
--- a/clang/test/OpenMP/xteam_red_callee.cpp
+++ b/clang/test/OpenMP/xteam_red_callee.cpp
@@ -903,7 +903,7 @@ int main()
 // CHECK-NEXT:    [[TMP27:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP28:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP29:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64(double [[TMP29]], ptr [[TMP2]], ptr [[TMP27]], ptr [[TMP28]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d(double [[TMP29]], ptr [[TMP2]], ptr [[TMP27]], ptr [[TMP28]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -2497,6 +2497,6 @@ int main()
 // CHECK-NEXT:    [[TMP27:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP28:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP29:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64(double [[TMP29]], ptr [[TMP2]], ptr [[TMP27]], ptr [[TMP28]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d(double [[TMP29]], ptr [[TMP2]], ptr [[TMP27]], ptr [[TMP28]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
 // CHECK-NEXT:    ret void
 //
diff --git a/clang/test/OpenMP/xteam_red_codegen.cpp b/clang/test/OpenMP/xteam_red_codegen.cpp
index 39e25b1b98db8..7ad033508a219 100644
--- a/clang/test/OpenMP/xteam_red_codegen.cpp
+++ b/clang/test/OpenMP/xteam_red_codegen.cpp
@@ -193,7 +193,7 @@ int main()
 // CHECK-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -296,7 +296,7 @@ int main()
 // CHECK-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -415,7 +415,7 @@ int main()
 // CHECK-NEXT:    [[TMP32:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP33:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP34:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64(double [[TMP34]], ptr [[TMP2]], ptr [[TMP32]], ptr [[TMP33]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d(double [[TMP34]], ptr [[TMP2]], ptr [[TMP32]], ptr [[TMP33]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -533,7 +533,7 @@ int main()
 // CHECK-NEXT:    [[TMP34:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP35:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP36:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64(double [[TMP36]], ptr [[TMP2]], ptr [[TMP34]], ptr [[TMP35]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP19]], i32 [[TMP18]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d(double [[TMP36]], ptr [[TMP2]], ptr [[TMP34]], ptr [[TMP35]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP19]], i32 [[TMP18]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -734,7 +734,7 @@ int main()
 // CHECK-NEXT:    [[TMP54:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP55:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP56:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64(double [[TMP56]], ptr [[TMP2]], ptr [[TMP54]], ptr [[TMP55]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP29]], i32 [[TMP30]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d(double [[TMP56]], ptr [[TMP2]], ptr [[TMP54]], ptr [[TMP55]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP29]], i32 [[TMP30]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -836,7 +836,7 @@ int main()
 // CHECK-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -938,7 +938,7 @@ int main()
 // CHECK-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -1040,7 +1040,7 @@ int main()
 // CHECK-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -1142,7 +1142,7 @@ int main()
 // CHECK-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -1257,7 +1257,7 @@ int main()
 // CHECK-NEXT:    [[TMP33:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP34:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP35:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4
-// CHECK-NEXT:    call void @__kmpc_xteamr_i_16x64(i32 [[TMP35]], ptr [[TMP2]], ptr [[TMP33]], ptr [[TMP34]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP19]], i32 [[TMP18]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_i(i32 [[TMP35]], ptr [[TMP2]], ptr [[TMP33]], ptr [[TMP34]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP19]], i32 [[TMP18]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -1372,7 +1372,7 @@ int main()
 // CHECK-NEXT:    [[TMP33:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP34:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP35:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4
-// CHECK-NEXT:    call void @__kmpc_xteamr_i_16x64(i32 [[TMP35]], ptr [[TMP2]], ptr [[TMP33]], ptr [[TMP34]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP19]], i32 [[TMP18]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_i(i32 [[TMP35]], ptr [[TMP2]], ptr [[TMP33]], ptr [[TMP34]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP19]], i32 [[TMP18]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -1488,7 +1488,7 @@ int main()
 // CHECK-NEXT:    [[TMP34:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP35:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP36:%.*]] = load i64, ptr addrspace(5) [[TMP7]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_l_16x64(i64 [[TMP36]], ptr [[TMP2]], ptr [[TMP34]], ptr [[TMP35]], ptr @__kmpc_rfun_sum_l, ptr @__kmpc_rfun_sum_lds_l, i64 0, i64 [[TMP19]], i32 [[TMP18]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_l(i64 [[TMP36]], ptr [[TMP2]], ptr [[TMP34]], ptr [[TMP35]], ptr @__kmpc_rfun_sum_l, ptr @__kmpc_rfun_sum_lds_l, i64 0, i64 [[TMP19]], i32 [[TMP18]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -1604,7 +1604,7 @@ int main()
 // CHECK-NEXT:    [[TMP34:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP35:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP36:%.*]] = load i64, ptr addrspace(5) [[TMP7]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_l_16x64(i64 [[TMP36]], ptr [[TMP2]], ptr [[TMP34]], ptr [[TMP35]], ptr @__kmpc_rfun_sum_l, ptr @__kmpc_rfun_sum_lds_l, i64 0, i64 [[TMP19]], i32 [[TMP18]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_l(i64 [[TMP36]], ptr [[TMP2]], ptr [[TMP34]], ptr [[TMP35]], ptr @__kmpc_rfun_sum_l, ptr @__kmpc_rfun_sum_lds_l, i64 0, i64 [[TMP19]], i32 [[TMP18]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -1776,6 +1776,6 @@ int main()
 // CHECK-NEXT:    [[TMP42:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP43:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP44:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64(double [[TMP44]], ptr [[TMP2]], ptr [[TMP42]], ptr [[TMP43]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP19]], i32 [[TMP18]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d(double [[TMP44]], ptr [[TMP2]], ptr [[TMP42]], ptr [[TMP43]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP19]], i32 [[TMP18]], i32 1)
 // CHECK-NEXT:    ret void
 //
diff --git a/clang/test/OpenMP/xteam_red_min_max.cpp b/clang/test/OpenMP/xteam_red_min_max.cpp
index e86b16bb7e3ee..f037d08f680b2 100644
--- a/clang/test/OpenMP/xteam_red_min_max.cpp
+++ b/clang/test/OpenMP/xteam_red_min_max.cpp
@@ -118,7 +118,7 @@ int main()
 // CHECK-NEXT:    [[TMP25:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP26:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP27:%.*]] = load i16, ptr addrspace(5) [[TMP3]], align 2
-// CHECK-NEXT:    call void @__kmpc_xteamr_s_16x64(i16 [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_min_s, ptr @__kmpc_rfun_min_lds_s, i16 32767, i64 [[TMP12]], i32 [[TMP11]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_s(i16 [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_min_s, ptr @__kmpc_rfun_min_lds_s, i16 32767, i64 [[TMP12]], i32 [[TMP11]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -203,7 +203,7 @@ int main()
 // CHECK-NEXT:    [[TMP25:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP26:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP27:%.*]] = load i16, ptr addrspace(5) [[TMP3]], align 2
-// CHECK-NEXT:    call void @__kmpc_xteamr_s_16x64(i16 [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_max_s, ptr @__kmpc_rfun_max_lds_s, i16 -32768, i64 [[TMP12]], i32 [[TMP11]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_s(i16 [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_max_s, ptr @__kmpc_rfun_max_lds_s, i16 -32768, i64 [[TMP12]], i32 [[TMP11]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -1546,7 +1546,7 @@ int main()
 // CHECK-NEXT:    [[TMP25:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP26:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP27:%.*]] = load i32, ptr addrspace(5) [[TMP3]], align 4
-// CHECK-NEXT:    call void @__kmpc_xteamr_i_16x64(i32 [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_min_i, ptr @__kmpc_rfun_min_lds_i, i32 2147483647, i64 [[TMP12]], i32 [[TMP11]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_i(i32 [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_min_i, ptr @__kmpc_rfun_min_lds_i, i32 2147483647, i64 [[TMP12]], i32 [[TMP11]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -1631,7 +1631,7 @@ int main()
 // CHECK-NEXT:    [[TMP25:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP26:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP27:%.*]] = load i32, ptr addrspace(5) [[TMP3]], align 4
-// CHECK-NEXT:    call void @__kmpc_xteamr_i_16x64(i32 [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_max_i, ptr @__kmpc_rfun_max_lds_i, i32 -2147483648, i64 [[TMP12]], i32 [[TMP11]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_i(i32 [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_max_i, ptr @__kmpc_rfun_max_lds_i, i32 -2147483648, i64 [[TMP12]], i32 [[TMP11]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -2958,7 +2958,7 @@ int main()
 // CHECK-NEXT:    [[TMP25:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP26:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP27:%.*]] = load i64, ptr addrspace(5) [[TMP3]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_l_16x64(i64 [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_min_l, ptr @__kmpc_rfun_min_lds_l, i64 9223372036854775807, i64 [[TMP12]], i32 [[TMP11]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_l(i64 [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_min_l, ptr @__kmpc_rfun_min_lds_l, i64 9223372036854775807, i64 [[TMP12]], i32 [[TMP11]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -3043,7 +3043,7 @@ int main()
 // CHECK-NEXT:    [[TMP25:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP26:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP27:%.*]] = load i64, ptr addrspace(5) [[TMP3]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_l_16x64(i64 [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_max_l, ptr @__kmpc_rfun_max_lds_l, i64 -9223372036854775808, i64 [[TMP12]], i32 [[TMP11]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_l(i64 [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_max_l, ptr @__kmpc_rfun_max_lds_l, i64 -9223372036854775808, i64 [[TMP12]], i32 [[TMP11]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -4430,7 +4430,7 @@ int main()
 // CHECK-NEXT:    [[TMP25:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP26:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP27:%.*]] = load float, ptr addrspace(5) [[TMP3]], align 4
-// CHECK-NEXT:    call void @__kmpc_xteamr_f_16x64(float [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_min_f, ptr @__kmpc_rfun_min_lds_f, float 0x7FF0000000000000, i64 [[TMP12]], i32 [[TMP11]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_f(float [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_min_f, ptr @__kmpc_rfun_min_lds_f, float 0x7FF0000000000000, i64 [[TMP12]], i32 [[TMP11]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -4515,7 +4515,7 @@ int main()
 // CHECK-NEXT:    [[TMP25:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP26:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP27:%.*]] = load float, ptr addrspace(5) [[TMP3]], align 4
-// CHECK-NEXT:    call void @__kmpc_xteamr_f_16x64(float [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_max_f, ptr @__kmpc_rfun_max_lds_f, float 0xFFF0000000000000, i64 [[TMP12]], i32 [[TMP11]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_f(float [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_max_f, ptr @__kmpc_rfun_max_lds_f, float 0xFFF0000000000000, i64 [[TMP12]], i32 [[TMP11]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -4598,7 +4598,7 @@ int main()
 // CHECK-NEXT:    [[TMP24:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP25:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP26:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64(double [[TMP26]], ptr [[TMP2]], ptr [[TMP24]], ptr [[TMP25]], ptr @__kmpc_rfun_min_d, ptr @__kmpc_rfun_min_lds_d, double 0x7FF0000000000000, i64 [[TMP12]], i32 [[TMP11]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d(double [[TMP26]], ptr [[TMP2]], ptr [[TMP24]], ptr [[TMP25]], ptr @__kmpc_rfun_min_d, ptr @__kmpc_rfun_min_lds_d, double 0x7FF0000000000000, i64 [[TMP12]], i32 [[TMP11]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -4681,6 +4681,6 @@ int main()
 // CHECK-NEXT:    [[TMP24:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP25:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP26:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64(double [[TMP26]], ptr [[TMP2]], ptr [[TMP24]], ptr [[TMP25]], ptr @__kmpc_rfun_max_d, ptr @__kmpc_rfun_max_lds_d, double 0xFFF0000000000000, i64 [[TMP12]], i32 [[TMP11]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d(double [[TMP26]], ptr [[TMP2]], ptr [[TMP24]], ptr [[TMP25]], ptr @__kmpc_rfun_max_d, ptr @__kmpc_rfun_max_lds_d, double 0xFFF0000000000000, i64 [[TMP12]], i32 [[TMP11]], i32 1)
 // CHECK-NEXT:    ret void
 //
diff --git a/clang/test/OpenMP/xteam_red_min_max_fast_reduction.c b/clang/test/OpenMP/xteam_red_min_max_fast_reduction.c
index fa05383971510..149447bba99fc 100644
--- a/clang/test/OpenMP/xteam_red_min_max_fast_reduction.c
+++ b/clang/test/OpenMP/xteam_red_min_max_fast_reduction.c
@@ -1066,6 +1066,6 @@ int main()
 // CHECK-NEXT:    [[TMP25:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP26:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP27:%.*]] = load float, ptr addrspace(5) [[TMP4]], align 4
-// CHECK-NEXT:    call void @__kmpc_xteamr_f_16x64_fast_sum(float [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_sum_f, ptr @__kmpc_rfun_sum_lds_f, float 0.000000e+00, i64 [[TMP13]], i32 [[TMP12]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_f_fast_sum(float [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_sum_f, ptr @__kmpc_rfun_sum_lds_f, float 0.000000e+00, i64 [[TMP13]], i32 [[TMP12]], i32 1)
 // CHECK-NEXT:    ret void
 //
diff --git a/clang/test/OpenMP/xteam_red_min_max_multi_device.c b/clang/test/OpenMP/xteam_red_min_max_multi_device.c
index e8bf3d270de6f..be76a495a27ee 100644
--- a/clang/test/OpenMP/xteam_red_min_max_multi_device.c
+++ b/clang/test/OpenMP/xteam_red_min_max_multi_device.c
@@ -937,6 +937,6 @@ int main()
 // CHECK-NEXT:    [[TMP32:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP33:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP34:%.*]] = load float, ptr addrspace(5) [[TMP6]], align 4
-// CHECK-NEXT:    call void @__kmpc_xteamr_f_16x64_fast_sum(float [[TMP34]], ptr [[TMP4]], ptr [[TMP32]], ptr [[TMP33]], ptr @__kmpc_rfun_sum_f, ptr @__kmpc_rfun_sum_lds_f, float 0.000000e+00, i64 [[TMP20]], i32 [[TMP19]], i32 0)
+// CHECK-NEXT:    call void @__kmpc_xteamr_f_fast_sum(float [[TMP34]], ptr [[TMP4]], ptr [[TMP32]], ptr [[TMP33]], ptr @__kmpc_rfun_sum_f, ptr @__kmpc_rfun_sum_lds_f, float 0.000000e+00, i64 [[TMP20]], i32 [[TMP19]], i32 0)
 // CHECK-NEXT:    ret void
 //
diff --git a/clang/test/OpenMP/xteam_red_min_max_small_precision.c b/clang/test/OpenMP/xteam_red_min_max_small_precision.c
index 3963eb6fb4cf3..8457cff160292 100644
--- a/clang/test/OpenMP/xteam_red_min_max_small_precision.c
+++ b/clang/test/OpenMP/xteam_red_min_max_small_precision.c
@@ -130,7 +130,7 @@ int main() {
 // CHECK-NEXT:    [[TMP25:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP26:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP27:%.*]] = load half, ptr addrspace(5) [[TMP4]], align 2
-// CHECK-NEXT:    call void @__kmpc_xteamr_h_16x64(half [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_min_h, ptr @__kmpc_rfun_min_lds_h, half 0xH7C00, i64 [[TMP13]], i32 [[TMP12]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_h(half [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_min_h, ptr @__kmpc_rfun_min_lds_h, half 0xH7C00, i64 [[TMP13]], i32 [[TMP12]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -215,7 +215,7 @@ int main() {
 // CHECK-NEXT:    [[TMP25:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP26:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP27:%.*]] = load bfloat, ptr addrspace(5) [[TMP4]], align 2
-// CHECK-NEXT:    call void @__kmpc_xteamr_bf_16x64(bfloat [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_min_bf, ptr @__kmpc_rfun_min_lds_bf, bfloat 0xR7F80, i64 [[TMP13]], i32 [[TMP12]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_bf(bfloat [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_min_bf, ptr @__kmpc_rfun_min_lds_bf, bfloat 0xR7F80, i64 [[TMP13]], i32 [[TMP12]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -300,7 +300,7 @@ int main() {
 // CHECK-NEXT:    [[TMP25:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP26:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP27:%.*]] = load i16, ptr addrspace(5) [[TMP4]], align 2
-// CHECK-NEXT:    call void @__kmpc_xteamr_s_16x64(i16 [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_min_s, ptr @__kmpc_rfun_min_lds_s, i16 32767, i64 [[TMP13]], i32 [[TMP12]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_s(i16 [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_min_s, ptr @__kmpc_rfun_min_lds_s, i16 32767, i64 [[TMP13]], i32 [[TMP12]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -385,7 +385,7 @@ int main() {
 // CHECK-NEXT:    [[TMP25:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP26:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP27:%.*]] = load half, ptr addrspace(5) [[TMP4]], align 2
-// CHECK-NEXT:    call void @__kmpc_xteamr_h_16x64(half [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_max_h, ptr @__kmpc_rfun_max_lds_h, half 0xHFC00, i64 [[TMP13]], i32 [[TMP12]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_h(half [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_max_h, ptr @__kmpc_rfun_max_lds_h, half 0xHFC00, i64 [[TMP13]], i32 [[TMP12]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -470,7 +470,7 @@ int main() {
 // CHECK-NEXT:    [[TMP25:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP26:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP27:%.*]] = load bfloat, ptr addrspace(5) [[TMP4]], align 2
-// CHECK-NEXT:    call void @__kmpc_xteamr_bf_16x64(bfloat [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_max_bf, ptr @__kmpc_rfun_max_lds_bf, bfloat 0xRFF80, i64 [[TMP13]], i32 [[TMP12]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_bf(bfloat [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_max_bf, ptr @__kmpc_rfun_max_lds_bf, bfloat 0xRFF80, i64 [[TMP13]], i32 [[TMP12]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -555,6 +555,6 @@ int main() {
 // CHECK-NEXT:    [[TMP25:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP26:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP27:%.*]] = load i16, ptr addrspace(5) [[TMP4]], align 2
-// CHECK-NEXT:    call void @__kmpc_xteamr_s_16x64(i16 [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_max_s, ptr @__kmpc_rfun_max_lds_s, i16 -32768, i64 [[TMP13]], i32 [[TMP12]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_s(i16 [[TMP27]], ptr [[TMP2]], ptr [[TMP25]], ptr [[TMP26]], ptr @__kmpc_rfun_max_s, ptr @__kmpc_rfun_max_lds_s, i16 -32768, i64 [[TMP13]], i32 [[TMP12]], i32 1)
 // CHECK-NEXT:    ret void
 //
diff --git a/clang/test/OpenMP/xteam_red_reference.cpp b/clang/test/OpenMP/xteam_red_reference.cpp
index 46249fa1408fe..1e9437bace828 100644
--- a/clang/test/OpenMP/xteam_red_reference.cpp
+++ b/clang/test/OpenMP/xteam_red_reference.cpp
@@ -107,6 +107,6 @@ void compute_reduced_sum(int n, int &x) {
 // CHECK-NEXT:    [[TMP27:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP28:%.*]] = load ptr, ptr [[TMP_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP29:%.*]] = load i32, ptr addrspace(5) [[TMP3]], align 4
-// CHECK-NEXT:    call void @__kmpc_xteamr_i_16x64(i32 [[TMP29]], ptr [[TMP28]], ptr [[TMP26]], ptr [[TMP27]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP15]], i32 [[TMP14]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_i(i32 [[TMP29]], ptr [[TMP28]], ptr [[TMP26]], ptr [[TMP27]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP15]], i32 [[TMP14]], i32 1)
 // CHECK-NEXT:    ret void
 //
diff --git a/clang/test/OpenMP/xteam_red_small_precision.c b/clang/test/OpenMP/xteam_red_small_precision.c
index 6324b2a2a603b..ba36c0d8043b3 100644
--- a/clang/test/OpenMP/xteam_red_small_precision.c
+++ b/clang/test/OpenMP/xteam_red_small_precision.c
@@ -133,7 +133,7 @@ int main()
 // CHECK-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP31:%.*]] = load half, ptr addrspace(5) [[TMP5]], align 2
-// CHECK-NEXT:    call void @__kmpc_xteamr_h_16x64(half [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_h, ptr @__kmpc_rfun_sum_lds_h, half 0xH0000, i64 [[TMP17]], i32 [[TMP16]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_h(half [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_h, ptr @__kmpc_rfun_sum_lds_h, half 0xH0000, i64 [[TMP17]], i32 [[TMP16]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -236,7 +236,7 @@ int main()
 // CHECK-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP31:%.*]] = load bfloat, ptr addrspace(5) [[TMP5]], align 2
-// CHECK-NEXT:    call void @__kmpc_xteamr_bf_16x64(bfloat [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_bf, ptr @__kmpc_rfun_sum_lds_bf, bfloat 0xR0000, i64 [[TMP17]], i32 [[TMP16]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_bf(bfloat [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_bf, ptr @__kmpc_rfun_sum_lds_bf, bfloat 0xR0000, i64 [[TMP17]], i32 [[TMP16]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -339,6 +339,6 @@ int main()
 // CHECK-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP31:%.*]] = load i16, ptr addrspace(5) [[TMP5]], align 2
-// CHECK-NEXT:    call void @__kmpc_xteamr_s_16x64(i16 [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_s, ptr @__kmpc_rfun_sum_lds_s, i16 0, i64 [[TMP17]], i32 [[TMP16]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_s(i16 [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_s, ptr @__kmpc_rfun_sum_lds_s, i16 0, i64 [[TMP17]], i32 [[TMP16]], i32 1)
 // CHECK-NEXT:    ret void
 //
diff --git a/clang/test/OpenMP/xteam_red_split_codegen.cpp b/clang/test/OpenMP/xteam_red_split_codegen.cpp
index 3ee59b2b8d8a3..46f5b0089e215 100644
--- a/clang/test/OpenMP/xteam_red_split_codegen.cpp
+++ b/clang/test/OpenMP/xteam_red_split_codegen.cpp
@@ -198,7 +198,7 @@ int main()
 // CHECK-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -300,7 +300,7 @@ int main()
 // CHECK-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -402,7 +402,7 @@ int main()
 // CHECK-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -504,7 +504,7 @@ int main()
 // CHECK-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -606,7 +606,7 @@ int main()
 // CHECK-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -708,7 +708,7 @@ int main()
 // CHECK-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -810,7 +810,7 @@ int main()
 // CHECK-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -912,7 +912,7 @@ int main()
 // CHECK-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -1014,7 +1014,7 @@ int main()
 // CHECK-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -1116,7 +1116,7 @@ int main()
 // CHECK-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -1219,7 +1219,7 @@ int main()
 // CHECK-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
 // CHECK-NEXT:    ret void
 //
 //
@@ -1322,6 +1322,6 @@ int main()
 // CHECK-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP30:%.*]] = load ptr, ptr [[DOTADDR1_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP5]], align 8
-// CHECK-NEXT:    call void @__kmpc_xteamr_d_16x64(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_xteamr_d(double [[TMP31]], ptr [[TMP2]], ptr [[TMP29]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP17]], i32 [[TMP16]], i32 1)
 // CHECK-NEXT:    ret void
 //
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index a5cc3b097fffd..91eec68f2a7c9 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -663,61 +663,33 @@ __OMP_RTL(__kmpc_rfun_max_l, false, Void, Int64Ptr, Int64)
 
 __OMP_RTL(__kmpc_rfun_max_lds_l, false, Void, Int64Ptr, Int64)
 
-__OMP_RTL(__kmpc_xteamr_d_16x64, false, Void, Double, DoublePtr, DoublePtr, Int32Ptr, VoidPtr, VoidPtr, Double, Int64, Int32, Int32)
+__OMP_RTL(__kmpc_xteamr_d, false, Void, Double, DoublePtr, DoublePtr, Int32Ptr, VoidPtr, VoidPtr, Double, Int64, Int32, Int32)
 
-__OMP_RTL(__kmpc_xteamr_d_16x64_fast_sum, false, Void, Double, DoublePtr, DoublePtr, Int32Ptr, VoidPtr, VoidPtr, Double, Int64, Int32, Int32)
+__OMP_RTL(__kmpc_xteamr_d_fast_sum, false, Void, Double, DoublePtr, DoublePtr, Int32Ptr, VoidPtr, VoidPtr, Double, Int64, Int32, Int32)
 
-__OMP_RTL(__kmpc_xteamr_f_16x64, false, Void, Float, FloatPtr, FloatPtr, Int32Ptr, VoidPtr, VoidPtr, Float, Int64, Int32, Int32)
+__OMP_RTL(__kmpc_xteamr_f, false, Void, Float, FloatPtr, FloatPtr, Int32Ptr, VoidPtr, VoidPtr, Float, Int64, Int32, Int32)
 
-__OMP_RTL(__kmpc_xteamr_f_16x64_fast_sum, false, Void, Float, FloatPtr, FloatPtr, Int32Ptr, VoidPtr, VoidPtr, Float, Int64, Int32, Int32)
+__OMP_RTL(__kmpc_xteamr_f_fast_sum, false, Void, Float, FloatPtr, FloatPtr, Int32Ptr, VoidPtr, VoidPtr, Float, Int64, Int32, Int32)
 
-__OMP_RTL(__kmpc_xteamr_h_16x64, false, Void, Half, HalfPtr, HalfPtr, Int32Ptr, VoidPtr, VoidPtr, Half, Int64, Int32, Int32)
+__OMP_RTL(__kmpc_xteamr_h, false, Void, Half, HalfPtr, HalfPtr, Int32Ptr, VoidPtr, VoidPtr, Half, Int64, Int32, Int32)
 
-__OMP_RTL(__kmpc_xteamr_h_16x64_fast_sum, false, Void, Half, HalfPtr, HalfPtr, Int32Ptr, VoidPtr, VoidPtr, Half, Int64, Int32, Int32)
+__OMP_RTL(__kmpc_xteamr_h_fast_sum, false, Void, Half, HalfPtr, HalfPtr, Int32Ptr, VoidPtr, VoidPtr, Half, Int64, Int32, Int32)
 
-__OMP_RTL(__kmpc_xteamr_bf_16x64, false, Void, BFloat, BFloatPtr, BFloatPtr, Int32Ptr, VoidPtr, VoidPtr, BFloat, Int64, Int32, Int32)
+__OMP_RTL(__kmpc_xteamr_bf, false, Void, BFloat, BFloatPtr, BFloatPtr, Int32Ptr, VoidPtr, VoidPtr, BFloat, Int64, Int32, Int32)
 
-__OMP_RTL(__kmpc_xteamr_bf_16x64_fast_sum, false, Void, BFloat, BFloatPtr, BFloatPtr, Int32Ptr, VoidPtr, VoidPtr, BFloat, Int64, Int32, Int32)
+__OMP_RTL(__kmpc_xteamr_bf_fast_sum, false, Void, BFloat, BFloatPtr, BFloatPtr, Int32Ptr, VoidPtr, VoidPtr, BFloat, Int64, Int32, Int32)
 
-__OMP_RTL(__kmpc_xteamr_s_16x64, false, Void, Int16, Int16Ptr, Int16Ptr, Int32Ptr, VoidPtr, VoidPtr, Int16, Int64, Int32, Int32)
+__OMP_RTL(__kmpc_xteamr_s, false, Void, Int16, Int16Ptr, Int16Ptr, Int32Ptr, VoidPtr, VoidPtr, Int16, Int64, Int32, Int32)
 
-__OMP_RTL(__kmpc_xteamr_s_16x64_fast_sum, false, Void, Int16, Int16Ptr, Int16Ptr, Int32Ptr, VoidPtr, VoidPtr, Int16, Int64, Int32, Int32)
+__OMP_RTL(__kmpc_xteamr_s_fast_sum, false, Void, Int16, Int16Ptr, Int16Ptr, Int32Ptr, VoidPtr, VoidPtr, Int16, Int64, Int32, Int32)
 
-__OMP_RTL(__kmpc_xteamr_i_16x64, false, Void, Int32, Int32Ptr, Int32Ptr, Int32Ptr, VoidPtr, VoidPtr, Int32, Int64, Int32, Int32)
+__OMP_RTL(__kmpc_xteamr_i, false, Void, Int32, Int32Ptr, Int32Ptr, Int32Ptr, VoidPtr, VoidPtr, Int32, Int64, Int32, Int32)
 
-__OMP_RTL(__kmpc_xteamr_i_16x64_fast_sum, false, Void, Int32, Int32Ptr, Int32Ptr, Int32Ptr, VoidPtr, VoidPtr, Int32, Int64, Int32, Int32)
+__OMP_RTL(__kmpc_xteamr_i_fast_sum, false, Void, Int32, Int32Ptr, Int32Ptr, Int32Ptr, VoidPtr, VoidPtr, Int32, Int64, Int32, Int32)
 
-__OMP_RTL(__kmpc_xteamr_l_16x64, false, Void, Int64, Int64Ptr, Int64Ptr, Int32Ptr, VoidPtr, VoidPtr, Int64, Int64, Int32, Int32)
+__OMP_RTL(__kmpc_xteamr_l, false, Void, Int64, Int64Ptr, Int64Ptr, Int32Ptr, VoidPtr, VoidPtr, Int64, Int64, Int32, Int32)
 
-__OMP_RTL(__kmpc_xteamr_l_16x64_fast_sum, false, Void, Int64, Int64Ptr, Int64Ptr, Int32Ptr, VoidPtr, VoidPtr, Int64, Int64, Int32, Int32)
-
-__OMP_RTL(__kmpc_xteamr_d_32x32, false, Void, Double, DoublePtr, DoublePtr, Int32Ptr, VoidPtr, VoidPtr, Double, Int64, Int32, Int32)
-
-__OMP_RTL(__kmpc_xteamr_d_32x32_fast_sum, false, Void, Double, DoublePtr, DoublePtr, Int32Ptr, VoidPtr, VoidPtr, Double, Int64, Int32, Int32)
-
-__OMP_RTL(__kmpc_xteamr_f_32x32, false, Void, Float, FloatPtr, FloatPtr, Int32Ptr, VoidPtr, VoidPtr, Float, Int64, Int32, Int32)
-
-__OMP_RTL(__kmpc_xteamr_f_32x32_fast_sum, false, Void, Float, FloatPtr, FloatPtr, Int32Ptr, VoidPtr, VoidPtr, Float, Int64, Int32, Int32)
-
-__OMP_RTL(__kmpc_xteamr_h_32x32, false, Void, Half, HalfPtr, HalfPtr, Int32Ptr, VoidPtr, VoidPtr, Half, Int64, Int32, Int32)
-
-__OMP_RTL(__kmpc_xteamr_h_32x32_fast_sum, false, Void, Half, HalfPtr, HalfPtr, Int32Ptr, VoidPtr, VoidPtr, Half, Int64, Int32, Int32)
-
-__OMP_RTL(__kmpc_xteamr_bf_32x32, false, Void, BFloat, BFloatPtr, BFloatPtr, Int32Ptr, VoidPtr, VoidPtr, BFloat, Int64, Int32, Int32)
-
-__OMP_RTL(__kmpc_xteamr_bf_32x32_fast_sum, false, Void, BFloat, BFloatPtr, BFloatPtr, Int32Ptr, VoidPtr, VoidPtr, BFloat, Int64, Int32, Int32)
-
-__OMP_RTL(__kmpc_xteamr_s_32x32, false, Void, Int16, Int16Ptr, Int16Ptr, Int32Ptr, VoidPtr, VoidPtr, Int16, Int64, Int32, Int32)
-
-__OMP_RTL(__kmpc_xteamr_s_32x32_fast_sum, false, Void, Int16, Int16Ptr, Int16Ptr, Int32Ptr, VoidPtr, VoidPtr, Int16, Int64, Int32, Int32)
-
-__OMP_RTL(__kmpc_xteamr_i_32x32, false, Void, Int32, Int32Ptr, Int32Ptr, Int32Ptr, VoidPtr, VoidPtr, Int32, Int64, Int32, Int32)
-
-__OMP_RTL(__kmpc_xteamr_i_32x32_fast_sum, false, Void, Int32, Int32Ptr, Int32Ptr, Int32Ptr, VoidPtr, VoidPtr, Int32, Int64, Int32, Int32)
-
-__OMP_RTL(__kmpc_xteamr_l_32x32, false, Void, Int64, Int64Ptr, Int64Ptr, Int32Ptr, VoidPtr, VoidPtr, Int64, Int64, Int32, Int32)
-
-__OMP_RTL(__kmpc_xteamr_l_32x32_fast_sum, false, Void, Int64, Int64Ptr, Int64Ptr, Int32Ptr, VoidPtr, VoidPtr, Int64, Int64, Int32, Int32)
+__OMP_RTL(__kmpc_xteamr_l_fast_sum, false, Void, Int64, Int64Ptr, Int64Ptr, Int32Ptr, VoidPtr, VoidPtr, Int64, Int64, Int32, Int32)
 
 __OMP_RTL(__llvm_profile_register_function, false, Void, VoidPtr)
 __OMP_RTL(__llvm_profile_register_names_function, false, Void, VoidPtr, Int64)
diff --git a/openmp/device/include/Xteamr.h b/openmp/device/include/Xteamr.h
index b30a714193219..9d20b5f76a6a4 100644
--- a/openmp/device/include/Xteamr.h
+++ b/openmp/device/include/Xteamr.h
@@ -26,15 +26,17 @@
 #define _UL unsigned long
 #define _INLINE_ATTR_ __attribute__((flatten, always_inline))
 #define _RF_LDS volatile __gpu_local
+// Maximum number of waves in a thread block
+#define _MaxNumWaves 32
+// Wave size
+#define _WSZ 32
 
 extern "C" {
 /// External cross team reduction (xteamr) helper functions
 ///
 /// The template for name of xteamr helper function is:
-/// __kmpc_xteamr_<dtype>_<max_waves>x<WSZ> where
+/// __kmpc_xteamr_<dtype> where
 ///    <dtype> is letter(s) representing data type, e.g. d=double.
-///    <max_waves> maximum number of waves in thread block.
-///    <WSZ>   warp size, 32 or 64.
 ///    IS_FAST There is an optional template boolean type (defaulting to false)
 ///    that indicates if an atomic add should be used instead of the last
 ///    reduction round. This applies to only sum reduction currently.
@@ -45,509 +47,264 @@ extern "C" {
 /// Clang/flang code generation for C, C++, and FORTRAN instantiate a call to
 /// a helper function for each reduction used in an OpenMP target region.
 ///
-/// \param  Input thread local reduction value
-/// \param  Pointer to result value
-/// \param  Global array of team values for this reduction instance
-/// \param  Pointer to atomic counter of completed teams
-/// \param  Function pointer to reduction function (sum,min,max)
-/// \param  Function pointer to reduction function on LDS memory
-/// \param  Reduction null value
-/// \param  Outer loop iteration value, 0 to numteams*numthreads
-/// \param  Number of teams
+/// \param v Input thread local reduction value
+/// \param r_ptr Pointer to result value
+/// \param tvs Global array of team values for this reduction instance
+/// \param td Pointer to atomic counter of completed teams
+/// \param _rf Function pointer to reduction function (sum,min,max)
+/// \param _rf_lds Function pointer to reduction function on LDS memory
+/// \param rnv Reduction null value
+/// \param k Outer loop iteration value, 0 to numteams*numthreads
+/// \param numteams Number of teams
+/// \param Scope Atomic memory scope; defaults to ompx::atomic::system
 
 /// External intra-team reduction (iteamr) helper functions
 ///
 /// The name template for intra-team helper functions is
-/// __kmpc_iteamr_<dtype>_<max_waves>x<WSZ> where
+/// __kmpc_iteamr_<dtype> where
 ///    <dtype> is letter(s) representing data type, e.g. d=double.
-///    <max_waves> maximum number of waves in thread block.
-///    <WSZ>   warp size, 32 or 64.
 /// All iteamr helper functions are defined in Xteamr.cpp. They each call the
 /// internal templated function _iteam_reduction also defined in Xteamr.cpp.
 ///
-/// \param  Input thread local reduction value
-/// \param  Pointer to result value
-/// \param  Function pointer to reduction function (sum,min,max)
-/// \param  Function pointer to reduction function on LDS memory
-/// \param  Reduction null value
-/// \param  Outer loop iteration value, 0 to numthreads
-///
-/// Cross team reduction (xteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_xteamr_d_16x64(
-    double v, double *r_ptr, double *tvs, uint32_t *td,
-    void (*_rf)(double *, double),
-    void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double rnv,
-    const uint64_t k, const uint32_t numteams,
-    ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
-/// Fast Cross team sum reduction (xteamr) helper function, see documentation
-/// above.
-void _INLINE_ATTR_ __kmpc_xteamr_d_16x64_fast_sum(
-    double v, double *r_ptr, double *tvs, uint32_t *td,
-    void (*_rf)(double *, double),
-    void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double rnv,
-    const uint64_t k, const uint32_t numteams,
-    ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
-/// Intra-team reduction (iteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_iteamr_d_16x64(double v, double *r_ptr,
-                                         void (*_rf)(double *, double),
-                                         void (*_rf_lds)(_RF_LDS double *,
-                                                         _RF_LDS double *),
-                                         const double rnv, const uint64_t k);
-/// Cross team reduction (xteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_xteamr_f_16x64(
-    float v, float *r_ptr, float *tvs, uint32_t *td,
-    void (*_rf)(float *, float),
-    void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float rnv,
-    const uint64_t k, const uint32_t numteams,
-    ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
-/// Fast Cross team sum reduction (xteamr) helper function, see documentation
-/// above.
-void _INLINE_ATTR_ __kmpc_xteamr_f_16x64_fast_sum(
-    float v, float *r_ptr, float *tvs, uint32_t *td,
-    void (*_rf)(float *, float),
-    void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float rnv,
-    const uint64_t k, const uint32_t numteams,
-    ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
-/// Intra-team reduction (iteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_iteamr_f_16x64(float v, float *r_ptr,
-                                         void (*_rf)(float *, float),
-                                         void (*_rf_lds)(_RF_LDS float *,
-                                                         _RF_LDS float *),
-                                         const float rnv, const uint64_t k);
-/// Cross team reduction (xteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_xteamr_h_16x64(
-    _Float16 v, _Float16 *r_ptr, _Float16 *tvs, uint32_t *td,
-    void (*_rf)(_Float16 *, _Float16),
-    void (*_rf_lds)(_RF_LDS _Float16 *, _RF_LDS _Float16 *), const _Float16 rnv,
-    const uint64_t k, const uint32_t numteams,
-    ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
-/// Fast Cross team sum reduction (xteamr) helper function, see documentation
-/// above.
-void _INLINE_ATTR_ __kmpc_xteamr_h_16x64_fast_sum(
-    _Float16 v, _Float16 *r_ptr, _Float16 *tvs, uint32_t *td,
-    void (*_rf)(_Float16 *, _Float16),
-    void (*_rf_lds)(_RF_LDS _Float16 *, _RF_LDS _Float16 *), const _Float16 rnv,
-    const uint64_t k, const uint32_t numteams,
-    ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
-/// Intra-team reduction (iteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_iteamr_h_16x64(_Float16 v, _Float16 *r_ptr,
-                                         void (*_rf)(_Float16 *, _Float16),
-                                         void (*_rf_lds)(_RF_LDS _Float16 *,
-                                                         _RF_LDS _Float16 *),
-                                         const _Float16 rnv, const uint64_t k);
-/// Cross team reduction (xteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_xteamr_bf_16x64(
-    __bf16 v, __bf16 *r_ptr, __bf16 *tvs, uint32_t *td,
-    void (*_rf)(__bf16 *, __bf16),
-    void (*_rf_lds)(_RF_LDS __bf16 *, _RF_LDS __bf16 *), const __bf16 rnv,
-    const uint64_t k, const uint32_t numteams,
-    ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
-/// Fast Cross team sum reduction (xteamr) helper function, see documentation
-/// above.
-void _INLINE_ATTR_ __kmpc_xteamr_bf_16x64_fast_sum(
-    __bf16 v, __bf16 *r_ptr, __bf16 *tvs, uint32_t *td,
-    void (*_rf)(__bf16 *, __bf16),
-    void (*_rf_lds)(_RF_LDS __bf16 *, _RF_LDS __bf16 *), const __bf16 rnv,
-    const uint64_t k, const uint32_t numteams,
-    ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
-/// Intra-team reduction (iteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_iteamr_bf_16x64(__bf16 v, __bf16 *r_ptr,
-                                          void (*_rf)(__bf16 *, __bf16),
-                                          void (*_rf_lds)(_RF_LDS __bf16 *,
-                                                          _RF_LDS __bf16 *),
-                                          const __bf16 rnv, const uint64_t k);
-/// Cross team reduction (xteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_xteamr_cd_16x64(
-    _CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD),
-    void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD rnv,
-    const uint64_t k, const uint32_t numteams,
-    ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
-/// Fast Cross team sum reduction (xteamr) helper function, see documentation
-/// above.
-void _INLINE_ATTR_ __kmpc_xteamr_cd_16x64_fast_sum(
-    _CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD),
-    void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD rnv,
-    const uint64_t k, const uint32_t numteams,
-    ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
-/// Intra-team reduction (iteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_iteamr_cd_16x64(_CD v, _CD *r_ptr,
-                                          void (*_rf)(_CD *, _CD),
-                                          void (*_rf_lds)(_RF_LDS _CD *,
-                                                          _RF_LDS _CD *),
-                                          const _CD rnv, const uint64_t k);
-/// Cross team reduction (xteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_xteamr_cf_16x64(
-    _CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF),
-    void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF rnv,
-    const uint64_t k, const uint32_t numteams,
-    ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
-/// Fast Cross team sum reduction (xteamr) helper function, see documentation
-/// above.
-void _INLINE_ATTR_ __kmpc_xteamr_cf_16x64_fast_sum(
-    _CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF),
-    void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF rnv,
-    const uint64_t k, const uint32_t numteams,
-    ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
-/// Intra-team reduction (iteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_iteamr_cf_16x64(_CF v, _CF *r_ptr,
-                                          void (*_rf)(_CF *, _CF),
-                                          void (*_rf_lds)(_RF_LDS _CF *,
-                                                          _RF_LDS _CF *),
-                                          const _CF rnv, const uint64_t k);
-/// Cross team reduction (xteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_xteamr_s_16x64(
-    short v, short *r_ptr, short *tvs, uint32_t *td,
-    void (*_rf)(short *, short),
-    void (*_rf_lds)(_RF_LDS short *, _RF_LDS short *), const short rnv,
-    const uint64_t k, const uint32_t numteams,
-    ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
-/// Fast Cross team sum reduction (xteamr) helper function, see documentation
-/// above.
-void _INLINE_ATTR_ __kmpc_xteamr_s_16x64_fast_sum(
-    short v, short *r_ptr, short *tvs, uint32_t *td,
-    void (*_rf)(short *, short),
-    void (*_rf_lds)(_RF_LDS short *, _RF_LDS short *), const short rnv,
-    const uint64_t k, const uint32_t numteams,
-    ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
-/// Intra-team reduction (iteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_iteamr_s_16x64(short v, short *r_ptr,
-                                         void (*_rf)(short *, short),
-                                         void (*_rf_lds)(_RF_LDS short *,
-                                                         _RF_LDS short *),
-                                         const short rnv, const uint64_t k);
-/// Cross team reduction (xteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_xteamr_us_16x64(
-    _US v, _US *r_ptr, _US *tvs, uint32_t *td, void (*_rf)(_US *, _US),
-    void (*_rf_lds)(_RF_LDS _US *, _RF_LDS _US *), const _US rnv,
-    const uint64_t k, const uint32_t numteams,
-    ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
-/// Fast Cross team sum reduction (xteamr) helper function, see documentation
-/// above.
-void _INLINE_ATTR_ __kmpc_xteamr_us_16x64_fast_sum(
-    _US v, _US *r_ptr, _US *tvs, uint32_t *td, void (*_rf)(_US *, _US),
-    void (*_rf_lds)(_RF_LDS _US *, _RF_LDS _US *), const _US rnv,
-    const uint64_t k, const uint32_t numteams,
-    ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
-/// Intra-team reduction (iteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_iteamr_us_16x64(_US v, _US *r_ptr,
-                                          void (*_rf)(_US *, _US),
-                                          void (*_rf_lds)(_RF_LDS _US *,
-                                                          _RF_LDS _US *),
-                                          const _US rnv, const uint64_t k);
-/// Cross team reduction (xteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_xteamr_i_16x64(
-    int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int),
-    void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int rnv,
-    const uint64_t k, const uint32_t numteams,
-    ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
-/// Fast Cross team sum reduction (xteamr) helper function, see documentation
-/// above.
-void _INLINE_ATTR_ __kmpc_xteamr_i_16x64_fast_sum(
-    int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int),
-    void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int rnv,
-    const uint64_t k, const uint32_t numteams,
-    ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
-/// Intra-team reduction (iteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_iteamr_i_16x64(int v, int *r_ptr,
-                                         void (*_rf)(int *, int),
-                                         void (*_rf_lds)(_RF_LDS int *,
-                                                         _RF_LDS int *),
-                                         const int rnv, const uint64_t k);
-/// Cross team reduction (xteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_xteamr_ui_16x64(
-    _UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI),
-    void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI rnv,
-    const uint64_t k, const uint32_t numteams,
-    ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
-/// Fast Cross team sum reduction (xteamr) helper function, see documentation
-/// above.
-void _INLINE_ATTR_ __kmpc_xteamr_ui_16x64_fast_sum(
-    _UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI),
-    void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI rnv,
-    const uint64_t k, const uint32_t numteams,
-    ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
-/// Intra-team reduction (iteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_iteamr_ui_16x64(_UI v, _UI *r_ptr,
-                                          void (*_rf)(_UI *, _UI),
-                                          void (*_rf_lds)(_RF_LDS _UI *,
-                                                          _RF_LDS _UI *),
-                                          const _UI rnv, const uint64_t k);
-/// Cross team reduction (xteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_xteamr_l_16x64(
-    long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long),
-    void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long rnv,
-    const uint64_t k, const uint32_t numteams,
-    ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
-/// Fast Cross team sum reduction (xteamr) helper function, see documentation
-/// above.
-void _INLINE_ATTR_ __kmpc_xteamr_l_16x64_fast_sum(
-    long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long),
-    void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long rnv,
-    const uint64_t k, const uint32_t numteams,
-    ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
-/// Intra-team reduction (iteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_iteamr_l_16x64(long v, long *r_ptr,
-                                         void (*_rf)(long *, long),
-                                         void (*_rf_lds)(_RF_LDS long *,
-                                                         _RF_LDS long *),
-                                         const long rnv, const uint64_t k);
-/// Cross team reduction (xteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_xteamr_ul_16x64(
-    _UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL),
-    void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL rnv,
-    const uint64_t k, const uint32_t numteams,
-    ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
-/// Fast Cross team sum reduction (xteamr) helper function, see documentation
-/// above.
-void _INLINE_ATTR_ __kmpc_xteamr_ul_16x64_fast_sum(
-    _UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL),
-    void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL rnv,
-    const uint64_t k, const uint32_t numteams,
-    ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
-/// Intra-team reduction (iteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_iteamr_ul_16x64(_UL v, _UL *r_ptr,
-                                          void (*_rf)(_UL *, _UL),
-                                          void (*_rf_lds)(_RF_LDS _UL *,
-                                                          _RF_LDS _UL *),
-                                          const _UL rnv, const uint64_t k);
+/// \param v Input thread local reduction value
+/// \param r_ptr Pointer to result value
+/// \param _rf Function pointer to reduction function (sum,min,max)
+/// \param _rf_lds Function pointer to reduction function on LDS memory
+/// \param rnv Reduction null value
+/// \param k Outer loop iteration value, 0 to numthreads
+
 /// Cross team reduction (xteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_xteamr_d_32x32(
-    double v, double *r_ptr, double *tvs, uint32_t *td,
-    void (*_rf)(double *, double),
-    void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double rnv,
-    const uint64_t k, const uint32_t numteams,
-    ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
+void _INLINE_ATTR_
+__kmpc_xteamr_d(double v, double *r_ptr, double *tvs, uint32_t *td,
+                void (*_rf)(double *, double),
+                void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *),
+                const double rnv, const uint64_t k, const uint32_t numteams,
+                ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
 /// Fast Cross team sum reduction (xteamr) helper function, see documentation
 /// above.
-void _INLINE_ATTR_ __kmpc_xteamr_d_32x32_fast_sum(
+void _INLINE_ATTR_ __kmpc_xteamr_d_fast_sum(
     double v, double *r_ptr, double *tvs, uint32_t *td,
     void (*_rf)(double *, double),
     void (*_rf_lds)(_RF_LDS double *, _RF_LDS double *), const double rnv,
     const uint64_t k, const uint32_t numteams,
     ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
 /// Intra-team reduction (iteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_iteamr_d_32x32(double v, double *r_ptr,
-                                         void (*_rf)(double *, double),
-                                         void (*_rf_lds)(_RF_LDS double *,
-                                                         _RF_LDS double *),
-                                         const double rnv, const uint64_t k);
+void _INLINE_ATTR_ __kmpc_iteamr_d(double v, double *r_ptr,
+                                   void (*_rf)(double *, double),
+                                   void (*_rf_lds)(_RF_LDS double *,
+                                                   _RF_LDS double *),
+                                   const double rnv, const uint64_t k);
 /// Cross team reduction (xteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_xteamr_f_32x32(
-    float v, float *r_ptr, float *tvs, uint32_t *td,
-    void (*_rf)(float *, float),
-    void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float rnv,
-    const uint64_t k, const uint32_t numteams,
-    ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
+void _INLINE_ATTR_
+__kmpc_xteamr_f(float v, float *r_ptr, float *tvs, uint32_t *td,
+                void (*_rf)(float *, float),
+                void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *),
+                const float rnv, const uint64_t k, const uint32_t numteams,
+                ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
 /// Fast Cross team sum reduction (xteamr) helper function, see documentation
 /// above.
-void _INLINE_ATTR_ __kmpc_xteamr_f_32x32_fast_sum(
+void _INLINE_ATTR_ __kmpc_xteamr_f_fast_sum(
     float v, float *r_ptr, float *tvs, uint32_t *td,
     void (*_rf)(float *, float),
     void (*_rf_lds)(_RF_LDS float *, _RF_LDS float *), const float rnv,
     const uint64_t k, const uint32_t numteams,
     ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
 /// Intra-team reduction (iteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_iteamr_f_32x32(float v, float *r_ptr,
-                                         void (*_rf)(float *, float),
-                                         void (*_rf_lds)(_RF_LDS float *,
-                                                         _RF_LDS float *),
-                                         const float rnv, const uint64_t k);
+void _INLINE_ATTR_ __kmpc_iteamr_f(float v, float *r_ptr,
+                                   void (*_rf)(float *, float),
+                                   void (*_rf_lds)(_RF_LDS float *,
+                                                   _RF_LDS float *),
+                                   const float rnv, const uint64_t k);
 /// Cross team reduction (xteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_xteamr_h_32x32(
-    _Float16 v, _Float16 *r_ptr, _Float16 *tvs, uint32_t *td,
-    void (*_rf)(_Float16 *, _Float16),
-    void (*_rf_lds)(_RF_LDS _Float16 *, _RF_LDS _Float16 *), const _Float16 rnv,
-    const uint64_t k, const uint32_t numteams,
-    ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
+void _INLINE_ATTR_
+__kmpc_xteamr_h(_Float16 v, _Float16 *r_ptr, _Float16 *tvs, uint32_t *td,
+                void (*_rf)(_Float16 *, _Float16),
+                void (*_rf_lds)(_RF_LDS _Float16 *, _RF_LDS _Float16 *),
+                const _Float16 rnv, const uint64_t k, const uint32_t numteams,
+                ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
 /// Fast Cross team sum reduction (xteamr) helper function, see documentation
 /// above.
-void _INLINE_ATTR_ __kmpc_xteamr_h_32x32_fast_sum(
+void _INLINE_ATTR_ __kmpc_xteamr_h_fast_sum(
     _Float16 v, _Float16 *r_ptr, _Float16 *tvs, uint32_t *td,
     void (*_rf)(_Float16 *, _Float16),
     void (*_rf_lds)(_RF_LDS _Float16 *, _RF_LDS _Float16 *), const _Float16 rnv,
     const uint64_t k, const uint32_t numteams,
     ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
 /// Intra-team reduction (iteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_iteamr_h_32x32(_Float16 v, _Float16 *r_ptr,
-                                         void (*_rf)(_Float16 *, _Float16),
-                                         void (*_rf_lds)(_RF_LDS _Float16 *,
-                                                         _RF_LDS _Float16 *),
-                                         const _Float16 rnv, const uint64_t k);
+void _INLINE_ATTR_ __kmpc_iteamr_h(_Float16 v, _Float16 *r_ptr,
+                                   void (*_rf)(_Float16 *, _Float16),
+                                   void (*_rf_lds)(_RF_LDS _Float16 *,
+                                                   _RF_LDS _Float16 *),
+                                   const _Float16 rnv, const uint64_t k);
 /// Cross team reduction (xteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_xteamr_bf_32x32(
-    __bf16 v, __bf16 *r_ptr, __bf16 *tvs, uint32_t *td,
-    void (*_rf)(__bf16 *, __bf16),
-    void (*_rf_lds)(_RF_LDS __bf16 *, _RF_LDS __bf16 *), const __bf16 rnv,
-    const uint64_t k, const uint32_t numteams,
-    ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
+void _INLINE_ATTR_
+__kmpc_xteamr_bf(__bf16 v, __bf16 *r_ptr, __bf16 *tvs, uint32_t *td,
+                 void (*_rf)(__bf16 *, __bf16),
+                 void (*_rf_lds)(_RF_LDS __bf16 *, _RF_LDS __bf16 *),
+                 const __bf16 rnv, const uint64_t k, const uint32_t numteams,
+                 ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
 /// Fast Cross team sum reduction (xteamr) helper function, see documentation
 /// above.
-void _INLINE_ATTR_ __kmpc_xteamr_bf_32x32_fast_sum(
+void _INLINE_ATTR_ __kmpc_xteamr_bf_fast_sum(
     __bf16 v, __bf16 *r_ptr, __bf16 *tvs, uint32_t *td,
     void (*_rf)(__bf16 *, __bf16),
     void (*_rf_lds)(_RF_LDS __bf16 *, _RF_LDS __bf16 *), const __bf16 rnv,
     const uint64_t k, const uint32_t numteams,
     ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
 /// Intra-team reduction (iteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_iteamr_bf_32x32(__bf16 v, __bf16 *r_ptr,
-                                          void (*_rf)(__bf16 *, __bf16),
-                                          void (*_rf_lds)(_RF_LDS __bf16 *,
-                                                          _RF_LDS __bf16 *),
-                                          const __bf16 rnv, const uint64_t k);
+void _INLINE_ATTR_ __kmpc_iteamr_bf(__bf16 v, __bf16 *r_ptr,
+                                    void (*_rf)(__bf16 *, __bf16),
+                                    void (*_rf_lds)(_RF_LDS __bf16 *,
+                                                    _RF_LDS __bf16 *),
+                                    const __bf16 rnv, const uint64_t k);
 /// Cross team reduction (xteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_xteamr_cd_32x32(
+void _INLINE_ATTR_ __kmpc_xteamr_cd(
     _CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD),
     void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD rnv,
     const uint64_t k, const uint32_t numteams,
     ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
 /// Fast Cross team sum reduction (xteamr) helper function, see documentation
 /// above.
-void _INLINE_ATTR_ __kmpc_xteamr_cd_32x32_fast_sum(
+void _INLINE_ATTR_ __kmpc_xteamr_cd_fast_sum(
     _CD v, _CD *r_ptr, _CD *tvs, uint32_t *td, void (*_rf)(_CD *, _CD),
     void (*_rf_lds)(_RF_LDS _CD *, _RF_LDS _CD *), const _CD rnv,
     const uint64_t k, const uint32_t numteams,
     ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
 /// Intra-team reduction (iteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_iteamr_cd_32x32(_CD v, _CD *r_ptr,
-                                          void (*_rf)(_CD *, _CD),
-                                          void (*_rf_lds)(_RF_LDS _CD *,
-                                                          _RF_LDS _CD *),
-                                          const _CD rnv, const uint64_t k);
+void _INLINE_ATTR_ __kmpc_iteamr_cd(_CD v, _CD *r_ptr, void (*_rf)(_CD *, _CD),
+                                    void (*_rf_lds)(_RF_LDS _CD *,
+                                                    _RF_LDS _CD *),
+                                    const _CD rnv, const uint64_t k);
 /// Cross team reduction (xteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_xteamr_cf_32x32(
+void _INLINE_ATTR_ __kmpc_xteamr_cf(
     _CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF),
     void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF rnv,
     const uint64_t k, const uint32_t numteams,
     ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
 /// Fast Cross team sum reduction (xteamr) helper function, see documentation
 /// above.
-void _INLINE_ATTR_ __kmpc_xteamr_cf_32x32_fast_sum(
+void _INLINE_ATTR_ __kmpc_xteamr_cf_fast_sum(
     _CF v, _CF *r_ptr, _CF *tvs, uint32_t *td, void (*_rf)(_CF *, _CF),
     void (*_rf_lds)(_RF_LDS _CF *, _RF_LDS _CF *), const _CF rnv,
     const uint64_t k, const uint32_t numteams,
     ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
 /// Intra-team reduction (iteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_iteamr_cf_32x32(_CF v, _CF *r_ptr,
-                                          void (*_rf)(_CF *, _CF),
-                                          void (*_rf_lds)(_RF_LDS _CF *,
-                                                          _RF_LDS _CF *),
-                                          const _CF rnv, const uint64_t k);
+void _INLINE_ATTR_ __kmpc_iteamr_cf(_CF v, _CF *r_ptr, void (*_rf)(_CF *, _CF),
+                                    void (*_rf_lds)(_RF_LDS _CF *,
+                                                    _RF_LDS _CF *),
+                                    const _CF rnv, const uint64_t k);
 /// Cross team reduction (xteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_xteamr_s_32x32(
-    short v, short *r_ptr, short *tvs, uint32_t *td,
-    void (*_rf)(short *, short),
-    void (*_rf_lds)(_RF_LDS short *, _RF_LDS short *), const short rnv,
-    const uint64_t k, const uint32_t numteams,
-    ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
+void _INLINE_ATTR_
+__kmpc_xteamr_s(short v, short *r_ptr, short *tvs, uint32_t *td,
+                void (*_rf)(short *, short),
+                void (*_rf_lds)(_RF_LDS short *, _RF_LDS short *),
+                const short rnv, const uint64_t k, const uint32_t numteams,
+                ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
 /// Fast Cross team sum reduction (xteamr) helper function, see documentation
 /// above.
-void _INLINE_ATTR_ __kmpc_xteamr_s_32x32_fast_sum(
+void _INLINE_ATTR_ __kmpc_xteamr_s_fast_sum(
     short v, short *r_ptr, short *tvs, uint32_t *td,
     void (*_rf)(short *, short),
     void (*_rf_lds)(_RF_LDS short *, _RF_LDS short *), const short rnv,
     const uint64_t k, const uint32_t numteams,
     ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
 /// Intra-team reduction (iteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_iteamr_s_32x32(short v, short *r_ptr,
-                                         void (*_rf)(short *, short),
-                                         void (*_rf_lds)(_RF_LDS short *,
-                                                         _RF_LDS short *),
-                                         const short rnv, const uint64_t k);
+void _INLINE_ATTR_ __kmpc_iteamr_s(short v, short *r_ptr,
+                                   void (*_rf)(short *, short),
+                                   void (*_rf_lds)(_RF_LDS short *,
+                                                   _RF_LDS short *),
+                                   const short rnv, const uint64_t k);
 /// Cross team reduction (xteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_xteamr_us_32x32(
+void _INLINE_ATTR_ __kmpc_xteamr_us(
     _US v, _US *r_ptr, _US *tvs, uint32_t *td, void (*_rf)(_US *, _US),
     void (*_rf_lds)(_RF_LDS _US *, _RF_LDS _US *), const _US rnv,
     const uint64_t k, const uint32_t numteams,
     ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
 /// Fast Cross team sum reduction (xteamr) helper function, see documentation
 /// above.
-void _INLINE_ATTR_ __kmpc_xteamr_us_32x32_fast_sum(
+void _INLINE_ATTR_ __kmpc_xteamr_us_fast_sum(
     _US v, _US *r_ptr, _US *tvs, uint32_t *td, void (*_rf)(_US *, _US),
     void (*_rf_lds)(_RF_LDS _US *, _RF_LDS _US *), const _US rnv,
     const uint64_t k, const uint32_t numteams,
     ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
 /// Intra-team reduction (iteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_iteamr_us_32x32(_US v, _US *r_ptr,
-                                          void (*_rf)(_US *, _US),
-                                          void (*_rf_lds)(_RF_LDS _US *,
-                                                          _RF_LDS _US *),
-                                          const _US rnv, const uint64_t k);
+void _INLINE_ATTR_ __kmpc_iteamr_us(_US v, _US *r_ptr, void (*_rf)(_US *, _US),
+                                    void (*_rf_lds)(_RF_LDS _US *,
+                                                    _RF_LDS _US *),
+                                    const _US rnv, const uint64_t k);
 /// Cross team reduction (xteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_xteamr_i_32x32(
+void _INLINE_ATTR_ __kmpc_xteamr_i(
     int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int),
     void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int rnv,
     const uint64_t k, const uint32_t numteams,
     ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
 /// Fast Cross team sum reduction (xteamr) helper function, see documentation
 /// above.
-void _INLINE_ATTR_ __kmpc_xteamr_i_32x32_fast_sum(
+void _INLINE_ATTR_ __kmpc_xteamr_i_fast_sum(
     int v, int *r_ptr, int *tvs, uint32_t *td, void (*_rf)(int *, int),
     void (*_rf_lds)(_RF_LDS int *, _RF_LDS int *), const int rnv,
     const uint64_t k, const uint32_t numteams,
     ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
 /// Intra-team reduction (iteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_iteamr_i_32x32(int v, int *r_ptr,
-                                         void (*_rf)(int *, int),
-                                         void (*_rf_lds)(_RF_LDS int *,
-                                                         _RF_LDS int *),
-                                         const int rnv, const uint64_t k);
+void _INLINE_ATTR_ __kmpc_iteamr_i(int v, int *r_ptr, void (*_rf)(int *, int),
+                                   void (*_rf_lds)(_RF_LDS int *,
+                                                   _RF_LDS int *),
+                                   const int rnv, const uint64_t k);
 /// Cross team reduction (xteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_xteamr_ui_32x32(
+void _INLINE_ATTR_ __kmpc_xteamr_ui(
     _UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI),
     void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI rnv,
     const uint64_t k, const uint32_t numteams,
     ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
 /// Fast Cross team sum reduction (xteamr) helper function, see documentation
 /// above.
-void _INLINE_ATTR_ __kmpc_xteamr_ui_32x32_fast_sum(
+void _INLINE_ATTR_ __kmpc_xteamr_ui_fast_sum(
     _UI v, _UI *r_ptr, _UI *tvs, uint32_t *td, void (*_rf)(_UI *, _UI),
     void (*_rf_lds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI rnv,
     const uint64_t k, const uint32_t numteams,
     ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
 /// Intra-team reduction (iteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_iteamr_ui_32x32(_UI v, _UI *r_ptr,
-                                          void (*_rf)(_UI *, _UI),
-                                          void (*_rf_lds)(_RF_LDS _UI *,
-                                                          _RF_LDS _UI *),
-                                          const _UI rnv, const uint64_t k);
+void _INLINE_ATTR_ __kmpc_iteamr_ui(_UI v, _UI *r_ptr, void (*_rf)(_UI *, _UI),
+                                    void (*_rf_lds)(_RF_LDS _UI *,
+                                                    _RF_LDS _UI *),
+                                    const _UI rnv, const uint64_t k);
 /// Cross team reduction (xteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_xteamr_l_32x32(
+void _INLINE_ATTR_ __kmpc_xteamr_l(
     long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long),
     void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long rnv,
     const uint64_t k, const uint32_t numteams,
     ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
 /// Fast Cross team sum reduction (xteamr) helper function, see documentation
 /// above.
-void _INLINE_ATTR_ __kmpc_xteamr_l_32x32_fast_sum(
+void _INLINE_ATTR_ __kmpc_xteamr_l_fast_sum(
     long v, long *r_ptr, long *tvs, uint32_t *td, void (*_rf)(long *, long),
     void (*_rf_lds)(_RF_LDS long *, _RF_LDS long *), const long rnv,
     const uint64_t k, const uint32_t numteams,
     ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
 /// Intra-team reduction (iteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_iteamr_l_32x32(long v, long *r_ptr,
-                                         void (*_rf)(long *, long),
-                                         void (*_rf_lds)(_RF_LDS long *,
-                                                         _RF_LDS long *),
-                                         const long rnv, const uint64_t k);
+void _INLINE_ATTR_ __kmpc_iteamr_l(long v, long *r_ptr,
+                                   void (*_rf)(long *, long),
+                                   void (*_rf_lds)(_RF_LDS long *,
+                                                   _RF_LDS long *),
+                                   const long rnv, const uint64_t k);
 /// Cross team reduction (xteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_xteamr_ul_32x32(
+void _INLINE_ATTR_ __kmpc_xteamr_ul(
     _UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL),
     void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL rnv,
     const uint64_t k, const uint32_t numteams,
     ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
 /// Fast Cross team sum reduction (xteamr) helper function, see documentation
 /// above.
-void _INLINE_ATTR_ __kmpc_xteamr_ul_32x32_fast_sum(
+void _INLINE_ATTR_ __kmpc_xteamr_ul_fast_sum(
     _UL v, _UL *r_ptr, _UL *tvs, uint32_t *td, void (*_rf)(_UL *, _UL),
     void (*_rf_lds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL rnv,
     const uint64_t k, const uint32_t numteams,
     ompx::atomic::MemScopeTy Scope = ompx::atomic::system);
 /// Intra-team reduction (iteamr) helper function, see documentation above.
-void _INLINE_ATTR_ __kmpc_iteamr_ul_32x32(_UL v, _UL *r_ptr,
-                                          void (*_rf)(_UL *, _UL),
-                                          void (*_rf_lds)(_RF_LDS _UL *,
-                                                          _RF_LDS _UL *),
-                                          const _UL rnv, const uint64_t k);
+void _INLINE_ATTR_ __kmpc_iteamr_ul(_UL v, _UL *r_ptr, void (*_rf)(_UL *, _UL),
+                                    void (*_rf_lds)(_RF_LDS _UL *,
+                                                    _RF_LDS _UL *),
+                                    const _UL rnv, const uint64_t k);
 
 /// Built-in pair reduction function, see documentation above.
 void __kmpc_rfun_sum_d(double *val, double otherval);
@@ -597,6 +354,7 @@ void __kmpc_rfun_sum_lds_l(_RF_LDS long *val, _RF_LDS long *otherval);
 void __kmpc_rfun_sum_ul(_UL *val, _UL otherval);
 /// LDS Built-in pair reduction function, see documentation above.
 void __kmpc_rfun_sum_lds_ul(_RF_LDS _UL *val, _RF_LDS _UL *otherval);
+
 /// Built-in pair reduction function, see documentation above.
 void __kmpc_rfun_max_d(double *val, double otherval);
 /// LDS Built-in pair reduction function, see documentation above.
@@ -637,6 +395,7 @@ void __kmpc_rfun_max_lds_l(_RF_LDS long *val, _RF_LDS long *otherval);
 void __kmpc_rfun_max_ul(_UL *val, _UL otherval);
 /// LDS Built-in pair reduction function, see documentation above.
 void __kmpc_rfun_max_lds_ul(_RF_LDS _UL *val, _RF_LDS _UL *otherval);
+
 /// Built-in pair reduction function, see documentation above.
 void __kmpc_rfun_min_d(double *val, double otherval);
 /// LDS Built-in pair reduction function, see documentation above.
@@ -686,5 +445,7 @@ void __kmpc_rfun_min_lds_ul(_RF_LDS _UL *val, _RF_LDS _UL *otherval);
 #undef _UL
 #undef _INLINE_ATTR_
 #undef _RF_LDS
+#undef _MaxNumWaves
+#undef _WSZ
 
 #endif // of ifndef OMPTARGET_DEVICERTL_XTEAMR_H
diff --git a/openmp/device/src/Xteamr.cpp b/openmp/device/src/Xteamr.cpp
index 8cc448dc70d96..599d323bc9290 100644
--- a/openmp/device/src/Xteamr.cpp
+++ b/openmp/device/src/Xteamr.cpp
@@ -12,16 +12,24 @@
 
 #include "Xteamr.h"
 #include "Debug.h"
+#include "DeviceUtils.h"
 #include "Interface.h"
 #include "Mapping.h"
 #include "State.h"
-#include "Synchronization.h"
-#include "DeviceTypes.h"
-#include "DeviceUtils.h"
 
-#define __XTEAM_SHARED_LDS volatile __gpu_local
-
-using namespace  ompx::mapping;
+#define _CD double _Complex
+#define _CF float _Complex
+#define _US unsigned short
+#define _UI unsigned int
+#define _UL unsigned long
+#define _INLINE_ATTR_ __attribute__((flatten, always_inline))
+#define _RF_LDS volatile __gpu_local
+// Wave size (will be constant-folded since it's known at compile time)
+// Should probably be made into constexpr in the future.
+#define _WSZ __gpu_num_lanes()
+// Maximum number of waves in a thread block
+// (1024 / _WSZ is 32 or 16 waves for _WSZ 32 or 64; use the larger bound)
+#define _MaxNumWaves 32
 
 // Headers for specialized shfl_xor
 double xteamr_shfl_xor_d(double var, const int lane_mask, const uint32_t width);
@@ -33,7 +41,6 @@ float _Complex xteamr_shfl_xor_cf(float _Complex var, const int lane_mask,
                                   const uint32_t width);
 
 // Define the arch (amdgcn vs nvptx) variants of shfl
-
 #ifdef __AMDGPU__
 int xteamr_shfl_xor_int(int var, const int lane_mask, const uint32_t width) {
   int self = ompx::mapping::getThreadIdInWarp(); // __lane_id();
@@ -57,10 +64,7 @@ double xteamr_shfl_xor_d(double var, const int lane_mask,
   __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
   return tmp1;
 }
-#endif
-
-#ifdef __NVPTX__
-
+#elif defined(__NVPTX__)
 int xteamr_shfl_xor_int(int var, const int lane_mask, const uint32_t width) {
   return __nvvm_shfl_sync_bfly_i32(0xFFFFFFFF, var, lane_mask, 0x1f);
 }
@@ -97,183 +101,68 @@ float _Complex xteamr_shfl_xor_cf(float _Complex var, const int lane_mask,
   return var;
 }
 
-// tag dispatching of type specific shfl_xor, get_low, and get_high
-struct _d_tag {};
-struct _f_tag {};
-struct _h_tag {};
-struct _bf_tag {};
-struct _cd_tag {};
-struct _cf_tag {};
-struct _s_tag {};
-struct _us_tag {};
-struct _i_tag {};
-struct _ui_tag {};
-struct _l_tag {};
-struct _ul_tag {};
-template <typename T> struct __dispatch_tag;
-template <> struct __dispatch_tag<double> {
-  typedef _d_tag type;
-};
-template <> struct __dispatch_tag<float> {
-  typedef _f_tag type;
-};
-template <> struct __dispatch_tag<_Float16> { typedef _h_tag type; };
-template <> struct __dispatch_tag<__bf16> { typedef _bf_tag type; };
-template <> struct __dispatch_tag<double _Complex> {
-  typedef _cd_tag type;
-};
-template <> struct __dispatch_tag<float _Complex> {
-  typedef _cf_tag type;
-};
-template <> struct __dispatch_tag<short> { typedef _s_tag type; };
-template <> struct __dispatch_tag<unsigned short> { typedef _us_tag type; };
-template <> struct __dispatch_tag<int> {
-  typedef _i_tag type;
-};
-template <> struct __dispatch_tag<unsigned int> {
-  typedef _ui_tag type;
-};
-template <> struct __dispatch_tag<long> {
-  typedef _l_tag type;
-};
-template <> struct __dispatch_tag<unsigned long> {
-  typedef _ul_tag type;
-};
-template <const uint32_t _WSZ>
-double xteamr_shfl_xor(_d_tag tag, double var, const int lane_mask) {
+// Type-specific overloads of xteamr_shfl_xor, one per supported value type
+double xteamr_shfl_xor(double var, const int lane_mask) {
   return xteamr_shfl_xor_d(var, lane_mask, _WSZ);
 }
-template <const uint32_t _WSZ>
-float xteamr_shfl_xor(_f_tag tag, float var, const int lane_mask) {
+float xteamr_shfl_xor(float var, const int lane_mask) {
   return xteamr_shfl_xor_f(var, lane_mask, _WSZ);
 }
-template <const uint32_t _WSZ>
-float xteamr_shfl_xor(_h_tag tag, _Float16 var, const int lane_mask) {
+float xteamr_shfl_xor(_Float16 var, const int lane_mask) {
   return xteamr_shfl_xor_f(var, lane_mask, _WSZ);
 }
-template <const uint32_t _WSZ>
-float xteamr_shfl_xor(_bf_tag tag, __bf16 var, const int lane_mask) {
+float xteamr_shfl_xor(__bf16 var, const int lane_mask) {
   return xteamr_shfl_xor_f(var, lane_mask, _WSZ);
 }
-template <const uint32_t _WSZ>
-double _Complex xteamr_shfl_xor(_cd_tag tag, double _Complex var,
-                                const int lane_mask) {
+double _Complex xteamr_shfl_xor(double _Complex var, const int lane_mask) {
   return xteamr_shfl_xor_cd(var, lane_mask, _WSZ);
 }
-template <const uint32_t _WSZ>
-float _Complex xteamr_shfl_xor(_cf_tag tag, float _Complex var,
-                               const int lane_mask) {
+float _Complex xteamr_shfl_xor(float _Complex var, const int lane_mask) {
   return xteamr_shfl_xor_cf(var, lane_mask, _WSZ);
 }
-template <const uint32_t _WSZ>
-int xteamr_shfl_xor(_s_tag tag, short var, const int lane_mask) {
+int xteamr_shfl_xor(short var, const int lane_mask) {
   return xteamr_shfl_xor_int(var, lane_mask, _WSZ);
 }
-template <const uint32_t _WSZ>
-unsigned int xteamr_shfl_xor(_us_tag tag, unsigned short var,
-                             const int lane_mask) {
+unsigned int xteamr_shfl_xor(unsigned short var, const int lane_mask) {
   return xteamr_shfl_xor_int(var, lane_mask, _WSZ);
 }
-template <const uint32_t _WSZ>
-int xteamr_shfl_xor(_i_tag tag, int var, const int lane_mask) {
+int xteamr_shfl_xor(int var, const int lane_mask) {
   return xteamr_shfl_xor_int(var, lane_mask, _WSZ);
 }
-template <const uint32_t _WSZ>
-unsigned int xteamr_shfl_xor(_ui_tag tag, unsigned int var,
-                             const int lane_mask) {
+unsigned int xteamr_shfl_xor(unsigned int var, const int lane_mask) {
   return xteamr_shfl_xor_int(var, lane_mask, _WSZ);
 }
-template <const uint32_t _WSZ>
-long xteamr_shfl_xor(_l_tag tag, long var, const int lane_mask) {
+long xteamr_shfl_xor(long var, const int lane_mask) {
   return xteamr_shfl_xor_d(var, lane_mask, _WSZ);
 }
-template <const uint32_t _WSZ>
-unsigned long xteamr_shfl_xor(_ul_tag tag, unsigned long var,
-                              const int lane_mask) {
+unsigned long xteamr_shfl_xor(unsigned long var, const int lane_mask) {
   return xteamr_shfl_xor_d(var, lane_mask, _WSZ);
 }
 
-template <typename T, const uint32_t _WSZ>
-T xteamr_shfl_xor(T var, const int lane_mask) {
-  typedef typename __dispatch_tag<T>::type tag;
-  return xteamr_shfl_xor<_WSZ>(tag(), var, lane_mask);
-}
-
-/// Templated internal function used by extern intra-team reductions
-///
-/// \param  Template typename parameter T
-/// \param  Template parameter for maximum number of waves in this kernel.
-/// \param  Template parameter for warp size, 32 or 64
-///
-/// \param  Input thread local (TLS) value for warp shfl reduce
-/// \param  Pointer to result value, also used in final reduction
-/// \param  Function pointer to TLS pair reduction function
-/// \param  Function pointer to LDS pair reduction function
-/// \param  Reduction null value, used for partial waves
-/// \param  The iteration value from 0 to (NumTeams*_NUM_THREADS)-1
-///
-template <typename T, const int32_t _MaxNumWaves, const int32_t _WSZ>
-__attribute__((flatten, always_inline)) void _iteam_reduction(
-    T val, T *r_ptr, void (*_rf)(T *, T),
-    void (*_rf_lds)(__XTEAM_SHARED_LDS T *, __XTEAM_SHARED_LDS T *),
-    const T rnv, const uint64_t k) {
-  // Must be a power of 2.
-  const uint32_t block_size = ompx::mapping::getNumberOfThreadsInBlock();
-
-  const uint32_t number_of_waves = (block_size - 1) / _WSZ + 1;
-  const uint32_t omp_thread_num = k % block_size;
-  const uint32_t wave_num = omp_thread_num / _WSZ;
-  const uint32_t lane_num = omp_thread_num % _WSZ;
-  static __XTEAM_SHARED_LDS T xwave_lds[_MaxNumWaves];
-
-  // Binary reduce each wave, then copy to xwave_lds[wave_num]
-  const uint32_t start_offset = block_size < _WSZ ? block_size / 2 : _WSZ / 2;
-  for (unsigned int offset = start_offset; offset > 0; offset >>= 1)
-    (*_rf)(&val, xteamr_shfl_xor<T, _WSZ>(val, offset));
-  if (lane_num == 0)
-    xwave_lds[wave_num] = val;
-
-  // Binary reduce all wave values into wave_lds[0]
-  ompx::synchronize::threadsAligned(ompx::atomic::seq_cst);
-  for (unsigned int offset = number_of_waves / 2; offset > 0; offset >>= 1) {
-    if (omp_thread_num < offset)
-      (*_rf_lds)(&(xwave_lds[omp_thread_num]),
-                 &(xwave_lds[omp_thread_num + offset]));
-  }
-
-  // We only need xwave_lds[0] correct on thread 0.
-  if (omp_thread_num == 0)
-    *r_ptr = xwave_lds[0];
-
-  ompx::synchronize::threadsAligned(ompx::atomic::seq_cst);
-}
-
 /// Templated internal function used by all extern typed reductions
 ///
-/// \param  Template typename parameter T
-/// \param  Template parameter for maximum number of waves in this kernel.
-/// \param  Template parameter for warp size, 32 or 64
-/// \param  Template parameter if an atomic add should be used instead of
+/// \param T Template typename parameter T
+/// \param _IS_FAST Template parameter if an atomic add should be used
+///         instead of
 ///         the 1-team-reduction round. Applies to sum reduction currently.
 ///
-/// \param  Input thread local (TLS) value for warp shfl reduce
-/// \param  Pointer to result value, also used in final reduction
-/// \param  Global array of team values for this reduction only
-/// \param  Pointer to atomically accessed teams done counter
-/// \param  Function pointer to TLS pair reduction function
-/// \param  Function pointer to LDS pair reduction function
-/// \param  Reduction null value, used for partial waves
-/// \param  The iteration value from 0 to (NumTeams*_NUM_THREADS)-1
-/// \param  The number of teams participating in reduction
-
-template <typename T, const int32_t _MaxNumWaves, const int32_t _WSZ,
-          const bool _IS_FAST = false>
-__attribute__((flatten, always_inline)) void _xteam_reduction(
-    T val, T *r_ptr, T *team_vals, uint32_t *teams_done_ptr,
-    void (*_rf)(T *, T),
-    void (*_rf_lds)(__XTEAM_SHARED_LDS T *, __XTEAM_SHARED_LDS T *),
-    const T rnv, const uint64_t k, const uint32_t NumTeams,
-    ompx::atomic::MemScopeTy Scope) {
+/// \param val Input thread local (TLS) value for warp shfl reduce
+/// \param r_ptr Pointer to result value, also used in final reduction
+/// \param team_vals Global array of team values for this reduction only
+/// \param teams_done_ptr Pointer to atomically accessed teams done counter
+/// \param _rf Function pointer to TLS pair reduction function
+/// \param _rf_lds Function pointer to LDS pair reduction function
+/// \param rnv Reduction null value, used for partial waves
+/// \param k The iteration value from 0 to (NumTeams*_NUM_THREADS)-1
+/// \param NumTeams The number of teams participating in reduction
+/// \param Scope The scope of the atomic operation
+
+template <typename T, const bool _IS_FAST = false>
+_INLINE_ATTR_ void
+_xteam_reduction(T val, T *r_ptr, T *team_vals, uint32_t *teams_done_ptr,
+                 void (*_rf)(T *, T), void (*_rf_lds)(_RF_LDS T *, _RF_LDS T *),
+                 const T rnv, const uint64_t k, const uint32_t NumTeams,
+                 ompx::atomic::MemScopeTy Scope) {
 
   // More efficient to derive these constants than get from mapped API
 
@@ -286,7 +175,7 @@ __attribute__((flatten, always_inline)) void _xteam_reduction(
   const uint32_t wave_num = omp_thread_num / _WSZ;
   const uint32_t lane_num = omp_thread_num % _WSZ;
 
-  static __XTEAM_SHARED_LDS T xwave_lds[_MaxNumWaves];
+  static _RF_LDS T xwave_lds[_MaxNumWaves];
 
 // Cuda may restrict max threads, so clear unused wave values
 #ifdef __NVPTX__
@@ -301,7 +190,7 @@ __attribute__((flatten, always_inline)) void _xteam_reduction(
   // Binary reduce each wave, then copy to xwave_lds[wave_num]
   const uint32_t start_offset = block_size < _WSZ ? block_size / 2 : _WSZ / 2;
   for (unsigned int offset = start_offset; offset > 0; offset >>= 1)
-    (*_rf)(&val, xteamr_shfl_xor<T, _WSZ>(val, offset));
+    (*_rf)(&val, xteamr_shfl_xor(val, offset));
   if (lane_num == 0)
     xwave_lds[wave_num] = val;
 
@@ -313,16 +202,21 @@ __attribute__((flatten, always_inline)) void _xteam_reduction(
                  &(xwave_lds[omp_thread_num + offset]));
   }
 
-  if (_IS_FAST) {
+  if constexpr (_IS_FAST) {
     if (omp_thread_num == 0)
       ompx::atomic::add(r_ptr, xwave_lds[0], ompx::atomic::seq_cst, Scope);
+  } else if (NumTeams == 1) {
+    // We're only doing intra-team reduction, team_vals might be nullptr.
+    if (omp_thread_num == 0)
+      *r_ptr = xwave_lds[0];
+    ompx::synchronize::threadsAligned(ompx::atomic::seq_cst);
   } else {
     // No sync needed here from last reduction in LDS loop
     // because we only need xwave_lds[0] correct on thread 0.
 
     // Save the teams reduced value in team_vals global array
     // and atomically increment teams_done counter.
-    static __XTEAM_SHARED_LDS uint32_t td;
+    static _RF_LDS uint32_t td;
     if (omp_thread_num == 0) {
       team_vals[omp_team_num] = xwave_lds[0];
       td = ompx::atomic::inc(teams_done_ptr, NumTeams - 1u,
@@ -349,7 +243,7 @@ __attribute__((flatten, always_inline)) void _xteam_reduction(
 
       // Reduce each wave into xwave_lds[wave_num]
       for (unsigned int offset = start_offset; offset > 0; offset >>= 1)
-        (*_rf)(&val, xteamr_shfl_xor<T, _WSZ>(val, offset));
+        (*_rf)(&val, xteamr_shfl_xor(val, offset));
       if (lane_num == 0)
         xwave_lds[wave_num] = val;
 
@@ -383,518 +277,252 @@ __attribute__((flatten, always_inline)) void _xteam_reduction(
   }
 }
 
+/// Internal macro used by extern intra-team reductions
+///
+/// \param T Template typename parameter T
+///
+/// \param val Input thread local (TLS) value for warp shfl reduce
+/// \param r_ptr Pointer to result value, also used in final reduction
+/// \param _rf Function pointer to TLS pair reduction function
+/// \param _rf_lds Function pointer to LDS pair reduction function
+/// \param rnv Reduction null value, used for partial waves
+/// \param k The iteration value from 0 to (NumTeams*_NUM_THREADS)-1
+///
+#define _iteam_reduction(T, val, r_ptr, _rf, _rf_lds, rnv, k)                  \
+  _xteam_reduction<T>((val), (r_ptr), nullptr, nullptr, (_rf), (_rf_lds),      \
+                      (rnv), (k), 1, ompx::atomic::MemScopeTy::single)
+
 //  Calls to these __kmpc extern C functions are created in clang codegen
-//  for FORTRAN, c, and C++. They may also be used for sumulation and testing.
+//  for Fortran, C, and C++. They may also be used for simulation and testing.
 //  The headers for these extern C functions are in ../include/Interface.h
-//  The compiler builds the name based on data type,
-//  number of waves in the team,and warpsize.
+//  The compiler builds the name based on the data type.
 //
-#define _EXT_ATTR extern "C" __attribute__((flatten, always_inline)) void
-#define _CD double _Complex
-#define _CF float _Complex
-#define _US unsigned short
-#define _UI unsigned int
-#define _UL unsigned long
-#define _LDS volatile __gpu_local
+#define _EXT_ATTR extern "C" _INLINE_ATTR_ void
 
 _EXT_ATTR
-__kmpc_xteamr_d_16x64(double v, double *r_p, double *tvs, uint32_t *td,
-                      void (*rf)(double *, double),
-                      void (*rflds)(_LDS double *, _LDS double *),
-                      const double rnv, const uint64_t k, const uint32_t nt,
-                      ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<double, 16, 64>(v, r_p, tvs, td, rf, rflds, rnv, k, nt,
-                                   Scope);
-}
-_EXT_ATTR
-__kmpc_xteamr_d_16x64_fast_sum(double v, double *r_p, double *tvs, uint32_t *td,
-                               void (*rf)(double *, double),
-                               void (*rflds)(_LDS double *, _LDS double *),
-                               const double rnv, const uint64_t k,
-                               const uint32_t nt,
-                               ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<double, 16, 64, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt,
-                                         Scope);
+__kmpc_xteamr_d(double v, double *r_p, double *tvs, uint32_t *td,
+                void (*rf)(double *, double),
+                void (*rflds)(_RF_LDS double *, _RF_LDS double *),
+                const double rnv, const uint64_t k, const uint32_t nt,
+                ompx::atomic::MemScopeTy Scope) {
+  _xteam_reduction<double>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope);
 }
 _EXT_ATTR
-__kmpc_iteamr_d_16x64(double v, double *r_p, void (*rf)(double *, double),
-                      void (*rflds)(_LDS double *, _LDS double *),
-                      const double rnv, const uint64_t k) {
-  _iteam_reduction<double, 16, 64>(v, r_p, rf, rflds, rnv, k);
+__kmpc_xteamr_d_fast_sum(double v, double *r_p, double *tvs, uint32_t *td,
+                         void (*rf)(double *, double),
+                         void (*rflds)(_RF_LDS double *, _RF_LDS double *),
+                         const double rnv, const uint64_t k, const uint32_t nt,
+                         ompx::atomic::MemScopeTy Scope) {
+  _xteam_reduction<double, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope);
 }
 _EXT_ATTR
-__kmpc_xteamr_f_16x64(float v, float *r_p, float *tvs, uint32_t *td,
-                      void (*rf)(float *, float),
-                      void (*rflds)(_LDS float *, _LDS float *),
-                      const float rnv, const uint64_t k, const uint32_t nt,
-                      ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<float, 16, 64>(v, r_p, tvs, td, rf, rflds, rnv, k, nt,
-                                  Scope);
+__kmpc_iteamr_d(double v, double *r_p, void (*rf)(double *, double),
+                void (*rflds)(_RF_LDS double *, _RF_LDS double *),
+                const double rnv, const uint64_t k) {
+  _iteam_reduction(double, v, r_p, rf, rflds, rnv, k);
 }
 _EXT_ATTR
-__kmpc_xteamr_f_16x64_fast_sum(float v, float *r_p, float *tvs, uint32_t *td,
-                               void (*rf)(float *, float),
-                               void (*rflds)(_LDS float *, _LDS float *),
-                               const float rnv, const uint64_t k,
-                               const uint32_t nt,
-                               ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<float, 16, 64, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt,
-                                        Scope);
+__kmpc_xteamr_f(float v, float *r_p, float *tvs, uint32_t *td,
+                void (*rf)(float *, float),
+                void (*rflds)(_RF_LDS float *, _RF_LDS float *),
+                const float rnv, const uint64_t k, const uint32_t nt,
+                ompx::atomic::MemScopeTy Scope) {
+  _xteam_reduction<float>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope);
 }
 _EXT_ATTR
-__kmpc_iteamr_f_16x64(float v, float *r_p, void (*rf)(float *, float),
-                      void (*rflds)(_LDS float *, _LDS float *),
-                      const float rnv, const uint64_t k) {
-  _iteam_reduction<float, 16, 64>(v, r_p, rf, rflds, rnv, k);
+__kmpc_xteamr_f_fast_sum(float v, float *r_p, float *tvs, uint32_t *td,
+                         void (*rf)(float *, float),
+                         void (*rflds)(_RF_LDS float *, _RF_LDS float *),
+                         const float rnv, const uint64_t k, const uint32_t nt,
+                         ompx::atomic::MemScopeTy Scope) {
+  _xteam_reduction<float, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope);
 }
 _EXT_ATTR
-__kmpc_xteamr_h_16x64(_Float16 v, _Float16 *r_p, _Float16 *tvs, uint32_t *td,
-                      void (*rf)(_Float16 *, _Float16),
-                      void (*rflds)(_LDS _Float16 *, _LDS _Float16 *),
-                      const _Float16 rnv, const uint64_t k, const uint32_t nt,
-                      ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<_Float16, 16, 64>(v, r_p, tvs, td, rf, rflds, rnv, k, nt,
-                                     Scope);
+__kmpc_iteamr_f(float v, float *r_p, void (*rf)(float *, float),
+                void (*rflds)(_RF_LDS float *, _RF_LDS float *),
+                const float rnv, const uint64_t k) {
+  _iteam_reduction(float, v, r_p, rf, rflds, rnv, k);
 }
 _EXT_ATTR
-__kmpc_xteamr_h_16x64_fast_sum(_Float16 v, _Float16 *r_p, _Float16 *tvs,
-                               uint32_t *td, void (*rf)(_Float16 *, _Float16),
-                               void (*rflds)(_LDS _Float16 *, _LDS _Float16 *),
-                               const _Float16 rnv, const uint64_t k,
-                               const uint32_t nt,
-                               ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<_Float16, 16, 64, true>(v, r_p, tvs, td, rf, rflds, rnv, k,
-                                           nt, Scope);
+__kmpc_xteamr_h(_Float16 v, _Float16 *r_p, _Float16 *tvs, uint32_t *td,
+                void (*rf)(_Float16 *, _Float16),
+                void (*rflds)(_RF_LDS _Float16 *, _RF_LDS _Float16 *),
+                const _Float16 rnv, const uint64_t k, const uint32_t nt,
+                ompx::atomic::MemScopeTy Scope) {
+  _xteam_reduction<_Float16>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope);
 }
 _EXT_ATTR
-__kmpc_iteamr_h_16x64(_Float16 v, _Float16 *r_p,
-                      void (*rf)(_Float16 *, _Float16),
-                      void (*rflds)(_LDS _Float16 *, _LDS _Float16 *),
-                      const _Float16 rnv, const uint64_t k) {
-  _iteam_reduction<_Float16, 16, 64>(v, r_p, rf, rflds, rnv, k);
-}
-_EXT_ATTR
-__kmpc_xteamr_bf_16x64(__bf16 v, __bf16 *r_p, __bf16 *tvs, uint32_t *td,
-                       void (*rf)(__bf16 *, __bf16),
-                       void (*rflds)(_LDS __bf16 *, _LDS __bf16 *),
-                       const __bf16 rnv, const uint64_t k, const uint32_t nt,
-                       ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<__bf16, 16, 64>(v, r_p, tvs, td, rf, rflds, rnv, k, nt,
+__kmpc_xteamr_h_fast_sum(_Float16 v, _Float16 *r_p, _Float16 *tvs, uint32_t *td,
+                         void (*rf)(_Float16 *, _Float16),
+                         void (*rflds)(_RF_LDS _Float16 *, _RF_LDS _Float16 *),
+                         const _Float16 rnv, const uint64_t k,
+                         const uint32_t nt, ompx::atomic::MemScopeTy Scope) {
+  _xteam_reduction<_Float16, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt,
                                    Scope);
 }
 _EXT_ATTR
-__kmpc_xteamr_bf_16x64_fast_sum(__bf16 v, __bf16 *r_p, __bf16 *tvs,
-                                uint32_t *td, void (*rf)(__bf16 *, __bf16),
-                                void (*rflds)(_LDS __bf16 *, _LDS __bf16 *),
-                                const __bf16 rnv, const uint64_t k,
-                                const uint32_t nt,
-                                ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<__bf16, 16, 64, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt,
-                                         Scope);
+__kmpc_iteamr_h(_Float16 v, _Float16 *r_p, void (*rf)(_Float16 *, _Float16),
+                void (*rflds)(_RF_LDS _Float16 *, _RF_LDS _Float16 *),
+                const _Float16 rnv, const uint64_t k) {
+  _iteam_reduction(_Float16, v, r_p, rf, rflds, rnv, k);
 }
 _EXT_ATTR
-__kmpc_iteamr_bf_16x64(__bf16 v, __bf16 *r_p, void (*rf)(__bf16 *, __bf16),
-                       void (*rflds)(_LDS __bf16 *, _LDS __bf16 *),
-                       const __bf16 rnv, const uint64_t k) {
-  _iteam_reduction<__bf16, 16, 64>(v, r_p, rf, rflds, rnv, k);
+__kmpc_xteamr_bf(__bf16 v, __bf16 *r_p, __bf16 *tvs, uint32_t *td,
+                 void (*rf)(__bf16 *, __bf16),
+                 void (*rflds)(_RF_LDS __bf16 *, _RF_LDS __bf16 *),
+                 const __bf16 rnv, const uint64_t k, const uint32_t nt,
+                 ompx::atomic::MemScopeTy Scope) {
+  _xteam_reduction<__bf16>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope);
 }
 _EXT_ATTR
-__kmpc_xteamr_s_16x64(short v, short *r_p, short *tvs, uint32_t *td,
-                      void (*rf)(short *, short),
-                      void (*rflds)(_LDS short *, _LDS short *),
-                      const short rnv, const uint64_t k, const uint32_t nt,
-                      ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<short, 16, 64>(v, r_p, tvs, td, rf, rflds, rnv, k, nt,
-                                  Scope);
+__kmpc_xteamr_bf_fast_sum(__bf16 v, __bf16 *r_p, __bf16 *tvs, uint32_t *td,
+                          void (*rf)(__bf16 *, __bf16),
+                          void (*rflds)(_RF_LDS __bf16 *, _RF_LDS __bf16 *),
+                          const __bf16 rnv, const uint64_t k, const uint32_t nt,
+                          ompx::atomic::MemScopeTy Scope) {
+  _xteam_reduction<__bf16, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope);
 }
 _EXT_ATTR
-__kmpc_xteamr_s_16x64_fast_sum(short v, short *r_p, short *tvs, uint32_t *td,
-                               void (*rf)(short *, short),
-                               void (*rflds)(_LDS short *, _LDS short *),
-                               const short rnv, const uint64_t k,
-                               const uint32_t nt,
-                               ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<short, 16, 64, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt,
-                                        Scope);
+__kmpc_iteamr_bf(__bf16 v, __bf16 *r_p, void (*rf)(__bf16 *, __bf16),
+                 void (*rflds)(_RF_LDS __bf16 *, _RF_LDS __bf16 *),
+                 const __bf16 rnv, const uint64_t k) {
+  _iteam_reduction(__bf16, v, r_p, rf, rflds, rnv, k);
 }
 _EXT_ATTR
-__kmpc_iteamr_s_16x64(short v, short *r_p, void (*rf)(short *, short),
-                      void (*rflds)(_LDS short *, _LDS short *),
-                      const short rnv, const uint64_t k) {
-  _iteam_reduction<short, 16, 64>(v, r_p, rf, rflds, rnv, k);
+__kmpc_xteamr_s(short v, short *r_p, short *tvs, uint32_t *td,
+                void (*rf)(short *, short),
+                void (*rflds)(_RF_LDS short *, _RF_LDS short *),
+                const short rnv, const uint64_t k, const uint32_t nt,
+                ompx::atomic::MemScopeTy Scope) {
+  _xteam_reduction<short>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope);
 }
 _EXT_ATTR
-__kmpc_xteamr_us_16x64(_US v, _US *r_p, _US *tvs, uint32_t *td,
-                       void (*rf)(_US *, _US),
-                       void (*rflds)(_LDS _US *, _LDS _US *), const _US rnv,
-                       const uint64_t k, const uint32_t nt,
-                       ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<_US, 16, 64>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope);
+__kmpc_xteamr_s_fast_sum(short v, short *r_p, short *tvs, uint32_t *td,
+                         void (*rf)(short *, short),
+                         void (*rflds)(_RF_LDS short *, _RF_LDS short *),
+                         const short rnv, const uint64_t k, const uint32_t nt,
+                         ompx::atomic::MemScopeTy Scope) {
+  _xteam_reduction<short, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope);
 }
 _EXT_ATTR
-__kmpc_xteamr_us_16x64_fast_sum(_US v, _US *r_p, _US *tvs, uint32_t *td,
-                                void (*rf)(_US *, _US),
-                                void (*rflds)(_LDS _US *, _LDS _US *),
-                                const _US rnv, const uint64_t k,
-                                const uint32_t nt,
-                                ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<_US, 16, 64, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt,
-                                      Scope);
+__kmpc_iteamr_s(short v, short *r_p, void (*rf)(short *, short),
+                void (*rflds)(_RF_LDS short *, _RF_LDS short *),
+                const short rnv, const uint64_t k) {
+  _iteam_reduction(short, v, r_p, rf, rflds, rnv, k);
 }
 _EXT_ATTR
-__kmpc_iteamr_us_16x64(_US v, _US *r_p, void (*rf)(_US *, _US),
-                       void (*rflds)(_LDS _US *, _LDS _US *), const _US rnv,
-                       const uint64_t k) {
-  _iteam_reduction<_US, 16, 64>(v, r_p, rf, rflds, rnv, k);
+__kmpc_xteamr_us(_US v, _US *r_p, _US *tvs, uint32_t *td,
+                 void (*rf)(_US *, _US),
+                 void (*rflds)(_RF_LDS _US *, _RF_LDS _US *), const _US rnv,
+                 const uint64_t k, const uint32_t nt,
+                 ompx::atomic::MemScopeTy Scope) {
+  _xteam_reduction<_US>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope);
 }
 _EXT_ATTR
-__kmpc_xteamr_i_16x64(int v, int *r_p, int *tvs, uint32_t *td,
-                      void (*rf)(int *, int),
-                      void (*rflds)(_LDS int *, _LDS int *), const int rnv,
-                      const uint64_t k, const uint32_t nt,
-                      ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<int, 16, 64>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope);
+__kmpc_xteamr_us_fast_sum(_US v, _US *r_p, _US *tvs, uint32_t *td,
+                          void (*rf)(_US *, _US),
+                          void (*rflds)(_RF_LDS _US *, _RF_LDS _US *),
+                          const _US rnv, const uint64_t k, const uint32_t nt,
+                          ompx::atomic::MemScopeTy Scope) {
+  _xteam_reduction<_US, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope);
 }
 _EXT_ATTR
-__kmpc_xteamr_i_16x64_fast_sum(int v, int *r_p, int *tvs, uint32_t *td,
-                               void (*rf)(int *, int),
-                               void (*rflds)(_LDS int *, _LDS int *),
-                               const int rnv, const uint64_t k,
-                               const uint32_t nt,
-                               ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<int, 16, 64, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt,
-                                      Scope);
+__kmpc_iteamr_us(_US v, _US *r_p, void (*rf)(_US *, _US),
+                 void (*rflds)(_RF_LDS _US *, _RF_LDS _US *), const _US rnv,
+                 const uint64_t k) {
+  _iteam_reduction(_US, v, r_p, rf, rflds, rnv, k);
 }
 _EXT_ATTR
-__kmpc_iteamr_i_16x64(int v, int *r_p, void (*rf)(int *, int),
-                      void (*rflds)(_LDS int *, _LDS int *), const int rnv,
-                      const uint64_t k) {
-  _iteam_reduction<int, 16, 64>(v, r_p, rf, rflds, rnv, k);
+__kmpc_xteamr_i(int v, int *r_p, int *tvs, uint32_t *td, void (*rf)(int *, int),
+                void (*rflds)(_RF_LDS int *, _RF_LDS int *), const int rnv,
+                const uint64_t k, const uint32_t nt,
+                ompx::atomic::MemScopeTy Scope) {
+  _xteam_reduction<int>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope);
 }
 _EXT_ATTR
-__kmpc_xteamr_ui_16x64(_UI v, _UI *r_p, _UI *tvs, uint32_t *td,
-                       void (*rf)(_UI *, _UI),
-                       void (*rflds)(_LDS _UI *, _LDS _UI *), const _UI rnv,
-                       const uint64_t k, const uint32_t nt,
-                       ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<_UI, 16, 64>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope);
+__kmpc_xteamr_i_fast_sum(int v, int *r_p, int *tvs, uint32_t *td,
+                         void (*rf)(int *, int),
+                         void (*rflds)(_RF_LDS int *, _RF_LDS int *),
+                         const int rnv, const uint64_t k, const uint32_t nt,
+                         ompx::atomic::MemScopeTy Scope) {
+  _xteam_reduction<int, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope);
 }
 _EXT_ATTR
-__kmpc_xteamr_ui_16x64_fast_sum(_UI v, _UI *r_p, _UI *tvs, uint32_t *td,
-                                void (*rf)(_UI *, _UI),
-                                void (*rflds)(_LDS _UI *, _LDS _UI *),
-                                const _UI rnv, const uint64_t k,
-                                const uint32_t nt,
-                                ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<_UI, 16, 64, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt,
-                                      Scope);
+__kmpc_iteamr_i(int v, int *r_p, void (*rf)(int *, int),
+                void (*rflds)(_RF_LDS int *, _RF_LDS int *), const int rnv,
+                const uint64_t k) {
+  _iteam_reduction(int, v, r_p, rf, rflds, rnv, k);
 }
 _EXT_ATTR
-__kmpc_iteamr_ui_16x64(_UI v, _UI *r_p, void (*rf)(_UI *, _UI),
-                       void (*rflds)(_LDS _UI *, _LDS _UI *), const _UI rnv,
-                       const uint64_t k) {
-  _iteam_reduction<_UI, 16, 64>(v, r_p, rf, rflds, rnv, k);
+__kmpc_xteamr_ui(_UI v, _UI *r_p, _UI *tvs, uint32_t *td,
+                 void (*rf)(_UI *, _UI),
+                 void (*rflds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI rnv,
+                 const uint64_t k, const uint32_t nt,
+                 ompx::atomic::MemScopeTy Scope) {
+  _xteam_reduction<_UI>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope);
 }
 _EXT_ATTR
-__kmpc_xteamr_l_16x64(long v, long *r_p, long *tvs, uint32_t *td,
-                      void (*rf)(long *, long),
-                      void (*rflds)(_LDS long *, _LDS long *), const long rnv,
-                      const uint64_t k, const uint32_t nt,
-                      ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<long, 16, 64>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope);
+__kmpc_xteamr_ui_fast_sum(_UI v, _UI *r_p, _UI *tvs, uint32_t *td,
+                          void (*rf)(_UI *, _UI),
+                          void (*rflds)(_RF_LDS _UI *, _RF_LDS _UI *),
+                          const _UI rnv, const uint64_t k, const uint32_t nt,
+                          ompx::atomic::MemScopeTy Scope) {
+  _xteam_reduction<_UI, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope);
 }
 _EXT_ATTR
-__kmpc_xteamr_l_16x64_fast_sum(long v, long *r_p, long *tvs, uint32_t *td,
-                               void (*rf)(long *, long),
-                               void (*rflds)(_LDS long *, _LDS long *),
-                               const long rnv, const uint64_t k,
-                               const uint32_t nt,
-                               ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<long, 16, 64, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt,
-                                       Scope);
+__kmpc_iteamr_ui(_UI v, _UI *r_p, void (*rf)(_UI *, _UI),
+                 void (*rflds)(_RF_LDS _UI *, _RF_LDS _UI *), const _UI rnv,
+                 const uint64_t k) {
+  _iteam_reduction(_UI, v, r_p, rf, rflds, rnv, k);
 }
 _EXT_ATTR
-__kmpc_iteamr_l_16x64(long v, long *r_p, void (*rf)(long *, long),
-                      void (*rflds)(_LDS long *, _LDS long *), const long rnv,
-                      const uint64_t k) {
-  _iteam_reduction<long, 16, 64>(v, r_p, rf, rflds, rnv, k);
+__kmpc_xteamr_l(long v, long *r_p, long *tvs, uint32_t *td,
+                void (*rf)(long *, long),
+                void (*rflds)(_RF_LDS long *, _RF_LDS long *), const long rnv,
+                const uint64_t k, const uint32_t nt,
+                ompx::atomic::MemScopeTy Scope) {
+  _xteam_reduction<long>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope);
 }
 _EXT_ATTR
-__kmpc_xteamr_ul_16x64(_UL v, _UL *r_p, _UL *tvs, uint32_t *td,
-                       void (*rf)(_UL *, _UL),
-                       void (*rflds)(_LDS _UL *, _LDS _UL *), const _UL rnv,
-                       const uint64_t k, const uint32_t nt,
-                       ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<_UL, 16, 64>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope);
+__kmpc_xteamr_l_fast_sum(long v, long *r_p, long *tvs, uint32_t *td,
+                         void (*rf)(long *, long),
+                         void (*rflds)(_RF_LDS long *, _RF_LDS long *),
+                         const long rnv, const uint64_t k, const uint32_t nt,
+                         ompx::atomic::MemScopeTy Scope) {
+  _xteam_reduction<long, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope);
 }
 _EXT_ATTR
-__kmpc_xteamr_ul_16x64_fast_sum(_UL v, _UL *r_p, _UL *tvs, uint32_t *td,
-                                void (*rf)(_UL *, _UL),
-                                void (*rflds)(_LDS _UL *, _LDS _UL *),
-                                const _UL rnv, const uint64_t k,
-                                const uint32_t nt,
-                                ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<_UL, 16, 64, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt,
-                                      Scope);
+__kmpc_iteamr_l(long v, long *r_p, void (*rf)(long *, long),
+                void (*rflds)(_RF_LDS long *, _RF_LDS long *), const long rnv,
+                const uint64_t k) {
+  _iteam_reduction(long, v, r_p, rf, rflds, rnv, k);
 }
 _EXT_ATTR
-__kmpc_iteamr_ul_16x64(_UL v, _UL *r_p, void (*rf)(_UL *, _UL),
-                       void (*rflds)(_LDS _UL *, _LDS _UL *), const _UL rnv,
-                       const uint64_t k) {
-  _iteam_reduction<_UL, 16, 64>(v, r_p, rf, rflds, rnv, k);
+__kmpc_xteamr_ul(_UL v, _UL *r_p, _UL *tvs, uint32_t *td,
+                 void (*rf)(_UL *, _UL),
+                 void (*rflds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL rnv,
+                 const uint64_t k, const uint32_t nt,
+                 ompx::atomic::MemScopeTy Scope) {
+  _xteam_reduction<_UL>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope);
 }
 _EXT_ATTR
-__kmpc_xteamr_d_32x32(double v, double *r_p, double *tvs, uint32_t *td,
-                      void (*rf)(double *, double),
-                      void (*rflds)(_LDS double *, _LDS double *),
-                      const double rnv, const uint64_t k, const uint32_t nt,
-                      ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<double, 32, 32>(v, r_p, tvs, td, rf, rflds, rnv, k, nt,
-                                   Scope);
+__kmpc_xteamr_ul_fast_sum(_UL v, _UL *r_p, _UL *tvs, uint32_t *td,
+                          void (*rf)(_UL *, _UL),
+                          void (*rflds)(_RF_LDS _UL *, _RF_LDS _UL *),
+                          const _UL rnv, const uint64_t k, const uint32_t nt,
+                          ompx::atomic::MemScopeTy Scope) {
+  _xteam_reduction<_UL, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope);
 }
 _EXT_ATTR
-__kmpc_xteamr_d_32x32_fast_sum(double v, double *r_p, double *tvs, uint32_t *td,
-                               void (*rf)(double *, double),
-                               void (*rflds)(_LDS double *, _LDS double *),
-                               const double rnv, const uint64_t k,
-                               const uint32_t nt,
-                               ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<double, 32, 32, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt,
-                                         Scope);
-}
-_EXT_ATTR
-__kmpc_iteamr_d_32x32(double v, double *r_p, void (*rf)(double *, double),
-                      void (*rflds)(_LDS double *, _LDS double *),
-                      const double rnv, const uint64_t k) {
-  _iteam_reduction<double, 32, 32>(v, r_p, rf, rflds, rnv, k);
-}
-_EXT_ATTR
-__kmpc_xteamr_f_32x32(float v, float *r_p, float *tvs, uint32_t *td,
-                      void (*rf)(float *, float),
-                      void (*rflds)(_LDS float *, _LDS float *),
-                      const float rnv, const uint64_t k, const uint32_t nt,
-                      ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<float, 32, 32>(v, r_p, tvs, td, rf, rflds, rnv, k, nt,
-                                  Scope);
-}
-_EXT_ATTR
-__kmpc_xteamr_f_32x32_fast_sum(float v, float *r_p, float *tvs, uint32_t *td,
-                               void (*rf)(float *, float),
-                               void (*rflds)(_LDS float *, _LDS float *),
-                               const float rnv, const uint64_t k,
-                               const uint32_t nt,
-                               ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<float, 32, 32, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt,
-                                        Scope);
-}
-_EXT_ATTR
-__kmpc_iteamr_f_32x32(float v, float *r_p, void (*rf)(float *, float),
-                      void (*rflds)(_LDS float *, _LDS float *),
-                      const float rnv, const uint64_t k) {
-  _iteam_reduction<float, 32, 32>(v, r_p, rf, rflds, rnv, k);
-}
-_EXT_ATTR
-__kmpc_xteamr_h_32x32(_Float16 v, _Float16 *r_p, _Float16 *tvs, uint32_t *td,
-                      void (*rf)(_Float16 *, _Float16),
-                      void (*rflds)(_LDS _Float16 *, _LDS _Float16 *),
-                      const _Float16 rnv, const uint64_t k, const uint32_t nt,
-                      ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<_Float16, 32, 32>(v, r_p, tvs, td, rf, rflds, rnv, k, nt,
-                                     Scope);
-}
-_EXT_ATTR
-__kmpc_xteamr_h_32x32_fast_sum(_Float16 v, _Float16 *r_p, _Float16 *tvs,
-                               uint32_t *td, void (*rf)(_Float16 *, _Float16),
-                               void (*rflds)(_LDS _Float16 *, _LDS _Float16 *),
-                               const _Float16 rnv, const uint64_t k,
-                               const uint32_t nt,
-                               ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<_Float16, 32, 32, true>(v, r_p, tvs, td, rf, rflds, rnv, k,
-                                           nt, Scope);
-}
-_EXT_ATTR
-__kmpc_iteamr_h_32x32(_Float16 v, _Float16 *r_p,
-                      void (*rf)(_Float16 *, _Float16),
-                      void (*rflds)(_LDS _Float16 *, _LDS _Float16 *),
-                      const _Float16 rnv, const uint64_t k) {
-  _iteam_reduction<_Float16, 32, 32>(v, r_p, rf, rflds, rnv, k);
-}
-_EXT_ATTR
-__kmpc_xteamr_bf_32x32(__bf16 v, __bf16 *r_p, __bf16 *tvs, uint32_t *td,
-                       void (*rf)(__bf16 *, __bf16),
-                       void (*rflds)(_LDS __bf16 *, _LDS __bf16 *),
-                       const __bf16 rnv, const uint64_t k, const uint32_t nt,
-                       ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<__bf16, 32, 32>(v, r_p, tvs, td, rf, rflds, rnv, k, nt,
-                                   Scope);
-}
-_EXT_ATTR
-__kmpc_xteamr_bf_32x32_fast_sum(__bf16 v, __bf16 *r_p, __bf16 *tvs,
-                                uint32_t *td, void (*rf)(__bf16 *, __bf16),
-                                void (*rflds)(_LDS __bf16 *, _LDS __bf16 *),
-                                const __bf16 rnv, const uint64_t k,
-                                const uint32_t nt,
-                                ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<__bf16, 32, 32, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt,
-                                         Scope);
-}
-_EXT_ATTR
-__kmpc_iteamr_bf_32x32(__bf16 v, __bf16 *r_p, void (*rf)(__bf16 *, __bf16),
-                       void (*rflds)(_LDS __bf16 *, _LDS __bf16 *),
-                       const __bf16 rnv, const uint64_t k) {
-  _iteam_reduction<__bf16, 32, 32>(v, r_p, rf, rflds, rnv, k);
-}
-_EXT_ATTR
-__kmpc_xteamr_s_32x32(short v, short *r_p, short *tvs, uint32_t *td,
-                      void (*rf)(short *, short),
-                      void (*rflds)(_LDS short *, _LDS short *),
-                      const short rnv, const uint64_t k, const uint32_t nt,
-                      ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<short, 32, 32>(v, r_p, tvs, td, rf, rflds, rnv, k, nt,
-                                  Scope);
-}
-_EXT_ATTR
-__kmpc_xteamr_s_32x32_fast_sum(short v, short *r_p, short *tvs, uint32_t *td,
-                               void (*rf)(short *, short),
-                               void (*rflds)(_LDS short *, _LDS short *),
-                               const short rnv, const uint64_t k,
-                               const uint32_t nt,
-                               ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<short, 32, 32, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt,
-                                        Scope);
-}
-_EXT_ATTR
-__kmpc_iteamr_s_32x32(short v, short *r_p, void (*rf)(short *, short),
-                      void (*rflds)(_LDS short *, _LDS short *),
-                      const short rnv, const uint64_t k) {
-  _iteam_reduction<short, 32, 32>(v, r_p, rf, rflds, rnv, k);
-}
-_EXT_ATTR
-__kmpc_xteamr_us_32x32(_US v, _US *r_p, _US *tvs, uint32_t *td,
-                       void (*rf)(_US *, _US),
-                       void (*rflds)(_LDS _US *, _LDS _US *), const _US rnv,
-                       const uint64_t k, const uint32_t nt,
-                       ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<_US, 32, 32>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope);
-}
-_EXT_ATTR
-__kmpc_xteamr_us_32x32_fast_sum(_US v, _US *r_p, _US *tvs, uint32_t *td,
-                                void (*rf)(_US *, _US),
-                                void (*rflds)(_LDS _US *, _LDS _US *),
-                                const _US rnv, const uint64_t k,
-                                const uint32_t nt,
-                                ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<_US, 32, 32, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt,
-                                      Scope);
-}
-_EXT_ATTR
-__kmpc_iteamr_us_32x32(_US v, _US *r_p, void (*rf)(_US *, _US),
-                       void (*rflds)(_LDS _US *, _LDS _US *), const _US rnv,
-                       const uint64_t k) {
-  _iteam_reduction<_US, 32, 32>(v, r_p, rf, rflds, rnv, k);
-}
-_EXT_ATTR
-__kmpc_xteamr_i_32x32(int v, int *r_p, int *tvs, uint32_t *td,
-                      void (*rf)(int *, int),
-                      void (*rflds)(_LDS int *, _LDS int *), const int rnv,
-                      const uint64_t k, const uint32_t nt,
-                      ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<int, 32, 32>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope);
-}
-_EXT_ATTR
-__kmpc_xteamr_i_32x32_fast_sum(int v, int *r_p, int *tvs, uint32_t *td,
-                               void (*rf)(int *, int),
-                               void (*rflds)(_LDS int *, _LDS int *),
-                               const int rnv, const uint64_t k,
-                               const uint32_t nt,
-                               ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<int, 32, 32, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt,
-                                      Scope);
-}
-_EXT_ATTR
-__kmpc_iteamr_i_32x32(int v, int *r_p, void (*rf)(int *, int),
-                      void (*rflds)(_LDS int *, _LDS int *), const int rnv,
-                      const uint64_t k) {
-  _iteam_reduction<int, 32, 32>(v, r_p, rf, rflds, rnv, k);
-}
-_EXT_ATTR
-__kmpc_xteamr_ui_32x32(_UI v, _UI *r_p, _UI *tvs, uint32_t *td,
-                       void (*rf)(_UI *, _UI),
-                       void (*rflds)(_LDS _UI *, _LDS _UI *), const _UI rnv,
-                       const uint64_t k, const uint32_t nt,
-                       ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<_UI, 32, 32>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope);
-}
-_EXT_ATTR
-__kmpc_xteamr_ui_32x32_fast_sum(_UI v, _UI *r_p, _UI *tvs, uint32_t *td,
-                                void (*rf)(_UI *, _UI),
-                                void (*rflds)(_LDS _UI *, _LDS _UI *),
-                                const _UI rnv, const uint64_t k,
-                                const uint32_t nt,
-                                ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<_UI, 32, 32, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt,
-                                      Scope);
-}
-_EXT_ATTR
-__kmpc_iteamr_ui_32x32(_UI v, _UI *r_p, void (*rf)(_UI *, _UI),
-                       void (*rflds)(_LDS _UI *, _LDS _UI *), const _UI rnv,
-                       const uint64_t k) {
-  _iteam_reduction<_UI, 32, 32>(v, r_p, rf, rflds, rnv, k);
-}
-_EXT_ATTR
-__kmpc_xteamr_l_32x32(long v, long *r_p, long *tvs, uint32_t *td,
-                      void (*rf)(long *, long),
-                      void (*rflds)(_LDS long *, _LDS long *), const long rnv,
-                      const uint64_t k, const uint32_t nt,
-                      ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<long, 32, 32>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope);
-}
-_EXT_ATTR
-__kmpc_xteamr_l_32x32_fast_sum(long v, long *r_p, long *tvs, uint32_t *td,
-                               void (*rf)(long *, long),
-                               void (*rflds)(_LDS long *, _LDS long *),
-                               const long rnv, const uint64_t k,
-                               const uint32_t nt,
-                               ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<long, 32, 32, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt,
-                                       Scope);
-}
-_EXT_ATTR
-__kmpc_iteamr_l_32x32(long v, long *r_p, void (*rf)(long *, long),
-                      void (*rflds)(_LDS long *, _LDS long *), const long rnv,
-                      const uint64_t k) {
-  _iteam_reduction<long, 32, 32>(v, r_p, rf, rflds, rnv, k);
-}
-_EXT_ATTR
-__kmpc_xteamr_ul_32x32(_UL v, _UL *r_p, _UL *tvs, uint32_t *td,
-                       void (*rf)(_UL *, _UL),
-                       void (*rflds)(_LDS _UL *, _LDS _UL *), const _UL rnv,
-                       const uint64_t k, const uint32_t nt,
-                       ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<_UL, 32, 32>(v, r_p, tvs, td, rf, rflds, rnv, k, nt, Scope);
-}
-_EXT_ATTR
-__kmpc_xteamr_ul_32x32_fast_sum(_UL v, _UL *r_p, _UL *tvs, uint32_t *td,
-                                void (*rf)(_UL *, _UL),
-                                void (*rflds)(_LDS _UL *, _LDS _UL *),
-                                const _UL rnv, const uint64_t k,
-                                const uint32_t nt,
-                                ompx::atomic::MemScopeTy Scope) {
-  _xteam_reduction<_UL, 32, 32, true>(v, r_p, tvs, td, rf, rflds, rnv, k, nt,
-                                      Scope);
-}
-_EXT_ATTR
-__kmpc_iteamr_ul_32x32(_UL v, _UL *r_p, void (*rf)(_UL *, _UL),
-                       void (*rflds)(_LDS _UL *, _LDS _UL *), const _UL rnv,
-                       const uint64_t k) {
-  _iteam_reduction<_UL, 32, 32>(v, r_p, rf, rflds, rnv, k);
+__kmpc_iteamr_ul(_UL v, _UL *r_p, void (*rf)(_UL *, _UL),
+                 void (*rflds)(_RF_LDS _UL *, _RF_LDS _UL *), const _UL rnv,
+                 const uint64_t k) {
+  _iteam_reduction(_UL, v, r_p, rf, rflds, rnv, k);
 }
 
 // Built-in pair reduction functions used as function pointers for
 // cross team reduction functions.
 
-#define _RF_LDS volatile __gpu_local
-
 _EXT_ATTR __kmpc_rfun_sum_d(double *val, double otherval) { *val += otherval; }
 _EXT_ATTR __kmpc_rfun_sum_lds_d(_RF_LDS double *val, _RF_LDS double *otherval) {
   *val += *otherval;
@@ -1071,11 +699,13 @@ _EXT_ATTR __kmpc_rfun_min_ul(_UL *val, _UL otherval) {
 _EXT_ATTR __kmpc_rfun_min_lds_ul(_RF_LDS _UL *val, _RF_LDS _UL *otherval) {
   *val = (*otherval < *val) ? *otherval : *val;
 }
-#undef _EXT_ATTR
+
 #undef _CD
 #undef _CF
 #undef _US
 #undef _UI
 #undef _UL
-#undef _LDS
+#undef _INLINE_ATTR_
 #undef _RF_LDS
+#undef _MaxNumWaves
+#undef _WSZ