From baecd96be8d89ed11693630c86e0cd0e48e3eb6e Mon Sep 17 00:00:00 2001 From: guangyey Date: Wed, 9 Jul 2025 20:07:11 -0700 Subject: [PATCH] Replace deprecated `[[intel::reqd_sub_group_size]]` with `[[sycl::reqd_sub_group_size]]` --- src/ATen/native/xpu/sycl/BatchNormKernels.cpp | 8 ++++---- src/ATen/native/xpu/sycl/Dequant_int4.cpp | 2 +- .../native/xpu/sycl/ForeachReduceKernels.cpp | 10 ++++------ src/ATen/native/xpu/sycl/GroupNormKernels.cpp | 16 ++++++++-------- src/ATen/native/xpu/sycl/IndexKernelUtils.h | 2 +- src/ATen/native/xpu/sycl/LayerNormKernels.cpp | 4 ++-- src/ATen/native/xpu/sycl/LinearInt4.cpp | 2 +- src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp | 2 +- .../xpu/sycl/MultiLabelMarginLossKernels.cpp | 4 ++-- src/ATen/native/xpu/sycl/Norm.h | 4 ++-- src/ATen/native/xpu/sycl/SoftMaxKernels.cpp | 4 ++-- src/ATen/native/xpu/sycl/SortingKernels.h | 10 +++++----- src/ATen/native/xpu/sycl/TensorModeKernel.cpp | 2 +- 13 files changed, 34 insertions(+), 36 deletions(-) diff --git a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp index 2cccaba92b..f8e1b6906e 100644 --- a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp @@ -423,7 +423,7 @@ template < typename index_t> struct BatchNormCollectStatisticsKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<2> item) const { int plane = item.get_group(1); int tid = item.get_local_linear_id(); @@ -1874,7 +1874,7 @@ template < typename index_t> struct BatchNormBackwardReduceKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<2> item) const { index_t plane = item.get_group(1); @@ -4162,7 +4162,7 @@ template < typename stat_accscalar_t, typename index_t> struct BatchNormBackwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<2> item) const { index_t plane = item.get_group(1); index_t N = grad_output_.size(0) * grad_output_.size(2); @@ -4370,7 +4370,7 @@ template < typename index_t> struct BatchNormBackwardVectorizedKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<2> item) const { index_t plane = item.get_group(1); index_t N = grad_output_.size(0) * grad_output_.size(2); diff --git a/src/ATen/native/xpu/sycl/Dequant_int4.cpp b/src/ATen/native/xpu/sycl/Dequant_int4.cpp index b217e18ad1..8a52dbfb32 100644 --- a/src/ATen/native/xpu/sycl/Dequant_int4.cpp +++ b/src/ATen/native/xpu/sycl/Dequant_int4.cpp @@ -22,7 +22,7 @@ struct DequantInt4KernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { weight_dequant(weight_dequant) {} void sycl_ker_config_convention(sycl::handler& cgh) {} - [[intel::reqd_sub_group_size(SgSize)]] void operator()( + [[sycl::reqd_sub_group_size(SgSize)]] void operator()( sycl::nd_item<1> it) const { int constexpr GroupN = TileN; int constexpr GroupK = SgSize * TileK; diff --git a/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp b/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp index 899ffab56c..0a2b5cdbc7 100644 --- a/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp +++ b/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp @@ -25,8 +25,7 @@ template < int r_args_depth = 1, int res_arg_index = 0> struct LpNormFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - template - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + template void operator()( const int64_t chunk_size, TLA tlAddress, TLW tlWGMeta, @@ -117,7 +116,7 @@ struct LpNormFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { template struct lpnormChunkReduceKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<1> item_id) const { auto lid = item_id.get_local_linear_id(); auto group_id = item_id.get_group(0); @@ -481,8 +480,7 @@ std::vector foreach_norm_kernel( template struct LpMaxFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - template - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + template void operator()( int64_t chunk_size, TLA tlAddressMeta, TLW tlWGMeta, @@ -555,7 +553,7 @@ struct LpMaxFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { template struct LpmaxChunkReduceKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<1> item_id) const { auto local_range = item_id.get_local_range(0); auto lid = item_id.get_local_linear_id(); diff --git a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp index 935ab99f74..261c9a0627 100644 --- a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp @@ -66,7 +66,7 @@ struct GNRowwiseMomentsFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { using WelfordOp = WelfordOpsXPU>; - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<1> item) const { const int64_t i = item.get_group(0); WelfordOp welford_op = {/*correction=*/0, /*take_sqrt=*/false, item}; @@ -114,7 +114,7 @@ struct GNRowwiseMomentsVectorizedFunctor WelfordOpsXPU>; using vec_t = memory::aligned_vector; - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<1> item) const { WelfordType val[VEC_SIZE]; WelfordOp welford_op = {/*correction=*/0, /*take_sqrt=*/false, item}; @@ -476,7 +476,7 @@ void group_norm_kernel( template struct Compute1dBackwardFusedParamsFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<2> item) const { const int64_t G = group_; const int64_t D = C_ / G; @@ -630,7 +630,7 @@ template struct GammaBeta1dBackwardLargeKernel : public __SYCL_KER_CONFIG_CONVENTION__ { using T_ACC = acc_type_device; - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<2> item) const { const int64_t c = item.get_group(1) * item.get_local_range(1) + item.get_local_id(1); @@ -890,7 +890,7 @@ template struct ComputeInternalGradientsFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { using T_ACC = acc_type_device; - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<1> item) const { const int64_t nc = item.get_group(0); T_ACC sum1 = 0; @@ -941,7 +941,7 @@ struct ComputeInternalGradientsVectorizedFunctor using vec_t = memory::aligned_vector; using acc_vec_t = memory::aligned_vector; - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<1> item) const { acc_vec_t sum1_vec; acc_vec_t sum2_vec; @@ -1038,7 +1038,7 @@ struct ComputeBackwardFusedParamsFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { using T_ACC = acc_type_device; - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<2> item) const { const int64_t G = group_; const int64_t D = C_ / G; @@ -1176,7 +1176,7 @@ template struct GammaBetaBackwardFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { using T_ACC = acc_type_device; - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<2> item) const { auto group_x = item.get_group(1); auto group_size_x = item.get_local_range(1); diff --git a/src/ATen/native/xpu/sycl/IndexKernelUtils.h b/src/ATen/native/xpu/sycl/IndexKernelUtils.h index 1b1cdc1fdb..4be8b335be 100644 --- a/src/ATen/native/xpu/sycl/IndexKernelUtils.h +++ b/src/ATen/native/xpu/sycl/IndexKernelUtils.h @@ -42,7 +42,7 @@ inline bool fast_gather_kernel_eligible( template struct VectorizedGatherKernel { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<2> item) const { int64_t ind = idx_[item.get_group(1)]; if (allow_neg_indices_) { diff --git a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp index 2d62ad0584..9917b256ce 100644 --- a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp @@ -186,7 +186,7 @@ struct RowwiseMomentsFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { using WelfordType = WelfordData; using WelfordOp = WelfordOps>; - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<1> item_id) const { const int64_t i = item_id.get_group(0); WelfordOp welford_op = {/*correction=*/0, /*take_sqrt=*/false}; @@ -435,7 +435,7 @@ WelfordDataLN compute_stats( template struct VectorizedLayerNormKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<2> item_id) const { auto i1 = item_id.get_group(1); const T* block_row = X_ + i1 * N_; diff --git a/src/ATen/native/xpu/sycl/LinearInt4.cpp b/src/ATen/native/xpu/sycl/LinearInt4.cpp index 25665b639b..31ed632baf 100644 --- a/src/ATen/native/xpu/sycl/LinearInt4.cpp +++ b/src/ATen/native/xpu/sycl/LinearInt4.cpp @@ -41,7 +41,7 @@ struct LinearInt4KernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { ldc(ldc) {} void sycl_ker_config_convention(sycl::handler& cgh) {} - [[intel::reqd_sub_group_size(16)]] void operator()( + [[sycl::reqd_sub_group_size(16)]] void operator()( sycl::nd_item<1> it) const { int constexpr Unroll = 2; int constexpr SgSize = 16; diff --git a/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp b/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp index 8b018de6b8..6b67e081d1 100644 --- a/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp @@ -79,7 +79,7 @@ struct NllLoss2dForwardNoReduceKernelFunctor { template struct NllLoss2dForwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<1> item) const { scalar_t cur_weight; accscalar_t input_sum = 0; diff --git a/src/ATen/native/xpu/sycl/MultiLabelMarginLossKernels.cpp b/src/ATen/native/xpu/sycl/MultiLabelMarginLossKernels.cpp index 28047972fd..3c5d3271bc 100644 --- a/src/ATen/native/xpu/sycl/MultiLabelMarginLossKernels.cpp +++ b/src/ATen/native/xpu/sycl/MultiLabelMarginLossKernels.cpp @@ -51,7 +51,7 @@ void multilabel_margin_loss_shape_check( template struct MultilabelMarginLossForwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(MULTILABELMARGIN_SUB_GROUP_SIZE)]] void + [[sycl::reqd_sub_group_size(MULTILABELMARGIN_SUB_GROUP_SIZE)]] void operator()(sycl::nd_item<1> item) const { int k = item.get_group(0); const scalar_t* input_k = input_ + k * dim_; @@ -148,7 +148,7 @@ struct MultilabelMarginLossForwardKernelFunctor template struct MultilabelMarginLossBackwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(MULTILABELMARGIN_SUB_GROUP_SIZE)]] void + [[sycl::reqd_sub_group_size(MULTILABELMARGIN_SUB_GROUP_SIZE)]] void operator()(sycl::nd_item<1> item) const { int k = item.get_group(0); const scalar_t* input_k = input_ + k * dim_; diff --git a/src/ATen/native/xpu/sycl/Norm.h b/src/ATen/native/xpu/sycl/Norm.h index 6dd893100e..6117b6d261 100644 --- a/src/ATen/native/xpu/sycl/Norm.h +++ b/src/ATen/native/xpu/sycl/Norm.h @@ -600,7 +600,7 @@ template < class Norm, bool one_moment = false> struct FusedNormKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<3> item_id) const { accscalar_t sum1 = 0; accscalar_t sum2 = 0; @@ -747,7 +747,7 @@ template < class Norm, bool one_moment = false> struct RowwiseMomentsKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<3> item_id) const { index_t local_id = item_id.get_local_id(2); diff --git a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp index 2a4f749e15..fdedf5fb09 100644 --- a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp +++ b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp @@ -216,7 +216,7 @@ template < bool is_same_dtype> struct DispatchSoftmaxForwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<1> item) const { if (local_size_ == 1 && item.get_global_id(0) >= outer_size_) return; @@ -933,7 +933,7 @@ template < bool is_same_dtype = false> struct DispatchSoftmaxBackwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<1> item) const { if (local_size_ == 1 && item.get_global_id(0) >= outer_size_) return; diff --git a/src/ATen/native/xpu/sycl/SortingKernels.h b/src/ATen/native/xpu/sycl/SortingKernels.h index cce01ba4b5..aad93d9eb6 100644 --- a/src/ATen/native/xpu/sycl/SortingKernels.h +++ b/src/ATen/native/xpu/sycl/SortingKernels.h @@ -15,7 +15,7 @@ namespace xpu { template struct SegmentedGroupRadixSortPairsFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()( + [[sycl::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()( sycl::nd_item<1> item) const { int seg_idx = item.get_group(0); int seg_offset = seg_idx * num_elements_; @@ -96,7 +96,7 @@ void segmented_group_radix_sort_pairs_kernel( template struct SegmentedRadixSortPairsUpsweepFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()( + [[sycl::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()( sycl::nd_item<1> item) const { int num_tiles = (num_elements_ + method_t::PROCESSING_LENGTH - 1) / method_t::PROCESSING_LENGTH; @@ -179,7 +179,7 @@ void segmented_radix_sort_pairs_upsweep_kernel( template struct SegmentedRadixSortPairsScanFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()( + [[sycl::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()( sycl::nd_item<1> item) const { constexpr int RADIX_BUCKETS = 16; int seg_idx = item.get_group(0); @@ -218,7 +218,7 @@ void segmented_radix_sort_pairs_scan_kernel( template struct SegmentedRadixSortPairsDownsweepFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()( + [[sycl::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()( sycl::nd_item<1> item) const { int num_tiles = (num_elements_ + method_t::PROCESSING_LENGTH - 1) / method_t::PROCESSING_LENGTH; @@ -448,7 +448,7 @@ struct SegmentedGroupRadixSelectPairsFunctor MAX_KV_BYTES = std::max(sizeof(key_t), sizeof(value_t)), }; - [[intel::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()( + [[sycl::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()( sycl::nd_item<1> item) const { int seg_idx = item.get_group(0); int seg_offset = seg_idx * nelements_; diff --git a/src/ATen/native/xpu/sycl/TensorModeKernel.cpp b/src/ATen/native/xpu/sycl/TensorModeKernel.cpp index 7ae95e36b8..dcecf27ab0 100644 --- a/src/ATen/native/xpu/sycl/TensorModeKernel.cpp +++ b/src/ATen/native/xpu/sycl/TensorModeKernel.cpp @@ -231,7 +231,7 @@ inline T reduceGroupWithNThreadLocalReductions( template struct ComputeModeKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(32)]] void operator()( + [[sycl::reqd_sub_group_size(32)]] void operator()( sycl::nd_item<3> item) const { int tidx = item.get_local_id(2); int stidx = item.get_local_range(2) +