Skip to content

Replace deprecated [[intel::reqd_sub_group_size(SgSize)]] with [[sycl::reqd_sub_group_size(SIMD)]] and remove unnecessary attributes #1828

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions src/ATen/native/xpu/sycl/BatchNormKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -423,7 +423,7 @@ template <
typename index_t>
struct BatchNormCollectStatisticsKernelFunctor
: public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<2> item) const {
int plane = item.get_group(1);
int tid = item.get_local_linear_id();
Expand Down Expand Up @@ -1874,7 +1874,7 @@ template <
typename index_t>
struct BatchNormBackwardReduceKernelFunctor
: public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<2> item) const {
index_t plane = item.get_group(1);

Expand Down Expand Up @@ -4162,7 +4162,7 @@ template <
typename stat_accscalar_t,
typename index_t>
struct BatchNormBackwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<2> item) const {
index_t plane = item.get_group(1);
index_t N = grad_output_.size(0) * grad_output_.size(2);
Expand Down Expand Up @@ -4370,7 +4370,7 @@ template <
typename index_t>
struct BatchNormBackwardVectorizedKernelFunctor
: public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<2> item) const {
index_t plane = item.get_group(1);
index_t N = grad_output_.size(0) * grad_output_.size(2);
Expand Down
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/Dequant_int4.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ struct DequantInt4KernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
weight_dequant(weight_dequant) {}

void sycl_ker_config_convention(sycl::handler& cgh) {}
[[intel::reqd_sub_group_size(SgSize)]] void operator()(
[[sycl::reqd_sub_group_size(SgSize)]] void operator()(
sycl::nd_item<1> it) const {
int constexpr GroupN = TileN;
int constexpr GroupK = SgSize * TileK;
Expand Down
10 changes: 4 additions & 6 deletions src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,7 @@ template <
int r_args_depth = 1,
int res_arg_index = 0>
struct LpNormFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
template <typename TLA, typename TLW>
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
template <typename TLA, typename TLW> void operator()(
const int64_t chunk_size,
TLA tlAddress,
TLW tlWGMeta,
Expand Down Expand Up @@ -117,7 +116,7 @@ struct LpNormFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {

template <typename out_t, NormType norm_type, typename opmath_t, int SIMD>
struct lpnormChunkReduceKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<1> item_id) const {
auto lid = item_id.get_local_linear_id();
auto group_id = item_id.get_group(0);
Expand Down Expand Up @@ -481,8 +480,7 @@ std::vector<Tensor> foreach_norm_kernel(

template <typename T, int SIMD>
struct LpMaxFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
template <typename TLA, typename TLW>
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
template <typename TLA, typename TLW> void operator()(
int64_t chunk_size,
TLA tlAddressMeta,
TLW tlWGMeta,
Expand Down Expand Up @@ -555,7 +553,7 @@ struct LpMaxFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {

template <typename T, int SIMD>
struct LpmaxChunkReduceKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<1> item_id) const {
auto local_range = item_id.get_local_range(0);
auto lid = item_id.get_local_linear_id();
Expand Down
16 changes: 8 additions & 8 deletions src/ATen/native/xpu/sycl/GroupNormKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ struct GNRowwiseMomentsFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
using WelfordOp =
WelfordOpsXPU<T_ACC, T_ACC, int64_t, std::pair<T_ACC, T_ACC>>;

[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<1> item) const {
const int64_t i = item.get_group(0);
WelfordOp welford_op = {/*correction=*/0, /*take_sqrt=*/false, item};
Expand Down Expand Up @@ -114,7 +114,7 @@ struct GNRowwiseMomentsVectorizedFunctor
WelfordOpsXPU<T_ACC, T_ACC, int64_t, std::pair<T_ACC, T_ACC>>;
using vec_t = memory::aligned_vector<T, VEC_SIZE>;

[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<1> item) const {
WelfordType val[VEC_SIZE];
WelfordOp welford_op = {/*correction=*/0, /*take_sqrt=*/false, item};
Expand Down Expand Up @@ -476,7 +476,7 @@ void group_norm_kernel(
template <typename T, typename T_ACC, int SIMD>
struct Compute1dBackwardFusedParamsFunctor
: public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<2> item) const {
const int64_t G = group_;
const int64_t D = C_ / G;
Expand Down Expand Up @@ -630,7 +630,7 @@ template <typename T, int SIMD, int kReduceTileSize>
struct GammaBeta1dBackwardLargeKernel : public __SYCL_KER_CONFIG_CONVENTION__ {
using T_ACC = acc_type_device<T, kXPU>;

[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<2> item) const {
const int64_t c =
item.get_group(1) * item.get_local_range(1) + item.get_local_id(1);
Expand Down Expand Up @@ -890,7 +890,7 @@ template <typename T, int SIMD>
struct ComputeInternalGradientsFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
using T_ACC = acc_type_device<T, kXPU>;

[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<1> item) const {
const int64_t nc = item.get_group(0);
T_ACC sum1 = 0;
Expand Down Expand Up @@ -941,7 +941,7 @@ struct ComputeInternalGradientsVectorizedFunctor
using vec_t = memory::aligned_vector<T, VEC_SIZE>;
using acc_vec_t = memory::aligned_vector<T_ACC, VEC_SIZE>;

[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<1> item) const {
acc_vec_t sum1_vec;
acc_vec_t sum2_vec;
Expand Down Expand Up @@ -1038,7 +1038,7 @@ struct ComputeBackwardFusedParamsFunctor
: public __SYCL_KER_CONFIG_CONVENTION__ {
using T_ACC = acc_type_device<T, kXPU>;

[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<2> item) const {
const int64_t G = group_;
const int64_t D = C_ / G;
Expand Down Expand Up @@ -1176,7 +1176,7 @@ template <typename T, int SIMD, int kReduceTileSize>
struct GammaBetaBackwardFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
using T_ACC = acc_type_device<T, kXPU>;

[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<2> item) const {
auto group_x = item.get_group(1);
auto group_size_x = item.get_local_range(1);
Expand Down
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/IndexKernelUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ inline bool fast_gather_kernel_eligible(

template <int Alignment, typename index_t>
struct VectorizedGatherKernel {
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<2> item) const {
int64_t ind = idx_[item.get_group(1)];
if (allow_neg_indices_) {
Expand Down
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/LayerNormKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ struct RowwiseMomentsFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
using WelfordType = WelfordData<T_ACC, int64_t>;
using WelfordOp = WelfordOps<T_ACC, T_ACC, int64_t, std::pair<T_ACC, T_ACC>>;

[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<1> item_id) const {
const int64_t i = item_id.get_group(0);
WelfordOp welford_op = {/*correction=*/0, /*take_sqrt=*/false};
Expand Down Expand Up @@ -435,7 +435,7 @@ WelfordDataLN compute_stats(
template <typename T, typename T_ACC>
struct VectorizedLayerNormKernelFunctor
: public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<2> item_id) const {
auto i1 = item_id.get_group(1);
const T* block_row = X_ + i1 * N_;
Expand Down
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/LinearInt4.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ struct LinearInt4KernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
ldc(ldc) {}
void sycl_ker_config_convention(sycl::handler& cgh) {}

[[intel::reqd_sub_group_size(16)]] void operator()(
[[sycl::reqd_sub_group_size(16)]] void operator()(
sycl::nd_item<1> it) const {
int constexpr Unroll = 2;
int constexpr SgSize = 16;
Expand Down
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ struct NllLoss2dForwardNoReduceKernelFunctor {

template <typename scalar_t, typename accscalar_t, typename index_t, int SIMD>
struct NllLoss2dForwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<1> item) const {
scalar_t cur_weight;
accscalar_t input_sum = 0;
Expand Down
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/MultiLabelMarginLossKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ void multilabel_margin_loss_shape_check(
template <typename scalar_t, typename accscalar_t>
struct MultilabelMarginLossForwardKernelFunctor
: public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(MULTILABELMARGIN_SUB_GROUP_SIZE)]] void
[[sycl::reqd_sub_group_size(MULTILABELMARGIN_SUB_GROUP_SIZE)]] void
operator()(sycl::nd_item<1> item) const {
int k = item.get_group(0);
const scalar_t* input_k = input_ + k * dim_;
Expand Down Expand Up @@ -148,7 +148,7 @@ struct MultilabelMarginLossForwardKernelFunctor
template <typename scalar_t, typename accscalar_t>
struct MultilabelMarginLossBackwardKernelFunctor
: public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(MULTILABELMARGIN_SUB_GROUP_SIZE)]] void
[[sycl::reqd_sub_group_size(MULTILABELMARGIN_SUB_GROUP_SIZE)]] void
operator()(sycl::nd_item<1> item) const {
int k = item.get_group(0);
const scalar_t* input_k = input_ + k * dim_;
Expand Down
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/Norm.h
Original file line number Diff line number Diff line change
Expand Up @@ -600,7 +600,7 @@ template <
class Norm,
bool one_moment = false>
struct FusedNormKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<3> item_id) const {
accscalar_t sum1 = 0;
accscalar_t sum2 = 0;
Expand Down Expand Up @@ -747,7 +747,7 @@ template <
class Norm,
bool one_moment = false>
struct RowwiseMomentsKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<3> item_id) const {
index_t local_id = item_id.get_local_id(2);

Expand Down
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/SoftMaxKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ template <
bool is_same_dtype>
struct DispatchSoftmaxForwardKernelFunctor
: public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<1> item) const {
if (local_size_ == 1 && item.get_global_id(0) >= outer_size_)
return;
Expand Down Expand Up @@ -933,7 +933,7 @@ template <
bool is_same_dtype = false>
struct DispatchSoftmaxBackwardKernelFunctor
: public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<1> item) const {
if (local_size_ == 1 && item.get_global_id(0) >= outer_size_)
return;
Expand Down
10 changes: 5 additions & 5 deletions src/ATen/native/xpu/sycl/SortingKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ namespace xpu {
template <typename method_t, typename key_t, typename value_t>
struct SegmentedGroupRadixSortPairsFunctor
: public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()(
[[sycl::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()(
sycl::nd_item<1> item) const {
int seg_idx = item.get_group(0);
int seg_offset = seg_idx * num_elements_;
Expand Down Expand Up @@ -96,7 +96,7 @@ void segmented_group_radix_sort_pairs_kernel(
template <typename method_t, typename key_t, typename value_t>
struct SegmentedRadixSortPairsUpsweepFunctor
: public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()(
[[sycl::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()(
sycl::nd_item<1> item) const {
int num_tiles = (num_elements_ + method_t::PROCESSING_LENGTH - 1) /
method_t::PROCESSING_LENGTH;
Expand Down Expand Up @@ -179,7 +179,7 @@ void segmented_radix_sort_pairs_upsweep_kernel(
template <typename method_t>
struct SegmentedRadixSortPairsScanFunctor
: public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()(
[[sycl::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()(
sycl::nd_item<1> item) const {
constexpr int RADIX_BUCKETS = 16;
int seg_idx = item.get_group(0);
Expand Down Expand Up @@ -218,7 +218,7 @@ void segmented_radix_sort_pairs_scan_kernel(
template <typename method_t, typename key_t, typename value_t>
struct SegmentedRadixSortPairsDownsweepFunctor
: public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()(
[[sycl::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()(
sycl::nd_item<1> item) const {
int num_tiles = (num_elements_ + method_t::PROCESSING_LENGTH - 1) /
method_t::PROCESSING_LENGTH;
Expand Down Expand Up @@ -448,7 +448,7 @@ struct SegmentedGroupRadixSelectPairsFunctor
MAX_KV_BYTES = std::max(sizeof(key_t), sizeof(value_t)),
};

[[intel::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()(
[[sycl::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()(
sycl::nd_item<1> item) const {
int seg_idx = item.get_group(0);
int seg_offset = seg_idx * nelements_;
Expand Down
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/TensorModeKernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ inline T reduceGroupWithNThreadLocalReductions(

template <typename T, unsigned int Power2Size>
struct ComputeModeKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(32)]] void operator()(
[[sycl::reqd_sub_group_size(32)]] void operator()(
sycl::nd_item<3> item) const {
int tidx = item.get_local_id(2);
int stidx = item.get_local_range(2) +
Expand Down