Optimize roi_align on BMG #1698

Open · wants to merge 11 commits into base: main
Changes from 4 commits
76 changes: 50 additions & 26 deletions src/ATen/native/xpu/sycl/RoiAlignKernels.cpp
@@ -69,22 +69,33 @@ T bilinear_interpolate(
template <typename T>
struct RoiAlignForwardKernel {
void operator()(sycl::nd_item<1> item) const {
XPU_KERNEL_LOOP(item, index, nthreads_) {
// (n, c, ph, pw) is an element in the pooled output
auto wg = item.get_group(0);
Copilot AI (May 28, 2025): Ensure that using the workgroup id divided by wgs_per_roi_ to compute the ROI index accurately reflects the intended work distribution; a clarifying comment here would be helpful.

Suggested change:
    auto wg = item.get_group(0);
    // Compute the ROI index (n) by dividing the workgroup ID (wg) by the number of workgroups per ROI (wgs_per_roi_).
    // This ensures that each ROI is processed by the correct set of workgroups.

auto idx = item.get_local_id(0);
Contributor Author: I deleted this variable.

int n = wg / wg_per_roi_;
int index = (wg - n * wg_per_roi_) * item.get_local_range(0);
if (index < item_per_roi_) {
int pw = index % pooled_width_;
int ph = (index / pooled_width_) % pooled_height_;
int c = (index / pooled_width_ / pooled_height_) % channels_;
int n = index / pooled_width_ / pooled_height_ / channels_;

const T* offset_rois = rois_ + n * 5;
int roi_batch_ind = offset_rois[0];
if (idx == 0) {
cache_roi_[0] = offset_rois[0];

// Do not use rounding; this implementation detail is critical
T offset = aligned_ ? (T)0.5 : (T)0.0;
cache_roi_[1] = offset_rois[1] * spatial_scale_ - offset;
cache_roi_[2] = offset_rois[2] * spatial_scale_ - offset;
cache_roi_[3] = offset_rois[3] * spatial_scale_ - offset;
cache_roi_[4] = offset_rois[4] * spatial_scale_ - offset;
}
item.barrier(sycl_local_fence);
EikanWang (Contributor, May 26, 2025): The barrier may be bypassed for some work items. If so, it will trigger a hardware hang. Please refine the logic of line 76, `if (index < item_per_roi_)`, to ensure the barrier is not bypassed.

Contributor Author: Updated.


// Do not use rounding; this implementation detail is critical
T offset = aligned_ ? (T)0.5 : (T)0.0;
T roi_start_w = offset_rois[1] * spatial_scale_ - offset;
T roi_start_h = offset_rois[2] * spatial_scale_ - offset;
T roi_end_w = offset_rois[3] * spatial_scale_ - offset;
T roi_end_h = offset_rois[4] * spatial_scale_ - offset;
int roi_batch_ind = cache_roi_[0];
T roi_start_w = cache_roi_[1];
T roi_start_h = cache_roi_[2];
T roi_end_w = cache_roi_[3];
T roi_end_h = cache_roi_[4];

T roi_width = roi_end_w - roi_start_w;
T roi_height = roi_end_h - roi_start_h;
@@ -136,9 +147,10 @@ struct RoiAlignForwardKernel {
}
}
RoiAlignForwardKernel(
int nthreads,
const T* input,
const T spatial_scale,
int item_per_rois,
int wg_per_roi,
Contributor: Should the variable names be items_per_roi and wgs_per_roi accordingly?

Contributor: The RoiAlignForwardKernel does not inherit from __SYCL_KER_CONFIG_CONVENTION__. When will sycl_ker_config_convention be invoked? @xytintel

Contributor Author: Modified the variable names and added the __SYCL_KER_CONFIG_CONVENTION__ inheritance.

int channels,
int height,
int width,
@@ -148,9 +160,10 @@ struct RoiAlignForwardKernel {
bool aligned,
const T* rois,
T* output)
: nthreads_(nthreads),
input_(input),
: input_(input),
spatial_scale_(spatial_scale),
item_per_roi_(item_per_rois),
wg_per_roi_(wg_per_roi),
channels_(channels),
height_(height),
width_(width),
@@ -160,20 +173,25 @@ struct RoiAlignForwardKernel {
aligned_(aligned),
rois_(rois),
output_(output) {}
void sycl_ker_config_convention(sycl::handler& cgh) {
cache_roi_ = sycl_local_acc_t<T>(5, cgh);
Contributor: Please define a variable for the magic value 5, and add informative comments elaborating on why the value should be 5 rather than other values.

Contributor Author: I added a comment.

}

private:
int nthreads_;
const T* input_;
const T spatial_scale_;
int channels_;
int height_;
int width_;
int pooled_height_;
int pooled_width_;
int sampling_ratio_;
bool aligned_;
const int item_per_roi_;
const int wg_per_roi_;
Contributor: Comments are required; please add an informative description for each variable.

const int channels_;
const int height_;
const int width_;
const int pooled_height_;
const int pooled_width_;
const int sampling_ratio_;
const bool aligned_;
const T* rois_;
T* output_;
sycl_local_acc_t<T> cache_roi_;
};

template <typename T>
@@ -415,11 +433,7 @@ Tensor roi_align_kernel(

at::Tensor output = at::zeros(
{num_rois, channels, pooled_height, pooled_width}, input.options());

auto output_size = num_rois * pooled_height * pooled_width * channels;
int64_t global_range =
ceil_div(static_cast<int64_t>(output_size), static_cast<int64_t>(512));
int64_t local_range = 512;

if (output.numel() == 0) {
return output;
@@ -433,10 +447,20 @@ Tensor roi_align_kernel(
input.scalar_type(),
"roi_align_forward_kernel_xpu",
[&] {
int64_t local_range =
syclMaxWorkGroupSize<RoiAlignForwardKernel<scalar_t>>();
int item_per_roi = pooled_height * pooled_width * channels;
Contributor: item_per_roi -> items_per_roi

if (item_per_roi < local_range) {
Contributor: local_range is the maximum number of work items in a single work-group. Please assert that local_range is always a multiple of 32; otherwise the round-up below may push it past that maximum. Or does the SYCL spec guarantee that syclMaxWorkGroupSize is always divisible by 32?

Contributor Author: Updated; the local range will no longer exceed the max group size.

local_range = (item_per_roi + 32 - 1) / 32 *
32; // wg can be smaller, but it is better to be a multiple of 32
}
Contributor: Frankly speaking, I cannot quite understand the motivation for 32. If it represents the SIMD length, please define a constant variable and elaborate on why the size is better as a multiple of 32.

Contributor Author: It's the SIMD length. @xytintel, can our block size be an arbitrary number?

int wg_per_roi = (item_per_roi + local_range - 1) / local_range;
Contributor: wg_per_roi -> wgs_per_roi.

int64_t global_range = wg_per_roi * num_rois;
auto kfn = RoiAlignForwardKernel<scalar_t>(
output_size,
input_.data_ptr<scalar_t>(),
spatial_scale,
item_per_roi,
wg_per_roi,
channels,
height,
width,
18 changes: 11 additions & 7 deletions src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp
Contributor: Only format changes, right?

Contributor Author: Yes, only format changes; auto-lint changed this during the build. Why is our lint CI green?

@@ -425,9 +425,9 @@ struct UpsampleBilinear2dBackwardNotAlignKernelFunctor {
// scale is 1 if on boundary
distance_w =
distance_w + is_boundary_w * (output_width_ * 2 - distance_w);
bool is_boundary_h =
!((point_h >= output_height_) &&
(point_h <= output_height_ * input_height_ * 2 - output_height_));
bool is_boundary_h = !(
(point_h >= output_height_) &&
(point_h <= output_height_ * input_height_ * 2 - output_height_));
distance_h =
distance_h + is_boundary_h * (output_height_ * 2 - distance_h);
accscalar_t scale =
@@ -606,8 +606,10 @@ void launch_upsample_bilinear2d_backward_kernel(
// TODO: when the input is 3x3, the scale is 1.5, and the output is 4x4,
// PyTorch prefers 1/1.5, but my implementation treats it as 3/4...
// I also have to skip double because of rounding issues; it will not pass UT
can_optimize = can_optimize && (align_corners || (input_width == (rwidth * output_width) &&
input_height == (rheight * output_height))) &&
can_optimize = can_optimize &&
(align_corners ||
(input_width == (rwidth * output_width) &&
input_height == (rheight * output_height))) &&
!std::is_same<scalar_t, double>::value;
if (can_optimize) {
if (align_corners) {
@@ -790,8 +792,10 @@ void launch_upsample_bilinear2d_backward_nhwc_kernel(
// TODO: when the input is 3x3, the scale is 1.5, and the output is 4x4,
// PyTorch prefers 1/1.5, but my implementation treats it as 3/4...
// I also have to skip double because of rounding issues; it will not pass UT
can_optimize = can_optimize && (align_corners || (input_width == (rwidth * output_width) &&
input_height == (rheight * output_height))) &&
can_optimize = can_optimize &&
(align_corners ||
(input_width == (rwidth * output_width) &&
input_height == (rheight * output_height))) &&
!std::is_same<scalar_t, double>::value;
if (can_optimize) {
if (align_corners) {