diff --git a/Paddle b/Paddle
index d7597815b16..2701651b591 160000
--- a/Paddle
+++ b/Paddle
@@ -1 +1 @@
-Subproject commit d7597815b16e2a72f927b2f15f8477b196098bf0
+Subproject commit 2701651b5912cf97d4ecd3a20444b2dbce3a0b7f
diff --git a/backends/iluvatar_gpu/CMakeLists.txt b/backends/iluvatar_gpu/CMakeLists.txt
index cf3bee126b2..e6fd433c723 100644
--- a/backends/iluvatar_gpu/CMakeLists.txt
+++ b/backends/iluvatar_gpu/CMakeLists.txt
@@ -112,7 +112,7 @@ file(
   ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cuda_driver.cc
   # Core
   ${PADDLE_SOURCE_DIR}/paddle/phi/core/enforce.cc
-  ${PADDLE_SOURCE_DIR}/paddle/phi/core/flags.cc
+  # ${PADDLE_SOURCE_DIR}/paddle/phi/core/flags.cc
   ${PADDLE_SOURCE_DIR}/paddle/phi/core/mixed_vector.cc
   ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cusparse.cc
   # kernels/funcs
@@ -128,6 +128,7 @@ file(
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/expand_modality_expert_id_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/cal_aux_loss_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/cal_aux_loss_grad_kernel.cu
+  ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/batched_gemm.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/yolo_box_post_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/spectral_norm_grad_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/spectral_norm_kernel.cu
@@ -876,7 +877,7 @@ file(
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_stack_transpose_quant_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_rope_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_swiglu_weighted_bwd_kernel.cu
-  ${PADDLE_SOURCE_DIR}/paddle/phi/core/flags.cc
+  # ${PADDLE_SOURCE_DIR}/paddle/phi/core/flags.cc
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/math_function.cc
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/log_softmax_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu
diff --git a/backends/iluvatar_gpu/common/cuda_flags.cc b/backends/iluvatar_gpu/common/cuda_flags.cc
index 31209fb0e98..a30896f0ff2 100644
--- a/backends/iluvatar_gpu/common/cuda_flags.cc
+++ b/backends/iluvatar_gpu/common/cuda_flags.cc
@@ -277,3 +277,19 @@ PHI_DEFINE_EXPORTED_bool(
     flash_attn_available,
     true,
     "Whether flash attention is available on the current device.");
+
+/**
+ * CUDNN related FLAG
+ * Name: FLAGS_conv_workspace_size_limit
+ * Since Version: 0.13.0
+ * Value Range: int64, default=1024 (MB)
+ * Example:
+ * Note: The internal function of cuDNN obtains the fastest matching algorithm
+ *       within this memory limit. Faster algorithms can usually be chosen in
+ *       a larger workspace, but a larger workspace can also significantly
+ *       increase memory use. Users need to balance memory and speed.
+ */
+PHI_DEFINE_EXPORTED_int64(conv_workspace_size_limit,
+                          1024,
+                          "cuDNN convolution workspace limit in MB unit.");
diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/batched_gemm_kernel_register.cu b/backends/iluvatar_gpu/kernels/cuda_kernels/batched_gemm_kernel_register.cu
new file mode 100644
index 00000000000..46fb616efc5
--- /dev/null
+++ b/backends/iluvatar_gpu/kernels/cuda_kernels/batched_gemm_kernel_register.cu
@@ -0,0 +1,23 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/legacy/gpu/batched_gemm.h" + +PD_CUSTOM_KERNEL_REGISTER(batched_gemm, + iluvatar_gpu, + ALL_LAYOUT, + phi::BatchedGEMM, + float, + phi::bfloat16) {} diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_grad_kernel.cu b/backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_grad_kernel.cu deleted file mode 100644 index 823d7008d42..00000000000 --- a/backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_grad_kernel.cu +++ /dev/null @@ -1,200 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/backends/gpu/gpu_launch_config.h" -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/fused_rope_utils.h" - -namespace phi { -namespace fusion { - -template -void FusedRopeGradKernel(const Context& dev_ctx, - const paddle::optional& sin, - const paddle::optional& cos, - const paddle::optional& position_ids, - const DenseTensor& dout_q, - const paddle::optional& dout_k, - const paddle::optional& dout_v, - bool use_neox_rotary_style, - bool time_major, - float rotary_emb_base, - DenseTensor* dq, - DenseTensor* dk, - DenseTensor* dv) { - int64_t numel = dout_q.numel(); - if (numel <= 0) return; - dev_ctx.template Alloc(dq); - - phi::Array inputs_num_heads; - // small size for broadcast - auto batch_size = time_major ? dout_q.dims()[1] : dout_q.dims()[0]; - auto seq_len = time_major ? 
dout_q.dims()[0] : dout_q.dims()[1]; - inputs_num_heads[0] = dout_q.dims()[2]; - auto head_dim = dout_q.dims()[3]; - PADDLE_ENFORCE_NE(head_dim % 2, - 1, - common::errors::InvalidArgument( - "The head_dim of input must be a multiple of 2.")); - - constexpr const int vec_size = 2; - - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size); - - int64_t grid = config.block_per_grid.x; - int64_t block = config.thread_per_block.x; - auto stream = dev_ctx.stream(); - - phi::Array outs_data; - phi::Array ins_data; - phi::Array sin_cos_data; - const int64_t* position_ids_data = NULL; - - ins_data[0] = dout_q.data(); - outs_data[0] = dq->data(); - int num_inputs = 1; - - if (dout_k) { - dev_ctx.template Alloc(dk); - outs_data[num_inputs] = dk->data(); - ins_data[num_inputs] = dout_k->data(); - inputs_num_heads[num_inputs] = dk->dims()[2]; - num_inputs++; - } - - if (dout_v) { - dev_ctx.template Alloc(dv); - outs_data[num_inputs] = dv->data(); - ins_data[num_inputs] = dout_v->data(); - inputs_num_heads[num_inputs] = dv->dims()[2]; - num_inputs++; - } - - using MPType = typename phi::dtype::MPTypeTrait::Type; - MPType div_c = static_cast(1.0f / head_dim); - - bool flag_sin_cos = false; - if (sin.get_ptr() && cos.get_ptr()) { - sin_cos_data[0] = sin->data(); - sin_cos_data[1] = cos->data(); - - flag_sin_cos = true; - - if (position_ids) { - position_ids_data = position_ids->data(); - } - } - - bool is_same_num_heads = true; - auto prev_num_heads = inputs_num_heads[0]; - for (int i = 1; i < num_inputs; ++i) { - if (prev_num_heads != inputs_num_heads[i]) { - is_same_num_heads = false; - break; - } - prev_num_heads = inputs_num_heads[i]; - } - - int sign = -1; - - VectorizedFusedRopeCudaKernelFunc kernel_func = - use_neox_rotary_style - ? VectorizedFusedRopeWithRotateEveryTwoKernel - : VectorizedFusedRopeWithRotateHalfKernel; - - if (is_same_num_heads) { - int64_t batch_stride = - time_major ? dout_q.strides()[1] : dout_q.strides()[0]; - int64_t seq_stride = time_major ? dout_q.strides()[0] : dout_q.strides()[1]; - kernel_func<<>>(ins_data, - sin_cos_data, - position_ids_data, - flag_sin_cos, - sign, - batch_size, - seq_len, - inputs_num_heads[0], - head_dim, - batch_stride, - seq_stride, - num_inputs, - div_c, - rotary_emb_base, - outs_data); - - } else { - // rotary position embedding Q - int64_t batch_stride_q = - time_major ? dout_q.strides()[1] : dout_q.strides()[0]; - int64_t seq_stride_q = - time_major ? dout_q.strides()[0] : dout_q.strides()[1]; - kernel_func<<>>(ins_data, - sin_cos_data, - position_ids_data, - flag_sin_cos, - sign, - batch_size, - seq_len, - inputs_num_heads[0], - head_dim, - batch_stride_q, - seq_stride_q, - 1, - div_c, - rotary_emb_base, - outs_data); - - // rotary position embedding K,V - int64_t batch_stride_kv = time_major - ? inputs_num_heads[1] * head_dim - : seq_len * inputs_num_heads[1] * head_dim; - int64_t seq_stride_kv = time_major - ? 
batch_size * inputs_num_heads[1] * head_dim - : inputs_num_heads[1] * head_dim; - - phi::Array input_kv{ins_data[1], ins_data[2], nullptr}; - phi::Array out_kv{outs_data[1], outs_data[2], nullptr}; - kernel_func<<>>(input_kv, - sin_cos_data, - position_ids_data, - flag_sin_cos, - sign, - batch_size, - seq_len, - inputs_num_heads[1], - head_dim, - batch_stride_kv, - seq_stride_kv, - num_inputs - 1, - div_c, - rotary_emb_base, - out_kv); - } -} - -} // namespace fusion -} // namespace phi - -PD_REGISTER_PLUGIN_KERNEL(fused_rotary_position_embedding_grad, - iluvatar_gpu, - ALL_LAYOUT, - phi::fusion::FusedRopeGradKernel, - float, - phi::dtype::float16, - phi::dtype::bfloat16){}; diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_grad_kernel_register.cu b/backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_grad_kernel_register.cu new file mode 100644 index 00000000000..64d4083f0be --- /dev/null +++ b/backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_grad_kernel_register.cu @@ -0,0 +1,24 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/fusion/gpu/fused_rope_grad_kernel.cu" //NOLINT + +PD_CUSTOM_KERNEL_REGISTER(fused_rotary_position_embedding_grad, + iluvatar_gpu, + ALL_LAYOUT, + phi::fusion::FusedRopeGradKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16){}; diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_kernel.cu b/backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_kernel.cu deleted file mode 100644 index c4f24acdbcb..00000000000 --- a/backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_kernel.cu +++ /dev/null @@ -1,291 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/backends/gpu/gpu_launch_config.h" -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/fused_rope_utils.h" - -namespace phi { -namespace fusion { - -template -void FusedRopeKernel(const Context& dev_ctx, - const DenseTensor& q, - const paddle::optional& k, - const paddle::optional& v, - const paddle::optional& sin, - const paddle::optional& cos, - const paddle::optional& position_ids, - bool use_neox_rotary_style, - bool time_major, - float rotary_emb_base, - DenseTensor* out_q, - DenseTensor* out_k, - DenseTensor* out_v) { - int64_t numel = q.numel(); - if (numel <= 0) return; - dev_ctx.template Alloc(out_q); - - phi::Array inputs_num_heads; - - // q.shape: [seq_len, batch_size, num_heads, head_dim] if time_major else - // [batch_size, seq_len, num_heads, head_dim] - auto batch_size = time_major ? q.dims()[1] : q.dims()[0]; - auto seq_len = time_major ? q.dims()[0] : q.dims()[1]; - inputs_num_heads[0] = q.dims()[2]; - auto head_dim = q.dims()[3]; - - PADDLE_ENFORCE_EQ(head_dim % 2, - 0, - common::errors::InvalidArgument( - "The head_dim of input must be a multiple of 2.")); - - constexpr const int vec_size = 2; - - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size); - - int64_t grid = config.block_per_grid.x; - int64_t block = config.thread_per_block.x; - auto stream = dev_ctx.stream(); - - phi::Array outs_data; - phi::Array ins_data; - phi::Array sin_cos_data; - const int64_t* position_ids_data = NULL; - - ins_data[0] = q.data(); - outs_data[0] = out_q->data(); - int num_inputs = 1; - - if (k) { - dev_ctx.template Alloc(out_k); - ins_data[num_inputs] = k->data(); - outs_data[num_inputs] = out_k->data(); - inputs_num_heads[num_inputs] = k->dims()[2]; - num_inputs++; - } - - if (v) { - dev_ctx.template Alloc(out_v); - ins_data[num_inputs] = v->data(); - outs_data[num_inputs] = out_v->data(); - inputs_num_heads[num_inputs] = v->dims()[2]; - num_inputs++; - } - - using MPType = typename phi::dtype::MPTypeTrait::Type; - MPType div_c = static_cast(1.0f / head_dim); - - bool flag_sin_cos = false; - - if (sin.get_ptr() && cos.get_ptr()) { - PADDLE_ENFORCE_EQ(sin.get_ptr()->dims(), - cos.get_ptr()->dims(), - common::errors::InvalidArgument( - "The dims of sin and cos must be the same. But " - "received sin's dims is {%s}, cos's dims is {%s}.", - sin.get_ptr()->dims(), - cos.get_ptr()->dims())); - - auto sin_dims = sin.get_ptr()->dims(); - int dims_size = sin_dims.size(); - PADDLE_ENFORCE_EQ((dims_size == 2 || dims_size == 4), - true, - common::errors::InvalidArgument( - "The dims of sin and cos is expected to " - "be 2 or 4, but received %d.", - dims_size)); - if (dims_size == 4) { - // sin.shape: [1, seq_len, 1, head_dim] - PADDLE_ENFORCE_EQ( - (sin_dims[0] == 1 && sin_dims[2] == 1), - true, - common::errors::InvalidArgument( - "The batch_size and num_heads of sin and cos must be 1.")); - } - int sin_seq_len_dim = (dims_size) == 4 ? 1 : 0; - - if (position_ids) { - PADDLE_ENFORCE_EQ( - (sin_dims[dims_size - 1] == head_dim && - sin_dims[sin_seq_len_dim] >= seq_len), - true, - common::errors::InvalidArgument( - "The seq_len of sin and cos must be greater than or equal to " - "this of q. The head_dim of sin and cos must be the same as this " - "of q. 
But received sin's " - "shape is {%s}, q's shape is {%s}.", - sin_dims, - q.dims())); - - auto position_ids_dims = position_ids.get_ptr()->dims(); - PADDLE_ENFORCE_EQ(position_ids_dims.size(), - 2, - common::errors::InvalidArgument( - "The dims of position_ids is expected to " - "be 2, but received %d.", - position_ids_dims.size())); - - PADDLE_ENFORCE_EQ( - (position_ids_dims[0] == batch_size && - position_ids_dims[1] == seq_len), - true, - common::errors::InvalidArgument( - "The batch_size and seq_len of position_ids must be the same as " - "those of q. But received position_ids's " - "shape is {%s}, q's shape is {%s}.", - position_ids_dims, - q.dims())); - - position_ids_data = position_ids->data(); - } else { - PADDLE_ENFORCE_EQ( - (sin_dims[dims_size - 1] == head_dim && - sin_dims[sin_seq_len_dim] == seq_len), - true, - common::errors::InvalidArgument( - "The seq_len and head_dim of sin and cos " - "must be the same as those of q. But received sin's " - "shape is {%s}, q's shape is {%s}.", - sin_dims, - q.dims())); - } - - sin_cos_data[0] = sin->data(); - sin_cos_data[1] = cos->data(); - - flag_sin_cos = true; - } - - bool is_same_num_heads = true; - auto prev_num_heads = inputs_num_heads[0]; - for (int i = 1; i < num_inputs; ++i) { - if (prev_num_heads != inputs_num_heads[i]) { - is_same_num_heads = false; - break; - } - prev_num_heads = inputs_num_heads[i]; - } - - int sign = 1; - VectorizedFusedRopeCudaKernelFunc kernel_func = - use_neox_rotary_style - ? VectorizedFusedRopeWithRotateEveryTwoKernel - : VectorizedFusedRopeWithRotateHalfKernel; - - if (is_same_num_heads) { - int64_t batch_stride = time_major ? q.strides()[1] : q.strides()[0]; - int64_t seq_stride = time_major ? q.strides()[0] : q.strides()[1]; - kernel_func<<>>(ins_data, - sin_cos_data, - position_ids_data, - flag_sin_cos, - sign, - batch_size, - seq_len, - inputs_num_heads[0], - head_dim, - batch_stride, - seq_stride, - num_inputs, - div_c, - rotary_emb_base, - outs_data); - } else { - // Multi Query Attention (MQA) or Group Query Attention (GQA) - PADDLE_ENFORCE_EQ( - (inputs_num_heads[0] != inputs_num_heads[num_inputs - 1]) && - (inputs_num_heads[0] % inputs_num_heads[num_inputs - 1] == 0), - true, - common::errors::InvalidArgument( - "The MQA or GQA mode is entered, when the number of heads of qkv " - "is not exactly the same two by two. This mode requires " - "num_heads of q to be divisible by k,v." - "But received num_heads of q is %d, num_heads of k,v is %d", - inputs_num_heads[0], - inputs_num_heads[num_inputs - 1])); - - if (k.get_ptr() && v.get_ptr()) { - PADDLE_ENFORCE_EQ( - inputs_num_heads[1] == inputs_num_heads[2], - true, - common::errors::InvalidArgument( - "The num_heads of k must be equal to the num_heads of v when v " - "is not none." - "But received num_heads of k is %d, num_heads of v is %d", - inputs_num_heads[1], - inputs_num_heads[2])); - } - // rotary position embedding Q - int64_t batch_stride_q = time_major ? q.strides()[1] : q.strides()[0]; - int64_t seq_stride_q = time_major ? q.strides()[0] : q.strides()[1]; - - kernel_func<<>>(ins_data, - sin_cos_data, - position_ids_data, - flag_sin_cos, - sign, - batch_size, - seq_len, - inputs_num_heads[0], - head_dim, - batch_stride_q, - seq_stride_q, - 1, - div_c, - rotary_emb_base, - outs_data); - - // rotary position embedding K,V - phi::Array input_kv{ins_data[1], ins_data[2], nullptr}; - phi::Array out_kv{outs_data[1], outs_data[2], nullptr}; - int64_t batch_stride_kv = time_major - ? 
inputs_num_heads[1] * head_dim - : seq_len * inputs_num_heads[1] * head_dim; - int64_t seq_stride_kv = time_major - ? batch_size * inputs_num_heads[1] * head_dim - : inputs_num_heads[1] * head_dim; - - kernel_func<<>>(input_kv, - sin_cos_data, - position_ids_data, - flag_sin_cos, - sign, - batch_size, - seq_len, - inputs_num_heads[1], - head_dim, - batch_stride_kv, - seq_stride_kv, - num_inputs - 1, - div_c, - rotary_emb_base, - out_kv); - } -} -} // namespace fusion -} // namespace phi - -PD_REGISTER_PLUGIN_KERNEL(fused_rotary_position_embedding, - iluvatar_gpu, - ALL_LAYOUT, - phi::fusion::FusedRopeKernel, - float, - phi::dtype::float16, - phi::dtype::bfloat16){}; diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_kernel_register.cu b/backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_kernel_register.cu new file mode 100644 index 00000000000..23ba04fff9c --- /dev/null +++ b/backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_kernel_register.cu @@ -0,0 +1,24 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/fusion/gpu/fused_rope_kernel.cu" //NOLINT + +PD_CUSTOM_KERNEL_REGISTER(fused_rotary_position_embedding, + iluvatar_gpu, + ALL_LAYOUT, + phi::fusion::FusedRopeKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16){}; diff --git a/backends/iluvatar_gpu/tests/disabled_test.txt b/backends/iluvatar_gpu/tests/disabled_test.txt index 5f0c0962e05..7bae3c88524 100644 --- a/backends/iluvatar_gpu/tests/disabled_test.txt +++ b/backends/iluvatar_gpu/tests/disabled_test.txt @@ -548,6 +548,5 @@ test_linear_interp_v2_op.py test_nearest_interp_v2_op.py test_poisson_op.py test_rrelu_op.py -test_match_matrix_tensor_op.py -test_set_grad.py test_batched_gemm.py +test_tensor.py diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index af64a663c2a..598fb608e0c 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -635,6 +635,7 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/uniform_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/fp8_quant_blockwise_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_rope_grad_kernel.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_rope_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_seqpool_cvm_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_seqpool_cvm_grad_kernel.cu diff --git a/backends/metax_gpu/change_patch.sh b/backends/metax_gpu/change_patch.sh index f29986a3780..7ac600a0da7 100644 --- a/backends/metax_gpu/change_patch.sh +++ b/backends/metax_gpu/change_patch.sh @@ -22,7 +22,7 @@ mv mcEigen_3.4.0_paddle_final eigen3 cd .. 
cp -r patch/eigen3/ ../../Paddle/third_party/eigen3 rm -r patch/eigen3 -cp patch/tmp/mixed_vector* ../../Paddle/paddle/phi/core +# cp patch/tmp/mixed_vector* ../../Paddle/paddle/phi/core cd ../../Paddle/ git apply --verbose ../backends/metax_gpu/patch/paddle.patch cd - diff --git a/backends/metax_gpu/kernels/fusion/fused_rope_grad_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_rope_grad_kernel_register.cu index c99e18f9a48..8f92d3df86f 100644 --- a/backends/metax_gpu/kernels/fusion/fused_rope_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/fusion/fused_rope_grad_kernel_register.cu @@ -15,187 +15,14 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/funcs/aligned_vector.h" +#include "paddle/phi/kernels/fusion/gpu/fused_rope_grad_kernel.cu" //NOLINT #include "paddle/phi/kernels/fusion/gpu/fused_rope_utils.h" - -namespace phi { -namespace fusion { - -template -void FusedRopeGradKernel(const Context& dev_ctx, - const paddle::optional& sin, - const paddle::optional& cos, - const paddle::optional& position_ids, - const DenseTensor& dout_q, - const paddle::optional& dout_k, - const paddle::optional& dout_v, - bool use_neox_rotary_style, - bool time_major, - float rotary_emb_base, - DenseTensor* dq, - DenseTensor* dk, - DenseTensor* dv) { - int64_t numel = dout_q.numel(); - if (numel <= 0) return; - dev_ctx.template Alloc(dq); - - phi::Array inputs_num_heads; - // small size for broadcast - auto batch_size = time_major ? dout_q.dims()[1] : dout_q.dims()[0]; - auto seq_len = time_major ? dout_q.dims()[0] : dout_q.dims()[1]; - inputs_num_heads[0] = dout_q.dims()[2]; - auto head_dim = dout_q.dims()[3]; - PADDLE_ENFORCE_NE(head_dim % 2, - 1, - common::errors::InvalidArgument( - "The head_dim of input must be a multiple of 2.")); - - constexpr const int vec_size = 2; - - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size); - - int64_t grid = config.block_per_grid.x; - int64_t block = config.thread_per_block.x; - auto stream = dev_ctx.stream(); - - phi::Array outs_data; - phi::Array ins_data; - phi::Array sin_cos_data; - const int64_t* position_ids_data = NULL; - - ins_data[0] = dout_q.data(); - outs_data[0] = dq->data(); - int num_inputs = 1; - - if (dout_k) { - dev_ctx.template Alloc(dk); - outs_data[num_inputs] = dk->data(); - ins_data[num_inputs] = dout_k->data(); - inputs_num_heads[num_inputs] = dk->dims()[2]; - num_inputs++; - } - - if (dout_v) { - dev_ctx.template Alloc(dv); - outs_data[num_inputs] = dv->data(); - ins_data[num_inputs] = dout_v->data(); - inputs_num_heads[num_inputs] = dv->dims()[2]; - num_inputs++; - } - - using MPType = typename phi::dtype::MPTypeTrait::Type; - MPType div_c = static_cast(1.0f / head_dim); - - bool flag_sin_cos = false; - if (sin.get_ptr() && cos.get_ptr()) { - sin_cos_data[0] = sin->data(); - sin_cos_data[1] = cos->data(); - - flag_sin_cos = true; - - if (position_ids) { - position_ids_data = position_ids->data(); - } - } - - bool is_same_num_heads = true; - auto prev_num_heads = inputs_num_heads[0]; - for (int i = 1; i < num_inputs; ++i) { - if (prev_num_heads != inputs_num_heads[i]) { - is_same_num_heads = false; - break; - } - prev_num_heads = inputs_num_heads[i]; - } - - int sign = -1; - - VectorizedFusedRopeCudaKernelFunc kernel_func = - use_neox_rotary_style - ? 
VectorizedFusedRopeWithRotateEveryTwoKernel - : VectorizedFusedRopeWithRotateHalfKernel; - - if (is_same_num_heads) { - int64_t batch_stride = - time_major ? dout_q.strides()[1] : dout_q.strides()[0]; - int64_t seq_stride = time_major ? dout_q.strides()[0] : dout_q.strides()[1]; - kernel_func<<>>(ins_data, - sin_cos_data, - position_ids_data, - flag_sin_cos, - sign, - batch_size, - seq_len, - inputs_num_heads[0], - head_dim, - batch_stride, - seq_stride, - num_inputs, - div_c, - rotary_emb_base, - outs_data); - - } else { - // rotary position embedding Q - int64_t batch_stride_q = - time_major ? dout_q.strides()[1] : dout_q.strides()[0]; - int64_t seq_stride_q = - time_major ? dout_q.strides()[0] : dout_q.strides()[1]; - kernel_func<<>>(ins_data, - sin_cos_data, - position_ids_data, - flag_sin_cos, - sign, - batch_size, - seq_len, - inputs_num_heads[0], - head_dim, - batch_stride_q, - seq_stride_q, - 1, - div_c, - rotary_emb_base, - outs_data); - - // rotary position embedding K,V - int64_t batch_stride_kv = time_major - ? inputs_num_heads[1] * head_dim - : seq_len * inputs_num_heads[1] * head_dim; - int64_t seq_stride_kv = time_major - ? batch_size * inputs_num_heads[1] * head_dim - : inputs_num_heads[1] * head_dim; - - phi::Array input_kv{ins_data[1], ins_data[2], nullptr}; - phi::Array out_kv{outs_data[1], outs_data[2], nullptr}; - kernel_func<<>>(input_kv, - sin_cos_data, - position_ids_data, - flag_sin_cos, - sign, - batch_size, - seq_len, - inputs_num_heads[1], - head_dim, - batch_stride_kv, - seq_stride_kv, - num_inputs - 1, - div_c, - rotary_emb_base, - out_kv); - } -} - -} // namespace fusion -} // namespace phi - -PD_REGISTER_PLUGIN_KERNEL(fused_rotary_position_embedding_grad, +PD_CUSTOM_KERNEL_REGISTER(fused_rotary_position_embedding_grad, metax_gpu, ALL_LAYOUT, phi::fusion::FusedRopeGradKernel, float, - phi::dtype::float16, - phi::dtype::bfloat16){}; + double, + phi::float16, + phi::bfloat16){}; diff --git a/backends/metax_gpu/kernels/fusion/fused_rope_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_rope_kernel_register.cu index 05c5942bed1..6be386d316d 100644 --- a/backends/metax_gpu/kernels/fusion/fused_rope_kernel_register.cu +++ b/backends/metax_gpu/kernels/fusion/fused_rope_kernel_register.cu @@ -13,276 +13,11 @@ // limitations under the License. #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/backends/gpu/gpu_launch_config.h" -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" +#include "paddle/phi/kernels/fusion/gpu/fused_rope_kernel.cu" //NOLINT #include "paddle/phi/kernels/fusion/gpu/fused_rope_utils.h" - -namespace phi { -namespace fusion { - -template -void FusedRopeKernel(const Context& dev_ctx, - const DenseTensor& q, - const paddle::optional& k, - const paddle::optional& v, - const paddle::optional& sin, - const paddle::optional& cos, - const paddle::optional& position_ids, - bool use_neox_rotary_style, - bool time_major, - float rotary_emb_base, - DenseTensor* out_q, - DenseTensor* out_k, - DenseTensor* out_v) { - int64_t numel = q.numel(); - if (numel <= 0) return; - dev_ctx.template Alloc(out_q); - - phi::Array inputs_num_heads; - - // q.shape: [seq_len, batch_size, num_heads, head_dim] if time_major else - // [batch_size, seq_len, num_heads, head_dim] - auto batch_size = time_major ? q.dims()[1] : q.dims()[0]; - auto seq_len = time_major ? 
q.dims()[0] : q.dims()[1]; - inputs_num_heads[0] = q.dims()[2]; - auto head_dim = q.dims()[3]; - - PADDLE_ENFORCE_EQ(head_dim % 2, - 0, - common::errors::InvalidArgument( - "The head_dim of input must be a multiple of 2.")); - - constexpr const int vec_size = 2; - - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size); - - int64_t grid = config.block_per_grid.x; - int64_t block = config.thread_per_block.x; - auto stream = dev_ctx.stream(); - - phi::Array outs_data; - phi::Array ins_data; - phi::Array sin_cos_data; - const int64_t* position_ids_data = NULL; - - ins_data[0] = q.data(); - outs_data[0] = out_q->data(); - int num_inputs = 1; - - if (k) { - dev_ctx.template Alloc(out_k); - ins_data[num_inputs] = k->data(); - outs_data[num_inputs] = out_k->data(); - inputs_num_heads[num_inputs] = k->dims()[2]; - num_inputs++; - } - - if (v) { - dev_ctx.template Alloc(out_v); - ins_data[num_inputs] = v->data(); - outs_data[num_inputs] = out_v->data(); - inputs_num_heads[num_inputs] = v->dims()[2]; - num_inputs++; - } - - using MPType = typename phi::dtype::MPTypeTrait::Type; - MPType div_c = static_cast(1.0f / head_dim); - - bool flag_sin_cos = false; - - if (sin.get_ptr() && cos.get_ptr()) { - PADDLE_ENFORCE_EQ(sin.get_ptr()->dims(), - cos.get_ptr()->dims(), - common::errors::InvalidArgument( - "The dims of sin and cos must be the same. But " - "received sin's dims is {%s}, cos's dims is {%s}.", - sin.get_ptr()->dims(), - cos.get_ptr()->dims())); - - auto sin_dims = sin.get_ptr()->dims(); - int dims_size = sin_dims.size(); - PADDLE_ENFORCE_EQ((dims_size == 2 || dims_size == 4), - true, - common::errors::InvalidArgument( - "The dims of sin and cos is expected to " - "be 2 or 4, but received %d.", - dims_size)); - if (dims_size == 4) { - // sin.shape: [1, seq_len, 1, head_dim] - PADDLE_ENFORCE_EQ( - (sin_dims[0] == 1 && sin_dims[2] == 1), - true, - common::errors::InvalidArgument( - "The batch_size and num_heads of sin and cos must be 1.")); - } - int sin_seq_len_dim = (dims_size) == 4 ? 1 : 0; - - if (position_ids) { - PADDLE_ENFORCE_EQ( - (sin_dims[dims_size - 1] == head_dim && - sin_dims[sin_seq_len_dim] >= seq_len), - true, - common::errors::InvalidArgument( - "The seq_len of sin and cos must be greater than or equal to " - "this of q. The head_dim of sin and cos must be the same as this " - "of q. But received sin's " - "shape is {%s}, q's shape is {%s}.", - sin_dims, - q.dims())); - - auto position_ids_dims = position_ids.get_ptr()->dims(); - PADDLE_ENFORCE_EQ(position_ids_dims.size(), - 2, - common::errors::InvalidArgument( - "The dims of position_ids is expected to " - "be 2, but received %d.", - position_ids_dims.size())); - - PADDLE_ENFORCE_EQ( - (position_ids_dims[0] == batch_size && - position_ids_dims[1] == seq_len), - true, - common::errors::InvalidArgument( - "The batch_size and seq_len of position_ids must be the same as " - "those of q. But received position_ids's " - "shape is {%s}, q's shape is {%s}.", - position_ids_dims, - q.dims())); - - position_ids_data = position_ids->data(); - } else { - PADDLE_ENFORCE_EQ( - (sin_dims[dims_size - 1] == head_dim && - sin_dims[sin_seq_len_dim] == seq_len), - true, - common::errors::InvalidArgument( - "The seq_len and head_dim of sin and cos " - "must be the same as those of q. 
But received sin's " - "shape is {%s}, q's shape is {%s}.", - sin_dims, - q.dims())); - } - - sin_cos_data[0] = sin->data(); - sin_cos_data[1] = cos->data(); - - flag_sin_cos = true; - } - - bool is_same_num_heads = true; - auto prev_num_heads = inputs_num_heads[0]; - for (int i = 1; i < num_inputs; ++i) { - if (prev_num_heads != inputs_num_heads[i]) { - is_same_num_heads = false; - break; - } - prev_num_heads = inputs_num_heads[i]; - } - - int sign = 1; - VectorizedFusedRopeCudaKernelFunc kernel_func = - use_neox_rotary_style - ? VectorizedFusedRopeWithRotateEveryTwoKernel - : VectorizedFusedRopeWithRotateHalfKernel; - - if (is_same_num_heads) { - int64_t batch_stride = time_major ? q.strides()[1] : q.strides()[0]; - int64_t seq_stride = time_major ? q.strides()[0] : q.strides()[1]; - kernel_func<<>>(ins_data, - sin_cos_data, - position_ids_data, - flag_sin_cos, - sign, - batch_size, - seq_len, - inputs_num_heads[0], - head_dim, - batch_stride, - seq_stride, - num_inputs, - div_c, - rotary_emb_base, - outs_data); - } else { - // Multi Query Attention (MQA) or Group Query Attention (GQA) - PADDLE_ENFORCE_EQ( - (inputs_num_heads[0] != inputs_num_heads[num_inputs - 1]) && - (inputs_num_heads[0] % inputs_num_heads[num_inputs - 1] == 0), - true, - common::errors::InvalidArgument( - "The MQA or GQA mode is entered, when the number of heads of qkv " - "is not exactly the same two by two. This mode requires " - "num_heads of q to be divisible by k,v." - "But received num_heads of q is %d, num_heads of k,v is %d", - inputs_num_heads[0], - inputs_num_heads[num_inputs - 1])); - - if (k.get_ptr() && v.get_ptr()) { - PADDLE_ENFORCE_EQ( - inputs_num_heads[1] == inputs_num_heads[2], - true, - common::errors::InvalidArgument( - "The num_heads of k must be equal to the num_heads of v when v " - "is not none." - "But received num_heads of k is %d, num_heads of v is %d", - inputs_num_heads[1], - inputs_num_heads[2])); - } - // rotary position embedding Q - int64_t batch_stride_q = time_major ? q.strides()[1] : q.strides()[0]; - int64_t seq_stride_q = time_major ? q.strides()[0] : q.strides()[1]; - - kernel_func<<>>(ins_data, - sin_cos_data, - position_ids_data, - flag_sin_cos, - sign, - batch_size, - seq_len, - inputs_num_heads[0], - head_dim, - batch_stride_q, - seq_stride_q, - 1, - div_c, - rotary_emb_base, - outs_data); - - // rotary position embedding K,V - phi::Array input_kv{ins_data[1], ins_data[2], nullptr}; - phi::Array out_kv{outs_data[1], outs_data[2], nullptr}; - int64_t batch_stride_kv = time_major - ? inputs_num_heads[1] * head_dim - : seq_len * inputs_num_heads[1] * head_dim; - int64_t seq_stride_kv = time_major - ? batch_size * inputs_num_heads[1] * head_dim - : inputs_num_heads[1] * head_dim; - - kernel_func<<>>(input_kv, - sin_cos_data, - position_ids_data, - flag_sin_cos, - sign, - batch_size, - seq_len, - inputs_num_heads[1], - head_dim, - batch_stride_kv, - seq_stride_kv, - num_inputs - 1, - div_c, - rotary_emb_base, - out_kv); - } -} -} // namespace fusion -} // namespace phi - -PD_REGISTER_PLUGIN_KERNEL(fused_rotary_position_embedding, +PD_CUSTOM_KERNEL_REGISTER(fused_rotary_position_embedding, metax_gpu, ALL_LAYOUT, phi::fusion::FusedRopeKernel,
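
For reviewers, a minimal smoke-test sketch of the op these register files expose. This is an assumption-laden illustration, not part of the patch: it presumes a PaddleCustomDevice build with the `iluvatar_gpu` (or `metax_gpu`) plugin installed, and uses Paddle's public `fused_rotary_position_embedding` API with illustrative shapes, dtype, and device string.

```python
# Hypothetical smoke test for the plugin-registered fused rope kernels.
# The device string, dtype, and shapes below are illustrative assumptions.
import paddle
from paddle.incubate.nn.functional import fused_rotary_position_embedding

paddle.set_device("iluvatar_gpu")  # or "metax_gpu", per the installed plugin

batch, seq_len, num_heads, head_dim = 2, 8, 4, 64
q = paddle.randn([batch, seq_len, num_heads, head_dim], dtype="float16")
k = paddle.randn([batch, seq_len, num_heads, head_dim], dtype="float16")
v = paddle.randn([batch, seq_len, num_heads, head_dim], dtype="float16")

# time_major=False matches the [batch, seq_len, num_heads, head_dim] layout
# handled by the upstream phi kernel these register files now reuse.
out_q, out_k, out_v = fused_rotary_position_embedding(
    q, k, v, use_neox_rotary_style=True, time_major=False)
print(out_q.shape)  # [2, 8, 4, 64]
```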