diff --git a/Paddle b/Paddle
index d7597815b16..2701651b591 160000
--- a/Paddle
+++ b/Paddle
@@ -1 +1 @@
-Subproject commit d7597815b16e2a72f927b2f15f8477b196098bf0
+Subproject commit 2701651b5912cf97d4ecd3a20444b2dbce3a0b7f
diff --git a/backends/iluvatar_gpu/CMakeLists.txt b/backends/iluvatar_gpu/CMakeLists.txt
index cf3bee126b2..e6fd433c723 100644
--- a/backends/iluvatar_gpu/CMakeLists.txt
+++ b/backends/iluvatar_gpu/CMakeLists.txt
@@ -112,7 +112,7 @@ file(
   ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cuda_driver.cc
   # Core
   ${PADDLE_SOURCE_DIR}/paddle/phi/core/enforce.cc
-  ${PADDLE_SOURCE_DIR}/paddle/phi/core/flags.cc
+  # ${PADDLE_SOURCE_DIR}/paddle/phi/core/flags.cc
   ${PADDLE_SOURCE_DIR}/paddle/phi/core/mixed_vector.cc
   ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cusparse.cc
   # kernels/funcs
@@ -128,6 +128,7 @@ file(
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/expand_modality_expert_id_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/cal_aux_loss_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/cal_aux_loss_grad_kernel.cu
+  ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/batched_gemm.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/yolo_box_post_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/spectral_norm_grad_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/spectral_norm_kernel.cu
@@ -876,7 +877,7 @@ file(
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_stack_transpose_quant_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_rope_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_swiglu_weighted_bwd_kernel.cu
-  ${PADDLE_SOURCE_DIR}/paddle/phi/core/flags.cc
+  # ${PADDLE_SOURCE_DIR}/paddle/phi/core/flags.cc
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/math_function.cc
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/log_softmax_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu
diff --git a/backends/iluvatar_gpu/common/cuda_flags.cc b/backends/iluvatar_gpu/common/cuda_flags.cc
index 31209fb0e98..a30896f0ff2 100644
--- a/backends/iluvatar_gpu/common/cuda_flags.cc
+++ b/backends/iluvatar_gpu/common/cuda_flags.cc
@@ -277,3 +277,19 @@ PHI_DEFINE_EXPORTED_bool(
     flash_attn_available,
     true,
     "Whether flash attention is available on the current device.");
+
+/**
+ * CUDNN related FLAG
+ * Name: FLAGS_conv_workspace_size_limit
+ * Since Version: 0.13.0
+ * Value Range: int64, default=1024 (MB)
+ * Example:
+ * Note: The internal function of cuDNN obtains the fastest matching algorithm
+ *       within this memory limit. Faster algorithms can usually be chosen in
+ *       a larger workspace, but a larger workspace can also significantly
+ *       increase memory use. Users need to balance memory and speed.
+ */
+PHI_DEFINE_EXPORTED_int64(conv_workspace_size_limit,
+                          1024,
+                          "cuDNN convolution workspace limit in MB unit.");
diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/batched_gemm_kernel_register.cu b/backends/iluvatar_gpu/kernels/cuda_kernels/batched_gemm_kernel_register.cu
new file mode 100644
index 00000000000..46fb616efc5
--- /dev/null
+++ b/backends/iluvatar_gpu/kernels/cuda_kernels/batched_gemm_kernel_register.cu
@@ -0,0 +1,23 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/legacy/gpu/batched_gemm.h" + +PD_CUSTOM_KERNEL_REGISTER(batched_gemm, + iluvatar_gpu, + ALL_LAYOUT, + phi::BatchedGEMM, + float, + phi::bfloat16) {} diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_grad_kernel.cu b/backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_grad_kernel.cu deleted file mode 100644 index 823d7008d42..00000000000 --- a/backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_grad_kernel.cu +++ /dev/null @@ -1,200 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/backends/gpu/gpu_launch_config.h" -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/fused_rope_utils.h" - -namespace phi { -namespace fusion { - -template -void FusedRopeGradKernel(const Context& dev_ctx, - const paddle::optional& sin, - const paddle::optional& cos, - const paddle::optional& position_ids, - const DenseTensor& dout_q, - const paddle::optional& dout_k, - const paddle::optional& dout_v, - bool use_neox_rotary_style, - bool time_major, - float rotary_emb_base, - DenseTensor* dq, - DenseTensor* dk, - DenseTensor* dv) { - int64_t numel = dout_q.numel(); - if (numel <= 0) return; - dev_ctx.template Alloc(dq); - - phi::Array inputs_num_heads; - // small size for broadcast - auto batch_size = time_major ? dout_q.dims()[1] : dout_q.dims()[0]; - auto seq_len = time_major ? 
dout_q.dims()[0] : dout_q.dims()[1]; - inputs_num_heads[0] = dout_q.dims()[2]; - auto head_dim = dout_q.dims()[3]; - PADDLE_ENFORCE_NE(head_dim % 2, - 1, - common::errors::InvalidArgument( - "The head_dim of input must be a multiple of 2.")); - - constexpr const int vec_size = 2; - - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size); - - int64_t grid = config.block_per_grid.x; - int64_t block = config.thread_per_block.x; - auto stream = dev_ctx.stream(); - - phi::Array outs_data; - phi::Array ins_data; - phi::Array sin_cos_data; - const int64_t* position_ids_data = NULL; - - ins_data[0] = dout_q.data(); - outs_data[0] = dq->data(); - int num_inputs = 1; - - if (dout_k) { - dev_ctx.template Alloc(dk); - outs_data[num_inputs] = dk->data(); - ins_data[num_inputs] = dout_k->data(); - inputs_num_heads[num_inputs] = dk->dims()[2]; - num_inputs++; - } - - if (dout_v) { - dev_ctx.template Alloc(dv); - outs_data[num_inputs] = dv->data(); - ins_data[num_inputs] = dout_v->data(); - inputs_num_heads[num_inputs] = dv->dims()[2]; - num_inputs++; - } - - using MPType = typename phi::dtype::MPTypeTrait::Type; - MPType div_c = static_cast(1.0f / head_dim); - - bool flag_sin_cos = false; - if (sin.get_ptr() && cos.get_ptr()) { - sin_cos_data[0] = sin->data(); - sin_cos_data[1] = cos->data(); - - flag_sin_cos = true; - - if (position_ids) { - position_ids_data = position_ids->data(); - } - } - - bool is_same_num_heads = true; - auto prev_num_heads = inputs_num_heads[0]; - for (int i = 1; i < num_inputs; ++i) { - if (prev_num_heads != inputs_num_heads[i]) { - is_same_num_heads = false; - break; - } - prev_num_heads = inputs_num_heads[i]; - } - - int sign = -1; - - VectorizedFusedRopeCudaKernelFunc kernel_func = - use_neox_rotary_style - ? VectorizedFusedRopeWithRotateEveryTwoKernel - : VectorizedFusedRopeWithRotateHalfKernel; - - if (is_same_num_heads) { - int64_t batch_stride = - time_major ? dout_q.strides()[1] : dout_q.strides()[0]; - int64_t seq_stride = time_major ? dout_q.strides()[0] : dout_q.strides()[1]; - kernel_func<<>>(ins_data, - sin_cos_data, - position_ids_data, - flag_sin_cos, - sign, - batch_size, - seq_len, - inputs_num_heads[0], - head_dim, - batch_stride, - seq_stride, - num_inputs, - div_c, - rotary_emb_base, - outs_data); - - } else { - // rotary position embedding Q - int64_t batch_stride_q = - time_major ? dout_q.strides()[1] : dout_q.strides()[0]; - int64_t seq_stride_q = - time_major ? dout_q.strides()[0] : dout_q.strides()[1]; - kernel_func<<>>(ins_data, - sin_cos_data, - position_ids_data, - flag_sin_cos, - sign, - batch_size, - seq_len, - inputs_num_heads[0], - head_dim, - batch_stride_q, - seq_stride_q, - 1, - div_c, - rotary_emb_base, - outs_data); - - // rotary position embedding K,V - int64_t batch_stride_kv = time_major - ? inputs_num_heads[1] * head_dim - : seq_len * inputs_num_heads[1] * head_dim; - int64_t seq_stride_kv = time_major - ? 
batch_size * inputs_num_heads[1] * head_dim - : inputs_num_heads[1] * head_dim; - - phi::Array input_kv{ins_data[1], ins_data[2], nullptr}; - phi::Array out_kv{outs_data[1], outs_data[2], nullptr}; - kernel_func<<>>(input_kv, - sin_cos_data, - position_ids_data, - flag_sin_cos, - sign, - batch_size, - seq_len, - inputs_num_heads[1], - head_dim, - batch_stride_kv, - seq_stride_kv, - num_inputs - 1, - div_c, - rotary_emb_base, - out_kv); - } -} - -} // namespace fusion -} // namespace phi - -PD_REGISTER_PLUGIN_KERNEL(fused_rotary_position_embedding_grad, - iluvatar_gpu, - ALL_LAYOUT, - phi::fusion::FusedRopeGradKernel, - float, - phi::dtype::float16, - phi::dtype::bfloat16){}; diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_grad_kernel_register.cu b/backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_grad_kernel_register.cu new file mode 100644 index 00000000000..64d4083f0be --- /dev/null +++ b/backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_grad_kernel_register.cu @@ -0,0 +1,24 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/fusion/gpu/fused_rope_grad_kernel.cu" //NOLINT + +PD_CUSTOM_KERNEL_REGISTER(fused_rotary_position_embedding_grad, + iluvatar_gpu, + ALL_LAYOUT, + phi::fusion::FusedRopeGradKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16){}; diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_kernel.cu b/backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_kernel.cu deleted file mode 100644 index c4f24acdbcb..00000000000 --- a/backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_kernel.cu +++ /dev/null @@ -1,291 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/backends/gpu/gpu_launch_config.h" -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/fused_rope_utils.h" - -namespace phi { -namespace fusion { - -template -void FusedRopeKernel(const Context& dev_ctx, - const DenseTensor& q, - const paddle::optional& k, - const paddle::optional& v, - const paddle::optional& sin, - const paddle::optional& cos, - const paddle::optional& position_ids, - bool use_neox_rotary_style, - bool time_major, - float rotary_emb_base, - DenseTensor* out_q, - DenseTensor* out_k, - DenseTensor* out_v) { - int64_t numel = q.numel(); - if (numel <= 0) return; - dev_ctx.template Alloc(out_q); - - phi::Array inputs_num_heads; - - // q.shape: [seq_len, batch_size, num_heads, head_dim] if time_major else - // [batch_size, seq_len, num_heads, head_dim] - auto batch_size = time_major ? q.dims()[1] : q.dims()[0]; - auto seq_len = time_major ? q.dims()[0] : q.dims()[1]; - inputs_num_heads[0] = q.dims()[2]; - auto head_dim = q.dims()[3]; - - PADDLE_ENFORCE_EQ(head_dim % 2, - 0, - common::errors::InvalidArgument( - "The head_dim of input must be a multiple of 2.")); - - constexpr const int vec_size = 2; - - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size); - - int64_t grid = config.block_per_grid.x; - int64_t block = config.thread_per_block.x; - auto stream = dev_ctx.stream(); - - phi::Array outs_data; - phi::Array ins_data; - phi::Array sin_cos_data; - const int64_t* position_ids_data = NULL; - - ins_data[0] = q.data(); - outs_data[0] = out_q->data(); - int num_inputs = 1; - - if (k) { - dev_ctx.template Alloc(out_k); - ins_data[num_inputs] = k->data(); - outs_data[num_inputs] = out_k->data(); - inputs_num_heads[num_inputs] = k->dims()[2]; - num_inputs++; - } - - if (v) { - dev_ctx.template Alloc(out_v); - ins_data[num_inputs] = v->data(); - outs_data[num_inputs] = out_v->data(); - inputs_num_heads[num_inputs] = v->dims()[2]; - num_inputs++; - } - - using MPType = typename phi::dtype::MPTypeTrait::Type; - MPType div_c = static_cast(1.0f / head_dim); - - bool flag_sin_cos = false; - - if (sin.get_ptr() && cos.get_ptr()) { - PADDLE_ENFORCE_EQ(sin.get_ptr()->dims(), - cos.get_ptr()->dims(), - common::errors::InvalidArgument( - "The dims of sin and cos must be the same. But " - "received sin's dims is {%s}, cos's dims is {%s}.", - sin.get_ptr()->dims(), - cos.get_ptr()->dims())); - - auto sin_dims = sin.get_ptr()->dims(); - int dims_size = sin_dims.size(); - PADDLE_ENFORCE_EQ((dims_size == 2 || dims_size == 4), - true, - common::errors::InvalidArgument( - "The dims of sin and cos is expected to " - "be 2 or 4, but received %d.", - dims_size)); - if (dims_size == 4) { - // sin.shape: [1, seq_len, 1, head_dim] - PADDLE_ENFORCE_EQ( - (sin_dims[0] == 1 && sin_dims[2] == 1), - true, - common::errors::InvalidArgument( - "The batch_size and num_heads of sin and cos must be 1.")); - } - int sin_seq_len_dim = (dims_size) == 4 ? 1 : 0; - - if (position_ids) { - PADDLE_ENFORCE_EQ( - (sin_dims[dims_size - 1] == head_dim && - sin_dims[sin_seq_len_dim] >= seq_len), - true, - common::errors::InvalidArgument( - "The seq_len of sin and cos must be greater than or equal to " - "this of q. The head_dim of sin and cos must be the same as this " - "of q. 
But received sin's " - "shape is {%s}, q's shape is {%s}.", - sin_dims, - q.dims())); - - auto position_ids_dims = position_ids.get_ptr()->dims(); - PADDLE_ENFORCE_EQ(position_ids_dims.size(), - 2, - common::errors::InvalidArgument( - "The dims of position_ids is expected to " - "be 2, but received %d.", - position_ids_dims.size())); - - PADDLE_ENFORCE_EQ( - (position_ids_dims[0] == batch_size && - position_ids_dims[1] == seq_len), - true, - common::errors::InvalidArgument( - "The batch_size and seq_len of position_ids must be the same as " - "those of q. But received position_ids's " - "shape is {%s}, q's shape is {%s}.", - position_ids_dims, - q.dims())); - - position_ids_data = position_ids->data(); - } else { - PADDLE_ENFORCE_EQ( - (sin_dims[dims_size - 1] == head_dim && - sin_dims[sin_seq_len_dim] == seq_len), - true, - common::errors::InvalidArgument( - "The seq_len and head_dim of sin and cos " - "must be the same as those of q. But received sin's " - "shape is {%s}, q's shape is {%s}.", - sin_dims, - q.dims())); - } - - sin_cos_data[0] = sin->data(); - sin_cos_data[1] = cos->data(); - - flag_sin_cos = true; - } - - bool is_same_num_heads = true; - auto prev_num_heads = inputs_num_heads[0]; - for (int i = 1; i < num_inputs; ++i) { - if (prev_num_heads != inputs_num_heads[i]) { - is_same_num_heads = false; - break; - } - prev_num_heads = inputs_num_heads[i]; - } - - int sign = 1; - VectorizedFusedRopeCudaKernelFunc kernel_func = - use_neox_rotary_style - ? VectorizedFusedRopeWithRotateEveryTwoKernel - : VectorizedFusedRopeWithRotateHalfKernel; - - if (is_same_num_heads) { - int64_t batch_stride = time_major ? q.strides()[1] : q.strides()[0]; - int64_t seq_stride = time_major ? q.strides()[0] : q.strides()[1]; - kernel_func<<>>(ins_data, - sin_cos_data, - position_ids_data, - flag_sin_cos, - sign, - batch_size, - seq_len, - inputs_num_heads[0], - head_dim, - batch_stride, - seq_stride, - num_inputs, - div_c, - rotary_emb_base, - outs_data); - } else { - // Multi Query Attention (MQA) or Group Query Attention (GQA) - PADDLE_ENFORCE_EQ( - (inputs_num_heads[0] != inputs_num_heads[num_inputs - 1]) && - (inputs_num_heads[0] % inputs_num_heads[num_inputs - 1] == 0), - true, - common::errors::InvalidArgument( - "The MQA or GQA mode is entered, when the number of heads of qkv " - "is not exactly the same two by two. This mode requires " - "num_heads of q to be divisible by k,v." - "But received num_heads of q is %d, num_heads of k,v is %d", - inputs_num_heads[0], - inputs_num_heads[num_inputs - 1])); - - if (k.get_ptr() && v.get_ptr()) { - PADDLE_ENFORCE_EQ( - inputs_num_heads[1] == inputs_num_heads[2], - true, - common::errors::InvalidArgument( - "The num_heads of k must be equal to the num_heads of v when v " - "is not none." - "But received num_heads of k is %d, num_heads of v is %d", - inputs_num_heads[1], - inputs_num_heads[2])); - } - // rotary position embedding Q - int64_t batch_stride_q = time_major ? q.strides()[1] : q.strides()[0]; - int64_t seq_stride_q = time_major ? q.strides()[0] : q.strides()[1]; - - kernel_func<<>>(ins_data, - sin_cos_data, - position_ids_data, - flag_sin_cos, - sign, - batch_size, - seq_len, - inputs_num_heads[0], - head_dim, - batch_stride_q, - seq_stride_q, - 1, - div_c, - rotary_emb_base, - outs_data); - - // rotary position embedding K,V - phi::Array input_kv{ins_data[1], ins_data[2], nullptr}; - phi::Array out_kv{outs_data[1], outs_data[2], nullptr}; - int64_t batch_stride_kv = time_major - ? 
inputs_num_heads[1] * head_dim - : seq_len * inputs_num_heads[1] * head_dim; - int64_t seq_stride_kv = time_major - ? batch_size * inputs_num_heads[1] * head_dim - : inputs_num_heads[1] * head_dim; - - kernel_func<<>>(input_kv, - sin_cos_data, - position_ids_data, - flag_sin_cos, - sign, - batch_size, - seq_len, - inputs_num_heads[1], - head_dim, - batch_stride_kv, - seq_stride_kv, - num_inputs - 1, - div_c, - rotary_emb_base, - out_kv); - } -} -} // namespace fusion -} // namespace phi - -PD_REGISTER_PLUGIN_KERNEL(fused_rotary_position_embedding, - iluvatar_gpu, - ALL_LAYOUT, - phi::fusion::FusedRopeKernel, - float, - phi::dtype::float16, - phi::dtype::bfloat16){}; diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_kernel_register.cu b/backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_kernel_register.cu new file mode 100644 index 00000000000..23ba04fff9c --- /dev/null +++ b/backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_kernel_register.cu @@ -0,0 +1,24 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/fusion/gpu/fused_rope_kernel.cu" //NOLINT + +PD_CUSTOM_KERNEL_REGISTER(fused_rotary_position_embedding, + iluvatar_gpu, + ALL_LAYOUT, + phi::fusion::FusedRopeKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16){}; diff --git a/backends/iluvatar_gpu/tests/disabled_test.txt b/backends/iluvatar_gpu/tests/disabled_test.txt index 5f0c0962e05..7bae3c88524 100644 --- a/backends/iluvatar_gpu/tests/disabled_test.txt +++ b/backends/iluvatar_gpu/tests/disabled_test.txt @@ -548,6 +548,5 @@ test_linear_interp_v2_op.py test_nearest_interp_v2_op.py test_poisson_op.py test_rrelu_op.py -test_match_matrix_tensor_op.py -test_set_grad.py test_batched_gemm.py +test_tensor.py diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index af64a663c2a..598fb608e0c 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -635,6 +635,7 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/uniform_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/fp8_quant_blockwise_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_rope_grad_kernel.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_rope_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_seqpool_cvm_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_seqpool_cvm_grad_kernel.cu diff --git a/backends/metax_gpu/change_patch.sh b/backends/metax_gpu/change_patch.sh index f29986a3780..7ac600a0da7 100644 --- a/backends/metax_gpu/change_patch.sh +++ b/backends/metax_gpu/change_patch.sh @@ -22,7 +22,7 @@ mv mcEigen_3.4.0_paddle_final eigen3 cd .. 
cp -r patch/eigen3/ ../../Paddle/third_party/eigen3 rm -r patch/eigen3 -cp patch/tmp/mixed_vector* ../../Paddle/paddle/phi/core +# cp patch/tmp/mixed_vector* ../../Paddle/paddle/phi/core cd ../../Paddle/ git apply --verbose ../backends/metax_gpu/patch/paddle.patch cd - diff --git a/backends/metax_gpu/kernels/fusion/fused_rope_grad_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_rope_grad_kernel_register.cu index c99e18f9a48..8f92d3df86f 100644 --- a/backends/metax_gpu/kernels/fusion/fused_rope_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/fusion/fused_rope_grad_kernel_register.cu @@ -15,187 +15,14 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/funcs/aligned_vector.h" +#include "paddle/phi/kernels/fusion/gpu/fused_rope_grad_kernel.cu" //NOLINT #include "paddle/phi/kernels/fusion/gpu/fused_rope_utils.h" - -namespace phi { -namespace fusion { - -template -void FusedRopeGradKernel(const Context& dev_ctx, - const paddle::optional& sin, - const paddle::optional& cos, - const paddle::optional& position_ids, - const DenseTensor& dout_q, - const paddle::optional& dout_k, - const paddle::optional& dout_v, - bool use_neox_rotary_style, - bool time_major, - float rotary_emb_base, - DenseTensor* dq, - DenseTensor* dk, - DenseTensor* dv) { - int64_t numel = dout_q.numel(); - if (numel <= 0) return; - dev_ctx.template Alloc(dq); - - phi::Array inputs_num_heads; - // small size for broadcast - auto batch_size = time_major ? dout_q.dims()[1] : dout_q.dims()[0]; - auto seq_len = time_major ? dout_q.dims()[0] : dout_q.dims()[1]; - inputs_num_heads[0] = dout_q.dims()[2]; - auto head_dim = dout_q.dims()[3]; - PADDLE_ENFORCE_NE(head_dim % 2, - 1, - common::errors::InvalidArgument( - "The head_dim of input must be a multiple of 2.")); - - constexpr const int vec_size = 2; - - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size); - - int64_t grid = config.block_per_grid.x; - int64_t block = config.thread_per_block.x; - auto stream = dev_ctx.stream(); - - phi::Array outs_data; - phi::Array ins_data; - phi::Array sin_cos_data; - const int64_t* position_ids_data = NULL; - - ins_data[0] = dout_q.data(); - outs_data[0] = dq->data(); - int num_inputs = 1; - - if (dout_k) { - dev_ctx.template Alloc(dk); - outs_data[num_inputs] = dk->data(); - ins_data[num_inputs] = dout_k->data(); - inputs_num_heads[num_inputs] = dk->dims()[2]; - num_inputs++; - } - - if (dout_v) { - dev_ctx.template Alloc(dv); - outs_data[num_inputs] = dv->data(); - ins_data[num_inputs] = dout_v->data(); - inputs_num_heads[num_inputs] = dv->dims()[2]; - num_inputs++; - } - - using MPType = typename phi::dtype::MPTypeTrait::Type; - MPType div_c = static_cast(1.0f / head_dim); - - bool flag_sin_cos = false; - if (sin.get_ptr() && cos.get_ptr()) { - sin_cos_data[0] = sin->data(); - sin_cos_data[1] = cos->data(); - - flag_sin_cos = true; - - if (position_ids) { - position_ids_data = position_ids->data(); - } - } - - bool is_same_num_heads = true; - auto prev_num_heads = inputs_num_heads[0]; - for (int i = 1; i < num_inputs; ++i) { - if (prev_num_heads != inputs_num_heads[i]) { - is_same_num_heads = false; - break; - } - prev_num_heads = inputs_num_heads[i]; - } - - int sign = -1; - - VectorizedFusedRopeCudaKernelFunc kernel_func = - use_neox_rotary_style - ? 
VectorizedFusedRopeWithRotateEveryTwoKernel - : VectorizedFusedRopeWithRotateHalfKernel; - - if (is_same_num_heads) { - int64_t batch_stride = - time_major ? dout_q.strides()[1] : dout_q.strides()[0]; - int64_t seq_stride = time_major ? dout_q.strides()[0] : dout_q.strides()[1]; - kernel_func<<>>(ins_data, - sin_cos_data, - position_ids_data, - flag_sin_cos, - sign, - batch_size, - seq_len, - inputs_num_heads[0], - head_dim, - batch_stride, - seq_stride, - num_inputs, - div_c, - rotary_emb_base, - outs_data); - - } else { - // rotary position embedding Q - int64_t batch_stride_q = - time_major ? dout_q.strides()[1] : dout_q.strides()[0]; - int64_t seq_stride_q = - time_major ? dout_q.strides()[0] : dout_q.strides()[1]; - kernel_func<<>>(ins_data, - sin_cos_data, - position_ids_data, - flag_sin_cos, - sign, - batch_size, - seq_len, - inputs_num_heads[0], - head_dim, - batch_stride_q, - seq_stride_q, - 1, - div_c, - rotary_emb_base, - outs_data); - - // rotary position embedding K,V - int64_t batch_stride_kv = time_major - ? inputs_num_heads[1] * head_dim - : seq_len * inputs_num_heads[1] * head_dim; - int64_t seq_stride_kv = time_major - ? batch_size * inputs_num_heads[1] * head_dim - : inputs_num_heads[1] * head_dim; - - phi::Array input_kv{ins_data[1], ins_data[2], nullptr}; - phi::Array out_kv{outs_data[1], outs_data[2], nullptr}; - kernel_func<<>>(input_kv, - sin_cos_data, - position_ids_data, - flag_sin_cos, - sign, - batch_size, - seq_len, - inputs_num_heads[1], - head_dim, - batch_stride_kv, - seq_stride_kv, - num_inputs - 1, - div_c, - rotary_emb_base, - out_kv); - } -} - -} // namespace fusion -} // namespace phi - -PD_REGISTER_PLUGIN_KERNEL(fused_rotary_position_embedding_grad, +PD_CUSTOM_KERNEL_REGISTER(fused_rotary_position_embedding_grad, metax_gpu, ALL_LAYOUT, phi::fusion::FusedRopeGradKernel, float, - phi::dtype::float16, - phi::dtype::bfloat16){}; + double, + phi::float16, + phi::bfloat16){}; diff --git a/backends/metax_gpu/kernels/fusion/fused_rope_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_rope_kernel_register.cu index 05c5942bed1..6be386d316d 100644 --- a/backends/metax_gpu/kernels/fusion/fused_rope_kernel_register.cu +++ b/backends/metax_gpu/kernels/fusion/fused_rope_kernel_register.cu @@ -13,276 +13,11 @@ // limitations under the License. #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/backends/gpu/gpu_launch_config.h" -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" +#include "paddle/phi/kernels/fusion/gpu/fused_rope_kernel.cu" //NOLINT #include "paddle/phi/kernels/fusion/gpu/fused_rope_utils.h" - -namespace phi { -namespace fusion { - -template -void FusedRopeKernel(const Context& dev_ctx, - const DenseTensor& q, - const paddle::optional& k, - const paddle::optional& v, - const paddle::optional& sin, - const paddle::optional& cos, - const paddle::optional& position_ids, - bool use_neox_rotary_style, - bool time_major, - float rotary_emb_base, - DenseTensor* out_q, - DenseTensor* out_k, - DenseTensor* out_v) { - int64_t numel = q.numel(); - if (numel <= 0) return; - dev_ctx.template Alloc(out_q); - - phi::Array inputs_num_heads; - - // q.shape: [seq_len, batch_size, num_heads, head_dim] if time_major else - // [batch_size, seq_len, num_heads, head_dim] - auto batch_size = time_major ? q.dims()[1] : q.dims()[0]; - auto seq_len = time_major ? 
q.dims()[0] : q.dims()[1]; - inputs_num_heads[0] = q.dims()[2]; - auto head_dim = q.dims()[3]; - - PADDLE_ENFORCE_EQ(head_dim % 2, - 0, - common::errors::InvalidArgument( - "The head_dim of input must be a multiple of 2.")); - - constexpr const int vec_size = 2; - - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size); - - int64_t grid = config.block_per_grid.x; - int64_t block = config.thread_per_block.x; - auto stream = dev_ctx.stream(); - - phi::Array outs_data; - phi::Array ins_data; - phi::Array sin_cos_data; - const int64_t* position_ids_data = NULL; - - ins_data[0] = q.data(); - outs_data[0] = out_q->data(); - int num_inputs = 1; - - if (k) { - dev_ctx.template Alloc(out_k); - ins_data[num_inputs] = k->data(); - outs_data[num_inputs] = out_k->data(); - inputs_num_heads[num_inputs] = k->dims()[2]; - num_inputs++; - } - - if (v) { - dev_ctx.template Alloc(out_v); - ins_data[num_inputs] = v->data(); - outs_data[num_inputs] = out_v->data(); - inputs_num_heads[num_inputs] = v->dims()[2]; - num_inputs++; - } - - using MPType = typename phi::dtype::MPTypeTrait::Type; - MPType div_c = static_cast(1.0f / head_dim); - - bool flag_sin_cos = false; - - if (sin.get_ptr() && cos.get_ptr()) { - PADDLE_ENFORCE_EQ(sin.get_ptr()->dims(), - cos.get_ptr()->dims(), - common::errors::InvalidArgument( - "The dims of sin and cos must be the same. But " - "received sin's dims is {%s}, cos's dims is {%s}.", - sin.get_ptr()->dims(), - cos.get_ptr()->dims())); - - auto sin_dims = sin.get_ptr()->dims(); - int dims_size = sin_dims.size(); - PADDLE_ENFORCE_EQ((dims_size == 2 || dims_size == 4), - true, - common::errors::InvalidArgument( - "The dims of sin and cos is expected to " - "be 2 or 4, but received %d.", - dims_size)); - if (dims_size == 4) { - // sin.shape: [1, seq_len, 1, head_dim] - PADDLE_ENFORCE_EQ( - (sin_dims[0] == 1 && sin_dims[2] == 1), - true, - common::errors::InvalidArgument( - "The batch_size and num_heads of sin and cos must be 1.")); - } - int sin_seq_len_dim = (dims_size) == 4 ? 1 : 0; - - if (position_ids) { - PADDLE_ENFORCE_EQ( - (sin_dims[dims_size - 1] == head_dim && - sin_dims[sin_seq_len_dim] >= seq_len), - true, - common::errors::InvalidArgument( - "The seq_len of sin and cos must be greater than or equal to " - "this of q. The head_dim of sin and cos must be the same as this " - "of q. But received sin's " - "shape is {%s}, q's shape is {%s}.", - sin_dims, - q.dims())); - - auto position_ids_dims = position_ids.get_ptr()->dims(); - PADDLE_ENFORCE_EQ(position_ids_dims.size(), - 2, - common::errors::InvalidArgument( - "The dims of position_ids is expected to " - "be 2, but received %d.", - position_ids_dims.size())); - - PADDLE_ENFORCE_EQ( - (position_ids_dims[0] == batch_size && - position_ids_dims[1] == seq_len), - true, - common::errors::InvalidArgument( - "The batch_size and seq_len of position_ids must be the same as " - "those of q. But received position_ids's " - "shape is {%s}, q's shape is {%s}.", - position_ids_dims, - q.dims())); - - position_ids_data = position_ids->data(); - } else { - PADDLE_ENFORCE_EQ( - (sin_dims[dims_size - 1] == head_dim && - sin_dims[sin_seq_len_dim] == seq_len), - true, - common::errors::InvalidArgument( - "The seq_len and head_dim of sin and cos " - "must be the same as those of q. 
But received sin's " - "shape is {%s}, q's shape is {%s}.", - sin_dims, - q.dims())); - } - - sin_cos_data[0] = sin->data(); - sin_cos_data[1] = cos->data(); - - flag_sin_cos = true; - } - - bool is_same_num_heads = true; - auto prev_num_heads = inputs_num_heads[0]; - for (int i = 1; i < num_inputs; ++i) { - if (prev_num_heads != inputs_num_heads[i]) { - is_same_num_heads = false; - break; - } - prev_num_heads = inputs_num_heads[i]; - } - - int sign = 1; - VectorizedFusedRopeCudaKernelFunc kernel_func = - use_neox_rotary_style - ? VectorizedFusedRopeWithRotateEveryTwoKernel - : VectorizedFusedRopeWithRotateHalfKernel; - - if (is_same_num_heads) { - int64_t batch_stride = time_major ? q.strides()[1] : q.strides()[0]; - int64_t seq_stride = time_major ? q.strides()[0] : q.strides()[1]; - kernel_func<<>>(ins_data, - sin_cos_data, - position_ids_data, - flag_sin_cos, - sign, - batch_size, - seq_len, - inputs_num_heads[0], - head_dim, - batch_stride, - seq_stride, - num_inputs, - div_c, - rotary_emb_base, - outs_data); - } else { - // Multi Query Attention (MQA) or Group Query Attention (GQA) - PADDLE_ENFORCE_EQ( - (inputs_num_heads[0] != inputs_num_heads[num_inputs - 1]) && - (inputs_num_heads[0] % inputs_num_heads[num_inputs - 1] == 0), - true, - common::errors::InvalidArgument( - "The MQA or GQA mode is entered, when the number of heads of qkv " - "is not exactly the same two by two. This mode requires " - "num_heads of q to be divisible by k,v." - "But received num_heads of q is %d, num_heads of k,v is %d", - inputs_num_heads[0], - inputs_num_heads[num_inputs - 1])); - - if (k.get_ptr() && v.get_ptr()) { - PADDLE_ENFORCE_EQ( - inputs_num_heads[1] == inputs_num_heads[2], - true, - common::errors::InvalidArgument( - "The num_heads of k must be equal to the num_heads of v when v " - "is not none." - "But received num_heads of k is %d, num_heads of v is %d", - inputs_num_heads[1], - inputs_num_heads[2])); - } - // rotary position embedding Q - int64_t batch_stride_q = time_major ? q.strides()[1] : q.strides()[0]; - int64_t seq_stride_q = time_major ? q.strides()[0] : q.strides()[1]; - - kernel_func<<>>(ins_data, - sin_cos_data, - position_ids_data, - flag_sin_cos, - sign, - batch_size, - seq_len, - inputs_num_heads[0], - head_dim, - batch_stride_q, - seq_stride_q, - 1, - div_c, - rotary_emb_base, - outs_data); - - // rotary position embedding K,V - phi::Array input_kv{ins_data[1], ins_data[2], nullptr}; - phi::Array out_kv{outs_data[1], outs_data[2], nullptr}; - int64_t batch_stride_kv = time_major - ? inputs_num_heads[1] * head_dim - : seq_len * inputs_num_heads[1] * head_dim; - int64_t seq_stride_kv = time_major - ? batch_size * inputs_num_heads[1] * head_dim - : inputs_num_heads[1] * head_dim; - - kernel_func<<>>(input_kv, - sin_cos_data, - position_ids_data, - flag_sin_cos, - sign, - batch_size, - seq_len, - inputs_num_heads[1], - head_dim, - batch_stride_kv, - seq_stride_kv, - num_inputs - 1, - div_c, - rotary_emb_base, - out_kv); - } -} -} // namespace fusion -} // namespace phi - -PD_REGISTER_PLUGIN_KERNEL(fused_rotary_position_embedding, +PD_CUSTOM_KERNEL_REGISTER(fused_rotary_position_embedding, metax_gpu, ALL_LAYOUT, phi::fusion::FusedRopeKernel,
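
For reviewers, a minimal smoke-test sketch of the op these register files expose. This is an assumption-laden illustration, not part of the patch: it presumes a PaddleCustomDevice build with the `iluvatar_gpu` (or `metax_gpu`) plugin installed, and uses Paddle's public `fused_rotary_position_embedding` API with illustrative shapes, dtype, and device string.

```python
# Hypothetical smoke test for the plugin-registered fused rope kernels.
# The device string, dtype, and shapes below are illustrative assumptions.
import paddle
from paddle.incubate.nn.functional import fused_rotary_position_embedding

paddle.set_device("iluvatar_gpu")  # or "metax_gpu", per the installed plugin

batch, seq_len, num_heads, head_dim = 2, 8, 4, 64
q = paddle.randn([batch, seq_len, num_heads, head_dim], dtype="float16")
k = paddle.randn([batch, seq_len, num_heads, head_dim], dtype="float16")
v = paddle.randn([batch, seq_len, num_heads, head_dim], dtype="float16")

# time_major=False matches the [batch, seq_len, num_heads, head_dim] layout
# handled by the upstream phi kernel these register files now reuse.
out_q, out_k, out_v = fused_rotary_position_embedding(
    q, k, v, use_neox_rotary_style=True, time_major=False)
print(out_q.shape)  # [2, 8, 4, 64]
```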