PaddlePaddle · YqGe585 · Dec 11, 2025 · Dec 10, 2025 · Dec 11, 2025 · Dec 11, 2025
diff --git a/Paddle b/Paddle
diff --git a/backends/iluvatar_gpu/CMakeLists.txt b/backends/iluvatar_gpu/CMakeLists.txt
@@ -112,7 +112,7 @@ file(
   ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cuda_driver.cc
   # Core
   ${PADDLE_SOURCE_DIR}/paddle/phi/core/enforce.cc
-  ${PADDLE_SOURCE_DIR}/paddle/phi/core/flags.cc
+  # ${PADDLE_SOURCE_DIR}/paddle/phi/core/flags.cc
   ${PADDLE_SOURCE_DIR}/paddle/phi/core/mixed_vector.cc
   ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cusparse.cc
   # kernels/funcs
@@ -128,6 +128,7 @@ file(
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/expand_modality_expert_id_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/cal_aux_loss_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/cal_aux_loss_grad_kernel.cu
+  ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/batched_gemm.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/yolo_box_post_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/spectral_norm_grad_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/spectral_norm_kernel.cu
@@ -876,7 +877,7 @@ file(
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_stack_transpose_quant_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_rope_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_swiglu_weighted_bwd_kernel.cu
-  ${PADDLE_SOURCE_DIR}/paddle/phi/core/flags.cc
+  # ${PADDLE_SOURCE_DIR}/paddle/phi/core/flags.cc
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/math_function.cc
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/log_softmax_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu

diff --git a/backends/iluvatar_gpu/common/cuda_flags.cc b/backends/iluvatar_gpu/common/cuda_flags.cc
@@ -277,3 +277,19 @@ PHI_DEFINE_EXPORTED_bool(
     flash_attn_available,
     true,
     "Weather flash attention is available on the current device.");
+
+/**
+ * CUDNN related FLAG
+ * Name: FLAGS_conv_workspace_size_limit
+ * Since Version: 0.13.0
+ * Value Range: int64, default=1024 (MB)
+ * Example:
+ * Note: The internal function of cuDNN obtains the fastest matching algorithm
+ *       within this memory limit. Usually, faster algorithms can be chosen in
+ *       larger workspaces, but memory space can also be significantly
+ *       increased.
+ *       Users need to balance memory and speed.
+ */
+PHI_DEFINE_EXPORTED_int64(conv_workspace_size_limit,
+                          1024,
+                          "cuDNN convolution workspace limit in MB unit.");
diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/batched_gemm_kernel_register.cu b/backends/iluvatar_gpu/kernels/cuda_kernels/batched_gemm_kernel_register.cu
@@ -0,0 +1,23 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/legacy/gpu/batched_gemm.h"
+
+PD_CUSTOM_KERNEL_REGISTER(batched_gemm,
+                          iluvatar_gpu,
+                          ALL_LAYOUT,
+                          phi::BatchedGEMM,
+                          float,
+                          phi::bfloat16) {}
diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_grad_kernel.cu b/backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_grad_kernel.cu
diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_grad_kernel_register.cu b/backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_grad_kernel_register.cu
@@ -0,0 +1,24 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/fusion/gpu/fused_rope_grad_kernel.cu"  //NOLINT
+
+PD_CUSTOM_KERNEL_REGISTER(fused_rotary_position_embedding_grad,
+                          iluvatar_gpu,
+                          ALL_LAYOUT,
+                          phi::fusion::FusedRopeGradKernel,
+                          float,
+                          phi::dtype::float16,
+                          phi::dtype::bfloat16) {}  // empty body
+3 −5		paddle/cinn/backends/codegen_cuda_dev.cc
+404 −23		paddle/cinn/backends/compiler.cc
+58 −2		paddle/cinn/backends/compiler.h
+133 −4		paddle/cinn/backends/llvm/execution_engine.cc
+15 −3		paddle/cinn/backends/llvm/execution_engine.h
+10 −0		paddle/cinn/cinn.h
+1 −0		paddle/cinn/hlir/framework/graph_compiler_util.h
+10 −0		paddle/cinn/hlir/framework/pir/compilation_cache.h
+9 −3		paddle/cinn/hlir/framework/pir/compilation_task.cc
+45 −8		paddle/cinn/hlir/framework/pir/fusion_info.cc
+3 −0		paddle/cinn/hlir/framework/pir/fusion_info.h
+1 −0		paddle/cinn/hlir/framework/pir/op_lowering_group.h
+297 −11		paddle/cinn/hlir/framework/pir_compiler.cc
+3 −3		paddle/cinn/runtime/arch_device.h
+22 −3		paddle/cinn/runtime/cuda/cuda_util.cc
+6 −1		paddle/cinn/runtime/cuda/cuda_util.h
+28 −0		paddle/common/flags.cc
+68 −11		paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc
+9 −2		paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.h
+2 −2		paddle/fluid/pir/serialize_deserialize/CMakeLists.txt
+0 −0		paddle/fluid/pir/serialize_deserialize/patch/4.yaml
+3 −1		paddle/fluid/pybind/eager_method.cc
+4 −16		paddle/fluid/pybind/eager_properties.cc
+48 −18		paddle/phi/kernels/funcs/blas/blas_impl.cu.h
+133 −127		paddle/phi/kernels/fusion/gpu/fused_rope_grad_kernel.cu
+160 −183		paddle/phi/kernels/fusion/gpu/fused_rope_kernel.cu
+273 −448		paddle/phi/kernels/fusion/gpu/fused_rope_utils.h
+81 −24		paddle/phi/kernels/impl/matmul_kernel_impl.h
+20 −0		paddle/phi/kernels/stride/reduce_stride_base.cu.h
+5 −0		paddle/phi/ops/yaml/ops.yaml
+9 −1		paddle/pir/include/core/type.h
+25 −7		python/paddle/distributed/flex_checkpoint/dcp/load_state_dict.py
+27 −12		python/paddle/distributed/flex_checkpoint/dcp/reshard_comm.py
+2 −2		python/paddle/distributed/flex_checkpoint/dcp/save_state_dict.py
+6 −0		python/paddle/tensor/manipulation.py
+5 −1		python/paddle/utils/cpp_extension/extension_utils.py
+7 −7		test/auto_parallel/hybrid_strategy/semi_flexcheckpoint_merge.py
+3 −0		test/ir/pir/cinn/CMakeLists.txt
+114 −0		test/ir/pir/cinn/test_enable_cinn_kernel_cache.py
+114 −0		test/ir/pir/cinn/test_enable_cinn_kernel_cache_2.py
+47 −4		test/legacy_test/test_fused_rotary_position_embedding.py
+122 −1		test/legacy_test/test_tensor.py
+24 −0		test/legacy_test/test_tile_op.py