
Commit 43a042a

eplb
eplb_clean eplb_repair

1 parent dd005e3 commit 43a042a

12 files changed: +228, -54 lines

csrc/deepep/deep_ep.cpp

Lines changed: 16 additions & 6 deletions
@@ -592,10 +592,11 @@ std::tuple<at::Tensor, std::optional<EventHandle>, std::optional<std::function<v
     return {combined_x, event, std::function<void()>([] {})};
 }
 
-std::tuple<at::Tensor, std::optional<EventHandle>, std::optional<std::function<void()>>> Buffer::fused_deep_moe(
-    const at::Tensor &x, const at::Tensor &expertIds, const at::Tensor &gmm1PermutedWeight,
-    const at::Tensor &gmm1PermutedWeightScale, const at::Tensor &gmm2Weight, const at::Tensor &gmm2WeightScale,
-    const at::Tensor &expertScalesOptional, int64_t num_max_dispatch_tokens_per_rank, int64_t num_experts, bool use_fp8)
+std::tuple<std::vector<at::Tensor>, std::optional<EventHandle>, std::optional<std::function<void()>>>
+Buffer::fused_deep_moe(const at::Tensor &x, const at::Tensor &expertIds, const at::TensorList &gmm1PermutedWeight,
+                       const at::TensorList &gmm1PermutedWeightScale, const at::TensorList &gmm2Weight,
+                       const at::TensorList &gmm2WeightScale, const at::Tensor &expertScalesOptional,
+                       int64_t num_max_dispatch_tokens_per_rank, int64_t num_experts, bool use_fp8)
 {
     EP_HOST_ASSERT(expertIds.dim() == 2);
     EP_HOST_ASSERT(expertScalesOptional.dim() == 2);
@@ -659,6 +660,15 @@ std::tuple<at::Tensor, std::optional<EventHandle>, std::optional<std::function<v
     int bs = this->new_topk_idx.size(0);
     at::Tensor output = at::empty({bs, h}, x.options());
 
+    bool isShareExpert = (rank < shared_expert_num);
+    int64_t localExpertNum = 0;
+    if (isShareExpert) {
+        localExpertNum = num_ranks;
+    } else {
+        localExpertNum = num_ranks * (num_experts / (num_ranks - shared_expert_num));
+    }
+    at::Tensor recvCountOutput = at::empty({localExpertNum}, expertIds.options());
+
     EXEC_NPU_CMD(aclnnFusedDeepMoe,
                  // input
                  x, expertIds, gmm1PermutedWeight, gmm1PermutedWeightScale, gmm2Weight, gmm2WeightScale,
@@ -667,7 +677,7 @@ std::tuple<at::Tensor, std::optional<EventHandle>, std::optional<std::function<v
                  hcom_ep_name, num_ranks, rank, num_experts, shared_expert_num, shared_expert_rank_num, quantMode,
                  globalBs,
                  // output
-                 output);
+                 output, recvCountOutput);
 
     // ---------- Unpadding ----------
     if (this->is_padding) {
@@ -680,6 +690,6 @@ std::tuple<at::Tensor, std::optional<EventHandle>, std::optional<std::function<v
     }
 
     std::optional<EventHandle> event;
-    return {output, event, std::function<void()>([] {})};
+    return {{output, recvCountOutput}, event, std::function<void()>([] {})};
 }
 } // namespace deep_ep
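
Note: with this change Buffer::fused_deep_moe takes the GMM weights and scales as at::TensorList and returns {output, ep_recv_count} instead of a single tensor. A minimal caller-side sketch; the wrapper name, the include path, and the one-element weight lists are illustrative assumptions, not part of the commit:

// Sketch only: assumes a constructed deep_ep::Buffer and weight tensors already
// laid out as the fused op expects (int8 FRACTAL_NZ weights, float scales).
#include <utility>
#include "deep_ep.hpp"

std::pair<at::Tensor, at::Tensor> run_fused_deep_moe(deep_ep::Buffer &buffer, const at::Tensor &x,
                                                     const at::Tensor &expert_ids, const at::Tensor &w1,
                                                     const at::Tensor &w1_scale, const at::Tensor &w2,
                                                     const at::Tensor &w2_scale, const at::Tensor &expert_scales,
                                                     int64_t num_max_dispatch_tokens_per_rank, int64_t num_experts)
{
    // at::TensorList is a c10::ArrayRef<at::Tensor>, so single tensors can be
    // forwarded as one-element lists without copying data.
    auto [tensors, event, hook] = buffer.fused_deep_moe(
        x, expert_ids, {w1}, {w1_scale}, {w2}, {w2_scale}, expert_scales,
        num_max_dispatch_tokens_per_rank, num_experts, /*use_fp8=*/true);
    // tensors[0]: combined output [bs, h]; tensors[1]: ep_recv_count (int32, one entry per local expert)
    return {tensors[0], tensors[1]};
}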

csrc/deepep/deep_ep.hpp

Lines changed: 4 additions & 4 deletions
@@ -85,10 +85,10 @@ struct Buffer {
                          const at::Tensor &packed_recv_count, bool zero_copy, bool async, bool return_recv_hook,
                          const std::optional<at::Tensor> &out);
 
-    std::tuple<at::Tensor, std::optional<EventHandle>, std::optional<std::function<void()>>>
-    fused_deep_moe(const at::Tensor &x, const at::Tensor &expertIds, const at::Tensor &gmm1PermutedWeight,
-                   const at::Tensor &gmm1PermutedWeightScale, const at::Tensor &gmm2Weight,
-                   const at::Tensor &gmm2WeightScale, const at::Tensor &expertScalesOptional,
+    std::tuple<std::vector<at::Tensor>, std::optional<EventHandle>, std::optional<std::function<void()>>>
+    fused_deep_moe(const at::Tensor &x, const at::Tensor &expertIds, const at::TensorList &gmm1PermutedWeight,
+                   const at::TensorList &gmm1PermutedWeightScale, const at::TensorList &gmm2Weight,
+                   const at::TensorList &gmm2WeightScale, const at::Tensor &expertScalesOptional,
                    int64_t num_max_dispatch_tokens_per_rank, int64_t num_experts, bool use_fp8);
 };
 } // namespace deep_ep

csrc/deepep/ops/op_host/fused_deep_moe.cpp

Lines changed: 30 additions & 0 deletions
@@ -8,6 +8,8 @@
  */
 #include "register/op_def_registry.h"
 
+#define ENABLE_TENSOR_LIST
+
 namespace ops {
 class FusedDeepMoe : public OpDef
 {
@@ -24,6 +26,28 @@ class FusedDeepMoe : public OpDef
             .DataType({ge::DT_INT32, ge::DT_INT32})
             .Format({ge::FORMAT_ND, ge::FORMAT_ND})
             .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND});
+#ifdef ENABLE_TENSOR_LIST
+        this->Input("gmm1_permuted_weight")
+            .ParamType(DYNAMIC)
+            .DataType({ge::DT_INT8, ge::DT_INT8})
+            .Format({ge::FORMAT_FRACTAL_NZ, ge::FORMAT_FRACTAL_NZ})
+            .UnknownShapeFormat({ge::FORMAT_FRACTAL_NZ, ge::FORMAT_FRACTAL_NZ});
+        this->Input("gmm1_permuted_weight_scale")
+            .ParamType(DYNAMIC)
+            .DataType({ge::DT_FLOAT, ge::DT_FLOAT})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND});
+        this->Input("gmm2_weight")
+            .ParamType(DYNAMIC)
+            .DataType({ge::DT_INT8, ge::DT_INT8})
+            .Format({ge::FORMAT_FRACTAL_NZ, ge::FORMAT_FRACTAL_NZ})
+            .UnknownShapeFormat({ge::FORMAT_FRACTAL_NZ, ge::FORMAT_FRACTAL_NZ});
+        this->Input("gmm2_weight_scale")
+            .ParamType(DYNAMIC)
+            .DataType({ge::DT_FLOAT, ge::DT_FLOAT})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND});
+#else
         this->Input("gmm1_permuted_weight")
             .ParamType(REQUIRED)
             .DataType({ge::DT_INT8, ge::DT_INT8})
@@ -44,6 +68,7 @@ class FusedDeepMoe : public OpDef
             .DataType({ge::DT_FLOAT, ge::DT_FLOAT})
             .Format({ge::FORMAT_ND, ge::FORMAT_ND})
             .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND});
+#endif
         this->Input("expert_smooth_scales")
            .ParamType(OPTIONAL)
            .DataType({ge::DT_FLOAT, ge::DT_FLOAT})
@@ -59,6 +84,11 @@ class FusedDeepMoe : public OpDef
             .DataType({ge::DT_BF16, ge::DT_FLOAT16})
             .Format({ge::FORMAT_ND, ge::FORMAT_ND})
             .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND});
+        this->Output("ep_recv_count")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_INT32, ge::DT_INT32})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND});
         this->Attr("group_ep").String();
         this->Attr("ep_rank_size").Int();
         this->Attr("ep_rank_id").Int();

csrc/deepep/ops/op_host/fused_deep_moe_infer.cpp

Lines changed: 44 additions & 2 deletions
@@ -16,14 +16,27 @@ namespace ge {
 constexpr uint32_t EXPAND_X_INDEX = 0;
 constexpr uint32_t EXPERT_IDS_INDEX = 1;
 constexpr uint32_t OUTPUT_X_INDEX = 0;
+constexpr uint32_t OUTPUT_REC_COUNT_INDEX = 1;
+
+constexpr uint32_t ATTR_GROUP_EP_INDEX = 0;
+constexpr uint32_t ATTR_EP_RANK_SIZE_INDEX = 1;
+constexpr uint32_t ATTR_EP_RANK_ID_INDEX = 2;
+constexpr uint32_t ATTR_MOE_EXPERT_NUM_INDEX = 3;
+constexpr uint32_t ATTR_SHARE_EXPERT_NUM_INDEX = 4;
+constexpr uint32_t ATTR_SHARE_EXPERT_RANK_NUM_INDEX = 5;
+constexpr uint32_t ATTR_QUANT_MODE_INDEX = 6;
+constexpr uint32_t ATTR_GLOBAL_BS_INDEX = 7;
 
 static ge::graphStatus InferShape(gert::InferShapeContext *context)
 {
+    const char *nodeName = context->GetNodeName();
+    // infer output shape
     const gert::Shape *expandXShape = context->GetInputShape(EXPAND_X_INDEX);
     const gert::Shape *expertIdsShape = context->GetInputShape(EXPERT_IDS_INDEX);
     gert::Shape *expandXOutShape = context->GetOutputShape(OUTPUT_X_INDEX);
-
-    if (expandXShape == nullptr || expertIdsShape == nullptr || expandXOutShape == nullptr) {
+    gert::Shape *recvCountOutShape = context->GetOutputShape(OUTPUT_REC_COUNT_INDEX);
+    if (expandXShape == nullptr || expertIdsShape == nullptr || expandXOutShape == nullptr ||
+        recvCountOutShape == nullptr) {
         return GRAPH_FAILED;
     }
     if (expandXShape->GetDimNum() < 2 || expertIdsShape->GetDimNum() < 1) {
@@ -37,13 +50,42 @@ static ge::graphStatus InferShape(gert::InferShapeContext *context)
     expandXOutShape->SetDim(0, bs);
     expandXOutShape->SetDim(1, h);
 
+    // infer recvCount shape
+    auto attrs = context->GetAttrs();
+    OP_TILING_CHECK(attrs == nullptr, OP_LOGE(nodeName, "attrs is nullptr."), return ge::GRAPH_FAILED);
+
+    auto epRankSizePtr = attrs->GetAttrPointer<int64_t>(ATTR_EP_RANK_SIZE_INDEX);
+    auto epRankIdPtr = attrs->GetAttrPointer<int64_t>(ATTR_EP_RANK_ID_INDEX);
+    auto moeExpertNumPtr = attrs->GetAttrPointer<int64_t>(ATTR_MOE_EXPERT_NUM_INDEX);
+    auto sharedExpertRankNumPtr = attrs->GetAttrPointer<int64_t>(ATTR_SHARE_EXPERT_RANK_NUM_INDEX);
+
+    OP_TILING_CHECK(epRankIdPtr == nullptr, OP_LOGE(nodeName, "epRankIdPtr is nullptr."), return ge::GRAPH_FAILED);
+    OP_TILING_CHECK(moeExpertNumPtr == nullptr, OP_LOGE(nodeName, "moeExpertNumPtr is nullptr."),
+                    return ge::GRAPH_FAILED);
+    OP_TILING_CHECK(epRankSizePtr == nullptr, OP_LOGE(nodeName, "epRankSizePtr is nullptr."), return ge::GRAPH_FAILED);
+    OP_TILING_CHECK(sharedExpertRankNumPtr == nullptr, OP_LOGE(nodeName, "sharedExpertRankNumPtr is nullptr."),
+                    return ge::GRAPH_FAILED);
+    uint32_t epRankSize = static_cast<uint32_t>(*epRankSizePtr);
+    uint32_t moeExpertNum = static_cast<uint32_t>(*moeExpertNumPtr);
+    uint32_t epRankId = static_cast<uint32_t>(*epRankIdPtr);
+    uint32_t sharedExpertRankNum = static_cast<uint32_t>(*sharedExpertRankNumPtr);
+
+    recvCountOutShape->SetDimNum(1);
+    bool isShareExpert = (epRankId < sharedExpertRankNum);
+    if (isShareExpert) {
+        recvCountOutShape->SetDim(0, epRankSize);
+    } else {
+        recvCountOutShape->SetDim(0, epRankSize * (moeExpertNum / (epRankSize - sharedExpertRankNum)));
+    }
+
     return GRAPH_SUCCESS;
 }
 
 static ge::graphStatus InferDataType(gert::InferDataTypeContext *context)
 {
     const auto expandXDataType = context->GetInputDataType(EXPAND_X_INDEX);
     context->SetOutputDataType(OUTPUT_X_INDEX, expandXDataType);
+    context->SetOutputDataType(OUTPUT_REC_COUNT_INDEX, ge::DT_INT32);
     return ge::GRAPH_SUCCESS;
 }
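
Note: the ep_recv_count dimension inferred above depends on whether the rank hosts a shared expert. A standalone sketch of that arithmetic with assumed example numbers (not values from the commit): with ep_rank_size = 16, shared_expert_rank_num = 2 and moe_expert_num = 56, a shared-expert rank gets 16 entries and a MoE rank gets 16 * (56 / 14) = 64.

// Mirrors the shape-inference branch above; example values are assumptions.
#include <cstdint>
#include <cstdio>

static int64_t RecvCountDim(int64_t epRankSize, int64_t epRankId, int64_t moeExpertNum, int64_t sharedExpertRankNum)
{
    const bool isShareExpert = (epRankId < sharedExpertRankNum);
    // Shared-expert ranks keep one counter per EP rank; MoE ranks keep one per
    // (rank, local expert) pair, with local experts = moeExpertNum / moe-rank count.
    return isShareExpert ? epRankSize : epRankSize * (moeExpertNum / (epRankSize - sharedExpertRankNum));
}

int main()
{
    std::printf("shared-expert rank: %ld\n", static_cast<long>(RecvCountDim(16, 0, 56, 2)));  // 16
    std::printf("moe rank:           %ld\n", static_cast<long>(RecvCountDim(16, 5, 56, 2)));  // 64
    return 0;
}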

csrc/deepep/ops/op_host/fused_deep_moe_tiling.cpp

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@
 #include "tiling/hccl/hccl_tiling.h"
 
 #define GM_ALIGN_SIZE 512
-#define ENABLE_TILING_CHECK
+// #define ENABLE_TILING_CHECK
 
 using namespace ge;
 namespace {

csrc/deepep/ops/op_host/op_api/aclnn_fused_deep_moe.cpp

Lines changed: 19 additions & 7 deletions
@@ -12,6 +12,8 @@
 #include "aclnn/opdev/platform.h"
 #include "aclnnInner_fused_deep_moe.h"
 
+#define ENABLE_TENSOR_LIST
+
 enum class NnopbaseHcclServerType {
     NNOPBASE_HCCL_SERVER_TYPE_AICPU = 0,
     NNOPBASE_HCCL_SERVER_TYPE_MTE,
@@ -23,17 +25,27 @@ extern "C" void __attribute__((weak)) NnopbaseSetHcclServerType(void *executor,
 extern "C" {
 #endif
 
-aclnnStatus aclnnFusedDeepMoeGetWorkspaceSize(
-    const aclTensor *x, const aclTensor *expertIds, const aclTensor *gmm1PermutedWeight,
-    const aclTensor *gmm1PermutedWeightScale, const aclTensor *gmm2Weight, const aclTensor *gmm2WeightScale,
-    const aclTensor *expertSmoothScalesOptional, const aclTensor *expertScalesOptional, char *groupEp,
-    int64_t epRankSize, int64_t epRankId, int64_t moeExpertNum, int64_t shareExpertNum, int64_t shareExpertRankNum,
-    int64_t quantMode, int64_t globalBs, const aclTensor *output, uint64_t *workspaceSize, aclOpExecutor **executor)
+aclnnStatus aclnnFusedDeepMoeGetWorkspaceSize(const aclTensor *x, const aclTensor *expertIds,
+#ifdef ENABLE_TENSOR_LIST
+                                              const aclTensorList *gmm1PermutedWeight,
+                                              const aclTensorList *gmm1PermutedWeightScale,
+                                              const aclTensorList *gmm2Weight, const aclTensorList *gmm2WeightScale,
+#else
+                                              const aclTensor *gmm1PermutedWeight,
+                                              const aclTensor *gmm1PermutedWeightScale, const aclTensor *gmm2Weight,
+                                              const aclTensor *gmm2WeightScale,
+#endif
+                                              const aclTensor *expertSmoothScalesOptional,
+                                              const aclTensor *expertScalesOptional, char *groupEp, int64_t epRankSize,
+                                              int64_t epRankId, int64_t moeExpertNum, int64_t shareExpertNum,
+                                              int64_t shareExpertRankNum, int64_t quantMode, int64_t globalBs,
+                                              const aclTensor *output, const aclTensor *outputRecvCount,
+                                              uint64_t *workspaceSize, aclOpExecutor **executor)
 {
     return aclnnInnerFusedDeepMoeGetWorkspaceSize(
         x, expertIds, gmm1PermutedWeight, gmm1PermutedWeightScale, gmm2Weight, gmm2WeightScale,
         expertSmoothScalesOptional, expertScalesOptional, groupEp, epRankSize, epRankId, moeExpertNum, shareExpertNum,
-        shareExpertRankNum, quantMode, globalBs, output, workspaceSize, executor);
+        shareExpertRankNum, quantMode, globalBs, output, outputRecvCount, workspaceSize, executor);
 }
 
 aclnnStatus aclnnFusedDeepMoe(void *workspace, uint64_t workspaceSize, aclOpExecutor *executor, aclrtStream stream)

csrc/deepep/ops/op_host/op_api/aclnn_fused_deep_moe.h

Lines changed: 12 additions & 3 deletions
@@ -10,18 +10,27 @@
 #ifndef FUSED_DEEP_MOE
 #define FUSED_DEEP_MOE
 
+#define ENABLE_TENSOR_LIST
+
 #include "aclnn/acl_meta.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 __attribute__((visibility("default"))) aclnnStatus aclnnFusedDeepMoeGetWorkspaceSize(
-    const aclTensor *x, const aclTensor *expertIds, const aclTensor *gmm1PermutedWeight,
-    const aclTensor *gmm1PermutedWeightScale, const aclTensor *gmm2Weight, const aclTensor *gmm2WeightScale,
+    const aclTensor *x, const aclTensor *expertIds,
+#ifdef ENABLE_TENSOR_LIST
+    const aclTensorList *gmm1PermutedWeight, const aclTensorList *gmm1PermutedWeightScale,
+    const aclTensorList *gmm2Weight, const aclTensorList *gmm2WeightScale,
+#else
+    const aclTensor *gmm1PermutedWeight, const aclTensor *gmm1PermutedWeightScale, const aclTensor *gmm2Weight,
+    const aclTensor *gmm2WeightScale,
+#endif
     const aclTensor *expertSmoothScalesOptional, const aclTensor *expertScalesOptional, char *groupEp,
     int64_t epRankSize, int64_t epRankId, int64_t moeExpertNum, int64_t shareExpertNum, int64_t shareExpertRankNum,
-    int64_t quantMode, int64_t globalBs, const aclTensor *output, uint64_t *workspaceSize, aclOpExecutor **executor);
+    int64_t quantMode, int64_t globalBs, const aclTensor *output, const aclTensor *outputRecvCount,
+    uint64_t *workspaceSize, aclOpExecutor **executor);
 
 __attribute__((visibility("default"))) aclnnStatus aclnnFusedDeepMoe(void *workspace, uint64_t workspaceSize,
                                                                      aclOpExecutor *executor, aclrtStream stream);
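
Note: callers of the public aclnn entry points now pass the extra outputRecvCount tensor and, with ENABLE_TENSOR_LIST, aclTensorList handles for the GMM weights, while still following the usual two-phase workspace-query/launch pattern. A sketch against the updated prototype; the wrapper name and error handling are assumptions, and all tensor and tensor-list handles are expected to be created by the caller:

// Sketch only: demonstrates the two-phase call with the new signature.
#include <acl/acl.h>
#include "aclnn_fused_deep_moe.h"

aclnnStatus LaunchFusedDeepMoe(const aclTensor *x, const aclTensor *expertIds, const aclTensorList *gmm1W,
                               const aclTensorList *gmm1Scale, const aclTensorList *gmm2W,
                               const aclTensorList *gmm2Scale, const aclTensor *smoothScales,
                               const aclTensor *expertScales, char *groupEp, int64_t epRankSize, int64_t epRankId,
                               int64_t moeExpertNum, int64_t shareExpertNum, int64_t shareExpertRankNum,
                               int64_t quantMode, int64_t globalBs, const aclTensor *output,
                               const aclTensor *outputRecvCount, aclrtStream stream)
{
    uint64_t workspaceSize = 0;
    aclOpExecutor *executor = nullptr;
    // Phase 1: query the workspace size; note the trailing outputRecvCount argument.
    aclnnStatus ret = aclnnFusedDeepMoeGetWorkspaceSize(
        x, expertIds, gmm1W, gmm1Scale, gmm2W, gmm2Scale, smoothScales, expertScales, groupEp, epRankSize, epRankId,
        moeExpertNum, shareExpertNum, shareExpertRankNum, quantMode, globalBs, output, outputRecvCount,
        &workspaceSize, &executor);
    if (ret != ACL_SUCCESS) {
        return ret;
    }
    void *workspace = nullptr;
    if (workspaceSize > 0) {
        aclError mret = aclrtMalloc(&workspace, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
        if (mret != ACL_SUCCESS) {
            return static_cast<aclnnStatus>(mret);
        }
    }
    // Phase 2: launch on the given stream, then release the workspace once the stream has drained.
    ret = aclnnFusedDeepMoe(workspace, workspaceSize, executor, stream);
    aclrtSynchronizeStream(stream);
    if (workspace != nullptr) {
        aclrtFree(workspace);
    }
    return ret;
}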

csrc/deepep/ops/op_kernel/fused_deep_moe.cpp

Lines changed: 2 additions & 2 deletions
@@ -15,7 +15,7 @@ extern "C" __global__ __aicore__ void fused_deep_moe(
     GM_ADDR x, GM_ADDR expert_ids, GM_ADDR gmm1_permuted_weight, GM_ADDR gmm1_permuted_weight_scale,
     GM_ADDR gmm2_weight, GM_ADDR gmm2_weight_scale, GM_ADDR expert_smooth_scales, GM_ADDR expert_scales,
     // output
-    GM_ADDR output,
+    GM_ADDR output, GM_ADDR outputRecvCount,
     // system
     GM_ADDR workspace, GM_ADDR tiling)
 {
@@ -27,7 +27,7 @@ extern "C" __global__ __aicore__ void fused_deep_moe(
     if constexpr (TILING_KEY_IS(0) || TILING_KEY_IS(1)) {
         FusedDeepMoe<DTYPE_X, int32_t, false, TILING_KEY_VAR> op;
         op.Init(x, expert_ids, gmm1_permuted_weight, gmm1_permuted_weight_scale, gmm2_weight, gmm2_weight_scale,
-                expert_smooth_scales, expert_scales, output, workspace, nullptr, &tiling_data);
+                expert_smooth_scales, expert_scales, output, outputRecvCount, workspace, nullptr, &tiling_data);
         op.Process();
     }
 }
