Commit 102545b

fix logger

2 parents e3c785e + 113e330

85 files changed: +4533 −702 lines


.github/workflows/ce_job.yml

Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@ on:
 permissions: read-all
 
 concurrency:
-  group: ${{ github.ref }}-${{ github.sha }}
+  group: CE-Job-${{ github.ref }}-${{ github.sha }}
   cancel-in-progress: true
 
 jobs:

.github/workflows/ci_image_update.yml

Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@ on:
 permissions: read-all
 
 concurrency:
-  group: ${{ github.ref }}-${{ github.sha }}
+  group: CI-Images-Build-${{ github.ref }}-${{ github.sha }}
   cancel-in-progress: true
 

.github/workflows/publish_job.yml

Lines changed: 1 addition & 1 deletion

@@ -13,7 +13,7 @@ on:
 permissions: read-all
 
 concurrency:
-  group: ${{ github.ref }}-${{ github.sha }}
+  group: Publish-Job-${{ github.ref }}-${{ github.sha }}
   cancel-in-progress: true
 

custom_ops/gpu_ops/machete/machete_mm.cu

Lines changed: 5 additions & 1 deletion

@@ -30,10 +30,12 @@ paddle::Tensor mm(paddle::Tensor const& A, paddle::Tensor const& B,
                   std::optional<paddle::Tensor> const& maybe_token_scales,
                   std::string maybe_schedule) {
   machete::ScalarType const b_type = machete::ScalarType::from_id(b_type_id);
-  std::optional<int64_t> maybe_group_size_opt;
+  std::optional<int64_t> maybe_group_size_opt = std::optional<int64_t>(maybe_group_size);
   std::optional<std::string> maybe_schedule_opt;
   if (maybe_schedule == "") {
     maybe_schedule_opt = std::nullopt;
+  } else {
+    maybe_schedule_opt = std::optional<std::string>(maybe_schedule);
   }
   return machete::mm_dispatch({.A = A,
                                .B = B,
@@ -63,6 +65,8 @@ std::vector<paddle::Tensor> MacheteMMKernel(
   paddle::DataType maybe_out_type;
   if (b_type_str == "uint4b8") {
     b_type_id = machete::kU4B8.id();
+  } else if (b_type_str == "uint8b128") {
+    b_type_id = machete::kU8B128.id();
   } else {
     PADDLE_ENFORCE(false, "b_type_str not supported!");
   }
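The first hunk forwards the bound `group_size` into a `std::optional<int64_t>` and maps an empty schedule string to `std::nullopt`, so downstream dispatch can distinguish "unset" from an explicit value. A minimal standalone sketch of that sentinel-to-optional convention (function and variable names here are illustrative, not from the repo):

```cpp
#include <cstdint>
#include <iostream>
#include <optional>
#include <string>

// Sketch: map "unset" sentinel values onto std::optional, mirroring the
// convention in machete_mm.cu (empty schedule string -> nullopt).
std::optional<std::string> to_schedule_opt(const std::string& schedule) {
  if (schedule.empty()) return std::nullopt;  // "" means "pick a default"
  return schedule;                            // explicit schedule requested
}

int main() {
  std::optional<int64_t> group_size_opt = std::optional<int64_t>(128);
  std::optional<std::string> schedule_opt = to_schedule_opt("");

  std::cout << "group size: " << group_size_opt.value_or(-1) << "\n";
  std::cout << "schedule set: " << std::boolalpha
            << schedule_opt.has_value() << "\n";  // prints: false
  return 0;
}
```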

custom_ops/gpu_ops/machete/machete_prepack_B.cu

Lines changed: 2 additions & 0 deletions

@@ -51,6 +51,8 @@ std::vector<paddle::Tensor> MachetePrepackBKernel(
 
   if (b_type_str == "uint4b8") {
     b_type_id = machete::kU4B8.id();
+  } else if (b_type_str == "uint8b128") {
+    b_type_id = machete::kU8B128.id();
   } else {
     PADDLE_ENFORCE(false, "b_type_str not supported!");
   }
}

custom_ops/xpu_ops/src/ops/moe_topk_select.cc

Lines changed: 14 additions & 12 deletions

@@ -43,18 +43,20 @@ std::vector<paddle::Tensor> MoeTopkSelect(
   int32_t* block_statistic = nullptr;
   const float* bias_data =
       bias.get_ptr() != nullptr ? bias.get_ptr()->data<float>() : nullptr;
-  int ret = infer_ops::moe_softmax_topk_norm_fusion(
-      xpu_ctx->x_context(),
-      gating_logits.data<float>(),
-      topk_weights.mutable_data<float>(),
-      topk_ids.mutable_data<int>(),
-      block_statistic,
-      token_num,
-      expert_num,
-      moe_topk,
-      0,
-      bias_data);
-  PD_CHECK(ret == 0);
+  if (token_num > 0) {
+    int ret = infer_ops::moe_softmax_topk_norm_fusion(
+        xpu_ctx->x_context(),
+        gating_logits.data<float>(),
+        topk_weights.mutable_data<float>(),
+        topk_ids.mutable_data<int>(),
+        block_statistic,
+        token_num,
+        expert_num,
+        moe_topk,
+        0,
+        bias_data);
+    PD_CHECK(ret == 0);
+  }
 
   return {topk_ids, topk_weights};
 }
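The change wraps the existing fused-kernel call in a `token_num > 0` guard: for an empty batch the kernel is skipped and the pre-sized (zero-row) `topk_ids` / `topk_weights` tensors are returned untouched. A self-contained sketch of the same early-out pattern, with hypothetical names (`run_fused_kernel` is a stand-in for `infer_ops::moe_softmax_topk_norm_fusion`, not a real FastDeploy call):

```cpp
#include <iostream>
#include <vector>

// Hypothetical stand-in for a fused device kernel that is undefined
// (or wasteful) for an empty batch.
int run_fused_kernel(const std::vector<float>& logits, int token_num) {
  return token_num > 0 ? 0 : -1;  // pretend zero-token launches fail
}

std::vector<int> select_topk(const std::vector<float>& logits, int token_num) {
  std::vector<int> topk_ids(token_num);  // zero rows when token_num == 0
  if (token_num > 0) {                   // early-out guard, as in the diff
    int ret = run_fused_kernel(logits, token_num);
    if (ret != 0) std::cerr << "kernel failed\n";
  }
  return topk_ids;  // an empty result is valid for an empty batch
}

int main() {
  auto ids = select_topk({}, /*token_num=*/0);
  std::cout << "rows: " << ids.size() << "\n";  // prints: rows: 0
  return 0;
}
```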

custom_ops/xpu_ops/src/ops/pybind/pybind.cc

Lines changed: 1 addition & 1 deletion

@@ -416,7 +416,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
         py::arg("bias"),
         py::arg("weight_dtype"),
         py::arg("arch"),
-        py::arg("group_size"));
+        py::arg("group_size")=-1);
 
   m.def("ep_moe_expert_combine",
         &MoeEPCombine,
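With the default added, Python callers can omit `group_size` and the bound op receives -1. A minimal pybind11 sketch of the same default-argument binding (module and op names here are illustrative, not from the repo):

```cpp
#include <pybind11/pybind11.h>

namespace py = pybind11;

// Illustrative op: returns the group size it was handed.
int64_t dummy_op(int64_t group_size) { return group_size; }

PYBIND11_MODULE(example_ops, m) {
  // Named argument with a default: callable from Python as
  // dummy_op() or dummy_op(group_size=128).
  m.def("dummy_op", &dummy_op, py::arg("group_size") = -1);
}
```

From Python, `example_ops.dummy_op()` then returns -1, while `example_ops.dummy_op(group_size=128)` returns 128.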

docs/features/speculative_decoding.md

Lines changed: 12 additions & 0 deletions

@@ -18,6 +18,12 @@ This project implements an efficient **Speculative Decoding** inference framework
 - ⏳ Coming Soon: Support Chunk-prefill
 - ⏳ Coming Soon: Multi-layer MTP Layer
 
+- **Decoding with Hybrid MTP and Ngram Methods (Hybrid-MTP-with-Ngram)**
+
+  - Overview: A hybrid method that combines MTP and Ngram: MTP first generates N draft tokens, then Ngram matching supplements additional draft tokens.
+
+  - Use Cases: Suitable when higher draft-token coverage is required, leveraging both MTP's generation capability and the efficiency of Ngram matching.
+
 ---
 
 ### Coming Soon
@@ -132,7 +138,13 @@ python -m fastdeploy.entrypoints.openai.api_server \
     --scheduler-password "scheduler_mtp" \
     --speculative-config '{"method": "mtp", "num_speculative_tokens": 1, "model": "${path_to_mtp_model}"}' &
 ```
+## Decoding with Hybrid MTP and Ngram Methods
 
+When starting the service, only the --speculative-config option needs to change.
+For example, use MTP to generate two draft tokens, then append three additional draft tokens from Ngram matching (num_speculative_tokens is the total: 2 MTP tokens plus 3 from Ngram):
+```
+--speculative-config '{"method": "mtp", "num_model_steps": 2, "mtp_strategy": "with_ngram", "num_speculative_tokens": 5, "model": "'$model_path'/mtp"}'
+```
 ## 🧠 Using Ngram-Based Decoding
 This method uses an n-gram sliding window to match the prompt and generated tokens to predict draft tokens. It is particularly effective in scenarios with high input-output overlap (e.g., code completion, document search).
 
docs/online_serving/metrics.md

Lines changed: 1 addition & 1 deletion

@@ -34,7 +34,7 @@ After FastDeploy is launched, it supports continuous monitoring of the FastDeploy
 | `fastdeploy:available_gpu_block_num` | Gauge | Number of available gpu blocks in cache, including prefix caching blocks that are not officially released | Count |
 | `fastdeploy:free_gpu_block_num` | Gauge | Number of free blocks in cache | Count |
 | `fastdeploy:max_gpu_block_num` | Gauge | Number of total blocks determined when service started | Count |
-| `available_gpu_resource` | Gauge | Available blocks percentage, i.e. available_gpu_block_num / max_gpu_block_num | Count |
+| `fastdeploy:available_gpu_resource` | Gauge | Available blocks percentage, i.e. available_gpu_block_num / max_gpu_block_num | Count |
 | `fastdeploy:requests_number` | Counter | Total number of requests received | Count |
 | `fastdeploy:send_cache_failed_num` | Counter | Total number of failures of sending cache | Count |
 | `fastdeploy:first_token_latency` | Gauge | Latest time to generate first token in seconds | Seconds |

docs/zh/features/speculative_decoding.md

Lines changed: 8 additions & 0 deletions

@@ -14,6 +14,9 @@
 - ⏳ Coming Soon: Support Chunk Prefill
 - ⏳ Coming Soon: Multi-layer MTP layer
 
+- **Decoding with Hybrid MTP and Ngram Methods (Hybrid-MTP-with-Ngram)**
+  - Overview: a hybrid of MTP and Ngram that first uses MTP to produce N draft tokens, then supplements them with additional draft tokens from Ngram matching.
+  - Use cases: suitable when more draft tokens are needed, combining MTP's generation capability with the efficiency of Ngram matching.
 ---
 
 ### ⏳ Planned
@@ -110,7 +113,12 @@ python -m fastdeploy.entrypoints.openai.api_server \
     --scheduler-password "scheduler_mtp" \
     --speculative-config '{"method": "mtp", "num_speculative_tokens": 1, "model": "${path_to_mtp_model}"}' &
 ```
+## Decoding with Hybrid MTP and Ngram Methods
+When starting the service, only the --speculative-config option needs to change. For example, use MTP to produce two draft tokens, then append three additional draft tokens from Ngram matching:
+```
+--speculative-config '{"method": "mtp", "num_model_steps": 2, "mtp_strategy": "with_ngram", "num_speculative_tokens": 5, "model": "'$model_path'/mtp"}'
 
+```
 ## 🧠 Using Ngram-Based Decoding
 This method matches draft tokens from the prompt and previously generated tokens via an n-gram sliding window; it works best when input and output overlap heavily, e.g. code completion and document search.
 > Using 4×H100; quantization: WINT4
