Commit 102545b

fix logger

2 parents e3c785e + 113e330

85 files changed: +4533 −702 lines


.github/workflows/ce_job.yml

Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@ on:
 permissions: read-all
 
 concurrency:
-  group: ${{ github.ref }}-${{ github.sha }}
+  group: CE-Job-${{ github.ref }}-${{ github.sha }}
   cancel-in-progress: true
 
 jobs:

.github/workflows/ci_image_update.yml

Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@ on:
 permissions: read-all
 
 concurrency:
-  group: ${{ github.ref }}-${{ github.sha }}
+  group: CI-Images-Build-${{ github.ref }}-${{ github.sha }}
   cancel-in-progress: true
 

.github/workflows/publish_job.yml

Lines changed: 1 addition & 1 deletion

@@ -13,7 +13,7 @@ on:
 permissions: read-all
 
 concurrency:
-  group: ${{ github.ref }}-${{ github.sha }}
+  group: Publish-Job-${{ github.ref }}-${{ github.sha }}
   cancel-in-progress: true
 

custom_ops/gpu_ops/machete/machete_mm.cu

Lines changed: 5 additions & 1 deletion

@@ -30,10 +30,12 @@ paddle::Tensor mm(paddle::Tensor const& A, paddle::Tensor const& B,
                   std::optional<paddle::Tensor> const& maybe_token_scales,
                   std::string maybe_schedule) {
   machete::ScalarType const b_type = machete::ScalarType::from_id(b_type_id);
-  std::optional<int64_t> maybe_group_size_opt;
+  std::optional<int64_t> maybe_group_size_opt = std::optional<int64_t>(maybe_group_size);
   std::optional<std::string> maybe_schedule_opt;
   if (maybe_schedule == "") {
     maybe_schedule_opt = std::nullopt;
+  } else {
+    maybe_schedule_opt = std::optional<std::string>(maybe_schedule);
   }
   return machete::mm_dispatch({.A = A,
                                .B = B,
@@ -63,6 +65,8 @@ std::vector<paddle::Tensor> MacheteMMKernel(
   paddle::DataType maybe_out_type;
   if (b_type_str == "uint4b8") {
     b_type_id = machete::kU4B8.id();
+  } else if (b_type_str == "uint8b128") {
+    b_type_id = machete::kU8B128.id();
   } else {
     PADDLE_ENFORCE(false, "b_type_str not supported!");
   }
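The first hunk forwards the bound `group_size` into a `std::optional<int64_t>` and maps an empty schedule string to `std::nullopt`, so downstream dispatch can distinguish "unset" from an explicit value. A minimal standalone sketch of that sentinel-to-optional convention (function and variable names here are illustrative, not from the repo):

```cpp
#include <cstdint>
#include <iostream>
#include <optional>
#include <string>

// Sketch: map "unset" sentinel values onto std::optional, mirroring the
// convention in machete_mm.cu (empty schedule string -> nullopt).
std::optional<std::string> to_schedule_opt(const std::string& schedule) {
  if (schedule.empty()) return std::nullopt;  // "" means "pick a default"
  return schedule;                            // explicit schedule requested
}

int main() {
  std::optional<int64_t> group_size_opt = std::optional<int64_t>(128);
  std::optional<std::string> schedule_opt = to_schedule_opt("");

  std::cout << "group size: " << group_size_opt.value_or(-1) << "\n";
  std::cout << "schedule set: " << std::boolalpha
            << schedule_opt.has_value() << "\n";  // prints: false
  return 0;
}
```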

custom_ops/gpu_ops/machete/machete_prepack_B.cu

Lines changed: 2 additions & 0 deletions

@@ -51,6 +51,8 @@ std::vector<paddle::Tensor> MachetePrepackBKernel(
 
   if (b_type_str == "uint4b8") {
     b_type_id = machete::kU4B8.id();
+  } else if (b_type_str == "uint8b128") {
+    b_type_id = machete::kU8B128.id();
   } else {
     PADDLE_ENFORCE(false, "b_type_str not supported!");
   }
}

custom_ops/xpu_ops/src/ops/moe_topk_select.cc

Lines changed: 14 additions & 12 deletions

@@ -43,18 +43,20 @@ std::vector<paddle::Tensor> MoeTopkSelect(
   int32_t* block_statistic = nullptr;
   const float* bias_data =
       bias.get_ptr() != nullptr ? bias.get_ptr()->data<float>() : nullptr;
-  int ret = infer_ops::moe_softmax_topk_norm_fusion(
-      xpu_ctx->x_context(),
-      gating_logits.data<float>(),
-      topk_weights.mutable_data<float>(),
-      topk_ids.mutable_data<int>(),
-      block_statistic,
-      token_num,
-      expert_num,
-      moe_topk,
-      0,
-      bias_data);
-  PD_CHECK(ret == 0);
+  if (token_num > 0) {
+    int ret = infer_ops::moe_softmax_topk_norm_fusion(
+        xpu_ctx->x_context(),
+        gating_logits.data<float>(),
+        topk_weights.mutable_data<float>(),
+        topk_ids.mutable_data<int>(),
+        block_statistic,
+        token_num,
+        expert_num,
+        moe_topk,
+        0,
+        bias_data);
+    PD_CHECK(ret == 0);
+  }
 
   return {topk_ids, topk_weights};
 }
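The change wraps the existing fused-kernel call in a `token_num > 0` guard: for an empty batch the kernel is skipped and the pre-sized (zero-row) `topk_ids` / `topk_weights` tensors are returned untouched. A self-contained sketch of the same early-out pattern, with hypothetical names (`run_fused_kernel` is a stand-in for `infer_ops::moe_softmax_topk_norm_fusion`, not a real FastDeploy call):

```cpp
#include <iostream>
#include <vector>

// Hypothetical stand-in for a fused device kernel that is undefined
// (or wasteful) for an empty batch.
int run_fused_kernel(const std::vector<float>& logits, int token_num) {
  return token_num > 0 ? 0 : -1;  // pretend zero-token launches fail
}

std::vector<int> select_topk(const std::vector<float>& logits, int token_num) {
  std::vector<int> topk_ids(token_num);  // zero rows when token_num == 0
  if (token_num > 0) {                   // early-out guard, as in the diff
    int ret = run_fused_kernel(logits, token_num);
    if (ret != 0) std::cerr << "kernel failed\n";
  }
  return topk_ids;  // an empty result is valid for an empty batch
}

int main() {
  auto ids = select_topk({}, /*token_num=*/0);
  std::cout << "rows: " << ids.size() << "\n";  // prints: rows: 0
  return 0;
}
```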

custom_ops/xpu_ops/src/ops/pybind/pybind.cc

Lines changed: 1 addition & 1 deletion

@@ -416,7 +416,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
         py::arg("bias"),
         py::arg("weight_dtype"),
         py::arg("arch"),
-        py::arg("group_size"));
+        py::arg("group_size")=-1);
 
   m.def("ep_moe_expert_combine",
         &MoeEPCombine,
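With the default added, Python callers can omit `group_size` and the bound op receives -1. A minimal pybind11 sketch of the same default-argument binding (module and op names here are illustrative, not from the repo):

```cpp
#include <pybind11/pybind11.h>

namespace py = pybind11;

// Illustrative op: returns the group size it was handed.
int64_t dummy_op(int64_t group_size) { return group_size; }

PYBIND11_MODULE(example_ops, m) {
  // Named argument with a default: callable from Python as
  // dummy_op() or dummy_op(group_size=128).
  m.def("dummy_op", &dummy_op, py::arg("group_size") = -1);
}
```

From Python, `example_ops.dummy_op()` then returns -1, while `example_ops.dummy_op(group_size=128)` returns 128.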

docs/features/speculative_decoding.md

Lines changed: 12 additions & 0 deletions

@@ -18,6 +18,12 @@ This project implements an efficient **Speculative Decoding** inference framework
 - ⏳ Coming Soon: Support Chunk-prefill
 - ⏳ Coming Soon: Multi-layer MTP Layer
 
+- **Decoding with Hybrid MTP and Ngram Methods (Hybrid-MTP-with-Ngram)**
+
+  - Overview: A hybrid method that combines MTP and Ngram: MTP first generates N draft tokens, then Ngram matching supplements additional draft tokens.
+
+  - Use Cases: Suitable when higher draft-token coverage is required, leveraging both MTP's generation capability and the efficiency of Ngram matching.
+
 ---
 
 ### Coming Soon
@@ -132,7 +138,13 @@ python -m fastdeploy.entrypoints.openai.api_server \
     --scheduler-password "scheduler_mtp" \
     --speculative-config '{"method": "mtp", "num_speculative_tokens": 1, "model": "${path_to_mtp_model}"}' &
 ```
+## Decoding with Hybrid MTP and Ngram Methods
 
+When starting the service, only the --speculative-config option needs to change.
+For example, use MTP to generate two draft tokens, then append three additional draft tokens from Ngram matching (num_speculative_tokens is the total: 2 MTP tokens plus 3 from Ngram):
+```
+--speculative-config '{"method": "mtp", "num_model_steps": 2, "mtp_strategy": "with_ngram", "num_speculative_tokens": 5, "model": "'$model_path'/mtp"}'
+```
 ## 🧠 Using Ngram-Based Decoding
 This method uses an n-gram sliding window to match the prompt and generated tokens to predict draft tokens. It is particularly effective in scenarios with high input-output overlap (e.g., code completion, document search).
 
docs/online_serving/metrics.md

Lines changed: 1 addition & 1 deletion

@@ -34,7 +34,7 @@ After FastDeploy is launched, it supports continuous monitoring of the FastDeploy
 | `fastdeploy:available_gpu_block_num` | Gauge | Number of available gpu blocks in cache, including prefix caching blocks that are not officially released | Count |
 | `fastdeploy:free_gpu_block_num` | Gauge | Number of free blocks in cache | Count |
 | `fastdeploy:max_gpu_block_num` | Gauge | Number of total blocks determined when service started | Count |
-| `available_gpu_resource` | Gauge | Available blocks percentage, i.e. available_gpu_block_num / max_gpu_block_num | Count |
+| `fastdeploy:available_gpu_resource` | Gauge | Available blocks percentage, i.e. available_gpu_block_num / max_gpu_block_num | Count |
 | `fastdeploy:requests_number` | Counter | Total number of requests received | Count |
 | `fastdeploy:send_cache_failed_num` | Counter | Total number of failures of sending cache | Count |
 | `fastdeploy:first_token_latency` | Gauge | Latest time to generate first token in seconds | Seconds |

docs/zh/features/speculative_decoding.md

Lines changed: 8 additions & 0 deletions

@@ -14,6 +14,9 @@
 - ⏳ Coming Soon: Support Chunk Prefill
 - ⏳ Coming Soon: Multi-layer MTP layer
 
+- **Decoding with Hybrid MTP and Ngram Methods (Hybrid-MTP-with-Ngram)**
+  - Overview: a hybrid of MTP and Ngram that first uses MTP to produce N draft tokens, then supplements them with additional draft tokens from Ngram matching.
+  - Use cases: suitable when more draft tokens are needed, combining MTP's generation capability with the efficiency of Ngram matching.
 ---
 
 ### ⏳ Planned
@@ -110,7 +113,12 @@ python -m fastdeploy.entrypoints.openai.api_server \
     --scheduler-password "scheduler_mtp" \
     --speculative-config '{"method": "mtp", "num_speculative_tokens": 1, "model": "${path_to_mtp_model}"}' &
 ```
+## Decoding with Hybrid MTP and Ngram Methods
+When starting the service, only the --speculative-config option needs to change. For example, use MTP to produce two draft tokens, then append three additional draft tokens from Ngram matching:
+```
+--speculative-config '{"method": "mtp", "num_model_steps": 2, "mtp_strategy": "with_ngram", "num_speculative_tokens": 5, "model": "'$model_path'/mtp"}'
 
+```
 ## 🧠 Using Ngram-Based Decoding
 This method matches draft tokens from the prompt and previously generated tokens via an n-gram sliding window; it works best when input and output overlap heavily, e.g. code completion and document search.
 > Using 4×H100; quantization: WINT4
