Commit 4feb0bf (parent: f9a31ee)

graph : avoid huge warm-up graphs for MoE models

ggml-ci

2 files changed: 3 additions, 3 deletions

src/llama-context.cpp (1 addition, 1 deletion)

@@ -1312,7 +1312,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
 //
 
 uint32_t llama_context::graph_max_nodes() const {
-    return std::max<uint32_t>(65536u, 5u*model.n_tensors());
+    return std::max<uint32_t>(1024u, 6u*model.n_tensors());
 }
 
 llm_graph_result * llama_context::get_gf_res_reserve() const {
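For context, the old formula reserved at least 65536 graph nodes regardless of model size, while the new one uses a small 1024-node floor and scales with the tensor count (6 nodes per tensor instead of 5), so the reserved warm-up graph now grows with the model instead of always paying a fixed worst case. A minimal standalone sketch of the effect, assuming a hypothetical tensor count of 1000 (not a value taken from the commit):

// Sketch only: compares the old and new node budgets from graph_max_nodes().
// The tensor count below is a made-up example value.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t n_tensors = 1000; // hypothetical model tensor count

    const uint32_t old_budget = std::max<uint32_t>(65536u, 5u*n_tensors); // 65536
    const uint32_t new_budget = std::max<uint32_t>(1024u,  6u*n_tensors); //  6000

    std::printf("old budget: %u nodes\nnew budget: %u nodes\n", old_budget, new_budget);
    return 0;
}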

src/llama-graph.cpp (2 additions, 2 deletions)

@@ -907,7 +907,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
 
     // aggregate experts
     ggml_tensor * moe_out = nullptr;
-    for (int i = 0; i < n_expert_used; ++i) {
+    for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
         ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens,
                 experts->nb[2], i*experts->nb[1]);
 
@@ -918,7 +918,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         }
     }
 
-    if (n_expert_used == 1) {
+    if (hparams.n_expert_used == 1) {
         // avoid returning a non-contiguous tensor
         moe_out = ggml_cont(ctx0, moe_out);
     }
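Judging by the commit title, the point of bounding the aggregation loop (and the single-expert check) by hparams.n_expert_used instead of the local n_expert_used appears to be that every iterated expert adds nodes to the graph, so a warm-up or reserve pass that supplies a larger per-call expert count would inflate the graph. A toy sketch of that node-count effect, with made-up expert counts and per-expert node cost (not llama.cpp code):

// Toy illustration only (assumed numbers): how the loop bound affects the
// number of nodes the expert-aggregation loop contributes to the graph.
#include <cstdint>
#include <cstdio>

struct toy_hparams {
    uint32_t n_expert      = 64; // hypothetical total experts in the model
    uint32_t n_expert_used = 8;  // hypothetical experts active per token
};

// Each loop iteration contributes a fixed number of graph nodes
// (e.g. one view plus one add); the exact cost here is an assumption.
static uint32_t aggregation_nodes(uint32_t loop_bound) {
    const uint32_t nodes_per_expert = 2;
    return loop_bound * nodes_per_expert;
}

int main() {
    toy_hparams hparams;

    // Worst case: loop bound equal to the total expert count.
    std::printf("bound = n_expert      -> %u nodes\n", aggregation_nodes(hparams.n_expert));

    // Bound fixed to the model hyperparameter, as in the commit.
    std::printf("bound = n_expert_used -> %u nodes\n", aggregation_nodes(hparams.n_expert_used));
    return 0;
}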
