Commit 4feb0bf (parent: f9a31ee)

graph : avoid huge warm-up graphs for MoE models

ggml-ci

2 files changed: 3 additions, 3 deletions

src/llama-context.cpp (1 addition, 1 deletion)

@@ -1312,7 +1312,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
 //
 
 uint32_t llama_context::graph_max_nodes() const {
-    return std::max<uint32_t>(65536u, 5u*model.n_tensors());
+    return std::max<uint32_t>(1024u, 6u*model.n_tensors());
 }
 
 llm_graph_result * llama_context::get_gf_res_reserve() const {
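For context, the old formula reserved at least 65536 graph nodes regardless of model size, while the new one uses a small 1024-node floor and scales with the tensor count (6 nodes per tensor instead of 5), so the reserved warm-up graph now grows with the model instead of always paying a fixed worst case. A minimal standalone sketch of the effect, assuming a hypothetical tensor count of 1000 (not a value taken from the commit):

// Sketch only: compares the old and new node budgets from graph_max_nodes().
// The tensor count below is a made-up example value.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t n_tensors = 1000; // hypothetical model tensor count

    const uint32_t old_budget = std::max<uint32_t>(65536u, 5u*n_tensors); // 65536
    const uint32_t new_budget = std::max<uint32_t>(1024u,  6u*n_tensors); //  6000

    std::printf("old budget: %u nodes\nnew budget: %u nodes\n", old_budget, new_budget);
    return 0;
}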

src/llama-graph.cpp (2 additions, 2 deletions)

@@ -907,7 +907,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
 
     // aggregate experts
     ggml_tensor * moe_out = nullptr;
-    for (int i = 0; i < n_expert_used; ++i) {
+    for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
         ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens,
                 experts->nb[2], i*experts->nb[1]);
 
@@ -918,7 +918,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         }
     }
 
-    if (n_expert_used == 1) {
+    if (hparams.n_expert_used == 1) {
         // avoid returning a non-contiguous tensor
         moe_out = ggml_cont(ctx0, moe_out);
     }
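Judging by the commit title, the point of bounding the aggregation loop (and the single-expert check) by hparams.n_expert_used instead of the local n_expert_used appears to be that every iterated expert adds nodes to the graph, so a warm-up or reserve pass that supplies a larger per-call expert count would inflate the graph. A toy sketch of that node-count effect, with made-up expert counts and per-expert node cost (not llama.cpp code):

// Toy illustration only (assumed numbers): how the loop bound affects the
// number of nodes the expert-aggregation loop contributes to the graph.
#include <cstdint>
#include <cstdio>

struct toy_hparams {
    uint32_t n_expert      = 64; // hypothetical total experts in the model
    uint32_t n_expert_used = 8;  // hypothetical experts active per token
};

// Each loop iteration contributes a fixed number of graph nodes
// (e.g. one view plus one add); the exact cost here is an assumption.
static uint32_t aggregation_nodes(uint32_t loop_bound) {
    const uint32_t nodes_per_expert = 2;
    return loop_bound * nodes_per_expert;
}

int main() {
    toy_hparams hparams;

    // Worst case: loop bound equal to the total expert count.
    std::printf("bound = n_expert      -> %u nodes\n", aggregation_nodes(hparams.n_expert));

    // Bound fixed to the model hyperparameter, as in the commit.
    std::printf("bound = n_expert_used -> %u nodes\n", aggregation_nodes(hparams.n_expert_used));
    return 0;
}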
