Skip to content

Commit 986152a

Browse files
committed
Fix CUDA MoE residency memory estimate
1 parent bead3a0 commit 986152a

1 file changed

Lines changed: 30 additions & 11 deletions

File tree

src/model_gpu.c

Lines changed: 30 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -278,13 +278,39 @@ static int cuda_moe_down_q6_f32_cache_enabled(const BnGPUBackend *gpu) {
278278
getenv("BN_CUDA_DISABLE_Q6K_MOE_DOWN_F32_CACHE") == NULL;
279279
}
280280

281+
static size_t cuda_q6_down_f32_cache_bytes(const BnGPUBackend *gpu,
282+
const BnMoEExpertMap *em,
283+
int n_experts) {
284+
if (!cuda_moe_down_q6_f32_cache_enabled(gpu) || !em ||
285+
em->down_type != BN_GGUF_TENSOR_Q6_K || n_experts <= 0)
286+
return 0;
287+
288+
size_t elems = 0;
289+
size_t bytes = 0;
290+
if (mul3_size((size_t)n_experts, (size_t)em->down_rows,
291+
(size_t)em->down_cols, &elems) != 0 ||
292+
checked_mul_size(elems, sizeof(float), &bytes) != 0)
293+
return SIZE_MAX;
294+
295+
if (getenv("BN_CUDA_ENABLE_Q6K_MOE_DOWN_F32_CACHE"))
296+
return bytes;
297+
298+
int max_mb = 512;
299+
const char *max_env = getenv("BN_CUDA_CUBLAS_CACHE_MAX_MB");
300+
if (max_env && *max_env)
301+
max_mb = atoi(max_env);
302+
if (max_mb <= 0)
303+
return bytes;
304+
size_t max_bytes = (size_t)max_mb * 1024u * 1024u;
305+
return bytes <= max_bytes ? bytes : 0;
306+
}
307+
281308
static size_t estimate_cuda_moe_all_bytes(const BnConfig *c,
282309
const BnWeights *w,
283310
const BnGPUBackend *gpu) {
284311
if (!c || !w || c->n_experts <= 0)
285312
return 0;
286313
size_t total = 0;
287-
int q6_f32_cache = cuda_moe_down_q6_f32_cache_enabled(gpu);
288314
for (int l = 0; l < c->n_layers; l++) {
289315
const BnMoEExpertMap *em = &w->layers[l].moe.expert_map;
290316
size_t layer = 0;
@@ -302,17 +328,10 @@ static size_t estimate_cuda_moe_all_bytes(const BnConfig *c,
302328
proj > SIZE_MAX - layer)
303329
return SIZE_MAX;
304330
layer += proj;
305-
if (q6_f32_cache && em->down_type == BN_GGUF_TENSOR_Q6_K) {
306-
size_t aux = 0;
307-
if (mul3_size((size_t)c->n_experts,
308-
(size_t)em->down_rows,
309-
(size_t)em->down_cols, &aux) != 0 ||
310-
checked_mul_size(aux, sizeof(float), &aux) != 0 ||
311-
add_size_checked(&layer, aux) != 0)
312-
return SIZE_MAX;
313-
}
331+
size_t aux = cuda_q6_down_f32_cache_bytes(gpu, em, c->n_experts);
332+
if (aux == SIZE_MAX || add_size_checked(&layer, aux) != 0)
333+
return SIZE_MAX;
314334
if (cuda_moe_all_f16_cache_enabled()) {
315-
size_t aux = 0;
316335
if (mul3_size((size_t)c->n_experts,
317336
(size_t)em->gate_rows,
318337
(size_t)em->gate_cols, &aux) != 0 ||

0 commit comments

Comments
 (0)