Skip to content

Commit c88efec

Browse files
committed
Guard CUDA full MoE residency by VRAM budget
1 parent da4cd7f commit c88efec

1 file changed

Lines changed: 72 additions & 0 deletions

File tree

src/model_gpu.c

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include <stdlib.h>
77
#include <stdint.h>
88
#include <limits.h>
9+
#include <stdio.h>
910
#include <string.h>
1011

1112
static int checked_mul_size(size_t a, size_t b, size_t *out) {
@@ -178,6 +179,75 @@ static int can_use_cuda_moe_routed_ffn_model(const BnConfig *c,
178179
return moe_layers > 0;
179180
}
180181

182+
/*
 * Read a non-negative whole-number setting from the environment.
 *
 * name: environment variable to look up.
 * def:  value returned when the variable is unset, empty, or malformed.
 *
 * Returns the parsed decimal value (the caller decides the unit). Input with
 * trailing garbage, no digits, or a minus sign yields `def`. A value too
 * large for size_t saturates to SIZE_MAX instead of being truncated.
 */
static size_t env_mb_or_default(const char *name, size_t def) {
    if (!name) {
        return def;
    }
    const char *s = getenv(name);
    if (!s || !*s) {
        return def;
    }
    /*
     * strtoull() accepts a leading '-' and returns the negated value as an
     * unsigned number, so "-1" would silently become a gigantic setting.
     * In a string that parses completely the only possible '-' is that sign,
     * so rejecting any '-' rejects exactly the negative inputs.
     */
    if (strchr(s, '-')) {
        return def;
    }
    char *end = NULL;
    unsigned long long v = strtoull(s, &end, 10);
    if (!end || end == s || *end != '\0') {
        return def;
    }
    /*
     * Out-of-range input makes strtoull() return ULLONG_MAX; clamp that (and
     * anything else wider than size_t) rather than truncating in the cast.
     */
    if (v > SIZE_MAX) {
        return SIZE_MAX;
    }
    return (size_t)v;
}
192+
193+
static size_t estimate_cuda_moe_all_bytes(const BnConfig *c,
194+
const BnWeights *w) {
195+
if (!c || !w || c->n_experts <= 0)
196+
return 0;
197+
size_t total = 0;
198+
for (int l = 0; l < c->n_layers; l++) {
199+
const BnMoEExpertMap *em = &w->layers[l].moe.expert_map;
200+
size_t layer = 0;
201+
size_t proj = 0;
202+
if (checked_mul_size(em->expert_gate_bytes,
203+
(size_t)c->n_experts, &proj) != 0 ||
204+
checked_mul_size(em->expert_up_bytes,
205+
(size_t)c->n_experts, &layer) != 0)
206+
return SIZE_MAX;
207+
if (proj > SIZE_MAX - layer)
208+
return SIZE_MAX;
209+
layer += proj;
210+
if (checked_mul_size(em->expert_down_bytes,
211+
(size_t)c->n_experts, &proj) != 0 ||
212+
proj > SIZE_MAX - layer)
213+
return SIZE_MAX;
214+
layer += proj;
215+
if (layer > SIZE_MAX - total)
216+
return SIZE_MAX;
217+
total += layer;
218+
}
219+
return total;
220+
}
221+
222+
static int cuda_moe_all_fits_memory(BnGPUBackend *gpu,
223+
const BnConfig *c,
224+
const BnWeights *w) {
225+
if (!gpu || !gpu->memory_info)
226+
return 1;
227+
size_t need = estimate_cuda_moe_all_bytes(c, w);
228+
if (need == 0)
229+
return 0;
230+
if (need == SIZE_MAX)
231+
return 0;
232+
size_t free_bytes = 0;
233+
size_t total_bytes = 0;
234+
if (gpu->memory_info(gpu->ctx, &free_bytes, &total_bytes) != 0)
235+
return 1;
236+
size_t reserve_mb = env_mb_or_default("BN_CUDA_MOE_FULL_RESERVE_MB", 4096);
237+
size_t reserve = reserve_mb > SIZE_MAX / (1024u * 1024u)
238+
? SIZE_MAX
239+
: reserve_mb * 1024u * 1024u;
240+
if (free_bytes > need && free_bytes - need >= reserve)
241+
return 1;
242+
fprintf(stderr,
243+
"[bn:gpu] skipping full CUDA MoE residency: need=%.1f GiB "
244+
"free=%.1f GiB total=%.1f GiB reserve=%.1f GiB "
245+
"(using lazy GPU expert cache)\n",
246+
need / 1073741824.0, free_bytes / 1073741824.0,
247+
total_bytes / 1073741824.0, reserve / 1073741824.0);
248+
return 0;
249+
}
250+
181251
int bn_model_upload_weights(BnModel *model, BnGPUBackend *gpu) {
182252
if (!model || !gpu || !gpu->buffer_create) return -1;
183253
if (bn_model_ensure_backend(model) != 0) return -1;
@@ -189,6 +259,8 @@ int bn_model_upload_weights(BnModel *model, BnGPUBackend *gpu) {
189259
int n_layers = c->n_layers;
190260
int upload_moe_all_model = !getenv("BN_CUDA_DISABLE_MOE_ROUTED_FFN") &&
191261
can_use_cuda_moe_routed_ffn_model(c, w);
262+
if (upload_moe_all_model && !cuda_moe_all_fits_memory(gpu, c, w))
263+
upload_moe_all_model = 0;
192264

193265
if (w->output_weight.data) {
194266
void *output_weight_gpu = upload_qweight(gpu, &w->output_weight);

0 commit comments

Comments
 (0)