66#include <stdlib.h>
77#include <stdint.h>
88#include <limits.h>
9+ #include <stdio.h>
910#include <string.h>
1011
1112static int checked_mul_size (size_t a , size_t b , size_t * out ) {
@@ -178,6 +179,75 @@ static int can_use_cuda_moe_routed_ffn_model(const BnConfig *c,
178179 return moe_layers > 0 ;
179180}
180181
/*
 * Read a non-negative base-10 integer from the environment variable `name`.
 *
 * The parsed number is returned as-is; no unit conversion happens here.
 *
 * Returns `def` when the variable is unset, empty, negative, or is not a
 * complete decimal integer (trailing garbage is rejected). Values that do
 * not fit in size_t saturate to SIZE_MAX; this includes out-of-range input,
 * for which strtoull() returns ULLONG_MAX.
 */
static size_t env_mb_or_default(const char *name, size_t def) {
    const char *s = getenv(name);
    if (!s || !*s)
        return def;

    /*
     * strtoull() accepts a leading '-' and returns the negated value as an
     * unsigned number, so "-1" would silently turn into a huge value. In a
     * fully consumed decimal string a '-' can only be that sign, so any
     * occurrence means the input is either negative or malformed.
     */
    if (strchr(s, '-'))
        return def;

    char *end = NULL;
    unsigned long long v = strtoull(s, &end, 10);
    /* Require at least one digit and no trailing characters. */
    if (!end || end == s || *end != '\0')
        return def;

    /* Saturate instead of truncating where size_t is narrower than ull. */
    if (v > SIZE_MAX)
        return SIZE_MAX;
    return (size_t)v;
}
192+
193+ static size_t estimate_cuda_moe_all_bytes (const BnConfig * c ,
194+ const BnWeights * w ) {
195+ if (!c || !w || c -> n_experts <= 0 )
196+ return 0 ;
197+ size_t total = 0 ;
198+ for (int l = 0 ; l < c -> n_layers ; l ++ ) {
199+ const BnMoEExpertMap * em = & w -> layers [l ].moe .expert_map ;
200+ size_t layer = 0 ;
201+ size_t proj = 0 ;
202+ if (checked_mul_size (em -> expert_gate_bytes ,
203+ (size_t )c -> n_experts , & proj ) != 0 ||
204+ checked_mul_size (em -> expert_up_bytes ,
205+ (size_t )c -> n_experts , & layer ) != 0 )
206+ return SIZE_MAX ;
207+ if (proj > SIZE_MAX - layer )
208+ return SIZE_MAX ;
209+ layer += proj ;
210+ if (checked_mul_size (em -> expert_down_bytes ,
211+ (size_t )c -> n_experts , & proj ) != 0 ||
212+ proj > SIZE_MAX - layer )
213+ return SIZE_MAX ;
214+ layer += proj ;
215+ if (layer > SIZE_MAX - total )
216+ return SIZE_MAX ;
217+ total += layer ;
218+ }
219+ return total ;
220+ }
221+
222+ static int cuda_moe_all_fits_memory (BnGPUBackend * gpu ,
223+ const BnConfig * c ,
224+ const BnWeights * w ) {
225+ if (!gpu || !gpu -> memory_info )
226+ return 1 ;
227+ size_t need = estimate_cuda_moe_all_bytes (c , w );
228+ if (need == 0 )
229+ return 0 ;
230+ if (need == SIZE_MAX )
231+ return 0 ;
232+ size_t free_bytes = 0 ;
233+ size_t total_bytes = 0 ;
234+ if (gpu -> memory_info (gpu -> ctx , & free_bytes , & total_bytes ) != 0 )
235+ return 1 ;
236+ size_t reserve_mb = env_mb_or_default ("BN_CUDA_MOE_FULL_RESERVE_MB" , 4096 );
237+ size_t reserve = reserve_mb > SIZE_MAX / (1024u * 1024u )
238+ ? SIZE_MAX
239+ : reserve_mb * 1024u * 1024u ;
240+ if (free_bytes > need && free_bytes - need >= reserve )
241+ return 1 ;
242+ fprintf (stderr ,
243+ "[bn:gpu] skipping full CUDA MoE residency: need=%.1f GiB "
244+ "free=%.1f GiB total=%.1f GiB reserve=%.1f GiB "
245+ "(using lazy GPU expert cache)\n" ,
246+ need / 1073741824.0 , free_bytes / 1073741824.0 ,
247+ total_bytes / 1073741824.0 , reserve / 1073741824.0 );
248+ return 0 ;
249+ }
250+
181251int bn_model_upload_weights (BnModel * model , BnGPUBackend * gpu ) {
182252 if (!model || !gpu || !gpu -> buffer_create ) return -1 ;
183253 if (bn_model_ensure_backend (model ) != 0 ) return -1 ;
@@ -189,6 +259,8 @@ int bn_model_upload_weights(BnModel *model, BnGPUBackend *gpu) {
189259 int n_layers = c -> n_layers ;
190260 int upload_moe_all_model = !getenv ("BN_CUDA_DISABLE_MOE_ROUTED_FFN" ) &&
191261 can_use_cuda_moe_routed_ffn_model (c , w );
262+ if (upload_moe_all_model && !cuda_moe_all_fits_memory (gpu , c , w ))
263+ upload_moe_all_model = 0 ;
192264
193265 if (w -> output_weight .data ) {
194266 void * output_weight_gpu = upload_qweight (gpu , & w -> output_weight );
0 commit comments