@@ -278,13 +278,39 @@ static int cuda_moe_down_q6_f32_cache_enabled(const BnGPUBackend *gpu) {
278278 getenv ("BN_CUDA_DISABLE_Q6K_MOE_DOWN_F32_CACHE" ) == NULL ;
279279}
280280
281+ static size_t cuda_q6_down_f32_cache_bytes (const BnGPUBackend * gpu ,
282+ const BnMoEExpertMap * em ,
283+ int n_experts ) {
284+ if (!cuda_moe_down_q6_f32_cache_enabled (gpu ) || !em ||
285+ em -> down_type != BN_GGUF_TENSOR_Q6_K || n_experts <= 0 )
286+ return 0 ;
287+
288+ size_t elems = 0 ;
289+ size_t bytes = 0 ;
290+ if (mul3_size ((size_t )n_experts , (size_t )em -> down_rows ,
291+ (size_t )em -> down_cols , & elems ) != 0 ||
292+ checked_mul_size (elems , sizeof (float ), & bytes ) != 0 )
293+ return SIZE_MAX ;
294+
295+ if (getenv ("BN_CUDA_ENABLE_Q6K_MOE_DOWN_F32_CACHE" ))
296+ return bytes ;
297+
298+ int max_mb = 512 ;
299+ const char * max_env = getenv ("BN_CUDA_CUBLAS_CACHE_MAX_MB" );
300+ if (max_env && * max_env )
301+ max_mb = atoi (max_env );
302+ if (max_mb <= 0 )
303+ return bytes ;
304+ size_t max_bytes = (size_t )max_mb * 1024u * 1024u ;
305+ return bytes <= max_bytes ? bytes : 0 ;
306+ }
307+
281308static size_t estimate_cuda_moe_all_bytes (const BnConfig * c ,
282309 const BnWeights * w ,
283310 const BnGPUBackend * gpu ) {
284311 if (!c || !w || c -> n_experts <= 0 )
285312 return 0 ;
286313 size_t total = 0 ;
287- int q6_f32_cache = cuda_moe_down_q6_f32_cache_enabled (gpu );
288314 for (int l = 0 ; l < c -> n_layers ; l ++ ) {
289315 const BnMoEExpertMap * em = & w -> layers [l ].moe .expert_map ;
290316 size_t layer = 0 ;
@@ -302,17 +328,10 @@ static size_t estimate_cuda_moe_all_bytes(const BnConfig *c,
302328 proj > SIZE_MAX - layer )
303329 return SIZE_MAX ;
304330 layer += proj ;
305- if (q6_f32_cache && em -> down_type == BN_GGUF_TENSOR_Q6_K ) {
306- size_t aux = 0 ;
307- if (mul3_size ((size_t )c -> n_experts ,
308- (size_t )em -> down_rows ,
309- (size_t )em -> down_cols , & aux ) != 0 ||
310- checked_mul_size (aux , sizeof (float ), & aux ) != 0 ||
311- add_size_checked (& layer , aux ) != 0 )
312- return SIZE_MAX ;
313- }
331+ size_t aux = cuda_q6_down_f32_cache_bytes (gpu , em , c -> n_experts );
332+ if (aux == SIZE_MAX || add_size_checked (& layer , aux ) != 0 )
333+ return SIZE_MAX ;
314334 if (cuda_moe_all_f16_cache_enabled ()) {
315- size_t aux = 0 ;
316335 if (mul3_size ((size_t )c -> n_experts ,
317336 (size_t )em -> gate_rows ,
318337 (size_t )em -> gate_cols , & aux ) != 0 ||
0 commit comments