@@ -700,12 +700,39 @@ int bn_model_upload_weights(BnModel *model, BnGPUBackend *gpu) {
700700 gpu -> buffer_destroy (gpu -> ctx , moe_up_all_gpu );
701701 if (moe_down_all_gpu )
702702 gpu -> buffer_destroy (gpu -> ctx , moe_down_all_gpu );
703- fprintf (stderr ,
704- "[bn:gpu] failed full CUDA MoE residency upload at "
705- "layer=%d; aborting upload instead of mixing resident "
706- "and fallback MoE paths\n" , l );
707- bn_model_release_gpu (model );
708- return -1 ;
703+ moe_gate_all_gpu = NULL ;
704+ moe_up_all_gpu = NULL ;
705+ moe_down_all_gpu = NULL ;
706+ if (upload_moe_all_q8_f16_cache ) {
707+ fprintf (stderr ,
708+ "[bn:gpu] full CUDA MoE residency aux-cache upload "
709+ "failed at layer=%d; retrying quant-only resident "
710+ "experts for this and later layers\n" , l );
711+ upload_moe_all_q8_f16_cache = 0 ;
712+ moe_gate_all_gpu =
713+ upload_moe_all_proj (model , gpu , & lw -> moe .expert_map , 0 ,
714+ c -> n_experts , 0 );
715+ moe_up_all_gpu =
716+ upload_moe_all_proj (model , gpu , & lw -> moe .expert_map , 1 ,
717+ c -> n_experts , 0 );
718+ moe_down_all_gpu =
719+ upload_moe_all_proj (model , gpu , & lw -> moe .expert_map , 2 ,
720+ c -> n_experts , 0 );
721+ }
722+ if (!moe_gate_all_gpu || !moe_up_all_gpu || !moe_down_all_gpu ) {
723+ if (moe_gate_all_gpu )
724+ gpu -> buffer_destroy (gpu -> ctx , moe_gate_all_gpu );
725+ if (moe_up_all_gpu )
726+ gpu -> buffer_destroy (gpu -> ctx , moe_up_all_gpu );
727+ if (moe_down_all_gpu )
728+ gpu -> buffer_destroy (gpu -> ctx , moe_down_all_gpu );
729+ fprintf (stderr ,
730+ "[bn:gpu] failed full CUDA MoE residency upload at "
731+ "layer=%d; aborting upload instead of mixing "
732+ "resident and fallback MoE paths\n" , l );
733+ bn_model_release_gpu (model );
734+ return -1 ;
735+ }
709736 }
710737 void * shared_expert_gate_gpu = lw -> shared .shared_expert_gate
711738 ? gpu -> buffer_create (
0 commit comments