Skip to content

Commit d472962

Browse files
committed
Retry CUDA MoE residency without aux caches
1 parent: 1ecb112 · commit: d472962

1 file changed

Lines changed: 33 additions & 6 deletions

File tree

src/model_gpu.c

Lines changed: 33 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -700,12 +700,39 @@ int bn_model_upload_weights(BnModel *model, BnGPUBackend *gpu) {
700700
gpu->buffer_destroy(gpu->ctx, moe_up_all_gpu);
701701
if (moe_down_all_gpu)
702702
gpu->buffer_destroy(gpu->ctx, moe_down_all_gpu);
703-
fprintf(stderr,
704-
"[bn:gpu] failed full CUDA MoE residency upload at "
705-
"layer=%d; aborting upload instead of mixing resident "
706-
"and fallback MoE paths\n", l);
707-
bn_model_release_gpu(model);
708-
return -1;
703+
moe_gate_all_gpu = NULL;
704+
moe_up_all_gpu = NULL;
705+
moe_down_all_gpu = NULL;
706+
if (upload_moe_all_q8_f16_cache) {
707+
fprintf(stderr,
708+
"[bn:gpu] full CUDA MoE residency aux-cache upload "
709+
"failed at layer=%d; retrying quant-only resident "
710+
"experts for this and later layers\n", l);
711+
upload_moe_all_q8_f16_cache = 0;
712+
moe_gate_all_gpu =
713+
upload_moe_all_proj(model, gpu, &lw->moe.expert_map, 0,
714+
c->n_experts, 0);
715+
moe_up_all_gpu =
716+
upload_moe_all_proj(model, gpu, &lw->moe.expert_map, 1,
717+
c->n_experts, 0);
718+
moe_down_all_gpu =
719+
upload_moe_all_proj(model, gpu, &lw->moe.expert_map, 2,
720+
c->n_experts, 0);
721+
}
722+
if (!moe_gate_all_gpu || !moe_up_all_gpu || !moe_down_all_gpu) {
723+
if (moe_gate_all_gpu)
724+
gpu->buffer_destroy(gpu->ctx, moe_gate_all_gpu);
725+
if (moe_up_all_gpu)
726+
gpu->buffer_destroy(gpu->ctx, moe_up_all_gpu);
727+
if (moe_down_all_gpu)
728+
gpu->buffer_destroy(gpu->ctx, moe_down_all_gpu);
729+
fprintf(stderr,
730+
"[bn:gpu] failed full CUDA MoE residency upload at "
731+
"layer=%d; aborting upload instead of mixing "
732+
"resident and fallback MoE paths\n", l);
733+
bn_model_release_gpu(model);
734+
return -1;
735+
}
709736
}
710737
void *shared_expert_gate_gpu = lw->shared.shared_expert_gate
711738
? gpu->buffer_create(

0 commit comments

Comments
 (0)