We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent cab198d, commit 40d0115 (Copy full SHA for 40d0115)
1 file changed
cpp/tensorrt_llm/kernels/quantization.cuh
@@ -897,6 +897,15 @@ quantize_with_block_size(
897
}
898
899
900
+ // Fix for nvbugs/5970614 (https://nvbugspro.nvidia.com/bug/5970614).
901
+ // PDL completion is reported when every CTA has either exited or called
902
+ // cudaTriggerProgrammaticLaunchCompletion() at least once (per CUDA Programming Guide). Without a
903
+ // CTA-wide barrier, an early-finishing warp can trigger completion while
904
+ // other warps in the same CTA are still writing sf_out / out, allowing the
905
+ // downstream NVF4 GEMM consumer to read partial data once
906
+ // wait_on_dependent_grids returns. Drain the CTA's stores before trigger.
907
+ __syncthreads();
908
+ __threadfence();
909
cudaTriggerProgrammaticLaunchCompletion();
910
#endif
911
0 commit comments