[https://nvbugs/5970614][fix] Sync CTA before PDL trigger in quantize_with_block_size

tianyuxbear · tianyuxbear · commit 40d011508ea7 · 2026-06-11T10:23:44.000+08:00
Signed-off-by: Tianyu Xiong <117647511+tianyuxbear@users.noreply.github.com>
diff --git a/cpp/tensorrt_llm/kernels/quantization.cuh b/cpp/tensorrt_llm/kernels/quantization.cuh
@@ -897,6 +897,15 @@ quantize_with_block_size(
             }
         }
     }
+    // Fix for nvbugs/5970614 (https://nvbugspro.nvidia.com/bug/5970614).
+    // PDL completion is reported when every CTA has either exited or called
+    // this function at least once (per CUDA Programming Guide). Without a
+    // CTA-wide barrier, an early-finishing warp can trigger completion while
+    // other warps in the same CTA are still writing sf_out / out, allowing the
+    // downstream NVF4 GEMM consumer to read partial data once
+    // wait_on_dependent_grids returns. Drain the CTA's stores before trigger.
+    __syncthreads();
+    __threadfence();
     cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }

Original file line number	Diff line number	Diff line change
`@@ -897,6 +897,15 @@ quantize_with_block_size(`
`897`	`897`	`}`
`898`	`898`	`}`
`899`	`899`	`}`
	`900`	`+ // Fix for nvbugs/5970614 (https://nvbugspro.nvidia.com/bug/5970614).`
	`901`	`+ // PDL completion is reported when every CTA has either exited or called`
	`902`	`+ // this function at least once (per CUDA Programming Guide). Without a`
	`903`	`+ // CTA-wide barrier, an early-finishing warp can trigger completion while`
	`904`	`+ // other warps in the same CTA are still writing sf_out / out, allowing the`
	`905`	`+ // downstream NVF4 GEMM consumer to read partial data once`
	`906`	`+ // wait_on_dependent_grids returns. Drain the CTA's stores before trigger.`
	`907`	`+ __syncthreads();`
	`908`	`+ __threadfence();`
`900`	`909`	`cudaTriggerProgrammaticLaunchCompletion();`
`901`	`910`	`#endif`
`902`	`911`	`}`