
Commit be37567

feat: increased the dimensions of the matrices being computed, and implemented Kahan's error correction to stop floating-point accumulation errors
1 parent 360c227 commit be37567

2 files changed: +30 -8 lines changed

samples/introduction/matmul/kernels/src/lib.rs

Lines changed: 23 additions & 1 deletion
@@ -19,15 +19,24 @@ pub unsafe fn matrix_mul_cuda(C: *mut f32, A: *const f32, B: *const f32, wa: usi
     let b_step = BLOCK_SIZE * wb;
 
     let mut c_sub: f32 = 0.0;
+    let mut kahan_correction_factor = 0.0f32;
     let mut b = b_begin;
 
     for a in (a_begin..=a_end).step_by(a_step) {
+        // The equivalent CUDA C++ code for the below is:
+        // ```
+        // __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
+        // ```
+        // This memory space is shared between threads of the same block
         #[address_space(shared)]
         static mut As: [[MaybeUninit<f32>; BLOCK_SIZE]; BLOCK_SIZE] = [[const { MaybeUninit::uninit() }; BLOCK_SIZE]; BLOCK_SIZE];
+
         #[address_space(shared)]
         static mut Bs: [[MaybeUninit<f32>; BLOCK_SIZE]; BLOCK_SIZE] = [[const { MaybeUninit::uninit() }; BLOCK_SIZE]; BLOCK_SIZE];
 
         // Load A and B matrices into shared memory
+        // A.add(index) returns the pointer to the index-th element of A,
+        // hence a dereference is needed to get the value at that index
         unsafe {
             As[ty][tx].write(*A.add((a + wa * ty + tx) as usize));
             Bs[ty][tx].write(*B.add((b + wb * ty + tx) as usize));
@@ -36,8 +45,21 @@ pub unsafe fn matrix_mul_cuda(C: *mut f32, A: *const f32, B: *const f32, wa: usi
         // Synchronize to make sure the matrices are loaded
         cuda_std::thread::sync_threads();
         for k in 0..BLOCK_SIZE {
+            // Typically, this would be a simple calculation:
+            // ```
+            // c_sub += As[ty][k] * Bs[k][tx];
+            // ```
+            // However, to improve numerical stability we use Kahan summation here, so that the error
+            // is isolated and not allowed to accumulate in c_sub
             unsafe {
-                c_sub += As[ty][k].assume_init() * Bs[k][tx].assume_init();
+                let input = As[ty][k].assume_init() * Bs[k][tx].assume_init();
+                let y = input - kahan_correction_factor;
+                let t = c_sub + y;
+
+                // It may seem like this correction factor must be zero; however, due to f32 precision
+                // limitations it captures the small error that would otherwise accumulate in c_sub
+                kahan_correction_factor = (t - c_sub) - y;
+                c_sub = t;
             }
         }
 
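For readers unfamiliar with the technique, the compensated (Kahan) summation added to the kernel can be sketched in plain host-side Rust. This is an illustrative sketch only, not part of the commit; the function name `kahan_sum` and the test values are made up for the example, but the update steps mirror the `y`/`t`/`kahan_correction_factor` logic above:

```rust
/// Minimal sketch of Kahan (compensated) summation over a slice of f32,
/// mirroring the per-element update the kernel performs on `c_sub`.
fn kahan_sum(values: &[f32]) -> f32 {
    let mut sum = 0.0f32;
    let mut correction = 0.0f32; // running compensation for lost low-order bits

    for &input in values {
        let y = input - correction; // re-apply the error captured last iteration
        let t = sum + y;            // low-order bits of y may be lost here
        correction = (t - sum) - y; // recover what was just lost
        sum = t;
    }
    sum
}

fn main() {
    // Summing many tiny values alongside a large one shows the benefit:
    // each 1e-8 addend is below the rounding threshold of 1.0 in f32.
    let mut values = vec![1.0f32];
    values.extend(std::iter::repeat(1e-8f32).take(10_000));

    let naive: f32 = values.iter().sum();
    println!("naive: {naive}, kahan: {}", kahan_sum(&values));
}
```

Here the naive f32 sum stays at 1.0 because every individual addition rounds away, while the compensated sum recovers a value close to 1.0001.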

samples/introduction/matmul/src/main.rs

Lines changed: 7 additions & 7 deletions
@@ -36,10 +36,10 @@ fn matrix_multiply(
         DeviceBuffer::from_slice(h_c.as_slice()).expect("device array couldn't be initialized!");
 
     stream.synchronize().expect("Stream couldn't synchronize!");
-    let threads = BlockSize::xy(block_size as u32, block_size as u32);
+    let blocks = BlockSize::xy(block_size as u32, block_size as u32);
     let grid = GridSize::xy(
-        (dims_b.0 / (threads.x as usize)).try_into().unwrap(),
-        (dims_a.1 / (threads.y as usize)).try_into().unwrap(),
+        (dims_b.0 / (blocks.x as usize)).try_into().unwrap(),
+        (dims_a.1 / (blocks.y as usize)).try_into().unwrap(),
     );
 
     println!("Computing result using CUDA Kernel...");
@@ -50,7 +50,7 @@ fn matrix_multiply(
         .expect("Kernel function not found!");
 
     unsafe {
-        launch!(matrix_mul_cuda<<<grid, threads, 0, stream>>>(
+        launch!(matrix_mul_cuda<<<grid, blocks, 0, stream>>>(
            d_c.as_device_ptr(),
            d_a.as_device_ptr(),
            d_b.as_device_ptr(),
@@ -70,7 +70,7 @@ fn matrix_multiply(
 
     for _ in 0..N_ITER {
        unsafe {
-            launch!(matrix_mul_cuda<<<grid, threads, 0, stream>>>(
+            launch!(matrix_mul_cuda<<<grid, blocks, 0, stream>>>(
               d_c.as_device_ptr(),
               d_a.as_device_ptr(),
               d_b.as_device_ptr(),
@@ -152,8 +152,8 @@ fn main() -> Result<(), cust::error::CudaError> {
     println!("Device Name: {}", device.name().unwrap());
 
     let block_size: u32 = 32;
-    let dims_a: (usize, usize, usize) = (block_size as usize, block_size as usize, 1);
-    let dims_b: (usize, usize, usize) = (block_size as usize, block_size as usize, 1);
+    let dims_a: (usize, usize, usize) = (40 * block_size as usize, 40 * block_size as usize, 1);
+    let dims_b: (usize, usize, usize) = (80 * block_size as usize, 40 * block_size as usize, 1);
 
     if dims_a.0 != dims_b.1 {
         panic!("Matrix multiplication not possible with the given dimensions!");
