fix: added shared memory space for matrix multiplication calculation

madhav-madhusoodanan · madhav-madhusoodanan · commit f37cc7b51e54 · 2025-12-11T02:49:52.000+05:30
diff --git a/samples/introduction/matmul/kernels/src/lib.rs b/samples/introduction/matmul/kernels/src/lib.rs
@@ -0,0 +1,52 @@
+use cuda_std::*;
+use core::mem::MaybeUninit;
+
+/// Tiled matrix multiply: each thread accumulates and stores one element of `C`.
+///
+/// # Safety
+/// `A`, `B` and `C` are dereferenced at offsets derived from `wa`, `wb` and the
+/// block/thread indices with no bounds checks; the caller must keep them valid.
+/// Launch with `BLOCK_SIZE` x `BLOCK_SIZE` thread blocks so every tile slot is written.
+#[kernel]
+pub unsafe fn matrix_mul_cuda(C: *mut f32, A: *const f32, B: *const f32, wa: usize, wb: usize) {
+    const BLOCK_SIZE: usize = 32;
+
+    // Staging tiles for one sub-matrix of A and of B, declared in the shared address space.
+    #[address_space(shared)]
+    static mut TILE_A: [[MaybeUninit<f32>; BLOCK_SIZE]; BLOCK_SIZE] = [[const { MaybeUninit::uninit() }; BLOCK_SIZE]; BLOCK_SIZE];
+    #[address_space(shared)]
+    static mut TILE_B: [[MaybeUninit<f32>; BLOCK_SIZE]; BLOCK_SIZE] = [[const { MaybeUninit::uninit() }; BLOCK_SIZE]; BLOCK_SIZE];
+
+    let bx = cuda_std::thread::block_idx().x as usize;
+    let by = cuda_std::thread::block_idx().y as usize;
+    let tx = cuda_std::thread::thread_idx().x as usize;
+    let ty = cuda_std::thread::thread_idx().y as usize;
+
+    // Offset of this block's first A tile, and the stride between successive B tiles.
+    let a_begin = wa * BLOCK_SIZE * by;
+    let b_step = BLOCK_SIZE * wb;
+    let mut c_sub: f32 = 0.0;
+    let mut b = BLOCK_SIZE * bx; // offset of the current B tile
+
+    // Exclusive bound: an inclusive `a_begin + wa - 1` would underflow when `wa == 0`.
+    for a in (a_begin..a_begin + wa).step_by(BLOCK_SIZE) {
+        // SAFETY: offsets are valid per `# Safety`; each thread writes only its own slot.
+        unsafe {
+            TILE_A[ty][tx].write(*A.add(a + wa * ty + tx));
+            TILE_B[ty][tx].write(*B.add(b + wb * ty + tx));
+        }
+        // Synchronize so both tiles are fully loaded before any thread reads them.
+        cuda_std::thread::sync_threads();
+        for k in 0..BLOCK_SIZE {
+            // SAFETY: row `ty` of TILE_A and column `tx` of TILE_B were written before the sync.
+            c_sub += unsafe { TILE_A[ty][k].assume_init() * TILE_B[k][tx].assume_init() };
+        }
+        // Synchronize so this tile pair is fully consumed before the next load overwrites it.
+        cuda_std::thread::sync_threads();
+        b += b_step;
+    }
+
+    let c = wb * BLOCK_SIZE * by + BLOCK_SIZE * bx;
+    // SAFETY: the caller guarantees `C` is valid for this thread's output offset.
+    unsafe { *C.add(c + wb * ty + tx) = c_sub; }
+}