chore: cargo fmt

madhav-madhusoodanan · madhav-madhusoodanan · commit f40c8ac0922a · 2025-12-11T03:02:12.000+05:30
diff --git a/samples/introduction/matmul/kernels/src/lib.rs b/samples/introduction/matmul/kernels/src/lib.rs
@@ -1,5 +1,5 @@
-use cuda_std::*;
 use core::mem::MaybeUninit;
+use cuda_std::*;
 
 // SAFETY: This function is unsafe because it dereferences raw pointers.
 #[kernel]
@@ -23,9 +23,12 @@ pub unsafe fn matrix_mul_cuda(C: *mut f32, A: *const f32, B: *const f32, wa: usi
 
     for a in (a_begin..=a_end).step_by(a_step) {
         #[address_space(shared)]
-        static mut As: [[MaybeUninit<f32>; BLOCK_SIZE]; BLOCK_SIZE] = [[const { MaybeUninit::uninit() }; BLOCK_SIZE]; BLOCK_SIZE];
+        static mut As: [[MaybeUninit<f32>; BLOCK_SIZE]; BLOCK_SIZE] =
+            [[const { MaybeUninit::uninit() }; BLOCK_SIZE]; BLOCK_SIZE];
+
         #[address_space(shared)]
-        static mut Bs: [[MaybeUninit<f32>; BLOCK_SIZE]; BLOCK_SIZE] = [[const { MaybeUninit::uninit() }; BLOCK_SIZE]; BLOCK_SIZE];
+        static mut Bs: [[MaybeUninit<f32>; BLOCK_SIZE]; BLOCK_SIZE] =
+            [[const { MaybeUninit::uninit() }; BLOCK_SIZE]; BLOCK_SIZE];
 
         // Load A and B matrices into shared memory
         unsafe {
@@ -48,5 +51,7 @@ pub unsafe fn matrix_mul_cuda(C: *mut f32, A: *const f32, B: *const f32, wa: usi
     }
 
     let c = wb * BLOCK_SIZE * by + BLOCK_SIZE * bx;
-    unsafe { *C.add((c + wb * ty + tx) as usize) = c_sub; }
-}
+    unsafe {
+        *C.add((c + wb * ty + tx) as usize) = c_sub;
+    }
+}
diff --git a/samples/introduction/matmul/src/main.rs b/samples/introduction/matmul/src/main.rs
@@ -8,7 +8,11 @@ use cust::stream::{Stream, StreamFlags};
 
 static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx"));
 
-fn matrix_multiply(block_size: usize, dims_a: (usize, usize, usize), dims_b: (usize, usize, usize)) -> Result<(), cust::error::CudaError> {
+fn matrix_multiply(
+    block_size: usize,
+    dims_a: (usize, usize, usize),
+    dims_b: (usize, usize, usize),
+) -> Result<(), cust::error::CudaError> {
     let dims_c = (dims_b.0, dims_a.1, 1);
     let size_a = dims_a.0 * dims_a.1;
     let h_a = LockedBuffer::new(&1.0f32, size_a).expect("host array couldn't be initialized!");
@@ -19,19 +23,24 @@ fn matrix_multiply(block_size: usize, dims_a: (usize, usize, usize), dims_b: (us
     let stream = Stream::new(StreamFlags::NON_BLOCKING, None).expect("Stream couldn't be init!");
 
     let size_c = dims_b.0 * dims_a.1;
-    let mut h_c =
-        LockedBuffer::new(&0.0f32, size_c).expect("host array couldn't be initialized!");
+    let mut h_c = LockedBuffer::new(&0.0f32, size_c).expect("host array couldn't be initialized!");
 
     let start_event = Event::new(EventFlags::DEFAULT)?;
     let stop_event = Event::new(EventFlags::DEFAULT)?;
 
-    let d_a = DeviceBuffer::from_slice(h_a.as_slice()).expect("device array couldn't be initialized!");
-    let d_b = DeviceBuffer::from_slice(h_b.as_slice()).expect("device array couldn't be initialized!");
-    let d_c = DeviceBuffer::from_slice(h_c.as_slice()).expect("device array couldn't be initialized!");
-    
+    let d_a =
+        DeviceBuffer::from_slice(h_a.as_slice()).expect("device array couldn't be initialized!");
+    let d_b =
+        DeviceBuffer::from_slice(h_b.as_slice()).expect("device array couldn't be initialized!");
+    let d_c =
+        DeviceBuffer::from_slice(h_c.as_slice()).expect("device array couldn't be initialized!");
+
     stream.synchronize().expect("Stream couldn't synchronize!");
     let threads = BlockSize::xy(block_size as u32, block_size as u32);
-    let grid = GridSize::xy((dims_b.0 / (threads.x as usize)).try_into().unwrap(), (dims_a.1 / (threads.y as usize)).try_into().unwrap());
+    let grid = GridSize::xy(
+        (dims_b.0 / (threads.x as usize)).try_into().unwrap(),
+        (dims_a.1 / (threads.y as usize)).try_into().unwrap(),
+    );
 
     println!("Computing result using CUDA Kernel...");
 
@@ -57,7 +66,7 @@ fn matrix_multiply(block_size: usize, dims_a: (usize, usize, usize), dims_b: (us
         .record(&stream)
         .expect("Failed to record start_event in the CUDA stream!");
 
-    const N_ITER : u32 = 300;
+    const N_ITER: u32 = 300;
 
     for _ in 0..N_ITER {
         unsafe {
@@ -75,22 +84,26 @@ fn matrix_multiply(block_size: usize, dims_a: (usize, usize, usize), dims_b: (us
         .record(&stream)
         .expect("Failed to record stop_event in the CUDA stream!");
 
-    stop_event.synchronize().expect("Stream couldn't synchronize!");
+    stop_event
+        .synchronize()
+        .expect("Stream couldn't synchronize!");
 
     let gpu_time: u128 = stop_event
         .elapsed(&start_event)
         .expect("Failed to calculate duration of GPU operations!")
         .as_micros();
 
     let avg_time = gpu_time as f32 / N_ITER as f32;
-    println!("Average time spent executing by the GPU: {} microseconds", avg_time);
+    println!(
+        "Average time spent executing by the GPU: {} microseconds",
+        avg_time
+    );
     let flops_per_matrix_mul = 2.0 * (dims_a.0 as f32) * (dims_a.1 as f32) * (dims_b.0 as f32);
     let giga_flops = (flops_per_matrix_mul / (avg_time)) / 1000.0;
     println!("Performance = {} GFlop/s", giga_flops);
 
-    unsafe{ 
-        d_c
-            .async_copy_to(&mut h_c, &stream)
+    unsafe {
+        d_c.async_copy_to(&mut h_c, &stream)
             .expect("Could not copy from device to host!");
     }
     stream.synchronize().expect("Stream couldn't synchronize!");
@@ -108,14 +121,22 @@ fn matrix_multiply(block_size: usize, dims_a: (usize, usize, usize), dims_b: (us
         let rel_err = abs_err / abs_val.max(dot_length * machine_epsilon);
 
         if rel_err > 1e-6 {
-            println!("Error at index {}: CPU = {}, GPU = {}, rel_err = {}", i, dims_a.0 as f32 * 0.01f32, h_c[i], rel_err);
+            println!(
+                "Error at index {}: CPU = {}, GPU = {}, rel_err = {}",
+                i,
+                dims_a.0 as f32 * 0.01f32,
+                h_c[i],
+                rel_err
+            );
             correct = false;
         }
     }
 
     if correct {
         println!("Result = PASS");
-        println!("NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.");
+        println!(
+            "NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled."
+        );
     } else {
         println!("Result = FAIL");
         return Err(cust::error::CudaError::UnknownError);
@@ -130,7 +151,7 @@ fn main() -> Result<(), cust::error::CudaError> {
     let device = Device::get_device(0).expect("Couldn't find Cuda supported devices!");
     println!("Device Name: {}", device.name().unwrap());
 
-    let block_size: u32  = 32;
+    let block_size: u32 = 32;
     let dims_a: (usize, usize, usize) = (block_size as usize, block_size as usize, 1);
     let dims_b: (usize, usize, usize) = (block_size as usize, block_size as usize, 1);