|
| 1 | +use cust::device::Device; |
| 2 | +use cust::event::{Event, EventFlags}; |
| 3 | +use cust::function::{BlockSize, GridSize}; |
| 4 | +use cust::launch; |
| 5 | +use cust::memory::{AsyncCopyDestination, DeviceBuffer, LockedBuffer}; |
| 6 | +use cust::module::Module; |
| 7 | +use cust::prelude::EventStatus; |
| 8 | +use cust::stream::{Stream, StreamFlags}; |
| 9 | +use std::time::Instant; |
| 10 | + |
// PTX for the matrix-multiply kernel, emitted into OUT_DIR at build time
// (presumably by a build script compiling the kernel sources — see build setup).
static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx"));
| 12 | + |
| 13 | +fn matrix_multiply(block_size: usize, dimsA: (usize, usize, usize), dimsB: (usize, usize, usize)) -> Result<(), cust::error::CudaError> { |
| 14 | + let dimsC = (dimsB.0, dimsA.1, 1); |
| 15 | + let size_a = dimsA.0 * dimsA.1; |
| 16 | + let h_a = LockedBuffer::new(&1.0f32, size_a).expect("host array couldn't be initialized!"); |
| 17 | + |
| 18 | + let size_b = dimsB.0 * dimsB.1; |
| 19 | + let h_b = LockedBuffer::new(&0.01f32, size_b).expect("host array couldn't be initialized!"); |
| 20 | + |
| 21 | + let stream = Stream::new(StreamFlags::NON_BLOCKING, None).expect("Stream couldn't be init!"); |
| 22 | + |
| 23 | + let mut size_c = dimsB.0 * dimsA.1; |
| 24 | + let mut h_c = |
| 25 | + LockedBuffer::new(&0.0f32, size_c).expect("host array couldn't be initialized!"); |
| 26 | + |
| 27 | + let start_event = Event::new(EventFlags::DEFAULT)?; |
| 28 | + let stop_event = Event::new(EventFlags::DEFAULT)?; |
| 29 | + |
| 30 | + let d_a = DeviceBuffer::from_slice(h_a.as_slice()).expect("device array couldn't be initialized!"); |
| 31 | + let d_b = DeviceBuffer::from_slice(h_b.as_slice()).expect("device array couldn't be initialized!"); |
| 32 | + let mut d_c = DeviceBuffer::from_slice(h_c.as_slice()).expect("device array couldn't be initialized!"); |
| 33 | + |
| 34 | + stream.synchronize().expect("Stream couldn't synchronize!"); |
| 35 | + let threads = BlockSize::xy(block_size as u32, block_size as u32); |
| 36 | + let grid = GridSize::xy((dimsB.0 / (threads.x as usize)).try_into().unwrap(), (dimsA.1 / (threads.y as usize)).try_into().unwrap()); |
| 37 | + |
| 38 | + println!("Computing result using CUDA Kernel..."); |
| 39 | + |
| 40 | + let module = Module::from_ptx(PTX, &[]).expect("Module couldn't be init!"); |
| 41 | + let matrix_mul_cuda = module |
| 42 | + .get_function("matrix_mul_cuda") |
| 43 | + .expect("Kernel function not found!"); |
| 44 | + |
| 45 | + unsafe { |
| 46 | + launch!(matrix_mul_cuda<<<grid, threads, 0, stream>>>( |
| 47 | + d_c.as_device_ptr(), |
| 48 | + d_a.as_device_ptr(), |
| 49 | + d_b.as_device_ptr(), |
| 50 | + dimsA.0 as u32, |
| 51 | + dimsB.0 as u32 |
| 52 | + ))?; |
| 53 | + } |
| 54 | + |
| 55 | + println!("Done!"); |
| 56 | + stream.synchronize().expect("Stream couldn't synchronize!"); |
| 57 | + |
| 58 | + start_event |
| 59 | + .record(&stream) |
| 60 | + .expect("Failed to record start_event in the CUDA stream!"); |
| 61 | + |
| 62 | + const N_ITER : u32 = 300; |
| 63 | + |
| 64 | + for _ in 0..N_ITER { |
| 65 | + unsafe { |
| 66 | + launch!(matrix_mul_cuda<<<grid, threads, 0, stream>>>( |
| 67 | + d_c.as_device_ptr(), |
| 68 | + d_a.as_device_ptr(), |
| 69 | + d_b.as_device_ptr(), |
| 70 | + dimsA.0 as u32, |
| 71 | + dimsB.0 as u32, |
| 72 | + ))?; |
| 73 | + } |
| 74 | + } |
| 75 | + |
| 76 | + stop_event |
| 77 | + .record(&stream) |
| 78 | + .expect("Failed to record stop_event in the CUDA stream!"); |
| 79 | + |
| 80 | + stop_event.synchronize().expect("Stream couldn't synchronize!"); |
| 81 | + |
| 82 | + let gpu_time: u128 = stop_event |
| 83 | + .elapsed(&start_event) |
| 84 | + .expect("Failed to calculate duration of GPU operations!") |
| 85 | + .as_micros(); |
| 86 | + |
| 87 | + let avg_time = gpu_time as f32 / N_ITER as f32; |
| 88 | + println!("Average time spent executing by the GPU: {} microseconds", avg_time); |
| 89 | + let flopsPerMatrixMul = 2.0 * (dimsA.0 as f32) * (dimsA.1 as f32) * (dimsB.0 as f32); |
| 90 | + let gigaFlops = (flopsPerMatrixMul / (avg_time)) / 1000.0; |
| 91 | + println!("Performance = {} GFlop/s", gigaFlops); |
| 92 | + |
| 93 | + // checking computed result |
| 94 | + // test relative error by the formula |
| 95 | + // |<x, y>_cpu - <x, y>_gpu| / |<x, y>_cpu| |
| 96 | + let machine_epsilon = 1.19209290E-07f32; |
| 97 | + let mut correct = true; |
| 98 | + |
| 99 | + for i in 0..(dimsC.0 * dimsC.1) { |
| 100 | + let abs_err = (h_c[i] - (dimsA.0 as f32 * 0.01f32)).abs(); |
| 101 | + let dot_length = (dimsA.0 as f32).abs(); |
| 102 | + let abs_val = h_c[i].abs(); |
| 103 | + let rel_err = abs_err / abs_val.max(dot_length * machine_epsilon); |
| 104 | + |
| 105 | + if rel_err > 1e-6 { |
| 106 | + println!("Error at index {}: CPU = {}, GPU = {}, rel_err = {}", i, dimsA.0 as f32 * 0.01f32, h_c[i], rel_err); |
| 107 | + correct = false; |
| 108 | + } |
| 109 | + } |
| 110 | + |
| 111 | + if correct { |
| 112 | + println!("Result = PASS"); |
| 113 | + println!("NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled."); |
| 114 | + } else { |
| 115 | + println!("Result = FAIL"); |
| 116 | + return Err(cust::error::CudaError::UnknownError); |
| 117 | + } |
| 118 | + |
| 119 | + Ok(()) |
| 120 | +} |
| 121 | + |
| 122 | +fn main() -> Result<(), cust::error::CudaError> { |
| 123 | + // Set up the context, load the module, and create a stream to run kernels in. |
| 124 | + let _ctx = cust::quick_init(); |
| 125 | + let device = Device::get_device(0).expect("Couldn't find Cuda supported devices!"); |
| 126 | + println!("Device Name: {}", device.name().unwrap()); |
| 127 | + |
| 128 | + let block_size: u32 = 32; |
| 129 | + let dimsA: (usize, usize, usize) = (block_size as usize, block_size as usize, 1); |
| 130 | + let dimsB: (usize, usize, usize) = (block_size as usize, block_size as usize, 1); |
| 131 | + |
| 132 | + if dimsA.0 != dimsB.1 { |
| 133 | + panic!("Matrix multiplication not possible with the given dimensions!"); |
| 134 | + } |
| 135 | + |
| 136 | + matrix_multiply(block_size as usize, dimsA, dimsB); |
| 137 | + |
| 138 | + Ok(()) |
| 139 | +} |
0 commit comments