|
/* This example demonstrates an implementation of matrix multiplication.
 *
 * 1. The matrices are first created on the host side and then copied to the device.
 * 2. A shared piece of block-specific memory is created (on the device side), so that summation can be done very quickly.
 * 3. The result is copied back to the host, where the accumulated error shows up.
 * 4. Extra: The error that accumulates during the summation process is reduced (in the kernel itself) using the [Kahan summation algorithm](https://en.wikipedia.org/wiki/Kahan_summation_algorithm).
 */
| 8 | + |
| 9 | +use cuda_std::glam::USizeVec2; |
| 10 | +use cust::device::Device; |
| 11 | +use cust::event::{Event, EventFlags}; |
| 12 | +use cust::function::{BlockSize, GridSize}; |
| 13 | +use cust::launch; |
| 14 | +use cust::memory::{AsyncCopyDestination, DeviceBuffer, LockedBuffer}; |
| 15 | +use cust::module::Module; |
| 16 | +use cust::stream::{Stream, StreamFlags}; |
| 17 | + |
| 18 | +static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx")); |
| 19 | + |
| 20 | +fn matrix_multiply( |
| 21 | + block_size: usize, |
| 22 | + dims_a: USizeVec2, |
| 23 | + dims_b: USizeVec2, |
| 24 | +) -> Result<(), cust::error::CudaError> { |
| 25 | + let dims_c = USizeVec2::new(dims_b.x, dims_a.y); |
| 26 | + let size_a = dims_a.x * dims_a.y; |
| 27 | + let h_a = LockedBuffer::new(&1.0f32, size_a).expect("host array couldn't be initialized!"); |
| 28 | + |
| 29 | + let size_b = dims_b.x * dims_b.y; |
| 30 | + let h_b = LockedBuffer::new(&0.01f32, size_b).expect("host arrray couldn't be initialized!"); |
| 31 | + |
| 32 | + let stream = Stream::new(StreamFlags::NON_BLOCKING, None).expect("Stream couldn't be init!"); |
| 33 | + |
| 34 | + let size_c = dims_b.x * dims_a.y; |
| 35 | + let mut h_c = LockedBuffer::new(&0.0f32, size_c).expect("host array couldn't be initialized!"); |
| 36 | + |
| 37 | + let start_event = Event::new(EventFlags::DEFAULT)?; |
| 38 | + let stop_event = Event::new(EventFlags::DEFAULT)?; |
| 39 | + |
| 40 | + let d_a = |
| 41 | + DeviceBuffer::from_slice(h_a.as_slice()).expect("device array couldn't be initialized!"); |
| 42 | + let d_b = |
| 43 | + DeviceBuffer::from_slice(h_b.as_slice()).expect("device array couldn't be initialized!"); |
| 44 | + let d_c = |
| 45 | + DeviceBuffer::from_slice(h_c.as_slice()).expect("device array couldn't be initialized!"); |
| 46 | + |
| 47 | + stream.synchronize().expect("Stream couldn't synchronize!"); |
| 48 | + let threads = BlockSize::xy(block_size as u32, block_size as u32); |
| 49 | + let grid = GridSize::xy( |
| 50 | + (dims_b.x / (threads.x as usize)).try_into().unwrap(), |
| 51 | + (dims_a.y / (threads.y as usize)).try_into().unwrap(), |
| 52 | + ); |
| 53 | + |
| 54 | + println!("Computing result using CUDA Kernel..."); |
| 55 | + |
| 56 | + let module = Module::from_ptx(PTX, &[]).expect("Module couldn't be init!"); |
| 57 | + let matrix_mul_cuda = module |
| 58 | + .get_function("matrix_mul_cuda") |
| 59 | + .expect("Kernel function not found!"); |
| 60 | + |
| 61 | + unsafe { |
| 62 | + // The function definition of the kernel is: |
| 63 | + // ``` |
| 64 | + // pub unsafe fn matrix_mul_cuda(c: *mut f32, a: &[f32], b: &[f32], wa: usize, wb: usize) |
| 65 | + // ``` |
| 66 | + // For elements that have the type `*mut T` or `*const T`, we'll need to pass only the device pointer. |
| 67 | + // For elements that have the type `&[T]`, we must pass the device pointer as well as the length of the slice. |
| 68 | + launch!(matrix_mul_cuda<<<grid, threads, 0, stream>>>( |
| 69 | + d_c.as_device_ptr(), |
| 70 | + d_a.as_device_ptr(), |
| 71 | + d_a.len(), |
| 72 | + d_b.as_device_ptr(), |
| 73 | + d_b.len(), |
| 74 | + dims_a.x as usize, |
| 75 | + dims_b.x as usize |
| 76 | + ))?; |
| 77 | + } |
| 78 | + |
| 79 | + println!("Done!"); |
| 80 | + stream.synchronize().expect("Stream couldn't synchronize!"); |
| 81 | + |
| 82 | + start_event |
| 83 | + .record(&stream) |
| 84 | + .expect("Failed to record start_event in the CUDA stream!"); |
| 85 | + |
| 86 | + const N_ITER: u32 = 300; |
| 87 | + |
| 88 | + for _ in 0..N_ITER { |
| 89 | + unsafe { |
| 90 | + launch!(matrix_mul_cuda<<<grid, threads, 0, stream>>>( |
| 91 | + d_c.as_device_ptr(), |
| 92 | + d_a.as_device_ptr(), |
| 93 | + d_a.len(), |
| 94 | + d_b.as_device_ptr(), |
| 95 | + d_b.len(), |
| 96 | + dims_a.x as usize, |
| 97 | + dims_b.x as usize, |
| 98 | + ))?; |
| 99 | + } |
| 100 | + } |
| 101 | + |
| 102 | + stop_event |
| 103 | + .record(&stream) |
| 104 | + .expect("Failed to record stop_event in the CUDA stream!"); |
| 105 | + |
| 106 | + stop_event |
| 107 | + .synchronize() |
| 108 | + .expect("Stream couldn't synchronize!"); |
| 109 | + |
| 110 | + let gpu_time: u128 = stop_event |
| 111 | + .elapsed(&start_event) |
| 112 | + .expect("Failed to calculate duration of GPU operations!") |
| 113 | + .as_micros(); |
| 114 | + |
| 115 | + let avg_time = gpu_time as f32 / N_ITER as f32; |
| 116 | + println!( |
| 117 | + "Average time spent executing by the GPU: {} microseconds", |
| 118 | + avg_time |
| 119 | + ); |
| 120 | + let flops_per_matrix_mul = 2.0 * (dims_a.x as f32) * (dims_a.y as f32) * (dims_b.x as f32); |
| 121 | + let giga_flops = (flops_per_matrix_mul / (avg_time)) / 1000.0; |
| 122 | + println!("Performance = {} GFlop/s", giga_flops); |
| 123 | + |
| 124 | + unsafe { |
| 125 | + d_c.async_copy_to(&mut h_c, &stream) |
| 126 | + .expect("Could not copy from device to host!"); |
| 127 | + } |
| 128 | + stream.synchronize().expect("Stream couldn't synchronize!"); |
| 129 | + |
| 130 | + // checking computed result |
| 131 | + // test relative error by the formula |
| 132 | + // |<x, y>_cpu - <x, y>_gpu| / |<x, y>_cpu| |
| 133 | + let machine_epsilon = 1.19209290E-07f32; |
| 134 | + let mut correct = true; |
| 135 | + |
| 136 | + for i in 0..(dims_c.x * dims_c.y) { |
| 137 | + let abs_err = (h_c[i] - (dims_a.x as f32 * 0.01f32)).abs(); |
| 138 | + let dot_length = (dims_a.x as f32).abs(); |
| 139 | + let abs_val = h_c[i].abs(); |
| 140 | + let rel_err = abs_err / abs_val.max(dot_length * machine_epsilon); |
| 141 | + |
| 142 | + if rel_err > 1e-6 { |
| 143 | + println!( |
| 144 | + "Error at index {}: CPU = {}, GPU = {}, rel_err = {}", |
| 145 | + i, |
| 146 | + dims_a.x as f32 * 0.01f32, |
| 147 | + h_c[i], |
| 148 | + rel_err |
| 149 | + ); |
| 150 | + correct = false; |
| 151 | + } |
| 152 | + } |
| 153 | + |
| 154 | + if correct { |
| 155 | + println!("Result = PASS"); |
| 156 | + println!( |
| 157 | + "NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled." |
| 158 | + ); |
| 159 | + } else { |
| 160 | + println!("Result = FAIL"); |
| 161 | + return Err(cust::error::CudaError::UnknownError); |
| 162 | + } |
| 163 | + |
| 164 | + Ok(()) |
| 165 | +} |
| 166 | + |
| 167 | +fn main() -> Result<(), cust::error::CudaError> { |
| 168 | + // Set up the context, load the module, and create a stream to run kernels in. |
| 169 | + let _ctx = cust::quick_init(); |
| 170 | + let device = Device::get_device(0).expect("Couldn't find Cuda supported devices!"); |
| 171 | + println!("Device Name: {}", device.name().unwrap()); |
| 172 | + |
| 173 | + let block_size: u32 = 32; |
| 174 | + let dims_a = USizeVec2::new(40 * block_size as usize, 40 * block_size as usize); |
| 175 | + let dims_b = USizeVec2::new(80 * block_size as usize, 40 * block_size as usize); |
| 176 | + |
| 177 | + if dims_a.x != dims_b.y { |
| 178 | + panic!("Matrix multiplication not possible with the given dimensions!"); |
| 179 | + } |
| 180 | + |
| 181 | + matrix_multiply(block_size as usize, dims_a, dims_b)?; |
| 182 | + Ok(()) |
| 183 | +} |
0 commit comments