|
| 1 | +use cust::device::Device; |
| 2 | +use cust::event::{Event, EventFlags}; |
| 3 | +use cust::function::{BlockSize, GridSize}; |
| 4 | +use cust::launch; |
| 5 | +use cust::memory::{AsyncCopyDestination, DeviceBuffer, LockedBuffer}; |
| 6 | +use cust::module::Module; |
| 7 | +use cust::prelude::EventStatus; |
| 8 | +use cust::stream::{Stream, StreamFlags}; |
| 9 | +use std::time::Instant; |
| 10 | + |
// PTX for the matrix-multiply kernel, emitted into OUT_DIR at build time
// (presumably by a build script compiling the kernel sources — see build setup).
static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx"));
| 12 | + |
| 13 | +fn matrix_multiply(block_size: usize, dimsA: (usize, usize, usize), dimsB: (usize, usize, usize)) -> Result<(), cust::error::CudaError> { |
| 14 | + let dimsC = (dimsB.0, dimsA.1, 1); |
| 15 | + let size_a = dimsA.0 * dimsA.1; |
| 16 | + let h_a = LockedBuffer::new(&1.0f32, size_a).expect("host array couldn't be initialized!"); |
| 17 | + |
| 18 | + let size_b = dimsB.0 * dimsB.1; |
| 19 | + let h_b = LockedBuffer::new(&0.01f32, size_b).expect("host array couldn't be initialized!"); |
| 20 | + |
| 21 | + let stream = Stream::new(StreamFlags::NON_BLOCKING, None).expect("Stream couldn't be init!"); |
| 22 | + |
| 23 | + let mut size_c = dimsB.0 * dimsA.1; |
| 24 | + let mut h_c = |
| 25 | + LockedBuffer::new(&0.0f32, size_c).expect("host array couldn't be initialized!"); |
| 26 | + |
| 27 | + let start_event = Event::new(EventFlags::DEFAULT)?; |
| 28 | + let stop_event = Event::new(EventFlags::DEFAULT)?; |
| 29 | + |
| 30 | + let d_a = DeviceBuffer::from_slice(h_a.as_slice()).expect("device array couldn't be initialized!"); |
| 31 | + let d_b = DeviceBuffer::from_slice(h_b.as_slice()).expect("device array couldn't be initialized!"); |
| 32 | + let mut d_c = DeviceBuffer::from_slice(h_c.as_slice()).expect("device array couldn't be initialized!"); |
| 33 | + |
| 34 | + stream.synchronize().expect("Stream couldn't synchronize!"); |
| 35 | + let threads = BlockSize::xy(block_size as u32, block_size as u32); |
| 36 | + let grid = GridSize::xy((dimsB.0 / (threads.x as usize)).try_into().unwrap(), (dimsA.1 / (threads.y as usize)).try_into().unwrap()); |
| 37 | + |
| 38 | + println!("Computing result using CUDA Kernel..."); |
| 39 | + |
| 40 | + let module = Module::from_ptx(PTX, &[]).expect("Module couldn't be init!"); |
| 41 | + let matrix_mul_cuda = module |
| 42 | + .get_function("matrix_mul_cuda") |
| 43 | + .expect("Kernel function not found!"); |
| 44 | + |
| 45 | + unsafe { |
| 46 | + launch!(matrix_mul_cuda<<<grid, threads, 0, stream>>>( |
| 47 | + d_c.as_device_ptr(), |
| 48 | + d_a.as_device_ptr(), |
| 49 | + d_b.as_device_ptr(), |
| 50 | + dimsA.0 as u32, |
| 51 | + dimsB.0 as u32 |
| 52 | + ))?; |
| 53 | + } |
| 54 | + |
| 55 | + println!("Done!"); |
| 56 | + stream.synchronize().expect("Stream couldn't synchronize!"); |
| 57 | + |
| 58 | + start_event |
| 59 | + .record(&stream) |
| 60 | + .expect("Failed to record start_event in the CUDA stream!"); |
| 61 | + |
| 62 | + const N_ITER : u32 = 300; |
| 63 | + |
| 64 | + for _ in 0..N_ITER { |
| 65 | + unsafe { |
| 66 | + launch!(matrix_mul_cuda<<<grid, threads, 0, stream>>>( |
| 67 | + d_c.as_device_ptr(), |
| 68 | + d_a.as_device_ptr(), |
| 69 | + d_b.as_device_ptr(), |
| 70 | + dimsA.0 as u32, |
| 71 | + dimsB.0 as u32, |
| 72 | + ))?; |
| 73 | + } |
| 74 | + } |
| 75 | + |
| 76 | + stop_event |
| 77 | + .record(&stream) |
| 78 | + .expect("Failed to record stop_event in the CUDA stream!"); |
| 79 | + |
| 80 | + stop_event.synchronize().expect("Stream couldn't synchronize!"); |
| 81 | + |
| 82 | + let gpu_time: u128 = stop_event |
| 83 | + .elapsed(&start_event) |
| 84 | + .expect("Failed to calculate duration of GPU operations!") |
| 85 | + .as_micros(); |
| 86 | + |
| 87 | + let avg_time = gpu_time as f32 / N_ITER as f32; |
| 88 | + println!("Average time spent executing by the GPU: {} microseconds", avg_time); |
| 89 | + let flopsPerMatrixMul = 2.0 * (dimsA.0 as f32) * (dimsA.1 as f32) * (dimsB.0 as f32); |
| 90 | + let gigaFlops = (flopsPerMatrixMul / (avg_time)) / 1000.0; |
| 91 | + println!("Performance = {} GFlop/s", gigaFlops); |
| 92 | + |
| 93 | + // checking computed result |
| 94 | + // test relative error by the formula |
| 95 | + // |<x, y>_cpu - <x, y>_gpu| / |<x, y>_cpu| |
| 96 | + let machine_epsilon = 1.19209290E-07f32; |
| 97 | + let mut correct = true; |
| 98 | + |
| 99 | + for i in 0..(dimsC.0 * dimsC.1) { |
| 100 | + let abs_err = (h_c[i] - (dimsA.0 as f32 * 0.01f32)).abs(); |
| 101 | + let dot_length = (dimsA.0 as f32).abs(); |
| 102 | + let abs_val = h_c[i].abs(); |
| 103 | + let rel_err = abs_err / abs_val.max(dot_length * machine_epsilon); |
| 104 | + |
| 105 | + if rel_err > 1e-6 { |
| 106 | + println!("Error at index {}: CPU = {}, GPU = {}, rel_err = {}", i, dimsA.0 as f32 * 0.01f32, h_c[i], rel_err); |
| 107 | + correct = false; |
| 108 | + } |
| 109 | + } |
| 110 | + |
| 111 | + if correct { |
| 112 | + println!("Result = PASS"); |
| 113 | + println!("NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled."); |
| 114 | + } else { |
| 115 | + println!("Result = FAIL"); |
| 116 | + return Err(cust::error::CudaError::UnknownError); |
| 117 | + } |
| 118 | + |
| 119 | + Ok(()) |
| 120 | +} |
| 121 | + |
| 122 | +fn main() -> Result<(), cust::error::CudaError> { |
| 123 | + // Set up the context, load the module, and create a stream to run kernels in. |
| 124 | + let _ctx = cust::quick_init(); |
| 125 | + let device = Device::get_device(0).expect("Couldn't find Cuda supported devices!"); |
| 126 | + println!("Device Name: {}", device.name().unwrap()); |
| 127 | + |
| 128 | + let block_size: u32 = 32; |
| 129 | + let dimsA: (usize, usize, usize) = (block_size as usize, block_size as usize, 1); |
| 130 | + let dimsB: (usize, usize, usize) = (block_size as usize, block_size as usize, 1); |
| 131 | + |
| 132 | + if dimsA.0 != dimsB.1 { |
| 133 | + panic!("Matrix multiplication not possible with the given dimensions!"); |
| 134 | + } |
| 135 | + |
| 136 | + matrix_multiply(block_size as usize, dimsA, dimsB); |
| 137 | + |
| 138 | + Ok(()) |
| 139 | +} |
0 commit comments