Skip to content

Commit 80cf8ff

Browse files
chore: completed matmul/main.rs
1 parent f37cc7b commit 80cf8ff

File tree

1 file changed

+139
-0
lines changed
  • samples/introduction/matmul/src

1 file changed

+139
-0
lines changed
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
use cust::device::Device;
2+
use cust::event::{Event, EventFlags};
3+
use cust::function::{BlockSize, GridSize};
4+
use cust::launch;
5+
use cust::memory::{AsyncCopyDestination, DeviceBuffer, LockedBuffer};
6+
use cust::module::Module;
7+
use cust::prelude::EventStatus;
8+
use cust::stream::{Stream, StreamFlags};
9+
use std::time::Instant;
10+
11+
static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx"));
12+
13+
fn matrix_multiply(block_size: usize, dimsA: (usize, usize, usize), dimsB: (usize, usize, usize)) -> Result<(), cust::error::CudaError> {
14+
let dimsC = (dimsB.0, dimsA.1, 1);
15+
let size_a = dimsA.0 * dimsA.1;
16+
let h_a = LockedBuffer::new(&1.0f32, size_a).expect("host array couldn't be initialized!");
17+
18+
let size_b = dimsB.0 * dimsB.1;
19+
let h_b = LockedBuffer::new(&0.01f32, size_b).expect("host array couldn't be initialized!");
20+
21+
let stream = Stream::new(StreamFlags::NON_BLOCKING, None).expect("Stream couldn't be init!");
22+
23+
let mut size_c = dimsB.0 * dimsA.1;
24+
let mut h_c =
25+
LockedBuffer::new(&0.0f32, size_c).expect("host array couldn't be initialized!");
26+
27+
let start_event = Event::new(EventFlags::DEFAULT)?;
28+
let stop_event = Event::new(EventFlags::DEFAULT)?;
29+
30+
let d_a = DeviceBuffer::from_slice(h_a.as_slice()).expect("device array couldn't be initialized!");
31+
let d_b = DeviceBuffer::from_slice(h_b.as_slice()).expect("device array couldn't be initialized!");
32+
let mut d_c = DeviceBuffer::from_slice(h_c.as_slice()).expect("device array couldn't be initialized!");
33+
34+
stream.synchronize().expect("Stream couldn't synchronize!");
35+
let threads = BlockSize::xy(block_size as u32, block_size as u32);
36+
let grid = GridSize::xy((dimsB.0 / (threads.x as usize)).try_into().unwrap(), (dimsA.1 / (threads.y as usize)).try_into().unwrap());
37+
38+
println!("Computing result using CUDA Kernel...");
39+
40+
let module = Module::from_ptx(PTX, &[]).expect("Module couldn't be init!");
41+
let matrix_mul_cuda = module
42+
.get_function("matrix_mul_cuda")
43+
.expect("Kernel function not found!");
44+
45+
unsafe {
46+
launch!(matrix_mul_cuda<<<grid, threads, 0, stream>>>(
47+
d_c.as_device_ptr(),
48+
d_a.as_device_ptr(),
49+
d_b.as_device_ptr(),
50+
dimsA.0 as u32,
51+
dimsB.0 as u32
52+
))?;
53+
}
54+
55+
println!("Done!");
56+
stream.synchronize().expect("Stream couldn't synchronize!");
57+
58+
start_event
59+
.record(&stream)
60+
.expect("Failed to record start_event in the CUDA stream!");
61+
62+
const N_ITER : u32 = 300;
63+
64+
for _ in 0..N_ITER {
65+
unsafe {
66+
launch!(matrix_mul_cuda<<<grid, threads, 0, stream>>>(
67+
d_c.as_device_ptr(),
68+
d_a.as_device_ptr(),
69+
d_b.as_device_ptr(),
70+
dimsA.0 as u32,
71+
dimsB.0 as u32,
72+
))?;
73+
}
74+
}
75+
76+
stop_event
77+
.record(&stream)
78+
.expect("Failed to record stop_event in the CUDA stream!");
79+
80+
stop_event.synchronize().expect("Stream couldn't synchronize!");
81+
82+
let gpu_time: u128 = stop_event
83+
.elapsed(&start_event)
84+
.expect("Failed to calculate duration of GPU operations!")
85+
.as_micros();
86+
87+
let avg_time = gpu_time as f32 / N_ITER as f32;
88+
println!("Average time spent executing by the GPU: {} microseconds", avg_time);
89+
let flopsPerMatrixMul = 2.0 * (dimsA.0 as f32) * (dimsA.1 as f32) * (dimsB.0 as f32);
90+
let gigaFlops = (flopsPerMatrixMul / (avg_time)) / 1000.0;
91+
println!("Performance = {} GFlop/s", gigaFlops);
92+
93+
// checking computed result
94+
// test relative error by the formula
95+
// |<x, y>_cpu - <x, y>_gpu| / |<x, y>_cpu|
96+
let machine_epsilon = 1.19209290E-07f32;
97+
let mut correct = true;
98+
99+
for i in 0..(dimsC.0 * dimsC.1) {
100+
let abs_err = (h_c[i] - (dimsA.0 as f32 * 0.01f32)).abs();
101+
let dot_length = (dimsA.0 as f32).abs();
102+
let abs_val = h_c[i].abs();
103+
let rel_err = abs_err / abs_val.max(dot_length * machine_epsilon);
104+
105+
if rel_err > 1e-6 {
106+
println!("Error at index {}: CPU = {}, GPU = {}, rel_err = {}", i, dimsA.0 as f32 * 0.01f32, h_c[i], rel_err);
107+
correct = false;
108+
}
109+
}
110+
111+
if correct {
112+
println!("Result = PASS");
113+
println!("NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.");
114+
} else {
115+
println!("Result = FAIL");
116+
return Err(cust::error::CudaError::UnknownError);
117+
}
118+
119+
Ok(())
120+
}
121+
122+
fn main() -> Result<(), cust::error::CudaError> {
123+
// Set up the context, load the module, and create a stream to run kernels in.
124+
let _ctx = cust::quick_init();
125+
let device = Device::get_device(0).expect("Couldn't find Cuda supported devices!");
126+
println!("Device Name: {}", device.name().unwrap());
127+
128+
let block_size: u32 = 32;
129+
let dimsA: (usize, usize, usize) = (block_size as usize, block_size as usize, 1);
130+
let dimsB: (usize, usize, usize) = (block_size as usize, block_size as usize, 1);
131+
132+
if dimsA.0 != dimsB.1 {
133+
panic!("Matrix multiplication not possible with the given dimensions!");
134+
}
135+
136+
matrix_multiply(block_size as usize, dimsA, dimsB);
137+
138+
Ok(())
139+
}

0 commit comments

Comments
 (0)