Skip to content

Commit d2a0874

Browse files
fix: code cleanup and stream synchronization after copying C from device to host memory
1 parent dc9592b commit d2a0874

File tree

1 file changed

+23
-25
lines changed
  • samples/introduction/matmul/src

1 file changed

+23
-25
lines changed

samples/introduction/matmul/src/main.rs

Lines changed: 23 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -4,23 +4,21 @@ use cust::function::{BlockSize, GridSize};
44
use cust::launch;
55
use cust::memory::{AsyncCopyDestination, DeviceBuffer, LockedBuffer};
66
use cust::module::Module;
7-
use cust::prelude::EventStatus;
87
use cust::stream::{Stream, StreamFlags};
9-
use std::time::Instant;
108

119
static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx"));
1210

13-
fn matrix_multiply(block_size: usize, dimsA: (usize, usize, usize), dimsB: (usize, usize, usize)) -> Result<(), cust::error::CudaError> {
14-
let dimsC = (dimsB.0, dimsA.1, 1);
15-
let size_a = dimsA.0 * dimsA.1;
11+
fn matrix_multiply(block_size: usize, dims_a: (usize, usize, usize), dims_b: (usize, usize, usize)) -> Result<(), cust::error::CudaError> {
12+
let dims_c = (dims_b.0, dims_a.1, 1);
13+
let size_a = dims_a.0 * dims_a.1;
1614
let h_a = LockedBuffer::new(&1.0f32, size_a).expect("host array couldn't be initialized!");
1715

18-
let size_b = dimsB.0 * dimsB.1;
16+
let size_b = dims_b.0 * dims_b.1;
1917
let h_b = LockedBuffer::new(&0.01f32, size_b).expect("host array couldn't be initialized!");
2018

2119
let stream = Stream::new(StreamFlags::NON_BLOCKING, None).expect("Stream couldn't be init!");
2220

23-
let mut size_c = dimsB.0 * dimsA.1;
21+
let size_c = dims_b.0 * dims_a.1;
2422
let mut h_c =
2523
LockedBuffer::new(&0.0f32, size_c).expect("host array couldn't be initialized!");
2624

@@ -29,11 +27,11 @@ fn matrix_multiply(block_size: usize, dimsA: (usize, usize, usize), dimsB: (usiz
2927

3028
let d_a = DeviceBuffer::from_slice(h_a.as_slice()).expect("device array couldn't be initialized!");
3129
let d_b = DeviceBuffer::from_slice(h_b.as_slice()).expect("device array couldn't be initialized!");
32-
let mut d_c = DeviceBuffer::from_slice(h_c.as_slice()).expect("device array couldn't be initialized!");
30+
let d_c = DeviceBuffer::from_slice(h_c.as_slice()).expect("device array couldn't be initialized!");
3331

3432
stream.synchronize().expect("Stream couldn't synchronize!");
3533
let threads = BlockSize::xy(block_size as u32, block_size as u32);
36-
let grid = GridSize::xy((dimsB.0 / (threads.x as usize)).try_into().unwrap(), (dimsA.1 / (threads.y as usize)).try_into().unwrap());
34+
let grid = GridSize::xy((dims_b.0 / (threads.x as usize)).try_into().unwrap(), (dims_a.1 / (threads.y as usize)).try_into().unwrap());
3735

3836
println!("Computing result using CUDA Kernel...");
3937

@@ -47,8 +45,8 @@ fn matrix_multiply(block_size: usize, dimsA: (usize, usize, usize), dimsB: (usiz
4745
d_c.as_device_ptr(),
4846
d_a.as_device_ptr(),
4947
d_b.as_device_ptr(),
50-
dimsA.0 as usize,
51-
dimsB.0 as usize
48+
dims_a.0 as usize,
49+
dims_b.0 as usize
5250
))?;
5351
}
5452

@@ -67,8 +65,8 @@ fn matrix_multiply(block_size: usize, dimsA: (usize, usize, usize), dimsB: (usiz
6765
d_c.as_device_ptr(),
6866
d_a.as_device_ptr(),
6967
d_b.as_device_ptr(),
70-
dimsA.0 as usize,
71-
dimsB.0 as usize,
68+
dims_a.0 as usize,
69+
dims_b.0 as usize,
7270
))?;
7371
}
7472
}
@@ -86,30 +84,31 @@ fn matrix_multiply(block_size: usize, dimsA: (usize, usize, usize), dimsB: (usiz
8684

8785
let avg_time = gpu_time as f32 / N_ITER as f32;
8886
println!("Average time spent executing by the GPU: {} microseconds", avg_time);
89-
let flopsPerMatrixMul = 2.0 * (dimsA.0 as f32) * (dimsA.1 as f32) * (dimsB.0 as f32);
90-
let gigaFlops = (flopsPerMatrixMul / (avg_time)) / 1000.0;
91-
println!("Performance = {} GFlop/s", gigaFlops);
87+
let flops_per_matrix_mul = 2.0 * (dims_a.0 as f32) * (dims_a.1 as f32) * (dims_b.0 as f32);
88+
let giga_flops = (flops_per_matrix_mul / (avg_time)) / 1000.0;
89+
println!("Performance = {} GFlop/s", giga_flops);
9290

9391
unsafe{
9492
d_c
9593
.async_copy_to(&mut h_c, &stream)
9694
.expect("Could not copy from device to host!");
9795
}
96+
stream.synchronize().expect("Stream couldn't synchronize!");
9897

9998
// checking computed result
10099
// test relative error by the formula
101100
// |<x, y>_cpu - <x, y>_gpu| / |<x, y>_cpu|
102101
let machine_epsilon = 1.19209290E-07f32;
103102
let mut correct = true;
104103

105-
for i in 0..(dimsC.0 * dimsC.1) {
106-
let abs_err = (h_c[i] - (dimsA.0 as f32 * 0.01f32)).abs();
107-
let dot_length = (dimsA.0 as f32).abs();
104+
for i in 0..(dims_c.0 * dims_c.1) {
105+
let abs_err = (h_c[i] - (dims_a.0 as f32 * 0.01f32)).abs();
106+
let dot_length = (dims_a.0 as f32).abs();
108107
let abs_val = h_c[i].abs();
109108
let rel_err = abs_err / abs_val.max(dot_length * machine_epsilon);
110109

111110
if rel_err > 1e-6 {
112-
println!("Error at index {}: CPU = {}, GPU = {}, rel_err = {}", i, dimsA.0 as f32 * 0.01f32, h_c[i], rel_err);
111+
println!("Error at index {}: CPU = {}, GPU = {}, rel_err = {}", i, dims_a.0 as f32 * 0.01f32, h_c[i], rel_err);
113112
correct = false;
114113
}
115114
}
@@ -132,14 +131,13 @@ fn main() -> Result<(), cust::error::CudaError> {
132131
println!("Device Name: {}", device.name().unwrap());
133132

134133
let block_size: u32 = 32;
135-
let dimsA: (usize, usize, usize) = (block_size as usize, block_size as usize, 1);
136-
let dimsB: (usize, usize, usize) = (block_size as usize, block_size as usize, 1);
134+
let dims_a: (usize, usize, usize) = (block_size as usize, block_size as usize, 1);
135+
let dims_b: (usize, usize, usize) = (block_size as usize, block_size as usize, 1);
137136

138-
if dimsA.0 != dimsB.1 {
137+
if dims_a.0 != dims_b.1 {
139138
panic!("Matrix multiplication not possible with the given dimensions!");
140139
}
141140

142-
matrix_multiply(block_size as usize, dimsA, dimsB);
143-
141+
matrix_multiply(block_size as usize, dims_a, dims_b)?;
144142
Ok(())
145143
}

0 commit comments

Comments (0)