Skip to content

Commit f40c8ac

Browse files
chore: cargo fmt
1 parent d2a0874 commit f40c8ac

File tree

2 files changed

Lines changed: 48 additions & 22 deletions

2 files changed

Lines changed: 48 additions & 22 deletions

samples/introduction/matmul/kernels/src/lib.rs

Lines changed: 10 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
1-
use cuda_std::*;
21
use core::mem::MaybeUninit;
2+
use cuda_std::*;
33

44
// SAFETY: This function is unsafe because it dereferences raw pointers.
55
#[kernel]
@@ -23,9 +23,12 @@ pub unsafe fn matrix_mul_cuda(C: *mut f32, A: *const f32, B: *const f32, wa: usi
2323

2424
for a in (a_begin..=a_end).step_by(a_step) {
2525
#[address_space(shared)]
26-
static mut As: [[MaybeUninit<f32>; BLOCK_SIZE]; BLOCK_SIZE] = [[const { MaybeUninit::uninit() }; BLOCK_SIZE]; BLOCK_SIZE];
26+
static mut As: [[MaybeUninit<f32>; BLOCK_SIZE]; BLOCK_SIZE] =
27+
[[const { MaybeUninit::uninit() }; BLOCK_SIZE]; BLOCK_SIZE];
28+
2729
#[address_space(shared)]
28-
static mut Bs: [[MaybeUninit<f32>; BLOCK_SIZE]; BLOCK_SIZE] = [[const { MaybeUninit::uninit() }; BLOCK_SIZE]; BLOCK_SIZE];
30+
static mut Bs: [[MaybeUninit<f32>; BLOCK_SIZE]; BLOCK_SIZE] =
31+
[[const { MaybeUninit::uninit() }; BLOCK_SIZE]; BLOCK_SIZE];
2932

3033
// Load A and B matrices into shared memory
3134
unsafe {
@@ -48,5 +51,7 @@ pub unsafe fn matrix_mul_cuda(C: *mut f32, A: *const f32, B: *const f32, wa: usi
4851
}
4952

5053
let c = wb * BLOCK_SIZE * by + BLOCK_SIZE * bx;
51-
unsafe { *C.add((c + wb * ty + tx) as usize) = c_sub; }
52-
}
54+
unsafe {
55+
*C.add((c + wb * ty + tx) as usize) = c_sub;
56+
}
57+
}

samples/introduction/matmul/src/main.rs

Lines changed: 38 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,11 @@ use cust::stream::{Stream, StreamFlags};
88

99
static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx"));
1010

11-
fn matrix_multiply(block_size: usize, dims_a: (usize, usize, usize), dims_b: (usize, usize, usize)) -> Result<(), cust::error::CudaError> {
11+
fn matrix_multiply(
12+
block_size: usize,
13+
dims_a: (usize, usize, usize),
14+
dims_b: (usize, usize, usize),
15+
) -> Result<(), cust::error::CudaError> {
1216
let dims_c = (dims_b.0, dims_a.1, 1);
1317
let size_a = dims_a.0 * dims_a.1;
1418
let h_a = LockedBuffer::new(&1.0f32, size_a).expect("host array couldn't be initialized!");
@@ -19,19 +23,24 @@ fn matrix_multiply(block_size: usize, dims_a: (usize, usize, usize), dims_b: (us
1923
let stream = Stream::new(StreamFlags::NON_BLOCKING, None).expect("Stream couldn't be init!");
2024

2125
let size_c = dims_b.0 * dims_a.1;
22-
let mut h_c =
23-
LockedBuffer::new(&0.0f32, size_c).expect("host array couldn't be initialized!");
26+
let mut h_c = LockedBuffer::new(&0.0f32, size_c).expect("host array couldn't be initialized!");
2427

2528
let start_event = Event::new(EventFlags::DEFAULT)?;
2629
let stop_event = Event::new(EventFlags::DEFAULT)?;
2730

28-
let d_a = DeviceBuffer::from_slice(h_a.as_slice()).expect("device array couldn't be initialized!");
29-
let d_b = DeviceBuffer::from_slice(h_b.as_slice()).expect("device array couldn't be initialized!");
30-
let d_c = DeviceBuffer::from_slice(h_c.as_slice()).expect("device array couldn't be initialized!");
31-
31+
let d_a =
32+
DeviceBuffer::from_slice(h_a.as_slice()).expect("device array couldn't be initialized!");
33+
let d_b =
34+
DeviceBuffer::from_slice(h_b.as_slice()).expect("device array couldn't be initialized!");
35+
let d_c =
36+
DeviceBuffer::from_slice(h_c.as_slice()).expect("device array couldn't be initialized!");
37+
3238
stream.synchronize().expect("Stream couldn't synchronize!");
3339
let threads = BlockSize::xy(block_size as u32, block_size as u32);
34-
let grid = GridSize::xy((dims_b.0 / (threads.x as usize)).try_into().unwrap(), (dims_a.1 / (threads.y as usize)).try_into().unwrap());
40+
let grid = GridSize::xy(
41+
(dims_b.0 / (threads.x as usize)).try_into().unwrap(),
42+
(dims_a.1 / (threads.y as usize)).try_into().unwrap(),
43+
);
3544

3645
println!("Computing result using CUDA Kernel...");
3746

@@ -57,7 +66,7 @@ fn matrix_multiply(block_size: usize, dims_a: (usize, usize, usize), dims_b: (us
5766
.record(&stream)
5867
.expect("Failed to record start_event in the CUDA stream!");
5968

60-
const N_ITER : u32 = 300;
69+
const N_ITER: u32 = 300;
6170

6271
for _ in 0..N_ITER {
6372
unsafe {
@@ -75,22 +84,26 @@ fn matrix_multiply(block_size: usize, dims_a: (usize, usize, usize), dims_b: (us
7584
.record(&stream)
7685
.expect("Failed to record stop_event in the CUDA stream!");
7786

78-
stop_event.synchronize().expect("Stream couldn't synchronize!");
87+
stop_event
88+
.synchronize()
89+
.expect("Stream couldn't synchronize!");
7990

8091
let gpu_time: u128 = stop_event
8192
.elapsed(&start_event)
8293
.expect("Failed to calculate duration of GPU operations!")
8394
.as_micros();
8495

8596
let avg_time = gpu_time as f32 / N_ITER as f32;
86-
println!("Average time spent executing by the GPU: {} microseconds", avg_time);
97+
println!(
98+
"Average time spent executing by the GPU: {} microseconds",
99+
avg_time
100+
);
87101
let flops_per_matrix_mul = 2.0 * (dims_a.0 as f32) * (dims_a.1 as f32) * (dims_b.0 as f32);
88102
let giga_flops = (flops_per_matrix_mul / (avg_time)) / 1000.0;
89103
println!("Performance = {} GFlop/s", giga_flops);
90104

91-
unsafe{
92-
d_c
93-
.async_copy_to(&mut h_c, &stream)
105+
unsafe {
106+
d_c.async_copy_to(&mut h_c, &stream)
94107
.expect("Could not copy from device to host!");
95108
}
96109
stream.synchronize().expect("Stream couldn't synchronize!");
@@ -108,14 +121,22 @@ fn matrix_multiply(block_size: usize, dims_a: (usize, usize, usize), dims_b: (us
108121
let rel_err = abs_err / abs_val.max(dot_length * machine_epsilon);
109122

110123
if rel_err > 1e-6 {
111-
println!("Error at index {}: CPU = {}, GPU = {}, rel_err = {}", i, dims_a.0 as f32 * 0.01f32, h_c[i], rel_err);
124+
println!(
125+
"Error at index {}: CPU = {}, GPU = {}, rel_err = {}",
126+
i,
127+
dims_a.0 as f32 * 0.01f32,
128+
h_c[i],
129+
rel_err
130+
);
112131
correct = false;
113132
}
114133
}
115134

116135
if correct {
117136
println!("Result = PASS");
118-
println!("NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.");
137+
println!(
138+
"NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled."
139+
);
119140
} else {
120141
println!("Result = FAIL");
121142
return Err(cust::error::CudaError::UnknownError);
@@ -130,7 +151,7 @@ fn main() -> Result<(), cust::error::CudaError> {
130151
let device = Device::get_device(0).expect("Couldn't find Cuda supported devices!");
131152
println!("Device Name: {}", device.name().unwrap());
132153

133-
let block_size: u32 = 32;
154+
let block_size: u32 = 32;
134155
let dims_a: (usize, usize, usize) = (block_size as usize, block_size as usize, 1);
135156
let dims_b: (usize, usize, usize) = (block_size as usize, block_size as usize, 1);
136157

0 commit comments

Comments
 (0)