Commit fd9cef8

Madhav Madhusoodanan authored and committed
feat: Ported the matrix multiplication sample of Nvidia/cuda-samples repo.
fix: added shared memory space for matrix multiplication calculation
chore: completed matmul/main.rs
fix: type corrections and proper copying of result data from device to host
fix: code cleanup and stream synchronization after copying C from device to host memory
chore: cargo fmt
feat: increased dimension of matrices being computed, and implemented Kahan's error correction to stop floating point accumulation errors
chore: update readme
fix: cargo.toml fixes
feat: code cleanup
Move documentation to sample-specific top-level comments in main.rs
chore: remove documentation
feat: added buildfile
1 parent 6519002 commit fd9cef8

File tree

9 files changed: +322 -8 lines changed

Cargo.lock

Lines changed: 16 additions & 0 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 2 additions & 0 deletions
@@ -54,6 +54,8 @@ members = [
 
     "samples/introduction/async_api",
     "samples/introduction/async_api/kernels",
+    "samples/introduction/matmul",
+    "samples/introduction/matmul/kernels",
 
     "tests/compiletests",
     "tests/compiletests/deps-helper",

samples/introduction/README.md

Lines changed: 0 additions & 8 deletions
This file was deleted.

samples/introduction/async_api/src/main.rs

Lines changed: 7 additions & 0 deletions
@@ -1,3 +1,10 @@
+/* This example demonstrates two key capabilities of CUDA events: measuring GPU execution time and enabling concurrent CPU-GPU operations.
+ *
+ * 1. Events are recorded at specific points within a CUDA stream to mark the beginning and end of GPU operations.
+ * 2. Because CUDA stream operations execute asynchronously, the CPU remains free to perform other work while the GPU processes tasks (including memory transfers between host and device).
+ * 3. The CPU can query these events to check whether the GPU has finished its work, allowing for coordination between the two processors without blocking the CPU.
+ */
+
 use cust::device::Device;
 use cust::event::{Event, EventFlags};
 use cust::function::{BlockSize, GridSize};
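
For readers new to this pattern, a minimal host-side sketch of the event usage described in the comment above might look like the following. It is an illustrative sketch, not part of the commit, and it only uses the cust calls that appear elsewhere in this diff (Event::new, record, synchronize, elapsed, and Stream::new); the timed work in the middle is left as a placeholder.

fn time_gpu_work() -> Result<(), cust::error::CudaError> {
    use cust::event::{Event, EventFlags};
    use cust::stream::{Stream, StreamFlags};

    // A context must stay alive for the duration of the CUDA calls.
    let _ctx = cust::quick_init()?;
    let stream = Stream::new(StreamFlags::NON_BLOCKING, None)?;

    let start = Event::new(EventFlags::DEFAULT)?;
    let stop = Event::new(EventFlags::DEFAULT)?;

    // Mark the beginning of the GPU work on this stream.
    start.record(&stream)?;

    // ... enqueue kernel launches / async copies on `stream` here ...
    // The CPU is free to do unrelated work while the GPU executes them.

    // Mark the end of the GPU work; waiting on the stop event blocks only
    // until everything recorded before it on the stream has finished.
    stop.record(&stream)?;
    stop.synchronize()?;

    println!("GPU time: {} us", stop.elapsed(&start)?.as_micros());
    Ok(())
}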
samples/introduction/matmul/Cargo.toml

Lines changed: 11 additions & 0 deletions

@@ -0,0 +1,11 @@
[package]
name = "matmul"
version = "0.1.0"
edition = "2024"

[dependencies]
cust = { path = "../../../crates/cust" }
cuda_std = { path = "../../../crates/cuda_std" }

[build-dependencies]
cuda_builder = { workspace = true, default-features = false }
samples/introduction/matmul/build.rs

Lines changed: 17 additions & 0 deletions

@@ -0,0 +1,17 @@
use std::env;
use std::path;

use cuda_builder::CudaBuilder;

fn main() {
    println!("cargo::rerun-if-changed=build.rs");
    println!("cargo::rerun-if-changed=kernels");

    let out_path = path::PathBuf::from(env::var("OUT_DIR").unwrap());
    let manifest_dir = path::PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());

    CudaBuilder::new(manifest_dir.join("kernels"))
        .copy_to(out_path.join("kernels.ptx"))
        .build()
        .unwrap();
}
samples/introduction/matmul/kernels/Cargo.toml

Lines changed: 10 additions & 0 deletions

@@ -0,0 +1,10 @@
[package]
name = "kernels"
version = "0.1.0"
edition = "2024"

[dependencies]
cuda_std = { path = "../../../../crates/cuda_std" }

[lib]
crate-type = ["cdylib", "rlib"]
samples/introduction/matmul/kernels/src/lib.rs

Lines changed: 76 additions & 0 deletions

@@ -0,0 +1,76 @@
use core::mem::MaybeUninit;
use cuda_std::*;

// SAFETY: This function is unsafe because it dereferences raw pointers.
#[kernel]
pub unsafe fn matrix_mul_cuda(c: *mut f32, a: &[f32], b: &[f32], wa: usize, wb: usize) {
    let bx: usize = cuda_std::thread::block_idx().x as usize;
    let by: usize = cuda_std::thread::block_idx().y as usize;

    let tx: usize = cuda_std::thread::thread_idx().x as usize;
    let ty: usize = cuda_std::thread::thread_idx().y as usize;

    const BLOCK_SIZE: usize = 32;
    let a_begin = wa * BLOCK_SIZE * by;
    let a_end = a_begin + wa - 1;
    let a_step = BLOCK_SIZE;

    let b_begin = BLOCK_SIZE * bx;
    let b_step = BLOCK_SIZE * wb;

    let mut c_sub: f32 = 0.0;
    let mut kahan_correction_factor = 0.0f32;
    let mut bi = b_begin;

    for ai in (a_begin..=a_end).step_by(a_step) {
        // The equivalent CUDA C++ code for the below is:
        // ```
        // __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        // ```
        // This memory space is shared between threads of the same block.
        #[address_space(shared)]
        static mut As: [[MaybeUninit<f32>; BLOCK_SIZE]; BLOCK_SIZE] =
            [[const { MaybeUninit::uninit() }; BLOCK_SIZE]; BLOCK_SIZE];

        #[address_space(shared)]
        static mut Bs: [[MaybeUninit<f32>; BLOCK_SIZE]; BLOCK_SIZE] =
            [[const { MaybeUninit::uninit() }; BLOCK_SIZE]; BLOCK_SIZE];

        // Load the A and B sub-matrices into shared memory.
        // `a` and `b` are slices here, so plain indexing (with row strides `wa` and `wb`)
        // replaces the pointer arithmetic and dereference used in the CUDA C++ version.
        unsafe {
            As[ty][tx].write(a[ai + wa * ty + tx]);
            Bs[ty][tx].write(b[bi + wb * ty + tx]);
        }

        // Synchronize to make sure the sub-matrices are loaded.
        cuda_std::thread::sync_threads();
        for k in 0..BLOCK_SIZE {
            // Typically, this would be a simple accumulation:
            // ```
            // c_sub += As[ty][k] * Bs[k][tx];
            // ```
            // However, to improve numerical stability, we use Kahan summation here so that
            // the rounding error is isolated rather than allowed to accumulate in c_sub.
            let input = unsafe { As[ty][k].assume_init() * Bs[k][tx].assume_init() };
            let y = input - kahan_correction_factor;
            let sum = c_sub + y;

            // Algebraically this correction factor would be zero; with f32 precision it
            // captures the small error that would otherwise accumulate in c_sub.
            kahan_correction_factor = (sum - c_sub) - y;
            c_sub = sum;
        }

        // Synchronize to make sure the preceding computation is done before loading two new
        // sub-matrices of A and B in the next iteration.
        cuda_std::thread::sync_threads();

        bi += b_step;
    }

    let ci = wb * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    unsafe {
        *c.add(ci + wb * ty + tx) = c_sub;
    }
}
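
The Kahan update in the inner loop is easier to see in isolation. The following is a minimal, self-contained host-side Rust sketch (not part of the commit; the function name is illustrative) of the same compensated summation, applied to the kind of dot product each output element of this sample computes.

/// Sums `values` with Kahan (compensated) summation, mirroring the
/// `c_sub` / `kahan_correction_factor` update in the kernel above.
fn kahan_sum(values: &[f32]) -> f32 {
    let mut sum = 0.0f32;
    let mut correction = 0.0f32; // running compensation for lost low-order bits

    for &input in values {
        let y = input - correction; // re-apply the error carried over from the last step
        let t = sum + y;            // low-order bits of `y` can be lost in this addition
        correction = (t - sum) - y; // recover exactly what was lost
        sum = t;
    }
    sum
}

fn main() {
    // Each element of C in this sample is a dot product of 1280 terms of 1.0 * 0.01,
    // whose exact value is 12.8.
    let products = vec![1.0f32 * 0.01f32; 1280];
    println!("naive sum: {}", products.iter().sum::<f32>());
    println!("kahan sum: {}", kahan_sum(&products));
}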
samples/introduction/matmul/src/main.rs

Lines changed: 183 additions & 0 deletions

@@ -0,0 +1,183 @@
/* This example demonstrates an implementation of matrix multiplication.
 *
 * 1. The matrices are first created on the host side and then copied to the device.
 * 2. A block-specific piece of shared memory is created on the device side so that the summation can be done very quickly.
 * 3. The result is copied back to the host, where it is checked for accumulated error.
 * 4. Extra: the error that accumulates during the summation is reduced (in the kernel itself) using the [Kahan summation algorithm](https://en.wikipedia.org/wiki/Kahan_summation_algorithm).
 */

use cuda_std::glam::USizeVec2;
use cust::device::Device;
use cust::event::{Event, EventFlags};
use cust::function::{BlockSize, GridSize};
use cust::launch;
use cust::memory::{AsyncCopyDestination, DeviceBuffer, LockedBuffer};
use cust::module::Module;
use cust::stream::{Stream, StreamFlags};

static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx"));

fn matrix_multiply(
    block_size: usize,
    dims_a: USizeVec2,
    dims_b: USizeVec2,
) -> Result<(), cust::error::CudaError> {
    let dims_c = USizeVec2::new(dims_b.x, dims_a.y);
    let size_a = dims_a.x * dims_a.y;
    let h_a = LockedBuffer::new(&1.0f32, size_a).expect("host array couldn't be initialized!");

    let size_b = dims_b.x * dims_b.y;
    let h_b = LockedBuffer::new(&0.01f32, size_b).expect("host array couldn't be initialized!");

    let stream = Stream::new(StreamFlags::NON_BLOCKING, None).expect("Stream couldn't be initialized!");

    let size_c = dims_b.x * dims_a.y;
    let mut h_c = LockedBuffer::new(&0.0f32, size_c).expect("host array couldn't be initialized!");

    let start_event = Event::new(EventFlags::DEFAULT)?;
    let stop_event = Event::new(EventFlags::DEFAULT)?;

    let d_a =
        DeviceBuffer::from_slice(h_a.as_slice()).expect("device array couldn't be initialized!");
    let d_b =
        DeviceBuffer::from_slice(h_b.as_slice()).expect("device array couldn't be initialized!");
    let d_c =
        DeviceBuffer::from_slice(h_c.as_slice()).expect("device array couldn't be initialized!");

    stream.synchronize().expect("Stream couldn't synchronize!");
    let threads = BlockSize::xy(block_size as u32, block_size as u32);
    let grid = GridSize::xy(
        (dims_b.x / (threads.x as usize)).try_into().unwrap(),
        (dims_a.y / (threads.y as usize)).try_into().unwrap(),
    );

    println!("Computing result using CUDA Kernel...");

    let module = Module::from_ptx(PTX, &[]).expect("Module couldn't be initialized!");
    let matrix_mul_cuda = module
        .get_function("matrix_mul_cuda")
        .expect("Kernel function not found!");

    unsafe {
        // The kernel's signature is:
        // ```
        // pub unsafe fn matrix_mul_cuda(c: *mut f32, a: &[f32], b: &[f32], wa: usize, wb: usize)
        // ```
        // For parameters of type `*mut T` or `*const T`, only the device pointer is passed.
        // For parameters of type `&[T]`, the device pointer as well as the length of the slice must be passed.
        launch!(matrix_mul_cuda<<<grid, threads, 0, stream>>>(
            d_c.as_device_ptr(),
            d_a.as_device_ptr(),
            d_a.len(),
            d_b.as_device_ptr(),
            d_b.len(),
            dims_a.x as usize,
            dims_b.x as usize
        ))?;
    }

    println!("Done!");
    stream.synchronize().expect("Stream couldn't synchronize!");

    start_event
        .record(&stream)
        .expect("Failed to record start_event in the CUDA stream!");

    const N_ITER: u32 = 300;

    for _ in 0..N_ITER {
        unsafe {
            launch!(matrix_mul_cuda<<<grid, threads, 0, stream>>>(
                d_c.as_device_ptr(),
                d_a.as_device_ptr(),
                d_a.len(),
                d_b.as_device_ptr(),
                d_b.len(),
                dims_a.x as usize,
                dims_b.x as usize,
            ))?;
        }
    }

    stop_event
        .record(&stream)
        .expect("Failed to record stop_event in the CUDA stream!");

    stop_event
        .synchronize()
        .expect("Stream couldn't synchronize!");

    let gpu_time: u128 = stop_event
        .elapsed(&start_event)
        .expect("Failed to calculate duration of GPU operations!")
        .as_micros();

    let avg_time = gpu_time as f32 / N_ITER as f32;
    println!(
        "Average time spent executing by the GPU: {} microseconds",
        avg_time
    );
    let flops_per_matrix_mul = 2.0 * (dims_a.x as f32) * (dims_a.y as f32) * (dims_b.x as f32);
    let giga_flops = (flops_per_matrix_mul / avg_time) / 1000.0;
    println!("Performance = {} GFlop/s", giga_flops);

    unsafe {
        d_c.async_copy_to(&mut h_c, &stream)
            .expect("Could not copy from device to host!");
    }
    stream.synchronize().expect("Stream couldn't synchronize!");

    // Check the computed result: test the relative error with the formula
    // |<x, y>_cpu - <x, y>_gpu| / |<x, y>_cpu|
    let machine_epsilon = 1.19209290E-07f32;
    let mut correct = true;

    for i in 0..(dims_c.x * dims_c.y) {
        let abs_err = (h_c[i] - (dims_a.x as f32 * 0.01f32)).abs();
        let dot_length = (dims_a.x as f32).abs();
        let abs_val = h_c[i].abs();
        let rel_err = abs_err / abs_val.max(dot_length * machine_epsilon);

        if rel_err > 1e-6 {
            println!(
                "Error at index {}: CPU = {}, GPU = {}, rel_err = {}",
                i,
                dims_a.x as f32 * 0.01f32,
                h_c[i],
                rel_err
            );
            correct = false;
        }
    }

    if correct {
        println!("Result = PASS");
        println!(
            "NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled."
        );
    } else {
        println!("Result = FAIL");
        return Err(cust::error::CudaError::UnknownError);
    }

    Ok(())
}

fn main() -> Result<(), cust::error::CudaError> {
    // Set up the context and query the device the kernels will run on.
    let _ctx = cust::quick_init();
    let device = Device::get_device(0).expect("Couldn't find a CUDA-capable device!");
    println!("Device Name: {}", device.name().unwrap());

    let block_size: u32 = 32;
    let dims_a = USizeVec2::new(40 * block_size as usize, 40 * block_size as usize);
    let dims_b = USizeVec2::new(80 * block_size as usize, 40 * block_size as usize);

    if dims_a.x != dims_b.y {
        panic!("Matrix multiplication not possible with the given dimensions!");
    }

    matrix_multiply(block_size as usize, dims_a, dims_b)?;
    Ok(())
}
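
As a sanity check of the performance and verification arithmetic above (a worked example, not output from the sample): with block_size = 32, dims_a is 1280 x 1280 and dims_b is 2560 x 1280, so one multiplication performs 2 * 1280 * 1280 * 2560 ≈ 8.39e9 floating-point operations, and an average launch time of 10,000 microseconds would be reported as roughly 8.39e9 / 10,000 / 1,000 ≈ 839 GFlop/s. Likewise, every element of C is a dot product of 1280 terms of 1.0 * 0.01, which is why the check compares each h_c[i] against dims_a.x * 0.01 = 12.8.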
