diff --git a/Cargo.lock b/Cargo.lock
index 23b752b4..7a9496b6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1717,6 +1717,13 @@ dependencies = [
  "wasm-bindgen",
 ]
 
+[[package]]
+name = "kernels"
+version = "0.1.0"
+dependencies = [
+ "cuda_std",
+]
+
 [[package]]
 name = "khronos_api"
 version = "3.1.0"
@@ -1859,6 +1866,15 @@ dependencies = [
  "regex-automata",
 ]
 
+[[package]]
+name = "matmul"
+version = "0.1.0"
+dependencies = [
+ "cuda_builder",
+ "cuda_std",
+ "cust",
+]
+
 [[package]]
 name = "matrixmultiply"
 version = "0.3.10"
diff --git a/Cargo.toml b/Cargo.toml
index 632d3460..108b31a6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -54,6 +54,8 @@ members = [
 
     "samples/introduction/async_api",
     "samples/introduction/async_api/kernels",
+    "samples/introduction/matmul",
+    "samples/introduction/matmul/kernels",
 
     "tests/compiletests",
     "tests/compiletests/deps-helper",
diff --git a/samples/introduction/README.md b/samples/introduction/README.md
deleted file mode 100644
index 3c980a66..00000000
--- a/samples/introduction/README.md
+++ /dev/null
@@ -1,8 +0,0 @@
-# Chapter 0: Introduction
-
-## [asyncAPI](https://github.com/Rust-GPU/rust-cuda/samples/introduction/async_api)
-This example demonstrates two key capabilities of CUDA events: measuring GPU execution time and enabling concurrent CPU-GPU operations.
-
-1. Events are recorded at specific points within a CUDA stream to mark the beginning and end of GPU operations.
-2. Because CUDA stream operations execute asynchronously, the CPU remains free to perform other work while the GPU processes tasks (including memory transfers between host and device)
-3. The CPU can query these events to check whether the GPU has finished its work, allowing for coordination between the two processors without blocking the CPU.
diff --git a/samples/introduction/async_api/src/main.rs b/samples/introduction/async_api/src/main.rs
index 063efbd3..8ecbdf12 100644
--- a/samples/introduction/async_api/src/main.rs
+++ b/samples/introduction/async_api/src/main.rs
@@ -1,3 +1,10 @@
+/* This example demonstrates two key capabilities of CUDA events: measuring GPU execution time and enabling concurrent CPU-GPU operations.
+ *
+ * 1. Events are recorded at specific points within a CUDA stream to mark the beginning and end of GPU operations.
+ * 2. Because CUDA stream operations execute asynchronously, the CPU remains free to perform other work while the GPU processes tasks (including memory transfers between host and device).
+ * 3. The CPU can query these events to check whether the GPU has finished its work, allowing for coordination between the two processors without blocking the CPU.
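+ *
+ * A rough sketch of the timing pattern (illustrative only; the method names are the same
+ * `cust` event calls used by the samples in this chapter):
+ *
+ *   let start = Event::new(EventFlags::DEFAULT)?;
+ *   let stop = Event::new(EventFlags::DEFAULT)?;
+ *   start.record(&stream)?;   // mark the beginning of the GPU work
+ *   // ... queue kernel launches / async copies on `stream`; the CPU stays free ...
+ *   stop.record(&stream)?;    // mark the end of the GPU work
+ *   stop.synchronize()?;      // or poll the event instead of blocking
+ *   let gpu_time = stop.elapsed(&start)?;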
+ */
+
 use cust::device::Device;
 use cust::event::{Event, EventFlags};
 use cust::function::{BlockSize, GridSize};
diff --git a/samples/introduction/matmul/Cargo.toml b/samples/introduction/matmul/Cargo.toml
new file mode 100644
index 00000000..344339ee
--- /dev/null
+++ b/samples/introduction/matmul/Cargo.toml
@@ -0,0 +1,11 @@
+[package]
+name = "matmul"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+cust = { path = "../../../crates/cust" }
+cuda_std = { path = "../../../crates/cuda_std" }
+
+[build-dependencies]
+cuda_builder = { workspace = true, default-features = false }
\ No newline at end of file
diff --git a/samples/introduction/matmul/build.rs b/samples/introduction/matmul/build.rs
new file mode 100644
index 00000000..7f23bac1
--- /dev/null
+++ b/samples/introduction/matmul/build.rs
@@ -0,0 +1,17 @@
+use std::env;
+use std::path;
+
+use cuda_builder::CudaBuilder;
+
+fn main() {
+    println!("cargo::rerun-if-changed=build.rs");
+    println!("cargo::rerun-if-changed=kernels");
+
+    let out_path = path::PathBuf::from(env::var("OUT_DIR").unwrap());
+    let manifest_dir = path::PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
+
+    CudaBuilder::new(manifest_dir.join("kernels"))
+        .copy_to(out_path.join("kernels.ptx"))
+        .build()
+        .unwrap();
+}
diff --git a/samples/introduction/matmul/kernels/Cargo.toml b/samples/introduction/matmul/kernels/Cargo.toml
new file mode 100644
index 00000000..1aa3f192
--- /dev/null
+++ b/samples/introduction/matmul/kernels/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "kernels"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+cuda_std = { path = "../../../../crates/cuda_std" }
+
+[lib]
+crate-type = ["cdylib", "rlib"]
\ No newline at end of file
diff --git a/samples/introduction/matmul/kernels/src/lib.rs b/samples/introduction/matmul/kernels/src/lib.rs
new file mode 100644
index 00000000..52244612
--- /dev/null
+++ b/samples/introduction/matmul/kernels/src/lib.rs
@@ -0,0 +1,76 @@
+use core::mem::MaybeUninit;
+use cuda_std::*;
+
+// SAFETY: This function is unsafe because it dereferences raw pointers.
+#[kernel]
+pub unsafe fn matrix_mul_cuda(c: *mut f32, a: &[f32], b: &[f32], wa: usize, wb: usize) {
+    let bx: usize = cuda_std::thread::block_idx().x as usize;
+    let by: usize = cuda_std::thread::block_idx().y as usize;
+
+    let tx: usize = cuda_std::thread::thread_idx().x as usize;
+    let ty: usize = cuda_std::thread::thread_idx().y as usize;
+
+    const BLOCK_SIZE: usize = 32;
+    let a_begin = wa * BLOCK_SIZE * by;
+    let a_end = a_begin + wa - 1;
+    let a_step = BLOCK_SIZE;
+
+    let b_begin = BLOCK_SIZE * bx;
+    let b_step = BLOCK_SIZE * wb;
+
+    let mut c_sub: f32 = 0.0;
+    let mut kahan_correction_factor = 0.0f32;
+    let mut bi = b_begin;
+
+    for ai in (a_begin..=a_end).step_by(a_step) {
+        // The equivalent CUDA C++ code for the below is:
+        // ```
+        // __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
+        // ```
+        // This memory space is shared between threads of the same block.
+        #[address_space(shared)]
+        static mut As: [[MaybeUninit<f32>; BLOCK_SIZE]; BLOCK_SIZE] =
+            [[const { MaybeUninit::uninit() }; BLOCK_SIZE]; BLOCK_SIZE];
+
+        #[address_space(shared)]
+        static mut Bs: [[MaybeUninit<f32>; BLOCK_SIZE]; BLOCK_SIZE] =
+            [[const { MaybeUninit::uninit() }; BLOCK_SIZE]; BLOCK_SIZE];
+
+        // Load one BLOCK_SIZE x BLOCK_SIZE tile of A and one of B into shared memory.
+        // `a` and `b` are slices, so indexing them yields the element directly;
+        // each thread of the block copies a single element of each tile.
+        unsafe {
+            As[ty][tx].write(a[ai + wa * ty + tx]);
+            Bs[ty][tx].write(b[bi + wb * ty + tx]);
+        }
+
+        // Synchronize to make sure the matrices are loaded
+        cuda_std::thread::sync_threads();
+        for k in 0..BLOCK_SIZE {
+            // Typically, this would be a simple calculation:
+            // ```
+            // c_sub += As[ty][k] * Bs[k][tx];
+            // ```
+            // However, to improve numerical stability we use Kahan summation, so that the rounding error
+            // is isolated in a correction factor instead of accumulating in c_sub.
+            let input = unsafe { As[ty][k].assume_init() * Bs[k][tx].assume_init() };
+            let y = input - kahan_correction_factor;
+            let sum = c_sub + y;
+
+            // Algebraically this correction factor would always be zero; due to f32 precision limitations,
+            // however, it captures the small errors that would otherwise accumulate in c_sub.
+            kahan_correction_factor = (sum - c_sub) - y;
+            c_sub = sum;
+        }
+
+        // Synchronize to make sure that the preceding computation is done before loading two new sub-matrices of A and B in the next iteration
+        cuda_std::thread::sync_threads();
+
+        bi += b_step;
+    }
+
+    let ci = wb * BLOCK_SIZE * by + BLOCK_SIZE * bx;
+    unsafe {
+        *c.add((ci + wb * ty + tx) as usize) = c_sub;
+    }
+}
diff --git a/samples/introduction/matmul/src/main.rs b/samples/introduction/matmul/src/main.rs
new file mode 100644
index 00000000..4e2b0db3
--- /dev/null
+++ b/samples/introduction/matmul/src/main.rs
@@ -0,0 +1,183 @@
+/* This example demonstrates an implementation of matrix multiplication.
+ *
+ * 1. The matrices are first created on the host side and then copied to the device.
+ * 2. A block-local piece of shared memory is allocated on the device side, so that the partial sums can be accumulated quickly.
+ * 3. The result is copied back to the host, where the accumulated error is checked.
+ * 4. Extra: The error that accumulates during the summation process is reduced (in the kernel itself) using the [Kahan summation algorithm](https://en.wikipedia.org/wiki/Kahan_summation_algorithm).
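+ *
+ * A minimal host-side sketch of the Kahan update used in the kernel (illustrative only;
+ * `values` is a stand-in for the per-thread products, the real code is in kernels/src/lib.rs):
+ *
+ *   let mut sum = 0.0f32;
+ *   let mut correction = 0.0f32;      // compensation for lost low-order bits
+ *   for x in values {
+ *       let y = x - correction;       // apply the compensation from the previous step
+ *       let t = sum + y;              // low-order bits of y may be lost in this addition
+ *       correction = (t - sum) - y;   // recover what was just lost ...
+ *       sum = t;                      // ... and fold it into the next iteration
+ *   }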
+ */
+
+use cuda_std::glam::USizeVec2;
+use cust::device::Device;
+use cust::event::{Event, EventFlags};
+use cust::function::{BlockSize, GridSize};
+use cust::launch;
+use cust::memory::{AsyncCopyDestination, DeviceBuffer, LockedBuffer};
+use cust::module::Module;
+use cust::stream::{Stream, StreamFlags};
+
+static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx"));
+
+fn matrix_multiply(
+    block_size: usize,
+    dims_a: USizeVec2,
+    dims_b: USizeVec2,
+) -> Result<(), cust::error::CudaError> {
+    let dims_c = USizeVec2::new(dims_b.x, dims_a.y);
+    let size_a = dims_a.x * dims_a.y;
+    let h_a = LockedBuffer::new(&1.0f32, size_a).expect("host array couldn't be initialized!");
+
+    let size_b = dims_b.x * dims_b.y;
+    let h_b = LockedBuffer::new(&0.01f32, size_b).expect("host array couldn't be initialized!");
+
+    let stream = Stream::new(StreamFlags::NON_BLOCKING, None).expect("Stream couldn't be init!");
+
+    let size_c = dims_b.x * dims_a.y;
+    let mut h_c = LockedBuffer::new(&0.0f32, size_c).expect("host array couldn't be initialized!");
+
+    let start_event = Event::new(EventFlags::DEFAULT)?;
+    let stop_event = Event::new(EventFlags::DEFAULT)?;
+
+    let d_a =
+        DeviceBuffer::from_slice(h_a.as_slice()).expect("device array couldn't be initialized!");
+    let d_b =
+        DeviceBuffer::from_slice(h_b.as_slice()).expect("device array couldn't be initialized!");
+    let d_c =
+        DeviceBuffer::from_slice(h_c.as_slice()).expect("device array couldn't be initialized!");
+
+    stream.synchronize().expect("Stream couldn't synchronize!");
+    let threads = BlockSize::xy(block_size as u32, block_size as u32);
+    let grid = GridSize::xy(
+        (dims_b.x / (threads.x as usize)).try_into().unwrap(),
+        (dims_a.y / (threads.y as usize)).try_into().unwrap(),
+    );
+
+    println!("Computing result using CUDA Kernel...");
+
+    let module = Module::from_ptx(PTX, &[]).expect("Module couldn't be init!");
+    let matrix_mul_cuda = module
+        .get_function("matrix_mul_cuda")
+        .expect("Kernel function not found!");
+
+    unsafe {
+        // The function definition of the kernel is:
+        // ```
+        // pub unsafe fn matrix_mul_cuda(c: *mut f32, a: &[f32], b: &[f32], wa: usize, wb: usize)
+        // ```
+        // For parameters of type `*mut T` or `*const T`, only the device pointer needs to be passed.
+        // For parameters of type `&[T]`, we must pass the device pointer as well as the length of the slice.
+        launch!(matrix_mul_cuda<<<grid, threads, 0, stream>>>(
+            d_c.as_device_ptr(),
+            d_a.as_device_ptr(),
+            d_a.len(),
+            d_b.as_device_ptr(),
+            d_b.len(),
+            dims_a.x,
+            dims_b.x
+        ))?;
+    }
+
+    println!("Done!");
+    stream.synchronize().expect("Stream couldn't synchronize!");
+
+    start_event
+        .record(&stream)
+        .expect("Failed to record start_event in the CUDA stream!");
+
+    const N_ITER: u32 = 300;
+
+    for _ in 0..N_ITER {
+        unsafe {
+            launch!(matrix_mul_cuda<<<grid, threads, 0, stream>>>(
+                d_c.as_device_ptr(),
+                d_a.as_device_ptr(),
+                d_a.len(),
+                d_b.as_device_ptr(),
+                d_b.len(),
+                dims_a.x,
+                dims_b.x,
+            ))?;
+        }
+    }
+
+    stop_event
+        .record(&stream)
+        .expect("Failed to record stop_event in the CUDA stream!");
+
+    stop_event
+        .synchronize()
+        .expect("Stream couldn't synchronize!");
+
+    let gpu_time: u128 = stop_event
+        .elapsed(&start_event)
+        .expect("Failed to calculate duration of GPU operations!")
+        .as_micros();
+
+    let avg_time = gpu_time as f32 / N_ITER as f32;
+    println!(
+        "Average time spent executing by the GPU: {} microseconds",
+        avg_time
+    );
+    let flops_per_matrix_mul = 2.0 * (dims_a.x as f32) * (dims_a.y as f32) * (dims_b.x as f32);
+    let giga_flops = (flops_per_matrix_mul / (avg_time)) / 1000.0;
+    println!("Performance = {} GFlop/s", giga_flops);
+
+    unsafe {
+        d_c.async_copy_to(&mut h_c, &stream)
+            .expect("Could not copy from device to host!");
+    }
+    stream.synchronize().expect("Stream couldn't synchronize!");
+
+    // Check the computed result:
+    // test relative error by the formula
+    // |<x, y>_cpu - <x, y>_gpu| / |<x, y>_cpu|
+    let machine_epsilon = 1.1920929E-07f32;
+    let mut correct = true;
+
+    for i in 0..(dims_c.x * dims_c.y) {
+        let abs_err = (h_c[i] - (dims_a.x as f32 * 0.01f32)).abs();
+        let dot_length = (dims_a.x as f32).abs();
+        let abs_val = h_c[i].abs();
+        let rel_err = abs_err / abs_val.max(dot_length * machine_epsilon);
+
+        if rel_err > 1e-6 {
+            println!(
+                "Error at index {}: CPU = {}, GPU = {}, rel_err = {}",
+                i,
+                dims_a.x as f32 * 0.01f32,
+                h_c[i],
+                rel_err
+            );
+            correct = false;
+        }
+    }
+
+    if correct {
+        println!("Result = PASS");
+        println!(
+            "NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled."
+        );
+    } else {
+        println!("Result = FAIL");
+        return Err(cust::error::CudaError::UnknownError);
+    }
+
+    Ok(())
+}
+
+fn main() -> Result<(), cust::error::CudaError> {
+    // Set up the CUDA context and pick the device that will run the kernels.
+    let _ctx = cust::quick_init()?;
+    let device = Device::get_device(0).expect("Couldn't find Cuda supported devices!");
+    println!("Device Name: {}", device.name().unwrap());
+
+    let block_size: u32 = 32;
+    let dims_a = USizeVec2::new(40 * block_size as usize, 40 * block_size as usize);
+    let dims_b = USizeVec2::new(80 * block_size as usize, 40 * block_size as usize);
+
+    if dims_a.x != dims_b.y {
+        panic!("Matrix multiplication not possible with the given dimensions!");
+    }
+
+    matrix_multiply(block_size as usize, dims_a, dims_b)?;
+    Ok(())
+}