16 changes: 16 additions & 0 deletions Cargo.lock


2 changes: 2 additions & 0 deletions Cargo.toml
@@ -54,6 +54,8 @@ members = [

"samples/introduction/async_api",
"samples/introduction/async_api/kernels",
"samples/introduction/matmul",
"samples/introduction/matmul/kernels",

"tests/compiletests",
"tests/compiletests/deps-helper",
8 changes: 0 additions & 8 deletions samples/introduction/README.md

This file was deleted.

7 changes: 7 additions & 0 deletions samples/introduction/async_api/src/main.rs
@@ -1,3 +1,10 @@
/* This example demonstrates two key capabilities of CUDA events: measuring GPU execution time and enabling concurrent CPU-GPU operations.
*
* 1. Events are recorded at specific points within a CUDA stream to mark the beginning and end of GPU operations.
* 2. Because CUDA stream operations execute asynchronously, the CPU remains free to perform other work while the GPU processes tasks (including memory transfers between host and device).
* 3. The CPU can query these events to check whether the GPU has finished its work, allowing for coordination between the two processors without blocking the CPU.
*/

use cust::device::Device;
use cust::event::{Event, EventFlags};
use cust::function::{BlockSize, GridSize};
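The event pattern described in the comment above can be summarized in isolation. Below is a minimal sketch that uses only cust calls already appearing in this PR (Event::new, record, synchronize, elapsed, Stream::new, quick_init); the function name time_gpu_work and the elided kernel launch are illustrative, not part of the sample:

```rust
use cust::event::{Event, EventFlags};
use cust::stream::{Stream, StreamFlags};

fn time_gpu_work() -> Result<(), cust::error::CudaError> {
    let _ctx = cust::quick_init()?;
    let stream = Stream::new(StreamFlags::NON_BLOCKING, None)?;

    let start = Event::new(EventFlags::DEFAULT)?;
    let stop = Event::new(EventFlags::DEFAULT)?;

    // Mark the beginning of the GPU work in the stream.
    start.record(&stream)?;

    // ... enqueue kernel launches / async copies on `stream` here ...
    // The CPU is free to do unrelated work at this point.

    // Mark the end of the GPU work and wait only on that event.
    stop.record(&stream)?;
    stop.synchronize()?;

    // Elapsed time between the two recorded events.
    let micros = stop.elapsed(&start)?.as_micros();
    println!("GPU work took {micros} µs");
    Ok(())
}
```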
11 changes: 11 additions & 0 deletions samples/introduction/matmul/Cargo.toml
@@ -0,0 +1,11 @@
[package]
name = "matmul"
version = "0.1.0"
edition = "2024"

[dependencies]
cust = { path = "../../../crates/cust" }
cuda_std = { path = "../../../crates/cuda_std" }

[build-dependencies]
cuda_builder = { workspace = true, default-features = false }
17 changes: 17 additions & 0 deletions samples/introduction/matmul/build.rs
@@ -0,0 +1,17 @@
use std::env;
use std::path;

use cuda_builder::CudaBuilder;

fn main() {
println!("cargo::rerun-if-changed=build.rs");
println!("cargo::rerun-if-changed=kernels");

let out_path = path::PathBuf::from(env::var("OUT_DIR").unwrap());
let manifest_dir = path::PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());

CudaBuilder::new(manifest_dir.join("kernels"))
.copy_to(out_path.join("kernels.ptx"))
.build()
.unwrap();
}
10 changes: 10 additions & 0 deletions samples/introduction/matmul/kernels/Cargo.toml
@@ -0,0 +1,10 @@
[package]
name = "kernels"
version = "0.1.0"
edition = "2024"

[dependencies]
cuda_std = { path = "../../../../crates/cuda_std" }

[lib]
crate-type = ["cdylib", "rlib"]
76 changes: 76 additions & 0 deletions samples/introduction/matmul/kernels/src/lib.rs
@@ -0,0 +1,76 @@
use core::mem::MaybeUninit;
use cuda_std::*;

// SAFETY: This function is unsafe because it dereferences raw pointers.
#[kernel]
pub unsafe fn matrix_mul_cuda(c: *mut f32, a: &[f32], b: &[f32], wa: usize, wb: usize) {
let bx: usize = cuda_std::thread::block_idx().x as usize;
let by: usize = cuda_std::thread::block_idx().y as usize;

let tx: usize = cuda_std::thread::thread_idx().x as usize;
let ty: usize = cuda_std::thread::thread_idx().y as usize;

const BLOCK_SIZE: usize = 32;
let a_begin = wa * BLOCK_SIZE * by;
let a_end = a_begin + wa - 1;
let a_step = BLOCK_SIZE;

let b_begin = BLOCK_SIZE * bx;
let b_step = BLOCK_SIZE * wb;

let mut c_sub: f32 = 0.0;
let mut kahan_correction_factor = 0.0f32;
let mut bi = b_begin;

for ai in (a_begin..=a_end).step_by(a_step) {
// The equivalent CUDA C++ code for the below is:
// ```
// __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
// ```
// This memory space is shared between threads of the same block
#[address_space(shared)]
static mut As: [[MaybeUninit<f32>; BLOCK_SIZE]; BLOCK_SIZE] =
[[const { MaybeUninit::uninit() }; BLOCK_SIZE]; BLOCK_SIZE];

#[address_space(shared)]
static mut Bs: [[MaybeUninit<f32>; BLOCK_SIZE]; BLOCK_SIZE] =
[[const { MaybeUninit::uninit() }; BLOCK_SIZE]; BLOCK_SIZE];

// Load the A and B tiles into shared memory.
// `a` and `b` are slices, so the elements can be read by plain indexing
// (no raw-pointer dereference is needed here, unlike the output pointer `c` below).
unsafe {
As[ty][tx].write(a[ai + wa * ty + tx]);
Bs[ty][tx].write(b[bi + wb * ty + tx]);
}

// Synchronize to make sure the matrices are loaded
cuda_std::thread::sync_threads();
for k in 0..BLOCK_SIZE {
// Typically, this would be a simple calculation:
// ```
// c_sub += As[ty][k] * Bs[k][tx];
// ```
// However, to improve numerical stability, we use Kahan summation here so that the rounding error
// is isolated instead of being allowed to accumulate in c_sub
let input = unsafe { As[ty][k].assume_init() * Bs[k][tx].assume_init() };
let y = input - kahan_correction_factor;
let sum = c_sub + y;

// Algebraically this correction factor would be zero; due to f32 precision limitations, however,
// it captures the small errors that would otherwise accumulate in c_sub
kahan_correction_factor = (sum - c_sub) - y;
c_sub = sum;
}

// Synchronize to make sure that the preceding computation is done before loading two new sub-matrices of A and B in the next iteration
cuda_std::thread::sync_threads();

bi += b_step;
}

let ci = wb * BLOCK_SIZE * by + BLOCK_SIZE * bx;
unsafe {
*c.add((ci + wb * ty + tx) as usize) = c_sub;
}
}
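For readers unfamiliar with compensated summation, the c_sub / kahan_correction_factor logic above can also be exercised on the host. The following is a minimal, self-contained Rust sketch of the same scheme; kahan_sum and the test values are illustrative and not part of the sample:

```rust
/// Kahan (compensated) summation: `c` carries the low-order bits that
/// plain `sum += x` would discard, so the error does not accumulate.
fn kahan_sum(values: &[f32]) -> f32 {
    let mut sum = 0.0f32;
    let mut c = 0.0f32; // running compensation for lost low-order bits
    for &x in values {
        let y = x - c;     // subtract the error carried from the last step
        let t = sum + y;   // low-order bits of y may be lost here...
        c = (t - sum) - y; // ...but they are recovered into c
        sum = t;
    }
    sum
}

fn main() {
    // Summing many small values into a large accumulator is where naive
    // f32 summation loses precision; compensated summation keeps it.
    let values = vec![0.01f32; 1_000_000];
    let naive: f32 = values.iter().sum();
    println!("naive = {naive}, kahan = {}", kahan_sum(&values));
}
```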
183 changes: 183 additions & 0 deletions samples/introduction/matmul/src/main.rs
@@ -0,0 +1,183 @@
/* This example demonstrates an implementation of matrix multiplication.
*
* 1. The matrices are first created on the host side and then copied to the device.
* 2. A block-specific piece of shared memory is created on the device so that each block can compute its tile of the product from fast shared memory.
* 3. The result is copied back to the host, where the accumulated error is checked.
* 4. Extra: the error that accumulates during the summation is reduced (in the kernel itself) using the [Kahan summation algorithm](https://en.wikipedia.org/wiki/Kahan_summation_algorithm).
*/

use cuda_std::glam::USizeVec2;
use cust::device::Device;
use cust::event::{Event, EventFlags};
use cust::function::{BlockSize, GridSize};
use cust::launch;
use cust::memory::{AsyncCopyDestination, DeviceBuffer, LockedBuffer};
use cust::module::Module;
use cust::stream::{Stream, StreamFlags};

static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx"));

fn matrix_multiply(
block_size: usize,
dims_a: USizeVec2,
dims_b: USizeVec2,
) -> Result<(), cust::error::CudaError> {
let dims_c = USizeVec2::new(dims_b.x, dims_a.y);
let size_a = dims_a.x * dims_a.y;
let h_a = LockedBuffer::new(&1.0f32, size_a).expect("host array couldn't be initialized!");

let size_b = dims_b.x * dims_b.y;
let h_b = LockedBuffer::new(&0.01f32, size_b).expect("host array couldn't be initialized!");

let stream = Stream::new(StreamFlags::NON_BLOCKING, None).expect("Stream couldn't be init!");

let size_c = dims_b.x * dims_a.y;
let mut h_c = LockedBuffer::new(&0.0f32, size_c).expect("host array couldn't be initialized!");

let start_event = Event::new(EventFlags::DEFAULT)?;
let stop_event = Event::new(EventFlags::DEFAULT)?;

let d_a =
DeviceBuffer::from_slice(h_a.as_slice()).expect("device array couldn't be initialized!");
let d_b =
DeviceBuffer::from_slice(h_b.as_slice()).expect("device array couldn't be initialized!");
let d_c =
DeviceBuffer::from_slice(h_c.as_slice()).expect("device array couldn't be initialized!");

stream.synchronize().expect("Stream couldn't synchronize!");
let threads = BlockSize::xy(block_size as u32, block_size as u32);
let grid = GridSize::xy(
(dims_b.x / (threads.x as usize)).try_into().unwrap(),
(dims_a.y / (threads.y as usize)).try_into().unwrap(),
);

println!("Computing result using CUDA Kernel...");

let module = Module::from_ptx(PTX, &[]).expect("Module couldn't be init!");
let matrix_mul_cuda = module
.get_function("matrix_mul_cuda")
.expect("Kernel function not found!");

unsafe {
// The function definition of the kernel is:
// ```
// pub unsafe fn matrix_mul_cuda(c: *mut f32, a: &[f32], b: &[f32], wa: usize, wb: usize)
// ```
// For elements that have the type `*mut T` or `*const T`, we'll need to pass only the device pointer.
// For elements that have the type `&[T]`, we must pass the device pointer as well as the length of the slice.
launch!(matrix_mul_cuda<<<grid, threads, 0, stream>>>(
d_c.as_device_ptr(),
d_a.as_device_ptr(),
d_a.len(),
d_b.as_device_ptr(),
d_b.len(),
dims_a.x,
dims_b.x
))?;
}

println!("Done!");
stream.synchronize().expect("Stream couldn't synchronize!");

start_event
.record(&stream)
.expect("Failed to record start_event in the CUDA stream!");

const N_ITER: u32 = 300;

for _ in 0..N_ITER {
unsafe {
launch!(matrix_mul_cuda<<<grid, threads, 0, stream>>>(
d_c.as_device_ptr(),
d_a.as_device_ptr(),
d_a.len(),
d_b.as_device_ptr(),
d_b.len(),
dims_a.x,
dims_b.x,
))?;
}
}

stop_event
.record(&stream)
.expect("Failed to record stop_event in the CUDA stream!");

stop_event
.synchronize()
.expect("Stream couldn't synchronize!");

let gpu_time: u128 = stop_event
.elapsed(&start_event)
.expect("Failed to calculate duration of GPU operations!")
.as_micros();

let avg_time = gpu_time as f32 / N_ITER as f32;
println!(
"Average time spent executing by the GPU: {} microseconds",
avg_time
);
let flops_per_matrix_mul = 2.0 * (dims_a.x as f32) * (dims_a.y as f32) * (dims_b.x as f32);
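// avg_time is in microseconds: FLOP per microsecond equals 1e6 FLOP/s, so dividing by 1000 gives GFLOP/s.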
let giga_flops = (flops_per_matrix_mul / (avg_time)) / 1000.0;
println!("Performance = {} GFlop/s", giga_flops);

unsafe {
d_c.async_copy_to(&mut h_c, &stream)
.expect("Could not copy from device to host!");
}
stream.synchronize().expect("Stream couldn't synchronize!");

// checking computed result
// test relative error by the formula
// |<x, y>_cpu - <x, y>_gpu| / |<x, y>_cpu|
let machine_epsilon = 1.1920929E-07f32;
let mut correct = true;

for i in 0..(dims_c.x * dims_c.y) {
let abs_err = (h_c[i] - (dims_a.x as f32 * 0.01f32)).abs();
let dot_length = (dims_a.x as f32).abs();
let abs_val = h_c[i].abs();
let rel_err = abs_err / abs_val.max(dot_length * machine_epsilon);

if rel_err > 1e-6 {
println!(
"Error at index {}: CPU = {}, GPU = {}, rel_err = {}",
i,
dims_a.x as f32 * 0.01f32,
h_c[i],
rel_err
);
correct = false;
}
}

if correct {
println!("Result = PASS");
println!(
"NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled."
);
} else {
println!("Result = FAIL");
return Err(cust::error::CudaError::UnknownError);
}

Ok(())
}

fn main() -> Result<(), cust::error::CudaError> {
// Set up the CUDA context; the module and stream are created inside matrix_multiply.
let _ctx = cust::quick_init()?;
let device = Device::get_device(0).expect("Couldn't find a CUDA-capable device!");
println!("Device Name: {}", device.name().unwrap());

let block_size: u32 = 32;
Collaborator review comment: this could be a const BLOCK_SIZE: u32 = 32;

let dims_a = USizeVec2::new(40 * block_size as usize, 40 * block_size as usize);
let dims_b = USizeVec2::new(80 * block_size as usize, 40 * block_size as usize);

if dims_a.x != dims_b.y {
panic!("Matrix multiplication not possible with the given dimensions!");
}

matrix_multiply(block_size as usize, dims_a, dims_b)?;
Ok(())
}