Skip to content

Commit 290d711

Browse files
adamcavendish and jorge-ortega
authored and committed
chore(examples): restructure CUDA examples and add a GEMM example
- Refactored the CUDA examples directory for improved clarity and modularity, so each example is more self-contained.
- Added a new GEMM (General Matrix Multiply) example, including naive and tiled kernel implementations, build scripts, and benchmarks.
- The tiled-gemm kernel demonstrates shared memory usage.
1 parent 81aa642 commit 290d711

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+733
-86
lines changed

.github/workflows/ci_linux.yml

+3-3
Original file line numberDiff line numberDiff line change
@@ -73,18 +73,18 @@ jobs:
7373
- name: Clippy
7474
env:
7575
RUSTFLAGS: -Dwarnings
76-
run: cargo clippy --workspace --exclude "optix*" --exclude "path_tracer" --exclude "denoiser" --exclude "ex*" --exclude "cudnn*"
76+
run: cargo clippy --workspace --exclude "optix*" --exclude "path-tracer" --exclude "denoiser" --exclude "ex*" --exclude "cudnn*"
7777

7878
- name: Build all bindings
7979
run: cargo build --all-features -p cust_raw
8080

8181
- name: Build workspace
82-
run: cargo build --workspace --exclude "optix*" --exclude "path_tracer" --exclude "denoiser" --exclude "ex*" --exclude "cudnn*"
82+
run: cargo build --workspace --exclude "optix*" --exclude "path-tracer" --exclude "denoiser" --exclude "ex*" --exclude "cudnn*"
8383

8484
- name: Check documentation
8585
env:
8686
RUSTDOCFLAGS: -Dwarnings
87-
run: cargo doc --workspace --all-features --document-private-items --no-deps --exclude "optix*" --exclude "path_tracer" --exclude "denoiser" --exclude "ex*" --exclude "cudnn*" --exclude "cust_raw"
87+
run: cargo doc --workspace --all-features --document-private-items --no-deps --exclude "optix*" --exclude "path-tracer" --exclude "denoiser" --exclude "ex*" --exclude "cudnn*" --exclude "cust_raw"
8888

8989
- name: Prepare artifact details
9090
id: artifact_details

.github/workflows/ci_windows.yml

+2-2
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ jobs:
6666
run: cargo build --all-features -p cust_raw
6767

6868
- name: Build
69-
run: cargo build --workspace --exclude "optix*" --exclude "path_tracer" --exclude "denoiser" --exclude "add" --exclude "ex*" --exclude "cudnn*"
69+
run: cargo build --workspace --exclude "optix*" --exclude "path-tracer" --exclude "denoiser" --exclude "vecadd*" --exclude "gemm*" --exclude "ex*" --exclude "cudnn*"
7070

7171
# Don't currently test because many tests rely on the system having a CUDA GPU
7272
# - name: Test
@@ -75,4 +75,4 @@ jobs:
7575
- name: Check documentation
7676
env:
7777
RUSTDOCFLAGS: -Dwarnings
78-
run: cargo doc --workspace --all-features --document-private-items --no-deps --exclude "optix*" --exclude "path_tracer" --exclude "denoiser" --exclude "add" --exclude "ex*" --exclude "cudnn*" --exclude "cust_raw"
78+
run: cargo doc --workspace --all-features --document-private-items --no-deps --exclude "optix*" --exclude "path-tracer" --exclude "denoiser" --exclude "vecadd*" --exclude "gemm*" --exclude "ex*" --exclude "cudnn*" --exclude "cust_raw"

Cargo.toml

+7-2
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,14 @@ members = [
88

99
"xtask",
1010

11+
"examples/cuda/vecadd",
12+
"examples/cuda/vecadd/kernels",
13+
"examples/cuda/gemm",
14+
"examples/cuda/gemm/kernels",
15+
"examples/cuda/path_tracer",
16+
"examples/cuda/path_tracer/kernels",
17+
1118
"examples/optix/*",
12-
"examples/cuda/cpu/*",
13-
"examples/cuda/gpu/*",
1419
]
1520

1621
exclude = [

examples/cuda/cpu/add/Cargo.toml

-22
This file was deleted.

examples/cuda/cpu/add/build.rs

-8
This file was deleted.

examples/cuda/cpu/path_tracer/build.rs

-14
This file was deleted.

examples/cuda/gemm/Cargo.toml

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
[package]
2+
name = "gemm"
3+
version = "0.1.0"
4+
edition = "2024"
5+
6+
[dependencies]
7+
blastoff = { path = "../../../crates/blastoff" }
8+
cuda_std = { path = "../../../crates/cuda_std" }
9+
cust = { path = "../../../crates/cust" }
10+
cust_raw = { path = "../../../crates/cust_raw", features = ["driver"] }
11+
ndarray = { version = "0.16", features = ["approx"] }
12+
ndarray-rand = "0.15.0"
13+
rand = "0.9"
14+
15+
[build-dependencies]
16+
cuda_builder = { path = "../../../crates/cuda_builder" }

examples/cuda/gemm/build.rs

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
use std::env;
2+
use std::path;
3+
4+
use cuda_builder::CudaBuilder;
5+
6+
fn main() {
7+
println!("cargo::rerun-if-changed=build.rs");
8+
println!("cargo::rerun-if-changed=kernels");
9+
10+
let out_path = path::PathBuf::from(env::var("OUT_DIR").unwrap());
11+
CudaBuilder::new("kernels")
12+
.copy_to(out_path.join("kernels.ptx"))
13+
.build()
14+
.unwrap();
15+
}

examples/cuda/gemm/kernels/Cargo.toml

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
[package]
2+
name = "gemm-kernels"
3+
version = "0.1.0"
4+
edition = "2024"
5+
6+
[dependencies]
7+
cuda_std = { path = "../../../../crates/cuda_std" }
8+
glam = { version = "0.30.1", default-features = false, features = ["cuda", "nostd-libm"] }
9+
10+
[lib]
11+
crate-type = ["cdylib", "rlib"]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
use cuda_std::kernel;
2+
use cuda_std::thread;
3+
4+
#[kernel]
5+
#[allow(improper_ctypes_definitions)]
6+
/// Naive GEMM kernel for C = alpha * A * B + beta * C.
7+
///
8+
/// This kernel computes each element of the output matrix C independently, without any memory coalescing or tiling optimizations.
9+
///
10+
/// # Safety
11+
/// CUDA kernel requires unsafe.
12+
///
13+
/// # Parameters
14+
/// - `mat_a`: Input matrix A, shape (m x k), row-major order.
15+
/// - `mat_b`: Input matrix B, shape (k x n), row-major order.
16+
/// - `mat_c`: Output matrix C, shape (m x n), row-major order. Must be valid for writes.
17+
/// - `m`: Number of rows in A and C.
18+
/// - `n`: Number of columns in B and C.
19+
/// - `k`: Number of columns in A and rows in B.
20+
/// - `alpha`: Scalar multiplier for A * B.
21+
/// - `beta`: Scalar multiplier for C.
22+
///
23+
/// # Thread Mapping
24+
/// Each thread computes one element of C at (row, col).
25+
pub unsafe fn gemm_naive(
26+
mat_a: &[f32],
27+
mat_b: &[f32],
28+
mat_c: *mut f32,
29+
m: usize,
30+
n: usize,
31+
k: usize,
32+
alpha: f32,
33+
beta: f32,
34+
) {
35+
let row = (thread::block_dim_x() * thread::block_idx_x() + thread::thread_idx_x()) as usize;
36+
let col = (thread::block_dim_y() * thread::block_idx_y() + thread::thread_idx_y()) as usize;
37+
38+
if row < m && col < n {
39+
let mut sum = 0.0f32;
40+
for i in 0..k {
41+
sum += mat_a[row * k + i] * mat_b[i * n + col];
42+
}
43+
let elem = unsafe { &mut *mat_c.add((row * n + col) as usize) };
44+
*elem = alpha * sum + beta * *elem;
45+
}
46+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
use cuda_std::address_space;
2+
use cuda_std::kernel;
3+
use cuda_std::thread;
4+
5+
#[kernel]
6+
#[allow(improper_ctypes_definitions)]
7+
/// Tiled GEMM kernel for C = alpha * A * B + beta * C.
8+
///
9+
/// This kernel uses shared memory tiling to improve memory access patterns and performance.
10+
///
11+
/// # Safety
12+
/// CUDA kernel requires unsafe.
13+
///
14+
/// # Parameters
15+
/// - `mat_a`: Input matrix A, shape (m x k), row-major order.
16+
/// - `mat_b`: Input matrix B, shape (k x n), row-major order.
17+
/// - `mat_c`: Output matrix C, shape (m x n), row-major order. Must be valid for writes.
18+
/// - `m`: Number of rows in A and C.
19+
/// - `n`: Number of columns in B and C.
20+
/// - `k`: Number of columns in A and rows in B.
21+
/// - `alpha`: Scalar multiplier for A * B.
22+
/// - `beta`: Scalar multiplier for C.
23+
///
24+
/// # Tiling
25+
/// Each block computes a TILE_SIZE x TILE_SIZE tile of C using shared memory for A and B tiles.
26+
/// Threads within a block collaboratively load tiles and compute partial sums.
27+
///
28+
/// # Thread Mapping
29+
/// Each thread computes one element of the output tile.
30+
pub unsafe fn gemm_tiled(
31+
mat_a: &[f32],
32+
mat_b: &[f32],
33+
mat_c: *mut f32,
34+
m: usize,
35+
n: usize,
36+
k: usize,
37+
alpha: f32,
38+
beta: f32,
39+
) {
40+
const TILE_SIZE: usize = 16;
41+
42+
#[address_space(shared)]
43+
static mut TILE_A: [f32; TILE_SIZE * TILE_SIZE] = [0.; TILE_SIZE * TILE_SIZE];
44+
#[address_space(shared)]
45+
static mut TILE_B: [f32; TILE_SIZE * TILE_SIZE] = [0.; TILE_SIZE * TILE_SIZE];
46+
47+
// Thread indices within the block.
48+
let tx = thread::thread_idx_x() as usize;
49+
let ty = thread::thread_idx_y() as usize;
50+
51+
// Calculate row and column in the mat_c.
52+
let row = thread::block_idx_x() as usize * TILE_SIZE + ty;
53+
let col = thread::block_idx_y() as usize * TILE_SIZE + tx;
54+
55+
let mut sum = 0.0f32;
56+
// Loop over tiles of mat_a and mat_b in the k dimension.
57+
for kk in (0..k).step_by(TILE_SIZE) {
58+
// Collaborative loading of tiles into shared memory.
59+
if row < m && (kk + tx) < k {
60+
unsafe { TILE_A[ty * TILE_SIZE + tx] = mat_a[row * k + (kk + tx)] };
61+
} else {
62+
unsafe { TILE_A[ty * TILE_SIZE + tx] = 0.0f32 };
63+
}
64+
if col < n && (kk + ty) < k {
65+
unsafe { TILE_B[ty * TILE_SIZE + tx] = mat_b[(kk + ty) * n + col] };
66+
} else {
67+
unsafe { TILE_B[ty * TILE_SIZE + tx] = 0.0f32 };
68+
}
69+
thread::sync_threads();
70+
71+
// Perform the computation on the tile.
72+
for i in 0..TILE_SIZE {
73+
sum += unsafe { TILE_A[ty * TILE_SIZE + i] * TILE_B[i * TILE_SIZE + tx] };
74+
}
75+
thread::sync_threads();
76+
}
77+
78+
// Write the result back to mat_c with alpha and beta scaling.
79+
if row < m && col < n {
80+
let c = unsafe { mat_c.add(row * n + col) };
81+
unsafe { *c = alpha * sum + beta * *c };
82+
}
83+
}

examples/cuda/gemm/kernels/src/lib.rs

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
mod gemm_naive;
2+
mod gemm_tiled;
3+
4+
pub use crate::gemm_naive::gemm_naive;
5+
pub use crate::gemm_tiled::gemm_tiled;

0 commit comments

Comments
 (0)