diff --git a/Cargo.lock b/Cargo.lock
index 23b752b4..7a9496b6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1717,6 +1717,13 @@ dependencies = [
  "wasm-bindgen",
 ]
 
+[[package]]
+name = "kernels"
+version = "0.1.0"
+dependencies = [
+ "cuda_std",
+]
+
 [[package]]
 name = "khronos_api"
 version = "3.1.0"
@@ -1859,6 +1866,15 @@ dependencies = [
  "regex-automata",
 ]
 
+[[package]]
+name = "matmul"
+version = "0.1.0"
+dependencies = [
+ "cuda_builder",
+ "cuda_std",
+ "cust",
+]
+
 [[package]]
 name = "matrixmultiply"
 version = "0.3.10"
diff --git a/Cargo.toml b/Cargo.toml
index 632d3460..108b31a6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -54,6 +54,8 @@ members = [
 
     "samples/introduction/async_api",
     "samples/introduction/async_api/kernels",
+    "samples/introduction/matmul",
+    "samples/introduction/matmul/kernels",
 
     "tests/compiletests",
     "tests/compiletests/deps-helper",
diff --git a/samples/introduction/README.md b/samples/introduction/README.md
deleted file mode 100644
index 3c980a66..00000000
--- a/samples/introduction/README.md
+++ /dev/null
@@ -1,8 +0,0 @@
-# Chapter 0: Introduction
-
-## [asyncAPI](https://github.com/Rust-GPU/rust-cuda/samples/introduction/async_api)
-This example demonstrates two key capabilities of CUDA events: measuring GPU execution time and enabling concurrent CPU-GPU operations.
-
-1. Events are recorded at specific points within a CUDA stream to mark the beginning and end of GPU operations.
-2. Because CUDA stream operations execute asynchronously, the CPU remains free to perform other work while the GPU processes tasks (including memory transfers between host and device)
-3. The CPU can query these events to check whether the GPU has finished its work, allowing for coordination between the two processors without blocking the CPU.
diff --git a/samples/introduction/async_api/src/main.rs b/samples/introduction/async_api/src/main.rs
index 063efbd3..8ecbdf12 100644
--- a/samples/introduction/async_api/src/main.rs
+++ b/samples/introduction/async_api/src/main.rs
@@ -1,3 +1,10 @@
+/* This example demonstrates two key capabilities of CUDA events: measuring GPU execution time and enabling concurrent CPU-GPU operations.
+ *
+ * 1. Events are recorded at specific points within a CUDA stream to mark the beginning and end of GPU operations.
+ * 2. Because CUDA stream operations execute asynchronously, the CPU remains free to perform other work while the GPU processes tasks (including memory transfers between host and device).
+ * 3. The CPU can query these events to check whether the GPU has finished its work, allowing for coordination between the two processors without blocking the CPU.
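+ *
+ * A rough sketch of the timing pattern (illustrative only; the method names are the same
+ * `cust` event calls used by the samples in this chapter):
+ *
+ *   let start = Event::new(EventFlags::DEFAULT)?;
+ *   let stop = Event::new(EventFlags::DEFAULT)?;
+ *   start.record(&stream)?;   // mark the beginning of the GPU work
+ *   // ... queue kernel launches / async copies on `stream`; the CPU stays free ...
+ *   stop.record(&stream)?;    // mark the end of the GPU work
+ *   stop.synchronize()?;      // or poll the event instead of blocking
+ *   let gpu_time = stop.elapsed(&start)?;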
+ */
+
 use cust::device::Device;
 use cust::event::{Event, EventFlags};
 use cust::function::{BlockSize, GridSize};
diff --git a/samples/introduction/matmul/Cargo.toml b/samples/introduction/matmul/Cargo.toml
new file mode 100644
index 00000000..344339ee
--- /dev/null
+++ b/samples/introduction/matmul/Cargo.toml
@@ -0,0 +1,11 @@
+[package]
+name = "matmul"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+cust = { path = "../../../crates/cust" }
+cuda_std = { path = "../../../crates/cuda_std" }
+
+[build-dependencies]
+cuda_builder = { workspace = true, default-features = false }
\ No newline at end of file
diff --git a/samples/introduction/matmul/build.rs b/samples/introduction/matmul/build.rs
new file mode 100644
index 00000000..7f23bac1
--- /dev/null
+++ b/samples/introduction/matmul/build.rs
@@ -0,0 +1,17 @@
+use std::env;
+use std::path;
+
+use cuda_builder::CudaBuilder;
+
+fn main() {
+    println!("cargo::rerun-if-changed=build.rs");
+    println!("cargo::rerun-if-changed=kernels");
+
+    let out_path = path::PathBuf::from(env::var("OUT_DIR").unwrap());
+    let manifest_dir = path::PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
+
+    CudaBuilder::new(manifest_dir.join("kernels"))
+        .copy_to(out_path.join("kernels.ptx"))
+        .build()
+        .unwrap();
+}
diff --git a/samples/introduction/matmul/kernels/Cargo.toml b/samples/introduction/matmul/kernels/Cargo.toml
new file mode 100644
index 00000000..1aa3f192
--- /dev/null
+++ b/samples/introduction/matmul/kernels/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "kernels"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+cuda_std = { path = "../../../../crates/cuda_std" }
+
+[lib]
+crate-type = ["cdylib", "rlib"]
\ No newline at end of file
diff --git a/samples/introduction/matmul/kernels/src/lib.rs b/samples/introduction/matmul/kernels/src/lib.rs
new file mode 100644
index 00000000..52244612
--- /dev/null
+++ b/samples/introduction/matmul/kernels/src/lib.rs
@@ -0,0 +1,76 @@
+use core::mem::MaybeUninit;
+use cuda_std::*;
+
+// SAFETY: This function is unsafe because it dereferences raw pointers.
+#[kernel]
+pub unsafe fn matrix_mul_cuda(c: *mut f32, a: &[f32], b: &[f32], wa: usize, wb: usize) {
+    let bx: usize = cuda_std::thread::block_idx().x as usize;
+    let by: usize = cuda_std::thread::block_idx().y as usize;
+
+    let tx: usize = cuda_std::thread::thread_idx().x as usize;
+    let ty: usize = cuda_std::thread::thread_idx().y as usize;
+
+    const BLOCK_SIZE: usize = 32;
+    let a_begin = wa * BLOCK_SIZE * by;
+    let a_end = a_begin + wa - 1;
+    let a_step = BLOCK_SIZE;
+
+    let b_begin = BLOCK_SIZE * bx;
+    let b_step = BLOCK_SIZE * wb;
+
+    let mut c_sub: f32 = 0.0;
+    let mut kahan_correction_factor = 0.0f32;
+    let mut bi = b_begin;
+
+    for ai in (a_begin..=a_end).step_by(a_step) {
+        // The equivalent CUDA C++ code for the below is:
+        // ```
+        // __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
+        // ```
+        // This memory space is shared between threads of the same block.
+        #[address_space(shared)]
+        static mut As: [[MaybeUninit<f32>; BLOCK_SIZE]; BLOCK_SIZE] =
+            [[const { MaybeUninit::uninit() }; BLOCK_SIZE]; BLOCK_SIZE];
+
+        #[address_space(shared)]
+        static mut Bs: [[MaybeUninit<f32>; BLOCK_SIZE]; BLOCK_SIZE] =
+            [[const { MaybeUninit::uninit() }; BLOCK_SIZE]; BLOCK_SIZE];
+
+        // Load one BLOCK_SIZE x BLOCK_SIZE tile of A and one of B into shared memory.
+        // `a` and `b` are slices, so indexing them yields the element directly;
+        // each thread of the block copies a single element of each tile.
+        unsafe {
+            As[ty][tx].write(a[ai + wa * ty + tx]);
+            Bs[ty][tx].write(b[bi + wb * ty + tx]);
+        }
+
+        // Synchronize to make sure the matrices are loaded
+        cuda_std::thread::sync_threads();
+        for k in 0..BLOCK_SIZE {
+            // Typically, this would be a simple calculation:
+            // ```
+            // c_sub += As[ty][k] * Bs[k][tx];
+            // ```
+            // However, to improve numerical stability we use Kahan summation, so that the rounding error
+            // is isolated in a correction factor instead of accumulating in c_sub.
+            let input = unsafe { As[ty][k].assume_init() * Bs[k][tx].assume_init() };
+            let y = input - kahan_correction_factor;
+            let sum = c_sub + y;
+
+            // Algebraically this correction factor would always be zero; due to f32 precision limitations,
+            // however, it captures the small errors that would otherwise accumulate in c_sub.
+            kahan_correction_factor = (sum - c_sub) - y;
+            c_sub = sum;
+        }
+
+        // Synchronize to make sure that the preceding computation is done before loading two new sub-matrices of A and B in the next iteration
+        cuda_std::thread::sync_threads();
+
+        bi += b_step;
+    }
+
+    let ci = wb * BLOCK_SIZE * by + BLOCK_SIZE * bx;
+    unsafe {
+        *c.add((ci + wb * ty + tx) as usize) = c_sub;
+    }
+}
diff --git a/samples/introduction/matmul/src/main.rs b/samples/introduction/matmul/src/main.rs
new file mode 100644
index 00000000..4e2b0db3
--- /dev/null
+++ b/samples/introduction/matmul/src/main.rs
@@ -0,0 +1,183 @@
+/* This example demonstrates an implementation of matrix multiplication.
+ *
+ * 1. The matrices are first created on the host side and then copied to the device.
+ * 2. A block-local piece of shared memory is allocated on the device side, so that the partial sums can be accumulated quickly.
+ * 3. The result is copied back to the host, where the accumulated error is checked.
+ * 4. Extra: The error that accumulates during the summation process is reduced (in the kernel itself) using the [Kahan summation algorithm](https://en.wikipedia.org/wiki/Kahan_summation_algorithm).
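+ *
+ * A minimal host-side sketch of the Kahan update used in the kernel (illustrative only;
+ * `values` is a stand-in for the per-thread products, the real code is in kernels/src/lib.rs):
+ *
+ *   let mut sum = 0.0f32;
+ *   let mut correction = 0.0f32;      // compensation for lost low-order bits
+ *   for x in values {
+ *       let y = x - correction;       // apply the compensation from the previous step
+ *       let t = sum + y;              // low-order bits of y may be lost in this addition
+ *       correction = (t - sum) - y;   // recover what was just lost ...
+ *       sum = t;                      // ... and fold it into the next iteration
+ *   }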
+ */
+
+use cuda_std::glam::USizeVec2;
+use cust::device::Device;
+use cust::event::{Event, EventFlags};
+use cust::function::{BlockSize, GridSize};
+use cust::launch;
+use cust::memory::{AsyncCopyDestination, DeviceBuffer, LockedBuffer};
+use cust::module::Module;
+use cust::stream::{Stream, StreamFlags};
+
+static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx"));
+
+fn matrix_multiply(
+    block_size: usize,
+    dims_a: USizeVec2,
+    dims_b: USizeVec2,
+) -> Result<(), cust::error::CudaError> {
+    let dims_c = USizeVec2::new(dims_b.x, dims_a.y);
+    let size_a = dims_a.x * dims_a.y;
+    let h_a = LockedBuffer::new(&1.0f32, size_a).expect("host array couldn't be initialized!");
+
+    let size_b = dims_b.x * dims_b.y;
+    let h_b = LockedBuffer::new(&0.01f32, size_b).expect("host array couldn't be initialized!");
+
+    let stream = Stream::new(StreamFlags::NON_BLOCKING, None).expect("Stream couldn't be init!");
+
+    let size_c = dims_b.x * dims_a.y;
+    let mut h_c = LockedBuffer::new(&0.0f32, size_c).expect("host array couldn't be initialized!");
+
+    let start_event = Event::new(EventFlags::DEFAULT)?;
+    let stop_event = Event::new(EventFlags::DEFAULT)?;
+
+    let d_a =
+        DeviceBuffer::from_slice(h_a.as_slice()).expect("device array couldn't be initialized!");
+    let d_b =
+        DeviceBuffer::from_slice(h_b.as_slice()).expect("device array couldn't be initialized!");
+    let d_c =
+        DeviceBuffer::from_slice(h_c.as_slice()).expect("device array couldn't be initialized!");
+
+    stream.synchronize().expect("Stream couldn't synchronize!");
+    let threads = BlockSize::xy(block_size as u32, block_size as u32);
+    let grid = GridSize::xy(
+        (dims_b.x / (threads.x as usize)).try_into().unwrap(),
+        (dims_a.y / (threads.y as usize)).try_into().unwrap(),
+    );
+
+    println!("Computing result using CUDA Kernel...");
+
+    let module = Module::from_ptx(PTX, &[]).expect("Module couldn't be init!");
+    let matrix_mul_cuda = module
+        .get_function("matrix_mul_cuda")
+        .expect("Kernel function not found!");
+
+    unsafe {
+        // The function definition of the kernel is:
+        // ```
+        // pub unsafe fn matrix_mul_cuda(c: *mut f32, a: &[f32], b: &[f32], wa: usize, wb: usize)
+        // ```
+        // For parameters of type `*mut T` or `*const T`, only the device pointer needs to be passed.
+        // For parameters of type `&[T]`, we must pass the device pointer as well as the length of the slice.
+        launch!(matrix_mul_cuda<<<grid, threads, 0, stream>>>(
+            d_c.as_device_ptr(),
+            d_a.as_device_ptr(),
+            d_a.len(),
+            d_b.as_device_ptr(),
+            d_b.len(),
+            dims_a.x,
+            dims_b.x
+        ))?;
+    }
+
+    println!("Done!");
+    stream.synchronize().expect("Stream couldn't synchronize!");
+
+    start_event
+        .record(&stream)
+        .expect("Failed to record start_event in the CUDA stream!");
+
+    const N_ITER: u32 = 300;
+
+    for _ in 0..N_ITER {
+        unsafe {
+            launch!(matrix_mul_cuda<<<grid, threads, 0, stream>>>(
+                d_c.as_device_ptr(),
+                d_a.as_device_ptr(),
+                d_a.len(),
+                d_b.as_device_ptr(),
+                d_b.len(),
+                dims_a.x,
+                dims_b.x,
+            ))?;
+        }
+    }
+
+    stop_event
+        .record(&stream)
+        .expect("Failed to record stop_event in the CUDA stream!");
+
+    stop_event
+        .synchronize()
+        .expect("Stream couldn't synchronize!");
+
+    let gpu_time: u128 = stop_event
+        .elapsed(&start_event)
+        .expect("Failed to calculate duration of GPU operations!")
+        .as_micros();
+
+    let avg_time = gpu_time as f32 / N_ITER as f32;
+    println!(
+        "Average time spent executing by the GPU: {} microseconds",
+        avg_time
+    );
+    let flops_per_matrix_mul = 2.0 * (dims_a.x as f32) * (dims_a.y as f32) * (dims_b.x as f32);
+    let giga_flops = (flops_per_matrix_mul / (avg_time)) / 1000.0;
+    println!("Performance = {} GFlop/s", giga_flops);
+
+    unsafe {
+        d_c.async_copy_to(&mut h_c, &stream)
+            .expect("Could not copy from device to host!");
+    }
+    stream.synchronize().expect("Stream couldn't synchronize!");
+
+    // Check the computed result:
+    // test relative error by the formula
+    // |<x, y>_cpu - <x, y>_gpu| / |<x, y>_cpu|
+    let machine_epsilon = 1.1920929E-07f32;
+    let mut correct = true;
+
+    for i in 0..(dims_c.x * dims_c.y) {
+        let abs_err = (h_c[i] - (dims_a.x as f32 * 0.01f32)).abs();
+        let dot_length = (dims_a.x as f32).abs();
+        let abs_val = h_c[i].abs();
+        let rel_err = abs_err / abs_val.max(dot_length * machine_epsilon);
+
+        if rel_err > 1e-6 {
+            println!(
+                "Error at index {}: CPU = {}, GPU = {}, rel_err = {}",
+                i,
+                dims_a.x as f32 * 0.01f32,
+                h_c[i],
+                rel_err
+            );
+            correct = false;
+        }
+    }
+
+    if correct {
+        println!("Result = PASS");
+        println!(
+            "NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled."
+        );
+    } else {
+        println!("Result = FAIL");
+        return Err(cust::error::CudaError::UnknownError);
+    }
+
+    Ok(())
+}
+
+fn main() -> Result<(), cust::error::CudaError> {
+    // Set up the CUDA context and pick the device that will run the kernels.
+    let _ctx = cust::quick_init()?;
+    let device = Device::get_device(0).expect("Couldn't find Cuda supported devices!");
+    println!("Device Name: {}", device.name().unwrap());
+
+    let block_size: u32 = 32;
+    let dims_a = USizeVec2::new(40 * block_size as usize, 40 * block_size as usize);
+    let dims_b = USizeVec2::new(80 * block_size as usize, 40 * block_size as usize);
+
+    if dims_a.x != dims_b.y {
+        panic!("Matrix multiplication not possible with the given dimensions!");
+    }
+
+    matrix_multiply(block_size as usize, dims_a, dims_b)?;
+    Ok(())
+}