From 59df2f6c4993913fddefe75b8151c622244666da Mon Sep 17 00:00:00 2001
From: Christian Legnitto
Date: Sat, 23 Nov 2024 13:22:38 -0400
Subject: [PATCH] Make backends return errors

Annoyingly, Results don't cover all errors, so we also need this hacky
error-handler dance. See https://github.com/gfx-rs/wgpu/issues/3767.

---
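Note for reviewers (below the "---" cut line, so `git am` ignores it): the
"error-handler dance" boils down to the pattern sketched here. This is a
minimal, self-contained illustration of what bin.rs does in this patch;
`run_gpu_work` is a hypothetical stand-in for encoding, submitting, and
polling real GPU work.

    use std::sync::{Mutex, OnceLock};

    // The last uncaptured wgpu error, stashed by the callback below.
    static LAST_ERROR: OnceLock<Mutex<Option<wgpu::Error>>> = OnceLock::new();

    fn run_gpu_work(_device: &wgpu::Device) {
        // Hypothetical stand-in: encode commands, submit, poll the device.
    }

    fn demo(device: &wgpu::Device) {
        LAST_ERROR.set(Mutex::new(None)).unwrap();
        // Validation failures and similar errors never come back as Results;
        // wgpu reports them through this device-wide callback instead.
        device.on_uncaptured_error(Box::new(|error| {
            *LAST_ERROR.get().unwrap().lock().unwrap() = Some(error);
        }));
        run_gpu_work(device);
        // Only after the work has run can we check whether the callback fired.
        if let Some(error) = LAST_ERROR.get().unwrap().lock().unwrap().take() {
            eprintln!("wgpu error: {error:?}");
        }
    }

wgpu also offers scoped capture (Device::push_error_scope / pop_error_scope),
which avoids the global state, but it reports through a future rather than a
plain Result; this patch uses the device-wide callback.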
 .../code/Cargo.lock                           |  31 ++-
 .../code/Cargo.toml                           |   1 +
 .../code/benches/gpu_bench.rs                 |  30 +--
 .../code/benches/isomorphic_bench.rs          |  16 +-
 .../code/bin/blog/Cargo.toml                  |   1 +
 .../code/bin/blog/src/bin.rs                  | 196 +++++++++++++-----
 .../code/crates/cpu/matmul/Cargo.toml         |   5 +-
 .../crates/cpu/matmul/src/backends/cpu.rs     | 100 +++++++--
 .../crates/cpu/matmul/src/backends/wgpu.rs    |  27 ++-
 .../code/crates/cpu/matmul/src/lib.rs         |  51 +++--
 10 files changed, 325 insertions(+), 133 deletions(-)

diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/Cargo.lock b/blog/2024-11-21-optimizing-matrix-mul/code/Cargo.lock
index e77b05f..13361d5 100644
--- a/blog/2024-11-21-optimizing-matrix-mul/code/Cargo.lock
+++ b/blog/2024-11-21-optimizing-matrix-mul/code/Cargo.lock
@@ -670,7 +670,7 @@
 checksum = "c151a2a5ef800297b4e79efa4f4bec035c5f51d5ae587287c9b952bdf734cacd"
 dependencies = [
  "log",
  "presser",
- "thiserror",
+ "thiserror 1.0.69",
  "windows",
 ]
@@ -951,6 +951,7 @@ dependencies = [
  "isomorphic",
  "rayon",
  "settings",
+ "thiserror 2.0.3",
  "tracing",
  "wgpu",
 ]
@@ -994,7 +995,7 @@ dependencies = [
  "rustc-hash",
  "spirv",
  "termcolor",
- "thiserror",
+ "thiserror 1.0.69",
  "unicode-xid",
 ]
@@ -1636,7 +1637,16 @@
 version = "1.0.69"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
 dependencies = [
- "thiserror-impl",
+ "thiserror-impl 1.0.69",
+]
+
+[[package]]
+name = "thiserror"
+version = "2.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c006c85c7651b3cf2ada4584faa36773bd07bac24acfb39f3c431b36d7e667aa"
+dependencies = [
+ "thiserror-impl 2.0.3",
 ]
@@ -1650,6 +1660,17 @@ dependencies = [
  "syn 2.0.87",
 ]
 
+[[package]]
+name = "thiserror-impl"
+version = "2.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f077553d607adc1caf65430528a576c757a71ed73944b66ebb58ef2bbd243568"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.87",
+]
+
 [[package]]
 name = "thread_local"
 version = "1.1.8"
@@ -1924,7 +1945,7 @@ dependencies = [
  "raw-window-handle",
  "rustc-hash",
  "smallvec",
- "thiserror",
+ "thiserror 1.0.69",
  "wgpu-hal",
  "wgpu-types",
 ]
@@ -1966,7 +1987,7 @@ dependencies = [
  "renderdoc-sys",
  "rustc-hash",
  "smallvec",
- "thiserror",
+ "thiserror 1.0.69",
  "wasm-bindgen",
  "web-sys",
  "wgpu-types",
 ]
diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/Cargo.toml b/blog/2024-11-21-optimizing-matrix-mul/code/Cargo.toml
index 8f8a0ac..6467c80 100644
--- a/blog/2024-11-21-optimizing-matrix-mul/code/Cargo.toml
+++ b/blog/2024-11-21-optimizing-matrix-mul/code/Cargo.toml
@@ -48,6 +48,7 @@ spirv-std = { git = "https://github.com/rust-gpu/rust-gpu", rev = "0da80f8a61867
 futures = "0.3"
 glam = { version = "0.29.2", features = ["cuda", "bytemuck"] }
 tracing = "0.1.40"
+wgpu = { version = "23.0", features = ["spirv", "vulkan-portability"] }
 
 # Enable incremental by default in release mode.
 [profile.release]
diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/benches/gpu_bench.rs b/blog/2024-11-21-optimizing-matrix-mul/code/benches/gpu_bench.rs
index 1f688f5..62e24d5 100644
--- a/blog/2024-11-21-optimizing-matrix-mul/code/benches/gpu_bench.rs
+++ b/blog/2024-11-21-optimizing-matrix-mul/code/benches/gpu_bench.rs
@@ -29,17 +29,17 @@ const SIZES: &[(u32, u32, u32)] = &[
     (64, 32, 128),      // A: 64x32, B: 32x128, Result: 64x128
     (1024, 512, 2048),  // A: 1024x512, B: 512x2048, Result: 1024x2048
     (2048, 1024, 4096), // A: 2048x1024, B: 1024x4096, Result: 2048x4096
+    */
 ];
 
 fn bench_all_variants(c: &mut Criterion) {
     // Initialize all variants outside the loop
-    let multiplier_naive = matmul::naive::wgpu();
-    let multiplier_workgroup_256 = matmul::workgroup_256::wgpu();
-    let multiplier_workgroup_2d = matmul::workgroup_2d::wgpu();
-    let multiplier_tiling_1d = matmul::tiling_1d::wgpu();
-    let multiplier_tiling_1d_loop = matmul::tiling_1d_loop::wgpu();
-    let multiplier_tiling_2d = matmul::tiling_2d::wgpu();
-    let multiplier_isomorphic_gpu = matmul::isomorphic::wgpu();
+    let multiplier_naive = matmul::naive::wgpu().unwrap();
+    let multiplier_workgroup_256 = matmul::workgroup_256::wgpu().unwrap();
+    let multiplier_workgroup_2d = matmul::workgroup_2d::wgpu().unwrap();
+    let multiplier_tiling_1d = matmul::tiling_1d::wgpu().unwrap();
+    let multiplier_tiling_1d_loop = matmul::tiling_1d_loop::wgpu().unwrap();
+    let multiplier_tiling_2d = matmul::tiling_2d::wgpu().unwrap();
 
     for &(m, k, n) in SIZES {
         // Calculate FLOPs for this size
@@ -134,22 +134,6 @@ fn bench_all_variants(c: &mut Criterion) {
                 });
             },
         );
-
-        group.bench_with_input(
-            BenchmarkId::new("isomorphic:wgpu", format!("{}x{}x{}", m, k, n)),
-            &(m, k, n),
-            |bench, &(m, k, n)| {
-                bench.iter(|| {
-                    black_box(multiplier_isomorphic_gpu.multiply(
-                        black_box(&a),
-                        black_box(&b),
-                        m,
-                        k,
-                        n,
-                    ))
-                });
-            },
-        );
     }
 }
diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/benches/isomorphic_bench.rs b/blog/2024-11-21-optimizing-matrix-mul/code/benches/isomorphic_bench.rs
index 66740ed..399bd85 100644
--- a/blog/2024-11-21-optimizing-matrix-mul/code/benches/isomorphic_bench.rs
+++ b/blog/2024-11-21-optimizing-matrix-mul/code/benches/isomorphic_bench.rs
@@ -6,7 +6,6 @@ use rand::Rng;
 use std::time::Duration;
 
 const WARMUP_TIME: Duration = Duration::from_secs(2);
-const MEASUREMENT_TIME: Duration = Duration::from_secs(5 * 60);
 const SAMPLE_SIZE: usize = 10;
 
 /// Matrix sizes to benchmark
@@ -34,19 +33,18 @@ const SIZES: &[(u32, u32, u32)] = &[
 
 fn bench_isomorphic_variants(c: &mut Criterion) {
     // Initialize isomorphic variants
-    let multiplier_isomorphic_gpu = matmul::isomorphic::wgpu();
-    let multiplier_isomorphic_cpu_single = matmul::isomorphic::cpu::single_threaded();
-    let multiplier_isomorphic_cpu_multi = matmul::isomorphic::cpu::multi_threaded();
+    let multiplier_isomorphic_gpu = matmul::isomorphic::wgpu().unwrap();
+    let multiplier_isomorphic_cpu_single = matmul::isomorphic::cpu::single_threaded().unwrap();
+    let multiplier_isomorphic_cpu_multi = matmul::isomorphic::cpu::multi_threaded().unwrap();
 
     for &(m, k, n) in SIZES {
-        // Calculate FLOPs for this size
-        let flops = 2.0 * (m as f64 * n as f64 * k as f64);
-
-        let mut group = c.benchmark_group(format!("isomorphic_matmul{}x{}x{}", m, k, n));
+        let mut group = c.benchmark_group("isomorphic");
         group.sampling_mode(SamplingMode::Flat);
         group.warm_up_time(WARMUP_TIME);
-        //group.measurement_time(MEASUREMENT_TIME);
         group.sample_size(SAMPLE_SIZE);
+
+        // Calculate FLOPs for this size
+        let flops = 2.0 * (m as f64 * n as f64 * k as f64);
         group.throughput(Throughput::Elements(flops as u64));
 
         // Create matrices for the given size
diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/bin/blog/Cargo.toml b/blog/2024-11-21-optimizing-matrix-mul/code/bin/blog/Cargo.toml
index 0d72391..89ab374 100644
--- a/blog/2024-11-21-optimizing-matrix-mul/code/bin/blog/Cargo.toml
+++ b/blog/2024-11-21-optimizing-matrix-mul/code/bin/blog/Cargo.toml
@@ -10,6 +10,7 @@ path = "src/bin.rs"
 [dependencies]
 matmul = { path = "../../crates/cpu/matmul" }
 settings = { path = "../../crates/shared/settings" }
+wgpu.workspace = true
 futures.workspace = true
 tracing.workspace = true
 tracing-subscriber = { version = "0.3.18", features = ["env-filter", "std"] }
diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/bin/blog/src/bin.rs b/blog/2024-11-21-optimizing-matrix-mul/code/bin/blog/src/bin.rs
index 7f8f78f..f12bcb2 100644
--- a/blog/2024-11-21-optimizing-matrix-mul/code/bin/blog/src/bin.rs
+++ b/blog/2024-11-21-optimizing-matrix-mul/code/bin/blog/src/bin.rs
@@ -1,17 +1,67 @@
 use matmul::MatrixMultiply;
 use std::fmt::Display;
+use std::sync::{Mutex, OnceLock};
 use std::time::Instant;
-use tracing::{debug, info, instrument, span, trace, Level};
+use tracing::{debug, error, info, instrument, span, trace, warn, Level};
 use tracing_subscriber::{fmt, prelude::*, EnvFilter};
+use wgpu::Device;
+
+// Thread-safe global error state for WGPU.
+// See https://github.com/gfx-rs/wgpu/issues/2912
+static WGPU_ERROR_STATE: OnceLock<Mutex<Option<wgpu::Error>>> = OnceLock::new();
+
+/// Initializes the global error state. Should be called once at startup.
+fn init_error_state() {
+    WGPU_ERROR_STATE.set(Mutex::new(None)).unwrap();
+}
+
+/// Sets an error in the global error state.
+fn set_error(error: wgpu::Error) {
+    if let Some(state) = WGPU_ERROR_STATE.get() {
+        let mut state_lock = state.lock().unwrap();
+        *state_lock = Some(error);
+    } else {
+        panic!("Error state not initialized!");
+    }
+}
+
+/// Clears the global error state.
+fn clear_error() {
+    if let Some(state) = WGPU_ERROR_STATE.get() {
+        let mut state_lock = state.lock().unwrap();
+        *state_lock = None;
+    } else {
+        panic!("Error state not initialized!");
+    }
+}
+
+/// Retrieves and clears the last error.
+fn take_error() -> Option<wgpu::Error> {
+    if let Some(state) = WGPU_ERROR_STATE.get() {
+        let mut state_lock = state.lock().unwrap();
+        state_lock.take()
+    } else {
+        panic!("Error state not initialized!");
+    }
+}
+
+/// Installs a global error handler for the given device.
+fn install_error_handler(device: &Device) {
+    device.on_uncaptured_error(Box::new(move |error| {
+        set_error(error);
+    }));
+}
 
 fn main() {
+    // Initialize the error state.
+    init_error_state();
+
     tracing_subscriber::registry()
         .with(fmt::Layer::default())
         .with(EnvFilter::from_default_env())
         .init();
 
     let sizes = [
-        // Square matrices
         (2, 2, 2),
         (4, 4, 4),
         (8, 8, 8),
@@ -19,65 +69,105 @@ fn main() {
         (32, 32, 32),
         (64, 64, 64),
         (128, 128, 128),
-        // Non-square matrices
-        (4, 2, 8),   // A: 4x2, B: 2x8, Result: 4x8
-        (8, 4, 2),   // A: 8x4, B: 4x2, Result: 8x2
-        (16, 8, 32), // A: 16x8, B: 8x32, Result: 16x32
-        (32, 16, 8), // A: 32x16, B: 16x8, Result: 32x8
-        (64, 32, 128), // A: 64x32, B: 32x128, Result: 64x128
+        (256, 256, 256),
+        (512, 512, 512),
+        (1024, 1024, 1024),
+        (2048, 2048, 2048),
     ];
 
-    run_tests(matmul::naive::wgpu(), &sizes);
-    run_tests(matmul::workgroup_256::wgpu(), &sizes);
-    run_tests(matmul::workgroup_2d::wgpu(), &sizes);
-    run_tests(matmul::tiling_1d::wgpu(), &sizes);
-    run_tests(matmul::tiling_1d_loop::wgpu(), &sizes);
-    run_tests(matmul::tiling_2d::wgpu(), &sizes);
+    for size in sizes {
+        let matmul = matmul::naive::wgpu().unwrap();
+        install_error_handler(&matmul.device);
+        run_test(matmul, size);
+        clear_error();
+    }
+
+    for size in sizes {
+        let matmul = matmul::workgroup_256::wgpu().unwrap();
+        install_error_handler(&matmul.device);
+        run_test(matmul, size);
+        clear_error();
+    }
+
+    for size in sizes {
+        let matmul = matmul::workgroup_2d::wgpu().unwrap();
+        install_error_handler(&matmul.device);
+        run_test(matmul, size);
+        clear_error();
+    }
+
+    for size in sizes {
+        let matmul = matmul::tiling_1d::wgpu().unwrap();
+        install_error_handler(&matmul.device);
+        run_test(matmul, size);
+        clear_error();
+    }
 
-    run_tests(matmul::isomorphic::wgpu(), &sizes);
-    run_tests(matmul::isomorphic::cpu::single_threaded(), &sizes);
-    run_tests(matmul::isomorphic::cpu::multi_threaded(), &sizes);
+    for size in sizes {
+        let matmul = matmul::tiling_1d_loop::wgpu().unwrap();
+        install_error_handler(&matmul.device);
+        run_test(matmul, size);
+        clear_error();
+    }
+
+    for size in sizes {
+        let matmul = matmul::tiling_2d::wgpu().unwrap();
+        install_error_handler(&matmul.device);
+        run_test(matmul, size);
+        clear_error();
+    }
 }
 
-#[instrument(skip(multiplier, sizes), fields(algorithm = %multiplier))]
-fn run_tests<T, U: MatrixMultiply<T>>(multiplier: U, sizes: &[(u32, u32, u32)]) {
+#[instrument(skip(multiplier, size), fields(algorithm = %multiplier, size=?size))]
+fn run_test<T, U: MatrixMultiply<T>>(multiplier: U, size: (u32, u32, u32)) {
     debug!(algorithm = %multiplier, "Starting tests");
+    let (m, k, n) = size;
+
+    let span = tracing::span!(Level::DEBUG, "matmul", algorithm = %multiplier, m, k, n);
+    let _enter = span.enter();
 
-    for &(m, k, n) in sizes {
-        let span = tracing::span!(Level::INFO, "matrix_test", algorithm = %multiplier, m, k, n);
-        let _enter = span.enter();
-
-        info!("Testing size: {}x{}x{}", m, k, n);
-
-        // Setup phase
-        let setup_span = span!(Level::INFO, "setup_phase");
-        let _setup_enter = setup_span.enter();
-        let a: Vec<f32> = (0..m * k).map(|i| i as f32).collect();
-        let b: Vec<f32> = (0..k * n).map(|i| i as f32).collect();
-        drop(_setup_enter);
-
-        // Compute phase
-        let compute_span = span!(Level::INFO, "compute_phase");
-        let compute_start = Instant::now();
-        let _compute_enter = compute_span.enter();
-        let result = multiplier.multiply(&a, &b, m, k, n);
-        let compute_time = compute_start.elapsed();
-        drop(_compute_enter);
-
-        // Calculate GFLOPS
-        let gflop_span = span!(Level::INFO, "calculate_gflops");
-        let _gflop_enter = gflop_span.enter();
-        let ops = 2.0 * (m * n * k) as f64;
-        let flops = ops / compute_time.as_secs_f64() / 1e9;
-        info!("Flops: {}", flops);
-        drop(_gflop_enter);
-
-        // Verification phase
-        let verify_span = span!(Level::INFO, "verification_phase");
-        let _verify_enter = verify_span.enter();
-        verify_results(&a, &b, &result, m, k, n);
-        drop(_verify_enter);
+    trace!("Testing size: {}x{}x{}", m, k, n);
+
+    // Setup phase
+    let setup_span = span!(Level::DEBUG, "setup_phase");
+    let _setup_enter = setup_span.enter();
+    let a: Vec<f32> = (0..m * k).map(|i| i as f32).collect();
+    let b: Vec<f32> = (0..k * n).map(|i| i as f32).collect();
+    drop(_setup_enter);
+
+    // Compute phase
+    let compute_span = span!(Level::DEBUG, "compute_phase");
+    let compute_start = Instant::now();
+    let _compute_enter = compute_span.enter();
+    let result = multiplier.multiply(&a, &b, m, k, n);
+    let compute_time = compute_start.elapsed();
+    drop(_compute_enter);
+
+    if let Some(error) = take_error() {
+        warn!("wgpu error occurred: {:?}", error);
+        return;
+    }
+
+    if result.is_err() {
+        error!("Error during computation: {:?}", result);
+        return;
     }
+
+    let result = result.unwrap();
+
+    // Calculate FLOPS
+    let flop_span = span!(Level::DEBUG, "calculate_flops");
+    let _flop_enter = flop_span.enter();
+    let ops = 2.0 * (m * n * k) as f64;
+    let flops = ops / compute_time.as_secs_f64() / 1e9;
+    info!("Flops: {}", flops);
+    drop(_flop_enter);
+
+    // Verification phase
+    let verify_span = span!(Level::DEBUG, "verification_phase");
+    let _verify_enter = verify_span.enter();
+    verify_results(&a, &b, &result, m, k, n);
+    drop(_verify_enter);
 }
 
 #[instrument(skip(a, b, result), fields(rows = m, cols = n))]
diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/Cargo.toml b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/Cargo.toml
index e4da59d..5a7d8db 100644
--- a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/Cargo.toml
+++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/Cargo.toml
@@ -9,12 +9,12 @@ crate-type = ["lib", "cdylib"]
 [dependencies]
 settings = { path = "../../shared/settings" }
 bytemuck = { version = "1.9", features = ["derive"] }
-wgpu = { version = "23.0", features = ["spirv"] }
 ash = { version = "0.37" }
 rayon = "1.10"
 futures.workspace = true
-tracing.workspace = true
 glam.workspace = true
+tracing.workspace = true
+wgpu.workspace = true
 
 # The following dependencies are used to link to the compiled shaders.
 compiled_naive = { path = "../compiled_for_gpu/naive" }
@@ -26,3 +26,4 @@ compiled_tiling_2d = { path = "../compiled_for_gpu/tiling_2d" }
 compiled_isomorphic = { path = "../compiled_for_gpu/isomorphic" }
 
 # The CPU side of the isomorphic implementation.
 isomorphic = { path = "../../shared/isomorphic" }
+thiserror = "2.0.3"
diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/backends/cpu.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/backends/cpu.rs
index ad0d66d..459cafd 100644
--- a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/backends/cpu.rs
+++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/backends/cpu.rs
@@ -1,4 +1,4 @@
-use crate::{Cpu, GridComputation, MatrixMultiply};
+use crate::{Cpu, GridComputation, MatrixMultiply, MatrixMultiplyError};
 use glam::UVec3;
 use rayon::prelude::*;
 use settings::Dimensions;
@@ -23,11 +23,18 @@ impl<T> MatrixMultiply<T> for SingleThreadedMatMul<T>
 where
     T: Cpu + GridComputation + Display + Send + Sync,
 {
-    fn new(variant: T) -> impl Future<Output = Self> + Send {
-        async move { SingleThreadedMatMul { variant } }
+    fn new(variant: T) -> impl Future<Output = Result<Self, MatrixMultiplyError>> + Send {
+        async move { Ok(SingleThreadedMatMul { variant }) }
     }
 
-    fn multiply(&self, a: &[f32], b: &[f32], m: u32, k: u32, n: u32) -> Vec<f32> {
+    fn multiply(
+        &self,
+        a: &[f32],
+        b: &[f32],
+        m: u32,
+        k: u32,
+        n: u32,
+    ) -> Result<Vec<f32>, MatrixMultiplyError> {
         // Initialize the result vector with zeros as that is what the GPU does.
         let mut result = vec![0.0; (m * n) as usize];
 
@@ -68,7 +75,7 @@ where
             }
         }
 
-        result
+        Ok(result)
     }
 }
 
@@ -87,11 +94,18 @@ impl<T> MatrixMultiply<T> for MultiThreadedMatMul<T>
 where
     T: Cpu + GridComputation + Display + Send + Sync,
 {
-    fn new(variant: T) -> impl Future<Output = Self> + Send {
-        async move { MultiThreadedMatMul { variant } }
+    fn new(variant: T) -> impl Future<Output = Result<Self, MatrixMultiplyError>> + Send {
+        async move { Ok(MultiThreadedMatMul { variant }) }
     }
 
-    fn multiply(&self, a: &[f32], b: &[f32], m: u32, k: u32, n: u32) -> Vec<f32> {
+    fn multiply(
+        &self,
+        a: &[f32],
+        b: &[f32],
+        m: u32,
+        k: u32,
+        n: u32,
+    ) -> Result<Vec<f32>, MatrixMultiplyError> {
         // Initialize the result vector with zeros
         let result = vec![0.0; (m * n) as usize];
         let result = Mutex::new(result);
@@ -123,12 +137,14 @@ where
             .collect();
 
         // Process each (x, y) pair in parallel
-        tasks.par_iter().for_each(|&(x, y)| {
+        tasks.par_iter().try_for_each(|&(x, y)| {
            // Define global_id (adjust z if necessary)
            let global_id = UVec3::new(x as u32, y as u32, 0); // Changed z to 0 for consistency

            // Lock the mutex to get mutable access to the result vector
-            let mut result_lock = result.lock().unwrap();
+            let mut result_lock = result
+                .lock()
+                .map_err(|_| MatrixMultiplyError::CpuLockError)?;

            // Perform the matmul operation for element (x, y)
            <T as Cpu>::call(
@@ -139,18 +155,21 @@ where
                 &b,
                 &mut result_lock,
             );
-        });
+
+            Ok(())
+        })?;
 
         // Extract the result vector from the Mutex
-        let result = Mutex::into_inner(result).unwrap();
+        let result = Mutex::into_inner(result).map_err(|_| MatrixMultiplyError::CpuLockError)?;
 
-        result
+        Ok(result)
     }
 }
 
 #[cfg(test)]
 mod tests {
     use super::*;
+    use futures::executor::block_on;
 
     #[test]
     fn test_single_threaded_matmul_2x1x1() {
@@ -164,9 +183,12 @@ mod tests {
         let expected = vec![3.0, 6.0];
 
         let variant = crate::variants::Isomorphic;
-        let matrix_multiplier = futures::executor::block_on(SingleThreadedMatMul::new(variant));
+        let matrix_multiplier =
+            block_on(SingleThreadedMatMul::new(variant)).expect("Failed to create");
 
-        let result = matrix_multiplier.multiply(&a, &b, m, k, n);
+        let result = matrix_multiplier
+            .multiply(&a, &b, m, k, n)
+            .expect("Matrix multiplication failed");
 
         assert_eq!(result, expected);
     }
@@ -195,9 +217,12 @@ mod tests {
         ];
 
         let variant = crate::variants::Isomorphic;
-        let matrix_multiplier = futures::executor::block_on(SingleThreadedMatMul::new(variant));
+        let matrix_multiplier =
+            block_on(SingleThreadedMatMul::new(variant)).expect("Failed to create");
 
-        let result = matrix_multiplier.multiply(&a, &b, m, k, n);
+        let result = matrix_multiplier
+            .multiply(&a, &b, m, k, n)
+            .expect("Matrix multiplication failed");
 
         assert_eq!(result, expected);
     }
@@ -214,9 +239,46 @@ mod tests {
         let expected = vec![3.0, 6.0];
 
         let variant = crate::variants::Isomorphic;
-        let matrix_multiplier = futures::executor::block_on(MultiThreadedMatMul::new(variant));
+        let matrix_multiplier =
+            block_on(MultiThreadedMatMul::new(variant)).expect("Failed to create");
+
+        let result = matrix_multiplier
+            .multiply(&a, &b, m, k, n)
+            .expect("Matrix multiplication failed");
+
+        assert_eq!(result, expected);
+    }
+
+    #[test]
+    fn test_multithreaded_matmul_4x4() {
+        let m = 4;
+        let k = 4;
+        let n = 4;
+
+        // Define matrix `a` (4x4) in row-major order
+        let a = vec![
+            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+        ];
+
+        // Define matrix `b` (4x4) in row-major order
+        let b = vec![
+            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
+            31.0, 32.0,
+        ];
+
+        // Expected result (4x4) after multiplying `a` and `b`
+        let expected = vec![
+            250.0, 260.0, 270.0, 280.0, 618.0, 644.0, 670.0, 696.0, 986.0, 1028.0, 1070.0, 1112.0,
+            1354.0, 1412.0, 1470.0, 1528.0,
+        ];
+
+        let variant = crate::variants::Isomorphic;
+        let matrix_multiplier =
+            block_on(MultiThreadedMatMul::new(variant)).expect("Failed to create");
 
-        let result = matrix_multiplier.multiply(&a, &b, m, k, n);
+        let result = matrix_multiplier
+            .multiply(&a, &b, m, k, n)
+            .expect("Matrix multiplication failed");
 
         assert_eq!(result, expected);
     }
diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/backends/wgpu.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/backends/wgpu.rs
index 990dc24..9c5dd70 100644
--- a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/backends/wgpu.rs
+++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/backends/wgpu.rs
@@ -1,4 +1,4 @@
-use crate::{Gpu, GridComputation, MatrixMultiply};
+use crate::{Gpu, GridComputation, MatrixMultiply, MatrixMultiplyError};
 use bytemuck;
 use futures::channel::oneshot;
 use futures::executor::block_on;
@@ -11,7 +11,7 @@ use wgpu::{self, util::DeviceExt};
 
 /// Matrix multiplication on the GPU using `wgpu`.
 pub struct MatrixMultiplier<T> {
-    device: wgpu::Device,
+    pub device: wgpu::Device,
     queue: wgpu::Queue,
     pipeline: wgpu::ComputePipeline,
     bind_group_layout: wgpu::BindGroupLayout,
@@ -29,7 +29,7 @@ where
     T: Gpu + GridComputation + Display + Send,
 {
     /// Initializes a new `MatrixMultiplier` with necessary GPU resources.
-    async fn new(variant: T) -> Self {
+    async fn new(variant: T) -> Result<Self, MatrixMultiplyError> {
         // Set up WGPU to talk to the system's GPUs and manage rendering or compute tasks.
         let instance = create_instance().await;
 
@@ -51,20 +51,27 @@ where
         // Build the actual GPU pipeline to run the GPU program and manage execution.
         let pipeline = create_compute_pipeline(&device, &pipeline_layout, &shader);
 
-        Self {
+        Ok(Self {
             device,
             queue,
             pipeline,
             bind_group_layout,
             variant,
-        }
+        })
     }
 
     /// Executes matrix multiplication for given input matrices.
     ///
     /// Uploads the input matrices to the GPU, dispatches the compute shader,
     /// and retrieves the result.
-    fn multiply(&self, a: &[f32], b: &[f32], m: u32, k: u32, n: u32) -> Vec<f32> {
+    fn multiply(
+        &self,
+        a: &[f32],
+        b: &[f32],
+        m: u32,
+        k: u32,
+        n: u32,
+    ) -> Result<Vec<f32>, MatrixMultiplyError> {
         trace!(?a, ?b, "Starting matrix multiplication");
 
         let result_size = (m * n * std::mem::size_of::<f32>() as u32) as u64;
 
@@ -100,7 +107,7 @@ where
         //
         // This is a `uniform` buffer instead of `storage` buffer because the data is
         // the same for all workgroups, it is read-only, and it is small enough to fit
-        // in a single buffer (`uniform` buffers are limited to to 64 KB on most GPUs
+        // in a single buffer (`uniform` buffers are limited to 64 KB on most GPUs
         // and often less on older GPUs).
         let dimensions = Dimensions::new(m, k, n);
         let dimensions_buffer = create_buffer_init(
@@ -163,8 +170,8 @@ where
 
         // Wait for the mapping to complete and verify success.
         block_on(receiver)
-            .expect("Failed to receive data")
-            .expect("Map async failed");
+            .map_err(|_| MatrixMultiplyError::GpuDataReceive)?
+            .map_err(|_| MatrixMultiplyError::GpuBufferMapping)?;
 
         // Read and convert the result data into a typed vector instead of raw bytes.
         let data = slice.get_mapped_range();
@@ -173,7 +180,7 @@ where
         staging_buffer.unmap();
 
         trace!(?result, "Matrix multiplication result");
-        result
+        Ok(result)
     }
 }
diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/lib.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/lib.rs
index 7dde951..9f90d44 100644
--- a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/lib.rs
+++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/lib.rs
@@ -4,14 +4,39 @@ use glam::UVec3;
 use settings::Dimensions;
 use std::fmt::Display;
 use std::future::Future;
+use thiserror::Error;
 
 mod backends;
 pub mod variants;
 
+/// Errors that can happen for matrix multiply on the CPU or GPU.
+#[derive(Error, Debug)]
+pub enum MatrixMultiplyError {
+    #[error("Failed to initialize GPU instance")]
+    GpuInstanceCreation,
+    #[error("Failed to find an appropriate GPU adapter")]
+    GpuAdapterRequest,
+    #[error("Failed to create GPU device and queue")]
+    GpuDeviceCreation,
+    #[error("Failed to receive data from the GPU")]
+    GpuDataReceive,
+    #[error("Mapping GPU buffer failed")]
+    GpuBufferMapping,
+    #[error("Failed to acquire a lock on the result vector")]
+    CpuLockError,
+}
+
 /// The trait that defines how to multiply two matrices.
-pub trait MatrixMultiply<T>: Display {
-    fn new(variant: T) -> impl Future<Output = Self> + Send;
-    fn multiply(&self, a: &[f32], b: &[f32], m: u32, k: u32, n: u32) -> Vec<f32>;
+pub trait MatrixMultiply<T>: Display + Sized {
+    fn new(variant: T) -> impl Future<Output = Result<Self, MatrixMultiplyError>> + Send;
+    fn multiply(
+        &self,
+        a: &[f32],
+        b: &[f32],
+        m: u32,
+        k: u32,
+        n: u32,
+    ) -> Result<Vec<f32>, MatrixMultiplyError>;
 }
 
 /// Matrix multiplication logic that can be run on the CPU.
@@ -44,7 +69,7 @@ pub mod naive {
     use super::*;
     use crate::backends::wgpu::MatrixMultiplier;
 
-    pub fn wgpu() -> MatrixMultiplier<variants::Naive> {
+    pub fn wgpu() -> Result<MatrixMultiplier<variants::Naive>, MatrixMultiplyError> {
         futures::executor::block_on(backends::wgpu::MatrixMultiplier::new(variants::Naive))
     }
 }
@@ -53,7 +78,7 @@ pub mod workgroup_256 {
     use super::*;
     use crate::backends::wgpu::MatrixMultiplier;
 
-    pub fn wgpu() -> MatrixMultiplier<variants::Workgroup256> {
+    pub fn wgpu() -> Result<MatrixMultiplier<variants::Workgroup256>, MatrixMultiplyError> {
         futures::executor::block_on(backends::wgpu::MatrixMultiplier::new(
             variants::Workgroup256,
         ))
@@ -64,7 +89,7 @@ pub mod workgroup_2d {
     use super::*;
     use crate::backends::wgpu::MatrixMultiplier;
 
-    pub fn wgpu() -> MatrixMultiplier<variants::Workgroup2d> {
+    pub fn wgpu() -> Result<MatrixMultiplier<variants::Workgroup2d>, MatrixMultiplyError> {
         futures::executor::block_on(MatrixMultiplier::new(variants::Workgroup2d))
     }
 }
@@ -73,7 +98,7 @@ pub mod tiling_1d {
     use super::*;
     use crate::backends::wgpu::MatrixMultiplier;
 
-    pub fn wgpu() -> MatrixMultiplier<variants::Tiling1d> {
+    pub fn wgpu() -> Result<MatrixMultiplier<variants::Tiling1d>, MatrixMultiplyError> {
         futures::executor::block_on(MatrixMultiplier::new(variants::Tiling1d))
     }
 }
@@ -82,7 +107,7 @@ pub mod tiling_1d_loop {
     use super::*;
     use crate::backends::wgpu::MatrixMultiplier;
 
-    pub fn wgpu() -> MatrixMultiplier<variants::Tiling1dLoop> {
+    pub fn wgpu() -> Result<MatrixMultiplier<variants::Tiling1dLoop>, MatrixMultiplyError> {
         futures::executor::block_on(MatrixMultiplier::new(variants::Tiling1dLoop))
     }
 }
@@ -91,7 +116,7 @@ pub mod tiling_2d {
     use super::*;
     use crate::backends::wgpu::MatrixMultiplier;
 
-    pub fn wgpu() -> MatrixMultiplier<variants::Tiling2d> {
+    pub fn wgpu() -> Result<MatrixMultiplier<variants::Tiling2d>, MatrixMultiplyError> {
         futures::executor::block_on(MatrixMultiplier::new(variants::Tiling2d))
     }
 }
@@ -100,7 +125,7 @@ pub mod isomorphic {
     use super::*;
     use crate::backends::wgpu::MatrixMultiplier;
 
-    pub fn wgpu() -> MatrixMultiplier<variants::Isomorphic> {
+    pub fn wgpu() -> Result<MatrixMultiplier<variants::Isomorphic>, MatrixMultiplyError> {
         futures::executor::block_on(MatrixMultiplier::new(variants::Isomorphic))
     }
 
@@ -108,11 +133,13 @@ pub mod isomorphic {
         use super::*;
         use crate::backends::cpu::{MultiThreadedMatMul, SingleThreadedMatMul};
 
-        pub fn single_threaded() -> SingleThreadedMatMul<variants::Isomorphic> {
+        pub fn single_threaded(
+        ) -> Result<SingleThreadedMatMul<variants::Isomorphic>, MatrixMultiplyError> {
             futures::executor::block_on(SingleThreadedMatMul::new(variants::Isomorphic))
         }
 
-        pub fn multi_threaded() -> MultiThreadedMatMul<variants::Isomorphic> {
+        pub fn multi_threaded(
+        ) -> Result<MultiThreadedMatMul<variants::Isomorphic>, MatrixMultiplyError> {
             futures::executor::block_on(MultiThreadedMatMul::new(variants::Isomorphic))
         }
     }
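
For illustration only (not part of the commit): with these changes, a caller
can propagate every Result-shaped failure with `?` instead of unwrapping. A
minimal sketch of a hypothetical caller -- note that uncaptured validation
errors still arrive through the device callback shown in bin.rs, not through
these Results:

    use matmul::MatrixMultiply;

    fn main() -> Result<(), matmul::MatrixMultiplyError> {
        // Construction can now fail (no adapter, device creation, ...) without panicking.
        let multiplier = matmul::naive::wgpu()?;

        // Multiply two 2x2 row-major matrices; multiply() surfaces readback failures.
        let a = vec![1.0, 2.0, 3.0, 4.0];
        let b = vec![5.0, 6.0, 7.0, 8.0];
        let result = multiplier.multiply(&a, &b, 2, 2, 2)?;

        assert_eq!(result, vec![19.0, 22.0, 43.0, 50.0]);
        Ok(())
    }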