diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/Cargo.lock b/blog/2024-11-21-optimizing-matrix-mul/code/Cargo.lock index af27d11..c42f6f1 100644 --- a/blog/2024-11-21-optimizing-matrix-mul/code/Cargo.lock +++ b/blog/2024-11-21-optimizing-matrix-mul/code/Cargo.lock @@ -295,6 +295,13 @@ dependencies = [ "spirv-builder", ] +[[package]] +name = "compiled_tiling_1d_loop" +version = "0.1.0" +dependencies = [ + "spirv-builder", +] + [[package]] name = "compiled_tiling_2d_simd" version = "0.1.0" @@ -934,6 +941,7 @@ dependencies = [ "compiled_isomorphic", "compiled_naive", "compiled_tiling_1d", + "compiled_tiling_1d_loop", "compiled_tiling_2d_simd", "compiled_workgroup_256", "compiled_workgroup_2d", @@ -1659,6 +1667,14 @@ dependencies = [ "spirv-std", ] +[[package]] +name = "tiling_1d_loop" +version = "0.1.0" +dependencies = [ + "settings", + "spirv-std", +] + [[package]] name = "tiling_2d_simd" version = "0.1.0" diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/Cargo.toml b/blog/2024-11-21-optimizing-matrix-mul/code/Cargo.toml index 115f79b..f519f31 100644 --- a/blog/2024-11-21-optimizing-matrix-mul/code/Cargo.toml +++ b/blog/2024-11-21-optimizing-matrix-mul/code/Cargo.toml @@ -8,6 +8,7 @@ members = [ "crates/gpu/workgroup_256", "crates/gpu/workgroup_2d", "crates/gpu/tiling_1d", + "crates/gpu/tiling_1d_loop", "crates/gpu/tiling_2d_simd", # # ---- The rust code that runs both on the GPU and the CPU. ---- diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/benches/gpu_bench.rs b/blog/2024-11-21-optimizing-matrix-mul/code/benches/gpu_bench.rs index 31f1998..b109f1c 100644 --- a/blog/2024-11-21-optimizing-matrix-mul/code/benches/gpu_bench.rs +++ b/blog/2024-11-21-optimizing-matrix-mul/code/benches/gpu_bench.rs @@ -37,6 +37,7 @@ fn bench_all_variants(c: &mut Criterion) { let multiplier_workgroup_256 = matmul::workgroup_256::wgpu(); let multiplier_workgroup_2d = matmul::workgroup_2d::wgpu(); let multiplier_tiling_1d = matmul::tiling_1d::wgpu(); + let multiplier_tiling_1d_loop = matmul::tiling_1d_loop::wgpu(); let multiplier_tiling_2d_simd = matmul::tiling_2d_simd::wgpu(); let multiplier_isomorphic_gpu = matmul::isomorphic::wgpu(); @@ -108,6 +109,22 @@ fn bench_all_variants(c: &mut Criterion) { }, ); + group.bench_with_input( + BenchmarkId::new("tiling_1d_loop:wgpu", format!("{}x{}x{}", m, k, n)), + &(m, k, n), + |bench, &(m, k, n)| { + bench.iter(|| { + black_box(multiplier_tiling_1d_loop.multiply( + black_box(&a), + black_box(&b), + m, + k, + n, + )) + }); + }, + ); + group.bench_with_input( BenchmarkId::new("tiling_2d_simd:wgpu", format!("{}x{}x{}", m, k, n)), &(m, k, n), diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/bin/blog/src/bin.rs b/blog/2024-11-21-optimizing-matrix-mul/code/bin/blog/src/bin.rs index 3aef2b8..862bf9f 100644 --- a/blog/2024-11-21-optimizing-matrix-mul/code/bin/blog/src/bin.rs +++ b/blog/2024-11-21-optimizing-matrix-mul/code/bin/blog/src/bin.rs @@ -31,6 +31,7 @@ fn main() { run_tests(matmul::workgroup_256::wgpu(), &sizes); run_tests(matmul::workgroup_2d::wgpu(), &sizes); run_tests(matmul::tiling_1d::wgpu(), &sizes); + run_tests(matmul::tiling_1d_loop::wgpu(), &sizes); run_tests(matmul::tiling_2d_simd::wgpu(), &sizes); run_tests(matmul::isomorphic::wgpu(), &sizes); diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/tiling_1d_loop/Cargo.toml b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/tiling_1d_loop/Cargo.toml new file mode 100644 index 0000000..1c97b21 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/tiling_1d_loop/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "compiled_tiling_1d_loop" +version = "0.1.0" +edition = "2021" + +[lib] +crate-type = ["lib", "cdylib"] + +[build-dependencies] +spirv-builder = { git = "https://github.com/rust-gpu/rust-gpu", rev = "0da80f8a61867590a0824873fa45dc8983e49da8" } diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/tiling_1d_loop/build.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/tiling_1d_loop/build.rs new file mode 100644 index 0000000..db5a980 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/tiling_1d_loop/build.rs @@ -0,0 +1,37 @@ +use spirv_builder::{MetadataPrintout, SpirvBuilder}; +use std::env; +use std::fs; +use std::path::{Path, PathBuf}; + +fn main() -> Result<(), Box> { + let gpu_crate_path = Path::new("../../../gpu/tiling_1d_loop"); + + // Compile the shader crate with SpirvBuilder. + let result = SpirvBuilder::new(gpu_crate_path, "spirv-unknown-vulkan1.2") + .print_metadata(MetadataPrintout::Full) + .build()?; + + // Get the compiled shader as a PathBuf and read its binary content. + let shader_path = result.module.unwrap_single(); + let shader_binary = fs::read(&shader_path)?; + + // Generate Rust code with a constant holding the shader binary content. + let shader_binary_literal = shader_binary + .iter() + .map(|byte| format!("0x{:02X}", byte)) + .collect::>() + .join(", "); + let generated_code = format!( + "/// Compiled SPIR-V shader binary\n\ + pub const SHADER_BINARY: &[u8] = &[{}];", + shader_binary_literal + ); + + // Write this generated code to `OUT_DIR` as `shader_binary.rs`. + let out_dir = PathBuf::from(env::var("OUT_DIR")?); + let shader_binary_rs = out_dir.join("shader_binary.rs"); + fs::write(&shader_binary_rs, generated_code)?; + + println!("Generated shader binary constant at {:?}", shader_binary_rs); + Ok(()) +} diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/tiling_1d_loop/src/lib.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/tiling_1d_loop/src/lib.rs new file mode 100644 index 0000000..c4ca963 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/compiled_for_gpu/tiling_1d_loop/src/lib.rs @@ -0,0 +1,4 @@ +// Including the raw bytes generated shader binary in our rust code. This "bloats" the +// binary, but it also means you don't have to worry about the shader file being +// misplaced or deleted. +include!(concat!(env!("OUT_DIR"), "/shader_binary.rs")); diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/Cargo.toml b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/Cargo.toml index be36071..ef79b6e 100644 --- a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/Cargo.toml +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/Cargo.toml @@ -21,6 +21,7 @@ compiled_naive = { path = "../compiled_for_gpu/naive" } compiled_workgroup_256 = { path = "../compiled_for_gpu/workgroup_256" } compiled_workgroup_2d = { path = "../compiled_for_gpu/workgroup_2d" } compiled_tiling_1d = { path = "../compiled_for_gpu/tiling_1d" } +compiled_tiling_1d_loop = { path = "../compiled_for_gpu/tiling_1d_loop" } compiled_tiling_2d_simd = { path = "../compiled_for_gpu/tiling_2d_simd" } compiled_isomorphic = { path = "../compiled_for_gpu/isomorphic" } # The CPU side of the isomophic implementation. diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/lib.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/lib.rs index 19de411..8018479 100644 --- a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/lib.rs +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/lib.rs @@ -77,6 +77,15 @@ pub mod tiling_1d { } } +pub mod tiling_1d_loop { + use super::*; + use crate::backends::wgpu::MatrixMultiplier; + + pub fn wgpu() -> MatrixMultiplier { + futures::executor::block_on(MatrixMultiplier::new(variants::Tiling1dLoop)) + } +} + pub mod tiling_2d_simd { use super::*; use crate::backends::wgpu::MatrixMultiplier; diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/variants.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/variants.rs index 79058ab..a2b0c33 100644 --- a/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/variants.rs +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/variants.rs @@ -123,6 +123,36 @@ impl GridComputation for Tiling1d { } } +/// GPU implementation of matrix multiplication with one-dimensional tiling (using loops). +pub struct Tiling1dLoop; + +impl Display for Tiling1dLoop { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "tiling_1d_loop") + } +} + +impl Gpu for Tiling1dLoop { + fn compiled_shader(&self) -> &[u8] { + compiled_tiling_1d_loop::SHADER_BINARY + } +} + +impl GridComputation for Tiling1dLoop { + fn workgroup(&self) -> UVec3 { + UVec3::new(16, 16, 1) + } + + fn dispatch_count(&self, m: u32, n: u32) -> UVec3 { + let workgroup = self.workgroup(); + UVec3::new( + (m + workgroup.x - 1) / workgroup.x, + (n + workgroup.y - 1) / workgroup.y, + 1, + ) + } +} + /// GPU implementation of matrix multiplication with two-dimensional tiling. pub struct Tiling2dSimd; diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/tiling_1d_loop/Cargo.lock b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/tiling_1d_loop/Cargo.lock new file mode 100644 index 0000000..77f6ff7 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/tiling_1d_loop/Cargo.lock @@ -0,0 +1,149 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bytemuck" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8334215b81e418a0a7bdb8ef0849474f40bb10c8b71f1c4ed315cff49f32494d" +dependencies = [ + "bytemuck_derive", +] + +[[package]] +name = "bytemuck_derive" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcfcc3cd946cb52f0bbfdbbcfa2f4e24f75ebb6c0e1002f7c25904fada18b9ec" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "glam" +version = "0.29.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc46dd3ec48fdd8e693a98d2b8bafae273a2d54c1de02a2a7e3d57d501f39677" +dependencies = [ + "libm", +] + +[[package]] +name = "gpu" +version = "0.1.0" +dependencies = [ + "shared", + "spirv-std", +] + +[[package]] +name = "libm" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa" + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "proc-macro2" +version = "1.0.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "shared" +version = "0.1.0" +dependencies = [ + "bytemuck", +] + +[[package]] +name = "spirv-std" +version = "0.9.0" +source = "git+https://github.com/rust-gpu/rust-gpu?rev=0da80f8a61867590a0824873fa45dc8983e49da8#0da80f8a61867590a0824873fa45dc8983e49da8" +dependencies = [ + "bitflags", + "glam", + "num-traits", + "spirv-std-macros", + "spirv-std-types", +] + +[[package]] +name = "spirv-std-macros" +version = "0.9.0" +source = "git+https://github.com/rust-gpu/rust-gpu?rev=0da80f8a61867590a0824873fa45dc8983e49da8#0da80f8a61867590a0824873fa45dc8983e49da8" +dependencies = [ + "proc-macro2", + "quote", + "spirv-std-types", + "syn 1.0.109", +] + +[[package]] +name = "spirv-std-types" +version = "0.9.0" +source = "git+https://github.com/rust-gpu/rust-gpu?rev=0da80f8a61867590a0824873fa45dc8983e49da8#0da80f8a61867590a0824873fa45dc8983e49da8" + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/tiling_1d_loop/Cargo.toml b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/tiling_1d_loop/Cargo.toml new file mode 100644 index 0000000..3af9294 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/tiling_1d_loop/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "tiling_1d_loop" +version = "0.1.0" +edition = "2021" + +[lib] +crate-type = ["dylib", "lib"] + +[dependencies] +settings = { path = "../../shared/settings"} +spirv-std.workspace = true diff --git a/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/tiling_1d_loop/src/lib.rs b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/tiling_1d_loop/src/lib.rs new file mode 100644 index 0000000..e5b6a5a --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/tiling_1d_loop/src/lib.rs @@ -0,0 +1,43 @@ +#![no_std] + +use settings::Dimensions; +use settings::TILE_SIZE; +use spirv_std::glam::UVec3; +use spirv_std::spirv; + +#[spirv(compute(threads(16, 16)))] +pub fn matmul( + #[spirv(global_invocation_id)] global_id: UVec3, + #[spirv(uniform, descriptor_set = 0, binding = 0)] dimensions: &Dimensions, + #[spirv(storage_buffer, descriptor_set = 0, binding = 1)] a: &[f32], + #[spirv(storage_buffer, descriptor_set = 0, binding = 2)] b: &[f32], + #[spirv(storage_buffer, descriptor_set = 0, binding = 3)] result: &mut [f32], +) { + let row = global_id.y as usize; + let col = (global_id.x * TILE_SIZE) as usize; + + if row >= dimensions.m as usize || col >= dimensions.n as usize { + return; + } + + // Compute sums for each offset directly + let mut sums = [0.0; TILE_SIZE as usize]; + + for i in 0..dimensions.k as usize { + let a_elem = a[row * dimensions.k as usize + i]; + + for offset in 0..TILE_SIZE as usize { + if col + offset < dimensions.n as usize { + let b_elem = b[i * dimensions.n as usize + col + offset]; + sums[offset] += a_elem * b_elem; + } + } + } + + // Write results back + for offset in 0..TILE_SIZE as usize { + if col + offset < dimensions.n as usize { + result[row * dimensions.n as usize + col + offset] = sums[offset]; + } + } +} diff --git a/blog/2024-11-21-optimizing-matrix-mul/index.md b/blog/2024-11-21-optimizing-matrix-mul/index.md index 1444a9f..577c750 100644 --- a/blog/2024-11-21-optimizing-matrix-mul/index.md +++ b/blog/2024-11-21-optimizing-matrix-mul/index.md @@ -278,6 +278,15 @@ The kernel looks roughly the same as before except we've unrolled the computatio are calculating `TILE_SIZE` results per thread. We also need some error checking for when our matrices don't fit nicely. +But this code is kinda gross...it looks like the opaque GPU code we are used to. Let's +make it nice! + +import { RustTiling1dLoop } from './snippets/tiling_1d_loop.tsx'; + + + +Much better. + We can take this a step further and calculate 2D results per thread! Instead of calculating 4 elements per single row, we can calculate 4 elements for 4 rows (e.g. a 2D tile). diff --git a/blog/2024-11-21-optimizing-matrix-mul/snippets/tiling_1d_loop.tsx b/blog/2024-11-21-optimizing-matrix-mul/snippets/tiling_1d_loop.tsx new file mode 100644 index 0000000..3b669e0 --- /dev/null +++ b/blog/2024-11-21-optimizing-matrix-mul/snippets/tiling_1d_loop.tsx @@ -0,0 +1,14 @@ +import React from "react"; +import CodeBlock from "@theme/CodeBlock"; +import Snippet from "@site/src/components/Snippet"; +import RustKernelSource from "!!raw-loader!../code/crates/gpu/tiling_1d_loop/src/lib.rs"; + +export const RustTiling1dLoop: React.FC = () => ( + + {RustKernelSource} + +);