Skip to content

Commit 872163a

Browse files
authored
Add tiling_1d_loop (#30)
This is tiling_1d but with loops.
1 parent 3f91b13 commit 872163a

File tree

15 files changed

+352
-0
lines changed

15 files changed

+352
-0
lines changed

Diff for: blog/2024-11-21-optimizing-matrix-mul/code/Cargo.lock

+16
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Diff for: blog/2024-11-21-optimizing-matrix-mul/code/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ members = [
88
"crates/gpu/workgroup_256",
99
"crates/gpu/workgroup_2d",
1010
"crates/gpu/tiling_1d",
11+
"crates/gpu/tiling_1d_loop",
1112
"crates/gpu/tiling_2d_simd",
1213
#
1314
# ---- The rust code that runs both on the GPU and the CPU. ----

Diff for: blog/2024-11-21-optimizing-matrix-mul/code/benches/gpu_bench.rs

+17
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ fn bench_all_variants(c: &mut Criterion) {
3737
let multiplier_workgroup_256 = matmul::workgroup_256::wgpu();
3838
let multiplier_workgroup_2d = matmul::workgroup_2d::wgpu();
3939
let multiplier_tiling_1d = matmul::tiling_1d::wgpu();
40+
let multiplier_tiling_1d_loop = matmul::tiling_1d_loop::wgpu();
4041
let multiplier_tiling_2d_simd = matmul::tiling_2d_simd::wgpu();
4142
let multiplier_isomorphic_gpu = matmul::isomorphic::wgpu();
4243

@@ -108,6 +109,22 @@ fn bench_all_variants(c: &mut Criterion) {
108109
},
109110
);
110111

112+
group.bench_with_input(
113+
BenchmarkId::new("tiling_1d_loop:wgpu", format!("{}x{}x{}", m, k, n)),
114+
&(m, k, n),
115+
|bench, &(m, k, n)| {
116+
bench.iter(|| {
117+
black_box(multiplier_tiling_1d_loop.multiply(
118+
black_box(&a),
119+
black_box(&b),
120+
m,
121+
k,
122+
n,
123+
))
124+
});
125+
},
126+
);
127+
111128
group.bench_with_input(
112129
BenchmarkId::new("tiling_2d_simd:wgpu", format!("{}x{}x{}", m, k, n)),
113130
&(m, k, n),

Diff for: blog/2024-11-21-optimizing-matrix-mul/code/bin/blog/src/bin.rs

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ fn main() {
3131
run_tests(matmul::workgroup_256::wgpu(), &sizes);
3232
run_tests(matmul::workgroup_2d::wgpu(), &sizes);
3333
run_tests(matmul::tiling_1d::wgpu(), &sizes);
34+
run_tests(matmul::tiling_1d_loop::wgpu(), &sizes);
3435
run_tests(matmul::tiling_2d_simd::wgpu(), &sizes);
3536

3637
run_tests(matmul::isomorphic::wgpu(), &sizes);
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
[package]
2+
name = "compiled_tiling_1d_loop"
3+
version = "0.1.0"
4+
edition = "2021"
5+
6+
[lib]
7+
crate-type = ["lib", "cdylib"]
8+
9+
[build-dependencies]
10+
spirv-builder = { git = "https://github.com/rust-gpu/rust-gpu", rev = "0da80f8a61867590a0824873fa45dc8983e49da8" }
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
use spirv_builder::{MetadataPrintout, SpirvBuilder};
2+
use std::env;
3+
use std::fs;
4+
use std::path::{Path, PathBuf};
5+
6+
fn main() -> Result<(), Box<dyn std::error::Error>> {
7+
let gpu_crate_path = Path::new("../../../gpu/tiling_1d_loop");
8+
9+
// Compile the shader crate with SpirvBuilder.
10+
let result = SpirvBuilder::new(gpu_crate_path, "spirv-unknown-vulkan1.2")
11+
.print_metadata(MetadataPrintout::Full)
12+
.build()?;
13+
14+
// Get the compiled shader as a PathBuf and read its binary content.
15+
let shader_path = result.module.unwrap_single();
16+
let shader_binary = fs::read(&shader_path)?;
17+
18+
// Generate Rust code with a constant holding the shader binary content.
19+
let shader_binary_literal = shader_binary
20+
.iter()
21+
.map(|byte| format!("0x{:02X}", byte))
22+
.collect::<Vec<_>>()
23+
.join(", ");
24+
let generated_code = format!(
25+
"/// Compiled SPIR-V shader binary\n\
26+
pub const SHADER_BINARY: &[u8] = &[{}];",
27+
shader_binary_literal
28+
);
29+
30+
// Write this generated code to `OUT_DIR` as `shader_binary.rs`.
31+
let out_dir = PathBuf::from(env::var("OUT_DIR")?);
32+
let shader_binary_rs = out_dir.join("shader_binary.rs");
33+
fs::write(&shader_binary_rs, generated_code)?;
34+
35+
println!("Generated shader binary constant at {:?}", shader_binary_rs);
36+
Ok(())
37+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
// Including the raw bytes generated shader binary in our rust code. This "bloats" the
2+
// binary, but it also means you don't have to worry about the shader file being
3+
// misplaced or deleted.
4+
include!(concat!(env!("OUT_DIR"), "/shader_binary.rs"));

Diff for: blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ compiled_naive = { path = "../compiled_for_gpu/naive" }
2121
compiled_workgroup_256 = { path = "../compiled_for_gpu/workgroup_256" }
2222
compiled_workgroup_2d = { path = "../compiled_for_gpu/workgroup_2d" }
2323
compiled_tiling_1d = { path = "../compiled_for_gpu/tiling_1d" }
24+
compiled_tiling_1d_loop = { path = "../compiled_for_gpu/tiling_1d_loop" }
2425
compiled_tiling_2d_simd = { path = "../compiled_for_gpu/tiling_2d_simd" }
2526
compiled_isomorphic = { path = "../compiled_for_gpu/isomorphic" }
2627
# The CPU side of the isomorphic implementation.

Diff for: blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/lib.rs

+9
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,15 @@ pub mod tiling_1d {
7777
}
7878
}
7979

80+
pub mod tiling_1d_loop {
81+
use super::*;
82+
use crate::backends::wgpu::MatrixMultiplier;
83+
84+
pub fn wgpu() -> MatrixMultiplier<variants::Tiling1dLoop> {
85+
futures::executor::block_on(MatrixMultiplier::new(variants::Tiling1dLoop))
86+
}
87+
}
88+
8089
pub mod tiling_2d_simd {
8190
use super::*;
8291
use crate::backends::wgpu::MatrixMultiplier;

Diff for: blog/2024-11-21-optimizing-matrix-mul/code/crates/cpu/matmul/src/variants.rs

+30
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,36 @@ impl GridComputation for Tiling1d {
123123
}
124124
}
125125

126+
/// Matrix multiplication on the GPU using one-dimensional tiling, with the tiling
/// written as loops.
pub struct Tiling1dLoop;
128+
129+
impl Display for Tiling1dLoop {
130+
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
131+
write!(f, "tiling_1d_loop")
132+
}
133+
}
134+
135+
impl Gpu for Tiling1dLoop {
136+
fn compiled_shader(&self) -> &[u8] {
137+
compiled_tiling_1d_loop::SHADER_BINARY
138+
}
139+
}
140+
141+
impl GridComputation for Tiling1dLoop {
142+
fn workgroup(&self) -> UVec3 {
143+
UVec3::new(16, 16, 1)
144+
}
145+
146+
fn dispatch_count(&self, m: u32, n: u32) -> UVec3 {
147+
let workgroup = self.workgroup();
148+
UVec3::new(
149+
(m + workgroup.x - 1) / workgroup.x,
150+
(n + workgroup.y - 1) / workgroup.y,
151+
1,
152+
)
153+
}
154+
}
155+
126156
/// GPU implementation of matrix multiplication with two-dimensional tiling.
127157
pub struct Tiling2dSimd;
128158

Diff for: blog/2024-11-21-optimizing-matrix-mul/code/crates/gpu/tiling_1d_loop/Cargo.lock

+149
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
[package]
2+
name = "tiling_1d_loop"
3+
version = "0.1.0"
4+
edition = "2021"
5+
6+
[lib]
7+
crate-type = ["dylib", "lib"]
8+
9+
[dependencies]
10+
settings = { path = "../../shared/settings"}
11+
spirv-std.workspace = true

0 commit comments

Comments
 (0)