Skip to content

Commit

Permalink
gpu: Make LightTexture::update_scatter() work in chunks instead of …
Browse files Browse the repository at this point in the history
…single texels.

This is a workaround for <gfx-rs/wgpu#6827>.
However, it should significantly improve throughput too; copying single
texels at a time seems to be quite expensive relative to the cost of
copying additional texels.
  • Loading branch information
kpreid committed Dec 28, 2024
1 parent 4b09104 commit cc1c9c4
Show file tree
Hide file tree
Showing 5 changed files with 137 additions and 39 deletions.
6 changes: 4 additions & 2 deletions all-is-cubes-gpu/benches/wgpu.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ use all_is_cubes_render::camera::{GraphicsOptions, StandardCameras, Viewport};
use all_is_cubes_render::Flaws;
use all_is_cubes_render::HeadlessRenderer;

use all_is_cubes_gpu::in_wgpu::{headless, init, LightTexture};
use all_is_cubes_gpu::in_wgpu::{headless, init, LightChunk, LightTexture};

fn main() {
let runtime = tokio::runtime::Builder::new_multi_thread().build().unwrap();
Expand Down Expand Up @@ -168,11 +168,13 @@ fn light_benches(runtime: &Runtime, c: &mut Criterion, instance: &wgpu::Instance
LightTexture::new("lt", &device, bounds.size(), wgpu::TextureUsages::empty());
let space = Space::builder(bounds).build();

let updates = LightChunk::all_in_region(bounds);

// update_scatter() will do nothing if not mapped first
texture.ensure_mapped(&queue, &space, bounds);

b.iter_with_large_drop(|| {
texture.update_scatter(&device, &queue, &space, space.bounds().interior_iter());
texture.update_scatter(&device, &queue, &space, updates.iter().copied());

scopeguard::guard((), |()| {
// flush wgpu's buffering of copy commands (not sure if this is effective).
Expand Down
2 changes: 1 addition & 1 deletion all-is-cubes-gpu/src/in_wgpu.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ pub mod headless;
pub mod init;
mod light_texture;
#[doc(hidden)] // public for benchmark
pub use light_texture::LightTexture;
pub use light_texture::{LightChunk, LightTexture};
mod pipelines;
mod poll;
mod postprocess;
Expand Down
133 changes: 110 additions & 23 deletions all-is-cubes-gpu/src/in_wgpu/light_texture.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use std::array;

use cfg_if::cfg_if;
#[cfg(feature = "auto-threads")]
use rayon::{
Expand All @@ -7,11 +9,12 @@ use rayon::{

use all_is_cubes::math::{
Aab, Axis, Cube, FaceMap, FreeCoordinate, GridAab, GridCoordinate, GridSize, GridSizeCoord,
PositiveSign,
};
use all_is_cubes::space::Space;
use all_is_cubes::{
euclid::{Box3D, Vector3D},
math::PositiveSign,
euclid::{vec3, Box3D, Point3D, Size3D, Vector3D},
math::VectorOps,
};
use all_is_cubes_render::camera::Camera;

Expand Down Expand Up @@ -52,6 +55,60 @@ fn visible_light_volume(space_bounds: GridAab, camera: &Camera) -> GridAab {
.unwrap_or(GridAab::ORIGIN_CUBE)
}

/// Dimensions, in cubes, of the smallest region of a [`LightTexture`] that is updated at once.
/// Outside this module, this size is observable only as the granularity of [`LightChunk`] values.
const LIGHT_CHUNK_SIZE: GridSize = GridSize::new(16, 1, 1);

/// [`LIGHT_CHUNK_SIZE`] with signed components, for arithmetic on cube coordinates.
// The casts cannot wrap: every component of `LIGHT_CHUNK_SIZE` is a small constant.
#[allow(clippy::cast_possible_wrap)]
const LIGHT_CHUNK_SIZE_I32: Size3D<i32, Cube> = {
    let size = LIGHT_CHUNK_SIZE;
    Size3D::new(size.width as i32, size.height as i32, size.depth as i32)
};

/// Number of cubes (and therefore texels) contained in one chunk of [`LIGHT_CHUNK_SIZE`].
const LIGHT_CHUNK_VOLUME: usize = {
    let size = LIGHT_CHUNK_SIZE;
    (size.width * size.height * size.depth) as usize
};

/// Coordinates for a chunk of light values in a [`LightTexture`] to update.
/// These are generally much smaller than mesh chunks.
///
/// This may be lossily converted from a [`Cube`] to find the containing chunk.
/// The conversion is lossy because every cube within the same `LIGHT_CHUNK_SIZE`-sized
/// box produces an equal `LightChunk`; only the chunk's first cube can be recovered.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
#[doc(hidden)] // public for benchmark
// The wrapped point is measured in whole chunks, not cubes: `LightChunk::new()` divides
// cube coordinates by `LIGHT_CHUNK_SIZE`, and `first_cube()` multiplies them back.
pub struct LightChunk(Point3D<i32, ()>);

impl LightChunk {
    /// Returns the chunk that contains `cube`.
    ///
    /// Each coordinate is divided by the corresponding component of the chunk size using
    /// Euclidean division, so cubes with negative coordinates are assigned to the chunk
    /// below them rather than being rounded toward zero.
    pub fn new(cube: Cube) -> Self {
        LightChunk(
            cube.lower_bounds()
                .div_euclid(&LIGHT_CHUNK_SIZE_I32)
                .cast_unit(),
        )
    }

    /// Returns the cube at the lower corner of this chunk, i.e. the chunk coordinates
    /// multiplied component-wise by the chunk size.
    ///
    /// `LightChunk::new(chunk.first_cube())` is equal to `chunk`.
    pub fn first_cube(self) -> Cube {
        Cube::from(
            self.0
                .cast_unit::<Cube>()
                .to_vector()
                .zip(LIGHT_CHUNK_SIZE_I32.to_vector(), |coord, scale| {
                    coord * scale
                })
                .to_point(),
        )
    }

    /// Returns every chunk that contains at least one cube of `region`, without duplicates,
    /// sorted by the coordinates of each chunk's [`first_cube()`](Self::first_cube).
    ///
    /// Intended for tests and benchmarks only. Implemented in a brute-force way
    /// (visiting every cube of `region`) because it doesn’t need to be cheap.
    pub fn all_in_region(region: GridAab) -> Vec<LightChunk> {
        let mut chunks: Vec<LightChunk> = region
            .interior_iter()
            .map(LightChunk::new)
            .collect::<std::collections::HashSet<LightChunk>>() // deduplicate
            .into_iter()
            .collect();
        // After deduplication every sort key is distinct, so an unstable sort yields the
        // same order as a stable one while avoiding the stable sort's temporary allocation.
        chunks.sort_unstable_by_key(|chunk| <[i32; 3]>::from(chunk.first_cube()));
        chunks
    }
}

/// Keeps a 3D [`wgpu::Texture`] up to date with the light data from a [`Space`].
///
/// [`Space`] coordinates are mapped directly to texel coordinates, with modulo wrap-around.
Expand Down Expand Up @@ -80,7 +137,7 @@ pub struct LightTexture {
}

impl LightTexture {
const COPY_BUFFER_TEXELS: usize = 1024;
const COPY_BUFFER_CHUNKS: usize = 512;
const COMPONENTS: usize = 4;

/// Compute the appropriate size of light texture for the given conditions.
Expand All @@ -92,6 +149,7 @@ impl LightTexture {
// Extra volume of 1 extra cube around all sides automatically captures sky light.
let space_size = space_bounds.size() + GridSize::splat(2);

// Compute the size that we need to accommodate the camera view distance.
// times 2 for radius, plus one to account for the effect of rounding up points to
// containing cubes.
let camera_size = GridSize::splat(
Expand All @@ -103,13 +161,24 @@ impl LightTexture {

// The texture need not be bigger than the Space or bigger than the viewable diameter.
// But it must also be within wgpu's limits.
space_size.min(camera_size).clamp(
GridSize::splat(1),
GridSize::splat(limits.max_texture_dimension_3d),
let visually_needed_size = space_size.min(camera_size).max(GridSize::splat(1));

// Round up to a multiple of LIGHT_CHUNK_SIZE;
// this part is for the sake of the implementation of updating rather than because
// we need the data.
let chunked_size =
visually_needed_size.zip(LIGHT_CHUNK_SIZE.cast_unit(), |ss, cs| ss.div_ceil(cs) * cs);

// Limit to wgpu limits, rounded down to chunk.
chunked_size.min(
GridSize::splat(limits.max_texture_dimension_3d)
.zip(LIGHT_CHUNK_SIZE.cast_unit(), |ss, cs| (ss / cs) * cs),
)
}

/// Construct a new texture of the specified size with no data.
///
/// The size must be a size returned by [`LightTexture::choose_size()`].
pub fn new(
label_prefix: &str,
device: &wgpu::Device,
Expand All @@ -135,7 +204,10 @@ impl LightTexture {
texture,
copy_buffer: device.create_buffer(&wgpu::BufferDescriptor {
label: Some(&format!("{label_prefix} space light copy buffer")),
size: u64::try_from(Self::COPY_BUFFER_TEXELS * Self::COMPONENTS).unwrap(),
size: u64::try_from(
Self::COPY_BUFFER_CHUNKS * LIGHT_CHUNK_VOLUME * Self::COMPONENTS,
)
.unwrap(),
usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::COPY_SRC,
mapped_at_creation: false,
}),
Expand Down Expand Up @@ -341,37 +413,52 @@ impl LightTexture {
device: &wgpu::Device,
queue: &wgpu::Queue,
space: &Space,
cubes: impl IntoIterator<Item = Cube>,
chunks: impl IntoIterator<Item = LightChunk>,
) -> usize {
let mut total_count = 0;

let texture_size = extent_to_size3d(self.texture.size()).to_i32();

// Filter out out-of-bounds cubes.
let cubes = cubes
let chunks = chunks
.into_iter()
.filter(|&cube| self.mapped_region.contains_cube(cube));
.filter(|&chunk| self.mapped_region.contains_cube(chunk.first_cube()));

// Break into batches of our buffer size.
for cube_batch in &itertools::Itertools::chunks(cubes, Self::COPY_BUFFER_TEXELS) {
for chunk_batch in &itertools::Itertools::chunks(chunks, Self::COPY_BUFFER_CHUNKS) {
#[allow(clippy::large_stack_arrays)]
let mut data: [Texel; Self::COPY_BUFFER_TEXELS] =
[[0; Self::COMPONENTS]; Self::COPY_BUFFER_TEXELS];
let mut data: [[Texel; LIGHT_CHUNK_VOLUME]; Self::COPY_BUFFER_CHUNKS] =
[[[0; Self::COMPONENTS]; LIGHT_CHUNK_VOLUME]; Self::COPY_BUFFER_CHUNKS];
let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
label: Some("space light scatter-copy"),
});
let mut batch_count = 0;

for (index, cube) in cube_batch.into_iter().enumerate() {
data[index] = space.get_lighting(cube).as_texel();
for (index_in_batch, chunk) in chunk_batch.into_iter().enumerate() {
let first_cube = chunk.first_cube();
data[index_in_batch] = array::from_fn(|texel_index_in_chunk| {
#[allow(clippy::cast_possible_wrap)] // only as big as LIGHT_CHUNK_VOLUME
let texel_index_in_chunk = texel_index_in_chunk as i32;
let offset = vec3(
texel_index_in_chunk.rem_euclid(LIGHT_CHUNK_SIZE_I32.width),
texel_index_in_chunk
.div_euclid(LIGHT_CHUNK_SIZE_I32.width)
.rem_euclid(LIGHT_CHUNK_SIZE_I32.height),
texel_index_in_chunk
.div_euclid(LIGHT_CHUNK_SIZE_I32.width * LIGHT_CHUNK_SIZE_I32.height),
);

space.get_lighting(first_cube + offset).as_texel()
});

// TODO: When compute shaders are available, use a compute shader to do these
// scattered writes instead of issuing individual commands.
encoder.copy_buffer_to_texture(
wgpu::ImageCopyBuffer {
buffer: &self.copy_buffer,
layout: wgpu::ImageDataLayout {
offset: (index * Self::COMPONENTS) as u64,
offset: (index_in_batch * (LIGHT_CHUNK_VOLUME * Self::COMPONENTS))
as u64,
bytes_per_row: None,
rows_per_image: None,
},
Expand All @@ -380,15 +467,11 @@ impl LightTexture {
texture: &self.texture,
mip_level: 0,
origin: point_to_origin(
cube.lower_bounds().rem_euclid(&texture_size).to_u32(),
first_cube.lower_bounds().rem_euclid(&texture_size).to_u32(),
),
aspect: wgpu::TextureAspect::All,
},
wgpu::Extent3d {
width: 1,
height: 1,
depth_or_array_layers: 1,
},
size3d_to_extent(LIGHT_CHUNK_SIZE),
);

batch_count += 1;
Expand All @@ -399,7 +482,11 @@ impl LightTexture {
// To do this optimally, `StagingBelt` will need to be modified to allow
// us accessing its buffers to issue a `copy_buffer_to_texture` instead of
// it issuing a `copy_buffer_to_buffer`.
queue.write_buffer(&self.copy_buffer, 0, data[..batch_count].as_flattened());
queue.write_buffer(
&self.copy_buffer,
0,
data[..batch_count].as_flattened().as_flattened(),
);

queue.submit([encoder.finish()]);
}
Expand Down
9 changes: 5 additions & 4 deletions all-is-cubes-gpu/src/in_wgpu/space.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ use all_is_cubes::chunking::ChunkPos;
use all_is_cubes::content::palette;
use all_is_cubes::listen::{self, Listen as _, Listener};
use all_is_cubes::math::{
rgba_const, Cube, Face6, FreeCoordinate, FreePoint, GridAab, GridCoordinate, GridPoint,
GridSize, GridVector, Rgb, Rgba, Wireframe as _, ZeroOne,
rgba_const, Face6, FreeCoordinate, FreePoint, GridAab, GridCoordinate, GridPoint, GridSize,
GridVector, Rgb, Rgba, Wireframe as _, ZeroOne,
};
use all_is_cubes::raycast::Ray;
#[cfg(feature = "rerun")]
Expand All @@ -29,6 +29,7 @@ use all_is_cubes_render::{Flaws, RenderError};
use crate::in_wgpu::block_texture::BlockTextureViews;
use crate::in_wgpu::frame_texture::FramebufferTextures;
use crate::in_wgpu::glue::{to_wgpu_color, to_wgpu_index_format};
use crate::in_wgpu::light_texture::LightChunk;
use crate::in_wgpu::pipelines::Pipelines;
use crate::in_wgpu::skybox;
use crate::in_wgpu::vertex::{WgpuInstanceData, WgpuLinesVertex};
Expand Down Expand Up @@ -1041,7 +1042,7 @@ struct SpaceRendererTodo {
/// None means do a full space reupload.
///
/// TODO: experiment with different granularities of light invalidation (chunks, dirty rects, etc.)
light: Option<HashSet<Cube>>,
light: Option<HashSet<LightChunk>>,

sky: bool,
}
Expand All @@ -1065,7 +1066,7 @@ impl listen::Store<SpaceChange> for SpaceRendererTodo {
SpaceChange::CubeLight { cube } => {
// None means we're already at "update everything"
if let Some(set) = &mut self.light {
set.insert(cube);
set.insert(LightChunk::new(cube));
}
}
SpaceChange::CubeBlock { .. } => {}
Expand Down
26 changes: 17 additions & 9 deletions all-is-cubes-gpu/tests/shaders/tests.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
use std::sync::Arc;

use all_is_cubes::math::GridSize;
use all_is_cubes::math::{ps64, GridSize, Rgb};
use all_is_cubes::raycast::scale_to_integer_step;
use all_is_cubes::space::Space;
use all_is_cubes::universe::Universe;
use all_is_cubes::util::YieldProgress;

use all_is_cubes_gpu::in_wgpu::{init, LightTexture};
use all_is_cubes_gpu::in_wgpu::{init, LightChunk, LightTexture};

use crate::harness::run_shader_test;
use crate::wgsl::{frag_expr, to_wgsl};
Expand Down Expand Up @@ -77,10 +78,10 @@ async fn scale_to_integer_step_test() {
/// Not a shader test per se, but a test that the light texture updates correctly.
#[tokio::test]
#[rstest::rstest]
async fn light_texture_write_read(#[values(false, true)] use_scatter: bool) {
use all_is_cubes::math::Rgb;
use all_is_cubes::space::Space;

async fn light_texture_write_read(
#[values(false, true)] use_scatter: bool,
#[values(16, 30, 50)] space_size_param: u32,
) {
let ((device, queue), (_universe, space, dark_space)) = tokio::join!(
async {
let instance = crate::harness::instance().await;
Expand All @@ -94,10 +95,12 @@ async fn light_texture_write_read(#[values(false, true)] use_scatter: bool) {
},
async {
let mut universe = Universe::new();
// TODO: the test would be more rigorous with a precise size rather than the rounding
// that lighting_bench_space() does.
let space = all_is_cubes::content::testing::lighting_bench_space(
&mut universe,
YieldProgress::noop(),
GridSize::new(32, 32, 32),
GridSize::splat(space_size_param),
)
.await
.unwrap();
Expand All @@ -112,15 +115,20 @@ async fn light_texture_write_read(#[values(false, true)] use_scatter: bool) {
let mut lt = LightTexture::new(
"light_texture_write_test",
&device,
GridSize::splat(32),
LightTexture::choose_size(&device.limits(), space.bounds(), ps64(1e6)),
wgpu::TextureUsages::COPY_SRC,
);

if use_scatter {
// First initialize with black from dark_space, then refresh it using update_scatter().
lt.ensure_mapped(&queue, &dark_space, space.bounds());

lt.update_scatter(&device, &queue, &space, space.bounds().interior_iter());
lt.update_scatter(
&device,
&queue,
&space,
LightChunk::all_in_region(space.bounds()).into_iter(),
);
} else {
lt.ensure_mapped(&queue, &space, space.bounds());
}
Expand Down

0 comments on commit cc1c9c4

Please sign in to comment.