gpu: Make LightTexture::update_scatter() work in chunks instead of single texels.

This is a workaround for <gfx-rs/wgpu#6827>. However, it should significantly improve throughput too; copying a single texel at a time seems to be quite expensive relative to the cost of copying additional texels.
kpreid · Dec 28, 2024 · cc1c9c4 · cc1c9c4
1 parent 4b09104
commit cc1c9c4
Show file tree

Hide file tree

Showing 5 changed files with 137 additions and 39 deletions.
diff --git a/all-is-cubes-gpu/benches/wgpu.rs b/all-is-cubes-gpu/benches/wgpu.rs
@@ -18,7 +18,7 @@ use all_is_cubes_render::camera::{GraphicsOptions, StandardCameras, Viewport};
 use all_is_cubes_render::Flaws;
 use all_is_cubes_render::HeadlessRenderer;
 
-use all_is_cubes_gpu::in_wgpu::{headless, init, LightTexture};
+use all_is_cubes_gpu::in_wgpu::{headless, init, LightChunk, LightTexture};
 
 fn main() {
     let runtime = tokio::runtime::Builder::new_multi_thread().build().unwrap();
@@ -168,11 +168,13 @@ fn light_benches(runtime: &Runtime, c: &mut Criterion, instance: &wgpu::Instance
             LightTexture::new("lt", &device, bounds.size(), wgpu::TextureUsages::empty());
         let space = Space::builder(bounds).build();
 
+        let updates = LightChunk::all_in_region(bounds);
+
         // update_scatter() will do nothing if not mapped first
         texture.ensure_mapped(&queue, &space, bounds);
 
         b.iter_with_large_drop(|| {
-            texture.update_scatter(&device, &queue, &space, space.bounds().interior_iter());
+            texture.update_scatter(&device, &queue, &space, updates.iter().copied());
 
             scopeguard::guard((), |()| {
                 // flush wgpu's buffering of copy commands (not sure if this is effective).

diff --git a/all-is-cubes-gpu/src/in_wgpu.rs b/all-is-cubes-gpu/src/in_wgpu.rs
@@ -52,7 +52,7 @@ pub mod headless;
 pub mod init;
 mod light_texture;
 #[doc(hidden)] // public for benchmark
-pub use light_texture::LightTexture;
+pub use light_texture::{LightChunk, LightTexture};
 mod pipelines;
 mod poll;
 mod postprocess;

diff --git a/all-is-cubes-gpu/src/in_wgpu/light_texture.rs b/all-is-cubes-gpu/src/in_wgpu/light_texture.rs
@@ -1,3 +1,5 @@
+use std::array;
+
 use cfg_if::cfg_if;
 #[cfg(feature = "auto-threads")]
 use rayon::{
@@ -7,11 +9,12 @@ use rayon::{
 
 use all_is_cubes::math::{
     Aab, Axis, Cube, FaceMap, FreeCoordinate, GridAab, GridCoordinate, GridSize, GridSizeCoord,
+    PositiveSign,
 };
 use all_is_cubes::space::Space;
 use all_is_cubes::{
-    euclid::{Box3D, Vector3D},
-    math::PositiveSign,
+    euclid::{vec3, Box3D, Point3D, Size3D, Vector3D},
+    math::VectorOps,
 };
 use all_is_cubes_render::camera::Camera;
 
@@ -52,6 +55,60 @@ fn visible_light_volume(space_bounds: GridAab, camera: &Camera) -> GridAab {
         .unwrap_or(GridAab::ORIGIN_CUBE)
 }
 
+/// Size of the minimum unit in which we partially update a [`LightTexture`].
+/// This size is not visible outside this module except as the granularity of [`LightChunk`] values.
+const LIGHT_CHUNK_SIZE: GridSize = GridSize::new(16, 1, 1);
+#[allow(clippy::cast_possible_wrap)]
+const LIGHT_CHUNK_SIZE_I32: Size3D<i32, Cube> = Size3D::new(
+    LIGHT_CHUNK_SIZE.width as i32,
+    LIGHT_CHUNK_SIZE.height as i32,
+    LIGHT_CHUNK_SIZE.depth as i32,
+);
+const LIGHT_CHUNK_VOLUME: usize =
+    (LIGHT_CHUNK_SIZE.width * LIGHT_CHUNK_SIZE.height * LIGHT_CHUNK_SIZE.depth) as usize;
+
+/// Coordinates for a chunk of light values in a [`LightTexture`] to update.
+/// These are generally much smaller than mesh chunks.
+///
+/// This may be lossily converted from a [`Cube`] to find the containing chunk.
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
+#[doc(hidden)] // public for benchmark
+pub struct LightChunk(Point3D<i32, ()>);
+
+impl LightChunk {
+    pub fn new(cube: Cube) -> Self {
+        LightChunk(
+            cube.lower_bounds()
+                .div_euclid(&LIGHT_CHUNK_SIZE_I32)
+                .cast_unit(),
+        )
+    }
+
+    pub fn first_cube(self) -> Cube {
+        Cube::from(
+            self.0
+                .cast_unit::<Cube>()
+                .to_vector()
+                .zip(LIGHT_CHUNK_SIZE_I32.to_vector(), |coord, scale| {
+                    coord * scale
+                })
+                .to_point(),
+        )
+    }
+
+    /// For tests and benchmarks only. Implemented by brute force because it doesn’t need to be cheap.
+    pub fn all_in_region(region: GridAab) -> Vec<LightChunk> {
+        let mut chunks: Vec<LightChunk> = region
+            .interior_iter()
+            .map(LightChunk::new)
+            .collect::<std::collections::HashSet<LightChunk>>() // deduplicate
+            .into_iter()
+            .collect();
+        chunks.sort_by_key(|chunk| <[i32; 3]>::from(chunk.first_cube()));
+        chunks
+    }
+}
+
 /// Keeps a 3D [`wgpu::Texture`] up to date with the light data from a [`Space`].
 ///
 /// [`Space`] coordinates are mapped directly to texel coordinates, with modulo wrap-around.
@@ -80,7 +137,7 @@ pub struct LightTexture {
 }
 
 impl LightTexture {
-    const COPY_BUFFER_TEXELS: usize = 1024;
+    const COPY_BUFFER_CHUNKS: usize = 512;
     const COMPONENTS: usize = 4;
 
     /// Compute the appropriate size of light texture for the given conditions.
@@ -92,6 +149,7 @@ impl LightTexture {
         // Extra volume of 1 extra cube around all sides automatically captures sky light.
         let space_size = space_bounds.size() + GridSize::splat(2);
 
+        // Compute the size that we need to accommodate the camera view distance.
         // times 2 for radius, plus one to account for the effect of rounding up points to
         // containing cubes.
         let camera_size = GridSize::splat(
@@ -103,13 +161,24 @@ impl LightTexture {
 
         // The texture need not be bigger than the Space or bigger than the viewable diameter.
         // But it must also be within wgpu's limits.
-        space_size.min(camera_size).clamp(
-            GridSize::splat(1),
-            GridSize::splat(limits.max_texture_dimension_3d),
+        let visually_needed_size = space_size.min(camera_size).max(GridSize::splat(1));
+
+        // Round up to a multiple of LIGHT_CHUNK_SIZE;
+        // this part is for the sake of the implementation of updating rather than because
+        // we need the data.
+        let chunked_size =
+            visually_needed_size.zip(LIGHT_CHUNK_SIZE.cast_unit(), |ss, cs| ss.div_ceil(cs) * cs);
+
+        // Limit to wgpu's limits, rounded down to a multiple of LIGHT_CHUNK_SIZE.
+        chunked_size.min(
+            GridSize::splat(limits.max_texture_dimension_3d)
+                .zip(LIGHT_CHUNK_SIZE.cast_unit(), |ss, cs| (ss / cs) * cs),
         )
     }
 
     /// Construct a new texture of the specified size with no data.
+    ///
+    /// The size must be a size returned by [`LightTexture::choose_size()`].
     pub fn new(
         label_prefix: &str,
         device: &wgpu::Device,
@@ -135,7 +204,10 @@ impl LightTexture {
             texture,
             copy_buffer: device.create_buffer(&wgpu::BufferDescriptor {
                 label: Some(&format!("{label_prefix} space light copy buffer")),
-                size: u64::try_from(Self::COPY_BUFFER_TEXELS * Self::COMPONENTS).unwrap(),
+                size: u64::try_from(
+                    Self::COPY_BUFFER_CHUNKS * LIGHT_CHUNK_VOLUME * Self::COMPONENTS,
+                )
+                .unwrap(),
                 usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::COPY_SRC,
                 mapped_at_creation: false,
             }),
@@ -341,37 +413,52 @@ impl LightTexture {
         device: &wgpu::Device,
         queue: &wgpu::Queue,
         space: &Space,
-        cubes: impl IntoIterator<Item = Cube>,
+        chunks: impl IntoIterator<Item = LightChunk>,
     ) -> usize {
         let mut total_count = 0;
 
         let texture_size = extent_to_size3d(self.texture.size()).to_i32();
 
         // Filter out out-of-bounds cubes.
-        let cubes = cubes
+        let chunks = chunks
             .into_iter()
-            .filter(|&cube| self.mapped_region.contains_cube(cube));
+            .filter(|&chunk| self.mapped_region.contains_cube(chunk.first_cube()));
 
         // Break into batches of our buffer size.
-        for cube_batch in &itertools::Itertools::chunks(cubes, Self::COPY_BUFFER_TEXELS) {
+        for chunk_batch in &itertools::Itertools::chunks(chunks, Self::COPY_BUFFER_CHUNKS) {
             #[allow(clippy::large_stack_arrays)]
-            let mut data: [Texel; Self::COPY_BUFFER_TEXELS] =
-                [[0; Self::COMPONENTS]; Self::COPY_BUFFER_TEXELS];
+            let mut data: [[Texel; LIGHT_CHUNK_VOLUME]; Self::COPY_BUFFER_CHUNKS] =
+                [[[0; Self::COMPONENTS]; LIGHT_CHUNK_VOLUME]; Self::COPY_BUFFER_CHUNKS];
             let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
                 label: Some("space light scatter-copy"),
             });
             let mut batch_count = 0;
 
-            for (index, cube) in cube_batch.into_iter().enumerate() {
-                data[index] = space.get_lighting(cube).as_texel();
+            for (index_in_batch, chunk) in chunk_batch.into_iter().enumerate() {
+                let first_cube = chunk.first_cube();
+                data[index_in_batch] = array::from_fn(|texel_index_in_chunk| {
+                    #[allow(clippy::cast_possible_wrap)] // only as big as LIGHT_CHUNK_VOLUME
+                    let texel_index_in_chunk = texel_index_in_chunk as i32;
+                    let offset = vec3(
+                        texel_index_in_chunk.rem_euclid(LIGHT_CHUNK_SIZE_I32.width),
+                        texel_index_in_chunk
+                            .div_euclid(LIGHT_CHUNK_SIZE_I32.width)
+                            .rem_euclid(LIGHT_CHUNK_SIZE_I32.height),
+                        texel_index_in_chunk
+                            .div_euclid(LIGHT_CHUNK_SIZE_I32.width * LIGHT_CHUNK_SIZE_I32.height),
+                    );
+
+                    space.get_lighting(first_cube + offset).as_texel()
+                });
 
                 // TODO: When compute shaders are available, use a compute shader to do these
                 // scattered writes instead of issuing individual commands.
                 encoder.copy_buffer_to_texture(
                     wgpu::ImageCopyBuffer {
                         buffer: &self.copy_buffer,
                         layout: wgpu::ImageDataLayout {
-                            offset: (index * Self::COMPONENTS) as u64,
+                            offset: (index_in_batch * (LIGHT_CHUNK_VOLUME * Self::COMPONENTS))
+                                as u64,
                             bytes_per_row: None,
                             rows_per_image: None,
                         },
@@ -380,15 +467,11 @@ impl LightTexture {
                         texture: &self.texture,
                         mip_level: 0,
                         origin: point_to_origin(
-                            cube.lower_bounds().rem_euclid(&texture_size).to_u32(),
+                            first_cube.lower_bounds().rem_euclid(&texture_size).to_u32(),
                         ),
                         aspect: wgpu::TextureAspect::All,
                     },
-                    wgpu::Extent3d {
-                        width: 1,
-                        height: 1,
-                        depth_or_array_layers: 1,
-                    },
+                    size3d_to_extent(LIGHT_CHUNK_SIZE),
                 );
 
                 batch_count += 1;
@@ -399,7 +482,11 @@ impl LightTexture {
             // To do this optimally, `StagingBelt` will need to be modified to allow
             // us accessing its buffers to issue a `copy_buffer_to_texture` instead of
             // it issuing a `copy_buffer_to_buffer`.
-            queue.write_buffer(&self.copy_buffer, 0, data[..batch_count].as_flattened());
+            queue.write_buffer(
+                &self.copy_buffer,
+                0,
+                data[..batch_count].as_flattened().as_flattened(),
+            );
 
             queue.submit([encoder.finish()]);
         }

diff --git a/all-is-cubes-gpu/src/in_wgpu/space.rs b/all-is-cubes-gpu/src/in_wgpu/space.rs
@@ -11,8 +11,8 @@ use all_is_cubes::chunking::ChunkPos;
 use all_is_cubes::content::palette;
 use all_is_cubes::listen::{self, Listen as _, Listener};
 use all_is_cubes::math::{
-    rgba_const, Cube, Face6, FreeCoordinate, FreePoint, GridAab, GridCoordinate, GridPoint,
-    GridSize, GridVector, Rgb, Rgba, Wireframe as _, ZeroOne,
+    rgba_const, Face6, FreeCoordinate, FreePoint, GridAab, GridCoordinate, GridPoint, GridSize,
+    GridVector, Rgb, Rgba, Wireframe as _, ZeroOne,
 };
 use all_is_cubes::raycast::Ray;
 #[cfg(feature = "rerun")]
@@ -29,6 +29,7 @@ use all_is_cubes_render::{Flaws, RenderError};
 use crate::in_wgpu::block_texture::BlockTextureViews;
 use crate::in_wgpu::frame_texture::FramebufferTextures;
 use crate::in_wgpu::glue::{to_wgpu_color, to_wgpu_index_format};
+use crate::in_wgpu::light_texture::LightChunk;
 use crate::in_wgpu::pipelines::Pipelines;
 use crate::in_wgpu::skybox;
 use crate::in_wgpu::vertex::{WgpuInstanceData, WgpuLinesVertex};
@@ -1041,7 +1042,7 @@ struct SpaceRendererTodo {
     /// None means do a full space reupload.
     ///
     /// TODO: experiment with different granularities of light invalidation (chunks, dirty rects, etc.)
-    light: Option<HashSet<Cube>>,
+    light: Option<HashSet<LightChunk>>,
 
     sky: bool,
 }
@@ -1065,7 +1066,7 @@ impl listen::Store<SpaceChange> for SpaceRendererTodo {
                 SpaceChange::CubeLight { cube } => {
                     // None means we're already at "update everything"
                     if let Some(set) = &mut self.light {
-                        set.insert(cube);
+                        set.insert(LightChunk::new(cube));
                     }
                 }
                 SpaceChange::CubeBlock { .. } => {}

diff --git a/all-is-cubes-gpu/tests/shaders/tests.rs b/all-is-cubes-gpu/tests/shaders/tests.rs
@@ -1,11 +1,12 @@
 use std::sync::Arc;
 
-use all_is_cubes::math::GridSize;
+use all_is_cubes::math::{ps64, GridSize, Rgb};
 use all_is_cubes::raycast::scale_to_integer_step;
+use all_is_cubes::space::Space;
 use all_is_cubes::universe::Universe;
 use all_is_cubes::util::YieldProgress;
 
-use all_is_cubes_gpu::in_wgpu::{init, LightTexture};
+use all_is_cubes_gpu::in_wgpu::{init, LightChunk, LightTexture};
 
 use crate::harness::run_shader_test;
 use crate::wgsl::{frag_expr, to_wgsl};
@@ -77,10 +78,10 @@ async fn scale_to_integer_step_test() {
 /// Not a shader test per se, but a test that the light texture updates correctly.
 #[tokio::test]
 #[rstest::rstest]
-async fn light_texture_write_read(#[values(false, true)] use_scatter: bool) {
-    use all_is_cubes::math::Rgb;
-    use all_is_cubes::space::Space;
-
+async fn light_texture_write_read(
+    #[values(false, true)] use_scatter: bool,
+    #[values(16, 30, 50)] space_size_param: u32,
+) {
     let ((device, queue), (_universe, space, dark_space)) = tokio::join!(
         async {
             let instance = crate::harness::instance().await;
@@ -94,10 +95,12 @@ async fn light_texture_write_read(#[values(false, true)] use_scatter: bool) {
         },
         async {
             let mut universe = Universe::new();
+            // TODO: the test would be more rigorous with a precise size rather than the rounding
+            // that lighting_bench_space() does.
             let space = all_is_cubes::content::testing::lighting_bench_space(
                 &mut universe,
                 YieldProgress::noop(),
-                GridSize::new(32, 32, 32),
+                GridSize::splat(space_size_param),
             )
             .await
             .unwrap();
@@ -112,15 +115,20 @@ async fn light_texture_write_read(#[values(false, true)] use_scatter: bool) {
     let mut lt = LightTexture::new(
         "light_texture_write_test",
         &device,
-        GridSize::splat(32),
+        LightTexture::choose_size(&device.limits(), space.bounds(), ps64(1e6)),
         wgpu::TextureUsages::COPY_SRC,
     );
 
     if use_scatter {
         // First initialize with black from dark_space, then refresh it using update_scatter().
         lt.ensure_mapped(&queue, &dark_space, space.bounds());
 
-        lt.update_scatter(&device, &queue, &space, space.bounds().interior_iter());
+        lt.update_scatter(
+            &device,
+            &queue,
+            &space,
+            LightChunk::all_in_region(space.bounds()).into_iter(),
+        );
     } else {
         lt.ensure_mapped(&queue, &space, space.bounds());
     }