Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified assets/shader.vert
Binary file not shown.
27 changes: 27 additions & 0 deletions ext/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ target_compile_definitions(glm PUBLIC
message(STATUS "[Vulkan-Headers]")
add_subdirectory(src/Vulkan-Headers)

# add VulkanMemoryAllocator to build tree
message(STATUS "[VulkanMemoryAllocator]")
add_subdirectory(src/VulkanMemoryAllocator)

# setup Dear ImGui library
message(STATUS "[Dear ImGui]")
add_library(imgui)
Expand Down Expand Up @@ -55,6 +59,28 @@ target_sources(imgui PRIVATE
src/imgui/backends/imgui_impl_vulkan.h
)

# setup vma library (source file with VMA interface)
message(STATUS "[vma]")
add_library(vma)
add_library(vma::vma ALIAS vma)
target_link_libraries(vma PUBLIC
Vulkan::Headers
GPUOpen::VulkanMemoryAllocator
)
target_include_directories(vma SYSTEM PUBLIC
src/VulkanMemoryAllocator/include
)
target_compile_definitions(vma PUBLIC
VMA_STATIC_VULKAN_FUNCTIONS=0
VMA_DYNAMIC_VULKAN_FUNCTIONS=1
)
target_sources(vma PRIVATE
vk_mem_alloc.cpp
)

# ignore compiler warnings
target_compile_options(vma PRIVATE -w)

# declare ext library target
add_library(${PROJECT_NAME} INTERFACE)
add_library(learn-vk::ext ALIAS ${PROJECT_NAME})
Expand All @@ -63,6 +89,7 @@ add_library(learn-vk::ext ALIAS ${PROJECT_NAME})
target_link_libraries(${PROJECT_NAME} INTERFACE
glm::glm
imgui::imgui
vma::vma
)

# setup preprocessor defines
Expand Down
Binary file modified ext/src.zip
Binary file not shown.
3 changes: 3 additions & 0 deletions ext/vk_mem_alloc.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#define VMA_IMPLEMENTATION

#include <vk_mem_alloc.h>
10 changes: 10 additions & 0 deletions guide/src/SUMMARY.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,13 @@
- [GLSL to SPIR-V](shader_objects/glsl_to_spir_v.md)
- [Drawing a Triangle](shader_objects/drawing_triangle.md)
- [Graphics Pipelines](shader_objects/pipelines.md)

# Shader Resources

- [Memory Allocation](memory/README.md)
- [Vulkan Memory Allocator](memory/vma.md)
- [Buffers](memory/buffers.md)
- [Vertex Buffer](memory/vertex_buffer.md)
- [Command Block](memory/command_block.md)
- [Device Buffers](memory/device_buffers.md)
- [Images](memory/images.md)
5 changes: 5 additions & 0 deletions guide/src/memory/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Memory Allocation

Being an explicit API, [allocating memory](https://docs.vulkan.org/guide/latest/memory_allocation.html) in Vulkan that can be used by the device is the application's responsibility. The specifics can get quite complicated, but as recommended by the spec, we shall simply defer all that to a library: [Vulkan Memory Allocator (VMA)](https://github.com/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator).

Vulkan exposes two kinds of objects that use such allocated memory: Buffers and Images. VMA offers transparent support for both: we just have to allocate/free buffers and images through VMA instead of the device directly. Unlike memory allocation / object construction on the CPU, there are many more parameters (than, say, alignment and size) to provide for the creation of buffers and images. As you might have guessed, we shall constrain ourselves to a subset that's relevant for shader resources: vertex buffers, uniform/storage buffers, and texture images.
94 changes: 94 additions & 0 deletions guide/src/memory/buffers.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Buffers

First add the RAII wrapper components for VMA buffers:

```cpp
// Plain aggregate describing a VMA-allocated buffer and its bookkeeping.
// Destruction is delegated to BufferDeleter so this can be wrapped in a
// unique/scoped handle elsewhere.
struct RawBuffer {
  // View over the persistently mapped memory. Only meaningful for buffers
  // created with VMA_ALLOCATION_CREATE_MAPPED_BIT (host buffers); for
  // device-only buffers `mapped` is null and the span must not be used.
  [[nodiscard]] auto mapped_span() const -> std::span<std::byte> {
    return std::span{static_cast<std::byte*>(mapped), size};
  }

  auto operator==(RawBuffer const& rhs) const -> bool = default;

  VmaAllocator allocator{};   // allocator that owns `allocation`
  VmaAllocation allocation{}; // VMA allocation handle
  vk::Buffer buffer{};        // Vulkan buffer object
  vk::DeviceSize size{};      // size in bytes requested at creation
  void* mapped{};             // persistent mapping, or null if not host-visible
};

// Deleter: releases both the vk::Buffer and its VmaAllocation.
struct BufferDeleter {
  void operator()(RawBuffer const& raw_buffer) const noexcept;
};

// ...
void BufferDeleter::operator()(RawBuffer const& raw_buffer) const noexcept {
  // vmaDestroyBuffer destroys the buffer and frees the allocation in one call;
  // it is a no-op for null handles, so default-constructed RawBuffers are safe.
  vmaDestroyBuffer(raw_buffer.allocator, raw_buffer.buffer,
                   raw_buffer.allocation);
}
```

Buffers can be backed by host (RAM) or device (VRAM) memory: the former is mappable and thus useful for data that changes every frame, while the latter is faster for the GPU to access but requires more involved methods to copy data into. Add the related types and a create function:

```cpp
// Parameters shared by all buffer-creation calls.
struct BufferCreateInfo {
  VmaAllocator allocator;     // allocator to create the buffer with
  vk::BufferUsageFlags usage; // eg eVertexBuffer, eUniformBuffer, ...
  std::uint32_t queue_family; // queue family that will access the buffer
};

// Host: mappable (RAM-backed), for per-frame data. Device: VRAM-backed,
// faster for the GPU but requires a staged transfer to fill.
enum class BufferMemoryType : std::int8_t { Host, Device };

// Creates a buffer of `size` bytes; returns a null Buffer on failure.
[[nodiscard]] auto create_buffer(BufferCreateInfo const& create_info,
                                 BufferMemoryType memory_type,
                                 vk::DeviceSize size) -> Buffer;

// ...
// Creates a VMA-backed buffer. Host buffers are persistently mapped;
// Device buffers automatically gain TransferDst so they can be filled
// via a staging copy. Returns a default (null) Buffer on failure.
auto vma::create_buffer(BufferCreateInfo const& create_info,
                        BufferMemoryType const memory_type,
                        vk::DeviceSize const size) -> Buffer {
  // Vulkan forbids zero-sized buffers; fail early with a null handle.
  if (size == 0) {
    std::println(stderr, "Buffer cannot be 0-sized");
    return {};
  }

  auto allocation_ci = VmaAllocationCreateInfo{};
  // NOTE(review): this flag is set for BOTH memory types; with the AUTO
  // usage modes it asks VMA for host-visible memory even for Device
  // buffers (eg ReBAR) — confirm this is intended rather than Host-only.
  allocation_ci.flags =
      VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT;
  auto usage = create_info.usage;
  if (memory_type == BufferMemoryType::Device) {
    allocation_ci.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE;
    // device buffers need to support TransferDst.
    usage |= vk::BufferUsageFlagBits::eTransferDst;
  } else {
    allocation_ci.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST;
    // host buffers can provide mapped memory.
    allocation_ci.flags |= VMA_ALLOCATION_CREATE_MAPPED_BIT;
  }

  // Build the Vulkan-HPP create info, then reinterpret as the C struct
  // that vmaCreateBuffer expects.
  auto buffer_ci = vk::BufferCreateInfo{};
  buffer_ci.setQueueFamilyIndices(create_info.queue_family)
      .setSize(size)
      .setUsage(usage);
  auto vma_buffer_ci = static_cast<VkBufferCreateInfo>(buffer_ci);

  VmaAllocation allocation{};
  VkBuffer buffer{};
  auto allocation_info = VmaAllocationInfo{};
  auto const result =
      vmaCreateBuffer(create_info.allocator, &vma_buffer_ci, &allocation_ci,
                      &buffer, &allocation, &allocation_info);
  if (result != VK_SUCCESS) {
    std::println(stderr, "Failed to create VMA Buffer");
    return {};
  }

  // pMappedData is only non-null when MAPPED_BIT was requested (Host).
  return RawBuffer{
      .allocator = create_info.allocator,
      .allocation = allocation,
      .buffer = buffer,
      .size = size,
      .mapped = allocation_info.pMappedData,
  };
}
```
84 changes: 84 additions & 0 deletions guide/src/memory/command_block.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# Command Block

Long-lived vertex buffers perform better when backed by Device memory, especially for 3D meshes. Data is transferred to device buffers in two steps:

1. Allocate a host buffer and copy the data to its mapped memory
1. Allocate a device buffer, record a Buffer Copy operation and submit it

The second step requires a command buffer and queue submission (_and_ waiting for the submitted work to complete). Encapsulate this behavior into a class, it will also be used for creating images:

```cpp
// One-shot command recorder: allocates a primary command buffer from an
// existing (Transient) pool, begins recording in the constructor, and on
// submit_and_wait() submits to the queue and blocks until completion.
class CommandBlock {
 public:
  // `command_pool` must outlive this object; `queue` must belong to the
  // family the pool was created for.
  explicit CommandBlock(vk::Device device, vk::Queue queue,
                        vk::CommandPool command_pool);

  // Command buffer currently being recorded (valid until submit_and_wait()).
  [[nodiscard]] auto command_buffer() const -> vk::CommandBuffer {
    return *m_command_buffer;
  }

  // Ends recording, submits, waits on a fence, then frees the buffer.
  void submit_and_wait();

 private:
  vk::Device m_device{};
  vk::Queue m_queue{};
  vk::UniqueCommandBuffer m_command_buffer{};
};
```

The constructor takes an existing command pool created for such ad-hoc allocations, and the queue for submission later. This way it can be passed around after creation and used by other code.

```cpp
// Allocates one primary command buffer from `command_pool` and immediately
// begins recording with OneTimeSubmit (the buffer is used exactly once).
CommandBlock::CommandBlock(vk::Device const device, vk::Queue const queue,
                           vk::CommandPool const command_pool)
    : m_device(device), m_queue(queue) {
  // allocate a UniqueCommandBuffer which will free the underlying command
  // buffer from its owning pool on destruction.
  auto allocate_info = vk::CommandBufferAllocateInfo{};
  allocate_info.setCommandPool(command_pool)
      .setCommandBufferCount(1)
      .setLevel(vk::CommandBufferLevel::ePrimary);
  // all the current VulkanHPP functions for UniqueCommandBuffer allocation
  // return vectors.
  auto command_buffers = m_device.allocateCommandBuffersUnique(allocate_info);
  m_command_buffer = std::move(command_buffers.front());

  // start recording commands before returning.
  auto begin_info = vk::CommandBufferBeginInfo{};
  begin_info.setFlags(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
  m_command_buffer->begin(begin_info);
}
```

`submit_and_wait()` resets the unique command buffer at the end, to free it from its command pool:

```cpp
// Ends recording, submits the command buffer to the queue with a fresh
// fence, and blocks until the fence signals (30s timeout). The command
// buffer is then freed back to its pool; calling again is a no-op.
void CommandBlock::submit_and_wait() {
  if (!m_command_buffer) { return; }

  // end recording and submit.
  m_command_buffer->end();
  auto submit_info = vk::SubmitInfo2KHR{};
  auto const command_buffer_info =
      vk::CommandBufferSubmitInfo{*m_command_buffer};
  submit_info.setCommandBufferInfos(command_buffer_info);
  auto fence = m_device.createFenceUnique({});
  m_queue.submit2(submit_info, *fence);

  // wait for submit fence to be signaled.
  static constexpr auto timeout_v =
      static_cast<std::uint64_t>(std::chrono::nanoseconds(30s).count());
  auto const result = m_device.waitForFences(*fence, vk::True, timeout_v);
  if (result != vk::Result::eSuccess) {
    // the submit above succeeded; it is the fence wait that failed
    // (timeout or device loss) — report that, not a submission failure.
    // NOTE(review): on timeout the GPU may still be using the buffer;
    // freeing it below is then unsafe — confirm acceptable for this guide.
    std::println(stderr, "Failed to wait for Command Buffer fence");
  }
  // free the command buffer.
  m_command_buffer.reset();
}
```

## Multithreading considerations

Instead of blocking the main thread on every Command Block's `submit_and_wait()`, you might be wondering if command block usage could be multithreaded. The answer is yes! But with some extra work: each thread will require its own command pool - just using one owned (unique) pool per Command Block (with no need to free the buffer) is a good starting point. All queue operations need to be synchronized, i.e. made critical sections protected by a mutex. This includes Swapchain acquire/present calls, and Queue submissions. A `class Queue` value type that stores a copy of the `vk::Queue` and a pointer/reference to its `std::mutex` - and wraps the submit call - can be passed to command blocks. Just this much will enable asynchronous asset loading etc, as each loading thread will use its own command pool, and queue submissions all around will be critical sections. `VmaAllocator` is internally synchronized (this can be disabled at build time), so performing allocations through the same allocator on multiple threads is safe.

For multi-threaded rendering, use a Secondary command buffer per thread to record rendering commands, accumulate and execute them in the main (Primary) command buffer currently in `RenderSync`. This is not particularly helpful unless you have thousands of expensive draw calls and dozens of render passes, as recording even a hundred draws will likely be faster on a single thread.
133 changes: 133 additions & 0 deletions guide/src/memory/device_buffers.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
# Device Buffers

This guide will only use device buffers for vertex buffers, where both vertex and index data will be strung together in a single VBO. The create function can thus take the data and perform the buffer copy operation before returning. In essence this return value is a "GPU const" buffer. To enable utilizing separate spans for vertices and indices (instead of forcing allocation of a contiguous bytestream and copying the data), the create function takes a slightly awkward span of spans:

```cpp
// disparate byte spans: lets callers pass vertices and indices (or any
// number of segments) without first concatenating them into one buffer.
using ByteSpans = std::span<std::span<std::byte const> const>;

// returns a Device Buffer with each byte span sequentially written.
// Takes the CommandBlock by value: ownership is required so the staging
// buffer outlives the GPU copy (see create_device_buffer's definition).
[[nodiscard]] auto create_device_buffer(BufferCreateInfo const& create_info,
                                        CommandBlock command_block,
                                        ByteSpans const& byte_spans) -> Buffer;
```

Implement `create_device_buffer()`:

```cpp
// Creates a Device buffer and fills it: the byte spans are written
// back-to-back into a host staging buffer, then copied on the GPU via the
// CommandBlock, which is submitted and waited on before returning.
auto vma::create_device_buffer(BufferCreateInfo const& create_info,
                               CommandBlock command_block,
                               ByteSpans const& byte_spans) -> Buffer {
  // total size = sum of all span sizes (0uz keeps the accumulator size_t).
  auto const total_size = std::accumulate(
      byte_spans.begin(), byte_spans.end(), 0uz,
      [](std::size_t const n, std::span<std::byte const> bytes) {
        return n + bytes.size();
      });

  auto staging_ci = create_info;
  staging_ci.usage = vk::BufferUsageFlagBits::eTransferSrc;

  // create staging Host Buffer with TransferSrc usage.
  auto staging_buffer =
      create_buffer(staging_ci, BufferMemoryType::Host, total_size);
  // create the Device Buffer.
  auto ret = create_buffer(create_info, BufferMemoryType::Device, total_size);
  // can't do anything if either buffer creation failed.
  if (!staging_buffer.get().buffer || !ret.get().buffer) { return {}; }

  // copy byte spans into staging buffer, advancing past each segment.
  auto dst = staging_buffer.get().mapped_span();
  for (auto const bytes : byte_spans) {
    std::memcpy(dst.data(), bytes.data(), bytes.size());
    dst = dst.subspan(bytes.size());
  }

  // record buffer copy operation (whole-buffer copy, offsets default to 0).
  auto buffer_copy = vk::BufferCopy2{};
  buffer_copy.setSize(total_size);
  auto copy_buffer_info = vk::CopyBufferInfo2{};
  copy_buffer_info.setSrcBuffer(staging_buffer.get().buffer)
      .setDstBuffer(ret.get().buffer)
      .setRegions(buffer_copy);
  command_block.command_buffer().copyBuffer2(copy_buffer_info);

  // submit and wait.
  // waiting here is necessary to keep the staging buffer alive while the GPU
  // accesses it through the recorded commands.
  // this is also why the function takes ownership of the passed CommandBlock
  // instead of just referencing it / taking a vk::CommandBuffer.
  command_block.submit_and_wait();

  return ret;
}
```

Add a command block pool to `App`, and a helper function to create command blocks:

```cpp
// Creates the command pool dedicated to ad-hoc CommandBlock allocations,
// on the same queue family used for rendering/transfer.
void App::create_cmd_block_pool() {
  auto command_pool_ci = vk::CommandPoolCreateInfo{};
  command_pool_ci
      .setQueueFamilyIndex(m_gpu.queue_family)
      // this flag indicates that the allocated Command Buffers will be
      // short-lived.
      .setFlags(vk::CommandPoolCreateFlagBits::eTransient);
  m_cmd_block_pool = m_device->createCommandPoolUnique(command_pool_ci);
}

// Convenience factory: a CommandBlock begins recording on construction.
auto App::create_command_block() const -> CommandBlock {
  return CommandBlock{*m_device, m_queue, *m_cmd_block_pool};
}
```

Update `create_vertex_buffer()` to create a quad with indices:

```cpp
// Reinterprets any trivially-copyable value as an array of bytes.
template <typename T>
[[nodiscard]] constexpr auto to_byte_array(T const& t) {
  return std::bit_cast<std::array<std::byte, sizeof(T)>>(t);
}

// ...
// Builds the combined vertex+index Device buffer (VBO) for a quad.
void App::create_vertex_buffer() {
  // vertices of a quad.
  static constexpr auto vertices_v = std::array{
      Vertex{.position = {-0.5f, -0.5f}, .color = {1.0f, 0.0f, 0.0f}},
      Vertex{.position = {0.5f, -0.5f}, .color = {0.0f, 1.0f, 0.0f}},
      Vertex{.position = {0.5f, 0.5f}, .color = {0.0f, 0.0f, 1.0f}},
      Vertex{.position = {-0.5f, 0.5f}, .color = {1.0f, 1.0f, 0.0f}},
  };
  // two triangles; u32 to match vk::IndexType::eUint32 used in draw().
  static constexpr auto indices_v = std::array{
      0u, 1u, 2u, 2u, 3u, 0u,
  };
  static constexpr auto vertices_bytes_v = to_byte_array(vertices_v);
  static constexpr auto indices_bytes_v = to_byte_array(indices_v);
  static constexpr auto total_bytes_v =
      std::array<std::span<std::byte const>, 2>{
          vertices_bytes_v,
          indices_bytes_v,
      };
  // we want to write total_bytes_v to a Device VertexBuffer | IndexBuffer.
  // create_device_buffer() takes a BufferCreateInfo (see its declaration),
  // so assemble one — including the queue family — rather than passing
  // loose allocator/usage arguments, which does not match its signature.
  auto const buffer_ci = vma::BufferCreateInfo{
      .allocator = m_allocator.get(),
      .usage = vk::BufferUsageFlagBits::eVertexBuffer |
               vk::BufferUsageFlagBits::eIndexBuffer,
      .queue_family = m_gpu.queue_family,
  };
  m_vbo = vma::create_device_buffer(buffer_ci, create_command_block(),
                                    total_bytes_v);
}
```

Update `draw()`:

```cpp
// Records the quad draw: binds the shader, then the combined VBO as both
// vertex and index buffer (index data sits right after the 4 vertices).
void App::draw(vk::CommandBuffer const command_buffer) const {
  m_shader->bind(command_buffer, m_framebuffer_size);
  // single VBO at binding 0 at no offset.
  command_buffer.bindVertexBuffers(0, m_vbo.get().buffer, vk::DeviceSize{});
  // u32 indices after offset of 4 vertices.
  command_buffer.bindIndexBuffer(m_vbo.get().buffer, 4 * sizeof(Vertex),
                                 vk::IndexType::eUint32);
  // m_vbo has 6 indices.
  command_buffer.drawIndexed(6, 1, 0, 0, 0);
}
```

![VBO Quad](./vbo_quad.png)
Loading