Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified assets/shader.vert
Binary file not shown.
27 changes: 27 additions & 0 deletions ext/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ target_compile_definitions(glm PUBLIC
message(STATUS "[Vulkan-Headers]")
add_subdirectory(src/Vulkan-Headers)

# add VulkanMemoryAllocator to build tree
message(STATUS "[VulkanMemoryAllocator]")
add_subdirectory(src/VulkanMemoryAllocator)

# setup Dear ImGui library
message(STATUS "[Dear ImGui]")
add_library(imgui)
Expand Down Expand Up @@ -55,6 +59,28 @@ target_sources(imgui PRIVATE
src/imgui/backends/imgui_impl_vulkan.h
)

# setup vma library (source file with VMA interface)
message(STATUS "[vma]")
add_library(vma)
add_library(vma::vma ALIAS vma)
target_link_libraries(vma PUBLIC
Vulkan::Headers
GPUOpen::VulkanMemoryAllocator
)
target_include_directories(vma SYSTEM PUBLIC
src/VulkanMemoryAllocator/include
)
target_compile_definitions(vma PUBLIC
VMA_STATIC_VULKAN_FUNCTIONS=0
VMA_DYNAMIC_VULKAN_FUNCTIONS=1
)
target_sources(vma PRIVATE
vk_mem_alloc.cpp
)

# ignore compiler warnings
target_compile_options(vma PRIVATE -w)

# declare ext library target
add_library(${PROJECT_NAME} INTERFACE)
add_library(learn-vk::ext ALIAS ${PROJECT_NAME})
Expand All @@ -63,6 +89,7 @@ add_library(learn-vk::ext ALIAS ${PROJECT_NAME})
target_link_libraries(${PROJECT_NAME} INTERFACE
glm::glm
imgui::imgui
vma::vma
)

# setup preprocessor defines
Expand Down
Binary file modified ext/src.zip
Binary file not shown.
3 changes: 3 additions & 0 deletions ext/vk_mem_alloc.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#define VMA_IMPLEMENTATION

#include <vk_mem_alloc.h>
10 changes: 10 additions & 0 deletions guide/src/SUMMARY.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,13 @@
- [GLSL to SPIR-V](shader_objects/glsl_to_spir_v.md)
- [Drawing a Triangle](shader_objects/drawing_triangle.md)
- [Graphics Pipelines](shader_objects/pipelines.md)

# Shader Resources

- [Memory Allocation](memory/README.md)
- [Vulkan Memory Allocator](memory/vma.md)
- [Buffers](memory/buffers.md)
- [Vertex Buffer](memory/vertex_buffer.md)
- [Command Block](memory/command_block.md)
- [Device Buffers](memory/device_buffers.md)
- [Images](memory/images.md)
5 changes: 5 additions & 0 deletions guide/src/memory/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Memory Allocation

Being an explicit API, [allocating memory](https://docs.vulkan.org/guide/latest/memory_allocation.html) in Vulkan that can be used by the device is the application's responsibility. The specifics can get quite complicated, but as recommended by the spec, we shall simply defer all that to a library: [Vulkan Memory Allocator (VMA)](https://github.com/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator).

Vulkan exposes two kinds of objects that use such allocated memory: Buffers and Images. VMA offers transparent support for both: we just have to allocate/free buffers and images through VMA instead of the device directly. Unlike memory allocation / object construction on the CPU, there are many more parameters (than, say, alignment and size) to provide for the creation of buffers and images. As you might have guessed, we shall constrain ourselves to a subset that's relevant for shader resources: vertex buffers, uniform/storage buffers, and texture images.
94 changes: 94 additions & 0 deletions guide/src/memory/buffers.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Buffers

First add the RAII wrapper components for VMA buffers:

```cpp
// Plain aggregate describing a VMA-allocated buffer and its bookkeeping.
// Destruction is delegated to BufferDeleter so this can be wrapped in a
// unique/scoped handle elsewhere.
struct RawBuffer {
  // View over the persistently mapped memory. Only meaningful for buffers
  // created with VMA_ALLOCATION_CREATE_MAPPED_BIT (host buffers); for
  // device-only buffers `mapped` is null and the span must not be used.
  [[nodiscard]] auto mapped_span() const -> std::span<std::byte> {
    return std::span{static_cast<std::byte*>(mapped), size};
  }

  auto operator==(RawBuffer const& rhs) const -> bool = default;

  VmaAllocator allocator{};   // allocator that owns `allocation`
  VmaAllocation allocation{}; // VMA allocation handle
  vk::Buffer buffer{};        // Vulkan buffer object
  vk::DeviceSize size{};      // size in bytes requested at creation
  void* mapped{};             // persistent mapping, or null if not host-visible
};

// Deleter: releases both the vk::Buffer and its VmaAllocation.
struct BufferDeleter {
  void operator()(RawBuffer const& raw_buffer) const noexcept;
};

// ...
void BufferDeleter::operator()(RawBuffer const& raw_buffer) const noexcept {
  // vmaDestroyBuffer destroys the buffer and frees the allocation in one call;
  // it is a no-op for null handles, so default-constructed RawBuffers are safe.
  vmaDestroyBuffer(raw_buffer.allocator, raw_buffer.buffer,
                   raw_buffer.allocation);
}
```

Buffers can be backed by host (RAM) or device (VRAM) memory: the former is mappable and thus useful for data that changes every frame, while the latter is faster for the GPU to access but requires more involved methods to copy data into. Add the related types and a create function:

```cpp
// Parameters shared by all buffer-creation calls.
struct BufferCreateInfo {
  VmaAllocator allocator;     // allocator to create the buffer with
  vk::BufferUsageFlags usage; // eg eVertexBuffer, eUniformBuffer, ...
  std::uint32_t queue_family; // queue family that will access the buffer
};

// Host: mappable (RAM-backed), for per-frame data. Device: VRAM-backed,
// faster for the GPU but requires a staged transfer to fill.
enum class BufferMemoryType : std::int8_t { Host, Device };

// Creates a buffer of `size` bytes; returns a null Buffer on failure.
[[nodiscard]] auto create_buffer(BufferCreateInfo const& create_info,
                                 BufferMemoryType memory_type,
                                 vk::DeviceSize size) -> Buffer;

// ...
// Creates a VMA-backed buffer. Host buffers are persistently mapped;
// Device buffers automatically gain TransferDst so they can be filled
// via a staging copy. Returns a default (null) Buffer on failure.
auto vma::create_buffer(BufferCreateInfo const& create_info,
                        BufferMemoryType const memory_type,
                        vk::DeviceSize const size) -> Buffer {
  // Vulkan forbids zero-sized buffers; fail early with a null handle.
  if (size == 0) {
    std::println(stderr, "Buffer cannot be 0-sized");
    return {};
  }

  auto allocation_ci = VmaAllocationCreateInfo{};
  // NOTE(review): this flag is set for BOTH memory types; with the AUTO
  // usage modes it asks VMA for host-visible memory even for Device
  // buffers (eg ReBAR) — confirm this is intended rather than Host-only.
  allocation_ci.flags =
      VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT;
  auto usage = create_info.usage;
  if (memory_type == BufferMemoryType::Device) {
    allocation_ci.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE;
    // device buffers need to support TransferDst.
    usage |= vk::BufferUsageFlagBits::eTransferDst;
  } else {
    allocation_ci.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST;
    // host buffers can provide mapped memory.
    allocation_ci.flags |= VMA_ALLOCATION_CREATE_MAPPED_BIT;
  }

  // Build the Vulkan-HPP create info, then reinterpret as the C struct
  // that vmaCreateBuffer expects.
  auto buffer_ci = vk::BufferCreateInfo{};
  buffer_ci.setQueueFamilyIndices(create_info.queue_family)
      .setSize(size)
      .setUsage(usage);
  auto vma_buffer_ci = static_cast<VkBufferCreateInfo>(buffer_ci);

  VmaAllocation allocation{};
  VkBuffer buffer{};
  auto allocation_info = VmaAllocationInfo{};
  auto const result =
      vmaCreateBuffer(create_info.allocator, &vma_buffer_ci, &allocation_ci,
                      &buffer, &allocation, &allocation_info);
  if (result != VK_SUCCESS) {
    std::println(stderr, "Failed to create VMA Buffer");
    return {};
  }

  // pMappedData is only non-null when MAPPED_BIT was requested (Host).
  return RawBuffer{
      .allocator = create_info.allocator,
      .allocation = allocation,
      .buffer = buffer,
      .size = size,
      .mapped = allocation_info.pMappedData,
  };
}
```
84 changes: 84 additions & 0 deletions guide/src/memory/command_block.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# Command Block

Long-lived vertex buffers perform better when backed by Device memory, especially for 3D meshes. Data is transferred to device buffers in two steps:

1. Allocate a host buffer and copy the data to its mapped memory
1. Allocate a device buffer, record a Buffer Copy operation and submit it

The second step requires a command buffer and queue submission (_and_ waiting for the submitted work to complete). Encapsulate this behavior into a class, it will also be used for creating images:

```cpp
// One-shot command recorder: allocates a primary command buffer from an
// existing (Transient) pool, begins recording in the constructor, and on
// submit_and_wait() submits to the queue and blocks until completion.
class CommandBlock {
 public:
  // `command_pool` must outlive this object; `queue` must belong to the
  // family the pool was created for.
  explicit CommandBlock(vk::Device device, vk::Queue queue,
                        vk::CommandPool command_pool);

  // Command buffer currently being recorded (valid until submit_and_wait()).
  [[nodiscard]] auto command_buffer() const -> vk::CommandBuffer {
    return *m_command_buffer;
  }

  // Ends recording, submits, waits on a fence, then frees the buffer.
  void submit_and_wait();

 private:
  vk::Device m_device{};
  vk::Queue m_queue{};
  vk::UniqueCommandBuffer m_command_buffer{};
};
```

The constructor takes an existing command pool created for such ad-hoc allocations, and the queue for submission later. This way it can be passed around after creation and used by other code.

```cpp
// Allocates one primary command buffer from `command_pool` and immediately
// begins recording with OneTimeSubmit (the buffer is used exactly once).
CommandBlock::CommandBlock(vk::Device const device, vk::Queue const queue,
                           vk::CommandPool const command_pool)
    : m_device(device), m_queue(queue) {
  // allocate a UniqueCommandBuffer which will free the underlying command
  // buffer from its owning pool on destruction.
  auto allocate_info = vk::CommandBufferAllocateInfo{};
  allocate_info.setCommandPool(command_pool)
      .setCommandBufferCount(1)
      .setLevel(vk::CommandBufferLevel::ePrimary);
  // all the current VulkanHPP functions for UniqueCommandBuffer allocation
  // return vectors.
  auto command_buffers = m_device.allocateCommandBuffersUnique(allocate_info);
  m_command_buffer = std::move(command_buffers.front());

  // start recording commands before returning.
  auto begin_info = vk::CommandBufferBeginInfo{};
  begin_info.setFlags(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
  m_command_buffer->begin(begin_info);
}
```

`submit_and_wait()` resets the unique command buffer at the end, to free it from its command pool:

```cpp
// Ends recording, submits the command buffer to the queue with a fresh
// fence, and blocks until the fence signals (30s timeout). The command
// buffer is then freed back to its pool; calling again is a no-op.
void CommandBlock::submit_and_wait() {
  if (!m_command_buffer) { return; }

  // end recording and submit.
  m_command_buffer->end();
  auto submit_info = vk::SubmitInfo2KHR{};
  auto const command_buffer_info =
      vk::CommandBufferSubmitInfo{*m_command_buffer};
  submit_info.setCommandBufferInfos(command_buffer_info);
  auto fence = m_device.createFenceUnique({});
  m_queue.submit2(submit_info, *fence);

  // wait for submit fence to be signaled.
  static constexpr auto timeout_v =
      static_cast<std::uint64_t>(std::chrono::nanoseconds(30s).count());
  auto const result = m_device.waitForFences(*fence, vk::True, timeout_v);
  if (result != vk::Result::eSuccess) {
    // the submit above succeeded; it is the fence wait that failed
    // (timeout or device loss) — report that, not a submission failure.
    // NOTE(review): on timeout the GPU may still be using the buffer;
    // freeing it below is then unsafe — confirm acceptable for this guide.
    std::println(stderr, "Failed to wait for Command Buffer fence");
  }
  // free the command buffer.
  m_command_buffer.reset();
}
```

## Multithreading considerations

Instead of blocking the main thread on every Command Block's `submit_and_wait()`, you might be wondering if command block usage could be multithreaded. The answer is yes! But with some extra work: each thread will require its own command pool - just using one owned (unique) pool per Command Block (with no need to free the buffer) is a good starting point. All queue operations need to be synchronized, i.e. made critical sections protected by a mutex. This includes Swapchain acquire/present calls, and Queue submissions. A `class Queue` value type that stores a copy of the `vk::Queue` and a pointer/reference to its `std::mutex` - and wraps the submit call - can be passed to command blocks. Just this much will enable asynchronous asset loading etc, as each loading thread will use its own command pool, and queue submissions all around will be critical sections. `VmaAllocator` is internally synchronized (this can be disabled at build time), so performing allocations through the same allocator on multiple threads is safe.

For multi-threaded rendering, use a Secondary command buffer per thread to record rendering commands, accumulate and execute them in the main (Primary) command buffer currently in `RenderSync`. This is not particularly helpful unless you have thousands of expensive draw calls and dozens of render passes, as recording even a hundred draws will likely be faster on a single thread.
133 changes: 133 additions & 0 deletions guide/src/memory/device_buffers.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
# Device Buffers

This guide will only use device buffers for vertex buffers, where both vertex and index data will be strung together in a single VBO. The create function can thus take the data and perform the buffer copy operation before returning. In essence this return value is a "GPU const" buffer. To enable utilizing separate spans for vertices and indices (instead of forcing allocation of a contiguous bytestream and copying the data), the create function takes a slightly awkward span of spans:

```cpp
// disparate byte spans: lets callers pass vertices and indices (or any
// number of segments) without first concatenating them into one buffer.
using ByteSpans = std::span<std::span<std::byte const> const>;

// returns a Device Buffer with each byte span sequentially written.
// Takes the CommandBlock by value: ownership is required so the staging
// buffer outlives the GPU copy (see create_device_buffer's definition).
[[nodiscard]] auto create_device_buffer(BufferCreateInfo const& create_info,
                                        CommandBlock command_block,
                                        ByteSpans const& byte_spans) -> Buffer;
```

Implement `create_device_buffer()`:

```cpp
// Creates a Device buffer and fills it: the byte spans are written
// back-to-back into a host staging buffer, then copied on the GPU via the
// CommandBlock, which is submitted and waited on before returning.
auto vma::create_device_buffer(BufferCreateInfo const& create_info,
                               CommandBlock command_block,
                               ByteSpans const& byte_spans) -> Buffer {
  // total size = sum of all span sizes (0uz keeps the accumulator size_t).
  auto const total_size = std::accumulate(
      byte_spans.begin(), byte_spans.end(), 0uz,
      [](std::size_t const n, std::span<std::byte const> bytes) {
        return n + bytes.size();
      });

  auto staging_ci = create_info;
  staging_ci.usage = vk::BufferUsageFlagBits::eTransferSrc;

  // create staging Host Buffer with TransferSrc usage.
  auto staging_buffer =
      create_buffer(staging_ci, BufferMemoryType::Host, total_size);
  // create the Device Buffer.
  auto ret = create_buffer(create_info, BufferMemoryType::Device, total_size);
  // can't do anything if either buffer creation failed.
  if (!staging_buffer.get().buffer || !ret.get().buffer) { return {}; }

  // copy byte spans into staging buffer, advancing past each segment.
  auto dst = staging_buffer.get().mapped_span();
  for (auto const bytes : byte_spans) {
    std::memcpy(dst.data(), bytes.data(), bytes.size());
    dst = dst.subspan(bytes.size());
  }

  // record buffer copy operation (whole-buffer copy, offsets default to 0).
  auto buffer_copy = vk::BufferCopy2{};
  buffer_copy.setSize(total_size);
  auto copy_buffer_info = vk::CopyBufferInfo2{};
  copy_buffer_info.setSrcBuffer(staging_buffer.get().buffer)
      .setDstBuffer(ret.get().buffer)
      .setRegions(buffer_copy);
  command_block.command_buffer().copyBuffer2(copy_buffer_info);

  // submit and wait.
  // waiting here is necessary to keep the staging buffer alive while the GPU
  // accesses it through the recorded commands.
  // this is also why the function takes ownership of the passed CommandBlock
  // instead of just referencing it / taking a vk::CommandBuffer.
  command_block.submit_and_wait();

  return ret;
}
```

Add a command block pool to `App`, and a helper function to create command blocks:

```cpp
// Creates the command pool dedicated to ad-hoc CommandBlock allocations,
// on the same queue family used for rendering/transfer.
void App::create_cmd_block_pool() {
  auto command_pool_ci = vk::CommandPoolCreateInfo{};
  command_pool_ci
      .setQueueFamilyIndex(m_gpu.queue_family)
      // this flag indicates that the allocated Command Buffers will be
      // short-lived.
      .setFlags(vk::CommandPoolCreateFlagBits::eTransient);
  m_cmd_block_pool = m_device->createCommandPoolUnique(command_pool_ci);
}

// Convenience factory: a CommandBlock begins recording on construction.
auto App::create_command_block() const -> CommandBlock {
  return CommandBlock{*m_device, m_queue, *m_cmd_block_pool};
}
```

Update `create_vertex_buffer()` to create a quad with indices:

```cpp
// Reinterprets any trivially-copyable value as an array of bytes.
template <typename T>
[[nodiscard]] constexpr auto to_byte_array(T const& t) {
  return std::bit_cast<std::array<std::byte, sizeof(T)>>(t);
}

// ...
// Builds the combined vertex+index Device buffer (VBO) for a quad.
void App::create_vertex_buffer() {
  // vertices of a quad.
  static constexpr auto vertices_v = std::array{
      Vertex{.position = {-0.5f, -0.5f}, .color = {1.0f, 0.0f, 0.0f}},
      Vertex{.position = {0.5f, -0.5f}, .color = {0.0f, 1.0f, 0.0f}},
      Vertex{.position = {0.5f, 0.5f}, .color = {0.0f, 0.0f, 1.0f}},
      Vertex{.position = {-0.5f, 0.5f}, .color = {1.0f, 1.0f, 0.0f}},
  };
  // two triangles; u32 to match vk::IndexType::eUint32 used in draw().
  static constexpr auto indices_v = std::array{
      0u, 1u, 2u, 2u, 3u, 0u,
  };
  static constexpr auto vertices_bytes_v = to_byte_array(vertices_v);
  static constexpr auto indices_bytes_v = to_byte_array(indices_v);
  static constexpr auto total_bytes_v =
      std::array<std::span<std::byte const>, 2>{
          vertices_bytes_v,
          indices_bytes_v,
      };
  // we want to write total_bytes_v to a Device VertexBuffer | IndexBuffer.
  // create_device_buffer() takes a BufferCreateInfo (see its declaration),
  // so assemble one — including the queue family — rather than passing
  // loose allocator/usage arguments, which does not match its signature.
  auto const buffer_ci = vma::BufferCreateInfo{
      .allocator = m_allocator.get(),
      .usage = vk::BufferUsageFlagBits::eVertexBuffer |
               vk::BufferUsageFlagBits::eIndexBuffer,
      .queue_family = m_gpu.queue_family,
  };
  m_vbo = vma::create_device_buffer(buffer_ci, create_command_block(),
                                    total_bytes_v);
}
```

Update `draw()`:

```cpp
// Records the quad draw: binds the shader, then the combined VBO as both
// vertex and index buffer (index data sits right after the 4 vertices).
void App::draw(vk::CommandBuffer const command_buffer) const {
  m_shader->bind(command_buffer, m_framebuffer_size);
  // single VBO at binding 0 at no offset.
  command_buffer.bindVertexBuffers(0, m_vbo.get().buffer, vk::DeviceSize{});
  // u32 indices after offset of 4 vertices.
  command_buffer.bindIndexBuffer(m_vbo.get().buffer, 4 * sizeof(Vertex),
                                 vk::IndexType::eUint32);
  // m_vbo has 6 indices.
  command_buffer.drawIndexed(6, 1, 0, 0, 0);
}
```

![VBO Quad](./vbo_quad.png)
Loading