
Commit 175d00e

fix up resource deallocation runtime errors, first draft of tutorial
1 parent a05a22b commit 175d00e

2 files changed, +126 −80 lines changed


gpu.h

+4 −31
@@ -129,7 +129,7 @@ struct Kernel {
   size_t outputSize;
   size_t numBuffers;
   size_t numInputs;
-  WGPUCommandBuffer commandBuffer;
+  WGPUCommandBuffer commandBuffer; // managed automatically by wgpuQueueSubmit
   WGPUBuffer readbackBuffer;
   CallbackDataDyn callbackData;
   std::promise<void> promise;
@@ -174,43 +174,16 @@ bool operator<(const Kernel &lhs, const Kernel &rhs) {
   return lhs.commandBuffer < rhs.commandBuffer;
 }
 
-void FreeKernel(Kernel *op) {
-  log(kDefLog, kInfo, "Freeing kernel");
-  // TODO(avh): nullptr is insufficient check for freeable resources
-  if (op->commandBuffer != nullptr) {
-    wgpuCommandBufferRelease(op->commandBuffer);
-  }
-  if (op->readbackBuffer != nullptr) {
-    wgpuBufferRelease(op->readbackBuffer);
-  }
-  if (op->callbackData.buffer != nullptr) {
-    wgpuBufferRelease(op->callbackData.buffer);
-  }
-}
-
-void FreeMultiKernel(MultiKernel *pipeline) {
-  log(kDefLog, kInfo, "Freeing multi kernel");
-  if (pipeline->commandBuffer) {
-    // wgpuCommandBufferRelease(pipeline->commandBuffer);
-  }
-  if (pipeline->readbackBuffer) {
-    // wgpuBufferRelease(pipeline->readbackBuffer);
-  }
-}
-
 struct KernelPool {
   KernelPool(GPUContext *ctx) : ctx(ctx), data() {}
   GPUContext *ctx;
   std::set<Kernel *> data;
   std::set<MultiKernel *> multiData;
   ~KernelPool() {
-    for (auto kernelPtr : data) {
-      FreeKernel(kernelPtr);
-    }
+    // Note: commandBuffer is destroyed upon queue submission,
+    // explicitly destroying readback and callback buffers
+    // produces runtime errors.
     data.clear();
-    for (MultiKernel *multiKernelPtr : multiData) {
-      FreeMultiKernel(multiKernelPtr);
-    }
     multiData.clear();
   }
 };
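
Context for the gpu.h change above: the commit treats the command buffer as managed by the queue once it has been submitted, and drops the explicit releases that were producing runtime errors at pool teardown. Below is a minimal sketch of that submission pattern using standard `webgpu.h` calls; the function and variable names are illustrative and not part of gpu.h.

```
// Sketch only: standard webgpu.h API, illustrative names, no error handling.
#include "webgpu/webgpu.h"

void submitOnce(WGPUDevice device, WGPUQueue queue) {
  WGPUCommandEncoderDescriptor encDesc = {};
  WGPUCommandEncoder encoder = wgpuDeviceCreateCommandEncoder(device, &encDesc);
  // ... encode a compute pass here ...
  WGPUCommandBufferDescriptor cmdDesc = {};
  WGPUCommandBuffer cmdBuffer = wgpuCommandEncoderFinish(encoder, &cmdDesc);
  wgpuQueueSubmit(queue, 1, &cmdBuffer); // hand the recorded work to the queue
  // Per the comment added in gpu.h above, the command buffer is treated as
  // managed automatically after submission, so ~KernelPool() no longer
  // releases it (or the readback/callback buffers) explicitly.
}
```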

run.cpp

+122 −49
@@ -24,13 +24,14 @@ void wait() {
 void section(const char *content) {
   fprintf(stdout, "\033[2J\033[1;1H"); // clear screen
   fprintf(stdout, "%s\n", kAsciiBanner);
-  fprintf(stdout, "================================================================================\n");
+  fprintf(stdout, "============================================================"
+                  "====================\n");
   fprintf(stdout, "%s\n", content);
   wait();
   // fprintf(stdout, "\033[4A\033[0J"); // clear lines
 }
 
-void runHelloGELU(GPUContext& ctx) {
+void runHelloGELU(GPUContext &ctx) {
   // Device code (runs on the GPU) using WGSL (WebGPU Shading Language)
   const char *kGELU = R"(
 const GELU_SCALING_FACTOR: f32 = 0.7978845608028654; // sqrt(2.0 / PI)
@@ -55,8 +56,7 @@ void runHelloGELU(GPUContext& ctx) {
   }
   GPUTensor input = CreateTensor(ctx, {N}, kf32, inputArr.data());
   GPUTensor output = CreateTensor(ctx, {N}, kf32, outputArr.data());
-  Kernel op =
-      CreateKernel(ctx, ShaderCode{kGELU, 256}, input, output);
+  Kernel op = CreateKernel(ctx, ShaderCode{kGELU, 256}, input, output);
   DispatchKernel(ctx, op);
   Wait(ctx, op.future);
   ToCPU(ctx, output, outputArr.data(), sizeof(outputArr));
@@ -152,7 +152,6 @@ Let's try running this.
   GPUContext ctx = CreateContext();
   runHelloGELU(ctx);
 
-
   section(R"(
 Design Objectives of gpu.cpp
 ----------------------------
@@ -196,11 +195,14 @@ when the GPU computation occurs:
 )");
 
   section(R"(
+Ahead-of-time GPU Resource Preparation
+--------------------------------------
 
+In the next sections, we'll look at the ahead-of-time GPU resource preparation.
 
-*Ahead-of-time GPU Resource Preparation*
-
-
+These are functions that acquire resources and prepare state for GPU
+computation. These are assumed to be less performance critical and not on hot
+code paths.
 )");
 
   section(R"(
@@ -245,7 +247,6 @@ for allocating and deallocating tensors data on the GPU. In practice
 
 )");
 
-
   section(R"(
 `CreateContext()` creates a GPUContext
 --------------------------------------
@@ -288,19 +289,56 @@ Let's try creating some data on the GPU now.
 
 )");
 
-  std::array<float, 3072> inputArr;
-  std::array<float, 3072> outputArr;
-  for (int i = 0; i < 3072; ++i) {
-    inputArr[i] = static_cast<float>(i); // dummy input data
-  }
-  GPUTensor input = CreateTensor(ctx, {3072}, kf32, inputArr.data());
-  GPUTensor output = CreateTensor(ctx, {3072}, kf32, outputArr.data());
+  std::array<float, 3072> inputArr;
+  std::array<float, 3072> outputArr;
+  for (int i = 0; i < 3072; ++i) {
+    inputArr[i] = static_cast<float>(i); // dummy input data
+  }
+  GPUTensor input = CreateTensor(ctx, {3072}, kf32, inputArr.data());
+  GPUTensor output = CreateTensor(ctx, {3072}, kf32, outputArr.data());
+
+  fprintf(stdout, "\nSuccessfully created input and output tensors.\n\n");
+  wait();
 
-  fprintf(stdout, "\nSuccessfully created input and output tensors.\n\n");
-  wait();
+  section(R"(
+Create a Kernel with `CreateKernel()`
+-------------------------------------
 
+Reviewing our GELU example and after using `CreateTensor()` to allocate and
+bind buffers for input and output data, we can use `CreateKernel()` to create a
+kernel.
 
-  section(R"(
+```
+// Previously: Create the input and output tensors
+GPUTensor input = CreateTensor(ctx, {N}, kf32, inputArr.data());
+GPUTensor output = CreateTensor(ctx, {N}, kf32, outputArr.data());
+
+// ...
+
+Kernel op =
+    CreateKernel(ctx, ShaderCode{kGELU, 256}, input, output);
+```
+
+Note this *does not run* the kernel, it just prepares the kernel as a resource
+to be dispatched later.
+
+There are four arguments to `CreateKernel()`:
+- `GPUContext` - the context for the GPU
+- `ShaderCode` - the shader code for the kernel
+- `GPUTensor` - the input tensor. Even though the kernel is not executed,
+GPUTensor provides a handle to the buffers on the GPU to be loaded when the
+kernel is run. If there's more than one input, `GPUTensors` can be used which
+is an ordered collection of `GPUTensor`.
+- `GPUTensor` - the output tensor. As with the input tensor, the values are not
+important at this point, the underlying reference to the GPU buffer is bound to
+the kernel so that when the kernel is dispatched, it will know where to write
+the output data.
+
+The kGELU string that goes into ShaderCode is the WGSL shader code for the
+kernel. We'll look at this next.
+)");
+
+  section(R"(
 WGSL Compute Kernels are Programs that run Computation on the GPU
 ------------------------------------------------------------------
 
@@ -337,58 +375,93 @@ that this is a compute kernel. The `@workgroup_size(256)` annotation specifies
 the workgroup size for the kernel.
 )");
 
-  section(R"(
-`CreateKernel()` is used to create a Kernel
--------------------------------------------
+  section(R"(
+Performance critical dispatch of GPU computation
+------------------------------------------------
 
-Reviewing our GELU example and after using `CreateTensor()` to allocate and
-bind buffers for input and output data, we can use `CreateKernel()` to create a
-kernel.
+The past few sections have covered the ahead-of-time GPU resource preparation
+consisting of `Create*()` and supporting functions.
+
+None of these actually execute computation on the GPU yet.
+
+Next we'll look at the dispatch functions, which asynchronously dispatch the
+kernel for execution.
+)");
+
+
+  section(R"(
+Dispatch a kernel for execution with `DispatchKernel()`
+------------------------------------------------------
+
+After creating a kernel, you can dispatch it for execution on the GPU using
+`DispatchKernel()`.
 
 ```
-GPUTensor input = CreateTensor(ctx, {N}, kf32, inputArr.data());
-GPUTensor output = CreateTensor(ctx, {N}, kf32, outputArr.data());
+// Previously: Create the kernel
 Kernel op =
     CreateKernel(ctx, ShaderCode{kGELU, 256}, input, output);
+
+// ...
+
+DispatchKernel(ctx, op);
+Wait(ctx, op.future);
+ToCPU(ctx, output, outputArr.data(), sizeof(outputArr));
+}
 ```
 
-Note this *does not run* the kernel, it just prepares the kernel as a resource
-to be dispatched later.
+Note that the kernel is executed asynchronously on the GPU, in other words,
+execution will continue on the CPU while the GPU is running the kernel.
 
-There are four arguments to `CreateKernel()`:
-- `GPUContext` - the context for the GPU
-- `ShaderCode` - the shader code for the kernel
-- `GPUTensor` - the input tensor. Even though the kernel is not executed,
-GPUTensor provides a handle to the buffers on the GPU to be loaded when the
-kernel is run. If there's more than one input, `GPUTensors` can be used which
-is an ordered collection of `GPUTensor`.
-- `GPUTensor` - the output tensor. As with the input tensor, the values are not
-important at this point, the underlying reference to the GPU buffer is bound to
-the kernel so that when the kernel is dispatched, it will know where to write
-the output data.
+To wait for the kernel to finish, you can use `Wait(ctx, op.future)`. This will
+block until the kernel has finished executing.
 
-)");
+Note the output of the kernel (if any) is written to the output tensor on the
+GPU. It is not copied back to the CPU by default until you call `ToCPU()` to
+copy the data back.
 
+This is intentional to allow for efficient pipelining of GPU computation and
+reusing GPU resources without copying data back and forth unless it's specified.
+)");
 
   section(R"(
-Dispatching a kernel
-------------------
+Dispatch multiple kernels for execution with `DispatchMultiKernel()`
+---------------------------------------------------------------------
+
+If you have multiple kernels to dispatch, you can use `CreateMultiKernel()` and
+`DispatchMultiKernel()`.
+
+These create and dispatch multiple kernels together and are similar to
+`CreateKernel()` and `DispatchKernel()`, but with multiple kernels and multiple
+inputs per kernel.
+
+With a more complex input signature, `CreateMultiKernel()` takes a structured
+input type `MultiKernelDesc` that specifies the kernels and their inputs. But
+otherwise usage is similar.
 
-TODO(avh)
+Note that inputs can even be shared between kernels, allowing for building
+complex computation graphs with shared inputs between them.
 )");
 
 
   section(R"(
 gpu.cpp vs. the raw WebGPU API
 ------------------------------
 
-The main responsibility of the types and functions of the library is to make
-it trivial to represent these common building blocks of computation
+The main responsibility of the types and functions of the library is to make it
+simple to represent these common building blocks of computation.
+
+If you look at `examples/webgpu_intro/run.cpp` you can learn more about what
+it's like to interact directly with the WebGPU API.
+)");
+
+  section(R"(
+That's it for the introduction to gpu.cpp.
+
+Have fun and let us know if you have any questions or feedback!
 
-If you look at `examples/webgpu_intro/run.cpp` you can get a sense of what it's
-like to interact directly with the WebGPU.
+We're happy to collaborate with contributors or hear what you're building with
+gpu.cpp.
 )");
 
-  fprintf(stdout, "Goodbye!\n");
   return 0;
 }
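
Taken together, the dispatch sections above describe an asynchronous flow. A condensed sketch of that flow follows, reusing only calls and types that appear in this file; `kGELU` and the 3072-element arrays are the ones from `runHelloGELU()` above.

```
// Condensed sketch of the flow walked through in the tutorial sections above.
std::array<float, 3072> inputArr, outputArr;
for (int i = 0; i < 3072; ++i) {
  inputArr[i] = static_cast<float>(i); // dummy input data
}
GPUContext ctx = CreateContext();
GPUTensor input = CreateTensor(ctx, {3072}, kf32, inputArr.data());
GPUTensor output = CreateTensor(ctx, {3072}, kf32, outputArr.data());
Kernel op = CreateKernel(ctx, ShaderCode{kGELU, 256}, input, output);
DispatchKernel(ctx, op); // asynchronous: CPU execution continues immediately
// ... other CPU-side work can overlap with the kernel here ...
Wait(ctx, op.future);    // block until the kernel has finished
ToCPU(ctx, output, outputArr.data(), sizeof(outputArr)); // explicit readback
```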
