@@ -24,13 +24,14 @@ void wait() {
void section(const char *content) {
  fprintf(stdout, "\033[2J\033[1;1H"); // clear screen
  fprintf(stdout, "%s\n", kAsciiBanner);
-  fprintf(stdout, "================================================================================\n");
+  fprintf(stdout, "============================================================"
+                  "====================\n");
  fprintf(stdout, "%s\n", content);
  wait();
  // fprintf(stdout, "\033[4A\033[0J"); // clear lines
}

-void runHelloGELU(GPUContext& ctx) {
+void runHelloGELU(GPUContext &ctx) {
  // Device code (runs on the GPU) using WGSL (WebGPU Shading Language)
  const char *kGELU = R"(
const GELU_SCALING_FACTOR: f32 = 0.7978845608028654; // sqrt(2.0 / PI)
@@ -55,8 +56,7 @@ void runHelloGELU(GPUContext& ctx) {
  }
  GPUTensor input = CreateTensor(ctx, {N}, kf32, inputArr.data());
  GPUTensor output = CreateTensor(ctx, {N}, kf32, outputArr.data());
-  Kernel op =
-      CreateKernel(ctx, ShaderCode{kGELU, 256}, input, output);
+  Kernel op = CreateKernel(ctx, ShaderCode{kGELU, 256}, input, output);
  DispatchKernel(ctx, op);
  Wait(ctx, op.future);
  ToCPU(ctx, output, outputArr.data(), sizeof(outputArr));
@@ -152,7 +152,6 @@ Let's try running this.
  GPUContext ctx = CreateContext();
  runHelloGELU(ctx);

-
  section(R"(
Design Objectives of gpu.cpp
----------------------------
@@ -196,11 +195,14 @@ when the GPU computation occurs:
)");

  section(R"(
+Ahead-of-time GPU Resource Preparation
+--------------------------------------

+In the next sections, we'll look at ahead-of-time GPU resource preparation.
-*Ahead-of-time GPU Resource Preparation*
-
-
+These are functions that acquire resources and prepare state for GPU
+computation. They are assumed to be less performance-critical and not on hot
+code paths.
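+
+As a quick preview, the preparation calls covered next look like this
+(a sketch assembled from the GELU example above):
+
+```
+GPUContext ctx = CreateContext();
+GPUTensor input = CreateTensor(ctx, {3072}, kf32, inputArr.data());
+GPUTensor output = CreateTensor(ctx, {3072}, kf32, outputArr.data());
+Kernel op = CreateKernel(ctx, ShaderCode{kGELU, 256}, input, output);
+```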
)");

  section(R"(
@@ -245,7 +247,6 @@ for allocating and deallocating tensors data on the GPU. In practice
)");

-
  section(R"(
`CreateContext()` creates a GPUContext
--------------------------------------
@@ -288,19 +289,56 @@ Let's try creating some data on the GPU now.
)");

-  std::array<float, 3072> inputArr;
-  std::array<float, 3072> outputArr;
-  for (int i = 0; i < 3072; ++i) {
-    inputArr[i] = static_cast<float>(i); // dummy input data
-  }
-  GPUTensor input = CreateTensor(ctx, {3072}, kf32, inputArr.data());
-  GPUTensor output = CreateTensor(ctx, {3072}, kf32, outputArr.data());
+  std::array<float, 3072> inputArr;
+  std::array<float, 3072> outputArr;
+  for (int i = 0; i < 3072; ++i) {
+    inputArr[i] = static_cast<float>(i); // dummy input data
+  }
+  GPUTensor input = CreateTensor(ctx, {3072}, kf32, inputArr.data());
+  GPUTensor output = CreateTensor(ctx, {3072}, kf32, outputArr.data());
+
+  fprintf(stdout, "\nSuccessfully created input and output tensors.\n\n");
+  wait();

-  fprintf(stdout, "\nSuccessfully created input and output tensors.\n\n");
-  wait();
+  section(R"(
+Create a Kernel with `CreateKernel()`
+-------------------------------------

+Reviewing our GELU example: after using `CreateTensor()` to allocate and bind
+buffers for input and output data, we can use `CreateKernel()` to create a
+kernel.

-  section(R"(
+```
+// Previously: Create the input and output tensors
+GPUTensor input = CreateTensor(ctx, {N}, kf32, inputArr.data());
+GPUTensor output = CreateTensor(ctx, {N}, kf32, outputArr.data());
+
+// ...
+
+Kernel op =
+    CreateKernel(ctx, ShaderCode{kGELU, 256}, input, output);
+```
+
+Note this *does not run* the kernel; it just prepares the kernel as a resource
+to be dispatched later.
+
+There are four arguments to `CreateKernel()`:
+- `GPUContext` - the context for the GPU
+- `ShaderCode` - the shader code for the kernel
+- `GPUTensor` - the input tensor. Even though the kernel is not executed,
+`GPUTensor` provides a handle to the buffers on the GPU to be loaded when the
+kernel is run. If there's more than one input, `GPUTensors` can be used, which
+is an ordered collection of `GPUTensor` (see the sketch after this list).
+- `GPUTensor` - the output tensor. As with the input tensor, the values are not
+important at this point; the underlying reference to the GPU buffer is bound to
+the kernel so that when the kernel is dispatched, it will know where to write
+the output data.
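+
+For instance, a two-input kernel might be created like this (a sketch; the
+`GPUTensors{...}` initializer, the tensors `a`, `b`, and `out`, and the
+`kShader` string are illustrative, based on the description above):
+
+```
+GPUTensor a = CreateTensor(ctx, {N}, kf32, aArr.data());
+GPUTensor b = CreateTensor(ctx, {N}, kf32, bArr.data());
+GPUTensor out = CreateTensor(ctx, {N}, kf32, outArr.data());
+Kernel op2 = CreateKernel(ctx, ShaderCode{kShader, 256}, GPUTensors{a, b}, out);
+```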
+
+The `kGELU` string that goes into `ShaderCode` is the WGSL shader code for the
+kernel. We'll look at this next.
+)");
+
+  section(R"(
WGSL Compute Kernels are Programs that run Computation on the GPU
------------------------------------------------------------------
@@ -337,58 +375,93 @@ that this is a compute kernel. The `@workgroup_size(256)` annotation specifies
the workgroup size for the kernel.
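+
+For reference, the overall shape of such a kernel looks like this (a sketch
+echoing the GELU shader above; the exact binding declarations are
+illustrative):
+
+```
+@group(0) @binding(0) var<storage, read_write> inp: array<f32>;
+@group(0) @binding(1) var<storage, read_write> out: array<f32>;
+
+@compute @workgroup_size(256)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let i: u32 = gid.x;
+    if (i < arrayLength(&inp)) {
+        out[i] = inp[i]; // per-element computation goes here
+    }
+}
+```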
)");

-  section(R"(
-`CreateKernel()` is used to create a Kernel
--------------------------------------------
+  section(R"(
+Performance-critical dispatch of GPU computation
+------------------------------------------------

-Reviewing our GELU example and after using `CreateTensor()` to allocate and
-bind buffers for input and output data, we can use `CreateKernel()` to create a
-kernel.
+The past few sections have covered the ahead-of-time GPU resource preparation,
+consisting of `Create*()` and supporting functions.
+
+None of these actually execute computation on the GPU yet.
+
+Next we'll look at the dispatch functions, which asynchronously dispatch a
+kernel for execution.
+)");
+
+
+  section(R"(
+Dispatch a kernel for execution with `DispatchKernel()`
+-------------------------------------------------------
+
+After creating a kernel, you can dispatch it for execution on the GPU using
+`DispatchKernel()`.

```
-GPUTensor input = CreateTensor(ctx, {N}, kf32, inputArr.data());
-GPUTensor output = CreateTensor(ctx, {N}, kf32, outputArr.data());
+// Previously: Create the kernel
Kernel op =
    CreateKernel(ctx, ShaderCode{kGELU, 256}, input, output);
+
+// ...
+
+DispatchKernel(ctx, op);
+Wait(ctx, op.future);
+ToCPU(ctx, output, outputArr.data(), sizeof(outputArr));
```

-Note this *does not run* the kernel, it just prepares the kernel as a resource
-to be dispatched later.
+Note that the kernel is executed asynchronously on the GPU; in other words,
+execution will continue on the CPU while the GPU is running the kernel.
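+
+For instance, independent CPU-side work can overlap with the GPU computation
+(a sketch; `prepareNextInput()` is a hypothetical placeholder for CPU work):
+
+```
+DispatchKernel(ctx, op);
+prepareNextInput(); // hypothetical CPU work, overlaps with the GPU kernel
+Wait(ctx, op.future);
+```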

-There are four arguments to `CreateKernel()`:
-- `GPUContext` - the context for the GPU
-- `ShaderCode` - the shader code for the kernel
-- `GPUTensor` - the input tensor. Even though the kernel is not executed,
-GPUTensor provides a handle to the buffers on the GPU to be loaded when the
-kernel is run. If there's more than one input, `GPUTensors` can be used which
-is an ordered collection of `GPUTensor`.
-- `GPUTensor` - the output tensor. As with the input tensor, the values are not
-important at this point, the underlying reference to the GPU buffer is bound to
-the kernel so that when the kernel is dispatched, it will know where to write
-the output data.
+To wait for the kernel to finish, you can use `Wait(ctx, op.future)`. This will
+block until the kernel has finished executing.

-)");
+Note that the output of the kernel (if any) is written to the output tensor on
+the GPU. It is not copied back to the CPU by default until you call `ToCPU()`
+to copy the data back.
+
+This is intentional, to allow for efficient pipelining of GPU computation and
+reuse of GPU resources without copying data back and forth unless requested.
+)");

  section(R"(
-Dispatching a kernel
-------------------
+Dispatch multiple kernels for execution with `DispatchMultiKernel()`
+--------------------------------------------------------------------
+
+If you have multiple kernels to dispatch, you can use `CreateMultiKernel()` and
+`DispatchMultiKernel()`.
+
+These create and dispatch multiple kernels together. They are similar to
+`CreateKernel()` and `DispatchKernel()`, but support multiple kernels and
+multiple inputs per kernel.
+
+With a more complex input signature, `CreateMultiKernel()` takes a structured
+input type `MultiKernelDesc` that specifies the kernels and their inputs, but
+otherwise usage is similar.
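+
+A sketch of what this might look like (the exact shape of `MultiKernelDesc`
+and the `MultiKernel` handle here are assumptions for illustration; check the
+gpu.cpp headers for the actual signatures):
+
+```
+MultiKernelDesc desc = { /* kernels, shaders, and their inputs */ };
+MultiKernel ops = CreateMultiKernel(ctx, desc);
+DispatchMultiKernel(ctx, ops);
+```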

-TODO(avh)
+Note that inputs can even be shared between kernels, allowing you to build
+complex computation graphs with shared inputs.
)");


  section(R"(
gpu.cpp vs. the raw WebGPU API
------------------------------

-The main responsibility of the types and functions of the library is to make
-it trivial to represent these common building blocks of computation
+The main responsibility of the library's types and functions is to make it
+simple to represent these common building blocks of computation.
+
+If you look at `examples/webgpu_intro/run.cpp` you can learn more about what
+it's like to interact directly with the WebGPU API.
+)");
+
+  section(R"(
+That's it for the introduction to gpu.cpp.
+
+Have fun and let us know if you have any questions or feedback!

-If you look at `examples/webgpu_intro/run.cpp` you can get a sense of what it's
-like to interact directly with the WebGPU.
+We're happy to collaborate with contributors and to hear what you're building
+with gpu.cpp.
)");

-  fprintf(stdout, "Goodbye!\n");
  return 0;
}