Clean up API naming for consistency: Prepare -> Create, Launch -> Dispatch

austinvhuang · austinvhuang · commit 2223ea40d635 · 2024-06-11T13:39:01.000-04:00
diff --git a/README.md b/README.md
@@ -64,8 +64,8 @@ int main(int argc, char **argv) {
   GPUTensor input = CreateTensor(ctx, {N}, kf32, inputArr.data());
   GPUTensor output = CreateTensor(ctx, {N}, kf32, outputArr.data());
   Kernel op =
-      PrepareKernel(ctx, kGELU, std::array{input}, output);
-  LaunchKernel(ctx, op);
+      CreateKernel(ctx, kGELU, std::array{input}, output);
+  DispatchKernel(ctx, op);
   Wait(ctx, op.future);
   ToCPU(ctx, output, outputArr.data(), sizeof(outputArr));
   for (int i = 0; i < 10; ++i) {
@@ -77,7 +77,7 @@ int main(int argc, char **argv) {
 ```
 
 For those curious about what happens under the hood with the raw WebGPU API,
-the equivalent functionality is implemented using the raw WebGPU C API in
+the equivalent functionality is implemented using the WebGPU C API in
 `examples/webgpu_intro/run.cpp`.
 
 ## Quick Start
diff --git a/examples/hello_world/run.cpp b/examples/hello_world/run.cpp
@@ -18,8 +18,8 @@ int main(int argc, char **argv) {
   GPUTensor output = CreateTensor(ctx, {N}, kf32, outputArr.data());
 
   Kernel op =
-      PrepareKernel(ctx, GeluShader(256, kf32), std::array{input}, output);
-  LaunchKernel(ctx, op);
+      CreateKernel(ctx, GeluShader(256, kf32), std::array{input}, output);
+  DispatchKernel(ctx, op);
   Wait(ctx, op.future);
   ToCPU(ctx, output, outputArr.data(), sizeof(outputArr));
   for (int i = 0; i < 10; ++i) {
diff --git a/examples/raymarch/run.cpp b/examples/raymarch/run.cpp
@@ -78,8 +78,8 @@ int main(int argc, char **argv) {
 
   GPUContext ctx = CreateGPUContext();
   GPUTensor devScreen = CreateTensor(ctx, {NROWS, NCOLS}, kf32, screen.data());
-  Kernel render = PrepareKernel(ctx, ShaderCode{kSDF, 64}, {}, 0, devScreen, params);
-  LaunchKernel(ctx, render);
+  Kernel render = CreateKernel(ctx, ShaderCode{kSDF, 64}, {}, 0, devScreen, params);
+  DispatchKernel(ctx, render);
   Wait(ctx, render.future);
   ToCPU(ctx, devScreen, screen.data(), sizeof(screen));
 
diff --git a/gpu.h b/gpu.h
@@ -440,7 +440,7 @@ void ToGPU(GPUContext &ctx, const float *data, GPUTensor &tensor) {
                        tensor.data.size);
 }
 
-Kernel PrepareKernel(GPUContext &ctx, const ShaderCode &shader,
+Kernel CreateKernel(GPUContext &ctx, const ShaderCode &shader,
                      const GPUTensor *inputs, size_t numInputs,
                      const GPUTensor &output, const void *params = nullptr,
                      size_t paramsSize = 0) {
@@ -628,39 +628,39 @@ Kernel PrepareKernel(GPUContext &ctx, const ShaderCode &shader,
     check(op.commandBuffer, "Create command buffer", __FILE__, __LINE__);
   }
 
-  log(kDefLog, kInfo, "Exiting PrepareKernel");
+  log(kDefLog, kInfo, "Exiting CreateKernel");
   return op;
 }
 
 template <typename ParamsType = NoParam>
-Kernel PrepareKernel(GPUContext &ctx, const ShaderCode &shader,
+Kernel CreateKernel(GPUContext &ctx, const ShaderCode &shader,
                      const GPUTensor *inputs, size_t numInputs,
                      const GPUTensor &output,
                      const ParamsType &params = ParamsType{}) {
   if constexpr (!IsNoParam<ParamsType>) {
     log(kDefLog, kInfo, "Using params of size %d bytes", sizeof(ParamsType));
-    return PrepareKernel(ctx, shader, inputs, numInputs, output,
+    return CreateKernel(ctx, shader, inputs, numInputs, output,
                          reinterpret_cast<const void *>(&params),
                          sizeof(ParamsType));
   } else {
     log(kDefLog, kInfo, "No params");
-    return PrepareKernel(ctx, shader, inputs, numInputs, output, nullptr, 0);
+    return CreateKernel(ctx, shader, inputs, numInputs, output, nullptr, 0);
   }
 }
 
 /*
- * PrepareKernel with array of inputs (convienence function)
+ * CreateKernel with array of inputs (convenience function)
  */
 template <typename ParamsType = NoParam, size_t numInputs>
-Kernel PrepareKernel(GPUContext &ctx, const ShaderCode &shader,
+Kernel CreateKernel(GPUContext &ctx, const ShaderCode &shader,
                      const std::array<GPUTensor, numInputs> &inputs,
                      const GPUTensor &output,
                      const ParamsType &params = ParamsType{}) {
-  return PrepareKernel<ParamsType>(ctx, shader, inputs.data(), numInputs,
+  return CreateKernel<ParamsType>(ctx, shader, inputs.data(), numInputs,
                                    output, params);
 }
 
-MultiKernel PrepareMultiKernel(GPUContext &ctx, const MultiKernelDesc &desc) {
+MultiKernel CreateMultiKernel(GPUContext &ctx, const MultiKernelDesc &desc) {
   WGPUDevice device = ctx.device;
   WGPUQueue queue = ctx.queue;
   MultiKernel pipeline;
@@ -849,7 +849,7 @@ MultiKernel PrepareMultiKernel(GPUContext &ctx, const MultiKernelDesc &desc) {
   return pipeline;
 }
 
-void LaunchKernel(GPUContext &ctx, Kernel &op) {
+void DispatchKernel(GPUContext &ctx, Kernel &op) {
   // Submit the command buffer
   wgpuQueueSubmit(ctx.queue, 1, &op.commandBuffer);
 
@@ -870,7 +870,7 @@ void LaunchKernel(GPUContext &ctx, Kernel &op) {
       &op.callbackData);
 }
 
-void LaunchMultiKernel(GPUContext &ctx, MultiKernel &pipeline) {
+void DispatchMultiKernel(GPUContext &ctx, MultiKernel &pipeline) {
   wgpuQueueSubmit(ctx.queue, 1, &pipeline.commandBuffer);
 
   pipeline.callbackData = CallbackDataDyn{
diff --git a/run.cpp b/run.cpp
@@ -76,9 +76,9 @@ The core verbs (functions) of interest are:
 
 - *Requesting GPU Resources* - CreateGPUContext(), CreateArray() and
   CreateTensor() 
-- *Ahead-of-Time Preparation of a Computation* - PrepareKernel() which both binds
+- *Ahead-of-Time Preparation of a Computation* - CreateKernel() which both binds
   resources and compiles the kernel 
-- *Asynchronous Execution of Computation* - LaunchKernel(), Wait()
+- *Asynchronous Execution of Computation* - DispatchKernel(), Wait()
 - *Data Movement* - ToCPU(), ToGPU(), also CreateArray and CreateTensor have
   convenience overloads that take CPU data directly as part of instantiation.
 
@@ -186,15 +186,15 @@ workgroup size is the number of threads in a workgroup.
 )");
 
 section(R"(
-Preparing a kernel
+Creating a kernel
 ------------------
 
 TODO(avh)
 )");
 
 
 section(R"(
-Launching a kernel
+Dispatching a kernel
 ------------------
 
 TODO(avh)
diff --git a/utils/test_kernels.cpp b/utils/test_kernels.cpp
@@ -33,10 +33,10 @@ void TestResidual(GPUContext &ctx) {
   GPUTensor output = CreateTensor(ctx, {N}, kf32, outputArr.data());
   ShaderCode shaderCode = ResidualShader(workgroupSize, kf32);
   log(kDefLog, kInfo, "Shader Code :\n%s", shaderCode.code.c_str());
-  Kernel op = PrepareKernel<NoParam, 2>(
+  Kernel op = CreateKernel<NoParam, 2>(
       ctx, ResidualShader(workgroupSize, kf32),
       std::array<GPUTensor, 2>{input1, input2}, output, {});
-  LaunchKernel(ctx, op);
+  DispatchKernel(ctx, op);
   Wait(ctx, op.future);
   ToCPU(ctx, output, outputArr.data(), sizeof(outputArr));
   log(kDefLog, kInfo, "%s", show<float, N, 1>(outputArr, "Output").c_str());
@@ -56,9 +56,9 @@ void TestHadamard(GPUContext &ctx) {
   GPUTensor output = CreateTensor(ctx, {N}, kf32, outputArr.data());
   ShaderCode shaderCode = HadamardShader(workgroupSize, kf32);
   log(kDefLog, kInfo, "Shader Code :\n%s", shaderCode.code.c_str());
-  Kernel op = PrepareKernel(ctx, HadamardShader(workgroupSize, kf32),
+  Kernel op = CreateKernel(ctx, HadamardShader(workgroupSize, kf32),
                             std::array{input1, input2}, output, {});
-  LaunchKernel(ctx, op);
+  DispatchKernel(ctx, op);
   Wait(ctx, op.future);
   log(kDefLog, kInfo, "%s", show<float, N, 1>(outputArr, "Output").c_str());
 }
@@ -77,9 +77,9 @@ void TestMatmul(GPUContext &ctx) {
   GPUTensor input2 = CreateTensor(ctx, {K, N}, kf32, input2Arr.data());
   GPUTensor output = CreateTensor(ctx, {M, N}, kf32, outputArr.data());
   Kernel op =
-      PrepareKernel(ctx, MatmulShader(256, kShaderMatMul1, kf32, M, K, N),
+      CreateKernel(ctx, MatmulShader(256, kShaderMatMul1, kf32, M, K, N),
                     std::array{input1, input2}, output);
-  LaunchKernel(ctx, op);
+  DispatchKernel(ctx, op);
   Wait(ctx, op.future);
   ToCPU(ctx, output, outputArr.data(), sizeof(outputArr));
   log(kDefLog, kInfo, "%s", show<float, M, K>(input1Arr, "A").c_str());
@@ -132,9 +132,9 @@ void TestGelu(GPUContext &ctx) {
   GPUTensor geluOut = CreateTensor(ctx, {N}, kf32, outputArr.data());
   log(kDefLog, kInfo, "Creating GELU Shader");
   Kernel op =
-      PrepareKernel(ctx, GeluShader(256, kf32), std::array{geluIn}, geluOut);
-  log(kDefLog, kInfo, "Launching GELU Shader");
-  LaunchKernel(ctx, op);
+      CreateKernel(ctx, GeluShader(256, kf32), std::array{geluIn}, geluOut);
+  log(kDefLog, kInfo, "Dispatching GELU Shader");
+  DispatchKernel(ctx, op);
   Wait(ctx, op.future);
   ToCPU(ctx, geluOut, outputArr.data(), sizeof(outputArr));
   log(kDefLog, kInfo, "%s", show<float, N, 1>(inputArr, "GELU Input").c_str());
@@ -169,10 +169,10 @@ void TestLayerNorm(GPUContext &ctx) {
   GPUTensor weight = CreateTensor(ctx, {C}, kf32, weightArr.data());
   GPUTensor bias = CreateTensor(ctx, {C}, kf32, biasArr.data());
   GPUTensor output = CreateTensor(ctx, {N, C}, kf32, outputArr.data());
-  Kernel op = PrepareKernel<LNParam, 3>(ctx, LayerNormShader(256, kf32),
+  Kernel op = CreateKernel<LNParam, 3>(ctx, LayerNormShader(256, kf32),
                                         std::array{input, weight, bias}, output,
                                         params);
-  LaunchKernel(ctx, op);
+  DispatchKernel(ctx, op);
   Wait(ctx, op.future);
   ToCPU(ctx, output, outputArr.data(), sizeof(outputArr));
   log(kDefLog, kInfo, "%s",
@@ -212,9 +212,9 @@ void TestSoftmax(GPUContext &ctx) {
   randint(inputArr, gen, 0, 3);
   GPUTensor input = CreateTensor(ctx, {B, T, C}, kf32, inputArr.data());
   GPUTensor output = CreateTensor(ctx, {B, T, C}, kf32, outputArr.data());
-  Kernel op = PrepareKernel<SoftmaxParam, 1>(ctx, SoftmaxShader(256, kf32),
+  Kernel op = CreateKernel<SoftmaxParam, 1>(ctx, SoftmaxShader(256, kf32),
                                              {input}, output, {B * T, C});
-  LaunchKernel(ctx, op);
+  DispatchKernel(ctx, op);
   Wait(ctx, op.future);
   ToCPU(ctx, output, outputArr.data(), sizeof(outputArr));
   log(kDefLog, kInfo, "%s",
@@ -274,8 +274,8 @@ void TestMultiKernel1(GPUContext &ctx) {
       .params = &param,
       .paramSizes = &size,
   };
-  MultiKernel pipeline = PrepareMultiKernel(ctx, desc);
-  LaunchMultiKernel(ctx, pipeline);
+  MultiKernel pipeline = CreateMultiKernel(ctx, desc);
+  DispatchMultiKernel(ctx, pipeline);
   Wait(ctx, pipeline.future);
   ToCPU(ctx, output, outputArr.data(), sizeof(outputArr));
   log(kDefLog, kInfo, "%s",
@@ -328,8 +328,8 @@ void TestMultiKernel2(GPUContext &ctx) {
       .params = params.data(),
       .paramSizes = paramSizes.data(),
   };
-  MultiKernel pipeline = PrepareMultiKernel(ctx, desc);
-  LaunchMultiKernel(ctx, pipeline);
+  MultiKernel pipeline = CreateMultiKernel(ctx, desc);
+  DispatchMultiKernel(ctx, pipeline);
   Wait(ctx, pipeline.future);
 
   log(kDefLog, kInfo, "%s",