
Commit bf71d53

Simplify CreateKernel overloads, move shaders.h to utils/
1 parent f22c664 · commit bf71d53

5 files changed: +34 -91 lines

CMakeLists.txt

+2 -2

@@ -72,15 +72,15 @@ endif()
 # Build the library target (libgpu)

 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
-set(SRC_LIB gpu.h nn/shaders.h utils/array_utils.h utils/logging.h)
+set(SRC_LIB gpu.h utils/shaders.h utils/array_utils.h utils/logging.h)
 add_library(gpu SHARED ${SRC_LIB})
 set_target_properties(gpu PROPERTIES LINKER_LANGUAGE CXX)

 # For additional targets see directories under `examples/`, which have their own CMakeLists.txt

 # Test of basic kernels

-set(SRC_TESTS utils/test_kernels.cpp gpu.h nn/shaders.h utils/array_utils.h utils/logging.h)
+set(SRC_TESTS utils/test_kernels.cpp gpu.h utils/shaders.h utils/array_utils.h utils/logging.h)
 add_executable(run_tests ${SRC_TESTS})
 target_link_libraries(run_tests PRIVATE ${LIBDL} ${CMAKE_DL_LIBS} webgpu)
 target_include_directories(run_tests PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})

README.md

+19 -34

@@ -30,9 +30,10 @@ Here's an GELU kernel implemented (based on the CUDA implementation of
 invoked from the host using this library.

 ```
+#include "gpu.h"
 #include <array>
 #include <cstdio>
-#include "gpu.h"
+#include <future>

 using namespace gpu; // CreateContext, CreateTensor, CreateKernel,
                      // CreateShader, DispatchKernel, Wait, ToCPU
@@ -42,20 +43,22 @@ static const char *kGelu = R"(
 const GELU_SCALING_FACTOR: f32 = 0.7978845608028654; // sqrt(2.0 / PI)
 @group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
 @group(0) @binding(1) var<storage, read_write> out: array<{{precision}}>;
+@group(0) @binding(1) var<storage, read_write> dummy: array<{{precision}}>;
 @compute @workgroup_size({{workgroupSize}})
 fn main(
     @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
     let i: u32 = GlobalInvocationID.x;
     if (i < arrayLength(&inp)) {
         let x: f32 = inp[i];
-        // select is more stable for larger values of x
+        // select is more stable than tanh for large x
         out[i] = select(0.5 * x * (1.0 + tanh(GELU_SCALING_FACTOR
-            * (x + .044715 * x * x * x))), x, x > 10.0);
+                 * (x + .044715 * x * x * x))), x, x > 10.0);
     }
 }
 )";

 int main(int argc, char **argv) {
+  printf("\nHello, gpu.cpp\n\n");
   Context ctx = CreateContext();
   static constexpr size_t N = 3072;
   std::array<float, N> inputArr, outputArr;
@@ -64,25 +67,28 @@ int main(int argc, char **argv) {
   }
   Tensor input = CreateTensor(ctx, Shape{N}, kf32, inputArr.data());
   Tensor output = CreateTensor(ctx, Shape{N}, kf32);
-  Kernel op = CreateKernel(ctx, CreateShader(kGelu, 256, kf32), input, output);
-  DispatchKernel(ctx, op);
-  Wait(ctx, op.future);
+  std::promise<void> promise;
+  std::future<void> future = promise.get_future();
+  Kernel op = CreateKernel(ctx, CreateShader(kGelu, 256, kf32), TensorList{input, output},
+                           /* nthreads */ {N, 1, 1});
+  DispatchKernel(ctx, op, promise);
+  Wait(ctx, future);
   ToCPU(ctx, output, outputArr.data(), sizeof(outputArr));
-  for (int i = 0; i < 10; ++i) {
+  for (int i = 0; i < 32; ++i) {
     printf("out[%d] : gelu(%.2f) = %.2f\n", i, inputArr[i], outputArr[i]);
   }
   printf("...\n\n");
   return 0;
 }
 ```

+This example is available in `examples/hello_world/run.cpp`.
+
 For those curious about what happens under the hood with the raw WebGPU API,
 the equivalent functionality is implemented using the WebGPU C API in
 `examples/webgpu_intro/run.cpp`.

-## Quick Start: Building and Running
-
-*Tutorial App*
+## Quick Start: Dependencies and Installation

 The only dependency of this library is a WebGPU implementation. Currently we
 recommend using the Dawn backend until further testing, but we plan to support
@@ -93,32 +99,11 @@ you can install cmake using [homebrew](https://brew.sh/) with: `brew install
 cmake`. On Ubuntu, you can install cmake using `apt-get` with: `sudo apt-get
 install cmake`.

-The build is handled by cmake. Some useful common cmake invocations are wrapped
-in the convenience Makefile. To start you can try building a terminal demo
-tutorial which also tests the functionality of the library, this builds the
-demo tutorial in `run.cpp`:
-
-```
-make demo
-```
-
-You should see an introductory message:
-```
-____ _____ __ __ _________ ____
-/ __ `/ __ \/ / / // ___/ __ \/ __ \
-/ /_/ / /_/ / /_/ // /__/ /_/ / /_/ /
-\__, / .___/\__,_(_)___/ .___/ .___/
-/____/_/ /_/ /_/
-
-================================================================================
-
-Welcome!
---------
+## Quick Start: Building and Running

-This program is a brief intro to the gpu.cpp library.
-...

-```
+The build is handled by cmake. Some useful common cmake invocations are wrapped
+in the convenience Makefile.

 The first time you build and run the project, it will download the WebGPU
 backend implementation (Dawn by default) and build it which may take a few
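
The README example above switches to an explicit promise/future pair around dispatch. As a minimal sketch (not repository code; the helper name is hypothetical, and only the `DispatchKernel(ctx, op, promise)` and `Wait(ctx, future)` calls are taken from the diff), the pattern can be wrapped like this:

```
#include <future>
#include "gpu.h"

// Hypothetical helper (not part of gpu.h): wraps the dispatch-and-wait
// pattern shown in the updated README example.
void DispatchAndWait(gpu::Context &ctx, gpu::Kernel &op) {
  std::promise<void> promise;                    // completion signal owned by the caller
  std::future<void> future = promise.get_future();
  gpu::DispatchKernel(ctx, op, promise);         // submit the kernel asynchronously
  gpu::Wait(ctx, future);                        // block until the GPU signals completion
}
```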

gpu.h

+10 -52

@@ -206,13 +206,11 @@ struct KernelPool {
   KernelPool(Context *ctx) : ctx(ctx), data() {}
   Context *ctx;
   std::set<Kernel *> data;
-  // std::set<MultiKernel *> multiData;
   ~KernelPool() {
     // Note : Some kernel resources such as commandBuffer are harvested by
     // queue submission, explicitly destroying readback and callback buffers
     // produces runtime errors.
     data.clear();
-    // multiData.clear();
   }
 };

@@ -664,8 +662,6 @@ void ResetCommandBuffer(WGPUDevice &device, const Shape &nThreads, Kernel &op) {
     op.commandBuffer = wgpuCommandEncoderFinish(commandEncoder, nullptr);
     check(op.commandBuffer, "Create command buffer", __FILE__, __LINE__);
   }
-  // op.promise = std::promise<void>();
-  // op.future = op.promise.get_future();
 }

 /**
@@ -800,7 +796,6 @@ Kernel CreateKernel(Context &ctx, const ShaderCode &shader,
       .entries = bindGroupEntries.data(),
   };
   op.bindGroup = wgpuDeviceCreateBindGroup(device, &bindGroupDesc);
-
   {
     WGPUPipelineLayoutDescriptor pipelineLayoutDesc = {
         .bindGroupLayoutCount = 1,
@@ -833,42 +828,6 @@ Kernel CreateKernel(Context &ctx, const ShaderCode &shader,
   return op;
 }

-/**
- * @brief Overload which wraps the CreateKernel factory function to create a
- * kernel on the GPU with a statically determined ParamsType instead of casting
- * params to a void pointer. paramSize is then determined by the size of the
- * ParamsType.
- *
- * @param[in] ctx Context instance to manage the kernel
- * @param[in] shader Shader code for the kernel
- * @param[in] inputs A span of input tensors as a pointer
- * @param[in] numInputs Number of input tensors, effectively the size of the
- * *inputs span.
- * @param[in] output Output tensor for the kernel
- * @param[in] nThreads Shape of the workgroup size for the kernel, must be of
- * rank 3.
- * @param[in] params Optional parameters for the kernel. If the kernel does not
- * have any parameters, use NoParam.
- * @example Kernel kernel = CreateKernel(ctx, shader, inputs, numInputs, output,
- * nThreads, params);
- */
-template <typename ParamsType = NoParam>
-Kernel CreateKernel(Context &ctx, const ShaderCode &shader,
-                    const Tensor *inputs, size_t numInputs,
-                    const Shape &nThreads,
-                    const ParamsType &params = ParamsType{}) {
-  if constexpr (!IsNoParam<ParamsType>) {
-    log(kDefLog, kInfo, "Using params of size %d bytes", sizeof(ParamsType));
-    return CreateKernel(ctx, shader, inputs, numInputs, nThreads,
-                        reinterpret_cast<const void *>(&params),
-                        sizeof(ParamsType));
-  } else {
-    log(kDefLog, kInfo, "No params");
-    return CreateKernel(ctx, shader, inputs, numInputs, nThreads,
-                        nullptr, 0);
-  }
-}
-
 /**
  * @brief Overload which wraps the CreateKernel factory function to create a
  * kernel on the GPU. This overload uses takes a static collection of input
@@ -892,17 +851,16 @@ Kernel CreateKernel(Context &ctx, const ShaderCode &shader,
                     const TensorList<numInputs> &inputs,
                     const Shape &nThreads,
                     const ParamsType &params = ParamsType{}) {
-  // first .data gets the array, second .data() gets the pointer
-  return CreateKernel<ParamsType>(ctx, shader, inputs.data.data(), numInputs,
-                                  nThreads, params);
-}
-
-// Convenience wrapper: specialization for single input passed by reference
-template <typename ParamsType = NoParam>
-Kernel CreateKernel(Context &ctx, const ShaderCode &shader, const Tensor &input,
-                    const Shape &nThreads,
-                    const ParamsType &params = ParamsType{}) {
-  return CreateKernel(ctx, shader, &input, 1, nThreads, params);
+  if constexpr (!IsNoParam<ParamsType>) {
+    log(kDefLog, kInfo, "Using params of size %d bytes", sizeof(ParamsType));
+    return CreateKernel(ctx, shader, inputs.data.data(), numInputs, nThreads,
+                        reinterpret_cast<const void *>(&params),
+                        sizeof(ParamsType));
+  } else {
+    log(kDefLog, kInfo, "No params");
+    return CreateKernel(ctx, shader, inputs.data.data(), numInputs, nThreads,
+                        nullptr, 0);
+  }
 }

 /**
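
For reference, here is a caller-side sketch of the single parametrized `CreateKernel` overload this commit keeps (the `TensorList` form). It is only a sketch: the `ScaleParams` struct, the shader-template argument, and the helper name are illustrative placeholders rather than repository code; only the call shape and the params/NoParam forwarding mirror the diff above.

```
#include "gpu.h"

using namespace gpu;

// Illustrative params struct; any trivially copyable type works, since the
// overload forwards &params and sizeof(ParamsType) to the void-pointer factory.
struct ScaleParams {
  float scale;
};

// Hypothetical helper showing the consolidated call shape: inputs always go
// through a TensorList (CTAD infers numInputs); omit `params` to take the
// NoParam branch shown in the diff above.
Kernel MakeScaleKernel(Context &ctx, Tensor &input, Tensor &output,
                       const char *shaderTemplate, size_t n) {
  ScaleParams params = {2.0f};
  return CreateKernel(ctx, CreateShader(shaderTemplate, 256, kf32),
                      TensorList{input, output},
                      /* nthreads */ Shape{n, 1, 1}, params);
}
```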

nn/shaders.h → utils/shaders.h

File renamed without changes.

utils/test_kernels.cpp

+3 -3

@@ -3,11 +3,11 @@
 #include <memory>
 #include <random>

-#include "array_utils.h"
 #include "gpu.h"
-#include "nn/shaders.h"
-#include "reference_impls.h"
+#include "utils/array_utils.h"
+#include "utils/reference_impls.h"
 #include "utils/logging.h"
+#include "utils/shaders.h"

 using namespace gpu;
