Skip to content

Commit a1b31f7

Browse files
committed
Simplify shader API - use Create* convention for CreateShader, start gpu puzzles example, pin dawn build commit, add some checks to Makefile
1 parent 2223ea4 commit a1b31f7

File tree

7 files changed

+130
-120
lines changed

7 files changed

+130
-120
lines changed

CMakeLists.txt

+6
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,12 @@ IF (NOT WEBGPU_TAG)
3030
ENDIF()
3131
message(STATUS "Using WebGPU distribution tag: ${WEBGPU_TAG}")
3232

33+
IF (WEBGPU_TAG STREQUAL "dawn")
34+
# Pin the dawn backend to a specific commit
35+
set(WEBGPU_TAG "1025b977e1927b6d0327e67352f90feb4bcf8274")
36+
message(STATUS "Using Dawn backend")
37+
ENDIF()
38+
3339
FetchContent_Declare(
3440
webgpu
3541
GIT_REPOSITORY ${WEBGPU_DIST_GIT_REPO}

Makefile

+16-11
Original file line numberDiff line numberDiff line change
@@ -19,41 +19,46 @@ DEBUG_FLAGS = $(FLAGS) -DDEBUG:BOOL=ON
1919
EMSCRIPTEN_FLAGS = -DIMPLEMENTATION=emscripten -DCMAKE_CXX_COMPILER=em++
2020
LOCAL_FLAGS = -DUSE_LOCAL_LIBS=ON
2121

22-
demo:
22+
demo: check-dependencies
2323
mkdir -p build && cd build && cmake .. $(FASTBUILD_FLAGS) && make -j$(NUM_JOBS) $(TARGET_DEMO) && ./$(TARGET_DEMO)
2424

25-
tests:
25+
# check for the existence of clang++ and cmake
26+
check-dependencies:
27+
@command -v clang++ >/dev/null 2>&1 || { echo >&2 "Please install clang++ with 'sudo apt-get install clang' or 'brew install llvm'"; exit 1; }
28+
@command -v cmake >/dev/null 2>&1 || { echo >&2 "Please install cmake with 'sudo apt-get install cmake' or 'brew install cmake'"; exit 1; }
29+
30+
tests: check-dependencies
2631
mkdir -p build && cd build && cmake .. $(FASTBUILD_FLAGS) && make -j$(NUM_JOBS) $(TARGET_TESTS) && ./$(TARGET_TESTS)
2732

28-
libgpu:
33+
libgpu: check-dependencies
2934
mkdir -p build && cd build && cmake .. $(RELEASE_FLAGS) && make -j$(NUM_JOBS) gpu
3035

31-
debug:
36+
debug: check-dependencies
3237
mkdir -p build && cd build && cmake .. $(DEBUG_FLAGS) && make -j$(NUM_JOBS) $(TARGET_ALL)
3338

34-
build:
39+
build: check-dependencies
3540
mkdir -p build && cd build && cmake .. $(RELEASE_FLAGS) && make -j$(NUM_JOBS) $(TARGET_ALL)
3641

37-
emscripten:
42+
emscripten: check-dependencies
3843
mkdir -p build && cd build && cmake .. $(EMSCRIPTEN_FLAGS) -DIMPLEMENTATION=emscripten && make -j$(NUM_JOBS) $(TARGET_ALL)
3944

4045
check-entr:
4146
@command -v entr >/dev/null 2>&1 || { echo >&2 "Please install entr with 'brew install entr' or 'sudo apt-get install entr'"; exit 1; }
4247

43-
watch-demo: check-entr
48+
watch-demo: check-entr check-dependencies
4449
mkdir -p build && cd build && cmake .. $(FASTBUILD_FLAGS) && ls ../* ../utils/* | entr -s "rm -f $(TARGET_DEMO) && make -j$(NUM_JOBS) $(TARGET_DEMO) && ./$(TARGET_DEMO)"
4550

46-
watch-tests:
51+
watch-tests: check-entr check-dependencies
4752
mkdir -p build && cd build && cmake .. $(FASTBUILD_FLAGS) && ls ../* ../utils/* | entr -s "rm -f $(TARGET_TESTS) && make -j$(NUM_JOBS) $(TARGET_TESTS) && ./$(TARGET_TESTS)"
4853

4954
# experimental
50-
watch-tests-wgpu:
55+
watch-tests-wgpu: check-entr check-dependencies
5156
mkdir -p build && cd build && cmake .. $(FASTBUILD_FLAGS) $(USE_WGPU) && ls ../* ../utils/* | entr -s "rm -f $(TARGET_TESTS) && make -j$(NUM_JOBS) $(TARGET_TESTS) && ./$(TARGET_TESTS)"
5257

53-
watch-demo-local: check-entr
58+
watch-demo-local: check-entr check-dependencies
5459
mkdir -p build && cd build && cmake .. $(FASTBUILD_FLAGS) $(LOCAL_FLAGS) && ls ../* ../utils/* | entr -s "rm -f $(TARGET_DEMO) && make -j$(NUM_JOBS) $(TARGET_DEMO) && ./$(TARGET_DEMO)"
5560

56-
watch-tests-local:
61+
watch-tests-local: check-entr check-dependencies
5762
mkdir -p build && cd build && cmake .. $(FASTBUILD_FLAGS) $(LOCAL_FLAGS) && ls ../* ../utils/* | entr -s "rm -f $(TARGET_TESTS) && make -j$(NUM_JOBS) $(TARGET_TESTS) && ./$(TARGET_TESTS)"
5863

5964
clean-build:

README.md

+10
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,11 @@ The only dependency of this library is a WebGPU implementation. Currently we
8686
recommend using the Dawn backend until further testing, but we plan to support
8787
emscripten (web) and wgpu (native) backends.
8888

89+
You should have clang and cmake installed (we currently test with CMake 3.25+). On macOS,
90+
you can install cmake using [homebrew](https://brew.sh/) with: `brew install
91+
cmake`. On Ubuntu, you can install cmake using `apt-get` with: `sudo apt-get
92+
install cmake`.
93+
8994
The build is handled by cmake. Some useful common cmake invocations are wrapped
9095
in the convenience Makefile. To start you can try building a terminal demo
9196
tutorial which also tests the functionality of the library, this builds the
@@ -186,6 +191,10 @@ If you need to clean up the build artifacts, you can run:
186191
make clean
187192
```
188193

194+
## Troubleshooting
195+
196+
If you run into issues building the project, please open an issue.
197+
189198
## Motivation and Goals
190199

191200
Although gpu.cpp is intended for any form of general purpose GPU computation,
@@ -237,6 +246,7 @@ rendering/graphics on the GPU, although it might be useful for compute shaders
237246
in graphics projects - one of the examples is a small compute renderer,
238247
rendered to the terminal.
239248

249+
240250
## Contributing and Work-in-Progress
241251

242252
We welcome contributions! There's a lot of low hanging fruit - fleshing out

examples/hello_world/run.cpp

+21-4
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,27 @@
1-
#include <array>
2-
#include <cstdio>
31
#include "gpu.h"
42
#include "nn/shaders.h"
3+
#include <array>
4+
#include <cstdio>
55

66
using namespace gpu;
77

8+
static const char *kGelu = R"(
9+
const GELU_SCALING_FACTOR: f32 = 0.7978845608028654; // sqrt(2.0 / PI)
10+
@group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
11+
@group(0) @binding(1) var<storage, read_write> out: array<{{precision}}>;
12+
@compute @workgroup_size({{workgroupSize}})
13+
fn main(
14+
@builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
15+
let i: u32 = GlobalInvocationID.x;
16+
// Ensure we do not access out of bounds
17+
if (i < arrayLength(&inp)) {
18+
let x: f32 = inp[i];
19+
let cube: f32 = 0.044715 * x * x * x;
20+
out[i] = 0.5 * x * (1.0 + tanh(GELU_SCALING_FACTOR * (x + cube)));
21+
}
22+
}
23+
)";
24+
825
int main(int argc, char **argv) {
926
GPUContext ctx = CreateGPUContext();
1027
fprintf(stdout, "\nHello, gpu.cpp\n\n");
@@ -17,8 +34,8 @@ int main(int argc, char **argv) {
1734
GPUTensor input = CreateTensor(ctx, {N}, kf32, inputArr.data());
1835
GPUTensor output = CreateTensor(ctx, {N}, kf32, outputArr.data());
1936

20-
Kernel op =
21-
CreateKernel(ctx, GeluShader(256, kf32), std::array{input}, output);
37+
Kernel op = CreateKernel(ctx, CreateShader(kGelu, 256, kf32),
38+
std::array{input}, output);
2239
DispatchKernel(ctx, op);
2340
Wait(ctx, op.future);
2441
ToCPU(ctx, output, outputArr.data(), sizeof(outputArr));

gpu.h

+46-28
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@
1111
#include <unordered_map>
1212
#include <vector>
1313

14-
#include "webgpu/webgpu.h"
1514
#include "utils/logging.h"
15+
#include "webgpu/webgpu.h"
1616

1717
namespace gpu {
1818

@@ -67,7 +67,7 @@ struct GPUTensor {
6767
};
6868

6969
struct TensorPool {
70-
TensorPool(GPUContext *ctx) : ctx(ctx), data(){};
70+
TensorPool(GPUContext *ctx) : ctx(ctx), data() {};
7171
GPUContext *ctx;
7272
std::unordered_map<WGPUBuffer, GPUTensor> data;
7373
~TensorPool();
@@ -121,9 +121,9 @@ const char *ToString(NumType type) {
121121

122122
/* Tensor factory function */
123123
GPUTensor CreateTensor(TensorPool &pool, const Shape &shape, NumType dtype,
124-
WGPUBufferUsageFlags usage = WGPUBufferUsage_Storage |
125-
WGPUBufferUsage_CopyDst |
126-
WGPUBufferUsage_CopySrc) {
124+
WGPUBufferUsageFlags usage = WGPUBufferUsage_Storage |
125+
WGPUBufferUsage_CopyDst |
126+
WGPUBufferUsage_CopySrc) {
127127
log(kDefLog, kInfo, "Creating tensor");
128128
size_t numElements = 1;
129129
for (size_t dim = 0; dim < shape.rank; dim++) {
@@ -146,16 +146,17 @@ GPUTensor CreateTensor(TensorPool &pool, const Shape &shape, NumType dtype,
146146
/* Syntactic sugar - take in ctx instead of pool*/
147147
GPUTensor CreateTensor(GPUContext &ctx, const Shape &shape, NumType dtype) {
148148
return CreateTensor(ctx.pool, shape, dtype,
149-
WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
150-
WGPUBufferUsage_CopySrc);
149+
WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
150+
WGPUBufferUsage_CopySrc);
151151
}
152152

153153
/* With Value Initialization (pointer) */
154154
GPUTensor CreateTensor(GPUContext &ctx, const Shape &shape, NumType dtype,
155-
float *data) {
156-
GPUTensor tensor = CreateTensor(ctx.pool, shape, dtype,
157-
WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
158-
WGPUBufferUsage_CopySrc);
155+
float *data) {
156+
GPUTensor tensor =
157+
CreateTensor(ctx.pool, shape, dtype,
158+
WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
159+
WGPUBufferUsage_CopySrc);
159160
wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data,
160161
tensor.data.size);
161162
return tensor;
@@ -187,16 +188,33 @@ struct CallbackDataDyn {
187188
};
188189

189190
struct ShaderCode {
190-
std::string code;
191+
std::string data;
191192
size_t wgSize; // workgroup size
192193
};
193194

195+
/// Replaces every occurrence of `from` in `str` with `to`, in place.
///
/// Scanning resumes immediately after each inserted replacement, so
/// occurrences of `from` that appear inside `to` are never re-substituted
/// (e.g. replacing "a" with "aa" terminates).
///
/// @param str  string mutated in place
/// @param from pattern to search for; an empty pattern is treated as a
///             no-op (the unguarded loop would otherwise insert `to`
///             between every character of `str`)
/// @param to   replacement text (may be empty, shorter, or longer)
void ReplaceAll(std::string &str, const std::string &from,
                const std::string &to) {
  if (from.empty()) {
    return; // guard against degenerate empty-pattern matching
  }
  size_t start_pos = 0;
  while ((start_pos = str.find(from, start_pos)) != std::string::npos) {
    str.replace(start_pos, from.length(), to);
    start_pos += to.length(); // skip past the replacement text
  }
}
203+
204+
/// Builds a ShaderCode value from a raw WGSL template string, substituting
/// the {{workgroupSize}} and {{precision}} placeholders with the given
/// workgroup size and numeric-type name.
///
/// @param shaderRaw     WGSL source containing the template placeholders
/// @param workgroupSize workgroup size spliced into {{workgroupSize}} and
///                      recorded on the returned ShaderCode
/// @param precision     numeric type whose name replaces {{precision}}
/// @return fully substituted shader source plus its workgroup size
ShaderCode CreateShader(const char *shaderRaw, size_t workgroupSize,
                        NumType precision) {
  std::string source{shaderRaw};
  ReplaceAll(source, "{{workgroupSize}}", std::to_string(workgroupSize));
  ReplaceAll(source, "{{precision}}", ToString(precision));
  return {source, workgroupSize};
}
211+
194212
struct KernelDesc {
195213
const ShaderCode shader;
196214
const GPUTensor *inputs;
197215
size_t numInputs;
198216
const GPUTensor output;
199-
const void* params;
217+
const void *params;
200218
const size_t paramSize;
201219
};
202220

@@ -441,9 +459,9 @@ void ToGPU(GPUContext &ctx, const float *data, GPUTensor &tensor) {
441459
}
442460

443461
Kernel CreateKernel(GPUContext &ctx, const ShaderCode &shader,
444-
const GPUTensor *inputs, size_t numInputs,
445-
const GPUTensor &output, const void *params = nullptr,
446-
size_t paramsSize = 0) {
462+
const GPUTensor *inputs, size_t numInputs,
463+
const GPUTensor &output, const void *params = nullptr,
464+
size_t paramsSize = 0) {
447465
WGPUDevice device = ctx.device;
448466
WGPUQueue queue = ctx.queue;
449467
Kernel op;
@@ -591,7 +609,7 @@ Kernel CreateKernel(GPUContext &ctx, const ShaderCode &shader,
591609
pipelineLayout =
592610
wgpuDeviceCreatePipelineLayout(device, &pipelineLayoutDesc);
593611
WGPUShaderModuleWGSLDescriptor wgslDesc = {
594-
.code = shader.code.c_str(),
612+
.code = shader.data.c_str(),
595613
};
596614
wgslDesc.chain.sType = WGPUSType_ShaderModuleWGSLDescriptor;
597615
WGPUShaderModuleDescriptor shaderModuleDesc = {};
@@ -634,14 +652,14 @@ Kernel CreateKernel(GPUContext &ctx, const ShaderCode &shader,
634652

635653
template <typename ParamsType = NoParam>
636654
Kernel CreateKernel(GPUContext &ctx, const ShaderCode &shader,
637-
const GPUTensor *inputs, size_t numInputs,
638-
const GPUTensor &output,
639-
const ParamsType &params = ParamsType{}) {
655+
const GPUTensor *inputs, size_t numInputs,
656+
const GPUTensor &output,
657+
const ParamsType &params = ParamsType{}) {
640658
if constexpr (!IsNoParam<ParamsType>) {
641659
log(kDefLog, kInfo, "Using params of size %d bytes", sizeof(ParamsType));
642660
return CreateKernel(ctx, shader, inputs, numInputs, output,
643-
reinterpret_cast<const void *>(&params),
644-
sizeof(ParamsType));
661+
reinterpret_cast<const void *>(&params),
662+
sizeof(ParamsType));
645663
} else {
646664
log(kDefLog, kInfo, "No params");
647665
return CreateKernel(ctx, shader, inputs, numInputs, output, nullptr, 0);
@@ -653,11 +671,11 @@ Kernel CreateKernel(GPUContext &ctx, const ShaderCode &shader,
653671
*/
654672
template <typename ParamsType = NoParam, size_t numInputs>
655673
Kernel CreateKernel(GPUContext &ctx, const ShaderCode &shader,
656-
const std::array<GPUTensor, numInputs> &inputs,
657-
const GPUTensor &output,
658-
const ParamsType &params = ParamsType{}) {
659-
return CreateKernel<ParamsType>(ctx, shader, inputs.data(), numInputs,
660-
output, params);
674+
const std::array<GPUTensor, numInputs> &inputs,
675+
const GPUTensor &output,
676+
const ParamsType &params = ParamsType{}) {
677+
return CreateKernel<ParamsType>(ctx, shader, inputs.data(), numInputs, output,
678+
params);
661679
}
662680

663681
MultiKernel CreateMultiKernel(GPUContext &ctx, const MultiKernelDesc &desc) {
@@ -791,7 +809,7 @@ MultiKernel CreateMultiKernel(GPUContext &ctx, const MultiKernelDesc &desc) {
791809
// Create shader module
792810
log(kDefLog, kInfo, "Create shader module");
793811
WGPUShaderModuleWGSLDescriptor wgslDesc = {
794-
.code = desc.shader[shaderIndex].code.c_str(),
812+
.code = desc.shader[shaderIndex].data.c_str(),
795813
};
796814
wgslDesc.chain.sType = WGPUSType_ShaderModuleWGSLDescriptor;
797815
WGPUShaderModuleDescriptor shaderModuleDesc = {

0 commit comments

Comments
 (0)