/*
 * WIP implementation of Sasha Rush's GPU puzzles https://github.com/srush/GPU-Puzzles
 */

#include <array>
#include <cstddef>
#include <cstdio>
#include "gpu.h"
#include "utils/array_utils.h"

using namespace gpu;

static constexpr size_t N = 3072;

template <size_t N>
std::array<float, N> makeData() {
  std::array<float, N> inputArr;
  for (size_t i = 0; i < N; ++i) {
    inputArr[i] = static_cast<float>(i); // dummy input data
  }
  return inputArr;
}

// Dispatch the kernel, wait for it to complete, copy the result back to the
// host, and print it.
template <size_t N>
void showResult(GPUContext& ctx, Kernel& op, GPUTensor& output) {
  DispatchKernel(ctx, op);
  std::array<float, N> outputArr;
  Wait(ctx, op.future);
  ToCPU(ctx, output, outputArr.data(), sizeof(outputArr));
  fprintf(stdout, "%s", show<float, N, 1>(outputArr, "output").c_str());
}

// Puzzle 1 : Map
// Implement a "kernel" (GPU function) that adds 10 to each position of vector
// a and stores it in vector out. You have 1 thread per position.
const char *kPuzzle1_Map = R"(
@group(0) @binding(0) var<storage, read_write> input: array<f32>;
@group(0) @binding(1) var<storage, read_write> output : array<f32>;
@compute @workgroup_size(256)
fn main(
  @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
  let idx = GlobalInvocationID.x;
  if (idx < arrayLength(&input)) {
    output[idx] = input[idx] + 10;
  }
}
)";

void puzzle1(GPUContext& ctx) {
  fprintf(stdout, "\n\nPuzzle 1\n\n");
  GPUTensor input = CreateTensor(ctx, {N}, kf32, makeData<N>().data());
  GPUTensor output = CreateTensor(ctx, {N}, kf32);
  Kernel op =
      CreateKernel(ctx, ShaderCode{kPuzzle1_Map, 256}, input, output);
  showResult<N>(ctx, op, output);
}

// Puzzle 2 : Zip
// Implement a kernel that adds together each position of a and b and stores
// it in out. You have 1 thread per position.
const char *kPuzzle2_Zip = R"(
@group(0) @binding(0) var<storage, read_write> a: array<f32>;
@group(0) @binding(1) var<storage, read_write> b: array<f32>;
@group(0) @binding(2) var<storage, read_write> output : array<f32>;
@compute @workgroup_size(256)
fn main(
  @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
  let idx = GlobalInvocationID.x;
  if (idx < arrayLength(&a)) {
    output[idx] = a[idx] + b[idx];
  }
}
)";

void puzzle2(GPUContext& ctx) {
  fprintf(stdout, "\n\nPuzzle 2\n\n");
  GPUTensor a = CreateTensor(ctx, {N}, kf32, makeData<N>().data());
  GPUTensor b = CreateTensor(ctx, {N}, kf32, makeData<N>().data());
  GPUTensor output = CreateTensor(ctx, {N}, kf32);
  Kernel op =
      CreateKernel(ctx, ShaderCode{kPuzzle2_Zip, 256}, GPUTensors{a, b}, output);
  showResult<N>(ctx, op, output);
}

// Puzzle 3 : Guards
// Implement a kernel that adds 10 to each position of a and stores it in out.
// You have more threads than positions.
const char *kPuzzle3_Map = R"(
@group(0) @binding(0) var<storage, read_write> input: array<f32>;
@group(0) @binding(1) var<storage, read_write> output : array<f32>;
@compute @workgroup_size(4)
fn main(
  @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
  // Stride through the array by the workgroup size (4) so each invocation
  // covers multiple positions.
  for (var i = GlobalInvocationID.x; i < arrayLength(&input); i = i + 4) {
    output[i] = input[i] + 10;
  }
}
)";

void puzzle3(GPUContext& ctx) {
  fprintf(stdout, "\n\nPuzzle 3\n\n");
  GPUTensor input = CreateTensor(ctx, {N}, kf32, makeData<N>().data());
  GPUTensor output = CreateTensor(ctx, {N}, kf32);
  Kernel op =
      CreateKernel(ctx, ShaderCode{kPuzzle3_Map, 4}, input, output);
  showResult<N>(ctx, op, output);
}

// Puzzle 4 : Map 2D
// Implement a kernel that adds 10 to each position of a and stores it in out.
// Input a is 2D and square. You have more threads than positions.
// TODO: host-side setup not implemented yet; an untested shader sketch follows.
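
// A minimal, untested sketch of what the Puzzle 4 shader could look like,
// assuming a 16x16 workgroup, the square's side length baked in as a
// placeholder module constant, and the same binding layout as the puzzles
// above. How to request a 2D workgroup grid from CreateKernel is left open.
const char *kPuzzle4_Map2D = R"(
@group(0) @binding(0) var<storage, read_write> input: array<f32>;
@group(0) @binding(1) var<storage, read_write> output : array<f32>;
// Side length of the square input; placeholder value until the host side is
// wired up.
const sideLen: u32 = 32u;
@compute @workgroup_size(16, 16)
fn main(
  @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
  let row = GlobalInvocationID.y;
  let col = GlobalInvocationID.x;
  // Guard: there are more threads than positions, so skip out-of-range ones.
  if (row < sideLen && col < sideLen) {
    output[row * sideLen + col] = input[row * sideLen + col] + 10;
  }
}
)";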

// Puzzle 5 : Broadcast
// Implement a kernel that adds a and b and stores it in out. Inputs a and b
// are vectors. You have more threads than positions.
// TODO: host-side setup not implemented yet; see the sketch below.
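
// Likewise an untested sketch for Puzzle 5: broadcast-add vector a against
// vector b into a row-major 2D output, out[row][col] = a[row] + b[col].
// The 2D dispatch and output sizing on the host side are assumptions and
// not implemented yet.
const char *kPuzzle5_Broadcast = R"(
@group(0) @binding(0) var<storage, read_write> a: array<f32>;
@group(0) @binding(1) var<storage, read_write> b: array<f32>;
@group(0) @binding(2) var<storage, read_write> output : array<f32>;
@compute @workgroup_size(16, 16)
fn main(
  @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
  let row = GlobalInvocationID.x;
  let col = GlobalInvocationID.y;
  // Guard against the extra threads.
  if (row < arrayLength(&a) && col < arrayLength(&b)) {
    output[row * arrayLength(&b) + col] = a[row] + b[col];
  }
}
)";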

// ...

int main(int argc, char **argv) {
  GPUContext ctx = CreateGPUContext();
  puzzle1(ctx);
  puzzle2(ctx);
  puzzle3(ctx);
  return 0;
}