Commit 3570190

skeleton code for physics example, readme tweaks, formatting gpu.h
1 parent f9d9819 commit 3570190

6 files changed: +193 −75 lines

.github/workflows/build.yml

+6
@@ -19,3 +19,9 @@ jobs:
 
       - name: No-op Step
         run: echo "This is a no-op action"
+
+      - name: Install CMake
+        run: sudo apt-get install cmake
+
+      - name: Build project
+        run: make all

README.md

+13 −14
@@ -206,28 +206,27 @@ become computable objects over which custom algorithms are implemented.
 Performing custom computations over compute-intensive foundation models
 benefits from low-level control of the GPU.
 
-Many important foundation model
-advances today take this form, for example:
-
-- Approximate Computation - quantization, sparsification, model compression, distillation
-- Conditional/Branching Computation - Mixture-of-experts, Hydranets, Fast feed-forward, Early Exit
-- Auxillary Computation - Q[X]oRA variants, Speculative Decoding, Constrained Decoding
-
-
 At this time, tooling for implementing low-level GPU computation is heavily
-focused on CUDA as a first class citizen.
-
-This leaves a gap in portability, meaning R&D algorithms that work in a
-research environment are difficult to operationalize for everyday use to run on
-personal computing hardware that's broadly accessible (personal workstations,
-laptops, mobile devices).
+focused on CUDA as a first class citizen. This leaves a gap in portability,
+meaning R&D algorithms that work in a research environment are difficult to
+operationalize for everyday use to run on personal computing hardware that's
+broadly accessible (personal workstations, laptops, mobile devices).
 
 We created gpu.cpp as a lightweight C++ library that allows us to easily and
 directly implement native low-level GPU algorithms as part of R&D and drop
 implementations into code running on personal computing devices either as
 native applications or in the browser without impediment by hardware, tooling,
 or runtime support.
 
+## What gpu.cpp is for
+
+- Hybrid CPU + GPU computation
+- Simple Integration of Low-level GPU computation into C++ projects
+- Fine-grained control of GPU computation for ML inference
+- Bespoke GPU computation for emerging neural network architectures
+- Portable GPU computation
+- Custom parallel algorithms
+
 ## What gpu.cpp is not
 
 gpu.cpp is meant for developers with basic familiarity with C++ and GPU
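
To make the README's "simple integration" claim concrete, here is a minimal sketch of a complete gpu.cpp program. It reuses only the API surface visible in examples/physics/run.cpp below (CreateContext, CreateTensor, CreateShader, CreateKernel, DispatchKernel, Wait, and ToCPU from the same using-declaration comment); the doubling kernel and the exact ToCPU signature are illustrative assumptions, not part of this commit:

#include "gpu.h"
#include <array>
#include <cstdio>

using namespace gpu;

// Illustrative kernel (not from this commit): outp[i] = 2 * inp[i].
const char *kDouble = R"(
@group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
@group(0) @binding(1) var<storage, read_write> outp: array<{{precision}}>;
@compute @workgroup_size({{workgroupSize}})
fn main(@builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
  let i: u32 = GlobalInvocationID.x;
  if (i < arrayLength(&inp)) {
    outp[i] = 2.0 * inp[i];
  }
}
)";

int main() {
  Context ctx = CreateContext();
  static constexpr size_t N = 8;
  std::array<float, N> inputArr;
  for (size_t i = 0; i < N; ++i) {
    inputArr[i] = static_cast<float>(i);
  }
  Tensor input = CreateTensor(ctx, Shape{N}, kf32, inputArr.data());
  Tensor output = CreateTensor(ctx, Shape{N}, kf32);
  // Same call pattern as run.cpp: shader, input tensors, output, thread grid.
  Kernel k = CreateKernel(ctx, CreateShader(kDouble, 256, kf32),
                          TensorList{input}, output, Shape{N, 1, 1});
  DispatchKernel(ctx, k);
  Wait(ctx, k.future);
  std::array<float, N> outputArr;
  // ToCPU's exact signature (device tensor -> host buffer copy) is assumed.
  ToCPU(ctx, output, outputArr.data(), sizeof(outputArr));
  for (size_t i = 0; i < N; ++i) {
    printf("%.1f\n", outputArr[i]);
  }
}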

examples/physics/CMakeLists.txt

+28
@@ -0,0 +1,28 @@
cmake_minimum_required(VERSION 3.11)
project(physics)

include(FetchContent)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

message(STATUS "CMAKE_CURRENT_SOURCE_DIR: " ${CMAKE_CURRENT_SOURCE_DIR})
message(STATUS "LIBRARY DIRECTORY: " ${CMAKE_CURRENT_SOURCE_DIR}/../../)

# For a standalone repo, remove this line and set the path to the repo's own
# FetchContent cache directory. Alternatively, don't set FETCHCONTENT_BASE_DIR
# and the repos will be downloaded to the build directory.
set(FETCHCONTENT_BASE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../third_party")

FetchContent_Declare(
  gpu
  # For a standalone repo, replace GIT_REPOSITORY with the URL:
  # GIT_REPOSITORY https://github.com/AnswerDotAI/gpu.cpp
  GIT_REPOSITORY file://${CMAKE_CURRENT_SOURCE_DIR}/../../
  GIT_TAG main
  GIT_SHALLOW TRUE
)
FetchContent_MakeAvailable(gpu)

add_executable(physics run.cpp)
target_link_libraries(physics gpu webgpu)
target_include_directories(physics PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../)

examples/physics/Makefile

+17
@@ -0,0 +1,17 @@
NUM_JOBS ?= $(shell nproc)
CXX=clang++

TARGET = physics
FLAGS = -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON -DCMAKE_CXX_COMPILER="$(CXX)"
FASTBUILD_FLAGS = $(FLAGS) -DFASTBUILD:BOOL=ON

run:
	mkdir -p build && cd build && cmake .. $(FASTBUILD_FLAGS) && make -j$(NUM_JOBS) $(TARGET) && ./$(TARGET)

watch:
	@command -v entr >/dev/null 2>&1 || { echo >&2 "Please install entr with 'brew install entr' or 'sudo apt-get install entr'"; exit 1; }
	mkdir -p build && cd build && cmake .. $(FASTBUILD_FLAGS) && ls ../* ../utils/* | entr -s "rm -f $(TARGET) && make -j$(NUM_JOBS) $(TARGET) && ./$(TARGET)"

clean:
	read -r -p "This will delete the contents of build/*. Are you sure? [CTRL-C to abort] " response && rm -rf build/*

examples/physics/run.cpp

+66
@@ -0,0 +1,66 @@
#include "gpu.h"
#include <array>
#include <chrono>
#include <cstdio>
#include <thread> // for std::this_thread::sleep_for

using namespace gpu; // CreateContext, CreateTensor, CreateKernel,
                     // CreateShader, DispatchKernel, Wait, ToCPU
                     // Tensor, TensorList, Kernel, Context, Shape, kf32

const char *kShaderSimulation = R"(
const G: f32 = 9.81;
const dt: f32 = 0.01;
@group(0) @binding(0) var<storage, read_write> pos1: array<{{precision}}>;
@group(0) @binding(1) var<storage, read_write> vel1: array<{{precision}}>;
@group(0) @binding(2) var<storage, read_write> pos2: array<{{precision}}>;
@group(0) @binding(3) var<storage, read_write> vel2: array<{{precision}}>;
@group(0) @binding(4) var<storage, read_write> length: array<{{precision}}>;
@group(0) @binding(5) var<storage, read_write> mass: array<{{precision}}>;
@group(0) @binding(6) var<storage, read_write> output: array<{{precision}}>;

@compute @workgroup_size({{workgroupSize}})
fn main(
    @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
  let i: u32 = GlobalInvocationID.x;
  if (i < arrayLength(&pos1)) {
    // TODO
  }
}
)";

int main() {
  printf("\nHello, gpu.cpp\n\n");
  Context ctx = CreateContext();
  static constexpr size_t N = 1000;

  // Host-side state buffers (left uninitialized in this skeleton); the
  // y*/vx2/vy2 arrays are declared but not yet wired up to tensors.
  std::array<float, N> x1Arr, x2Arr, y1Arr, y2Arr, vx1Arr, vy1Arr, vx2Arr,
      vy2Arr, lengthArr, massArr;

  Tensor pos1 = CreateTensor(ctx, Shape{N}, kf32, x1Arr.data());
  Tensor pos2 = CreateTensor(ctx, Shape{N}, kf32, x2Arr.data());
  Tensor vel1 = CreateTensor(ctx, Shape{N}, kf32, vx1Arr.data());
  Tensor vel2 = CreateTensor(ctx, Shape{N}, kf32, vy1Arr.data());
  Tensor length = CreateTensor(ctx, Shape{N}, kf32, lengthArr.data());
  Tensor mass = CreateTensor(ctx, Shape{N}, kf32, massArr.data());

  // TODO: no need to have output
  Tensor output = CreateTensor(ctx, Shape{N}, kf32);

  Shape nThreads{N, 1, 1};
  Kernel update = CreateKernel(
      ctx, CreateShader(kShaderSimulation, 256, kf32),
      TensorList{pos1, vel1, pos2, vel2, length, mass}, output, nThreads);
  while (true) {
    auto start = std::chrono::high_resolution_clock::now();
    ResetCommandBuffer(ctx.device, nThreads, update);

    DispatchKernel(ctx, update);
    Wait(ctx, update.future);
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> elapsed = end - start;
    // Target ~60 updates/sec: sleep off the remainder of a 16 ms frame.
    std::this_thread::sleep_for(std::chrono::milliseconds(16) - elapsed);
  }
}
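
The kernel body above is left as a TODO in this commit. Purely as a hedged sketch: if each index i were treated as an independent pendulum, with pos1 holding the angle, vel1 the angular velocity, and length the rod length (these buffer semantics are assumptions, not fixed by the skeleton), the TODO could become a semi-implicit Euler step using the shader's G and dt constants:

// Hypothetical body for the TODO in kShaderSimulation. Assumed semantics:
// pos1[i] = angle, vel1[i] = angular velocity, length[i] = rod length;
// pos2/vel2 receive the updated state.
let angularAcceleration: f32 = -(G / length[i]) * sin(pos1[i]);
vel2[i] = vel1[i] + angularAcceleration * dt; // semi-implicit Euler step
pos2[i] = pos1[i] + vel2[i] * dt;
output[i] = pos2[i];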

0 commit comments