Commit 3570190

skeleton code for physics example, readme tweaks, formatting gpu.h
1 parent f9d9819 commit 3570190

6 files changed: +193 −75 lines

.github/workflows/build.yml

+6
@@ -19,3 +19,9 @@ jobs:
 
       - name: No-op Step
         run: echo "This is a no-op action"
+
+      - name: Install CMake
+        run: sudo apt-get install cmake
+
+      - name: Build project
+        run: make all

README.md

+13 −14
@@ -206,28 +206,27 @@ become computable objects over which custom algorithms are implemented.
 Performing custom computations over compute-intensive foundation models
 benefits from low-level control of the GPU.
 
-Many important foundation model
-advances today take this form, for example:
-
-- Approximate Computation - quantization, sparsification, model compression, distillation
-- Conditional/Branching Computation - Mixture-of-experts, Hydranets, Fast feed-forward, Early Exit
-- Auxillary Computation - Q[X]oRA variants, Speculative Decoding, Constrained Decoding
-
-
 At this time, tooling for implementing low-level GPU computation is heavily
-focused on CUDA as a first class citizen.
-
-This leaves a gap in portability, meaning R&D algorithms that work in a
-research environment are difficult to operationalize for everyday use to run on
-personal computing hardware that's broadly accessible (personal workstations,
-laptops, mobile devices).
+focused on CUDA as a first class citizen. This leaves a gap in portability,
+meaning R&D algorithms that work in a research environment are difficult to
+operationalize for everyday use to run on personal computing hardware that's
+broadly accessible (personal workstations, laptops, mobile devices).
 
 We created gpu.cpp as a lightweight C++ library that allows us to easily and
 directly implement native low-level GPU algorithms as part of R&D and drop
 implementations into code running on personal computing devices either as
 native applications or in the browser without impediment by hardware, tooling,
 or runtime support.
 
+## What gpu.cpp is for
+
+- Hybrid CPU + GPU computation
+- Simple Integration of Low-level GPU computation into C++ projects
+- Fine-grained control of GPU computation for ML inference
+- Bespoke GPU computation for emerging neural network architectures
+- Portable GPU computation
+- Custom parallel algorithms
+
 ## What gpu.cpp is not
 
 gpu.cpp is meant for developers with basic familiarity with C++ and GPU
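
To make the README's "simple integration" claim concrete, here is a minimal sketch of a complete gpu.cpp program. It reuses only the API surface visible in examples/physics/run.cpp below (CreateContext, CreateTensor, CreateShader, CreateKernel, DispatchKernel, Wait, and ToCPU from the same using-declaration comment); the doubling kernel and the exact ToCPU signature are illustrative assumptions, not part of this commit:

#include "gpu.h"
#include <array>
#include <cstdio>

using namespace gpu;

// Illustrative kernel (not from this commit): outp[i] = 2 * inp[i].
const char *kDouble = R"(
@group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
@group(0) @binding(1) var<storage, read_write> outp: array<{{precision}}>;
@compute @workgroup_size({{workgroupSize}})
fn main(@builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
  let i: u32 = GlobalInvocationID.x;
  if (i < arrayLength(&inp)) {
    outp[i] = 2.0 * inp[i];
  }
}
)";

int main() {
  Context ctx = CreateContext();
  static constexpr size_t N = 8;
  std::array<float, N> inputArr;
  for (size_t i = 0; i < N; ++i) {
    inputArr[i] = static_cast<float>(i);
  }
  Tensor input = CreateTensor(ctx, Shape{N}, kf32, inputArr.data());
  Tensor output = CreateTensor(ctx, Shape{N}, kf32);
  // Same call pattern as run.cpp: shader, input tensors, output, thread grid.
  Kernel k = CreateKernel(ctx, CreateShader(kDouble, 256, kf32),
                          TensorList{input}, output, Shape{N, 1, 1});
  DispatchKernel(ctx, k);
  Wait(ctx, k.future);
  std::array<float, N> outputArr;
  // ToCPU's exact signature (device tensor -> host buffer copy) is assumed.
  ToCPU(ctx, output, outputArr.data(), sizeof(outputArr));
  for (size_t i = 0; i < N; ++i) {
    printf("%.1f\n", outputArr[i]);
  }
}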

examples/physics/CMakeLists.txt

+28
@@ -0,0 +1,28 @@
cmake_minimum_required(VERSION 3.11)
project(physics)

include(FetchContent)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

message(STATUS "CMAKE_CURRENT_SOURCE_DIR: " ${CMAKE_CURRENT_SOURCE_DIR})
message(STATUS "LIBRARY DIRECTORY: " ${CMAKE_CURRENT_SOURCE_DIR}/../../)

# For a standalone repo, remove this line and set the path to the repo's own
# FetchContent cache directory. Alternatively, don't set FETCHCONTENT_BASE_DIR
# and the repos will be downloaded to the build directory.
set(FETCHCONTENT_BASE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../third_party")

FetchContent_Declare(
  gpu
  # For a standalone repo, replace GIT_REPOSITORY with the URL:
  # GIT_REPOSITORY https://github.com/AnswerDotAI/gpu.cpp
  GIT_REPOSITORY file://${CMAKE_CURRENT_SOURCE_DIR}/../../
  GIT_TAG main
  GIT_SHALLOW TRUE
)
FetchContent_MakeAvailable(gpu)

add_executable(physics run.cpp)
target_link_libraries(physics gpu webgpu)
target_include_directories(physics PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../)

examples/physics/Makefile

+17
@@ -0,0 +1,17 @@
NUM_JOBS ?= $(shell nproc)
CXX=clang++

TARGET = physics
FLAGS = -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON -DCMAKE_CXX_COMPILER="$(CXX)"
FASTBUILD_FLAGS = $(FLAGS) -DFASTBUILD:BOOL=ON

run:
	mkdir -p build && cd build && cmake .. $(FASTBUILD_FLAGS) && make -j$(NUM_JOBS) $(TARGET) && ./$(TARGET)

watch:
	@command -v entr >/dev/null 2>&1 || { echo >&2 "Please install entr with 'brew install entr' or 'sudo apt-get install entr'"; exit 1; }
	mkdir -p build && cd build && cmake .. $(FASTBUILD_FLAGS) && ls ../* ../utils/* | entr -s "rm -f $(TARGET) && make -j$(NUM_JOBS) $(TARGET) && ./$(TARGET)"

clean:
	read -r -p "This will delete the contents of build/*. Are you sure? [CTRL-C to abort] " response && rm -rf build/*

examples/physics/run.cpp

+66
@@ -0,0 +1,66 @@
#include "gpu.h"
#include <array>
#include <chrono>
#include <cstdio>
#include <thread> // for std::this_thread::sleep_for

using namespace gpu; // CreateContext, CreateTensor, CreateKernel,
                     // CreateShader, DispatchKernel, Wait, ToCPU
                     // Tensor, TensorList, Kernel, Context, Shape, kf32

const char *kShaderSimulation = R"(
const G: f32 = 9.81;
const dt: f32 = 0.01;
@group(0) @binding(0) var<storage, read_write> pos1: array<{{precision}}>;
@group(0) @binding(1) var<storage, read_write> vel1: array<{{precision}}>;
@group(0) @binding(2) var<storage, read_write> pos2: array<{{precision}}>;
@group(0) @binding(3) var<storage, read_write> vel2: array<{{precision}}>;
@group(0) @binding(4) var<storage, read_write> length: array<{{precision}}>;
@group(0) @binding(5) var<storage, read_write> mass: array<{{precision}}>;
@group(0) @binding(6) var<storage, read_write> output: array<{{precision}}>;

@compute @workgroup_size({{workgroupSize}})
fn main(
    @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
  let i: u32 = GlobalInvocationID.x;
  if (i < arrayLength(&pos1)) {
    // TODO
  }
}
)";

int main() {
  printf("\nHello, gpu.cpp\n\n");
  Context ctx = CreateContext();
  static constexpr size_t N = 1000;

  // Host-side state buffers (left uninitialized in this skeleton); the
  // y*/vx2/vy2 arrays are declared but not yet wired up to tensors.
  std::array<float, N> x1Arr, x2Arr, y1Arr, y2Arr, vx1Arr, vy1Arr, vx2Arr,
      vy2Arr, lengthArr, massArr;

  Tensor pos1 = CreateTensor(ctx, Shape{N}, kf32, x1Arr.data());
  Tensor pos2 = CreateTensor(ctx, Shape{N}, kf32, x2Arr.data());
  Tensor vel1 = CreateTensor(ctx, Shape{N}, kf32, vx1Arr.data());
  Tensor vel2 = CreateTensor(ctx, Shape{N}, kf32, vy1Arr.data());
  Tensor length = CreateTensor(ctx, Shape{N}, kf32, lengthArr.data());
  Tensor mass = CreateTensor(ctx, Shape{N}, kf32, massArr.data());

  // TODO: no need to have output
  Tensor output = CreateTensor(ctx, Shape{N}, kf32);

  Shape nThreads{N, 1, 1};
  Kernel update = CreateKernel(
      ctx, CreateShader(kShaderSimulation, 256, kf32),
      TensorList{pos1, vel1, pos2, vel2, length, mass}, output, nThreads);
  while (true) {
    auto start = std::chrono::high_resolution_clock::now();
    ResetCommandBuffer(ctx.device, nThreads, update);

    DispatchKernel(ctx, update);
    Wait(ctx, update.future);
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> elapsed = end - start;
    // Target ~60 updates/sec: sleep off the remainder of a 16 ms frame.
    std::this_thread::sleep_for(std::chrono::milliseconds(16) - elapsed);
  }
}
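
The kernel body above is left as a TODO in this commit. Purely as a hedged sketch: if each index i were treated as an independent pendulum, with pos1 holding the angle, vel1 the angular velocity, and length the rod length (these buffer semantics are assumptions, not fixed by the skeleton), the TODO could become a semi-implicit Euler step using the shader's G and dt constants:

// Hypothetical body for the TODO in kShaderSimulation. Assumed semantics:
// pos1[i] = angle, vel1[i] = angular velocity, length[i] = rod length;
// pos2/vel2 receive the updated state.
let angularAcceleration: f32 = -(G / length[i]) * sin(pos1[i]);
vel2[i] = vel1[i] + angularAcceleration * dt; // semi-implicit Euler step
pos2[i] = pos1[i] + vel2[i] * dt;
output[i] = pos2[i];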

0 commit comments