Skip to content

Commit 8639720

Browse files
committed
add GPU puzzles, convenience aliases for CreateKernel
1 parent a1b31f7 commit 8639720

File tree

4 files changed

+210
-0
lines changed

4 files changed

+210
-0
lines changed

examples/gpu_puzzles/CMakeLists.txt

+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
cmake_minimum_required(VERSION 3.11)
project(gpu_puzzles)

include(FetchContent)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

message(STATUS "CMAKE_CURRENT_SOURCE_DIR: " ${CMAKE_CURRENT_SOURCE_DIR})
message(STATUS "LIBRARY DIRECTORY: " ${CMAKE_CURRENT_SOURCE_DIR}/../../)

# For a standalone repo, remove this line and set the path to the repo's own
# FetchContent cache directory. Alternatively, don't set FETCHCONTENT_BASE_DIR
# and the repos will be downloaded to the build directory.
set(FETCHCONTENT_BASE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../third_party")

# Pull in the gpu.cpp library from the enclosing repository checkout.
FetchContent_Declare(
  gpu
  # For standalone repo, replace GIT_REPOSITORY with the URL:
  # GIT_REPOSITORY https://github.com/AnswerDotAI/gpu.cpp
  GIT_REPOSITORY file://${CMAKE_CURRENT_SOURCE_DIR}/../../
  GIT_TAG main
  GIT_SHALLOW TRUE
)
FetchContent_MakeAvailable(gpu)

add_executable(gpu_puzzles run.cpp)
target_link_libraries(gpu_puzzles gpu webgpu)
target_include_directories(gpu_puzzles PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../)
29+

examples/gpu_puzzles/Makefile

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Number of parallel build jobs; defaults to the machine's core count.
NUM_JOBS ?= $(shell nproc)
CXX = clang++

TARGET = gpu_puzzles
FLAGS = -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON -DCMAKE_CXX_COMPILER="$(CXX)"
FASTBUILD_FLAGS = $(FLAGS) -DFASTBUILD:BOOL=ON

# Configure, build, and run the puzzles binary.
run:
	mkdir -p build && cd build && cmake .. $(FASTBUILD_FLAGS) && make -j$(NUM_JOBS) $(TARGET) && ./$(TARGET)

# Rebuild and rerun whenever a watched source file changes (requires entr).
watch:
	mkdir -p build && cd build && cmake .. $(FASTBUILD_FLAGS) && ls ../* ../utils/* | entr -s "rm -f $(TARGET) && make -j$(NUM_JOBS) $(TARGET) && ./$(TARGET)"

# NOTE(review): `read -r -p` is a bashism; if make's default shell is plain
# sh this prompt may fail — confirm, or set SHELL := bash.
clean:
	read -r -p "This will delete the contents of build/*. Are you sure? [CTRL-C to abort] " response && rm -rf build/*
16+

examples/gpu_puzzles/run.cpp

+128
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
/*
2+
* WIP implementation of Sasha Rush's GPU puzzles https://github.com/srush/GPU-Puzzles
3+
*/
4+
5+
#include <array>
6+
#include <cstdio>
7+
#include "gpu.h"
8+
#include "utils/array_utils.h"
9+
10+
using namespace gpu;
11+
12+
static constexpr size_t N = 3072;
13+
14+
/*
 * Builds a dummy host-side input array: element i holds the value float(i).
 * Returned by value; callers must keep the array alive for as long as they
 * use a pointer obtained from its .data().
 */
template <size_t N>
std::array<float, N> makeData() {
  std::array<float, N> inputArr;
  // size_t index avoids the signed/unsigned comparison (and potential
  // overflow) of the original `int i < N` loop.
  for (size_t i = 0; i < N; ++i) {
    inputArr[i] = static_cast<float>(i); // dummy input data
  }
  return inputArr;
}
22+
23+
template <size_t N>
24+
void showResult(GPUContext& ctx, Kernel& op, GPUTensor& output) {
25+
DispatchKernel(ctx, op);
26+
std::array<float, N> outputArr;
27+
Wait(ctx, op.future);
28+
ToCPU(ctx, output, outputArr.data(), sizeof(outputArr));
29+
fprintf(stdout, "%s", show<float, N, 1>(outputArr, "output").c_str());
30+
}
31+
32+
// Puzzle 1 : Map
33+
// Implement a "kernel" (GPU function) that adds 10 to each position of vector
34+
// a and stores it in vector out. You have 1 thread per position.
35+
const char *kPuzzle1_Map= R"(
36+
@group(0) @binding(0) var<storage, read_write> input: array<f32>;
37+
@group(0) @binding(1) var<storage, read_write> output : array<f32>;
38+
@compute @workgroup_size(256)
39+
fn main(
40+
@builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
41+
let idx = GlobalInvocationID.x;
42+
if (idx < arrayLength(&input)) {
43+
output[idx] = input[idx] + 10;
44+
}
45+
}
46+
)";
47+
48+
void puzzle1(GPUContext& ctx) {
49+
fprintf(stdout, "\n\nPuzzle 1\n\n");
50+
GPUTensor input = CreateTensor(ctx, {N}, kf32, makeData<N>().data());
51+
GPUTensor output = CreateTensor(ctx, {N}, kf32);
52+
Kernel op =
53+
CreateKernel(ctx, ShaderCode{kPuzzle1_Map, 256}, input, output);
54+
showResult<N>(ctx, op, output);
55+
}
56+
57+
// Puzzle 2 : Zip
58+
// Implement a kernel that adds together each position of a and b and stores it
59+
// in out. You have 1 thread per position.
60+
const char *kPuzzle2_Map= R"(
61+
@group(0) @binding(0) var<storage, read_write> a: array<f32>;
62+
@group(0) @binding(1) var<storage, read_write> b: array<f32>;
63+
@group(0) @binding(2) var<storage, read_write> output : array<f32>;
64+
@compute @workgroup_size(256)
65+
fn main(
66+
@builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
67+
let idx = GlobalInvocationID.x;
68+
if (idx < arrayLength(&a)) {
69+
output[idx] = a[idx] + b[idx];
70+
}
71+
}
72+
)";
73+
74+
void puzzle2(GPUContext& ctx) {
75+
fprintf(stdout, "\n\nPuzzle 2\n\n");
76+
GPUTensor a = CreateTensor(ctx, {N}, kf32, makeData<N>().data());
77+
GPUTensor b = CreateTensor(ctx, {N}, kf32, makeData<N>().data());
78+
GPUTensor output = CreateTensor(ctx, {N}, kf32);
79+
Kernel op =
80+
CreateKernel(ctx, ShaderCode{kPuzzle2_Map, 256}, GPUTensors{a, b}, output);
81+
showResult<N>(ctx, op, output);
82+
}
83+
84+
85+
// Puzzle 3 : Guards
86+
// Implement a kernel that adds 10 to each position of a and stores it in out.
87+
// You have more threads than positions.
88+
const char *kPuzzle3_Map= R"(
89+
@group(0) @binding(0) var<storage, read_write> input: array<f32>;
90+
@group(0) @binding(1) var<storage, read_write> output : array<f32>;
91+
@compute @workgroup_size(4)
92+
fn main(
93+
@builtin(global_invocation_id) GlobalInvocationID: vec3<u32>
94+
) {
95+
// increment by workgroup size
96+
for (var i = GlobalInvocationID.x; i < arrayLength(&input); i = i + 4) {
97+
output[i] = input[i] + 10;
98+
}
99+
}
100+
)";
101+
void puzzle3(GPUContext& ctx) {
102+
fprintf(stdout, "\n\nPuzzle 3\n\n");
103+
GPUTensor input = CreateTensor(ctx, {N}, kf32, makeData<N>().data());
104+
GPUTensor output = CreateTensor(ctx, {N}, kf32);
105+
Kernel op =
106+
CreateKernel(ctx, ShaderCode{kPuzzle3_Map, 4}, input, output);
107+
showResult<N>(ctx, op, output);
108+
}
109+
110+
// Puzzle 4 : Map 2D
111+
// Implement a kernel that adds 10 to each position of a and stores it in out.
112+
// Input a is 2D and square. You have more threads than positions.
113+
// TODO
114+
115+
// Puzzle 5 : Broadcast
116+
// Implement a kernel that adds a and b and stores it in out. Inputs a and b
117+
// are vectors. You have more threads than positions.
118+
// TODO
119+
120+
// ...
121+
122+
// Entry point: create one GPU context and run each implemented puzzle on it.
int main(int argc, char **argv) {
  (void)argc; // unused
  (void)argv; // unused
  GPUContext ctx = CreateGPUContext();
  puzzle1(ctx);
  puzzle2(ctx);
  puzzle3(ctx);
  return 0;
}

gpu.h

+37
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,20 @@ struct GPUTensor {
6666
Shape shape;
6767
};
6868

69+
template <std::size_t N> struct GPUTensors {
70+
std::array<GPUTensor, N> data;
71+
GPUTensors(std::initializer_list<GPUTensor> init) {
72+
std::copy(init.begin(), init.end(), data.begin());
73+
}
74+
GPUTensor &operator[](std::size_t index) { return data[index]; }
75+
const GPUTensor &operator[](std::size_t index) const { return data[index]; }
76+
};
77+
78+
template <std::size_t N> GPUTensors(std::array<GPUTensor, N>) -> GPUTensors<N>;
79+
80+
// Deduction guide for GPUTensors
81+
template <typename... Args> GPUTensors(Args...) -> GPUTensors<sizeof...(Args)>;
82+
6983
struct TensorPool {
7084
TensorPool(GPUContext *ctx) : ctx(ctx), data() {};
7185
GPUContext *ctx;
@@ -678,6 +692,29 @@ Kernel CreateKernel(GPUContext &ctx, const ShaderCode &shader,
678692
params);
679693
}
680694

695+
/*
696+
* CreateKernel with GPUTensors of inputs (convienence function)
697+
*/
698+
template <typename ParamsType = NoParam, size_t numInputs>
699+
Kernel CreateKernel(GPUContext &ctx, const ShaderCode &shader,
700+
const GPUTensors<numInputs> &inputs,
701+
const GPUTensor &output,
702+
const ParamsType &params = ParamsType{}) {
703+
// first .data gets the array, second .data() gets the pointer
704+
return CreateKernel<ParamsType>(ctx, shader, inputs.data.data(), numInputs,
705+
output, params);
706+
}
707+
708+
/*
709+
* CreateKernel with single input case (convienence function)
710+
*/
711+
template <typename ParamsType = NoParam>
712+
Kernel CreateKernel(GPUContext &ctx, const ShaderCode &shader,
713+
const GPUTensor &input, const GPUTensor &output,
714+
const ParamsType &params = ParamsType{}) {
715+
return CreateKernel(ctx, shader, &input, 1, output, params);
716+
}
717+
681718
MultiKernel CreateMultiKernel(GPUContext &ctx, const MultiKernelDesc &desc) {
682719
WGPUDevice device = ctx.device;
683720
WGPUQueue queue = ctx.queue;

0 commit comments

Comments
 (0)