Skip to content

Commit 32828d5

Browse files
committed
API refinement in preparation for release (GPUTensor -> Tensor, GPUContext -> Context for Create* consistency), fix example builds, start docstrings (WIP)
1 parent adf4ce3 commit 32828d5

File tree

7 files changed

+237
-263
lines changed

7 files changed

+237
-263
lines changed

CMakeLists.txt

+5-7
Original file line numberDiff line numberDiff line change
@@ -29,15 +29,11 @@ IF (NOT WEBGPU_TAG)
2929
ENDIF()
3030
message(STATUS "Using WebGPU distribution tag: ${WEBGPU_TAG}")
3131

32-
# TODO - look into dawn version check build issue :(
33-
# ABSL_PROPAGATE_CXX_STD
34-
# https://github.com/google/dawn/blob/59b4421352ada94c98f0f3d63913c117378d970c/CMakeLists.txt#L246C5-L246C27
35-
3632
if (WEBGPU_TAG STREQUAL "dawn")
3733
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWEBGPU_BACKEND_DAWN")
38-
# actually use specific commit
39-
# set(WEBGPU_TAG "1025b977e1927b6d0327e67352f90feb4bcf8274") # prev commit
40-
# set(WEBGPU_TAG "acf972b7b909f52e183bdae3971b93bb13d4a29e") # latest commit
34+
# use specific commit
35+
# set(WEBGPU_TAG "1025b977e1927b6d0327e67352f90feb4bcf8274")
36+
# set(WEBGPU_TAG "acf972b7b909f52e183bdae3971b93bb13d4a29e")
4137
# add_compile_options(-UABSL_INTERNAL_AT_LEAST_CXX20)
4238
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -UABSL_INTERNAL_AT_LEAST_CXX20")
4339
message(STATUS "Using Dawn backend")
@@ -92,3 +88,5 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
9288
set(SRC_LIB gpu.h nn/shaders.h utils/array_utils.h utils/logging.h)
9389
add_library(gpu SHARED ${SRC_LIB})
9490
set_target_properties(gpu PROPERTIES LINKER_LANGUAGE CXX)
91+
92+
# For additional targets see directories under `examples/`, which have their own CMakeLists.txt

examples/gpu_puzzles/run.cpp

+77-38
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,40 @@
1-
/*
2-
* WIP implementation of Sasha Rush's GPU puzzles https://github.com/srush/GPU-Puzzles
1+
/*
2+
* WIP implementation of Sasha Rush's GPU puzzles
3+
* https://github.com/srush/GPU-Puzzles
34
*/
45

5-
#include <array>
6-
#include <cstdio>
76
#include "gpu.h"
87
#include "utils/array_utils.h"
8+
#include <array>
9+
#include <cstdio>
910

1011
using namespace gpu;
1112

1213
static constexpr size_t N = 3072;
1314

14-
template <size_t N>
15-
std::array<float, N> makeData() {
15+
template <size_t N> std::array<float, N> makeData() {
1616
std::array<float, N> inputArr;
1717
for (int i = 0; i < N; ++i) {
1818
inputArr[i] = static_cast<float>(i); // dummy input data
1919
}
2020
return inputArr;
2121
}
2222

23-
template <size_t N>
24-
void showResult(GPUContext& ctx, Kernel& op, GPUTensor& output) {
23+
template <size_t N, size_t R = N, size_t C = 1> void showResult(Context &ctx, Kernel &op, Tensor &output) {
2524
DispatchKernel(ctx, op);
26-
std::array<float, N> outputArr;
25+
std::array<float, R * C> outputArr;
2726
Wait(ctx, op.future);
2827
ToCPU(ctx, output, outputArr.data(), sizeof(outputArr));
29-
fprintf(stdout, "%s", show<float, N, 1>(outputArr, "output").c_str());
28+
printf("%s", show<float, R, C>(outputArr, "output").c_str());
3029
}
3130

3231
// Puzzle 1 : Map
3332
// Implement a "kernel" (GPU function) that adds 10 to each position of vector
3433
// a and stores it in vector out. You have 1 thread per position.
35-
const char *kPuzzle1_Map= R"(
34+
const char *kPuzzle1 = R"(
3635
@group(0) @binding(0) var<storage, read_write> input: array<f32>;
3736
@group(0) @binding(1) var<storage, read_write> output : array<f32>;
38-
@compute @workgroup_size(256)
37+
@compute @workgroup_size({{workgroupSize}})
3938
fn main(
4039
@builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
4140
let idx = GlobalInvocationID.x;
@@ -45,23 +44,23 @@ fn main(
4544
}
4645
)";
4746

48-
void puzzle1(GPUContext& ctx) {
49-
fprintf(stdout, "\n\nPuzzle 1\n\n");
50-
GPUTensor input = CreateTensor(ctx, {N}, kf32, makeData<N>().data());
51-
GPUTensor output = CreateTensor(ctx, {N}, kf32);
52-
Kernel op =
53-
CreateKernel(ctx, ShaderCode{kPuzzle1_Map, 256}, input, output);
47+
void puzzle1(Context &ctx) {
48+
printf("\n\nPuzzle 1\n\n");
49+
Tensor input = CreateTensor(ctx, {N}, kf32, makeData<N>().data());
50+
Tensor output = CreateTensor(ctx, {N}, kf32);
51+
Kernel op = CreateKernel(ctx, CreateShader(kPuzzle1, N), input, output,
52+
/*nthreads*/ {N, 1, 1});
5453
showResult<N>(ctx, op, output);
5554
}
5655

5756
// Puzzle 2 : Zip
5857
// Implement a kernel that adds together each position of a and b and stores it
5958
// in out. You have 1 thread per position.
60-
const char *kPuzzle2_Map= R"(
59+
const char *kPuzzle2 = R"(
6160
@group(0) @binding(0) var<storage, read_write> a: array<f32>;
6261
@group(0) @binding(1) var<storage, read_write> b: array<f32>;
6362
@group(0) @binding(2) var<storage, read_write> output : array<f32>;
64-
@compute @workgroup_size(256)
63+
@compute @workgroup_size({{workgroupSize}})
6564
fn main(
6665
@builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
6766
let idx = GlobalInvocationID.x;
@@ -71,24 +70,23 @@ fn main(
7170
}
7271
)";
7372

74-
void puzzle2(GPUContext& ctx) {
75-
fprintf(stdout, "\n\nPuzzle 2\n\n");
76-
GPUTensor a = CreateTensor(ctx, {N}, kf32, makeData<N>().data());
77-
GPUTensor b = CreateTensor(ctx, {N}, kf32, makeData<N>().data());
78-
GPUTensor output = CreateTensor(ctx, {N}, kf32);
79-
Kernel op =
80-
CreateKernel(ctx, ShaderCode{kPuzzle2_Map, 256}, GPUTensors{a, b}, output);
73+
void puzzle2(Context &ctx) {
74+
printf("\n\nPuzzle 2\n\n");
75+
Tensor a = CreateTensor(ctx, {N}, kf32, makeData<N>().data());
76+
Tensor b = CreateTensor(ctx, {N}, kf32, makeData<N>().data());
77+
Tensor output = CreateTensor(ctx, {N}, kf32);
78+
Kernel op = CreateKernel(ctx, CreateShader(kPuzzle2, 256), Tensors{a, b},
79+
output, {N, 1, 1});
8180
showResult<N>(ctx, op, output);
8281
}
8382

84-
8583
// Puzzle 3 : Guards
8684
// Implement a kernel that adds 10 to each position of a and stores it in out.
8785
// You have more threads than positions.
88-
const char *kPuzzle3_Map= R"(
86+
const char *kPuzzle3 = R"(
8987
@group(0) @binding(0) var<storage, read_write> input: array<f32>;
9088
@group(0) @binding(1) var<storage, read_write> output : array<f32>;
91-
@compute @workgroup_size(4)
89+
@compute @workgroup_size({{workgroupSize}})
9290
fn main(
9391
@builtin(global_invocation_id) GlobalInvocationID: vec3<u32>
9492
) {
@@ -98,31 +96,72 @@ fn main(
9896
}
9997
}
10098
)";
101-
void puzzle3(GPUContext& ctx) {
102-
fprintf(stdout, "\n\nPuzzle 3\n\n");
103-
GPUTensor input = CreateTensor(ctx, {N}, kf32, makeData<N>().data());
104-
GPUTensor output = CreateTensor(ctx, {N}, kf32);
99+
void puzzle3(Context &ctx) {
100+
printf("\n\nPuzzle 3\n\n");
101+
Tensor input = CreateTensor(ctx, {N}, kf32, makeData<N>().data());
102+
Tensor output = CreateTensor(ctx, {N}, kf32);
105103
Kernel op =
106-
CreateKernel(ctx, ShaderCode{kPuzzle3_Map, 4}, input, output);
104+
CreateKernel(ctx, CreateShader(kPuzzle3, 4), input, output, {N, 1, 1});
107105
showResult<N>(ctx, op, output);
108106
}
109107

110108
// Puzzle 4 : Map 2D
111109
// Implement a kernel that adds 10 to each position of a and stores it in out.
112110
// Input a is 2D and square. You have more threads than positions.
113-
// TODO
111+
const char *kPuzzle4 = R"(
112+
@group(0) @binding(0) var<storage, read_write> input: array<f32>;
113+
@group(0) @binding(1) var<storage, read_write> output : array<f32>;
114+
@group(0) @binding(2) var<uniform> params: Params;
115+
struct Params {
116+
size: u32, // input is size x size
117+
};
118+
@compute @workgroup_size({{workgroupSize}})
119+
fn main(
120+
@builtin(global_invocation_id) GlobalInvocationID: vec3<u32>
121+
) {
122+
let idx = GlobalInvocationID.x + GlobalInvocationID.y * params.size;
123+
if (idx < arrayLength(&input)) {
124+
output[idx] = input[idx] + 10;
125+
}
126+
}
127+
)";
128+
void puzzle4(Context &ctx) {
129+
printf("\n\nPuzzle 4\n\n");
130+
static constexpr size_t N = 9;
131+
Tensor input = CreateTensor(ctx, {N, N}, kf32, makeData<N * N>().data());
132+
Tensor output = CreateTensor(ctx, {N, N}, kf32);
133+
struct Params {
134+
uint32_t size = N;
135+
};
136+
Kernel op =
137+
CreateKernel(ctx, CreateShader(kPuzzle4, /*workgroup size*/ {N, N, 1}),
138+
input, output, {N, N, 1}, Params{N});
139+
showResult<N, N, N>(ctx, op, output);
140+
}
114141

115142
// Puzzle 5 : Broadcast
116143
// Implement a kernel that adds a and b and stores it in out. Inputs a and b
117144
// are vectors. You have more threads than positions.
118-
// TODO
145+
const char *kPuzzle5_Broadcast = R"(
146+
@group(0) @binding(0) var<storage, read_write> a: array<f32>;
147+
@group(0) @binding(1) var<storage, read_write> b: array<f32>;
148+
@group(0) @binding(2) var<storage, read_write> output : array<f32>;
149+
@compute @workgroup_size({{workgroupSize}})
150+
fn main(
151+
@builtin(global_invocation_id) GlobalInvocationID: vec3<u32>
152+
) {
153+
// TODO
154+
}
155+
)";
119156

157+
// TODO
120158
// ...
121159

122160
int main(int argc, char **argv) {
123-
GPUContext ctx = CreateContext();
161+
Context ctx = CreateContext();
124162
puzzle1(ctx);
125163
puzzle2(ctx);
126164
puzzle3(ctx);
165+
puzzle4(ctx);
127166
return 0;
128167
}

examples/hello_world/run.cpp

+4-4
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
using namespace gpu; // CreateContext, CreateTensor, CreateKernel,
66
// CreateShader, DispatchKernel, Wait, ToCPU
7-
// GPUTensor, Kernel, GPUContext, Shape, kf32
7+
// Tensor, Kernel, Context, Shape, kf32
88

99
static const char *kGelu = R"(
1010
const GELU_SCALING_FACTOR: f32 = 0.7978845608028654; // sqrt(2.0 / PI)
@@ -25,14 +25,14 @@ fn main(
2525

2626
int main(int argc, char **argv) {
2727
printf("\nHello, gpu.cpp\n\n");
28-
GPUContext ctx = CreateContext();
28+
Context ctx = CreateContext();
2929
static constexpr size_t N = 3072;
3030
std::array<float, N> inputArr, outputArr;
3131
for (int i = 0; i < N; ++i) {
3232
inputArr[i] = static_cast<float>(i) / 2.0; // dummy input data
3333
}
34-
GPUTensor input = CreateTensor(ctx, Shape{N}, kf32, inputArr.data());
35-
GPUTensor output = CreateTensor(ctx, Shape{N}, kf32);
34+
Tensor input = CreateTensor(ctx, Shape{N}, kf32, inputArr.data());
35+
Tensor output = CreateTensor(ctx, Shape{N}, kf32);
3636
Kernel op = CreateKernel(ctx, CreateShader(kGelu, 256, kf32), input, output,
3737
/* nthreads */ {N, 1, 1});
3838
DispatchKernel(ctx, op);

examples/render/run.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -113,8 +113,8 @@ int main(int argc, char **argv) {
113113

114114
std::fill(begin(screen), end(screen), 0.0f);
115115

116-
GPUContext ctx = CreateContext();
117-
GPUTensor devScreen = CreateTensor(ctx, {NROWS, NCOLS}, kf32, screen.data());
116+
Context ctx = CreateContext();
117+
Tensor devScreen = CreateTensor(ctx, {NROWS, NCOLS}, kf32, screen.data());
118118
uint32_t zeroTime = getCurrentTimeInMilliseconds();
119119

120120
ShaderCode shader = CreateShader(kSDF, Shape{16, 16, 1});

0 commit comments

Comments (0)