1
/*
 * WIP implementation of Sasha Rush's GPU puzzles
 * https://github.com/srush/GPU-Puzzles
 */
4
5
5
- #include < array>
6
- #include < cstdio>
7
6
#include " gpu.h"
8
7
#include " utils/array_utils.h"
8
+ #include < array>
9
+ #include < cstdio>
9
10
10
11
// All gpu.h entry points used below (Context, Tensor, Kernel, CreateKernel,
// DispatchKernel, ...) live in namespace gpu; pull them in unqualified since
// this is a self-contained demo translation unit.
using namespace gpu;

// Default 1-D problem size shared by puzzles 1-3 (puzzle 4 shadows it locally).
static constexpr size_t N = 3072;
13
14
14
// Builds deterministic dummy input: element i holds float(i).
//
// Template parameter N: number of elements.
// Returns the filled std::array by value (NRVO/move applies).
template <size_t N> std::array<float, N> makeData() {
  std::array<float, N> inputArr;
  // size_t index: the original `int i` compared signed against the unsigned
  // size_t bound N (implementation-defined for N > INT_MAX, and a
  // -Wsign-compare warning otherwise).
  for (size_t i = 0; i < N; ++i) {
    inputArr[i] = static_cast<float>(i); // dummy input data
  }
  return inputArr;
}
22
22
23
- template <size_t N>
24
- void showResult (GPUContext& ctx, Kernel& op, GPUTensor& output) {
23
+ template <size_t N, size_t R = N, size_t C = 1 > void showResult (Context &ctx, Kernel &op, Tensor &output) {
25
24
DispatchKernel (ctx, op);
26
- std::array<float , N > outputArr;
25
+ std::array<float , R * C > outputArr;
27
26
Wait (ctx, op.future );
28
27
ToCPU (ctx, output, outputArr.data (), sizeof (outputArr));
29
- fprintf (stdout, " %s" , show<float , N, 1 >(outputArr, " output" ).c_str ());
28
+ printf ( " %s" , show<float , R, C >(outputArr, " output" ).c_str ());
30
29
}
31
30
32
31
// Puzzle 1 : Map
33
32
// Implement a "kernel" (GPU function) that adds 10 to each position of vector
34
33
// a and stores it in vector out. You have 1 thread per position.
35
- const char *kPuzzle1_Map = R"(
34
+ const char *kPuzzle1 = R"(
36
35
@group(0) @binding(0) var<storage, read_write> input: array<f32>;
37
36
@group(0) @binding(1) var<storage, read_write> output : array<f32>;
38
- @compute @workgroup_size(256 )
37
+ @compute @workgroup_size({{workgroupSize}} )
39
38
fn main(
40
39
@builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
41
40
let idx = GlobalInvocationID.x;
@@ -45,23 +44,23 @@ fn main(
45
44
}
46
45
)" ;
47
46
48
- void puzzle1 (GPUContext& ctx) {
49
- fprintf (stdout, " \n\n Puzzle 1\n\n " );
50
- GPUTensor input = CreateTensor (ctx, {N}, kf32, makeData<N>().data ());
51
- GPUTensor output = CreateTensor (ctx, {N}, kf32);
52
- Kernel op =
53
- CreateKernel (ctx, ShaderCode{ kPuzzle1_Map , 256 }, input, output );
47
+ void puzzle1 (Context & ctx) {
48
+ printf ( " \n\n Puzzle 1\n\n " );
49
+ Tensor input = CreateTensor (ctx, {N}, kf32, makeData<N>().data ());
50
+ Tensor output = CreateTensor (ctx, {N}, kf32);
51
+ Kernel op = CreateKernel (ctx, CreateShader ( kPuzzle1 , N), input, output,
52
+ /* nthreads */ {N, 1 , 1 } );
54
53
showResult<N>(ctx, op, output);
55
54
}
56
55
57
56
// Puzzle 2 : Zip
58
57
// Implement a kernel that adds together each position of a and b and stores it
59
58
// in out. You have 1 thread per position.
60
- const char *kPuzzle2_Map = R"(
59
+ const char *kPuzzle2 = R"(
61
60
@group(0) @binding(0) var<storage, read_write> a: array<f32>;
62
61
@group(0) @binding(1) var<storage, read_write> b: array<f32>;
63
62
@group(0) @binding(2) var<storage, read_write> output : array<f32>;
64
- @compute @workgroup_size(256 )
63
+ @compute @workgroup_size({{workgroupSize}} )
65
64
fn main(
66
65
@builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
67
66
let idx = GlobalInvocationID.x;
@@ -71,24 +70,23 @@ fn main(
71
70
}
72
71
)" ;
73
72
74
- void puzzle2 (GPUContext& ctx) {
75
- fprintf (stdout, " \n\n Puzzle 2\n\n " );
76
- GPUTensor a = CreateTensor (ctx, {N}, kf32, makeData<N>().data ());
77
- GPUTensor b = CreateTensor (ctx, {N}, kf32, makeData<N>().data ());
78
- GPUTensor output = CreateTensor (ctx, {N}, kf32);
79
- Kernel op =
80
- CreateKernel (ctx, ShaderCode{ kPuzzle2_Map , 256 }, GPUTensors{a, b}, output );
73
+ void puzzle2 (Context & ctx) {
74
+ printf ( " \n\n Puzzle 2\n\n " );
75
+ Tensor a = CreateTensor (ctx, {N}, kf32, makeData<N>().data ());
76
+ Tensor b = CreateTensor (ctx, {N}, kf32, makeData<N>().data ());
77
+ Tensor output = CreateTensor (ctx, {N}, kf32);
78
+ Kernel op = CreateKernel (ctx, CreateShader ( kPuzzle2 , 256 ), Tensors{a, b},
79
+ output, {N, 1 , 1 } );
81
80
showResult<N>(ctx, op, output);
82
81
}
83
82
84
-
85
83
// Puzzle 3 : Guards
86
84
// Implement a kernel that adds 10 to each position of a and stores it in out.
87
85
// You have more threads than positions.
88
- const char *kPuzzle3_Map = R"(
86
+ const char *kPuzzle3 = R"(
89
87
@group(0) @binding(0) var<storage, read_write> input: array<f32>;
90
88
@group(0) @binding(1) var<storage, read_write> output : array<f32>;
91
- @compute @workgroup_size(4 )
89
+ @compute @workgroup_size({{workgroupSize}} )
92
90
fn main(
93
91
@builtin(global_invocation_id) GlobalInvocationID: vec3<u32>
94
92
) {
@@ -98,31 +96,72 @@ fn main(
98
96
}
99
97
}
100
98
)" ;
101
- void puzzle3 (GPUContext& ctx) {
102
- fprintf (stdout, " \n\n Puzzle 3\n\n " );
103
- GPUTensor input = CreateTensor (ctx, {N}, kf32, makeData<N>().data ());
104
- GPUTensor output = CreateTensor (ctx, {N}, kf32);
99
+ void puzzle3 (Context & ctx) {
100
+ printf ( " \n\n Puzzle 3\n\n " );
101
+ Tensor input = CreateTensor (ctx, {N}, kf32, makeData<N>().data ());
102
+ Tensor output = CreateTensor (ctx, {N}, kf32);
105
103
Kernel op =
106
- CreateKernel (ctx, ShaderCode{ kPuzzle3_Map , 4 } , input, output);
104
+ CreateKernel (ctx, CreateShader ( kPuzzle3 , 4 ) , input, output, {N, 1 , 1 } );
107
105
showResult<N>(ctx, op, output);
108
106
}
109
107
110
108
// Puzzle 4 : Map 2D
111
109
// Implement a kernel that adds 10 to each position of a and stores it in out.
112
110
// Input a is 2D and square. You have more threads than positions.
113
- // TODO
111
// Puzzle 4 kernel (2-D map): adds 10 to each cell of a size x size grid.
const char *kPuzzle4 = R"(
@group(0) @binding(0) var<storage, read_write> input: array<f32>;
@group(0) @binding(1) var<storage, read_write> output : array<f32>;
@group(0) @binding(2) var<uniform> params: Params;
struct Params {
  size: u32, // input is size x size
};
@compute @workgroup_size({{workgroupSize}})
fn main(
  @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>
) {
  // Guard each axis separately: with more threads than positions, a thread
  // whose x >= size can still produce a flat idx < arrayLength(&input)
  // (it wraps into the next row), so a flat bounds check alone would write
  // the wrong element.
  if (GlobalInvocationID.x < params.size && GlobalInvocationID.y < params.size) {
    let idx = GlobalInvocationID.x + GlobalInvocationID.y * params.size;
    output[idx] = input[idx] + 10;
  }
}
)";
128
// Puzzle 4 driver: 2-D map over an N x N (9 x 9) grid, one thread per cell.
void puzzle4(Context &ctx) {
  printf("\n\nPuzzle 4\n\n");
  static constexpr size_t N = 9; // shadows the file-level N; grid is 9 x 9
  Tensor input = CreateTensor(ctx, {N, N}, kf32, makeData<N * N>().data());
  Tensor output = CreateTensor(ctx, {N, N}, kf32);
  // Uniform parameter block matching the shader's `params` binding
  // (binding 2): carries the square's side length.
  struct Params {
    uint32_t size = N;
  };
  Kernel op =
      CreateKernel(ctx, CreateShader(kPuzzle4, /* workgroup size*/ {N, N, 1}),
                   input, output, {N, N, 1}, Params{N});
  // R = C = N so the result prints as an N x N grid.
  showResult<N, N, N>(ctx, op, output);
}
114
141
115
142
// Puzzle 5 : Broadcast
116
143
// Implement a kernel that adds a and b and stores it in out. Inputs a and b
117
144
// are vectors. You have more threads than positions.
118
- // TODO
145
// Puzzle 5 kernel (broadcast add) scaffold: body is still TODO, but the
// surrounding WGSL must parse.
const char *kPuzzle5_Broadcast = R"(
@group(0) @binding(0) var<storage, read_write> a: array<f32>;
@group(0) @binding(1) var<storage, read_write> b: array<f32>;
@group(0) @binding(2) var<storage, read_write> output : array<f32>;
@compute @workgroup_size({{workgroupSize}})
fn main(
  @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>
) {
  // TODO
}
)";
119
156
157
+ // TODO
120
158
// ...
121
159
122
160
// Entry point: creates one GPU context and runs every implemented puzzle
// against it in order. argc/argv are currently unused (no CLI options).
int main(int argc, char **argv) {
  Context ctx = CreateContext();
  puzzle1(ctx);
  puzzle2(ctx);
  puzzle3(ctx);
  puzzle4(ctx);
  return 0;
}