/*
 * WIP implementation of Sasha Rush's GPU puzzles https://github.com/srush/GPU-Puzzles
 */

#include <array>
#include <cstddef>
#include <cstdio>
#include "gpu.h"
#include "utils/array_utils.h"

using namespace gpu;

static constexpr size_t N = 3072;

template <size_t N>
std::array<float, N> makeData() {
  std::array<float, N> inputArr;
  for (size_t i = 0; i < N; ++i) {
    inputArr[i] = static_cast<float>(i); // dummy input data
  }
  return inputArr;
}

// Dispatch the kernel, wait for it to complete, copy the result back to the
// host, and print it.
template <size_t N>
void showResult(GPUContext& ctx, Kernel& op, GPUTensor& output) {
  DispatchKernel(ctx, op);
  std::array<float, N> outputArr;
  Wait(ctx, op.future);
  ToCPU(ctx, output, outputArr.data(), sizeof(outputArr));
  fprintf(stdout, "%s", show<float, N, 1>(outputArr, "output").c_str());
}

// Puzzle 1 : Map
// Implement a "kernel" (GPU function) that adds 10 to each position of vector
// a and stores it in vector out. You have 1 thread per position.
const char *kPuzzle1_Map = R"(
@group(0) @binding(0) var<storage, read_write> input: array<f32>;
@group(0) @binding(1) var<storage, read_write> output : array<f32>;
@compute @workgroup_size(256)
fn main(
  @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
  let idx = GlobalInvocationID.x;
  if (idx < arrayLength(&input)) {
    output[idx] = input[idx] + 10;
  }
}
)";

void puzzle1(GPUContext& ctx) {
  fprintf(stdout, "\n\nPuzzle 1\n\n");
  GPUTensor input = CreateTensor(ctx, {N}, kf32, makeData<N>().data());
  GPUTensor output = CreateTensor(ctx, {N}, kf32);
  Kernel op =
      CreateKernel(ctx, ShaderCode{kPuzzle1_Map, 256}, input, output);
  showResult<N>(ctx, op, output);
}

// Puzzle 2 : Zip
// Implement a kernel that adds together each position of a and b and stores
// it in out. You have 1 thread per position.
const char *kPuzzle2_Zip = R"(
@group(0) @binding(0) var<storage, read_write> a: array<f32>;
@group(0) @binding(1) var<storage, read_write> b: array<f32>;
@group(0) @binding(2) var<storage, read_write> output : array<f32>;
@compute @workgroup_size(256)
fn main(
  @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
  let idx = GlobalInvocationID.x;
  if (idx < arrayLength(&a)) {
    output[idx] = a[idx] + b[idx];
  }
}
)";

void puzzle2(GPUContext& ctx) {
  fprintf(stdout, "\n\nPuzzle 2\n\n");
  GPUTensor a = CreateTensor(ctx, {N}, kf32, makeData<N>().data());
  GPUTensor b = CreateTensor(ctx, {N}, kf32, makeData<N>().data());
  GPUTensor output = CreateTensor(ctx, {N}, kf32);
  Kernel op =
      CreateKernel(ctx, ShaderCode{kPuzzle2_Zip, 256}, GPUTensors{a, b}, output);
  showResult<N>(ctx, op, output);
}

// Puzzle 3 : Guards
// Implement a kernel that adds 10 to each position of a and stores it in out.
// You have more threads than positions.
const char *kPuzzle3_Map = R"(
@group(0) @binding(0) var<storage, read_write> input: array<f32>;
@group(0) @binding(1) var<storage, read_write> output : array<f32>;
@compute @workgroup_size(4)
fn main(
  @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
  // Stride through the array by the workgroup size (4) so each invocation
  // covers multiple positions.
  for (var i = GlobalInvocationID.x; i < arrayLength(&input); i = i + 4) {
    output[i] = input[i] + 10;
  }
}
)";

void puzzle3(GPUContext& ctx) {
  fprintf(stdout, "\n\nPuzzle 3\n\n");
  GPUTensor input = CreateTensor(ctx, {N}, kf32, makeData<N>().data());
  GPUTensor output = CreateTensor(ctx, {N}, kf32);
  Kernel op =
      CreateKernel(ctx, ShaderCode{kPuzzle3_Map, 4}, input, output);
  showResult<N>(ctx, op, output);
}

// Puzzle 4 : Map 2D
// Implement a kernel that adds 10 to each position of a and stores it in out.
// Input a is 2D and square. You have more threads than positions.
// TODO: host-side setup not implemented yet; an untested shader sketch follows.
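
// A minimal, untested sketch of what the Puzzle 4 shader could look like,
// assuming a 16x16 workgroup, the square's side length baked in as a
// placeholder module constant, and the same binding layout as the puzzles
// above. How to request a 2D workgroup grid from CreateKernel is left open.
const char *kPuzzle4_Map2D = R"(
@group(0) @binding(0) var<storage, read_write> input: array<f32>;
@group(0) @binding(1) var<storage, read_write> output : array<f32>;
// Side length of the square input; placeholder value until the host side is
// wired up.
const sideLen: u32 = 32u;
@compute @workgroup_size(16, 16)
fn main(
  @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
  let row = GlobalInvocationID.y;
  let col = GlobalInvocationID.x;
  // Guard: there are more threads than positions, so skip out-of-range ones.
  if (row < sideLen && col < sideLen) {
    output[row * sideLen + col] = input[row * sideLen + col] + 10;
  }
}
)";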

// Puzzle 5 : Broadcast
// Implement a kernel that adds a and b and stores it in out. Inputs a and b
// are vectors. You have more threads than positions.
// TODO: host-side setup not implemented yet; see the sketch below.
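
// Likewise an untested sketch for Puzzle 5: broadcast-add vector a against
// vector b into a row-major 2D output, out[row][col] = a[row] + b[col].
// The 2D dispatch and output sizing on the host side are assumptions and
// not implemented yet.
const char *kPuzzle5_Broadcast = R"(
@group(0) @binding(0) var<storage, read_write> a: array<f32>;
@group(0) @binding(1) var<storage, read_write> b: array<f32>;
@group(0) @binding(2) var<storage, read_write> output : array<f32>;
@compute @workgroup_size(16, 16)
fn main(
  @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
  let row = GlobalInvocationID.x;
  let col = GlobalInvocationID.y;
  // Guard against the extra threads.
  if (row < arrayLength(&a) && col < arrayLength(&b)) {
    output[row * arrayLength(&b) + col] = a[row] + b[col];
  }
}
)";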

// ...

int main(int argc, char **argv) {
  GPUContext ctx = CreateGPUContext();
  puzzle1(ctx);
  puzzle2(ctx);
  puzzle3(ctx);
  return 0;
}