|
| 1 | +#include "gpu.hpp" |
| 2 | +#include "metal_profiler.hpp" |
| 3 | +#include <array> |
| 4 | +#include <cstdio> |
| 5 | +#include <future> |
| 6 | + |
| 7 | +using namespace gpu; // createContext, createTensor, createKernel, |
| 8 | + // createShader, dispatchKernel, wait, toCPU |
| 9 | + // Tensor, Kernel, Context, Shape, kf32 |
| 10 | + |
// WGSL source for an element-wise GELU compute kernel (tanh approximation).
//
// Bindings (group 0):
//   binding 0 - `inp`: input values
//   binding 1 - `out`: output values, written at the same index as the input
//
// `{{precision}}` and `{{workgroupSize}}` are template placeholders; they are
// presumably substituted by the gpu library from the values passed alongside
// this string at kernel creation (the call site passes 256 and kf32).
// NOTE(review): `x` is declared as f32 inside the shader, so a non-f32
// precision would likely not type-check - confirm before changing precision.
//
// Each invocation handles one element and is bounds-checked against the length
// of `inp`, so a dispatch may be rounded up to a whole number of workgroups.
// For x > 10.0 the shader writes x itself instead of evaluating the tanh
// expression.
//
// Exactly one resource variable is declared per binding slot, matching the two
// tensors supplied by the caller.
static const char *kGelu = R"(
const GELU_SCALING_FACTOR: f32 = 0.7978845608028654; // sqrt(2.0 / PI)
@group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
@group(0) @binding(1) var<storage, read_write> out: array<{{precision}}>;
@compute @workgroup_size({{workgroupSize}})
fn main(
    @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
    let i: u32 = GlobalInvocationID.x;
    if (i < arrayLength(&inp)) {
        let x: f32 = inp[i];
        out[i] = select(0.5 * x * (1.0 + tanh(GELU_SCALING_FACTOR
                 * (x + .044715 * x * x * x))), x, x > 10.0);
    }
}
)";
| 27 | + |
| 28 | +int main(int argc, char **argv) { |
| 29 | + printf("\033[2J\033[1;1H"); |
| 30 | + printf("\nHello gpu.cpp!\n"); |
| 31 | + printf("--------------\n\n"); |
| 32 | + |
| 33 | + Context ctx = createContext(); |
| 34 | + static constexpr size_t N = 10000; |
| 35 | + std::array<float, N> inputArr, outputArr; |
| 36 | + for (int i = 0; i < N; ++i) { |
| 37 | + inputArr[i] = static_cast<float>(i) / 10.0; // dummy input data |
| 38 | + } |
| 39 | + Tensor input = createTensor(ctx, Shape{N}, kf32, inputArr.data()); |
| 40 | + Tensor output = createTensor(ctx, Shape{N}, kf32); |
| 41 | + std::promise<void> promise; |
| 42 | + std::future<void> future = promise.get_future(); |
| 43 | + Kernel op = createKernel(ctx, {kGelu, 256, kf32}, |
| 44 | + Bindings{input, output}, |
| 45 | + /* nWorkgroups */ {cdiv(N, 256), 1, 1}); |
| 46 | + startCapture(); |
| 47 | + dispatchKernel(ctx, op, promise); |
| 48 | + wait(ctx, future); |
| 49 | + stopCapture(); |
| 50 | + toCPU(ctx, output, outputArr.data(), sizeof(outputArr)); |
| 51 | + for (int i = 0; i < 12; ++i) { |
| 52 | + printf(" gelu(%.2f) = %.2f\n", inputArr[i], outputArr[i]); |
| 53 | + } |
| 54 | + printf(" ...\n\n"); |
| 55 | + printf("Computed %zu values of GELU(x)\n\n", N); |
| 56 | + return 0; |
| 57 | +} |
0 commit comments