#include "gpu.h"

#include <array>
#include <cstdio>
#include <future>

using namespace gpu; // createContext, createTensor, createKernel,
                     // createShader, dispatchKernel, wait, toCPU
                     // Tensor, Kernel, Context, Shape, kf32
// WGSL compute shader: elementwise out[i] = tan(inp[i]).
// {{precision}} and {{workgroupSize}} are template placeholders filled in at
// kernel-creation time (main() below instantiates them with kf32 and 256).
// The arrayLength guard makes the trailing, partially-filled workgroup safe
// when the buffer size is not a multiple of the workgroup size.
// NOTE(review): `let x: f32` hard-codes f32 even though the arrays are
// {{precision}}-templated — fine for kf32, would need changing for f16.
static const char *kTan = R"(
@group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
@group(0) @binding(1) var<storage, read_write> out: array<{{precision}}>;
@compute @workgroup_size({{workgroupSize}})
fn main(
    @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
    let i: u32 = GlobalInvocationID.x;
    if (i < arrayLength(&inp)) {
        let x: f32 = inp[i];
        out[i] = tan(x);
    }
}
)";
+ int main (int argc, char **argv) {
25
+ printf (" \033 [2J\033 [1;1H" );
26
+ printf (" \n Hello gpu.cpp!\n " );
27
+ printf (" --------------\n\n " );
28
+
29
+ Context ctx = createContext ();
30
+ static constexpr size_t N = 100000 ;
31
+ std::array<float , N> inputArr, outputArr;
32
+ for (int i = 0 ; i < N; ++i) {
33
+ inputArr[i] = static_cast <float >(i) / 10.0 ; // dummy input data
34
+ }
35
+ Tensor input = createTensor (ctx, Shape{N}, kf32, inputArr.data ());
36
+ Tensor output = createTensor (ctx, Shape{N}, kf32);
37
+ std::promise<void > promise;
38
+ std::future<void > future = promise.get_future ();
39
+ Kernel op = createKernel (ctx, {kTan , 256 , kf32},
40
+ Bindings{input, output},
41
+ /* nWorkgroups */ {cdiv (N, 256 ), 1 , 1 });
42
+ dispatchKernel (ctx, op, promise);
43
+ wait (ctx, future);
44
+ toCPU (ctx, output, outputArr.data (), sizeof (outputArr));
45
+ for (int i = 0 ; i < 1000 ; ++i) {
46
+ printf (" tan(%.2f) = %.10f\n " , inputArr[i], outputArr[i]);
47
+ }
48
+ printf (" ...\n\n " );
49
+ printf (" Computed %zu values of tan(x)\n\n " , N);
50
+ return 0 ;
51
+ }