AnswerDotAI
diff --git a/‎Makefile
+1-1 b/‎Makefile
+1-1
diff --git a/‎examples/gpu_puzzles/run.cpp
+11-7 b/‎examples/gpu_puzzles/run.cpp
+11-7
diff --git a/‎examples/hello_world/run.cpp
+7-3 b/‎examples/hello_world/run.cpp
+7-3
diff --git a/‎examples/physics/run.cpp
+6-7 b/‎examples/physics/run.cpp
+6-7
diff --git a/‎examples/render/run.cpp
+10-9 b/‎examples/render/run.cpp
+10-9
@@ -38,7 +38,7 @@ watch-tests: check-entr check-dependencies
 all: build
 	cd examples/gpu_puzzles && make
 	cd examples/hello_world && make
-	cd examples/raymarch && make
+	cd examples/render && make
 	cd examples/webgpu_intro && make
 
 clean-build:
 
@@ -7,6 +7,7 @@
 #include "utils/array_utils.h"
 #include <array>
 #include <cstdio>
+#include <future>
 
 using namespace gpu;
 
@@ -21,9 +22,12 @@ template <size_t N> std::array<float, N> makeData() {
 }
 
 template <size_t N, size_t R = N, size_t C = 1> void showResult(Context &ctx, Kernel &op, Tensor &output) { // Dispatches `op`, waits, copies `output` to the host and prints it via show<float, R, C>; N itself is not used in the body.
-  DispatchKernel(ctx, op);
+
+  std::promise<void> promise; // handed to DispatchKernel below
+  std::future<void> future = promise.get_future(); // the handle that Wait() is given
+  DispatchKernel(ctx, op, promise); // presumably fulfills `promise` once the dispatched work completes -- TODO confirm in gpu.h
   std::array<float, R * C> outputArr; // host-side destination for the ToCPU copy
-  Wait(ctx, op.future);
+  Wait(ctx, future); // called before ToCPU reads `output` back
   ToCPU(ctx, output, outputArr.data(), sizeof(outputArr)); // size argument is the full array: R * C floats
   printf("%s", show<float, R, C>(outputArr, "output").c_str()); // show() result is printed verbatim, labelled "output"
 }
@@ -48,7 +52,7 @@ void puzzle1(Context &ctx) {
   printf("\n\nPuzzle 1\n\n");
   Tensor input = CreateTensor(ctx, {N}, kf32, makeData<N>().data());
   Tensor output = CreateTensor(ctx, {N}, kf32);
-  Kernel op = CreateKernel(ctx, CreateShader(kPuzzle1, N), input, output,
+  Kernel op = CreateKernel(ctx, CreateShader(kPuzzle1, N), TensorList{input, output},
                            /*nthreads*/ {N, 1, 1});
   showResult<N>(ctx, op, output);
 }
@@ -75,8 +79,8 @@ void puzzle2(Context &ctx) {
   Tensor a = CreateTensor(ctx, {N}, kf32, makeData<N>().data());
   Tensor b = CreateTensor(ctx, {N}, kf32, makeData<N>().data());
   Tensor output = CreateTensor(ctx, {N}, kf32);
-  Kernel op = CreateKernel(ctx, CreateShader(kPuzzle2, 256), Tensors{a, b},
-                           output, {N, 1, 1});
+  Kernel op = CreateKernel(ctx, CreateShader(kPuzzle2, 256), TensorList{a, b, output},
+                           {N, 1, 1});
   showResult<N>(ctx, op, output);
 }
 
@@ -101,7 +105,7 @@ void puzzle3(Context &ctx) {
   Tensor input = CreateTensor(ctx, {N}, kf32, makeData<N>().data());
   Tensor output = CreateTensor(ctx, {N}, kf32);
   Kernel op =
-      CreateKernel(ctx, CreateShader(kPuzzle3, 4), input, output, {N, 1, 1});
+      CreateKernel(ctx, CreateShader(kPuzzle3, 4), TensorList{input, output}, {N, 1, 1});
   showResult<N>(ctx, op, output);
 }
 
@@ -135,7 +139,7 @@ void puzzle4(Context &ctx) {
   };
   Kernel op =
       CreateKernel(ctx, CreateShader(kPuzzle4, /*workgroup size*/ {N, N, 1}),
-                   input, output, {N, N, 1}, Params{N});
+                   TensorList{input, output}, {N, N, 1}, Params{N});
   showResult<N, N, N>(ctx, op, output);
 }
 
 
@@ -1,6 +1,7 @@
 #include "gpu.h"
 #include <array>
 #include <cstdio>
+#include <future>
 
 using namespace gpu; // CreateContext, CreateTensor, CreateKernel,
                      // CreateShader, DispatchKernel, Wait, ToCPU
@@ -10,6 +11,7 @@ static const char *kGelu = R"(
 const GELU_SCALING_FACTOR: f32 = 0.7978845608028654; // sqrt(2.0 / PI)
 @group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
 @group(0) @binding(1) var<storage, read_write> out: array<{{precision}}>;
+@group(0) @binding(1) var<storage, read_write> dummy: array<{{precision}}>;
 @compute @workgroup_size({{workgroupSize}})
 fn main(
     @builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
@@ -33,10 +35,12 @@ int main(int argc, char **argv) {
   }
   Tensor input = CreateTensor(ctx, Shape{N}, kf32, inputArr.data());
   Tensor output = CreateTensor(ctx, Shape{N}, kf32);
-  Kernel op = CreateKernel(ctx, CreateShader(kGelu, 256, kf32), input, output,
+  std::promise<void> promise;
+  std::future<void> future = promise.get_future();
+  Kernel op = CreateKernel(ctx, CreateShader(kGelu, 256, kf32), TensorList{input, output},
                            /* nthreads */ {N, 1, 1});
-  DispatchKernel(ctx, op);
-  Wait(ctx, op.future);
+  DispatchKernel(ctx, op, promise);
+  Wait(ctx, future);
   ToCPU(ctx, output, outputArr.data(), sizeof(outputArr));
   for (int i = 0; i < 32; ++i) {
     printf("out[%d] : gelu(%.2f) = %.2f\n", i, inputArr[i], outputArr[i]);
 
@@ -2,6 +2,7 @@
 #include <array>
 #include <chrono>
 #include <cstdio>
+#include <future>
 
 using namespace gpu; // CreateContext, CreateTensor, CreateKernel,
                      // CreateShader, DispatchKernel, Wait, ToCPU
@@ -43,21 +44,19 @@ int main() {
   Tensor length = CreateTensor(ctx, Shape{N}, kf32, lengthArr.data());
   Tensor mass = CreateTensor(ctx, Shape{N}, kf32, massArr.data());
 
-  // TODO: no need to have output
-  Tensor output = CreateTensor(ctx, Shape{N}, kf32);
-
   Shape nThreads{N, 1, 1};
   Kernel update = CreateKernel(
       ctx, CreateShader(kShaderSimulation, 256, kf32),
       TensorList{pos1, vel1, pos2, vel2,
-       length, mass}, output,
+       length, mass}, 
       nThreads);
   while (true) {
     auto start = std::chrono::high_resolution_clock::now();
     ResetCommandBuffer(ctx.device, nThreads, update);
-
-    DispatchKernel(ctx, update);
-    Wait(ctx, update.future);
+    std::promise<void> promise;
+    std::future<void> future = promise.get_future();
+    DispatchKernel(ctx, update, promise);
+    Wait(ctx, future);
     auto end = std::chrono::high_resolution_clock::now();
     std::chrono::duration<double> elapsed = end - start;
     std::this_thread::sleep_for(std::chrono::milliseconds(16) - elapsed);
 
@@ -1,6 +1,7 @@
 #include <array>
 #include <chrono>
 #include <cstdio>
+#include <future>
 
 #include "gpu.h"
 #include "utils/array_utils.h"
@@ -119,16 +120,18 @@ int main(int argc, char **argv) {
 
   ShaderCode shader = CreateShader(kSDF, Shape{16, 16, 1});
   Kernel renderKernel =
-      CreateKernel(ctx, shader, {}, 0, devScreen, {NCOLS, NROWS, 1}, params);
+      CreateKernel(ctx, shader, TensorList{devScreen}, {NCOLS, NROWS, 1}, params);
   while (true) {
-    DispatchKernel(ctx, renderKernel);
-    Wait(ctx, renderKernel.future);
+    std::promise<void> promise;
+    std::future<void> future = promise.get_future();
+    DispatchKernel(ctx, renderKernel, promise);
+    Wait(ctx, future);
     ToCPU(ctx, devScreen, screen.data(), sizeof(screen));
-    // Update the time field, write pparams to GPU, and create a new command
-    // buffer
     params.time = getCurrentTimeInMilliseconds() - zeroTime;
+
+    // Write the updated params into the last of the kernel's bound buffers
     wgpuQueueWriteBuffer(ctx.queue,
-                         renderKernel.buffers[renderKernel.numBuffers - 1], 0,
+                         renderKernel.buffers[renderKernel.numBindings - 1], 0,
                          static_cast<void *>(&params), sizeof(params));
     ResetCommandBuffer(ctx.device, /*nthreads*/ {NCOLS, NROWS, 1},
                        renderKernel);
@@ -137,15 +140,14 @@ int main(int argc, char **argv) {
                                     "\\|()1{}[]?-_+~<>i!lI;:,\"^`'. ";
     // static const char intensity[] = "@%#8$X71x*+=-:^~'.` ";
 
-    // normalize values
+    // Intensity is treated as a depth map: normalize to [min, max], a range chosen to focus on the depth of the objects
     float min = 0.0;
     float max = params.sphereRadius * 3;
 
     for (size_t i = 0; i < screen.size(); ++i) {
       screen[i] = (screen[i] - min) / (max - min);
     }
 
-    // index into intensity array
     std::array<char, screen.size()> raster;
     for (size_t i = 0; i < screen.size(); ++i) {
       size_t index =
@@ -155,7 +157,6 @@ int main(int argc, char **argv) {
       raster[i] = intensity[index];
     }
 
-    // Draw the raster
     char buffer[(NROWS + 2) * (NCOLS + 2)];
     char *offset = buffer;
     sprintf(offset, "+");