@@ -285,51 +285,55 @@ Kernel createSumKernel(Context& ctx, Tensor& input, Tensor& output, size_t size)
285
285
return createKernel (ctx, {kSum , num_threads, kf32}, Bindings{input, output}, {size_x, size_y, 1 });
286
286
}
287
287
288
- float sum_gpu (Context& ctx, const float * data, const float * buffer, size_t size) {
289
- WGPURequiredLimits requiredLimits = LIMITS;
290
- uint32_t num_threads = 1024 ;
291
- int nSum = round (log2 (size) / log2 (num_threads));
292
- int input_size = size;
293
- unsigned long output_size = size;
288
+ struct SumKernel {
294
289
std::vector<Tensor> outputs;
295
290
std::vector<Kernel> ops;
296
- outputs.push_back (createTensor (ctx, Shape{std::max (size, static_cast <unsigned long >(1024 *2 ))}, kf32));
297
- for (int i=size,j=0 ;i>0 ;i/=num_threads,j++){
298
- output_size = (output_size + num_threads - 1 ) / num_threads;
299
- outputs.push_back (createTensor (ctx, Shape{std::max (output_size, static_cast <unsigned long >(1024 *2 ))}, kf32));
300
- ops.push_back (createSumKernel (ctx, outputs[j], outputs[j+1 ], input_size));
301
- // printf("size: %d\n", input_size);
302
- input_size = output_size;
303
- }
304
- toGPU (ctx, data, outputs[0 ], size * sizeof (float ));
305
-
306
-
307
- {
291
+ SumKernel (Context& ctx, size_t size) {
292
+ uint32_t num_threads = 1024 ;
293
+ int nSum = round (log2 (size) / log2 (num_threads));
294
+ int input_size = size;
295
+ unsigned long output_size = size;
296
+ outputs.push_back (createTensor (ctx, Shape{std::max (size, static_cast <unsigned long >(num_threads*2 ))}, kf32));
308
297
for (int i=size,j=0 ;i>0 ;i/=num_threads,j++){
298
+ output_size = (output_size + num_threads - 1 ) / num_threads;
299
+ outputs.push_back (createTensor (ctx, Shape{std::max (output_size, static_cast <unsigned long >(num_threads*2 ))}, kf32));
300
+ ops.push_back (createSumKernel (ctx, outputs[j], outputs[j+1 ], input_size));
301
+ input_size = output_size;
302
+ }
303
+ }
304
+ void dispatchKernel (Context& ctx) {
305
+ for (int i=0 ;i<ops.size ();i++){
309
306
std::promise<void > promise;
310
307
std::future<void > future = promise.get_future ();
311
- dispatchKernel (ctx, ops[j ], promise);
308
+ gpu:: dispatchKernel (ctx, ops[i ], promise);
312
309
wait (ctx, future);
313
- resetCommandBuffer (ctx.device , ops[j ]);
310
+ resetCommandBuffer (ctx.device , ops[i ]);
314
311
}
315
312
}
313
+ void toGPU (Context& ctx, const float * data, size_t size) {
314
+ gpu::toGPU (ctx, data, outputs[0 ], size);
315
+ }
316
+ void toCPU (Context& ctx, float * data, size_t size) {
317
+ gpu::toCPU (ctx, outputs[outputs.size ()-1 ], data, size);
318
+ }
319
+ };
320
+
321
+ float sum_gpu (Context& ctx, const float * data, float * buffer, size_t size) {
322
+ WGPURequiredLimits requiredLimits = LIMITS;
323
+ SumKernel sumKernel (ctx, size);
324
+ sumKernel.toGPU (ctx, data, size * sizeof (float ));
325
+ sumKernel.dispatchKernel (ctx);
316
326
317
327
{
318
328
int nIter = 100 ;
319
329
DurationTime dt (" GPU" , true , nIter);
320
330
for (int t = 0 ; t < nIter; t++){
321
- for (int i=size,j=0 ;i>0 ;i/=num_threads,j++){
322
- std::promise<void > promise;
323
- std::future<void > future = promise.get_future ();
324
- dispatchKernel (ctx, ops[j], promise);
325
- wait (ctx, future);
326
- resetCommandBuffer (ctx.device , ops[j]);
327
- }
331
+ sumKernel.dispatchKernel (ctx);
328
332
}
329
333
}
330
334
331
335
float r = 0 ;
332
- toCPU (ctx, outputs[outputs. size ()- 1 ], ( void *) buffer, 4 * sizeof (float ));
336
+ sumKernel. toCPU (ctx, buffer, 4 * sizeof (float ));
333
337
334
338
return buffer[0 ];
335
339
}
@@ -363,6 +367,7 @@ float sum_gpu(Context& ctx, const float* data, const float* buffer, size_t size)
363
367
// return r;
364
368
// }
365
369
370
+
366
371
int main (int argc, char **argv) {
367
372
static constexpr size_t M = 4096 *2 ;
368
373
static constexpr size_t N = 4096 *2 ;
0 commit comments