Skip to content

Commit 92fa90d

Browse files
Add SumKernel
1 parent 21d909f commit 92fa90d

File tree

1 file changed

+33
-28
lines changed

1 file changed

+33
-28
lines changed

experimental/kernels/reduce.cpp

+33-28
Original file line numberDiff line numberDiff line change
@@ -285,51 +285,55 @@ Kernel createSumKernel(Context& ctx, Tensor& input, Tensor& output, size_t size)
285285
return createKernel(ctx, {kSum, num_threads, kf32}, Bindings{input, output}, {size_x, size_y, 1});
286286
}
287287

288-
float sum_gpu(Context& ctx, const float* data, const float* buffer, size_t size) {
289-
WGPURequiredLimits requiredLimits = LIMITS;
290-
uint32_t num_threads = 1024;
291-
int nSum = round(log2(size) / log2(num_threads));
292-
int input_size = size;
293-
unsigned long output_size = size;
288+
struct SumKernel {
294289
std::vector<Tensor> outputs;
295290
std::vector<Kernel> ops;
296-
outputs.push_back(createTensor(ctx, Shape{std::max(size, static_cast<unsigned long>(1024*2))}, kf32));
297-
for(int i=size,j=0;i>0;i/=num_threads,j++){
298-
output_size = (output_size + num_threads - 1) / num_threads;
299-
outputs.push_back(createTensor(ctx, Shape{std::max(output_size, static_cast<unsigned long>(1024*2))}, kf32));
300-
ops.push_back(createSumKernel(ctx, outputs[j], outputs[j+1], input_size));
301-
// printf("size: %d\n", input_size);
302-
input_size = output_size;
303-
}
304-
toGPU(ctx, data, outputs[0], size * sizeof(float));
305-
306-
307-
{
291+
SumKernel(Context& ctx, size_t size) {
292+
uint32_t num_threads = 1024;
293+
int nSum = round(log2(size) / log2(num_threads));
294+
int input_size = size;
295+
unsigned long output_size = size;
296+
outputs.push_back(createTensor(ctx, Shape{std::max(size, static_cast<unsigned long>(num_threads*2))}, kf32));
308297
for(int i=size,j=0;i>0;i/=num_threads,j++){
298+
output_size = (output_size + num_threads - 1) / num_threads;
299+
outputs.push_back(createTensor(ctx, Shape{std::max(output_size, static_cast<unsigned long>(num_threads*2))}, kf32));
300+
ops.push_back(createSumKernel(ctx, outputs[j], outputs[j+1], input_size));
301+
input_size = output_size;
302+
}
303+
}
304+
void dispatchKernel(Context& ctx) {
305+
for(int i=0;i<ops.size();i++){
309306
std::promise<void> promise;
310307
std::future<void> future = promise.get_future();
311-
dispatchKernel(ctx, ops[j], promise);
308+
gpu::dispatchKernel(ctx, ops[i], promise);
312309
wait(ctx, future);
313-
resetCommandBuffer(ctx.device, ops[j]);
310+
resetCommandBuffer(ctx.device, ops[i]);
314311
}
315312
}
313+
void toGPU(Context& ctx, const float* data, size_t size) {
314+
gpu::toGPU(ctx, data, outputs[0], size);
315+
}
316+
void toCPU(Context& ctx, float* data, size_t size) {
317+
gpu::toCPU(ctx, outputs[outputs.size()-1], data, size);
318+
}
319+
};
320+
321+
// Sums `size` floats on the GPU and returns the total.
//
// `data`   : host input of `size` floats.
// `buffer` : caller-provided scratch; receives the first few elements of
//            the final reduction tensor. The result is buffer[0], so the
//            caller must supply room for at least 4 floats.
// Also benchmarks the reduction by re-dispatching it nIter times under a
// scoped timer.
float sum_gpu(Context& ctx, const float* data, float* buffer, size_t size) {
  SumKernel sumKernel(ctx, size);
  sumKernel.toGPU(ctx, data, size * sizeof(float));
  sumKernel.dispatchKernel(ctx);  // warm-up run; result is ready after this

  {
    int nIter = 100;
    DurationTime dt("GPU", true, nIter);  // scoped timer: reports on destruction
    for (int t = 0; t < nIter; t++) {
      sumKernel.dispatchKernel(ctx);
    }
  }

  // Copy back the leading elements of the last tensor; the reduced sum
  // lives at index 0 (the extra elements aid debugging).
  sumKernel.toCPU(ctx, buffer, 4 * sizeof(float));
  return buffer[0];
}
@@ -363,6 +367,7 @@ float sum_gpu(Context& ctx, const float* data, const float* buffer, size_t size)
363367
// return r;
364368
// }
365369

370+
366371
int main(int argc, char **argv) {
367372
static constexpr size_t M = 4096*2;
368373
static constexpr size_t N = 4096*2;

0 commit comments

Comments
 (0)