Skip to content

Commit 16a0dbc

Browse files
Fix the version 1 matmul path in the unit tests
1 parent 34b6d78 commit 16a0dbc

File tree

1 file changed

+61
-68
lines changed

1 file changed

+61
-68
lines changed

experimental/kernels/unittest_llmc/unittest_kernels.cpp

+61-68
Original file line number | Diff line number | Diff line change
@@ -321,7 +321,7 @@ void MATMUL_FORWARD_GPU(float* out,
321321
bool verbose = false;
322322
bool debug = false;
323323
float *out_exp;
324-
DurationTime duration("matmul_forward_gpu with creating context", verbose);
324+
DurationTime duration("matmul_forward_gpu with preparing a kernel", verbose);
325325
if (verbose) {
326326
printf("matmul forward: B=%d, T=%d, C=%d, OC=%d, bias=%d\n", B, T, C, OC, bias != NULL);
327327
}
@@ -341,49 +341,65 @@ void MATMUL_FORWARD_GPU(float* out,
341341
unsigned long oc = static_cast<unsigned long>(OC);
342342
setLogLevel(kError);
343343

344-
{
345-
DurationTime duration("matmul_forward_gpu: before creating tensors", verbose);
344+
if (version == 2 || version == 1) {
346345
// Generate the key of the cache by arguments.
347346
std::string key = "MATMUL_FORWARD_GPU_" + std::to_string(B) + "_" + std::to_string(T) + "_" + std::to_string(C) + "_" + std::to_string(OC);
348347
Kernel op;
349348
if (ctx.kernelPool.data.find(key) == ctx.kernelPool.data.end()) {
350-
constexpr size_t BT = 64;
351-
constexpr size_t BC = 16;
352-
constexpr size_t BOC = 64;
353-
constexpr size_t TT = BT / BC;
354-
constexpr size_t TOC = BOC / BC;
355-
constexpr size_t num_threads = BT * BOC / (TT * TOC);
356-
Shape wgSize = {num_threads, 1, 1};
357-
358-
std::string codeString(kShaderMatmul2DTiling);
359-
std::string unrolledCode = loopUnrolling(replaceAll(codeString, {{"{{precision}}", toString(kf32)},
360-
{"{{BT}}", toString(BT)},
361-
{"{{BC}}", toString(BC)},
362-
{"{{BOC}}", toString(BOC)},
363-
{"{{TT}}", toString(TT)},
364-
{"{{TOC}}", toString(TOC)},
365-
{"{{NUM_TILEI}}", toString(BT * BC / num_threads)},
366-
{"{{NUM_TILEW}}", toString(BOC * BC / num_threads)}
367-
}));
368-
369-
Shape nWorkgroups = {b, cdiv(T, BT), cdiv(OC, BOC)};
370349
Tensor inp_i = createTensor(ctx, Shape{b * t * c}, kf32);
371350
Tensor weight_i = createTensor(ctx, Shape{oc * c}, kf32);
372351
Tensor bias_i = bias == NULL ? createTensor(ctx, Shape{1}, kf32) : createTensor(ctx, Shape{oc}, kf32);
373352
Tensor out_o = createTensor(ctx, Shape{b * t * oc}, kf32);
374-
op = createKernel(ctx, {unrolledCode, wgSize, kf32},
375-
Bindings{inp_i, weight_i, bias_i, out_o},
376-
nWorkgroups,
377-
/* params */
378-
MatmulParams{
379-
static_cast<uint32_t>(b),
380-
static_cast<uint32_t>(t),
381-
static_cast<uint32_t>(c),
382-
static_cast<uint32_t>(oc)
383-
},
384-
nullptr,
385-
key.c_str()
386-
);
353+
354+
if (version == 2) {
355+
constexpr size_t BT = 64;
356+
constexpr size_t BC = 16;
357+
constexpr size_t BOC = 64;
358+
constexpr size_t TT = BT / BC;
359+
constexpr size_t TOC = BOC / BC;
360+
constexpr size_t num_threads = BT * BOC / (TT * TOC);
361+
Shape wgSize = {num_threads, 1, 1};
362+
363+
std::string codeString(kShaderMatmul2DTiling);
364+
std::string unrolledCode = loopUnrolling(replaceAll(codeString, {{"{{precision}}", toString(kf32)},
365+
{"{{BT}}", toString(BT)},
366+
{"{{BC}}", toString(BC)},
367+
{"{{BOC}}", toString(BOC)},
368+
{"{{TT}}", toString(TT)},
369+
{"{{TOC}}", toString(TOC)},
370+
{"{{NUM_TILEI}}", toString(BT * BC / num_threads)},
371+
{"{{NUM_TILEW}}", toString(BOC * BC / num_threads)}
372+
}));
373+
374+
Shape nWorkgroups = {b, cdiv(T, BT), cdiv(OC, BOC)};
375+
op = createKernel(ctx, {unrolledCode, wgSize, kf32},
376+
Bindings{inp_i, weight_i, bias_i, out_o},
377+
nWorkgroups,
378+
/* params */
379+
MatmulParams{
380+
static_cast<uint32_t>(b),
381+
static_cast<uint32_t>(t),
382+
static_cast<uint32_t>(c),
383+
static_cast<uint32_t>(oc)
384+
},
385+
nullptr,
386+
key.c_str()
387+
);
388+
} else {
389+
op = createKernel(ctx, {kShaderMatmul, 256, kf32},
390+
Bindings{inp_i, weight_i, bias_i, out_o},
391+
/* nWorkgroups */ {cdiv(b * t, 256), 1, 1},
392+
/* params */
393+
MatmulParams{
394+
static_cast<uint32_t>(b),
395+
static_cast<uint32_t>(t),
396+
static_cast<uint32_t>(c),
397+
static_cast<uint32_t>(oc)
398+
},
399+
nullptr,
400+
key.c_str()
401+
);
402+
}
387403
} else {
388404
op = ctx.kernelPool.data[key];
389405
}
@@ -400,39 +416,16 @@ void MATMUL_FORWARD_GPU(float* out,
400416

401417
std::promise<void> promise;
402418
std::future<void> future = promise.get_future();
403-
404-
if (version == 2) {
405-
DurationTime duration("matmul_forward_gpu: after creating tensors", verbose);
406-
{
407-
DurationTime duration("matmul_forward_gpu: before creating kernels", verbose);
408-
{
409-
DurationTime duration("matmul_forward_gpu without creating context", verbose);
410-
dispatchKernel(ctx, op, promise);
411-
wait(ctx, future);
412-
toCPU(ctx, out_o, out, b * t * oc * sizeof(float));
413-
}
414-
}
415-
// } else if (version == 1) {
416-
// Kernel op = createKernel(ctx, {kShaderMatmul, 256, kf32},
417-
// Bindings{inp_i, weight_i, bias_i, out_o},
418-
// /* nWorkgroups */ {cdiv(b * t, 256), 1, 1},
419-
// /* params */
420-
// MatmulParams{
421-
// static_cast<uint32_t>(b),
422-
// static_cast<uint32_t>(t),
423-
// static_cast<uint32_t>(c),
424-
// static_cast<uint32_t>(oc)
425-
// });
426-
// {
427-
// DurationTime duration("matmul_forward_gpu without creating context", verbose);
428-
// dispatchKernel(ctx, op, promise);
429-
// wait(ctx, future);
430-
// toCPU(ctx, out_o, out, b * t * oc * sizeof(float));
431-
// }
432-
} else {
433-
DurationTime duration("matmul_forward_cpu", verbose);
434-
matmul_forward_dummy(out, inp, weight, bias, B, T, C, OC);
419+
420+
{
421+
DurationTime duration("matmul_forward_gpu", verbose);
422+
dispatchKernel(ctx, op, promise);
423+
wait(ctx, future);
424+
toCPU(ctx, out_o, out, b * t * oc * sizeof(float));
435425
}
426+
} else {
427+
DurationTime duration("matmul_forward_cpu", verbose);
428+
matmul_forward_dummy(out, inp, weight, bias, B, T, C, OC);
436429
}
437430

438431
if (debug) { // compare out with out_exp.

0 commit comments

Comments (0)