@@ -321,7 +321,7 @@ void MATMUL_FORWARD_GPU(float* out,
   bool verbose = false;
   bool debug = false;
   float *out_exp;
-  DurationTime duration("matmul_forward_gpu with creating context", verbose);
+  DurationTime duration("matmul_forward_gpu with kernel preparation", verbose);
   if (verbose) {
     printf("matmul forward: B=%d, T=%d, C=%d, OC=%d, bias=%d\n", B, T, C, OC, bias != NULL);
   }
@@ -341,49 +341,65 @@ void MATMUL_FORWARD_GPU(float* out,
   unsigned long oc = static_cast<unsigned long>(OC);
   setLogLevel(kError);
 
-  {
-    DurationTime duration("matmul_forward_gpu: before creating tensors", verbose);
+  if (version == 2 || version == 1) {
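+    // Versions 1 and 2 run on the GPU (naive and 2D-tiled kernel, respectively);
+    // any other value falls back to the CPU reference path below.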
     // Generate the cache key from the arguments.
     std::string key = "MATMUL_FORWARD_GPU_" + std::to_string(B) + "_" + std::to_string(T) + "_" + std::to_string(C) + "_" + std::to_string(OC);
     Kernel op;
     if (ctx.kernelPool.data.find(key) == ctx.kernelPool.data.end()) {
-      constexpr size_t BT = 64;
-      constexpr size_t BC = 16;
-      constexpr size_t BOC = 64;
-      constexpr size_t TT = BT / BC;
-      constexpr size_t TOC = BOC / BC;
-      constexpr size_t num_threads = BT * BOC / (TT * TOC);
-      Shape wgSize = {num_threads, 1, 1};
-
-      std::string codeString(kShaderMatmul2DTiling);
-      std::string unrolledCode = loopUnrolling(replaceAll(codeString, {{"{{precision}}", toString(kf32)},
-                                                                       {"{{BT}}", toString(BT)},
-                                                                       {"{{BC}}", toString(BC)},
-                                                                       {"{{BOC}}", toString(BOC)},
-                                                                       {"{{TT}}", toString(TT)},
-                                                                       {"{{TOC}}", toString(TOC)},
-                                                                       {"{{NUM_TILEI}}", toString(BT * BC / num_threads)},
-                                                                       {"{{NUM_TILEW}}", toString(BOC * BC / num_threads)}
-                                                                       }));
-
-      Shape nWorkgroups = {b, cdiv(T, BT), cdiv(OC, BOC)};
       Tensor inp_i = createTensor(ctx, Shape{b * t * c}, kf32);
       Tensor weight_i = createTensor(ctx, Shape{oc * c}, kf32);
       Tensor bias_i = bias == NULL ? createTensor(ctx, Shape{1}, kf32) : createTensor(ctx, Shape{oc}, kf32);
       Tensor out_o = createTensor(ctx, Shape{b * t * oc}, kf32);
-      op = createKernel(ctx, {unrolledCode, wgSize, kf32},
-                        Bindings{inp_i, weight_i, bias_i, out_o},
-                        nWorkgroups,
-                        /* params */
-                        MatmulParams{
-                          static_cast<uint32_t>(b),
-                          static_cast<uint32_t>(t),
-                          static_cast<uint32_t>(c),
-                          static_cast<uint32_t>(oc)
-                        },
-                        nullptr,
-                        key.c_str()
-                        );
+
+      if (version == 2) {
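+        // 2D block tiling: each workgroup computes a BT x BOC output tile,
+        // stepping through C in BC-wide slices; each thread accumulates a
+        // TT x TOC sub-tile, so BT * BOC / (TT * TOC) threads are needed.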
+        constexpr size_t BT = 64;
+        constexpr size_t BC = 16;
+        constexpr size_t BOC = 64;
+        constexpr size_t TT = BT / BC;
+        constexpr size_t TOC = BOC / BC;
+        constexpr size_t num_threads = BT * BOC / (TT * TOC);
+        Shape wgSize = {num_threads, 1, 1};
+
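+        // Specialize the shader template: substitute the precision and
+        // tile-size placeholders, then unroll the inner loops for the
+        // now-fixed tile dimensions.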
+        std::string codeString(kShaderMatmul2DTiling);
+        std::string unrolledCode = loopUnrolling(replaceAll(codeString, {{"{{precision}}", toString(kf32)},
+                                                                         {"{{BT}}", toString(BT)},
+                                                                         {"{{BC}}", toString(BC)},
+                                                                         {"{{BOC}}", toString(BOC)},
+                                                                         {"{{TT}}", toString(TT)},
+                                                                         {"{{TOC}}", toString(TOC)},
+                                                                         {"{{NUM_TILEI}}", toString(BT * BC / num_threads)},
+                                                                         {"{{NUM_TILEW}}", toString(BOC * BC / num_threads)}
+                                                                         }));
+
+        Shape nWorkgroups = {b, cdiv(T, BT), cdiv(OC, BOC)};
+        op = createKernel(ctx, {unrolledCode, wgSize, kf32},
+                          Bindings{inp_i, weight_i, bias_i, out_o},
+                          nWorkgroups,
+                          /* params */
+                          MatmulParams{
+                            static_cast<uint32_t>(b),
+                            static_cast<uint32_t>(t),
+                            static_cast<uint32_t>(c),
+                            static_cast<uint32_t>(oc)
+                          },
+                          nullptr,
+                          key.c_str()
+                          );
+      } else {
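+        // version == 1: naive kernel; cdiv(b * t, 256) workgroups of 256
+        // threads cover the b * t output rows.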
+        op = createKernel(ctx, {kShaderMatmul, 256, kf32},
+                          Bindings{inp_i, weight_i, bias_i, out_o},
+                          /* nWorkgroups */ {cdiv(b * t, 256), 1, 1},
+                          /* params */
+                          MatmulParams{
+                            static_cast<uint32_t>(b),
+                            static_cast<uint32_t>(t),
+                            static_cast<uint32_t>(c),
+                            static_cast<uint32_t>(oc)
+                          },
+                          nullptr,
+                          key.c_str()
+                          );
+      }
     } else {
       op = ctx.kernelPool.data[key];
     }
@@ -400,39 +416,16 @@ void MATMUL_FORWARD_GPU(float* out,
 
     std::promise<void> promise;
     std::future<void> future = promise.get_future();
-
-    if (version == 2) {
-      DurationTime duration("matmul_forward_gpu: after creating tensors", verbose);
-      {
-        DurationTime duration("matmul_forward_gpu: before creating kernels", verbose);
-        {
-          DurationTime duration("matmul_forward_gpu without creating context", verbose);
-          dispatchKernel(ctx, op, promise);
-          wait(ctx, future);
-          toCPU(ctx, out_o, out, b * t * oc * sizeof(float));
-        }
-      }
-    // } else if (version == 1) {
-    //   Kernel op = createKernel(ctx, {kShaderMatmul, 256, kf32},
-    //                            Bindings{inp_i, weight_i, bias_i, out_o},
-    //                            /* nWorkgroups */ {cdiv(b * t, 256), 1, 1},
-    //                            /* params */
-    //                            MatmulParams{
-    //                              static_cast<uint32_t>(b),
-    //                              static_cast<uint32_t>(t),
-    //                              static_cast<uint32_t>(c),
-    //                              static_cast<uint32_t>(oc)
-    //                            });
-    //   {
-    //     DurationTime duration("matmul_forward_gpu without creating context", verbose);
-    //     dispatchKernel(ctx, op, promise);
-    //     wait(ctx, future);
-    //     toCPU(ctx, out_o, out, b * t * oc * sizeof(float));
-    //   }
-    } else {
-      DurationTime duration("matmul_forward_cpu", verbose);
-      matmul_forward_dummy(out, inp, weight, bias, B, T, C, OC);
+
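+    // Dispatch the cached (or newly created) kernel, wait for completion,
+    // and copy the result back to host memory.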
+    {
+      DurationTime duration("matmul_forward_gpu", verbose);
+      dispatchKernel(ctx, op, promise);
+      wait(ctx, future);
+      toCPU(ctx, out_o, out, b * t * oc * sizeof(float));
     }
+  } else {
+    DurationTime duration("matmul_forward_cpu", verbose);
+    matmul_forward_dummy(out, inp, weight, bias, B, T, C, OC);
   }
 
   if (debug) { // compare out with out_exp.