Skip to content

Commit a1f31d7

Browse files
committed
Add prefetch hints for weight data in ternary_matvec()
Prefetch the next weight chunk/block while processing the current one to hide memory latency. 2B model: ~8 -> ~13 tok/s on Apple Silicon.
1 parent f2072d8 commit a1f31d7

1 file changed

Lines changed: 3 additions & 0 deletions

File tree

src/quant.c

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -196,6 +196,7 @@ void ternary_matvec(float *out, const QWeight *W, const float *x) {
196196
const uint8x16_t neon_zero = vdupq_n_u8(0);
197197
const uint8x16_t neon_two = vdupq_n_u8(2);
198198
while (done < cols) {
199+
__builtin_prefetch(rd + 64, 0, 0);
199200
for (int h = 0; h < 2; h++) {
200201
uint8x16_t raw = vld1q_u8(rd + h * 16);
201202
const float *xp = x + done + h * 16;
@@ -253,6 +254,7 @@ void ternary_matvec(float *out, const QWeight *W, const float *x) {
253254
float row_sum = 0.0f;
254255
for (int b = 0; b < n_blocks_per_row; b++) {
255256
const BlockTQ2 *blk = &blocks[row * n_blocks_per_row + b];
257+
__builtin_prefetch(blk + 1, 0, 0);
256258
float d = fp16_to_fp32(blk->d);
257259
const float *xb = x + b * QK_K;
258260
#ifdef __ARM_NEON
@@ -316,6 +318,7 @@ void ternary_matvec(float *out, const QWeight *W, const float *x) {
316318
float row_sum = 0.0f;
317319
for (int b = 0; b < n_blocks_per_row; b++) {
318320
const BlockTQ1 *blk = &blocks[row * n_blocks_per_row + b];
321+
__builtin_prefetch(blk + 1, 0, 0);
319322
float d = fp16_to_fp32(blk->d);
320323
float block_sum = 0.0f;
321324
const float *xb = x + b * QK_K;

0 commit comments

Comments
 (0)