@@ -196,6 +196,7 @@ void ternary_matvec(float *out, const QWeight *W, const float *x) {
196196 const uint8x16_t neon_zero = vdupq_n_u8 (0 );
197197 const uint8x16_t neon_two = vdupq_n_u8 (2 );
198198 while (done < cols ) {
199+ __builtin_prefetch (rd + 64 , 0 , 0 );
199200 for (int h = 0 ; h < 2 ; h ++ ) {
200201 uint8x16_t raw = vld1q_u8 (rd + h * 16 );
201202 const float * xp = x + done + h * 16 ;
@@ -253,6 +254,7 @@ void ternary_matvec(float *out, const QWeight *W, const float *x) {
253254 float row_sum = 0.0f ;
254255 for (int b = 0 ; b < n_blocks_per_row ; b ++ ) {
255256 const BlockTQ2 * blk = & blocks [row * n_blocks_per_row + b ];
257+ __builtin_prefetch (blk + 1 , 0 , 0 );
256258 float d = fp16_to_fp32 (blk -> d );
257259 const float * xb = x + b * QK_K ;
258260#ifdef __ARM_NEON
@@ -316,6 +318,7 @@ void ternary_matvec(float *out, const QWeight *W, const float *x) {
316318 float row_sum = 0.0f ;
317319 for (int b = 0 ; b < n_blocks_per_row ; b ++ ) {
318320 const BlockTQ1 * blk = & blocks [row * n_blocks_per_row + b ];
321+ __builtin_prefetch (blk + 1 , 0 , 0 );
319322 float d = fp16_to_fp32 (blk -> d );
320323 float block_sum = 0.0f ;
321324 const float * xb = x + b * QK_K ;
0 commit comments