Strange... When I tested PR #2248 I checked the disassembled Example 0 and then Example 1, so I concluded that the speed-up is due to the reduced … I just disassembled the code with the suggested change:

Details

Disassembled:

Details

This code loads correctly, but the speed is clearly slower...

---

So, I don't know what the Metal compiler on your computer does, but on my M2 Max, the kernel below gives me a run time of 18.7 ms/token for 7B.

---

Well, yes, it looks like the Metal compiler does need a hand here and there.

This is not my experience. In fact, letting a thread in a simd group compute half a block at a time for …
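For context, here is a minimal sketch of the "one thread computes half a block at a time" idea for q4_0. This is a toy illustration under assumptions (the llama.cpp q4_0 layout of a 16-bit scale `d` followed by 16 packed bytes, and a dispatch of a single 32-thread simdgroup), not the kernel from the discussion:

```metal
#include <metal_stdlib>
using namespace metal;

#define QK4_0 32

// q4_0 block layout as in llama.cpp: a 16-bit scale followed by 32 packed 4-bit quants
typedef struct {
    half    d;
    uint8_t qs[QK4_0 / 2];   // low nibble of qs[j] = element j, high nibble = element j + 16
} block_q4_0;

// Toy kernel: each pair of threads handles one block, so every thread
// dequantizes and accumulates half a block (16 values) at a time.
// Assumes a dispatch of one 32-thread simdgroup covering the first 16 blocks of x/y.
kernel void dot_q4_0_half_block(
        device const block_q4_0 * x   [[buffer(0)]],
        device const float      * y   [[buffer(1)]],
        device float            * out [[buffer(2)]],
        uint tiisg [[thread_index_in_simdgroup]])
{
    const uint ib    = tiisg / 2;     // block handled by this pair of threads
    const uint hb    = tiisg & 1;     // 0 = low nibbles (elements 0..15), 1 = high nibbles (16..31)
    const uint shift = hb ? 4 : 0;

    device const block_q4_0 & b  = x[ib];
    device const float      * yb = y + ib*QK4_0 + hb*16;

    float sum = 0.0f;
    for (uint j = 0; j < QK4_0/2; ++j) {
        const int q = (b.qs[j] >> shift) & 0xF;   // one 4-bit quant
        sum += (q - 8) * yb[j];
    }
    sum *= (float) b.d;

    // combine the 32 partial half-block sums across the simdgroup
    const float total = simd_sum(sum);
    if (tiisg == 0) {
        out[0] = total;
    }
}
```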

---

Apple doesn't provide any kind of disassembler as part of its developer tools to let us take a look at the low-level code. However, thanks to the efforts of the open-source community, we do have a functional disassembler.
Usage
Clone the repository https://github.com/dougallj/applegpu, and run the command `python compiler_explorer.py test.metal`. You can have any kind of macros or templates in your `.metal` file, but you can only have one `kernel` function. Detailed explanations for each instruction can be found at https://dougallj.github.io/applegpu/docs.html. I feel that the instructions for the Apple GPU are a bit RISC-like: there are instructions to load and store values between memory and registers, and other instructions that operate on registers, but no "load-operate" instructions.
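For example, a minimal `test.metal` could look like this (a hypothetical toy kernel, only meant to show the one-kernel-per-file shape of the input):

```metal
#include <metal_stdlib>
using namespace metal;

// hypothetical minimal input for compiler_explorer.py:
// macros and templates are fine, but only one kernel function per file
kernel void add_one(device const float * src [[buffer(0)]],
                    device float       * dst [[buffer(1)]],
                    uint tid [[thread_position_in_grid]])
{
    dst[tid] = src[tid] + 1.0f;
}
```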
Example 0

Here is the `kernel_mul_mat_q4_0_f32` function from the master branch. I removed some logic for simplicity.

Details
And here is the disassembled code (very long):
Details
The kernel starts at the label `compute shader:`, and I usually analyze the structure by finding the `jmp` and `device_load` instructions. In this code there are two `jmp_exec_any`, at `0x9b2` and `0x9cc`, jumping to `0x298` and `0x164`. These correspond to the two loops in our code. Between `0x164` and `0x298` we can see a block of `device_load`:

The first `device_load` loads 4 `i32` into the 32-bit registers `r5`, `r6`, `r7`, `r8` from the address stored in `r39`, `r40`. The whole 8 `device_load` load 32 `float`. Those are the 32 `float` stored in `y_curr` before the inner loop starts. The `wait 1` means waiting until the `group 1` load instructions have finished, which are the first four `device_load` in this block.

Between `0x298` and `0x9b2` is our inner loop, where we again look for `device_load`. Notice that `r48h` means the high 16 bits of register `r48` and `r48l` means the low 16 bits.

In our code the inner loop runs 4 times, each time loading one block. In the assembly the inner loop actually runs 2 times, each time loading 2 blocks. When it loads a block, it first loads one 16-bit value, then another 16-bit value, then four 16-bit values, then two 16-bit values, and finally one last 16-bit value. Before and after these `device_load` we also see a lot of `mov` and `bfeil`, meaning the GPU copies a 16-bit value to another register and masks its high or low 8 bits.
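For reference, that 1 + 1 + 4 + 2 + 1 pattern of 16-bit loads adds up to 18 bytes, which is exactly the size of a q4_0 block as defined in the llama.cpp sources (presumably one load for the scale and eight for the packed quants):

```metal
#define QK4_0 32

// q4_0 block as defined in llama.cpp: 2 bytes of scale + 16 bytes of packed
// nibbles = 18 bytes, matching the nine 16-bit device_load per block above.
typedef struct {
    half    d;               // 16-bit scale
    uint8_t qs[QK4_0 / 2];   // 32 x 4-bit quants packed into 16 bytes
} block_q4_0;
```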
Example 1

Here is the `kernel_mul_mat_q4_0_f32` function from PR #2248. I removed some logic for simplicity.

Details
And here is the disassembled code (very long):
Details
The structure is similar to Example 0, so we only analyze the inner loop:
Now when it loads a block it first loads one 16-bit value, then four 16-bit values, and then another four 16-bit values. Before and after these `device_load` we don't see `mov` and `bfeil` any more, because now we operate directly on 16-bit values.
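To illustrate the difference, here is a sketch of the idea only (not the code from PR #2248): if the packed quants are read as `uint16_t` instead of `uint8_t`, the nibbles can be extracted with plain 16-bit shifts and masks, so no byte-wise `mov`/`bfeil` is needed. The element ordering of q4_0 is ignored here for brevity.

```metal
#include <metal_stdlib>
using namespace metal;

// Hypothetical sketch: dequantize one q4_0 block by reading its 16 packed
// bytes as 8 x uint16_t and extracting nibbles with 16-bit shifts/masks.
kernel void dequant_q4_0_16bit_sketch(
        device const half     * d   [[buffer(0)]],  // per-block scale
        device const uint16_t * qs  [[buffer(1)]],  // 8 x uint16_t = 16 packed bytes
        device float          * out [[buffer(2)]],
        uint tid [[thread_position_in_grid]])       // expects 8 threads
{
    const float    scale = (float) d[0];
    const uint16_t q     = qs[tid];                 // four 4-bit quants per 16-bit value

    out[4*tid + 0] = (int((q >>  0) & 0xF) - 8) * scale;
    out[4*tid + 1] = (int((q >>  4) & 0xF) - 8) * scale;
    out[4*tid + 2] = (int((q >>  8) & 0xF) - 8) * scale;
    out[4*tid + 3] = (int((q >> 12) & 0xF) - 8) * scale;
}
```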