Optional F16 KV cache (--kv16): halve attention DRAM bandwidth

farkasmark · farkasmark · commit c4a49f2549ae · 2026-03-13T15:57:01.000+01:00
Store the KV cache in F16 instead of F32 when --kv16 is passed. The
write path projects K/V into temporary F32 buffers, applies RoPE to K,
then converts both to F16 via NEON vcvt_f16_f32 (scalar fallback on
non-ARM). The read path converts each F16 head vector back into a
stack F32 buffer before the dot products; for a 128-wide head that
buffer is 512 bytes, so it stays in L1.

Both gqa_range and flash_gqa_range read from the F16 cache when the flag
is set. Arena sizing and allocation choose the KV element size (2 bytes
for F16, 4 bytes for F32) from the new kv_f16 config flag.

Benchmark (M1 Max, bitnet-b1.58-2B-4T, 128 tokens):
  F32 KV: 40.8 tok/s
  F16 KV: 47.9 tok/s (+17%)

Greedy argmax matches F32; generated text diverges slightly due to
accumulated F16 rounding but remains coherent and correct.
diff --git a/Makefile b/Makefile
@@ -35,7 +35,7 @@ src/%.o: src/%.c
 	$(CC) $(CFLAGS) -c -o $@ $<
 
 # --- Tests ---
-.PHONY: debug test test_gguf test_quant test_tokenizer test_transformer test_threadpool test_safety test_prefill clean
+.PHONY: debug test test_gguf test_quant test_tokenizer test_transformer test_threadpool test_safety test_prefill test_kv_f16 clean
 
 test: test_gguf test_quant test_tokenizer test_transformer test_threadpool test_safety
 
@@ -71,5 +71,10 @@ test_prefill: test/test_prefill.c src/platform.c src/gguf.c src/quant.c src/mode
               src/sh_arena.c src/sh_log.c
 	$(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS)
 
+test_kv_f16: test/test_kv_f16.c src/platform.c src/gguf.c src/quant.c src/model.c \
+             src/transformer.c src/tokenizer.c src/sampler.c src/threadpool.c \
+             src/sh_arena.c src/sh_log.c
+	$(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS)
+
 clean:
-	rm -f bitnet src/*.o test_gguf test_quant test_tokenizer test_transformer test_threadpool test_safety test_e2e test_prefill
+	rm -f bitnet src/*.o test_gguf test_quant test_tokenizer test_transformer test_threadpool test_safety test_e2e test_prefill test_kv_f16
diff --git a/README.md b/README.md
@@ -4,7 +4,7 @@ A clean-room, pure C inference engine for [BitNet b1.58](https://arxiv.org/abs/2
 
 Inspired by Andrej Karpathy's [llama2.c](https://github.com/karpathy/llama2.c) — a beautifully minimal LLaMA inference implementation in a single C file — **bitnet.c** takes the same philosophy and applies it to Microsoft's [BitNet](https://github.com/microsoft/BitNet) architecture with its 1.58-bit ternary weights.
 
-Where Microsoft's official BitNet inference framework depends on a modified llama.cpp fork (~100K+ lines of C++), bitnet.c delivers a complete inference pipeline in ~4,100 lines of modular, readable C.
+Where Microsoft's official BitNet inference framework depends on a modified llama.cpp fork (~100K+ lines of C++), bitnet.c delivers a complete inference pipeline in ~4,500 lines of modular, readable C.
 
 ## Features
 
@@ -13,6 +13,7 @@ Where Microsoft's official BitNet inference framework depends on a modified llam
 - **I2_S, TQ1_0, & TQ2_0 formats** — native support for Microsoft's I2_S and GGML ternary quantization
 - **Full transformer forward pass** — RoPE, GQA, RMSNorm, sub-norms, tied embeddings
 - **Flash GQA attention** — online softmax with KV-head grouping, single-pass over KV cache
+- **Optional F16 KV cache** — `--kv16` halves attention DRAM bandwidth with minimal precision loss
 - **ARM NEON/SDOT optimizations** — SDOT int8 matvec, native FP16 logits, INT8 output embeddings
 - **Pthread thread pool** — persistent workers with condvar dispatch (~2us), replaces OpenMP
 - **BPE tokenizer** — loaded directly from GGUF metadata
@@ -50,8 +51,11 @@ Usage: ./bitnet <model.gguf> [options]
   --topp <float>  Top-p sampling (default: 0.9)
   --seed <int>    Random seed (default: 42)
   --maxseq <int>  Max sequence length (default: model max)
+  --flash         Use flash attention (online softmax)
   --chat          Interactive chat REPL mode
   --repeat-penalty <float>  Repetition penalty (default: 1.0, chat: 1.1)
+  --kv16          Store KV cache in FP16 (halves attention DRAM bandwidth)
+  --no-prefill    Disable batch prompt prefill (compute logits for every token)
 ```
 
 ### Chat Mode
@@ -120,14 +124,17 @@ bitnet.c/
 │   ├── test_tokenizer.c    # BPE encode/decode tests
 │   ├── test_threadpool.c   # Thread pool dispatch tests
 │   ├── test_safety.c       # Safety/bounds-checking regression tests
-│   └── test_e2e.c      # End-to-end greedy decode test
+│   ├── test_prefill.c      # Prefill vs sequential correctness test
+│   ├── test_kv_f16.c       # F16 KV cache correctness test
+│   └── test_e2e.c          # End-to-end greedy decode test
 ├── wasm/
 │   ├── api.c           # WASM-exported API wrapper
 │   ├── build.sh        # Emscripten build script
 │   ├── worker.js       # Web Worker for non-blocking inference
 │   └── index.html      # Browser demo
 ├── docs/
-│   └── roadmap.md      # Development roadmap
+│   ├── roadmap.md      # Development roadmap
+│   └── audit.md        # Security/correctness audit
 └── Makefile
 ```
 
@@ -176,7 +183,7 @@ Benchmarked on Apple M1 Max (8 P-cores, 32 GB), `bitnet-b1.58-2B-4T` (I2_S forma
 | Baseline (scalar C) | ~15.5 | 1.0x |
 | + SDOT int8 accumulation + batch matvec | ~33 | 2.1x |
 | + Arithmetic ternary decode + RoPE precompute | ~38 | 2.5x |
-| + Pthread thread pool (replace OpenMP) | ~38 | 2.5x |
+| + Pthread thread pool (replace OpenMP) | ~41 | 2.6x |
 | + Arena allocator + native FP16 logits + prefetch | ~46 | 3.0x |
 | + INT8 output embeddings (SDOT logits) | **~52** | **3.4x** |
 
@@ -214,6 +221,7 @@ BitNet b1.58 is a transformer variant where all linear layer weights are constra
 
 | Format | Bits/Weight | Packing | Block Size |
 |--------|-------------|---------|------------|
+| I2_S   | 2.0         | 2-bit interleaved (4 values/byte) + per-tensor scale | 128 |
 | TQ1_0  | 1.6875      | Base-3 (5 values/byte) + residual | 256 |
 | TQ2_0  | 2.0625      | 2-bit fields (4 values/byte) | 256 |
 
@@ -223,9 +231,9 @@ BitNet b1.58 is a transformer variant where all linear layer weights are constra
 |-----------|------|
 | GGUF buffer (weights + F16 embeddings) | ~620 MB |
 | INT8 embedding cache (128K × 2560) | ~329 MB |
-| KV cache (30 layers × 2048 × 640 × 4 × 2) | ~298 MB |
+| KV cache (30 layers × 2048 × 640 × 4 × 2) | ~298 MB (~149 MB with `--kv16`) |
 | RunState activations | ~3 MB |
-| **Total** | **~1,250 MB** |
+| **Total** | **~1,250 MB** (~1,101 MB with `--kv16`) |
 
 ## Design Decisions
 
diff --git a/include/model.h b/include/model.h
@@ -17,6 +17,7 @@ typedef struct {
     int head_size, kv_dim, kv_mul;  // derived
     int has_ffn_gate, act_type;     // 0=SiLU, 1=ReLU²
     int flash_attn;                 // use flash attention (online softmax)
+    int kv_f16;                     // store KV cache in FP16 (halves attention DRAM bandwidth)
 } BnConfig;
 
 typedef struct {
@@ -56,7 +57,7 @@ typedef struct {
     SHArena *arena;     // arena for all RunState buffers
 } BnModel;
 
-int  bn_model_load(BnModel *m, BnGGUFFile *f, int max_seq_len);
+int  bn_model_load(BnModel *m, BnGGUFFile *f, int max_seq_len, int kv_f16);
 void bn_model_free(BnModel *m);
 void bn_model_embed_token(const BnModel *m, float *out, int token);
 
diff --git a/src/main.c b/src/main.c
@@ -30,6 +30,7 @@ typedef struct {
     float repeat_penalty;
     int repeat_set;     // whether user explicitly set --repeat-penalty
     int no_prefill;
+    int kv_f16;
 } CLIArgs;
 
 static void print_usage(const char *prog) {
@@ -44,6 +45,7 @@ static void print_usage(const char *prog) {
     fprintf(stderr, "  --flash         Use flash attention (online softmax)\n");
     fprintf(stderr, "  --chat          Interactive chat REPL mode\n");
     fprintf(stderr, "  --repeat-penalty <float>  Repetition penalty (default: 1.0, chat: 1.1)\n");
+    fprintf(stderr, "  --kv16          Store KV cache in FP16 (halves attention DRAM bandwidth)\n");
     fprintf(stderr, "  --no-prefill    Disable batch prompt prefill (compute logits for every token)\n");
 }
 
@@ -81,6 +83,8 @@ static CLIArgs parse_args(int argc, char **argv) {
             args.flash_attn = 1;
         } else if (strcmp(argv[i], "--chat") == 0) {
             args.chat = 1;
+        } else if (strcmp(argv[i], "--kv16") == 0) {
+            args.kv_f16 = 1;
         } else if (strcmp(argv[i], "--no-prefill") == 0) {
             args.no_prefill = 1;
         } else if (strcmp(argv[i], "--repeat-penalty") == 0 && i + 1 < argc) {
@@ -145,7 +149,7 @@ int main(int argc, char **argv) {
 
     // Load model
     BnModel model;
-    if (bn_model_load(&model, gf, args.max_seq_len) != 0) {
+    if (bn_model_load(&model, gf, args.max_seq_len, args.kv_f16) != 0) {
         SH_LOG_ERROR("Failed to load model");
         bn_gguf_free(gf);
         bn_platform_unload_file(&mf);
diff --git a/src/model.c b/src/model.c
@@ -55,9 +55,10 @@ static float *load_f32_tensor(BnGGUFFile *f, const char *name) {
 
 // --- Model loading ---
 
-int bn_model_load(BnModel *m, BnGGUFFile *f, int max_seq_len) {
+int bn_model_load(BnModel *m, BnGGUFFile *f, int max_seq_len, int kv_f16) {
     memset(m, 0, sizeof(BnModel));
     BnConfig *c = &m->config;
+    c->kv_f16 = kv_f16;
 
     // Try to detect architecture prefix
     const char *arch = bn_gguf_get_str(f, "general.architecture");
@@ -272,7 +273,8 @@ int bn_model_load(BnModel *m, BnGGUFFile *f, int max_seq_len) {
     arena_size += 2 * (size_t)c->hidden_dim * sizeof(float);  // hb, hb2
     arena_size += att_size * sizeof(float);                     // att
     arena_size += (size_t)c->vocab_size * sizeof(float);       // logits
-    arena_size += 2 * kv_cache_size * sizeof(float);           // key_cache, value_cache
+    size_t kv_elem_size = c->kv_f16 ? sizeof(uint16_t) : sizeof(float);
+    arena_size += 2 * kv_cache_size * kv_elem_size;           // key_cache, value_cache
     arena_size += (size_t)x_q_size * sizeof(int8_t);           // x_q
     arena_size += (size_t)half_head * sizeof(float);           // rope_freq
     arena_size += emb_i8_bytes + emb_i8_scales_bytes;          // INT8 embeddings
@@ -292,8 +294,8 @@ int bn_model_load(BnModel *m, BnGGUFFile *f, int max_seq_len) {
     s->hb2         = (float *)sh_arena_calloc(m->arena, c->hidden_dim, sizeof(float));
     s->att         = (float *)sh_arena_calloc(m->arena, att_size, sizeof(float));
     s->logits      = (float *)sh_arena_calloc(m->arena, c->vocab_size, sizeof(float));
-    s->key_cache   = (float *)sh_arena_calloc(m->arena, kv_cache_size, sizeof(float));
-    s->value_cache = (float *)sh_arena_calloc(m->arena, kv_cache_size, sizeof(float));
+    s->key_cache   = (float *)sh_arena_calloc(m->arena, kv_cache_size, kv_elem_size);
+    s->value_cache = (float *)sh_arena_calloc(m->arena, kv_cache_size, kv_elem_size);
     s->x_q         = (int8_t *)sh_arena_calloc(m->arena, x_q_size, sizeof(int8_t));
     s->rope_freq   = (float *)sh_arena_alloc(m->arena, half_head * sizeof(float));
 
diff --git a/src/transformer.c b/src/transformer.c
@@ -75,6 +75,7 @@ static void gqa_range(void *ctx, int h_start, int h_end) {
     int kv_mul = g->kv_mul;
     int pos = g->pos;
     size_t loff = g->loff;
+    int kv_f16 = c->kv_f16;
 
     for (int h = h_start; h < h_end; h++) {
         float *q_h = s->q + h * head_size;
@@ -83,7 +84,22 @@ static void gqa_range(void *ctx, int h_start, int h_end) {
         float inv_sqrt_hs = 1.0f / sqrtf((float)head_size);
 
         for (int t = 0; t <= pos; t++) {
-            float *k_t = s->key_cache + loff + (size_t)t * kv_dim + kv_h * head_size;
+            float k_buf[head_size];
+            const float *k_t;
+            if (kv_f16) {
+                const uint16_t *k_f16 = (const uint16_t *)s->key_cache + loff + (size_t)t * kv_dim + kv_h * head_size;
+#ifdef __ARM_NEON
+                for (int d = 0; d < head_size; d += 4) {
+                    float16x4_t hv = vreinterpret_f16_u16(vld1_u16(k_f16 + d));
+                    vst1q_f32(k_buf + d, vcvt_f32_f16(hv));
+                }
+#else
+                for (int d = 0; d < head_size; d++) k_buf[d] = bn_fp16_to_fp32(k_f16[d]);
+#endif
+                k_t = k_buf;
+            } else {
+                k_t = s->key_cache + loff + (size_t)t * kv_dim + kv_h * head_size;
+            }
 #ifdef __ARM_NEON
             float32x4_t a0 = vdupq_n_f32(0), a1 = vdupq_n_f32(0);
             float32x4_t a2 = vdupq_n_f32(0), a3 = vdupq_n_f32(0);
@@ -107,7 +123,22 @@ static void gqa_range(void *ctx, int h_start, int h_end) {
         float *xb_h = s->xb + h * head_size;
         memset(xb_h, 0, head_size * sizeof(float));
         for (int t = 0; t <= pos; t++) {
-            float *v_t = s->value_cache + loff + (size_t)t * kv_dim + kv_h * head_size;
+            float v_buf[head_size];
+            const float *v_t;
+            if (kv_f16) {
+                const uint16_t *v_f16 = (const uint16_t *)s->value_cache + loff + (size_t)t * kv_dim + kv_h * head_size;
+#ifdef __ARM_NEON
+                for (int d = 0; d < head_size; d += 4) {
+                    float16x4_t hv = vreinterpret_f16_u16(vld1_u16(v_f16 + d));
+                    vst1q_f32(v_buf + d, vcvt_f32_f16(hv));
+                }
+#else
+                for (int d = 0; d < head_size; d++) v_buf[d] = bn_fp16_to_fp32(v_f16[d]);
+#endif
+                v_t = v_buf;
+            } else {
+                v_t = s->value_cache + loff + (size_t)t * kv_dim + kv_h * head_size;
+            }
             float a = att[t];
 #ifdef __ARM_NEON
             float32x4_t a_v = vdupq_n_f32(a);
@@ -131,13 +162,15 @@ static void gqa_range(void *ctx, int h_start, int h_end) {
 
 static void flash_gqa_range(void *ctx, int h_start, int h_end) {
     GQACtx *g = (GQACtx *)ctx;
+    const BnConfig *c = g->c;
     BnRunState *s = g->s;
     int head_size = g->head_size;
     int kv_dim = g->kv_dim;
     int kv_mul = g->kv_mul;
     int pos = g->pos;
     size_t loff = g->loff;
     int n_pos = pos + 1;
+    int kv_f16 = c->kv_f16;
     float inv_sqrt_hs = 1.0f / sqrtf((float)head_size);
 
     for (int h = h_start; h < h_end; h++) {
@@ -156,10 +189,25 @@ static void flash_gqa_range(void *ctx, int h_start, int h_end) {
             if (t_end > n_pos) t_end = n_pos;
 
             for (int t = t_start; t < t_end; t++) {
-                float *k_t = s->key_cache + loff + (size_t)t * kv_dim + kv_h * head_size;
+                float k_buf[head_size];
+                const float *k_t;
+                if (kv_f16) {
+                    const uint16_t *k_f16 = (const uint16_t *)s->key_cache + loff + (size_t)t * kv_dim + kv_h * head_size;
+                    for (int d = 0; d < head_size; d += 4) {
+                        float16x4_t hv = vreinterpret_f16_u16(vld1_u16(k_f16 + d));
+                        vst1q_f32(k_buf + d, vcvt_f32_f16(hv));
+                    }
+                    k_t = k_buf;
+                } else {
+                    k_t = s->key_cache + loff + (size_t)t * kv_dim + kv_h * head_size;
+                }
 
-                if (t + 1 < t_end)
-                    __builtin_prefetch(s->key_cache + loff + (size_t)(t+1) * kv_dim + kv_h * head_size, 0, 0);
+                if (t + 1 < t_end) {
+                    if (kv_f16)
+                        __builtin_prefetch((const uint16_t *)s->key_cache + loff + (size_t)(t+1) * kv_dim + kv_h * head_size, 0, 0);
+                    else
+                        __builtin_prefetch(s->key_cache + loff + (size_t)(t+1) * kv_dim + kv_h * head_size, 0, 0);
+                }
 
                 // Score: dot(Q, K) * scale
                 float32x4_t a0 = vdupq_n_f32(0), a1 = vdupq_n_f32(0);
@@ -173,7 +221,18 @@ static void flash_gqa_range(void *ctx, int h_start, int h_end) {
                 float score = neon_hsum_f32(vaddq_f32(vaddq_f32(a0, a1), vaddq_f32(a2, a3))) * inv_sqrt_hs;
 
                 // Online softmax update
-                float *v_t = s->value_cache + loff + (size_t)t * kv_dim + kv_h * head_size;
+                float v_buf[head_size];
+                const float *v_t;
+                if (kv_f16) {
+                    const uint16_t *v_f16 = (const uint16_t *)s->value_cache + loff + (size_t)t * kv_dim + kv_h * head_size;
+                    for (int d = 0; d < head_size; d += 4) {
+                        float16x4_t hv = vreinterpret_f16_u16(vld1_u16(v_f16 + d));
+                        vst1q_f32(v_buf + d, vcvt_f32_f16(hv));
+                    }
+                    v_t = v_buf;
+                } else {
+                    v_t = s->value_cache + loff + (size_t)t * kv_dim + kv_h * head_size;
+                }
                 __builtin_prefetch(v_t, 0, 0);
 
                 float old_max = running_max;
@@ -394,28 +453,72 @@ static int forward_layers(BnModel *m, int token, int pos) {
 
         rmsnorm(s->xb, s->x, lw->attn_norm, dim, c->norm_eps);
 
-        // QKV projections (unified path — bn_quant_matvec_batch handles SDOT internally)
-        {
+        if (c->kv_f16) {
+            // F16 KV cache: write K/V to temp F32 buffers, apply RoPE, convert to F16
+            float *k_tmp = s->hb, *v_tmp = s->hb2;  // [hidden_dim] >= kv_dim
+            BnMatvecTask qkv[3] = {
+                { s->q,  &lw->wq },
+                { k_tmp, &lw->wk },
+                { v_tmp, &lw->wv },
+            };
+            bn_quant_matvec_batch(qkv, 3, s->xb, s->x_q, m->pool);
+
+            // RoPE on Q
+            for (int i = 0; i < dim; i += 2) {
+                int fi = (i / 2) % half_head;
+                float v0 = s->q[i], v1 = s->q[i + 1];
+                s->q[i]     = v0 * rope_cos[fi] - v1 * rope_sin[fi];
+                s->q[i + 1] = v0 * rope_sin[fi] + v1 * rope_cos[fi];
+            }
+
+            // RoPE on K temp buffer
+            for (int i = 0; i < kv_dim; i += 2) {
+                int fi = (i / 2) % half_head;
+                float v0 = k_tmp[i], v1 = k_tmp[i + 1];
+                k_tmp[i]     = v0 * rope_cos[fi] - v1 * rope_sin[fi];
+                k_tmp[i + 1] = v0 * rope_sin[fi] + v1 * rope_cos[fi];
+            }
+
+            // Convert F32 → F16 into cache
+            uint16_t *kc = (uint16_t *)s->key_cache   + loff + (size_t)pos * kv_dim;
+            uint16_t *vc = (uint16_t *)s->value_cache + loff + (size_t)pos * kv_dim;
+#ifdef __ARM_NEON
+            for (int i = 0; i < kv_dim; i += 4) {
+                float32x4_t kv4 = vld1q_f32(k_tmp + i);
+                float16x4_t kh4 = vcvt_f16_f32(kv4);
+                vst1_u16(kc + i, vreinterpret_u16_f16(kh4));
+                float32x4_t vv4 = vld1q_f32(v_tmp + i);
+                float16x4_t vh4 = vcvt_f16_f32(vv4);
+                vst1_u16(vc + i, vreinterpret_u16_f16(vh4));
+            }
+#else
+            for (int i = 0; i < kv_dim; i++) {
+                kc[i] = bn_fp32_to_fp16(k_tmp[i]);
+                vc[i] = bn_fp32_to_fp16(v_tmp[i]);
+            }
+#endif
+        } else {
+            // F32 KV cache: matvec directly into cache, RoPE in-place
             BnMatvecTask qkv[3] = {
                 { s->q,            &lw->wq },
                 { key_cache_row,   &lw->wk },
                 { value_cache_row, &lw->wv },
             };
             bn_quant_matvec_batch(qkv, 3, s->xb, s->x_q, m->pool);
-        }
 
-        // RoPE using precomputed cos/sin (no trig calls here)
-        for (int i = 0; i < dim; i += 2) {
-            int fi = (i / 2) % half_head;
-            float v0 = s->q[i], v1 = s->q[i + 1];
-            s->q[i]     = v0 * rope_cos[fi] - v1 * rope_sin[fi];
-            s->q[i + 1] = v0 * rope_sin[fi] + v1 * rope_cos[fi];
-        }
-        for (int i = 0; i < kv_dim; i += 2) {
-            int fi = (i / 2) % half_head;
-            float v0 = key_cache_row[i], v1 = key_cache_row[i + 1];
-            key_cache_row[i]     = v0 * rope_cos[fi] - v1 * rope_sin[fi];
-            key_cache_row[i + 1] = v0 * rope_sin[fi] + v1 * rope_cos[fi];
+            // RoPE using precomputed cos/sin (no trig calls here)
+            for (int i = 0; i < dim; i += 2) {
+                int fi = (i / 2) % half_head;
+                float v0 = s->q[i], v1 = s->q[i + 1];
+                s->q[i]     = v0 * rope_cos[fi] - v1 * rope_sin[fi];
+                s->q[i + 1] = v0 * rope_sin[fi] + v1 * rope_cos[fi];
+            }
+            for (int i = 0; i < kv_dim; i += 2) {
+                int fi = (i / 2) % half_head;
+                float v0 = key_cache_row[i], v1 = key_cache_row[i + 1];
+                key_cache_row[i]     = v0 * rope_cos[fi] - v1 * rope_sin[fi];
+                key_cache_row[i + 1] = v0 * rope_sin[fi] + v1 * rope_cos[fi];
+            }
         }
 
         // GQA attention
diff --git a/test/test_kv_f16.c b/test/test_kv_f16.c
diff --git a/test/test_prefill.c b/test/test_prefill.c
diff --git a/wasm/api.c b/wasm/api.c