Skip to content

Commit 8c84f06

Browse files
committed
Q4_1, BF16, IQ4_NL, IQ4_XS, IQ3_XXS, IQ3_S, IQ2_XXS, IQ2_XS, IQ2_S formats
Add 9 quantization formats with orthogonal per-format per-backend kernels (NEON/AVX2/WASM/scalar). Includes codebook lookup tables, dequant functions, dispatch, model loading, build system, and README rewrite.
1 parent 3dc1ede commit 8c84f06

49 files changed

Lines changed: 2931 additions & 919 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

Makefile

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,21 @@ ifneq ($(filter arm% aarch%,$(UNAME_M)),)
2828
src/quant/tq1_neon_sdot.c src/quant/tq1_neon.c src/quant/tq1_scalar.c \
2929
src/quant/q8_neon.c src/quant/q8_scalar.c \
3030
src/quant/q4_neon_sdot.c src/quant/q4_neon.c src/quant/q4_scalar.c \
31+
src/quant/q4_1_neon.c src/quant/q4_1_scalar.c \
32+
src/quant/bf16_neon.c src/quant/bf16_scalar.c \
3133
src/quant/q6k_neon.c src/quant/q6k_scalar.c \
3234
src/quant/q8k_neon.c src/quant/q8k_scalar.c \
3335
src/quant/q4k_neon.c src/quant/q4k_scalar.c \
3436
src/quant/q5k_neon.c src/quant/q5k_scalar.c \
3537
src/quant/q3k_neon.c src/quant/q3k_scalar.c \
36-
src/quant/q2k_neon.c src/quant/q2k_scalar.c
38+
src/quant/q2k_neon.c src/quant/q2k_scalar.c \
39+
src/quant/iq4nl_neon.c src/quant/iq4nl_scalar.c \
40+
src/quant/iq4xs_neon.c src/quant/iq4xs_scalar.c \
41+
src/quant/iq3xxs_neon.c src/quant/iq3xxs_scalar.c \
42+
src/quant/iq3s_neon.c src/quant/iq3s_scalar.c \
43+
src/quant/iq2xxs_neon.c src/quant/iq2xxs_scalar.c \
44+
src/quant/iq2xs_neon.c src/quant/iq2xs_scalar.c \
45+
src/quant/iq2s_neon.c src/quant/iq2s_scalar.c
3746

3847
TRANSFORMER_BACKEND = src/transformer/rmsnorm_neon.c src/transformer/rmsnorm_scalar.c \
3948
src/transformer/gqa_neon.c src/transformer/gqa_scalar.c \
@@ -45,12 +54,21 @@ else
4554
src/quant/tq2_scalar.c src/quant/tq1_scalar.c \
4655
src/quant/q8_avx2.c src/quant/q8_scalar.c \
4756
src/quant/q4_avx2.c src/quant/q4_scalar.c \
57+
src/quant/q4_1_avx2.c src/quant/q4_1_scalar.c \
58+
src/quant/bf16_avx2.c src/quant/bf16_scalar.c \
4859
src/quant/q6k_avx2.c src/quant/q6k_scalar.c \
4960
src/quant/q8k_avx2.c src/quant/q8k_scalar.c \
5061
src/quant/q4k_avx2.c src/quant/q4k_scalar.c \
5162
src/quant/q5k_avx2.c src/quant/q5k_scalar.c \
5263
src/quant/q3k_avx2.c src/quant/q3k_scalar.c \
53-
src/quant/q2k_avx2.c src/quant/q2k_scalar.c
64+
src/quant/q2k_avx2.c src/quant/q2k_scalar.c \
65+
src/quant/iq4nl_avx2.c src/quant/iq4nl_scalar.c \
66+
src/quant/iq4xs_avx2.c src/quant/iq4xs_scalar.c \
67+
src/quant/iq3xxs_avx2.c src/quant/iq3xxs_scalar.c \
68+
src/quant/iq3s_avx2.c src/quant/iq3s_scalar.c \
69+
src/quant/iq2xxs_avx2.c src/quant/iq2xxs_scalar.c \
70+
src/quant/iq2xs_avx2.c src/quant/iq2xs_scalar.c \
71+
src/quant/iq2s_avx2.c src/quant/iq2s_scalar.c
5472

5573
TRANSFORMER_BACKEND = src/transformer/rmsnorm_avx2.c src/transformer/rmsnorm_scalar.c \
5674
src/transformer/gqa_avx2.c src/transformer/gqa_scalar.c \
@@ -158,12 +176,21 @@ AVX2_QUANT_SRCS = $(QUANT_COMMON) \
158176
src/quant/tq2_scalar.c src/quant/tq1_scalar.c \
159177
src/quant/q8_avx2.c src/quant/q8_scalar.c \
160178
src/quant/q4_avx2.c src/quant/q4_scalar.c \
179+
src/quant/q4_1_avx2.c src/quant/q4_1_scalar.c \
180+
src/quant/bf16_avx2.c src/quant/bf16_scalar.c \
161181
src/quant/q6k_avx2.c src/quant/q6k_scalar.c \
162182
src/quant/q8k_avx2.c src/quant/q8k_scalar.c \
163183
src/quant/q4k_avx2.c src/quant/q4k_scalar.c \
164184
src/quant/q5k_avx2.c src/quant/q5k_scalar.c \
165185
src/quant/q3k_avx2.c src/quant/q3k_scalar.c \
166-
src/quant/q2k_avx2.c src/quant/q2k_scalar.c
186+
src/quant/q2k_avx2.c src/quant/q2k_scalar.c \
187+
src/quant/iq4nl_avx2.c src/quant/iq4nl_scalar.c \
188+
src/quant/iq4xs_avx2.c src/quant/iq4xs_scalar.c \
189+
src/quant/iq3xxs_avx2.c src/quant/iq3xxs_scalar.c \
190+
src/quant/iq3s_avx2.c src/quant/iq3s_scalar.c \
191+
src/quant/iq2xxs_avx2.c src/quant/iq2xxs_scalar.c \
192+
src/quant/iq2xs_avx2.c src/quant/iq2xs_scalar.c \
193+
src/quant/iq2s_avx2.c src/quant/iq2s_scalar.c
167194

168195
AVX2_TRANSFORMER_BACKEND = src/transformer/rmsnorm_avx2.c src/transformer/rmsnorm_scalar.c \
169196
src/transformer/gqa_avx2.c src/transformer/gqa_scalar.c \

README.md

Lines changed: 74 additions & 154 deletions
Large diffs are not rendered by default.

docs/roadmap.md

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -73,11 +73,18 @@ Development roadmap for bitnet.c.
7373
- [x] Q3_K (3-bit k-quant) with NEON/AVX2/WASM kernels
7474
- [x] Non-tied output weights (separate output projection matrix)
7575

76-
## Phase 8: Extended Format Coverage
76+
## Phase 8: Extended Format Coverage — Done
7777

7878
- [x] Q2_K (2-bit k-quant) with NEON/AVX2/WASM kernels
79-
- [ ] IQ2_XXS, IQ2_XS, IQ2_S (codebook-based ~2 bpw, designed)
80-
- [ ] F16/BF16 weight types (unquantized dense matvec, designed)
79+
- [x] Q4_1 (4-bit with min) with NEON/AVX2/WASM kernels
80+
- [x] BF16 weight type with NEON/AVX2/WASM kernels
81+
- [x] IQ4_NL (4-bit non-linear codebook) with NEON/AVX2/WASM kernels
82+
- [x] IQ4_XS (4-bit non-linear with sub-block scales) with NEON/AVX2/WASM kernels
83+
- [x] IQ3_XXS (3-bit codebook) with NEON/AVX2/WASM kernels
84+
- [x] IQ3_S (3-bit codebook with separate signs) with NEON/AVX2/WASM kernels
85+
- [x] IQ2_XXS (2-bit codebook) with NEON/AVX2/WASM kernels
86+
- [x] IQ2_XS (2-bit codebook with scales) with NEON/AVX2/WASM kernels
87+
- [x] IQ2_S (2-bit codebook, 1024-entry grid) with NEON/AVX2/WASM kernels
8188

8289
## Performance Analysis (M1 Max, bitnet-b1.58-2B-4T)
8390

include/gguf.h

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -29,19 +29,28 @@ enum {
2929

3030
// GGUF tensor types we care about
3131
enum {
32-
BN_GGUF_TENSOR_F32 = 0,
33-
BN_GGUF_TENSOR_F16 = 1,
34-
BN_GGUF_TENSOR_Q4_0 = 2,
35-
BN_GGUF_TENSOR_Q8_0 = 8,
36-
BN_GGUF_TENSOR_Q2_K = 10,
37-
BN_GGUF_TENSOR_Q3_K = 11,
38-
BN_GGUF_TENSOR_Q4_K = 12,
39-
BN_GGUF_TENSOR_Q5_K = 13,
40-
BN_GGUF_TENSOR_Q6_K = 14,
41-
BN_GGUF_TENSOR_Q8_K = 15,
42-
BN_GGUF_TENSOR_TQ1_0 = 34,
43-
BN_GGUF_TENSOR_TQ2_0 = 35,
44-
BN_GGUF_TENSOR_I2_S = 36,
32+
BN_GGUF_TENSOR_F32 = 0,
33+
BN_GGUF_TENSOR_F16 = 1,
34+
BN_GGUF_TENSOR_Q4_0 = 2,
35+
BN_GGUF_TENSOR_Q4_1 = 3,
36+
BN_GGUF_TENSOR_Q8_0 = 8,
37+
BN_GGUF_TENSOR_Q2_K = 10,
38+
BN_GGUF_TENSOR_Q3_K = 11,
39+
BN_GGUF_TENSOR_Q4_K = 12,
40+
BN_GGUF_TENSOR_Q5_K = 13,
41+
BN_GGUF_TENSOR_Q6_K = 14,
42+
BN_GGUF_TENSOR_Q8_K = 15,
43+
BN_GGUF_TENSOR_IQ2_XXS = 16,
44+
BN_GGUF_TENSOR_IQ2_XS = 17,
45+
BN_GGUF_TENSOR_IQ3_XXS = 18,
46+
BN_GGUF_TENSOR_IQ4_NL = 20,
47+
BN_GGUF_TENSOR_IQ3_S = 21,
48+
BN_GGUF_TENSOR_IQ2_S = 22,
49+
BN_GGUF_TENSOR_IQ4_XS = 23,
50+
BN_GGUF_TENSOR_BF16 = 30,
51+
BN_GGUF_TENSOR_TQ1_0 = 34,
52+
BN_GGUF_TENSOR_TQ2_0 = 35,
53+
BN_GGUF_TENSOR_I2_S = 36,
4554
};
4655

4756
typedef struct {

0 commit comments

Comments
 (0)