|
4 | 4 | Quick start: |
5 | 5 |
|
6 | 6 | from quantcpp import Model |
7 | | - m = Model.from_pretrained("SmolLM2-1.7B") |
| 7 | + m = Model.from_pretrained("Phi-3.5-mini") |
8 | 8 | print(m.ask("What is gravity?")) |
9 | 9 |
|
10 | 10 | Model selection guide: |
11 | | - SmolLM2-1.7B (1.7 GB, vocab 49K) — recommended. ~12 tok/s on Apple M3. |
12 | | - Llama-3.2-1B (750 MB, vocab 128K) — smaller download but slower |
| 11 | + Phi-3.5-mini (2.4 GB, vocab 32K) — DEFAULT. 3.8B params with the |
| 12 | + smallest vocab in the registry,
| 13 | + producing the best speed/quality |
| 14 | + combo. Coherent multi-paragraph |
| 15 | + output even at Q4_K_M. |
| 16 | + SmolLM2-1.7B (1.7 GB, vocab 49K) — lightweight all-rounder. ~12 tok/s |
| 17 | + on Apple M3, smaller download. |
| 18 | + Llama-3.2-1B (750 MB, vocab 128K) — smaller download but slower |
13 | 19 | due to large vocab (~2 tok/s on M3). |
14 | | - SmolLM2-135M (138 MB, vocab 49K) — demo only, low quality output. |
| 20 | + SmolLM2-135M (138 MB, vocab 49K) — demo only, low quality output. |
15 | 21 |
|
16 | 22 | Larger vocab = slower lm_head matmul → smaller params with smaller vocab |
17 | 23 | often beats larger params with larger vocab. See docs/supported_models.md |
@@ -65,47 +71,48 @@ class ChatContextOverflow(RuntimeError): |
65 | 71 | # Verify both fields against the actual HuggingFace listing before |
66 | 72 | # adding new entries — there is no integrity check at runtime. |
67 | 73 | _MODEL_REGISTRY = { |
68 | | - # 138 MB demo model. Tokenizer + arch are llama-compatible but the |
69 | | - # model is too small to produce coherent output for general chat. |
70 | | - # Listed only so users can verify the install/load path quickly. |
71 | | - "SmolLM2-135M": ( |
72 | | - "Felladrin/gguf-Q8_0-SmolLM2-135M-Instruct", |
73 | | - "smollm2-135m-instruct-q8_0.gguf", |
74 | | - 135, |
| 74 | + # ── DEFAULT ── |
| 75 | + # Phi-3.5-mini-instruct (3.8B params, vocab 32K). Set as default on |
| 76 | + # 2026-04-12 after end-to-end Phi-3 architecture support landed |
| 77 | + # (fused QKV / fused gate+up FFN / LongRoPE). The 32K vocab is the |
| 78 | + # smallest of the registry, which makes the lm_head matmul the |
| 79 | + # fastest per-token. Combined with 3.8B params it produces the |
| 80 | + # best quality-per-token of any model we ship. |
| 81 | + "Phi-3.5-mini": ( |
| 82 | + "bartowski/Phi-3.5-mini-instruct-GGUF", |
| 83 | + "Phi-3.5-mini-instruct-Q4_K_M.gguf", |
| 84 | + 2400, |
75 | 85 | ), |
76 | | - # Recommended default for first-time users on Apple Silicon / typical |
77 | | - # laptops. vocab 49K keeps the lm_head matmul small, so even on a |
78 | | - # mid-range M-series chip we measure ~12 tok/s — comfortable for |
79 | | - # interactive chat. Same llama arch family as SmolLM2-135M, so it |
80 | | - # exercises the most-tested code path. |
| 86 | + # Lightweight all-rounder for users who want a smaller download |
| 87 | + # than Phi-3.5-mini. vocab 49K keeps the lm_head matmul small, so |
| 88 | + # on a mid-range M-series chip we measure ~12 tok/s — comfortable |
| 89 | + # for interactive chat. Same llama arch family as SmolLM2-135M. |
81 | 90 | "SmolLM2-1.7B": ( |
82 | 91 | "bartowski/SmolLM2-1.7B-Instruct-GGUF", |
83 | 92 | "SmolLM2-1.7B-Instruct-Q8_0.gguf", |
84 | 93 | 1700, |
85 | 94 | ), |
86 | | - "Qwen3.5-0.8B": ( |
87 | | - "unsloth/Qwen3.5-0.8B-GGUF", |
88 | | - "Qwen3.5-0.8B-Q4_K_M.gguf", |
89 | | - 508, |
90 | | - ), |
91 | | - # Smaller download than SmolLM2-1.7B but slower at inference time |
92 | | - # because of the 128K Llama-3 vocab (~5x slower lm_head matmul on M3). |
93 | | - # Kept in the registry for users who specifically want a Llama model. |
| 95 | + # Smallest download in the "actually usable" tier. Slower at |
| 96 | + # inference time because of the 128K Llama-3 vocab (~5x slower |
| 97 | + # lm_head matmul on M3). Kept in the registry for users who |
| 98 | + # specifically want a Llama model. |
94 | 99 | "Llama-3.2-1B": ( |
95 | 100 | "hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF", |
96 | 101 | "llama-3.2-1b-instruct-q4_k_m.gguf", |
97 | 102 | 750, |
98 | 103 | ), |
99 | | - # Phi-3.5-mini-instruct (3.8B params, vocab 32K). |
100 | | - # Added 2026-04-12 after end-to-end Phi-3 architecture support |
101 | | - # landed (fused QKV / fused gate+up FFN / LongRoPE). The 32K vocab |
102 | | - # is the smallest of the registry, which makes the lm_head matmul |
103 | | - # the fastest per-token. Combined with 3.8B params it's the best |
104 | | - # quality-per-token model we ship. |
105 | | - "Phi-3.5-mini": ( |
106 | | - "bartowski/Phi-3.5-mini-instruct-GGUF", |
107 | | - "Phi-3.5-mini-instruct-Q4_K_M.gguf", |
108 | | - 2400, |
| 104 | + "Qwen3.5-0.8B": ( |
| 105 | + "unsloth/Qwen3.5-0.8B-GGUF", |
| 106 | + "Qwen3.5-0.8B-Q4_K_M.gguf", |
| 107 | + 508, |
| 108 | + ), |
| 109 | + # 138 MB demo model. Tokenizer + arch are llama-compatible but the |
| 110 | + # model is too small to produce coherent output for general chat. |
| 111 | + # Listed only so users can verify the install/load path quickly. |
| 112 | + "SmolLM2-135M": ( |
| 113 | + "Felladrin/gguf-Q8_0-SmolLM2-135M-Instruct", |
| 114 | + "smollm2-135m-instruct-q8_0.gguf", |
| 115 | + 135, |
109 | 116 | ), |
110 | 117 | } |
111 | 118 |
|
@@ -208,9 +215,9 @@ class Model: |
208 | 215 |
|
209 | 216 | Examples |
210 | 217 | -------- |
211 | | - >>> m = Model.from_pretrained("SmolLM2-1.7B") |
| 218 | + >>> m = Model.from_pretrained("Phi-3.5-mini") |
212 | 219 | >>> m.ask("What is gravity?") |
213 | | - 'Gravity is a force that attracts ...' |
| 220 | + 'Gravity is a fundamental force that attracts ...' |
214 | 221 |
|
215 | 222 | >>> with Model("model.gguf") as m: |
216 | 223 | ... for tok in m.generate("Once upon a time"): |
|
0 commit comments