5 changes: 5 additions & 0 deletions README.md
@@ -32,6 +32,7 @@
| **32+ GB** Mac Mini / Studio | Qwen3.6-35B | 95 tok/s | 256 experts, 262K context |
| **64 GB** Mac Mini / Studio | Qwen3.5-35B | 83 tok/s | Best balance of smart + fast |
| **96+ GB** Mac Studio / Pro | Qwen3.5-122B | 57 tok/s | Frontier-level intelligence |
| **128+ GB** Mac Studio Ultra | 🆕 DeepSeek V4 Flash 158B-A13B | 31-56 tok/s | Day-0 frontier MoE, 1M context |

<details>
<summary><b>New to local AI? Quick glossary</b></summary>
@@ -348,7 +349,9 @@ The model has to fit in your Mac's RAM. If your Mac slows down or Activity Monit
| **48 GB** Mac Mini / Studio | [Qwen3.5-35B-A3B 8bit](https://huggingface.co/mlx-community/Qwen3.5-35B-A3B-8bit) | 37 GB | 83 tok/s | **Sweet spot** — smart + fast |
| **64 GB** Mac Mini / Studio | [Qwen3.5-35B-A3B 8bit](https://huggingface.co/mlx-community/Qwen3.5-35B-A3B-8bit) | 37 GB | 83 tok/s | Same model, more room for KV cache |
| **96 GB** Mac Studio / Pro | [Qwen3.5-122B mxfp4](https://huggingface.co/nightmedia/Qwen3.5-122B-A10B-Text-mxfp4-mlx) | 65 GB | 57 tok/s | Best model, fits comfortably |
| **128 GB** Mac Studio / Pro | 🆕 [DeepSeek V4 Flash 2-bit DQ](https://huggingface.co/mlx-community/DeepSeek-V4-Flash-2bit-DQ) | 91 GB | 56 tok/s | 158B-A13B frontier MoE, day-0 (chat only) |
| **192 GB** Mac Studio / Pro | [Qwen3.5-122B 8bit](https://huggingface.co/mlx-community/Qwen3.5-122B-A10B-8bit) | 130 GB | 44 tok/s | Maximum quality |
| **256 GB** Mac Studio Ultra | 🆕 [DeepSeek V4 Flash 8-bit](https://huggingface.co/mlx-community/DeepSeek-V4-Flash-8bit) | 136 GB | 31 tok/s | 158B-A13B frontier MoE, 1M context (chat only) |

> **4bit vs 8bit:** 4bit models are compressed to use less memory (recommended for most users). 8bit models are higher quality but need more RAM. "mxfp4" is a high-quality 4bit format.
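
To see what a given quant actually costs on your machine, a quick probe with mlx-lm works (a sketch, not part of this repo; the model ID is one row from the table above, and the memory call assumes a recent mlx where `mx.get_active_memory()` exists):

```python
# Sketch: load one quantized model from the table and report resident memory.
# Assumes `pip install mlx-lm`; swap the repo ID for the row matching your RAM.
import mlx.core as mx
from mlx_lm import load, generate

model, tokenizer = load("mlx-community/Qwen3.5-35B-A3B-8bit")
print(f"{mx.get_active_memory() / 2**30:.1f} GiB resident after load")
print(generate(model, tokenizer, prompt="Hello!", max_tokens=16))
```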

@@ -427,6 +430,8 @@ Tested on **Mac Studio M3 Ultra (256GB)**. Rapid-MLX uses Apple's [MLX framework
| **Phi-4 Mini 14B** | **180** tok/s | 77 (mlx-lm) / 56 (Ollama) | **2.3x** / **3.2x** |
| **Qwen3.5-4B** | **160** tok/s | 155 (mlx-lm serve) | **1.0x** |
| **Nemotron-Nano 30B** | **141** tok/s · 100% tools | — | — |
| 🆕 **DeepSeek V4 Flash 158B-A13B** (2-bit DQ) | **56** tok/s | — (MLX engine only, day-0) | — |
| 🆕 **DeepSeek V4 Flash 158B-A13B** (8-bit) | **31** tok/s | — (MLX engine only, day-0) | — |
| **GPT-OSS 20B** | **127** tok/s · 100% tools | 79 (mlx-lm serve) | **1.6x** |
| **Qwen3.5-9B** | **108** tok/s | 41 (Ollama) | **2.6x** |
| **Qwen3.6-35B-A3B** | **95** tok/s · 100% tools | — | — |
11 changes: 11 additions & 0 deletions pyproject.toml
@@ -143,6 +143,11 @@ target-version = ["py310", "py311", "py312", "py313"]
[tool.ruff]
line-length = 88

[tool.ruff.format]
# Vendored upstream files — keep formatting identical to the source PR
# so future syncs produce a clean diff.
exclude = ["vllm_mlx/models/deepseek_v4.py"]

[tool.ruff.lint]
select = ["E", "F", "W", "I", "N", "UP", "B", "SIM"]
ignore = [
@@ -170,6 +175,12 @@ ignore = [
"B904", # raise without from inside except
]

[tool.ruff.lint.per-file-ignores]
# Vendored from ml-explore/mlx-lm PR #1192 (Blaizzy). Keep as-is so
# upstream syncs remain a clean diff. Drop this once mlx-lm 0.32+
# ships native deepseek_v4 support and we delete the vendored copy.
"vllm_mlx/models/deepseek_v4.py" = ["UP", "B", "SIM", "N", "F", "I", "E"]

[tool.mypy]
python_version = "3.10"
warn_return_any = true
109 changes: 109 additions & 0 deletions tests/test_deepseek_v4_vendored.py
@@ -0,0 +1,109 @@
# SPDX-License-Identifier: Apache-2.0
"""
Tests for the vendored DeepSeek-V4 architecture.

mlx-lm 0.31.x doesn't ship `deepseek_v4` yet (see ml-explore/mlx-lm#1192).
We vendor the module so users can serve mlx-community/DeepSeek-V4-Flash-*
day-0. These tests pin the contract that:

1. The vendored module is importable on its own.
2. `_register_vendored_archs()` exposes it to mlx-lm's importlib lookup.
3. A tiny synthetic config can construct + run the model end-to-end
(proves Metal kernels compile and the forward path produces logits).
"""

import importlib
import sys

import pytest


@pytest.fixture(autouse=True)
def _clear_vendored_register():
"""Registration is sys.modules-level state — reset before each test."""
sys.modules.pop("mlx_lm.models.deepseek_v4", None)
yield
sys.modules.pop("mlx_lm.models.deepseek_v4", None)


def test_module_imports():
from vllm_mlx.models import deepseek_v4

assert hasattr(deepseek_v4, "Model")
assert hasattr(deepseek_v4, "ModelArgs")
assert deepseek_v4.ModelArgs.__dataclass_fields__["model_type"].default == (
"deepseek_v4"
)


def test_register_vendored_archs_makes_mlx_lm_loader_find_it():
from vllm_mlx.utils.tokenizer import _register_vendored_archs

assert "mlx_lm.models.deepseek_v4" not in sys.modules
_register_vendored_archs()
assert "mlx_lm.models.deepseek_v4" in sys.modules

# mlx-lm's _get_classes() does exactly this lookup.
mod = importlib.import_module("mlx_lm.models.deepseek_v4")
assert mod is sys.modules["mlx_lm.models.deepseek_v4"]
assert mod.__name__ == "vllm_mlx.models.deepseek_v4"
assert hasattr(mod, "Model")


def test_register_vendored_archs_is_idempotent():
from vllm_mlx.utils.tokenizer import _register_vendored_archs

_register_vendored_archs()
first = sys.modules["mlx_lm.models.deepseek_v4"]
_register_vendored_archs()
second = sys.modules["mlx_lm.models.deepseek_v4"]
assert first is second


def test_tiny_model_forward_pass():
"""Smoke test the full forward path on a CPU-sized synthetic config.

This is the same shape as upstream PR #1192's test_deepseek_v4 — it
exercises HCA attention + sinkhorn + MoE routing without needing any
real weights. If a Metal kernel breaks, this catches it.
"""
import mlx.core as mx

from vllm_mlx.models import deepseek_v4

args = deepseek_v4.ModelArgs(
model_type="deepseek_v4",
vocab_size=128,
hidden_size=64,
num_hidden_layers=4,
num_attention_heads=4,
num_key_value_heads=1,
q_lora_rank=16,
o_lora_rank=8,
o_groups=2,
head_dim=16,
qk_rope_head_dim=4,
sliding_window=16,
compress_ratios=[0, 0, 4, 0],
index_n_heads=4,
index_head_dim=8,
index_topk=4,
moe_intermediate_size=16,
n_routed_experts=4,
n_shared_experts=1,
num_experts_per_tok=2,
num_hash_layers=1,
hc_mult=2,
hc_sinkhorn_iters=2,
)
model = deepseek_v4.Model(args)
inputs = mx.array([[1, 2, 3, 4, 5, 6, 7, 8]], dtype=mx.int32)
cache = model.make_cache()
logits = model(inputs, cache=cache)
mx.eval(logits, [c.state for c in cache])

assert logits.shape == (1, 8, args.vocab_size)


if __name__ == "__main__":
pytest.main([__file__, "-v"])
4 changes: 4 additions & 0 deletions vllm_mlx/aliases.json
@@ -9,6 +9,10 @@
"qwen3.6-27b-8bit": "unsloth/Qwen3.6-27B-MLX-8bit",
"qwen3.6-35b": "mlx-community/Qwen3.6-35B-A3B-4bit",
"qwen3.6-35b-6bit": "mlx-community/Qwen3.6-35B-A3B-6bit",
"deepseek-v4-flash": "mlx-community/DeepSeek-V4-Flash-8bit",
"deepseek-v4-flash-2bit": "mlx-community/DeepSeek-V4-Flash-2bit-DQ",
"deepseek-v4-flash-8bit": "mlx-community/DeepSeek-V4-Flash-8bit",
"deepseek-v4-flash-4bit": "mlx-community/DeepSeek-V4-Flash-4bit",
"qwen3-coder": "lmstudio-community/Qwen3-Coder-Next-MLX-4bit",
"qwen3-vl-4b": "mlx-community/Qwen3-VL-4B-Instruct-MLX-4bit",
"llama3-3b": "mlx-community/Llama-3.2-3B-Instruct-4bit",