From f306c6414136e0a8faecf014e5c2e691fb810244 Mon Sep 17 00:00:00 2001 From: Tai An Date: Fri, 24 Apr 2026 09:20:52 -0700 Subject: [PATCH] fix(model_runner): all_reduce num_kvcache_blocks to MIN across TP ranks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Under tensor parallelism each rank independently estimates the number of KV-cache blocks from its local GPU memory snapshot. Different ranks can arrive at different values (due to different driver/activation overhead), so block IDs are no longer consistent across ranks — the BlockManager on rank 0 may allocate block 42 while rank 1 has no block 42 in its cache, silently corrupting KV-cache lookups. Fix: after the local estimate, synchronize across all TP ranks via `dist.all_reduce(..., op=ReduceOp.MIN)` so every rank allocates exactly the same number of blocks — the minimum among all ranks. Fixes #187 --- nanovllm/engine/model_runner.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/nanovllm/engine/model_runner.py b/nanovllm/engine/model_runner.py index 5e6342bb9..132b573d2 100644 --- a/nanovllm/engine/model_runner.py +++ b/nanovllm/engine/model_runner.py @@ -111,6 +111,12 @@ def allocate_kv_cache(self): head_dim = getattr(hf_config, "head_dim", hf_config.hidden_size // hf_config.num_attention_heads) block_bytes = 2 * hf_config.num_hidden_layers * self.block_size * num_kv_heads * head_dim * hf_config.dtype.itemsize config.num_kvcache_blocks = int(total * config.gpu_memory_utilization - used - peak + current) // block_bytes + if self.world_size > 1: + # Under TP all ranks must agree on the same block count so block IDs are + # consistent. Take the minimum to avoid over-allocating on any rank. + t = torch.tensor(config.num_kvcache_blocks, dtype=torch.int64, device="cuda") + dist.all_reduce(t, op=dist.ReduceOp.MIN) + config.num_kvcache_blocks = int(t.item()) assert config.num_kvcache_blocks > 0 self.kv_cache = torch.empty(2, hf_config.num_hidden_layers, config.num_kvcache_blocks, self.block_size, num_kv_heads, head_dim) layer_id = 0