From f306c6414136e0a8faecf014e5c2e691fb810244 Mon Sep 17 00:00:00 2001
From: Tai An <antai12232931@outlook.com>
Date: Fri, 24 Apr 2026 09:20:52 -0700
Subject: [PATCH] fix(model_runner): all_reduce num_kvcache_blocks to MIN
 across TP ranks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Under tensor parallelism, each rank independently estimates the number of
KV-cache blocks from its local GPU memory snapshot. Ranks can arrive at
different values (for example, because of differing driver or activation
overhead), so block IDs are no longer consistent across ranks — the
BlockManager on rank 0 may allocate block 42 while rank 1 has no block 42
in its cache, silently corrupting KV-cache lookups.

Fix: after the local estimate, synchronize across all TP ranks via
`dist.all_reduce(..., op=dist.ReduceOp.MIN)` so every rank allocates
exactly the same number of blocks — the minimum among all ranks.

Fixes #187
---
 nanovllm/engine/model_runner.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/nanovllm/engine/model_runner.py b/nanovllm/engine/model_runner.py
index 5e6342bb9..132b573d2 100644
--- a/nanovllm/engine/model_runner.py
+++ b/nanovllm/engine/model_runner.py
@@ -111,6 +111,12 @@ def allocate_kv_cache(self):
         head_dim = getattr(hf_config, "head_dim", hf_config.hidden_size // hf_config.num_attention_heads)
         block_bytes = 2 * hf_config.num_hidden_layers * self.block_size * num_kv_heads * head_dim * hf_config.dtype.itemsize
         config.num_kvcache_blocks = int(total * config.gpu_memory_utilization - used - peak + current) // block_bytes
+        if self.world_size > 1:
+            # Under TP all ranks must agree on the same block count so block IDs are
+            # consistent.  Take the minimum to avoid over-allocating on any rank.
+            t = torch.tensor(config.num_kvcache_blocks, dtype=torch.int64, device="cuda")
+            dist.all_reduce(t, op=dist.ReduceOp.MIN)
+            config.num_kvcache_blocks = int(t.item())
         assert config.num_kvcache_blocks > 0
         self.kv_cache = torch.empty(2, hf_config.num_hidden_layers, config.num_kvcache_blocks, self.block_size, num_kv_heads, head_dim)
         layer_id = 0