From ae4eb83ac269cf522736b79076988278fa3bd4e3 Mon Sep 17 00:00:00 2001 From: royliang Date: Thu, 30 Apr 2026 10:15:34 +0800 Subject: [PATCH 01/15] page allocator migrates from python to C++ --- .gitignore | 6 +- csrc/allocator.cpp | 17 +- csrc/ftensor.cpp | 4 +- csrc/inc/cuda_utils.hpp | 64 +++ csrc/inc/mem_info_tracker.hpp | 246 +++++++++ csrc/inc/page_allocator.hpp | 160 ++++++ csrc/page_allocator.cpp | 719 +++++++++++++++++++++++++++ csrc/torch_bindings.cpp | 164 ++++++ csrc/util.cpp | 41 ++ kvcached/integration/vllm/patches.py | 13 +- kvcached/kv_cache_manager.py | 83 +++- kvcached/page_allocator.py | 592 ---------------------- setup.py | 2 + 13 files changed, 1485 insertions(+), 626 deletions(-) create mode 100644 csrc/inc/mem_info_tracker.hpp create mode 100644 csrc/inc/page_allocator.hpp create mode 100644 csrc/page_allocator.cpp create mode 100644 csrc/util.cpp delete mode 100644 kvcached/page_allocator.py diff --git a/.gitignore b/.gitignore index e0b0d03d..d9a5da5f 100644 --- a/.gitignore +++ b/.gitignore @@ -182,4 +182,8 @@ tools/addlicense tools/.addlicense.lock .vscode/* -.claude \ No newline at end of file +.claude + +# macOS metadata files +.DS_Store +._* \ No newline at end of file diff --git a/csrc/allocator.cpp b/csrc/allocator.cpp index 8e52817b..9adb2715 100644 --- a/csrc/allocator.cpp +++ b/csrc/allocator.cpp @@ -65,7 +65,7 @@ void FTensorAllocator::init(const std::string &dev_str, size_t page_size, bool contiguous_layout) { std::lock_guard lock(g_allocator_mutex_); if (!g_allocators_.empty()) { - LOGE("FTensorAllocator has been initialized. Re-initializing...") + LOGGER(ERROR, "FTensorAllocator has been initialized. Re-initializing..."); g_allocators_.clear(); } @@ -74,8 +74,10 @@ void FTensorAllocator::init(const std::string &dev_str, size_t page_size, // Validate that page_size is a multiple of 2MB size_t base_size = 2 * 1024 * 1024; // 2MB if (page_size % base_size != 0) { - LOGE("Invalid page size: %zu, must be a multiple of 2MB (2097152 bytes)", - page_size); + LOGGER( + ERROR, + "Invalid page size: %zu, must be a multiple of 2MB (2097152 bytes)", + page_size); abort(); } kPageSize = page_size; @@ -120,8 +122,8 @@ std::vector FTensorAllocator::create_kv_tensors( size_t aligned_size = size; if (size % kPageSize != 0) { aligned_size = ((size + kPageSize - 1) / kPageSize) * kPageSize; - LOGW("Size %zu is not aligned to page size %zu, aligning to %zu", size, - kPageSize, aligned_size); + LOGGER(WARNING, "Size %zu is not aligned to page size %zu, aligning to %zu", + size, kPageSize, aligned_size); } kv_tensor_size_per_layer_ = aligned_size; @@ -151,7 +153,7 @@ bool FTensorAllocator::kv_tensors_created() { bool FTensorAllocator::map_to_kv_tensors(const std::vector &offsets) { std::unique_lock lock(mtx_); if (num_layers_ == 0) { - LOGE("try to map to KV tensors when KV tensors are not created"); + LOGGER(ERROR, "try to map to KV tensors when KV tensors are not created"); return false; } @@ -202,7 +204,8 @@ bool FTensorAllocator::unmap_from_kv_tensors( const std::vector &offsets) { std::unique_lock lock(mtx_); if (num_layers_ == 0) { - LOGE("try to unmap from KV tensors when KV tensors are not created"); + LOGGER(ERROR, + "try to unmap from KV tensors when KV tensors are not created"); return false; } diff --git a/csrc/ftensor.cpp b/csrc/ftensor.cpp index 10eba830..e4b4e76f 100644 --- a/csrc/ftensor.cpp +++ b/csrc/ftensor.cpp @@ -75,7 +75,7 @@ bool FTensor::map(offset_t offset) { page_id_t page_id = offset / page_size_; if (mapping_.find(page_id) != mapping_.end()) { - 
LOGE("Page %ld is already mapped.", page_id); + LOGGER(ERROR, "Page %ld is already mapped.", page_id); return false; } @@ -93,7 +93,7 @@ bool FTensor::unmap(offset_t offset) { page_id_t page_id = offset / page_size_; if (mapping_.find(page_id) == mapping_.end()) { - LOGE("Page %ld is not mapped.", page_id); + LOGGER(ERROR, "Page %ld is not mapped.", page_id); return false; } diff --git a/csrc/inc/cuda_utils.hpp b/csrc/inc/cuda_utils.hpp index 08db6954..5a984d57 100644 --- a/csrc/inc/cuda_utils.hpp +++ b/csrc/inc/cuda_utils.hpp @@ -8,6 +8,70 @@ #include #include +#include +#include +#include + +typedef enum { + FATAL = 0, + ERROR = 1, + WARNING = 2, + INFO = 3, + DEBUG = 4, + VERBOSE = 5, +} log_level_enum_t; + +extern void now_to_string(char *buf, int length); +#ifdef __cplusplus +__attribute__((unused)) static char *logger_level_str[] = { + (char *)"FATAL", (char *)"ERROR", (char *)"WARNING", + (char *)"INFO", (char *)"DEBUG", (char *)"VERBOSE"}; +#else +__attribute__((unused)) static char *logger_level_str[] = { + "FATAL", "ERROR", "WARNING", "INFO", "DEBUG", "VERBOSE"}; +#endif + +// glibc >= 2.30 provides a native gettid() wrapper; only define our own +// syscall-based version on older systems to avoid macro/function conflicts. +#if !defined(__GLIBC__) || !defined(__GLIBC_MINOR__) || (__GLIBC__ < 2) || \ + (__GLIBC__ == 2 && __GLIBC_MINOR__ < 30) +#ifndef SYS_gettid +#error "SYS_gettid unavailable on this system" +#endif +static inline pid_t gettid(void) { return (pid_t)syscall(SYS_gettid); } +#endif + +#define LOGGER(level, format, ...) \ + ({ \ + char *_print_level_str = getenv("KVCACHED_LOG_LEVEL"); \ + char time[64]; \ + now_to_string(time, 64); \ + int _print_level = 0; \ + if (_print_level_str == NULL) { \ + _print_level = WARNING; \ + } else if (_print_level_str[0] == 'F') { \ + _print_level = FATAL; \ + } else if (_print_level_str[0] == 'E') { \ + _print_level = ERROR; \ + } else if (_print_level_str[0] == 'W') { \ + _print_level = WARNING; \ + } else if (_print_level_str[0] == 'I') { \ + _print_level = INFO; \ + } else if (_print_level_str[0] == 'D') { \ + _print_level = DEBUG; \ + } else if (_print_level_str[0] == 'V') { \ + _print_level = VERBOSE; \ + } \ + if (level <= _print_level) { \ + fprintf(stderr, \ + "[KVCACHED_MEMORY_POOL][%s][%s]%s:%d [p:%u t:%u]" format "\n", \ + logger_level_str[level], time, __FILE__, __LINE__, \ + (unsigned int)getpid(), (unsigned int)gettid(), ##__VA_ARGS__); \ + } \ + if (level == FATAL) { \ + exit(-1); \ + } \ + }) #define LOGE(format, ...) \ fprintf(stderr, "ERROR: %s:%d: " format "\n", __FILE__, __LINE__, \ diff --git a/csrc/inc/mem_info_tracker.hpp b/csrc/inc/mem_info_tracker.hpp new file mode 100644 index 00000000..176d8040 --- /dev/null +++ b/csrc/inc/mem_info_tracker.hpp @@ -0,0 +1,246 @@ +// SPDX-FileCopyrightText: Copyright contributors to the kvcached project +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cuda_utils.hpp" + +namespace kvcached { + +static constexpr const char *SHM_DIR = "/dev/shm"; + +// Memory info struct stored in shared memory, compatible with Python +// MemInfoStruct. 
Layout: [total_size(int64), used_size(int64), +// prealloc_size(int64)] +struct MemInfoStruct { + int64_t total_size; + int64_t used_size; + int64_t prealloc_size; + + static constexpr int N_FIELDS = 3; + static constexpr size_t SHM_SIZE = sizeof(int64_t) * N_FIELDS; + + MemInfoStruct() : total_size(0), used_size(0), prealloc_size(0) {} + MemInfoStruct(int64_t total, int64_t used, int64_t prealloc) + : total_size(total), used_size(used), prealloc_size(prealloc) {} +}; + +// RAII class for file-lock + mmap operations on /dev/shm files +class RwLockedShm { +public: + enum LockType { RLOCK = LOCK_SH, WLOCK = LOCK_EX }; + + RwLockedShm(const std::string &file_path, size_t size, LockType lock_type) + : file_path_(get_ipc_path(file_path)), size_(size), lock_type_(lock_type), + fd_(-1), mapped_(nullptr) {} + + ~RwLockedShm() { close(); } + + // Open and lock the shared memory file, returns whether successful + bool open() { + // Try to open the file + fd_ = ::open(file_path_.c_str(), O_RDWR); + if (fd_ < 0) { + if (lock_type_ != WLOCK) { + return false; + } + // Create file in write-lock mode + fd_ = ::open(file_path_.c_str(), O_RDWR | O_CREAT, 0666); + if (fd_ < 0) { + return false; + } + if (ftruncate(fd_, size_) < 0) { + ::close(fd_); + fd_ = -1; + return false; + } + } + + // Ensure file size is sufficient + struct stat st; + if (fstat(fd_, &st) == 0 && static_cast(st.st_size) < size_) { + if (lock_type_ == WLOCK) { + (void)ftruncate(fd_, size_); + } + } + + // Acquire file lock + if (flock(fd_, lock_type_) < 0) { + ::close(fd_); + fd_ = -1; + return false; + } + + // mmap + int prot = (lock_type_ == WLOCK) ? (PROT_READ | PROT_WRITE) : PROT_READ; + mapped_ = mmap(nullptr, size_, prot, MAP_SHARED, fd_, 0); + if (mapped_ == MAP_FAILED) { + mapped_ = nullptr; + flock(fd_, LOCK_UN); + ::close(fd_); + fd_ = -1; + return false; + } + + return true; + } + + void close() { + if (mapped_ != nullptr) { + munmap(mapped_, size_); + mapped_ = nullptr; + } + if (fd_ >= 0) { + flock(fd_, LOCK_UN); + ::close(fd_); + fd_ = -1; + } + } + + void *data() { return mapped_; } + const void *data() const { return mapped_; } + + // Read MemInfoStruct from mmap buffer + MemInfoStruct read_mem_info() const { + MemInfoStruct info; + if (mapped_) { + const int64_t *arr = static_cast(mapped_); + info.total_size = arr[0]; + info.used_size = arr[1]; + info.prealloc_size = arr[2]; + } + return info; + } + + // Write MemInfoStruct to mmap buffer + void write_mem_info(const MemInfoStruct &info) { + if (mapped_) { + int64_t *arr = static_cast(mapped_); + arr[0] = info.total_size; + arr[1] = info.used_size; + arr[2] = info.prealloc_size; + } + } + +private: + static std::string get_ipc_path(const std::string &name) { + if (name.empty()) + return ""; + if (name[0] == '/') + return name; + return std::string(SHM_DIR) + "/" + name; + } + + std::string file_path_; + size_t size_; + LockType lock_type_; + int fd_; + void *mapped_; +}; + +// MemInfoTracker: tracks memory usage info via POSIX shared memory +class MemInfoTracker { +public: + explicit MemInfoTracker(int64_t total_mem_size, int64_t group_id = 0, + const std::string &ipc_name = "") + : ipc_name_(ipc_name), total_mem_size_(total_mem_size) { + if (ipc_name_.empty()) { + std::string base = obtain_default_ipc_name(); + // Non-zero group_id gets a "_g" suffix so multiple pools + // in one process don't share a segment. 
+ if (group_id != 0) { + base += "_g" + std::to_string(group_id); + } + ipc_name_ = base; + } + init_kv_cache_limit(total_mem_size_); + LOGGER(INFO, + "MemInfoTracker initialized: ipc_name=%s, total_mem_size=%ld, " + "group_id=%ld", + ipc_name_.c_str(), total_mem_size_, group_id); + } + + ~MemInfoTracker() { cleanup(); } + + // Update memory usage info in shared memory + void update_memory_usage(int64_t used_size, int64_t prealloc_size) { + RwLockedShm shm(ipc_name_, MemInfoStruct::SHM_SIZE, RwLockedShm::WLOCK); + if (!shm.open()) { + LOGGER(ERROR, "MemInfoTracker: failed to open shm for update: %s", + ipc_name_.c_str()); + return; + } + MemInfoStruct info = shm.read_mem_info(); + info.used_size = used_size; + info.prealloc_size = prealloc_size; + shm.write_mem_info(info); + } + + // Check if resize is needed, returns new mem_size (per layer), or -1 if not + // needed + int64_t check_and_get_resize_target(int64_t current_mem_size, + int64_t num_layers, + int64_t num_kv_buffers = 2) { + RwLockedShm shm(ipc_name_, MemInfoStruct::SHM_SIZE, RwLockedShm::RLOCK); + if (!shm.open()) { + return -1; + } + MemInfoStruct info = shm.read_mem_info(); + int64_t new_mem_size = info.total_size / num_layers / num_kv_buffers; + if (new_mem_size != current_mem_size) { + return new_mem_size; + } + return -1; + } + + const std::string &get_ipc_name() const { return ipc_name_; } + +private: + // Initialize kv cache limit in shared memory + void init_kv_cache_limit(int64_t kv_cache_limit) { + RwLockedShm shm(ipc_name_, MemInfoStruct::SHM_SIZE, RwLockedShm::WLOCK); + if (!shm.open()) { + LOGGER(ERROR, "MemInfoTracker: failed to create shm: %s", + ipc_name_.c_str()); + return; + } + MemInfoStruct info(kv_cache_limit, 0, 0); + shm.write_mem_info(info); + } + + // Cleanup shared memory + void cleanup() { + std::string path = std::string(SHM_DIR) + "/" + ipc_name_; + ::unlink(path.c_str()); + } + + // Get default IPC name (consistent with Python version logic) + static std::string obtain_default_ipc_name() { + // Prefer environment variable + const char *env_name = std::getenv("KVCACHED_IPC_NAME"); + if (env_name && env_name[0] != '\0') { + return std::string(env_name); + } + + // Construct name using pgid + pid_t pgid = getpgrp(); + char buf[256]; + snprintf(buf, sizeof(buf), "kvcached_engine_%d", static_cast(pgid)); + return std::string(buf); + } + + std::string ipc_name_; + int64_t total_mem_size_; +}; + +} // namespace kvcached diff --git a/csrc/inc/page_allocator.hpp b/csrc/inc/page_allocator.hpp new file mode 100644 index 00000000..6538bde5 --- /dev/null +++ b/csrc/inc/page_allocator.hpp @@ -0,0 +1,160 @@ +// SPDX-FileCopyrightText: Copyright contributors to the kvcached project +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "constants.hpp" +#include "mem_info_tracker.hpp" +#include "page.hpp" + +namespace kvcached { + +// Callback function types for multi-process support +using BroadcastMapCallback = + std::function &)>; +using BroadcastUnmapCallback = + std::function &)>; +using ShouldUseWorkerIpcCallback = std::function; + +// Independent InternalPage class +class InternalPage { +public: + page_id_t page_id; + int64_t page_size; + int64_t start_block; + int64_t end_block; + int64_t num_kv_blocks; + std::vector free_list; + + InternalPage(page_id_t id, int64_t size); + void init(int64_t block_mem_size); + std::vector alloc(int64_t num_blocks = 1); + void free(int64_t 
block_id); + void free_batch(const std::vector &block_ids); + bool empty() const; + bool full() const; + int64_t num_free_blocks() const; + std::vector get_free_blocks() const; + + static std::pair + get_block_range(page_id_t page_id, int64_t page_size, int64_t block_mem_size); + static int64_t get_num_blocks(int64_t page_size, int64_t block_mem_size); +}; + +class PageAllocator { +public: + PageAllocator(int64_t num_layers, int64_t mem_size_per_layer, + int64_t page_size, int64_t world_size = 1, int64_t pp_rank = 0, + bool async_sched = false, bool contiguous_layout = true, + bool enable_page_prealloc = true, int64_t num_kv_buffers = 2, + int64_t group_id = 0); + + ~PageAllocator(); + + // Page allocation and deallocation + std::shared_ptr alloc_page(); + void free_page(page_id_t page_id); + void free_pages(const std::vector &page_ids); + + // Memory management + bool resize(int64_t new_mem_size); + void trim(); + + // Status queries + int64_t get_num_free_pages() const; + int64_t get_num_inuse_pages() const; + int64_t get_num_total_pages() const; + int64_t get_num_reserved_pages() const; + int64_t get_avail_physical_pages() const; + + // Utility functions + page_id_t get_page_id(int64_t block_id, int64_t block_mem_size) const; + + // New method for efficient index grouping + std::unordered_map> + group_indices_by_page(const std::vector &indices, + int64_t block_mem_size) const; + + // Page list management + void reset_free_page_order(); + + // Thread management + void start_prealloc_thread(); + void stop_prealloc_thread(); + + // Callback function setters for multi-process support + void set_broadcast_map_callback(BroadcastMapCallback callback); + void set_broadcast_unmap_callback(BroadcastUnmapCallback callback); + void set_should_use_worker_ipc_callback(ShouldUseWorkerIpcCallback callback); + +private: + // Preallocation thread worker + void prealloc_worker(); + + // Internal methods + void map_pages(const std::vector &page_ids); + void unmap_pages(const std::vector &page_ids); + void update_memory_usage(); + void trigger_preallocation(); + void start_prealloc_thread_internal(); + void stop_prealloc_thread_internal(); + bool should_use_worker_ipc() const; + + // Configuration + int64_t num_layers_; + int64_t mem_size_per_layer_; + int64_t page_size_; + int64_t world_size_; + int64_t pp_rank_; + int64_t num_kv_buffers_; + int64_t group_id_; + bool async_sched_; + bool contiguous_layout_; + bool enable_page_prealloc_; + double gpu_utilization_; + + // Memory tracking + int64_t num_free_pages_; + int64_t num_total_pages_; + + // Page lists + std::deque free_page_list_; + std::deque reserved_page_list_; + std::deque reclaimed_page_list_; + + // Preallocation settings + int64_t min_reserved_pages_; + int64_t max_reserved_pages_; + + // Thread management + mutable std::mutex lock_; + std::condition_variable cond_; + std::atomic prealloc_running_; + std::atomic prealloc_needed_; + std::unique_ptr prealloc_thread_; + + // Memory info tracker + int64_t total_memory_size_; + std::unique_ptr mem_info_tracker_; + + // Callback functions for multi-process support + BroadcastMapCallback broadcast_map_callback_; + BroadcastUnmapCallback broadcast_unmap_callback_; + ShouldUseWorkerIpcCallback should_use_worker_ipc_callback_; +}; + +} // namespace kvcached diff --git a/csrc/page_allocator.cpp b/csrc/page_allocator.cpp new file mode 100644 index 00000000..fc2f35ee --- /dev/null +++ b/csrc/page_allocator.cpp @@ -0,0 +1,719 @@ +// SPDX-FileCopyrightText: Copyright contributors to the kvcached project +// 
SPDX-License-Identifier: Apache-2.0 + +#include "page_allocator.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include "allocator.hpp" +#include "cuda_utils.hpp" +#include "mem_info_tracker.hpp" +#include "torch_utils.hpp" + +namespace kvcached { + +// Constants +constexpr double PREALLOC_THREAD_TIMEOUT = 2.0; // seconds + +// Environment variable based constants +const int64_t MIN_RESERVED_PAGES = []() { + const char *env_val = std::getenv("KVCACHED_MIN_RESERVED_PAGES"); + return env_val ? std::atoi(env_val) : 5; +}(); + +const int64_t MAX_RESERVED_PAGES = []() { + const char *env_val = std::getenv("KVCACHED_MAX_RESERVED_PAGES"); + return env_val ? std::atoi(env_val) : 10; +}(); + +const double GPU_UTILIZATION = []() { + const char *env_val = std::getenv("KVCACHED_GPU_UTILIZATION"); + return env_val ? std::atof(env_val) : 0.95; +}(); + +// InternalPage implementation +InternalPage::InternalPage(page_id_t id, int64_t size) + : page_id(id), page_size(size), start_block(0), end_block(0), + num_kv_blocks(0) {} + +void InternalPage::init(int64_t block_mem_size) { + auto range = get_block_range(page_id, page_size, block_mem_size); + start_block = range.first; + end_block = range.second; + num_kv_blocks = end_block - start_block; + free_list.clear(); + for (int64_t i = start_block; i < end_block; ++i) { + free_list.push_back(i); + } +} + +std::vector InternalPage::alloc(int64_t num_blocks) { + if (free_list.size() < static_cast(num_blocks)) { + throw std::runtime_error("Not enough free blocks in page"); + } + + std::vector block_ids; + block_ids.reserve(num_blocks); + for (int64_t i = 0; i < num_blocks; ++i) { + block_ids.push_back(free_list[i]); + } + free_list.erase(free_list.begin(), free_list.begin() + num_blocks); + return block_ids; +} + +void InternalPage::free(int64_t block_id) { free_list.push_back(block_id); } + +void InternalPage::free_batch(const std::vector &block_ids) { + free_list.insert(free_list.end(), block_ids.begin(), block_ids.end()); +} + +bool InternalPage::empty() const { + return free_list.size() == static_cast(num_kv_blocks); +} + +bool InternalPage::full() const { return free_list.empty(); } + +int64_t InternalPage::num_free_blocks() const { + return static_cast(free_list.size()); +} + +std::vector InternalPage::get_free_blocks() const { return free_list; } + +std::pair +InternalPage::get_block_range(page_id_t page_id, int64_t page_size, + int64_t block_mem_size) { + + int64_t start_block = + (page_id * page_size + block_mem_size - 1) / block_mem_size; + int64_t end_block = ((page_id + 1) * page_size) / block_mem_size; + return {start_block, end_block}; +} + +int64_t InternalPage::get_num_blocks(int64_t page_size, + int64_t block_mem_size) { + return page_size / block_mem_size; +} + +// PageAllocator implementation +PageAllocator::PageAllocator(int64_t num_layers, int64_t mem_size_per_layer, + int64_t page_size, int64_t world_size, + int64_t pp_rank, bool async_sched, + bool contiguous_layout, bool enable_page_prealloc, + int64_t num_kv_buffers, int64_t group_id) + : num_layers_(num_layers), mem_size_per_layer_(mem_size_per_layer), + page_size_(page_size), world_size_(world_size), pp_rank_(pp_rank), + num_kv_buffers_(num_kv_buffers), group_id_(group_id), + async_sched_(async_sched), contiguous_layout_(contiguous_layout), + enable_page_prealloc_(enable_page_prealloc), + gpu_utilization_(GPU_UTILIZATION), + num_free_pages_(mem_size_per_layer / page_size), + num_total_pages_(mem_size_per_layer / page_size), + 
min_reserved_pages_(std::min(num_free_pages_, MIN_RESERVED_PAGES)), + max_reserved_pages_(std::min(num_free_pages_, MAX_RESERVED_PAGES)), + prealloc_running_(false), prealloc_needed_(false), + total_memory_size_(mem_size_per_layer * num_layers * num_kv_buffers) { + + // Initialize free page list + for (int64_t i = 0; i < num_free_pages_; ++i) { + free_page_list_.push_back(i); + } + + // Initialize memory info tracker + mem_info_tracker_ = + std::make_unique(total_memory_size_, group_id_); + + std::cout << "Init C++ PageAllocator: " + << "num_layers=" << num_layers << ", " + << "mem_size_per_layer=" << mem_size_per_layer / (1024 * 1024) + << "MB, " + << "total_mem_size=" + << (num_kv_buffers * num_layers * mem_size_per_layer) / + (1024 * 1024) + << "MB, " + << "page_size=" << page_size / (1024 * 1024) << "MB, " + << "world_size=" << world_size << ", " + << "pp_rank=" << pp_rank << ", " + << "async_sched=" << async_sched << ", " + << "contiguous_layout=" << contiguous_layout << ", " + << "enable_prealloc=" << enable_page_prealloc << ", " + << "num_kv_buffers=" << num_kv_buffers << ", " + << "group_id=" << group_id << ", " + << "min_reserved_pages=" << min_reserved_pages_ << ", " + << "max_reserved_pages=" << max_reserved_pages_ << std::endl; +} + +PageAllocator::~PageAllocator() { + try { + if (enable_page_prealloc_ && prealloc_thread_) { + stop_prealloc_thread_internal(); + } + } catch (...) { + // Silently ignore exceptions during cleanup + } +} + +std::shared_ptr PageAllocator::alloc_page() { + auto start_time = std::chrono::steady_clock::now(); + + std::unique_lock lock(lock_); + page_id_t page_id = -1; + + while (page_id == -1) { + // Fast path: allocate from reserved pages + if (!reserved_page_list_.empty()) { + page_id = reserved_page_list_.front(); + reserved_page_list_.pop_front(); + num_free_pages_--; + + // Trigger preallocation if reserved pool is getting low + if (free_page_list_.size() != 0 && + reserved_page_list_.size() < + static_cast(min_reserved_pages_)) { + max_reserved_pages_ = + std::max(max_reserved_pages_ * 4, num_total_pages_); + LOGGER(INFO, "Increase max_reserved_pages_ %ld -> %ld", + max_reserved_pages_ / 2, max_reserved_pages_); + prealloc_needed_ = true; + cond_.notify_all(); + // pass + } + + update_memory_usage(); + auto end_time = std::chrono::steady_clock::now(); + auto duration = std::chrono::duration_cast( + end_time - start_time); + LOGGER(DEBUG, "alloc 1 page fast path cost %lu us", duration.count()); + // std::cout << "alloc 1 page fast path cost " << duration.count() << " + // us" << std::endl; + + return std::make_shared(page_id, page_size_); + } + + // Slow path: allocate from free pages + if (!free_page_list_.empty()) { + page_id = free_page_list_.front(); + free_page_list_.pop_front(); + num_free_pages_--; + break; + } + + if (num_free_pages_ <= 0) { + throw std::runtime_error("No free pages left"); + } + + if (!enable_page_prealloc_) { + throw std::runtime_error( + "Inconsistent page allocator state: no free pages available"); + } + + // Wait for background preallocation + cond_.wait(lock); + } + + lock.unlock(); + + try { + map_pages({page_id}); + } catch (const std::exception &e) { + std::lock_guard guard(lock_); + free_page_list_.push_front(page_id); + num_free_pages_++; + cond_.notify_all(); + throw std::runtime_error("Failed to map page " + std::to_string(page_id) + + ": " + e.what()); + } + + if (enable_page_prealloc_) { + trigger_preallocation(); + } + + update_memory_usage(); + auto end_time = std::chrono::steady_clock::now(); + auto 
duration = std::chrono::duration_cast( + end_time - start_time); + LOGGER(DEBUG, "alloc 1 page slow path cost %lu us", duration.count()); + + return std::make_shared(page_id, page_size_); +} + +void PageAllocator::free_page(page_id_t page_id) { + std::lock_guard lock(lock_); + + num_free_pages_++; + + if (reserved_page_list_.size() < static_cast(max_reserved_pages_)) { + // Fast path: reserve page + reserved_page_list_.push_back(page_id); + update_memory_usage(); + cond_.notify_all(); + return; + } + + lock_.unlock(); + + // Slow path: free page and unmap + unmap_pages({page_id}); + + lock_.lock(); + free_page_list_.push_back(page_id); + update_memory_usage(); + cond_.notify_all(); +} + +void PageAllocator::free_pages(const std::vector &page_ids) { + auto start_time = std::chrono::steady_clock::now(); + + std::lock_guard lock(lock_); + + num_free_pages_ += page_ids.size(); + int64_t num_to_reserve = max_reserved_pages_ - reserved_page_list_.size(); + + if (num_to_reserve > 0) { + // Fast path: reserve pages + auto reserve_end = + page_ids.begin() + + std::min(static_cast(num_to_reserve), page_ids.size()); + reserved_page_list_.insert(reserved_page_list_.end(), page_ids.begin(), + reserve_end); + + std::vector remaining_pages(reserve_end, page_ids.end()); + + if (remaining_pages.empty()) { + update_memory_usage(); + cond_.notify_all(); + return; + } + + lock_.unlock(); + + // Slow path: free remaining pages + unmap_pages(remaining_pages); + + lock_.lock(); + free_page_list_.insert(free_page_list_.end(), remaining_pages.begin(), + remaining_pages.end()); + } else { + lock_.unlock(); + unmap_pages(page_ids); + lock_.lock(); + free_page_list_.insert(free_page_list_.end(), page_ids.begin(), + page_ids.end()); + } + + update_memory_usage(); + cond_.notify_all(); + + auto end_time = std::chrono::steady_clock::now(); + auto duration = std::chrono::duration_cast( + end_time - start_time); + LOGGER(DEBUG, "free %ld pages cost %lu us", page_ids.size(), + duration.count()); +} + +bool PageAllocator::resize(int64_t new_mem_size) { + int64_t new_num_pages = new_mem_size / page_size_; + std::lock_guard lock(lock_); + + if (new_num_pages < get_num_inuse_pages()) { + return false; + } + + if (new_num_pages == num_total_pages_) { + return true; + } else if (new_num_pages > num_total_pages_) { + int64_t num_to_expand = new_num_pages - num_total_pages_; + + // Reuse previously reclaimed pages first + int64_t num_to_reuse = std::min( + static_cast(reclaimed_page_list_.size()), num_to_expand); + if (num_to_reuse > 0) { + for (int64_t i = 0; i < num_to_reuse; ++i) { + free_page_list_.push_back(reclaimed_page_list_.front()); + reclaimed_page_list_.pop_front(); + } + num_to_expand -= num_to_reuse; + num_free_pages_ += num_to_reuse; + } + + // Allocate new pages if needed + if (num_to_expand > 0) { + for (int64_t i = num_total_pages_; i < num_total_pages_ + num_to_expand; + ++i) { + free_page_list_.push_back(i); + } + num_free_pages_ += num_to_expand; + } + num_total_pages_ = new_num_pages; + update_memory_usage(); + } else { + int64_t num_to_reclaim = num_total_pages_ - new_num_pages; + + if (free_page_list_.size() < static_cast(num_to_reclaim)) { + // Need to trim reserved pages first + if (!reserved_page_list_.empty()) { + std::vector pages_to_unmap(reserved_page_list_.begin(), + reserved_page_list_.end()); + reserved_page_list_.clear(); + + lock_.unlock(); + unmap_pages(pages_to_unmap); + lock_.lock(); + + free_page_list_.insert(free_page_list_.end(), pages_to_unmap.begin(), + pages_to_unmap.end()); + 
update_memory_usage(); + } + } + + if (free_page_list_.size() < static_cast(num_to_reclaim)) { + return false; + } + + for (int64_t i = 0; i < num_to_reclaim; ++i) { + reclaimed_page_list_.push_back(free_page_list_.back()); + free_page_list_.pop_back(); + } + num_free_pages_ -= num_to_reclaim; + num_total_pages_ = new_num_pages; + } + return true; +} + +void PageAllocator::trim() { + std::lock_guard lock(lock_); + std::vector pages_to_unmap(reserved_page_list_.begin(), + reserved_page_list_.end()); + reserved_page_list_.clear(); + + if (pages_to_unmap.empty()) { + update_memory_usage(); + return; + } + + lock_.unlock(); + unmap_pages(pages_to_unmap); + lock_.lock(); + + free_page_list_.insert(free_page_list_.end(), pages_to_unmap.begin(), + pages_to_unmap.end()); + update_memory_usage(); +} + +int64_t PageAllocator::get_num_free_pages() const { return num_free_pages_; } + +int64_t PageAllocator::get_num_inuse_pages() const { + return num_total_pages_ - num_free_pages_; +} + +int64_t PageAllocator::get_num_total_pages() const { return num_total_pages_; } + +int64_t PageAllocator::get_num_reserved_pages() const { + std::lock_guard lock(lock_); + return reserved_page_list_.size(); +} + +int64_t PageAllocator::get_avail_physical_pages() const { + size_t avail_phy_mem_size, total_phy_mem_size; + cudaMemGetInfo(&avail_phy_mem_size, &total_phy_mem_size); + + size_t headroom = total_phy_mem_size * (1.0 - gpu_utilization_); + avail_phy_mem_size = + std::max(avail_phy_mem_size - headroom, static_cast(0)); + + // Calculate available pages considering layers and KV buffers + int64_t avail_phy_pages = avail_phy_mem_size / page_size_; + int64_t avail_pages_per_layer = + avail_phy_pages / num_layers_ / num_kv_buffers_; + return avail_pages_per_layer; +} + +page_id_t PageAllocator::get_page_id(int64_t block_id, + int64_t block_mem_size) const { + return block_id * block_mem_size / page_size_; +} + +std::unordered_map> +PageAllocator::group_indices_by_page(const std::vector &indices, + int64_t block_mem_size) const { + + auto start_time = std::chrono::steady_clock::now(); + + std::unordered_map> result; + + // Pre-calculate constants for efficiency + int64_t blocks_per_page = page_size_ / block_mem_size; + + // Reserve space for efficiency + result.reserve(indices.size() / blocks_per_page + 1); + + // Group indices by page_id + for (int64_t idx : indices) { + page_id_t page_id = get_page_id(idx, block_mem_size); + result[page_id].push_back(idx); + } + + auto end_time = std::chrono::steady_clock::now(); + auto duration = std::chrono::duration_cast( + end_time - start_time); + LOGGER(DEBUG, "C++ group_indices_by_page processed %zu indices in %ld us", + indices.size(), duration.count()); + + return result; +} + +// Callback function setters +void PageAllocator::set_broadcast_map_callback(BroadcastMapCallback callback) { + std::lock_guard lock(lock_); + broadcast_map_callback_ = callback; + LOGGER(INFO, "Broadcast map callback set for PageAllocator (world_size=%ld)", + world_size_); +} + +void PageAllocator::set_broadcast_unmap_callback( + BroadcastUnmapCallback callback) { + std::lock_guard lock(lock_); + broadcast_unmap_callback_ = callback; + LOGGER(INFO, + "Broadcast unmap callback set for PageAllocator (world_size=%ld)", + world_size_); +} + +void PageAllocator::set_should_use_worker_ipc_callback( + ShouldUseWorkerIpcCallback callback) { + std::lock_guard lock(lock_); + should_use_worker_ipc_callback_ = callback; + LOGGER(INFO, "Should-use-worker-ipc callback set for PageAllocator"); +} + +void 
PageAllocator::start_prealloc_thread() { + if (enable_page_prealloc_) { + start_prealloc_thread_internal(); + } +} + +void PageAllocator::stop_prealloc_thread() { + if (enable_page_prealloc_) { + stop_prealloc_thread_internal(); + } +} + +void PageAllocator::prealloc_worker() { + auto start_time = std::chrono::steady_clock::now(); + + while (prealloc_running_) { + std::unique_lock lock(lock_); + + // Wait until preallocation is needed or thread is stopped + while (!prealloc_needed_ && prealloc_running_) { + cond_.wait(lock); + } + + LOGGER(INFO, "prealloc worker triggered..."); + if (!prealloc_running_) { + break; + } + + start_time = std::chrono::steady_clock::now(); + prealloc_needed_ = false; + + int64_t current_reserved = reserved_page_list_.size(); + int64_t to_reserve = std::max(0L, min_reserved_pages_ - current_reserved); + // Only try to reserve up to the available free pages and physical memory + to_reserve = + std::min({to_reserve, static_cast(free_page_list_.size()), + get_avail_physical_pages()}); + + LOGGER(INFO, + "max_reserved_pages: %ld, min_reserved_pages: %ld, " + "current_reserved: %ld, to_reserve: %ld, len(free_page_list): %zu", + max_reserved_pages_, min_reserved_pages_, current_reserved, + to_reserve, free_page_list_.size()); + + if (to_reserve <= 0) { + auto end_time = std::chrono::steady_clock::now(); + auto duration = std::chrono::duration_cast( + end_time - start_time); + LOGGER(INFO, + "prealloc cost: %ld us, no need to preallocate(to_reserve: %ld).", + duration.count(), to_reserve); + continue; + } + + std::vector pages_to_reserve; + pages_to_reserve.reserve(to_reserve); + + // Get pages from free list + for (int64_t i = 0; i < to_reserve && !free_page_list_.empty(); ++i) { + pages_to_reserve.push_back(free_page_list_.front()); + free_page_list_.pop_front(); + } + + lock.unlock(); + + if (!pages_to_reserve.empty()) { + try { + map_pages(pages_to_reserve); + lock.lock(); + reserved_page_list_.insert(reserved_page_list_.end(), + pages_to_reserve.begin(), + pages_to_reserve.end()); + update_memory_usage(); + cond_.notify_all(); + LOGGER(INFO, "Preallocated %ld pages, reserved=%ld", + pages_to_reserve.size(), reserved_page_list_.size()); + } catch (const std::exception &e) { + lock.lock(); + free_page_list_.insert(free_page_list_.begin(), + pages_to_reserve.begin(), + pages_to_reserve.end()); + cond_.notify_all(); + LOGGER(ERROR, "Failed to preallocate %ld pages: %s", + pages_to_reserve.size(), e.what()); + } + + auto end_time = std::chrono::steady_clock::now(); + auto duration = std::chrono::duration_cast( + end_time - start_time); + LOGGER(INFO, "prealloc cost: %ld us, prealloc %ld pages.", + duration.count(), pages_to_reserve.size()); + } + } +} + +void PageAllocator::map_pages(const std::vector &page_ids) { + std::vector offsets; + offsets.reserve(page_ids.size()); + + if (contiguous_layout_) { + for (page_id_t pid : page_ids) { + offsets.push_back(pid * page_size_ * num_layers_ * num_kv_buffers_); + } + } else { + for (page_id_t pid : page_ids) { + offsets.push_back(pid * page_size_); + } + } + + if ((world_size_ > 1 || should_use_worker_ipc()) && broadcast_map_callback_) { + // Multi-process mode: execute map on all TP workers via broadcast callback + broadcast_map_callback_(world_size_, offsets); + } else { + // Single-process mode: directly call FTensorAllocator + auto allocator = FTensorAllocator::global_allocator(group_id_); + bool success = allocator->map_to_kv_tensors(offsets); + if (!success) { + throw std::runtime_error("Failed to map pages to KV 
tensors"); + } + } + + LOGGER(INFO, "Mapped %zu pages to KV tensors", page_ids.size()); +} + +void PageAllocator::unmap_pages(const std::vector &page_ids) { + auto start_time = std::chrono::steady_clock::now(); + + std::vector offsets; + offsets.reserve(page_ids.size()); + + if (contiguous_layout_) { + for (page_id_t pid : page_ids) { + offsets.push_back(pid * page_size_ * num_layers_ * num_kv_buffers_); + } + } else { + for (page_id_t pid : page_ids) { + offsets.push_back(pid * page_size_); + } + } + + if ((world_size_ > 1 || should_use_worker_ipc()) && + broadcast_unmap_callback_) { + // Multi-process mode: execute unmap on all TP workers via broadcast + // callback + broadcast_unmap_callback_(world_size_, offsets); + } else { + // Need to synchronize CUDA first in async scheduling mode + if (async_sched_) { + CHECK_RT(cudaDeviceSynchronize()); + } + auto allocator = FTensorAllocator::global_allocator(group_id_); + bool success = allocator->unmap_from_kv_tensors(offsets); + if (!success) { + throw std::runtime_error("Failed to unmap pages from KV tensors"); + } + } + + auto end_time = std::chrono::steady_clock::now(); + auto duration = std::chrono::duration_cast( + end_time - start_time); + LOGGER(INFO, "Unmapped %zu pages from KV tensors, cost: %lu us", + page_ids.size(), duration.count()); +} + +void PageAllocator::update_memory_usage() { + // Calculate currently used physical memory (excluding preallocated pages) + int64_t used_phy_mem_size = + get_num_inuse_pages() * num_layers_ * page_size_ * num_kv_buffers_; + // Calculate physical memory occupied by preallocated pages + int64_t prealloc_phy_mem_size = + static_cast(reserved_page_list_.size()) * num_layers_ * + page_size_ * num_kv_buffers_; + + if (mem_info_tracker_) { + mem_info_tracker_->update_memory_usage(used_phy_mem_size, + prealloc_phy_mem_size); + } +} + +void PageAllocator::reset_free_page_order() { + std::lock_guard lock(lock_); + std::vector sorted_pages(free_page_list_.begin(), + free_page_list_.end()); + std::sort(sorted_pages.begin(), sorted_pages.end()); + free_page_list_.assign(sorted_pages.begin(), sorted_pages.end()); +} + +void PageAllocator::trigger_preallocation() { + std::lock_guard lock(lock_); + prealloc_needed_ = true; + cond_.notify_all(); +} + +void PageAllocator::start_prealloc_thread_internal() { + if (!prealloc_thread_) { + prealloc_running_ = true; + prealloc_thread_ = + std::make_unique(&PageAllocator::prealloc_worker, this); + + // Initial preallocation trigger + trigger_preallocation(); + } +} + +void PageAllocator::stop_prealloc_thread_internal() { + if (prealloc_thread_) { + { + std::lock_guard lock(lock_); + prealloc_running_ = false; + cond_.notify_all(); + } + + prealloc_thread_->join(); + prealloc_thread_.reset(); + LOGGER(DEBUG, "Stopped page preallocation thread"); + } +} + +bool PageAllocator::should_use_worker_ipc() const { + if (should_use_worker_ipc_callback_) { + return should_use_worker_ipc_callback_(); + } + return false; +} + +} // namespace kvcached \ No newline at end of file diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 28830555..3177f3ae 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -1,6 +1,7 @@ // SPDX-FileCopyrightText: Copyright contributors to the kvcached project // SPDX-License-Identifier: Apache-2.0 +#include #include #include #include @@ -8,6 +9,7 @@ #include "allocator.hpp" #include "constants.hpp" +#include "page_allocator.hpp" #include "torch_utils.hpp" namespace kvcached { @@ -54,6 +56,113 @@ bool 
unmap_from_kv_tensors(const std::vector &offsets, return allocator->unmap_from_kv_tensors(offsets); } +// PageAllocator bindings +std::shared_ptr create_page_allocator( + int64_t num_layers, int64_t mem_size_per_layer, int64_t page_size, + int64_t world_size = 1, int64_t pp_rank = 0, bool async_sched = false, + bool contiguous_layout = true, bool enable_page_prealloc = true, + int64_t num_kv_buffers = 2, int64_t group_id = 0) { + + return std::make_shared( + num_layers, mem_size_per_layer, page_size, world_size, pp_rank, + async_sched, contiguous_layout, enable_page_prealloc, num_kv_buffers, + group_id); +} + +// PageAllocator method bindings +void page_allocator_start_prealloc_thread( + std::shared_ptr allocator) { + allocator->start_prealloc_thread(); +} + +void page_allocator_stop_prealloc_thread( + std::shared_ptr allocator) { + allocator->stop_prealloc_thread(); +} + +std::shared_ptr +page_allocator_alloc_page(std::shared_ptr allocator) { + return allocator->alloc_page(); +} + +void page_allocator_free_page(std::shared_ptr allocator, + page_id_t page_id) { + allocator->free_page(page_id); +} + +void page_allocator_free_pages(std::shared_ptr allocator, + const std::vector &page_ids) { + allocator->free_pages(page_ids); +} + +bool page_allocator_resize(std::shared_ptr allocator, + int64_t new_mem_size) { + return allocator->resize(new_mem_size); +} + +void page_allocator_trim(std::shared_ptr allocator) { + allocator->trim(); +} + +void page_allocator_reset_free_page_order( + std::shared_ptr allocator) { + allocator->reset_free_page_order(); +} + +int64_t +page_allocator_get_num_free_pages(std::shared_ptr allocator) { + return allocator->get_num_free_pages(); +} + +int64_t +page_allocator_get_num_inuse_pages(std::shared_ptr allocator) { + return allocator->get_num_inuse_pages(); +} + +int64_t +page_allocator_get_num_total_pages(std::shared_ptr allocator) { + return allocator->get_num_total_pages(); +} + +int64_t page_allocator_get_num_reserved_pages( + std::shared_ptr allocator) { + return allocator->get_num_reserved_pages(); +} + +int64_t page_allocator_get_avail_physical_pages( + std::shared_ptr allocator) { + return allocator->get_avail_physical_pages(); +} + +void page_allocator_set_broadcast_map_callback( + std::shared_ptr allocator, BroadcastMapCallback callback) { + allocator->set_broadcast_map_callback(callback); +} + +void page_allocator_set_broadcast_unmap_callback( + std::shared_ptr allocator, BroadcastUnmapCallback callback) { + allocator->set_broadcast_unmap_callback(callback); +} + +void page_allocator_set_should_use_worker_ipc_callback( + std::shared_ptr allocator, + ShouldUseWorkerIpcCallback callback) { + allocator->set_should_use_worker_ipc_callback(callback); +} + +page_id_t page_allocator_get_page_id(std::shared_ptr allocator, + int64_t block_id, int64_t block_mem_size) { + return allocator->get_page_id(block_id, block_mem_size); +} + +// New function for grouping indices by page +std::unordered_map> +page_allocator_group_indices_by_page(std::shared_ptr allocator, + const std::vector &indices, + int64_t block_mem_size) { + return allocator->group_indices_by_page(indices, block_mem_size); +} + } // namespace kvcached PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { @@ -73,4 +182,59 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { py::arg("offsets"), py::arg("group_id") = 0); m.def("unmap_from_kv_tensors", &kvcached::unmap_from_kv_tensors, "unmap_from_kv_tensors", py::arg("offsets"), py::arg("group_id") = 0); + + // PageAllocator bindings + py::class_>( + m, "PageAllocator") + 
.def(py::init(&kvcached::create_page_allocator), py::arg("num_layers"), + py::arg("mem_size_per_layer"), py::arg("page_size"), + py::arg("world_size") = 1, py::arg("pp_rank") = 0, + py::arg("async_sched") = false, py::arg("contiguous_layout") = true, + py::arg("enable_page_prealloc") = true, + py::arg("num_kv_buffers") = 2, py::arg("group_id") = 0) + .def("start_prealloc_thread", + &kvcached::page_allocator_start_prealloc_thread) + .def("stop_prealloc_thread", + &kvcached::page_allocator_stop_prealloc_thread) + .def("alloc_page", &kvcached::page_allocator_alloc_page) + .def("free_page", &kvcached::page_allocator_free_page) + .def("free_pages", &kvcached::page_allocator_free_pages) + .def("resize", &kvcached::page_allocator_resize) + .def("trim", &kvcached::page_allocator_trim) + .def("reset_free_page_order", + &kvcached::page_allocator_reset_free_page_order) + .def("get_num_free_pages", &kvcached::page_allocator_get_num_free_pages) + .def("get_num_inuse_pages", &kvcached::page_allocator_get_num_inuse_pages) + .def("get_num_total_pages", &kvcached::page_allocator_get_num_total_pages) + .def("get_num_reserved_pages", + &kvcached::page_allocator_get_num_reserved_pages) + .def("get_avail_physical_pages", + &kvcached::page_allocator_get_avail_physical_pages) + .def("get_page_id", &kvcached::page_allocator_get_page_id) + .def("group_indices_by_page", + &kvcached::page_allocator_group_indices_by_page) + .def("set_broadcast_map_callback", + &kvcached::page_allocator_set_broadcast_map_callback) + .def("set_broadcast_unmap_callback", + &kvcached::page_allocator_set_broadcast_unmap_callback) + .def("set_should_use_worker_ipc_callback", + &kvcached::page_allocator_set_should_use_worker_ipc_callback); + + // InternalPage bindings (now as independent class) + py::class_>( + m, "InternalPage") + .def(py::init(), py::arg("page_id"), + py::arg("page_size")) + .def_readonly("page_id", &kvcached::InternalPage::page_id) + .def_readonly("page_size", &kvcached::InternalPage::page_size) + .def("init", &kvcached::InternalPage::init) + .def("alloc", &kvcached::InternalPage::alloc) + .def("free", &kvcached::InternalPage::free) + .def("free_batch", &kvcached::InternalPage::free_batch) + .def("empty", &kvcached::InternalPage::empty) + .def("full", &kvcached::InternalPage::full) + .def("num_free_blocks", &kvcached::InternalPage::num_free_blocks) + .def("get_free_blocks", &kvcached::InternalPage::get_free_blocks) + .def_static("get_block_range", &kvcached::InternalPage::get_block_range) + .def_static("get_num_blocks", &kvcached::InternalPage::get_num_blocks); } diff --git a/csrc/util.cpp b/csrc/util.cpp new file mode 100644 index 00000000..57fceb4b --- /dev/null +++ b/csrc/util.cpp @@ -0,0 +1,41 @@ +// SPDX-FileCopyrightText: Copyright contributors to the kvcached project +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include + +#define NSEC_PER_USEC (1000ULL) +#define USEC_PER_SEC (1000000ULL) + +uint64_t timespec_to_us(struct timespec ts) { + return (ts.tv_sec * USEC_PER_SEC + ts.tv_nsec / NSEC_PER_USEC); +} + +uint64_t get_current_timestamp_in_us() { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return timespec_to_us(ts); +} + +void now_to_string(char *buf, int length) { + auto now = std::chrono::system_clock::now(); + auto seconds = std::chrono::system_clock::to_time_t(now); + auto us = std::chrono::duration_cast( + now.time_since_epoch()) % + 1000000; + + std::tm tm_struct = {}; + +#ifdef _WIN32 + localtime_s(&tm_struct, &seconds); +#else + localtime_r(&seconds, 
&tm_struct); +#endif + + char buffer[64]; + std::strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", &tm_struct); + + std::snprintf(buf, length, "%s.%06ld", buffer, static_cast(us.count())); +} \ No newline at end of file diff --git a/kvcached/integration/vllm/patches.py b/kvcached/integration/vllm/patches.py index e97c003e..a87e47e3 100644 --- a/kvcached/integration/vllm/patches.py +++ b/kvcached/integration/vllm/patches.py @@ -308,6 +308,7 @@ def __init__( self.num_gpu_blocks = num_gpu_blocks self.enable_kv_cache_events = enable_kv_cache_events self.kv_event_queue = [] # type: ignore[var-annotated] + self.kv_block_pool = [KVCacheBlockClass(i) for i in range(num_gpu_blocks)] from kvcached.integration.vllm.interfaces import get_kv_cache_manager @@ -481,7 +482,12 @@ def get_new_blocks( block_ids = self.kv_cache_manager.alloc(num_blocks) assert block_ids is not None and len(block_ids) == num_blocks - return [KVCacheBlockClass(bid, ref_cnt=1) for bid in block_ids] + blocks = [] + for bid in block_ids: + block = self.kv_block_pool[bid] + block.ref_cnt = 1 + blocks.append(block) + return blocks def touch( self, blocks: list["KVCacheBlock"] | tuple[list["KVCacheBlock"], ...] @@ -1208,7 +1214,10 @@ def _reshape_kv_cache_tensors_from_kvcached( self, kv_cache_config, kv_cache_raw_tensors, *args: Any, **kwargs: Any ): import torch - from vllm.utils.torch_utils import get_dtype_size + try: + from vllm.utils.torch_utils import get_dtype_size + except ImportError: + from vllm.utils import get_dtype_size # type: ignore[attr-defined] kv_caches: dict[str, torch.Tensor] = {} diff --git a/kvcached/kv_cache_manager.py b/kvcached/kv_cache_manager.py index f22b4018..b45f1b60 100644 --- a/kvcached/kv_cache_manager.py +++ b/kvcached/kv_cache_manager.py @@ -9,18 +9,25 @@ - Blocks: Smaller units within pages that are allocated to store KV cache data """ +from __future__ import annotations + import functools import threading import time -from collections import defaultdict -from typing import Dict, List, Optional +from typing import Any, Dict, List, Optional from kvcached.locks import NoOpLock -from kvcached.page_allocator import Page, PageAllocator from kvcached.tp_ipc_util import broadcast_kv_tensors_created from kvcached.utils import PAGE_SIZE, SANITY_CHECK, get_kvcached_logger from kvcached.vmm_ops import kv_tensors_created +try: + import kvcached.vmm_ops as kvcached_cpp + PageAllocator = kvcached_cpp.PageAllocator + InternalPage: Any = kvcached_cpp.InternalPage +except ImportError as e: + raise ImportError(f"Failed to import kvcached.vmm_ops. Please ensure the C++ extension is built properly. err: {e}") + logger = get_kvcached_logger() KV_TENSOR_WAIT_TIMEOUT: float = 10.0 # seconds @@ -91,13 +98,51 @@ def __init__( self.world_size, pp_rank=self.pp_rank, async_sched=async_sched, + contiguous_layout=True, + enable_page_prealloc=True, num_kv_buffers=self.num_kv_buffers, group_id=self.group_id, ) + # Register should_use_worker_ipc callback so C++ PageAllocator + # knows when to use broadcast IPC even with world_size == 1 + # (e.g. vLLM V1 EngineCore + worker in separate processes). 
+ try: + from kvcached.integration.vllm.interfaces import should_use_worker_ipc + self.page_allocator.set_should_use_worker_ipc_callback(should_use_worker_ipc) + use_worker_ipc = should_use_worker_ipc() + except ImportError: + use_worker_ipc = False + + if self.world_size > 1 or use_worker_ipc: + try: + from kvcached.tp_ipc_util import ( + broadcast_map_to_kv_tensors, + broadcast_unmap_from_kv_tensors, + ) + + # Wrap Python functions to match C++ callback signature + def map_callback(world_size: int, offsets: List[int], pp_rank: int = 0, group_id: int = 0) -> None: + """Wrapper for Python broadcast function""" + broadcast_map_to_kv_tensors(world_size, offsets, pp_rank, group_id) + + def unmap_callback(world_size: int, offsets: List[int]) -> None: + """Wrapper for Python broadcast function""" + broadcast_unmap_from_kv_tensors(world_size, offsets, pp_rank, group_id) + + # Set the callbacks in the PageAllocator + self.page_allocator.set_broadcast_map_callback(map_callback) + self.page_allocator.set_broadcast_unmap_callback(unmap_callback) + + logger.info("Set up broadcast callbacks for multi-process (world_size=%d, use_worker_ipc=%s)", + self.world_size, use_worker_ipc) + except ImportError as e: + logger.warning("Failed to import tp_ipc_util module: %s. Broadcast callbacks will not be available.", e) + except Exception as e: + logger.warning("Failed to set up broadcast callbacks: %s. Falling back to single-process mode.", e) self.num_avail_blocks = 0 # Only count free blocks in avail_pages - self.avail_pages: Dict[int, Page] = {} - self.full_pages: Dict[int, Page] = {} + self.avail_pages: Dict[int, InternalPage] = {} + self.full_pages: Dict[int, InternalPage] = {} self.reserved_blocks: List[int] = [] self.null_block: Optional[list[int]] = None @@ -182,10 +227,11 @@ def _alloc(self, # finished and then perform the usual capacity check. 
self._wait_post_init() - new_mem_size = self.page_allocator.mem_info_tracker.check_and_get_resize_target( - self.mem_size, self.num_layers, self.num_kv_buffers) - if new_mem_size is not None: - self.resize(new_mem_size) + # todo: check if we need to resize + #new_mem_size = self.page_allocator.mem_info_tracker.check_and_get_resize_target( + # self.mem_size, self.num_layers, self.num_kv_buffers) + #if new_mem_size is not None: + # self.resize(new_mem_size) if self.available_size() < need_size: logger.warning(f"available_size()={self.available_size()} < " @@ -193,7 +239,7 @@ def _alloc(self, return None ret_index = [] - page: Optional[Page] = None + page: Optional[InternalPage] = None remaining_need = need_size @@ -244,11 +290,7 @@ def free(self, indices: List[int]): raise ValueError(f"Freed index {idx} is in " " reserved_blocks, which is not allowed.") - # Group indices by page_id - idx_dict = defaultdict(list) - for idx in indices: - page_id = self.page_allocator.get_page_id(idx, self.block_mem_size) - idx_dict[page_id].append(idx) + idx_dict = self.page_allocator.group_indices_by_page(indices, self.block_mem_size) pages_to_free: List[int] = [] for page_id, idxs in idx_dict.items(): @@ -346,11 +388,8 @@ def available_size(self) -> int: if self.in_shrink: blocks_from_free_pages = 0 else: - virtual_free_pages = self.page_allocator.get_num_free_pages() - physical_free_pages = self.page_allocator.get_avail_physical_pages( - ) + self.page_allocator.get_num_reserved_pages() - free_pages = min(virtual_free_pages, physical_free_pages) - blocks_from_free_pages = free_pages * Page.get_num_blocks( + free_pages = self.page_allocator.get_num_free_pages() + blocks_from_free_pages = free_pages * InternalPage.get_num_blocks( self.page_size, self.block_mem_size) return avail_blocks + blocks_from_free_pages @@ -424,13 +463,13 @@ def clear(self): @synchronized def _get_num_alloced_blocks(self) -> int: # Blocks from fully allocated pages - blocks_from_full_pages = len(self.full_pages) * Page.get_num_blocks( + blocks_from_full_pages = len(self.full_pages) * InternalPage.get_num_blocks( self.page_size, self.block_mem_size) # Blocks from partially allocated pages. num_avail_blocks is the number # of free blocks in the partially allocated pages so the number of # allocated blocks is the total number of blocks in the partially # allocated pages minus the number of free blocks. 
- blocks_from_avail_pages = len(self.avail_pages) * Page.get_num_blocks( + blocks_from_avail_pages = len(self.avail_pages) * InternalPage.get_num_blocks( self.page_size, self.block_mem_size) - self.num_avail_blocks # Blocks from reserved blocks blocks_from_reserved_blocks = len(self.reserved_blocks) diff --git a/kvcached/page_allocator.py b/kvcached/page_allocator.py deleted file mode 100644 index 830c02ea..00000000 --- a/kvcached/page_allocator.py +++ /dev/null @@ -1,592 +0,0 @@ -# SPDX-FileCopyrightText: Copyright contributors to the kvcached project -# SPDX-License-Identifier: Apache-2.0 - -import threading -from collections import deque -from typing import List, Optional, Tuple, cast - -import torch - -from kvcached.locks import ConditionLike, LockLike, NoOpCondition, NoOpLock -from kvcached.mem_info_tracker import MemInfoTracker -from kvcached.tp_ipc_util import broadcast_map_to_kv_tensors, broadcast_unmap_from_kv_tensors -from kvcached.utils import ( - CONTIGUOUS_LAYOUT, - GPU_UTILIZATION, - MAX_RESERVED_PAGES, - MIN_RESERVED_PAGES, - PAGE_PREALLOC_ENABLED, - SANITY_CHECK, - get_kvcached_logger, -) -from kvcached.vmm_ops import map_to_kv_tensors, unmap_from_kv_tensors - -logger = get_kvcached_logger() - -PREALLOC_THREAD_TIMEOUT: float = 2.0 # seconds - - -def _should_use_worker_ipc() -> bool: - try: - from kvcached.integration.vllm.interfaces import should_use_worker_ipc - return should_use_worker_ipc() - except ImportError: - return False - - -class Page: - - def __init__(self, page_id: int, page_size: int): - self.page_id = page_id - self.page_size = page_size - - self.start_block: Optional[int] = None - self.end_block: Optional[int] = None - self.num_kv_blocks: Optional[int] = None - self.free_list: List[int] = [] - - def _require_init(self) -> None: - """Raise AssertionError if the page has not been initialised. 
- """ - assert self.start_block is not None, "Page not initialised" - assert self.end_block is not None, "Page not initialised" - assert self.num_kv_blocks is not None, "Page not initialised" - - def init(self, block_mem_size: int) -> None: - self.start_block, self.end_block = self.get_block_range( - self.page_id, self.page_size, block_mem_size) - - self.num_kv_blocks = self.end_block - self.start_block - self.free_list = list(range(self.start_block, self.end_block)) - - def alloc(self, num_blocks: int = 1) -> List[int]: - self._require_init() - if self.full(): - raise ValueError(f"Page {self.page_id} is already full") - block_ids = self.free_list[:num_blocks] - self.free_list = self.free_list[num_blocks:] - return block_ids - - def free(self, block_id: int) -> None: - self._require_init() - if SANITY_CHECK: - self._sanity_check(block_id) - self.free_list.append(block_id) - - def free_batch(self, block_ids: List[int]) -> None: - self._require_init() - if SANITY_CHECK: - for block_id in block_ids: - self._sanity_check(block_id) - self.free_list.extend(block_ids) - - def empty(self) -> bool: - self._require_init() - return len(self.free_list) == self.num_kv_blocks - - def full(self) -> bool: - self._require_init() - return not self.free_list - - def num_free_blocks(self) -> int: - self._require_init() - return len(self.free_list) - - def get_free_blocks(self) -> List[int]: - self._require_init() - return self.free_list - - def _has_block(self, block_id: int) -> bool: - self._require_init() - return block_id >= cast(int, self.start_block) and block_id < cast( - int, self.end_block) - - def _sanity_check(self, block_id: int) -> None: - self._require_init() - if not self._has_block(block_id): - raise ValueError( - f"Page {self.page_id} does not have block {block_id}") - if block_id in self.free_list: - raise ValueError(f"Block {block_id} is already free") - - @staticmethod - def get_block_range(page_id: int, page_size: int, - block_mem_size: int) -> Tuple[int, int]: - """ - Get the block range of a page. - The page contains [start_block, end_block), which handles the case where - page_size is not divisible by block_mem_size. - For example, if page_size = 16 and block_mem_size = 6, the page 0 - contains [0, 2) blocks, and the page 1 contains [3, 5) blocks. - Pages: | 0-16 | 16-32 | - | 0-6 | 6-12 | 12-18 | 18-24 | 24-30 | 30-32 | - Blocks: | 0 | 1 |2| 3 | 4 |5| - """ - start_block = (page_id * page_size + block_mem_size - - 1) // block_mem_size - end_block = ((page_id + 1) * page_size) // block_mem_size - return start_block, end_block - - @staticmethod - def get_num_blocks(page_size: int, block_mem_size: int) -> int: - """ - Calculate the number of blocks that can fit in a page. - This calculation is accurate even when page_size is not divisible by - block_mem_size. - """ - return page_size // block_mem_size - - -class PageAllocator: - - def __init__(self, - num_layers: int, - mem_size_per_layer: int, - page_size: int, - world_size: int = 1, - pp_rank: int = 0, - async_sched: bool = False, - contiguous_layout: bool = CONTIGUOUS_LAYOUT, - enable_page_prealloc: bool = PAGE_PREALLOC_ENABLED, - num_kv_buffers: int = 2, - group_id: int = 0): - """ - Args: - num_layers: Number of layers (for physical memory calculation). - mem_size_per_layer: Memory size per layer per K/V tensor in bytes. - page_size: Page size in bytes. - world_size: Tensor parallel world size within a pipeline stage. - pp_rank: Pipeline parallel rank (for IPC socket namespacing). 
- async_sched: Whether asynchronous scheduling is enabled. - contiguous_layout: Whether to use contiguous layout. - enable_page_prealloc: Whether to enable page preallocation. - num_kv_buffers: Number of KV buffers per layer (2 for MHA K+V, - 1 for MLA combined KV). - group_id: KV cache group identifier for hybrid attention models. - Different groups have independent FTensors and page spaces. - """ - logger.info( - f"Init kvcached KV cache allocator: " - f"num_layers={num_layers}, " - f"mem_size_per_layer={mem_size_per_layer//(1024*1024)}MB, " - f"total_mem_size={num_kv_buffers * num_layers * mem_size_per_layer//(1024*1024)}MB, " - f"page_size={page_size//(1024*1024)}MB, " - f"world_size={world_size}, " - f"async_sched={async_sched}, " - f"contiguous_layout={contiguous_layout}, " - f"enable_prealloc={enable_page_prealloc}") - # WARNING (YIFAN): kvcached_ops.init_kvcached must have been called - # before this. - - self.num_layers = num_layers - self.mem_size_per_layer = mem_size_per_layer - self.page_size = page_size - self.world_size = world_size - self.pp_rank = pp_rank - self.async_sched = async_sched - self.contiguous_layout = contiguous_layout - self.num_kv_buffers = num_kv_buffers - self.group_id = group_id - # TODO: make this compatible with engine's memory limit after getting - # better configuration management. - self.gpu_utilization = GPU_UTILIZATION - self.num_free_pages = mem_size_per_layer // page_size - self.num_total_pages = mem_size_per_layer // page_size - - self.free_page_list: deque[int] = deque(range(self.num_free_pages)) - - self.min_reserved_pages: int = MIN_RESERVED_PAGES - self.max_reserved_pages: int = MAX_RESERVED_PAGES - self.reserved_page_list: deque[int] = deque() # Fast path allocation - - self.reclaimed_page_list: deque[int] = deque() # Reclaimed page ids - - # Initialize memory info tracker - self.mem_info_tracker = MemInfoTracker( - self.mem_size_per_layer * num_layers * num_kv_buffers, - group_id=group_id) - - # Preallocation thread management - self.enable_page_prealloc: bool = enable_page_prealloc - - self._lock: LockLike - self._cond: ConditionLike - - if self.enable_page_prealloc: - self._lock = threading.RLock() - self._cond = threading.Condition(self._lock) - else: # No preallocation lock and condition are needed. - self._lock = NoOpLock() - self._cond = NoOpCondition(self._lock) - self.prealloc_running: bool = False - self.prealloc_needed: bool = False - self.prealloc_thd: Optional[threading.Thread] = None - - def __del__(self): - try: - if self.enable_page_prealloc and self.prealloc_thd is not None: - self._stop_prealloc_thread(timeout=PREALLOC_THREAD_TIMEOUT) - except Exception: - # Silently ignore exceptions during cleanup - pass - - def start_prealloc_thread(self): - # NOTE: called by KVCacheManager after reserving the null block - if self.enable_page_prealloc: - self._lock = threading.RLock() - self._cond = threading.Condition(self._lock) - self._start_prealloc_thread() - - def alloc_page(self) -> Page: - with self._lock: - page_id: Optional[int] = None - - while page_id is None: - # Fast path: allocate pages with reserved physical memory mapping. 
- if self.reserved_page_list: - page_id = self.reserved_page_list.popleft() - self.num_free_pages -= 1 - - # Trigger preallocation to refill reserved pool if getting low - if len(self.reserved_page_list) < self.min_reserved_pages: - self.prealloc_needed = True - self._cond.notify_all() - - # Update memory usage after fast path allocation - self._update_memory_usage() - return Page(page_id, self.page_size) - - # Slow path: allocate pages with new physical memory mapping. - if self.free_page_list: - page_id = self.free_page_list.popleft() - self.num_free_pages -= 1 - break - - if self.num_free_pages <= 0: - raise ValueError("No free pages left") - - if not self.enable_page_prealloc: - raise RuntimeError( - "Inconsistent page allocator state: no free pages " - "available to allocate") - - # Wait for background preallocation or page freeing to finish. - self._cond.wait() - - assert page_id is not None - - try: - self._map_pages([page_id]) - except Exception as e: - # If mapping fails, return page to free list and restore count - with self._lock: - self.free_page_list.appendleft(page_id) - self.num_free_pages += 1 - self._cond.notify_all() - raise RuntimeError(f"Failed to map page {page_id}: {e}") from e - - if self.enable_page_prealloc: - # Trigger preallocation to refill the pool - self._trigger_preallocation() - - # Update memory usage after mapping pages - self._update_memory_usage() - return Page(page_id, self.page_size) - - def free_page(self, page_id: int) -> None: - with self._lock: - if SANITY_CHECK and (page_id in self.free_page_list - or page_id in self.reserved_page_list): - raise ValueError(f"Page {page_id} is already free or reserved") - - self.num_free_pages += 1 - if len(self.reserved_page_list) < self.max_reserved_pages: - # Fast path: reserve page with its physical memory mapping. - self.reserved_page_list.append(page_id) - # Update memory usage after fast path free/reserve - self._update_memory_usage() - self._cond.notify_all() - return - - # Slow path: free page and its physical memory mapping. - self._unmap_pages([page_id]) - with self._lock: - self.free_page_list.append(page_id) - # Update memory usage after unmapping pages - self._update_memory_usage() - self._cond.notify_all() - - def free_pages(self, page_ids: List[int]) -> None: - with self._lock: - if SANITY_CHECK: - for page_id in page_ids: - if (page_id in self.free_page_list - or page_id in self.reserved_page_list): - raise ValueError( - f"Page {page_id} is already free or reserved") - - self.num_free_pages += len(page_ids) - num_to_reserve = self.max_reserved_pages - len( - self.reserved_page_list) - if num_to_reserve > 0: - # Fast path: reserve pages with their physical memory mapping. - self.reserved_page_list.extend(page_ids[:num_to_reserve]) - self._cond.notify_all() - page_ids = page_ids[num_to_reserve:] - - if len(page_ids) == 0: - # Update memory usage after fast path free/reserve - self._update_memory_usage() - return - - # Slow path: free page_ids and their physical memory mapping. 
- self._unmap_pages(page_ids) - with self._lock: - self.free_page_list.extend(page_ids) - # Update memory usage after unmapping pages - self._update_memory_usage() - self._cond.notify_all() - - def resize(self, new_mem_size: int) -> bool: - new_num_pages = new_mem_size // self.page_size - with self._lock: - if new_num_pages < self.get_num_inuse_pages(): - return False - if new_num_pages == self.num_total_pages: - return True - elif new_num_pages > self.num_total_pages: - num_to_expand = new_num_pages - self.num_total_pages - - # Reuse previously reclaimed pages first. - num_to_reuse = min(len(self.reclaimed_page_list), - num_to_expand) - if num_to_reuse > 0: - for _ in range(num_to_reuse): - self.free_page_list.append( - self.reclaimed_page_list.popleft()) - num_to_expand -= num_to_reuse - self.num_free_pages += num_to_reuse - - # Allocate new pages if needed. - if num_to_expand > 0: - new_page_ids = list( - range(self.num_total_pages, - self.num_total_pages + num_to_expand)) - self.free_page_list.extend(new_page_ids) - self.num_free_pages += num_to_expand - self.num_total_pages = new_num_pages - self._update_memory_usage() - else: # new_num_pages < self.num_total_pages and new_num_pages >= num_inuse_pages - num_to_reclaim = self.num_total_pages - new_num_pages - - if len(self.free_page_list) < num_to_reclaim: - # Need to trim reserved pages first - reserved_count = len(self.reserved_page_list) - if reserved_count > 0: - # Move reserved pages back to free list - pages_to_unmap = list(self.reserved_page_list) - self.reserved_page_list.clear() - # Unmap outside the lock to avoid holding it during I/O - try: - self._lock.release() - self._unmap_pages(pages_to_unmap) - finally: - self._lock.acquire() - self.free_page_list.extend(pages_to_unmap) - # Update memory usage after unmapping pages - self._update_memory_usage() - - if len(self.free_page_list) < num_to_reclaim: - # Still not enough free pages - return False - - for _ in range(num_to_reclaim): - self.reclaimed_page_list.append(self.free_page_list.pop()) - self.num_free_pages -= num_to_reclaim - self.num_total_pages = new_num_pages - return True - - def trim(self) -> None: - """ - Trim the reserved pages to free up physical memory. - """ - with self._lock: - pages_to_unmap = list(self.reserved_page_list) # copy - self.reserved_page_list.clear() - - if not pages_to_unmap: - # Update memory usage after trimming - self._update_memory_usage() - return - - try: - self._lock.release() - self._unmap_pages(pages_to_unmap) - finally: - self._lock.acquire() - - self.free_page_list.extend(pages_to_unmap) - # Update memory usage after unmapping pages - self._update_memory_usage() - - def reset_free_page_order(self) -> None: - """Reset the free page list to sorted order. - - After free_pages + trim cycles, freed pages are appended to the - end of the deque so the ordering becomes scrambled. This method - re-sorts the free list so that low-numbered pages (starting with - page 0) are allocated first — important because SGLang expects - the very first block (on page 0) to be reserved as the null block. 
- """ - with self._lock: - sorted_pages = sorted(self.free_page_list) - self.free_page_list = deque(sorted_pages) - - def get_num_free_pages(self) -> int: - return self.num_free_pages - - def get_num_inuse_pages(self) -> int: - return self.num_total_pages - self.num_free_pages - - def get_num_total_pages(self) -> int: - return self.num_total_pages - - def get_num_reserved_pages(self) -> int: - with self._lock: - return len(self.reserved_page_list) - - def get_avail_physical_pages(self) -> int: - avail_phy_mem_size, total_phy_mem_size = torch.cuda.mem_get_info() - headroom = int(total_phy_mem_size * (1 - self.gpu_utilization)) - avail_phy_mem_size = max(avail_phy_mem_size - headroom, 0) - - # Calculate available pages considering layers and KV buffers - avail_phy_pages = avail_phy_mem_size // self.page_size - # Each layer needs num_kv_buffers pages (2 for MHA K+V, 1 for MLA). - avail_pages_per_layer = avail_phy_pages // self.num_layers // self.num_kv_buffers - return int(avail_pages_per_layer) - - def get_page_id(self, block_id: int, block_mem_size: int) -> int: - return block_id * block_mem_size // self.page_size - - # Private methods - def _prealloc_worker(self): - """Worker thread that preallocates and maps physical pages.""" - while self.prealloc_running: - with self._lock: - # Wait until preallocation is needed or thread is stopped - while not self.prealloc_needed and self.prealloc_running: - self._cond.wait() - - if not self.prealloc_running: - break - - self.prealloc_needed = False - current_reserved = len(self.reserved_page_list) - to_reserve = max(0, self.min_reserved_pages - current_reserved) - # Only try to reserve up to the available free pages - to_reserve = min(to_reserve, len(self.free_page_list), - self.get_avail_physical_pages()) - if to_reserve <= 0: - continue - - pages_to_reserve = [] - - # Get pages from free list - for _ in range(to_reserve): - if self.free_page_list: - pages_to_reserve.append(self.free_page_list.popleft()) - else: - break - - if pages_to_reserve: - try: - self._map_pages(pages_to_reserve) - with self._lock: - self.reserved_page_list.extend(pages_to_reserve) - # Update memory usage after mapping pages - self._update_memory_usage() - self._cond.notify_all() - logger.debug( - f"Preallocated {len(pages_to_reserve)} pages, " - f"reserved={len(self.reserved_page_list)}") - except Exception as e: - # If mapping fails, return pages to free list - with self._lock: - self.free_page_list.extendleft(pages_to_reserve) - self._cond.notify_all() - logger.error( - f"Failed to preallocate {len(pages_to_reserve)} pages: " - f"{e}") - - def _start_prealloc_thread(self): - if self.prealloc_thd is None: - self.prealloc_running = True - self.prealloc_thd = threading.Thread(target=self._prealloc_worker, - daemon=True) - self.prealloc_thd.start() - - # Initial preallocation trigger - self._trigger_preallocation() - - def _stop_prealloc_thread(self, timeout: Optional[float] = None): - if self.prealloc_thd is not None: - with self._lock: - self.prealloc_running = False - self._cond.notify_all() - self.prealloc_thd.join(timeout) - if self.prealloc_thd.is_alive(): - logger.warning( - "Preallocation thread did not stop within timeout") - self.prealloc_thd = None - logger.debug("Stopped page preallocation thread") - - def _trigger_preallocation(self): - """Trigger the preallocation thread to fill up reserved blocks""" - with self._lock: - self.prealloc_needed = True - self._cond.notify_all() - - def _map_pages(self, page_ids: list[int]) -> None: - if self.contiguous_layout: - 
offsets = [ - pid * self.page_size * self.num_layers * self.num_kv_buffers - for pid in page_ids - ] - else: - offsets = [pid * self.page_size for pid in page_ids] - if self.world_size > 1 or _should_use_worker_ipc(): - broadcast_map_to_kv_tensors(self.world_size, offsets, self.pp_rank, - group_id=self.group_id) - else: - map_to_kv_tensors(offsets, group_id=self.group_id) - - def _unmap_pages(self, page_ids: list[int]) -> None: - if self.contiguous_layout: - offsets = [ - pid * self.page_size * self.num_layers * self.num_kv_buffers - for pid in page_ids - ] - else: - offsets = [pid * self.page_size for pid in page_ids] - if self.world_size > 1 or _should_use_worker_ipc(): - broadcast_unmap_from_kv_tensors(self.world_size, offsets, - self.pp_rank, - group_id=self.group_id) - else: - if self.async_sched: - torch.cuda.synchronize() - unmap_from_kv_tensors(offsets, group_id=self.group_id) - - def _update_memory_usage(self): - """Update memory usage information in shared memory.""" - # Memory actively used by allocations (excludes preallocated pages). - used_phy_mem_size = (self.get_num_inuse_pages() * self.num_layers * - self.page_size * self.num_kv_buffers) - # Memory held by preallocated pages that are not yet actively used. - prealloc_phy_mem_size = (self.get_num_reserved_pages() * - self.num_layers * self.page_size * - self.num_kv_buffers) - - self.mem_info_tracker.update_memory_usage( - used_size=used_phy_mem_size, prealloc_size=prealloc_phy_mem_size) diff --git a/setup.py b/setup.py index 23a75981..3ea4ce83 100644 --- a/setup.py +++ b/setup.py @@ -31,8 +31,10 @@ def get_csrc_files(path) -> List[str]: src_dir = Path(path) # setuptools requires relative paths + # Filter out macOS AppleDouble metadata files (._* prefix) cpp_files = [ str(f.relative_to(SCRIPT_PATH)) for f in src_dir.rglob("*.cpp") + if not f.name.startswith("._") ] return cpp_files From 98d9bb34047cd704c0901daac1e1f63ce9428f3d Mon Sep 17 00:00:00 2001 From: cui36 Date: Sun, 3 May 2026 10:51:42 -0500 Subject: [PATCH 02/15] fix include error and modify test file --- csrc/torch_bindings.cpp | 2 ++ tests/test_kvcache_manager.py | 45 +++++++++++++++++++---------------- 2 files changed, 26 insertions(+), 21 deletions(-) diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 3177f3ae..abf46bd3 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -2,7 +2,9 @@ // SPDX-License-Identifier: Apache-2.0 #include +#include #include +#include #include #include #include diff --git a/tests/test_kvcache_manager.py b/tests/test_kvcache_manager.py index 575e4930..25e13d52 100644 --- a/tests/test_kvcache_manager.py +++ b/tests/test_kvcache_manager.py @@ -72,15 +72,13 @@ def setup_kvcache(): ) # wait a bit for pre-allocation to finish - if manager.page_allocator.enable_page_prealloc: - start_time = time.time() - timeout = 5.0 # seconds - min_pages = manager.page_allocator.min_reserved_pages - while len(manager.page_allocator.reserved_page_list) < min_pages: - if time.time() - start_time > timeout: - # This is not a hard failure, but test_trim might become flaky. - break - time.sleep(0.1) + start_time = time.time() + timeout = 5.0 # seconds + while manager.page_allocator.get_num_reserved_pages() == 0: + if time.time() - start_time > timeout: + # This is not a hard failure, but test_trim might become flaky. 
+ break + time.sleep(0.1) yield manager @@ -116,6 +114,12 @@ def test_over_allocation_fails(setup_kvcache): assert handle is None +@pytest.mark.skip( + reason="kvctl-driven resize flow is broken in this PR: " + "(a) check_and_get_resize_target is not bound on C++ PageAllocator, " + "(b) C++ MemInfoTracker uses a different shm name than Python's " + "DEFAULT_IPC_NAME, so update_kv_cache_limit writes to a segment the " + "engine never reads. Re-enable once those are restored.") def test_resize_smaller_and_larger(setup_kvcache): # instantiate a kv cache manager with known size # Terminology: @@ -124,7 +128,7 @@ def test_resize_smaller_and_larger(setup_kvcache): # - mem_size: # used by resize method, corresponds K (or V) tensor size in 1 layer, typically in few GBs manager = setup_kvcache - initial_total_pages = manager.page_allocator.num_total_pages + initial_total_pages = manager.page_allocator.get_num_total_pages() initial_attribute_mem_size = manager.mem_size meminfo = get_kv_cache_limit(IPC_NAME) assert meminfo is not None @@ -137,11 +141,11 @@ def test_resize_smaller_and_larger(setup_kvcache): # update the shm total_size field update_kv_cache_limit(IPC_NAME, shrink_kv_cache_limit) # infer the new mem_size based on shm total_size --- workflow in kvcached - shrink_shm_mem_size = manager.page_allocator.mem_info_tracker.check_and_get_resize_target( - manager.mem_size, manager.num_layers) + shrink_shm_mem_size = manager.page_allocator.check_and_get_resize_target( + manager.mem_size) # actual resize method manager.resize(shrink_shm_mem_size) - shrink_total_pages = manager.page_allocator.num_total_pages + shrink_total_pages = manager.page_allocator.get_num_total_pages() assert initial_total_pages == shrink_total_pages + initial_total_pages // 2 # RESIZE LARGER: add back the deducted half of initial total pages @@ -149,11 +153,11 @@ def test_resize_smaller_and_larger(setup_kvcache): # update the shm total_size field update_kv_cache_limit(IPC_NAME, expand_kv_cache_limit) # infer the new mem_size based on shm total_size --- workflow in kvcached - expand_shm_mem_size = manager.page_allocator.mem_info_tracker.check_and_get_resize_target( - shrink_shm_mem_size, manager.num_layers) + expand_shm_mem_size = manager.page_allocator.check_and_get_resize_target( + shrink_shm_mem_size) # actual resize method manager.resize(expand_shm_mem_size) - expand_total_pages = manager.page_allocator.num_total_pages + expand_total_pages = manager.page_allocator.get_num_total_pages() assert expand_total_pages == initial_total_pages @@ -161,14 +165,13 @@ def test_trim(setup_kvcache): # instantiate a kv cache manager with known size manager = setup_kvcache - # initial reserved pages - initial_reserved = len(manager.page_allocator.reserved_page_list) - if manager.page_allocator.enable_page_prealloc: - assert initial_reserved == manager.page_allocator.min_reserved_pages + # initial reserved pages (assumes prealloc is enabled, which is the default) + initial_reserved = manager.page_allocator.get_num_reserved_pages() + assert initial_reserved > 0 # trim reserved pages manager.trim() - after_trim_reserved = len(manager.page_allocator.reserved_page_list) + after_trim_reserved = manager.page_allocator.get_num_reserved_pages() assert after_trim_reserved == 0 From c96feb673ff5daec8c3f1762641df12629d2e775 Mon Sep 17 00:00:00 2001 From: cui36 Date: Sun, 3 May 2026 10:17:14 -0500 Subject: [PATCH 03/15] fix: restore elastic resize + match shm name + add pybind11 includes - Bind PageAllocator::check_and_get_resize_target so 
kv_cache_manager can poll - Pin KVCACHED_IPC_NAME from Python so C++ MemInfoTracker uses the same shm - Include pybind11/functional.h and pybind11/stl.h in torch_bindings - Update test_kvcache_manager.py to use C++ public methods --- csrc/inc/page_allocator.hpp | 5 +++++ csrc/page_allocator.cpp | 9 +++++++++ csrc/torch_bindings.cpp | 7 +++++++ kvcached/kv_cache_manager.py | 18 ++++++++++++------ tests/test_kvcache_manager.py | 6 ------ 5 files changed, 33 insertions(+), 12 deletions(-) diff --git a/csrc/inc/page_allocator.hpp b/csrc/inc/page_allocator.hpp index 6538bde5..6ffd03d5 100644 --- a/csrc/inc/page_allocator.hpp +++ b/csrc/inc/page_allocator.hpp @@ -81,6 +81,11 @@ class PageAllocator { int64_t get_num_reserved_pages() const; int64_t get_avail_physical_pages() const; + // Poll the shared-memory MemInfoStruct to see if an external controller + // (e.g. `kvctl limit`) has written a new total_size. Returns the new + // per-layer mem_size if it differs from current_mem_size, otherwise -1. + int64_t check_and_get_resize_target(int64_t current_mem_size) const; + // Utility functions page_id_t get_page_id(int64_t block_id, int64_t block_mem_size) const; diff --git a/csrc/page_allocator.cpp b/csrc/page_allocator.cpp index fc2f35ee..e81c373f 100644 --- a/csrc/page_allocator.cpp +++ b/csrc/page_allocator.cpp @@ -438,6 +438,15 @@ page_id_t PageAllocator::get_page_id(int64_t block_id, return block_id * block_mem_size / page_size_; } +int64_t +PageAllocator::check_and_get_resize_target(int64_t current_mem_size) const { + if (!mem_info_tracker_) { + return -1; + } + return mem_info_tracker_->check_and_get_resize_target( + current_mem_size, num_layers_, num_kv_buffers_); +} + std::unordered_map> PageAllocator::group_indices_by_page(const std::vector &indices, int64_t block_mem_size) const { diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index abf46bd3..d4b4e987 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -136,6 +136,11 @@ int64_t page_allocator_get_avail_physical_pages( return allocator->get_avail_physical_pages(); } +int64_t page_allocator_check_and_get_resize_target( + std::shared_ptr allocator, int64_t current_mem_size) { + return allocator->check_and_get_resize_target(current_mem_size); +} + void page_allocator_set_broadcast_map_callback( std::shared_ptr allocator, BroadcastMapCallback callback) { allocator->set_broadcast_map_callback(callback); @@ -212,6 +217,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { &kvcached::page_allocator_get_num_reserved_pages) .def("get_avail_physical_pages", &kvcached::page_allocator_get_avail_physical_pages) + .def("check_and_get_resize_target", + &kvcached::page_allocator_check_and_get_resize_target) .def("get_page_id", &kvcached::page_allocator_get_page_id) .def("group_indices_by_page", &kvcached::page_allocator_group_indices_by_page) diff --git a/kvcached/kv_cache_manager.py b/kvcached/kv_cache_manager.py index b45f1b60..b2ddc2b0 100644 --- a/kvcached/kv_cache_manager.py +++ b/kvcached/kv_cache_manager.py @@ -18,9 +18,16 @@ from kvcached.locks import NoOpLock from kvcached.tp_ipc_util import broadcast_kv_tensors_created -from kvcached.utils import PAGE_SIZE, SANITY_CHECK, get_kvcached_logger +from kvcached.utils import (DEFAULT_IPC_NAME, PAGE_SIZE, SANITY_CHECK, + get_kvcached_logger) from kvcached.vmm_ops import kv_tensors_created +# The C++ MemInfoTracker derives its shm name from KVCACHED_IPC_NAME or +# falls back to "kvcached_engine_", which does NOT match Python's +# "kvcached__". 
Pin the env var so both sides agree. +import os as _os +_os.environ.setdefault("KVCACHED_IPC_NAME", DEFAULT_IPC_NAME) + try: import kvcached.vmm_ops as kvcached_cpp PageAllocator = kvcached_cpp.PageAllocator @@ -227,11 +234,10 @@ def _alloc(self, # finished and then perform the usual capacity check. self._wait_post_init() - # todo: check if we need to resize - #new_mem_size = self.page_allocator.mem_info_tracker.check_and_get_resize_target( - # self.mem_size, self.num_layers, self.num_kv_buffers) - #if new_mem_size is not None: - # self.resize(new_mem_size) + new_mem_size = self.page_allocator.check_and_get_resize_target( + self.mem_size) + if new_mem_size > 0: + self.resize(new_mem_size) if self.available_size() < need_size: logger.warning(f"available_size()={self.available_size()} < " diff --git a/tests/test_kvcache_manager.py b/tests/test_kvcache_manager.py index 25e13d52..f7162dbe 100644 --- a/tests/test_kvcache_manager.py +++ b/tests/test_kvcache_manager.py @@ -114,12 +114,6 @@ def test_over_allocation_fails(setup_kvcache): assert handle is None -@pytest.mark.skip( - reason="kvctl-driven resize flow is broken in this PR: " - "(a) check_and_get_resize_target is not bound on C++ PageAllocator, " - "(b) C++ MemInfoTracker uses a different shm name than Python's " - "DEFAULT_IPC_NAME, so update_kv_cache_limit writes to a segment the " - "engine never reads. Re-enable once those are restored.") def test_resize_smaller_and_larger(setup_kvcache): # instantiate a kv cache manager with known size # Terminology: From 513ee49380801d55161e751cd59fe17193d82bf8 Mon Sep 17 00:00:00 2001 From: cui36 Date: Sun, 3 May 2026 11:06:49 -0500 Subject: [PATCH 04/15] bench: add KVCacheManager.alloc/free microbenchmark Times alloc(k) + free(handles) cycles at varying k. Used to compare main vs C++ migration vs C++ + restored elastic resize. --- benchmarks/bench_alloc/bench_alloc.py | 63 +++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 benchmarks/bench_alloc/bench_alloc.py diff --git a/benchmarks/bench_alloc/bench_alloc.py b/benchmarks/bench_alloc/bench_alloc.py new file mode 100644 index 00000000..318baff8 --- /dev/null +++ b/benchmarks/bench_alloc/bench_alloc.py @@ -0,0 +1,63 @@ +# SPDX-FileCopyrightText: Copyright contributors to the kvcached project +# SPDX-License-Identifier: Apache-2.0 +"""Microbench for KVCacheManager.alloc() hot path. 
+ +Run on each branch and compare: + python bench_alloc.py +""" +import time + +import torch + +from kvcached.integration.vllm.interfaces import alloc_kv_cache, init_kvcached, shutdown_kvcached +from kvcached.kv_cache_manager import KVCacheManager +from kvcached.vmm_ops import kv_tensors_created + +TP_RANK, TP_SIZE = 0, 1 +NUM_LAYERS = 16 +BLOCK_SIZE = 16 +NUM_BLOCKS = 65536 +DTYPE = torch.float16 +DEVICE = f"cuda:{TP_RANK}" +KV_SHAPE = (2, NUM_BLOCKS, BLOCK_SIZE, 8, 64) + + +def setup(): + torch.cuda.set_device(TP_RANK) + init_kvcached(tp_rank=TP_RANK, world_size=TP_SIZE, is_worker=True, + async_sched=False) + alloc_kv_cache(kvcache_shape=KV_SHAPE, block_size=BLOCK_SIZE, dtype=DTYPE, + device=DEVICE, num_layers=NUM_LAYERS) + t0 = time.time() + while not kv_tensors_created(): + if time.time() - t0 > 10.0: + raise RuntimeError("KV tensors not created within 10s") + time.sleep(0.05) + return KVCacheManager(num_blocks=NUM_BLOCKS, block_size=BLOCK_SIZE, + cell_size=1024, num_layers=NUM_LAYERS, + world_size=TP_SIZE) + + +def bench_alloc_free(manager, k, iters): + # warm up + for _ in range(100): + h = manager.alloc(k) + manager.free(h) + + t0 = time.perf_counter() + for _ in range(iters): + h = manager.alloc(k) + manager.free(h) + elapsed = time.perf_counter() - t0 + per_op_us = elapsed / iters * 1e6 + return per_op_us + + +if __name__ == "__main__": + manager = setup() + print(f"{'k':>6} {'iters':>8} {'us/alloc+free':>16}") + for k, iters in [(1, 50000), (4, 50000), (16, 50000), (64, 20000), + (256, 10000)]: + per_op = bench_alloc_free(manager, k, iters) + print(f"{k:>6} {iters:>8} {per_op:>16.2f}") + shutdown_kvcached() From d05ae566e4426f72fb50db1dc98db15c0320aff8 Mon Sep 17 00:00:00 2001 From: cui36 Date: Sun, 3 May 2026 11:31:01 -0500 Subject: [PATCH 05/15] fix isort --- kvcached/kv_cache_manager.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kvcached/kv_cache_manager.py b/kvcached/kv_cache_manager.py index b2ddc2b0..1e93df79 100644 --- a/kvcached/kv_cache_manager.py +++ b/kvcached/kv_cache_manager.py @@ -12,21 +12,20 @@ from __future__ import annotations import functools +import os import threading import time from typing import Any, Dict, List, Optional from kvcached.locks import NoOpLock from kvcached.tp_ipc_util import broadcast_kv_tensors_created -from kvcached.utils import (DEFAULT_IPC_NAME, PAGE_SIZE, SANITY_CHECK, - get_kvcached_logger) +from kvcached.utils import DEFAULT_IPC_NAME, PAGE_SIZE, SANITY_CHECK, get_kvcached_logger from kvcached.vmm_ops import kv_tensors_created # The C++ MemInfoTracker derives its shm name from KVCACHED_IPC_NAME or # falls back to "kvcached_engine_", which does NOT match Python's # "kvcached__". Pin the env var so both sides agree. 
-import os as _os -_os.environ.setdefault("KVCACHED_IPC_NAME", DEFAULT_IPC_NAME) +os.environ.setdefault("KVCACHED_IPC_NAME", DEFAULT_IPC_NAME) try: import kvcached.vmm_ops as kvcached_cpp From 99c676ff097db3a32c9c77ee9e9c80523e49b958 Mon Sep 17 00:00:00 2001 From: cui36 Date: Mon, 4 May 2026 15:53:02 -0500 Subject: [PATCH 06/15] update overhead results --- benchmarks/bench_alloc/bench_result.md | 143 +++++++++++++++++++++++++ 1 file changed, 143 insertions(+) create mode 100644 benchmarks/bench_alloc/bench_result.md diff --git a/benchmarks/bench_alloc/bench_result.md b/benchmarks/bench_alloc/bench_result.md new file mode 100644 index 00000000..b7cc848b --- /dev/null +++ b/benchmarks/bench_alloc/bench_result.md @@ -0,0 +1,143 @@ +All measurements done on the same machine (NVIDIA GB10, aarch64) with clean rebuild + reinstall before each run. `main` = `ovg-project/kvcached` main, `PR HEAD` = `lianghao_c++` (incl. must-fix `98d9bb3`), `fix branch` = `fix/pr319-restore-resize`. + +--- + +## A. `available_size()` overhead + +main calls `cudaMemGetInfo` via `get_avail_physical_pages()`; PR removed that. + +| | μs/call | +|--:|--:| +| main | 6.52 | +| PR HEAD | 0.52 | +| fix branch | 0.52 | + +**12.5× speedup**, no fix-branch cost. Trade-off: PR's `available_size()` no longer caps by physical memory. + +--- + +## B. `group_indices_by_page` (called by `KVCacheManager.free`) + +main: Python `for + defaultdict` + Python `get_page_id` per iter. PR: single C++ call. + +| N indices | main μs | PR HEAD μs | fix branch μs | speedup | +|--:|--:|--:|--:|--:| +| 64 | 3.36 | 1.28 | 1.21 | 2.6× | +| 256 | 12.88 | 3.34 | 3.24 | 3.9× | +| 1024 | 52.57 | 16.84 | 16.86 | 3.1× | +| 4096 | 209.12 | 71.06 | 72.50 | 2.9× | +| 16384 | 834.13 | 291.87 | 296.72 | 2.9× | + +~3× across the board. Matters for large batch frees (e.g. burst sequence completion). + +--- + +## C. Slow-path alloc (`cuMemMap` cost) + +`KVCACHED_MIN_RESERVED_PAGES=0`, `KVCACHED_MAX_RESERVED_PAGES=0` → every alloc maps a fresh page, every free unmaps. `k=128` = 1 full 2 MB page. + +| k | main μs | PR HEAD μs | fix branch μs | +|--:|--:|--:|--:| +| 128 | 4196 | 4023 | 4354 | +| 256 | 8384 | 8113 | 8657 | +| 1024 | 33028 | 32488 | 34662 | +| 4096 | 134479 | 134430 | 137295 | + +All within ~5% of each other. **Slow path is dominated by `cuMemMap` syscalls**; the C++ migration doesn't help here. PR's gains are concentrated in the fast path (reserved-pool hit). + +--- + +## D. Multi-threaded throughput (`async_sched=True`) + +N Python threads each running `alloc(k)+free(h)` in a tight loop, aggregate ops/s. + +**main (Python alloc + RLock + GIL):** +| threads | k=1 ops/s | k=16 ops/s | +|--:|--:|--:| +| 1 | 15.7K | 15.1K | +| 2 | 15.3K | 14.4K | +| 4 | 13.3K | 12.0K | +| 8 | 10.4K | 9.1K | + +→ More threads = LESS throughput. + +**PR HEAD (C++ alloc):** +| threads | k=1 ops/s | k=16 ops/s | +|--:|--:|--:| +| 1 | 45.9K | 41.2K | +| 2 | 41.1K | 37.8K | +| 4 | 38.5K | 48.6K | +| 8 | 43.6K | 51.5K | + +→ ~3× single-thread, holds up to 8 threads. + +**fix branch:** +| threads | k=1 ops/s | k=16 ops/s | +|--:|--:|--:| +| 1 | 31.5K | 32.5K | +| 2 | 30.6K | 30.7K | +| 4 | 29.5K | 31.6K | +| 8 | 27.8K | 29.1K | + +→ ~2× single-thread, holds steady. ~30% slower than PR HEAD due to per-alloc shm poll. + +**Caveat:** `page_allocator_alloc_page` etc. don't `py::gil_scoped_release`, so GIL is still held during C++ work. Multi-thread benefit comes from shorter critical sections, not actual parallelism. 
Real vLLM uses `async_sched=False` (NoOpLock), so the per-thread RLock is gone but contention via GIL + manager state remains. + +--- + +## F. `KVCacheBlock` object pool (`patches.py`) + +Synthetic test: object creation per-call vs object reuse from a pre-allocated pool. Independent of branch (PR introduces the pool; main has no pool). + +| N | no-pool μs | pool μs | speedup | +|--:|--:|--:|--:| +| 8 | 1.06 | 0.19 | 5.6× | +| 64 | 8.07 | 1.13 | 7.1× | +| 256 | 31.28 | 4.38 | 7.1× | +| 1024 | 147.4 | 17.44 | 8.5× | +| 4096 | 650.8 | 67.65 | 9.6× | + +~6-10×. Saves 7-27 μs per `get_new_blocks` call at typical batch sizes. + +--- + +## E. End-to-end vLLM serving + +`vllm bench serve --backend vllm --dataset-name random --random-input-len 512 --random-output-len 128 --num-prompts 500 --request-rate inf` against a locally running vLLM server with `ENABLE_KVCACHED=true`. Model: `Qwen/Qwen3-0.6B`, `gpu-memory-utilization=0.5`, `max-model-len=2048`. 100-prompt warmup discarded; 3 seeds (42, 99, 7) per branch, median reported. + +| | Throughput (req/s) | Mean TTFT (ms) | P99 TTFT (ms) | Mean TPOT (ms) | P99 TPOT (ms) | +|---|--:|--:|--:|--:|--:| +| **vanilla vLLM** (no kvcached) | **14.21** | **11575** | **26377** | **119.3** | **143.2** | +| main (kvcached on) | 9.42 (-34%) | 17284 (+49%) | 40464 (+53%) | 185.1 (+55%) | 239.6 (+67%) | +| PR HEAD | 9.86 (-31%) | 16472 (+42%) | 38756 (+47%) | 177.5 (+49%) | 230.2 (+61%) | +| fix branch | 9.83 (-31%) | 16602 (+43%) | 38746 (+47%) | 178.2 (+49%) | 231.7 (+62%) | + +(Δ shown vs. vanilla vLLM.) + +**Two distinct gaps:** + +1. **kvcached vs. vanilla vLLM: ~30-50% overhead everywhere**, regardless of branch. This is the pre-existing kvcached cost (CUDA VMM map/unmap, page-level indirection, alloc bookkeeping) and is what issue #299 reported (their 81 ms vLLM vs 268 ms kvcached TTFT, 3.3× — same direction, larger magnitude). +2. **PR's C++ migration vs. main: ~5% closer to vanilla.** Real but modest at e2e on this setup; far smaller than #299's reported gap (would need their H20 + Llama2-7B + sharegpt setup to reproduce that). + +**fix branch retains all of PR's 5% gain** (kvctl resize poll cost is invisible at e2e, <1%, within run-to-run noise). + +The microbench wins (A: 12.5×, B: 3×, D: 2-5× under contention, F: 7×) wash down to ~5% e2e because per-token model forward (hundreds of μs of attention + FFN) dominates the request budget; allocator overhead is a few μs at fast path, ~5% of the total. + +--- + +## Summary + +| benchmark | what changed in PR | speedup vs main | retained on fix branch | +|---|---|---|---| +| A. `available_size()` | drop `cudaMemGetInfo` from hot path | **12.5×** | yes (full) | +| B. `group_indices_by_page` | Python loop → C++ unordered_map | **~3×** | yes (full) | +| C. slow-path alloc (`cuMemMap`) | unchanged (driver-bound) | **~1.0×** | yes (full) | +| D. multi-thread alloc/free | C++ allocator + shorter critical sections | **2-5×** under contention | yes, ~70% of PR's gain | +| E. e2e vLLM serving | sum of A–D | **~5%** | yes (full) | +| F. `KVCacheBlock` object pool | reuse instead of new() | **~7×** (independent) | yes (full) | + +Microbench wins are real and broad. At e2e level they amortise to ~5% on this setup because model forward dominates the per-request time budget. On bigger models or more allocator-heavy workloads (PR #299's setup), the gap is reportedly larger; we did not reproduce that. 
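+
+To make the amortisation concrete, a back-of-envelope sketch (illustrative only: the per-step forward time and per-step allocator cost below are assumed round numbers in the ballpark of sections A-D, not new measurements):
+
+```python
+# Rough share of a request's decode time attributable to the allocator.
+output_tokens = 128          # --random-output-len used in section E
+forward_us_per_step = 500    # ASSUMED order-of-magnitude forward pass for a small model
+alloc_us_per_step = 5        # ASSUMED fast-path allocator bookkeeping per step
+
+total_forward_us = output_tokens * forward_us_per_step  # 64000 us
+total_alloc_us = output_tokens * alloc_us_per_step      #   640 us
+print(f"{total_alloc_us / (total_forward_us + total_alloc_us):.1%}")  # ~1.0%
+# Even a 3-10x allocator speedup moves e2e by only a few percent at this
+# share; batch frees and occasional slow-path maps push it toward the ~5%
+# observed above.
+```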
+ +A separate, larger overhead remains: **kvcached as a whole adds ~30-50% e2e overhead vs vanilla vLLM** (pre-existing, on all kvcached branches), which the PR partially closes (~5%) but does not eliminate. + +The kvctl resize poll on the fix branch costs ~7 μs/alloc in microbench (~30% in multi-thread alloc), but is invisible at e2e (<1%). From e737974694829a58021737c0515846ecd1e26555 Mon Sep 17 00:00:00 2001 From: cui36 Date: Mon, 4 May 2026 19:40:52 -0500 Subject: [PATCH 07/15] test overhead and fix the error --- benchmarks/bench_alloc/bench_result.md | 62 +++++++ .../bench_alloc/run_kvcached_configs.sh | 139 +++++++++++++++ benchmarks/bench_alloc/run_layout_retest.sh | 116 +++++++++++++ benchmarks/bench_alloc/run_sweep.sh | 162 ++++++++++++++++++ kvcached/kv_cache_manager.py | 10 +- 5 files changed, 487 insertions(+), 2 deletions(-) create mode 100755 benchmarks/bench_alloc/run_kvcached_configs.sh create mode 100755 benchmarks/bench_alloc/run_layout_retest.sh create mode 100755 benchmarks/bench_alloc/run_sweep.sh diff --git a/benchmarks/bench_alloc/bench_result.md b/benchmarks/bench_alloc/bench_result.md index b7cc848b..a71fbf89 100644 --- a/benchmarks/bench_alloc/bench_result.md +++ b/benchmarks/bench_alloc/bench_result.md @@ -141,3 +141,65 @@ Microbench wins are real and broad. At e2e level they amortise to ~5% on this se A separate, larger overhead remains: **kvcached as a whole adds ~30-50% e2e overhead vs vanilla vLLM** (pre-existing, on all kvcached branches), which the PR partially closes (~5%) but does not eliminate. The kvctl resize poll on the fix branch costs ~7 μs/alloc in microbench (~30% in multi-thread alloc), but is invisible at e2e (<1%). + +--- + +## G. Investigating the 30–50% e2e gap — the layout flag is the entire story + +Three hypotheses for the pre-existing kvcached overhead (Section E), tested on the fix branch with the same setup as Section E (Qwen3-0.6B, gpu-mem-util 0.5, max-model-len 2048, random 512in/128out, 500 prompts, 3 seeds, 100-prompt warmup discarded; all medians). + +### G.1 Burst alloc is **not** the cause + +`KVCACHED_MIN_RESERVED_PAGES=50` + `KVCACHED_MAX_RESERVED_PAGES=200` (vs default 5/10) — supposed to absorb burst alloc demand at request-rate=inf: + +| | tput | mean TTFT | mean TPOT | +|---|--:|--:|--:| +| kvcached default (5/10) | 9.87 | 16555 ms | 177.51 ms | +| kvcached RESERVED=50/200 | 9.77 | 16548 ms | 179.38 ms | + +Statistically identical. The reserved-pool size has **no e2e effect** at this workload — the slow-path `cuMemMap` cost (4 ms/page from Section C) amortises out across the run. + +### G.2 KV layout is the **entire** cause + +`KVCACHED_CONTIGUOUS_LAYOUT=false` switches from one big shared buffer with cross-layer interleaving to **per-layer separate VMM pools**, matching vanilla vLLM's allocation pattern. Per-layer block stride drops from `num_layers × per_block_bytes` (28 × 64 KB = 1.79 MB on Qwen3-0.6B) back to `per_block_bytes` (64 KB). 
+ +| rate=inf | tput (req/s) | mean TTFT | mean TPOT | P99 TPOT | +|---|--:|--:|--:|--:| +| **vanilla vLLM** | 14.25 | 11565 ms | 118.42 ms | 142.33 ms | +| kvcached default (LAYOUT=true) | 9.87 (-31%) | 16555 (+43%) | 177.51 (+50%) | 230.84 (+62%) | +| **kvcached LAYOUT=false** | **14.17 (-1%)** | **11642 (+1%)** | **119.03 (+0.5%)** | **142.72 (+0.3%)** | + +| rate=16 | tput (req/s) | mean TTFT | mean TPOT | P99 TPOT | +|---|--:|--:|--:|--:| +| **vanilla vLLM** | 13.20 | 240 ms | 73.62 ms | 117.27 ms | +| kvcached default (LAYOUT=true) | 9.88 (-25%) | 1574 (6.6×) | 141.93 (+93%) | 226.60 (+93%) | +| **kvcached LAYOUT=false + RESERVED=200** | **13.17 (-0.2%)** | **246 (+2%)** | **73.67 (+0%)** | **117.01 (-0.2%)** | + +`KVCACHED_CONTIGUOUS_LAYOUT=false` **eliminates the e2e gap entirely** — kvcached is statistically indistinguishable from vanilla vLLM on every metric, at both burst (rate=inf) and sustained (rate=16) load. **There is no residual "VMM mapping cost"; the entire 30-50% e2e overhead was the layer-interleaved layout.** + +### G.3 Mechanism + +`CONTIGUOUS_LAYOUT=true` (default) packs all layers' KV into one shared VM range with shape `[num_blocks, num_layers, 2, block_size, head_num, head_dim]`, and exposes per-layer tensors as strided views. Within one layer, block n+1 is `num_layers × per_block_bytes` bytes after block n — for Qwen3-0.6B that's 1.79 MB. + +During decode, FlashAttention reads **all blocks of one layer** for each request before moving to the next layer. With layer-interleaved layout, each block read crosses a different 2 MB VMM page → TLB pressure goes up roughly `num_layers`× vs vanilla. With per-layer pools (`LAYOUT=false`), consecutive blocks of one layer are tightly packed in one buffer, just like vanilla. + +This is consistent with the gap scaling with `num_layers` — small for shallow models (which is why Section E only saw +30-50% on a 28-layer model), and presumably much worse on the 32-layer Llama2-7B in PR #299's reported setup. + +### G.4 Required fix to make `LAYOUT=false` actually work + +`kvcached/kv_cache_manager.py:107` hardcoded `contiguous_layout=True` when constructing the C++ `PageAllocator`, regardless of the env var. With `KVCACHED_CONTIGUOUS_LAYOUT=false`, the FTensor side switched to per-layer buffers but the PageAllocator continued computing offsets as if layers were striped, leading to `cuMemUnmap (...) failed in CUDA driver (1): invalid argument` on the first eviction. Patched to read `CONTIGUOUS_LAYOUT` from `kvcached.utils`. Diff: + +```python +-from kvcached.utils import DEFAULT_IPC_NAME, PAGE_SIZE, SANITY_CHECK, get_kvcached_logger ++from kvcached.utils import (CONTIGUOUS_LAYOUT, DEFAULT_IPC_NAME, PAGE_SIZE, ++ SANITY_CHECK, get_kvcached_logger) +@@ +- contiguous_layout=True, ++ contiguous_layout=CONTIGUOUS_LAYOUT, +``` + +### G.5 Implication + +The kvcached "pre-existing overhead" framed in Section E is not pre-existing at all — it's a default setting. **Changing one env var (`KVCACHED_CONTIGUOUS_LAYOUT=false`) closes 100% of the gap**, and the previously-reported "~5% PR speedup" becomes the marginal piece of a feature whose primary cost was elsewhere. + +Open question: is there any reason `CONTIGUOUS_LAYOUT=true` is the default? The only place it's required is for hybrid-linear (mamba) models, which already check and bail out (`interfaces.py:138`). For all standard MHA/GQA/MLA workloads, `LAYOUT=false` should be the default. 
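+
+### G.6 Illustrative footprint check for the G.3 mechanism
+
+A small sanity check of the page-footprint argument in G.3 (hypothetical Python, not part of the benchmark harness; `blocks_scanned` is an assumed per-step working set, the other values are the Qwen3-0.6B numbers quoted above):
+
+```python
+# Distinct 2 MB VMM pages crossed when one layer's blocks are scanned,
+# approximated as (address span covered) / (page size).
+per_block_bytes = 64 * 1024        # one layer's slice of one KV block
+num_layers = 28
+page_bytes = 2 * 1024 * 1024
+blocks_scanned = 1024              # ASSUMED working set for one decode step
+
+stride_contig = num_layers * per_block_bytes  # 1792 KiB (the "1.79 MB" above)
+stride_split = per_block_bytes                # 64 KiB with per-layer pools
+
+pages_contig = blocks_scanned * stride_contig // page_bytes  # 896: ~1 page per block
+pages_split = blocks_scanned * stride_split // page_bytes    # 32: ~1 page per 32 blocks
+print(pages_contig, pages_split, pages_contig / pages_split)  # 896 32 28.0
+```
+
+The footprint ratio is exactly `num_layers`, which is why the gap should scale with model depth, consistent with the TLB-pressure explanation in G.3.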
diff --git a/benchmarks/bench_alloc/run_kvcached_configs.sh b/benchmarks/bench_alloc/run_kvcached_configs.sh new file mode 100755 index 00000000..6153fe59 --- /dev/null +++ b/benchmarks/bench_alloc/run_kvcached_configs.sh @@ -0,0 +1,139 @@ +#!/bin/bash +# Run remaining kvcached configs (B–H minus vanilla which is done). + +set -uo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +RESULTS_DIR="$SCRIPT_DIR/sweep_results" +LOG_DIR="$SCRIPT_DIR/sweep_logs" +mkdir -p "$RESULTS_DIR" "$LOG_DIR" + +VLLM="/home/xingqi/miniforge3/envs/kvcached/bin/vllm" +MODEL="Qwen/Qwen3-0.6B" +PORT=12347 +SEEDS=(42 99 7) + +cleanup_server() { + pkill -INT -f "vllm serve Qwen" 2>/dev/null || true + pkill -INT -f "VLLM::EngineCore" 2>/dev/null || true + for i in $(seq 1 30); do + if ! ss -lnt 2>/dev/null | awk '$4 ~ /:'"$PORT"'$/' | grep -q .; then + return 0 + fi + sleep 1 + done + pkill -KILL -f "vllm serve" 2>/dev/null || true + pkill -KILL -f "VLLM::EngineCore" 2>/dev/null || true + sleep 2 +} + +wait_for_ready() { + local log=$1 + for i in $(seq 1 240); do + if curl -fsS "http://localhost:$PORT/v1/models" >/dev/null 2>&1; then + return 0 + fi + if grep -qE "EngineCore failed|Traceback|RuntimeError|FATAL" "$log" 2>/dev/null; then + return 1 + fi + sleep 2 + done + return 1 +} + +run_config() { + local label=$1 + local request_rate=$2 + local env_str=$3 + + echo + echo "==========================================" + echo "[$(date +%H:%M:%S)] Config: $label rate=$request_rate" + echo "Env: $env_str" + echo "==========================================" + + cleanup_server + + local server_log="$LOG_DIR/${label}.server.log" + : > "$server_log" + + # shellcheck disable=SC2086 + env $env_str \ + "$VLLM" serve "$MODEL" \ + --port "$PORT" \ + --gpu-memory-utilization 0.5 \ + --max-model-len 2048 \ + > "$server_log" 2>&1 & + local server_pid=$! + + if ! wait_for_ready "$server_log"; then + echo " FAILED: server didn't come up; tail of log:" + tail -30 "$server_log" + kill -9 $server_pid 2>/dev/null || true + cleanup_server + return 1 + fi + + echo " [$(date +%H:%M:%S)] Server up. Warmup (100 prompts)..." + "$VLLM" bench serve \ + --backend vllm \ + --model "$MODEL" \ + --port "$PORT" \ + --dataset-name random \ + --random-input-len 512 --random-output-len 128 \ + --num-prompts 100 \ + --request-rate "$request_rate" \ + --seed 1 \ + > "$LOG_DIR/${label}.warmup.log" 2>&1 || true + + for seed in "${SEEDS[@]}"; do + echo " [$(date +%H:%M:%S)] seed=$seed..." + "$VLLM" bench serve \ + --backend vllm \ + --model "$MODEL" \ + --port "$PORT" \ + --dataset-name random \ + --random-input-len 512 --random-output-len 128 \ + --num-prompts 500 \ + --request-rate "$request_rate" \ + --seed "$seed" \ + --save-result \ + --result-dir "$RESULTS_DIR" \ + --result-filename "${label}.seed${seed}.json" \ + > "$LOG_DIR/${label}.seed${seed}.log" 2>&1 + # extract key metrics + grep -E "Request throughput|Mean TTFT|Mean TPOT|P99 TTFT|P99 TPOT" "$LOG_DIR/${label}.seed${seed}.log" \ + | sed 's/^/ /' + done + + echo " [$(date +%H:%M:%S)] Tearing down..." 
+ kill -INT $server_pid 2>/dev/null || true + sleep 3 + cleanup_server +} + +# Config B: kvcached default (rate=inf) +run_config "B_kvcached_default_inf" inf \ + "ENABLE_KVCACHED=true KVCACHED_AUTOPATCH=1" + +# Config C: CONTIGUOUS_LAYOUT=false (rate=inf) +run_config "C_layout_false_inf" inf \ + "ENABLE_KVCACHED=true KVCACHED_AUTOPATCH=1 KVCACHED_CONTIGUOUS_LAYOUT=false" + +# Config D: MAX_RESERVED_PAGES=200 (rate=inf) +run_config "D_reserved200_inf" inf \ + "ENABLE_KVCACHED=true KVCACHED_AUTOPATCH=1 KVCACHED_MIN_RESERVED_PAGES=50 KVCACHED_MAX_RESERVED_PAGES=200" + +# Config G: kvcached default (rate=16) +run_config "G_kvcached_default_r16" 16 \ + "ENABLE_KVCACHED=true KVCACHED_AUTOPATCH=1" + +# Config H: kvcached best (rate=16) -- combine layout+reserved if needed +run_config "H_best_r16" 16 \ + "ENABLE_KVCACHED=true KVCACHED_AUTOPATCH=1 KVCACHED_CONTIGUOUS_LAYOUT=false KVCACHED_MIN_RESERVED_PAGES=50 KVCACHED_MAX_RESERVED_PAGES=200" + +echo +echo "==========================================" +echo "[$(date +%H:%M:%S)] All configs done." +echo "Results in $RESULTS_DIR" +echo "==========================================" diff --git a/benchmarks/bench_alloc/run_layout_retest.sh b/benchmarks/bench_alloc/run_layout_retest.sh new file mode 100755 index 00000000..27e208ee --- /dev/null +++ b/benchmarks/bench_alloc/run_layout_retest.sh @@ -0,0 +1,116 @@ +#!/bin/bash +# Re-run C (CONTIGUOUS_LAYOUT=false) and H (best r16) after the +# kv_cache_manager.py:107 fix. + +set -uo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +RESULTS_DIR="$SCRIPT_DIR/sweep_results" +LOG_DIR="$SCRIPT_DIR/sweep_logs" +mkdir -p "$RESULTS_DIR" "$LOG_DIR" + +VLLM="/home/xingqi/miniforge3/envs/kvcached/bin/vllm" +MODEL="Qwen/Qwen3-0.6B" +PORT=12347 +SEEDS=(42 99 7) + +cleanup_server() { + pkill -INT -f "vllm serve Qwen" 2>/dev/null || true + pkill -INT -f "VLLM::EngineCore" 2>/dev/null || true + for i in $(seq 1 30); do + if ! ss -lnt 2>/dev/null | awk '$4 ~ /:'"$PORT"'$/' | grep -q .; then + return 0 + fi + sleep 1 + done + pkill -KILL -f "vllm serve" 2>/dev/null || true + pkill -KILL -f "VLLM::EngineCore" 2>/dev/null || true + sleep 2 + return 0 +} + +wait_for_ready() { + local log=$1 + for i in $(seq 1 240); do + if curl -fsS "http://localhost:$PORT/v1/models" >/dev/null 2>&1; then + return 0 + fi + if grep -qE "EngineCore failed|Traceback|RuntimeError|FATAL|Engine core initialization failed|invalid argument" "$log" 2>/dev/null; then + return 1 + fi + sleep 2 + done + return 1 +} + +run_config() { + local label=$1 + local request_rate=$2 + local env_str=$3 + + echo + echo "==========================================" + echo "[$(date +%H:%M:%S)] Config: $label rate=$request_rate" + echo "Env: $env_str" + echo "==========================================" + + cleanup_server + + local server_log="$LOG_DIR/${label}.server.log" + : > "$server_log" + + # shellcheck disable=SC2086 + env $env_str \ + "$VLLM" serve "$MODEL" \ + --port "$PORT" \ + --gpu-memory-utilization 0.5 \ + --max-model-len 2048 \ + > "$server_log" 2>&1 & + local server_pid=$! + + if ! wait_for_ready "$server_log"; then + echo " FAILED: server didn't come up; tail:" + tail -25 "$server_log" + kill -9 $server_pid 2>/dev/null || true + cleanup_server + return 1 + fi + + echo " [$(date +%H:%M:%S)] Server up. Warmup..." 
+ "$VLLM" bench serve \ + --backend vllm --model "$MODEL" --port "$PORT" \ + --dataset-name random --random-input-len 512 --random-output-len 128 \ + --num-prompts 100 --request-rate "$request_rate" --seed 1 \ + > "$LOG_DIR/${label}.warmup.log" 2>&1 || true + + for seed in "${SEEDS[@]}"; do + echo " [$(date +%H:%M:%S)] seed=$seed" + "$VLLM" bench serve \ + --backend vllm --model "$MODEL" --port "$PORT" \ + --dataset-name random --random-input-len 512 --random-output-len 128 \ + --num-prompts 500 --request-rate "$request_rate" --seed "$seed" \ + --save-result --result-dir "$RESULTS_DIR" \ + --result-filename "${label}.seed${seed}.json" \ + > "$LOG_DIR/${label}.seed${seed}.log" 2>&1 + grep -E "Request throughput|Mean TTFT|Mean TPOT|P99 TTFT|P99 TPOT|Successful requests" \ + "$LOG_DIR/${label}.seed${seed}.log" | sed 's/^/ /' + done + + echo " [$(date +%H:%M:%S)] Tearing down..." + kill -INT $server_pid 2>/dev/null || true + sleep 3 + cleanup_server +} + +# Re-run C with the fix +run_config "C_layout_false_inf" inf \ + "ENABLE_KVCACHED=true KVCACHED_AUTOPATCH=1 KVCACHED_CONTIGUOUS_LAYOUT=false" + +# Re-run H with the fix +run_config "H_best_r16" 16 \ + "ENABLE_KVCACHED=true KVCACHED_AUTOPATCH=1 KVCACHED_CONTIGUOUS_LAYOUT=false KVCACHED_MIN_RESERVED_PAGES=50 KVCACHED_MAX_RESERVED_PAGES=200" + +echo +echo "==========================================" +echo "[$(date +%H:%M:%S)] Re-run done." +echo "==========================================" diff --git a/benchmarks/bench_alloc/run_sweep.sh b/benchmarks/bench_alloc/run_sweep.sh new file mode 100755 index 00000000..6e767921 --- /dev/null +++ b/benchmarks/bench_alloc/run_sweep.sh @@ -0,0 +1,162 @@ +#!/bin/bash +# Sweep e2e configs for kvcached overhead investigation. +# Each config: start server with specified env, run warmup + 3 seeds, parse, kill. + +set -uo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +RESULTS_DIR="$SCRIPT_DIR/sweep_results" +LOG_DIR="$SCRIPT_DIR/sweep_logs" +mkdir -p "$RESULTS_DIR" "$LOG_DIR" + +VENV_PY="/home/xingqi/miniforge3/envs/kvcached/bin/python" +VLLM="/home/xingqi/miniforge3/envs/kvcached/bin/vllm" + +MODEL="Qwen/Qwen3-0.6B" +PORT=12347 +GPU_MEM_UTIL=0.5 +MAX_MODEL_LEN=2048 + +WARMUP_PROMPTS=100 +NUM_PROMPTS=500 +INPUT_LEN=512 +OUTPUT_LEN=128 +SEEDS=(42 99 7) + +cleanup_server() { + pkill -9 -f "vllm.entrypoints.openai.api_server" 2>/dev/null || true + pkill -9 -f "vllm serve" 2>/dev/null || true + # Wait for port to be free + for i in $(seq 1 30); do + if ! 
ss -ltn 2>/dev/null | grep -q ":$PORT "; then + return 0 + fi + sleep 1 + done + return 1 +} + +wait_for_server() { + local log=$1 + for i in $(seq 1 240); do + if curl -fsS "http://localhost:$PORT/v1/models" >/dev/null 2>&1; then + return 0 + fi + if grep -q "Error\|Traceback" "$log" 2>/dev/null; then + echo "Server crashed; see $log" + return 1 + fi + sleep 2 + done + return 1 +} + +# Args: label, request_rate, env_string ("KEY1=VAL1 KEY2=VAL2"), extra_server_args +run_config() { + local label=$1 + local request_rate=$2 + local env_str=$3 + local extra=$4 + + echo + echo "==========================================" + echo "Config: $label rate=$request_rate" + echo "Env: $env_str" + echo "==========================================" + + cleanup_server + + local server_log="$LOG_DIR/${label}.server.log" + : > "$server_log" + + # shellcheck disable=SC2086 + env $env_str \ + "$VLLM" serve "$MODEL" \ + --port "$PORT" \ + --gpu-memory-utilization "$GPU_MEM_UTIL" \ + --max-model-len "$MAX_MODEL_LEN" \ + $extra \ + > "$server_log" 2>&1 & + local server_pid=$! + + if ! wait_for_server "$server_log"; then + echo " FAILED: server didn't come up" + kill -9 $server_pid 2>/dev/null || true + cleanup_server + return 1 + fi + + echo " Server up (pid=$server_pid). Running warmup..." + "$VLLM" bench serve \ + --backend vllm \ + --model "$MODEL" \ + --port "$PORT" \ + --dataset-name random \ + --random-input-len $INPUT_LEN \ + --random-output-len $OUTPUT_LEN \ + --num-prompts $WARMUP_PROMPTS \ + --request-rate "$request_rate" \ + --seed 1 \ + > "$LOG_DIR/${label}.warmup.log" 2>&1 || true + + for seed in "${SEEDS[@]}"; do + echo " Run seed=$seed..." + local out="$RESULTS_DIR/${label}.seed${seed}.json" + "$VLLM" bench serve \ + --backend vllm \ + --model "$MODEL" \ + --port "$PORT" \ + --dataset-name random \ + --random-input-len $INPUT_LEN \ + --random-output-len $OUTPUT_LEN \ + --num-prompts $NUM_PROMPTS \ + --request-rate "$request_rate" \ + --seed "$seed" \ + --save-result \ + --result-dir "$RESULTS_DIR" \ + --result-filename "${label}.seed${seed}.json" \ + > "$LOG_DIR/${label}.seed${seed}.log" 2>&1 + done + + echo " Done. Tearing down server." 
+ kill -INT $server_pid 2>/dev/null || true + sleep 3 + cleanup_server +} + +# ---------------- The actual sweep ---------------- + +# Config A: vanilla vLLM (no kvcached) +run_config "A_vanilla_inf" inf "" "" + +# Config B: kvcached default (fix branch) +run_config "B_kvcached_default_inf" inf \ + "ENABLE_KVCACHED=true KVCACHED_AUTOPATCH=1" "" + +# Config C: kvcached, CONTIGUOUS_LAYOUT=false +run_config "C_layout_false_inf" inf \ + "ENABLE_KVCACHED=true KVCACHED_AUTOPATCH=1 KVCACHED_CONTIGUOUS_LAYOUT=false" "" + +# Config D: kvcached, MAX_RESERVED_PAGES=200 +run_config "D_reserved200_inf" inf \ + "ENABLE_KVCACHED=true KVCACHED_AUTOPATCH=1 KVCACHED_MIN_RESERVED_PAGES=50 KVCACHED_MAX_RESERVED_PAGES=200" "" + +# Config E: kvcached, both knobs +run_config "E_both_inf" inf \ + "ENABLE_KVCACHED=true KVCACHED_AUTOPATCH=1 KVCACHED_CONTIGUOUS_LAYOUT=false KVCACHED_MIN_RESERVED_PAGES=50 KVCACHED_MAX_RESERVED_PAGES=200" "" + +# Config F: vanilla, request-rate=16 +run_config "F_vanilla_r16" 16 "" "" + +# Config G: kvcached default, request-rate=16 +run_config "G_kvcached_default_r16" 16 \ + "ENABLE_KVCACHED=true KVCACHED_AUTOPATCH=1" "" + +# Config H: kvcached best, request-rate=16 +run_config "H_best_r16" 16 \ + "ENABLE_KVCACHED=true KVCACHED_AUTOPATCH=1 KVCACHED_CONTIGUOUS_LAYOUT=false KVCACHED_MIN_RESERVED_PAGES=50 KVCACHED_MAX_RESERVED_PAGES=200" "" + +echo +echo "==========================================" +echo "All configs done. Results in $RESULTS_DIR" +echo "==========================================" diff --git a/kvcached/kv_cache_manager.py b/kvcached/kv_cache_manager.py index 1e93df79..8cecc313 100644 --- a/kvcached/kv_cache_manager.py +++ b/kvcached/kv_cache_manager.py @@ -19,7 +19,13 @@ from kvcached.locks import NoOpLock from kvcached.tp_ipc_util import broadcast_kv_tensors_created -from kvcached.utils import DEFAULT_IPC_NAME, PAGE_SIZE, SANITY_CHECK, get_kvcached_logger +from kvcached.utils import ( + CONTIGUOUS_LAYOUT, + DEFAULT_IPC_NAME, + PAGE_SIZE, + SANITY_CHECK, + get_kvcached_logger, +) from kvcached.vmm_ops import kv_tensors_created # The C++ MemInfoTracker derives its shm name from KVCACHED_IPC_NAME or @@ -104,7 +110,7 @@ def __init__( self.world_size, pp_rank=self.pp_rank, async_sched=async_sched, - contiguous_layout=True, + contiguous_layout=CONTIGUOUS_LAYOUT, enable_page_prealloc=True, num_kv_buffers=self.num_kv_buffers, group_id=self.group_id, From 21aa3d61a8517cf65268caf28bc7578dd5507a1f Mon Sep 17 00:00:00 2001 From: cui36 Date: Tue, 5 May 2026 11:47:05 -0500 Subject: [PATCH 08/15] update --- benchmarks/bench_alloc/bench_result.md | 181 ++++++++++--------------- 1 file changed, 72 insertions(+), 109 deletions(-) diff --git a/benchmarks/bench_alloc/bench_result.md b/benchmarks/bench_alloc/bench_result.md index a71fbf89..4810bc66 100644 --- a/benchmarks/bench_alloc/bench_result.md +++ b/benchmarks/bench_alloc/bench_result.md @@ -1,26 +1,22 @@ -All measurements done on the same machine (NVIDIA GB10, aarch64) with clean rebuild + reinstall before each run. `main` = `ovg-project/kvcached` main, `PR HEAD` = `lianghao_c++` (incl. must-fix `98d9bb3`), `fix branch` = `fix/pr319-restore-resize`. +NVIDIA GB10 (aarch64), clean rebuild between runs. `main` = `ovg-project/kvcached` main, `PR` = `lianghao_c++` + must-fix `98d9bb3`, `fix` = `fix/pr319-restore-resize`. ---- +## A. `available_size()` -## A. `available_size()` overhead - -main calls `cudaMemGetInfo` via `get_avail_physical_pages()`; PR removed that. +PR drops `cudaMemGetInfo` from the hot path. 
| | μs/call | |--:|--:| | main | 6.52 | -| PR HEAD | 0.52 | -| fix branch | 0.52 | - -**12.5× speedup**, no fix-branch cost. Trade-off: PR's `available_size()` no longer caps by physical memory. +| PR | 0.52 | +| fix | 0.52 | ---- +12.5×. Side effect: `available_size()` no longer caps by physical memory. -## B. `group_indices_by_page` (called by `KVCacheManager.free`) +## B. `group_indices_by_page` -main: Python `for + defaultdict` + Python `get_page_id` per iter. PR: single C++ call. +Called by `KVCacheManager.free`. Python loop + `defaultdict` → single C++ call. -| N indices | main μs | PR HEAD μs | fix branch μs | speedup | +| N | main | PR | fix | speedup | |--:|--:|--:|--:|--:| | 64 | 3.36 | 1.28 | 1.21 | 2.6× | | 256 | 12.88 | 3.34 | 3.24 | 3.9× | @@ -28,68 +24,58 @@ main: Python `for + defaultdict` + Python `get_page_id` per iter. PR: single C++ | 4096 | 209.12 | 71.06 | 72.50 | 2.9× | | 16384 | 834.13 | 291.87 | 296.72 | 2.9× | -~3× across the board. Matters for large batch frees (e.g. burst sequence completion). - ---- +(μs) -## C. Slow-path alloc (`cuMemMap` cost) +## C. Slow-path alloc -`KVCACHED_MIN_RESERVED_PAGES=0`, `KVCACHED_MAX_RESERVED_PAGES=0` → every alloc maps a fresh page, every free unmaps. `k=128` = 1 full 2 MB page. +`KVCACHED_MIN/MAX_RESERVED_PAGES=0` forces a fresh `cuMemMap` per alloc. `k=128` ≈ one 2 MB page. -| k | main μs | PR HEAD μs | fix branch μs | +| k | main | PR | fix | |--:|--:|--:|--:| | 128 | 4196 | 4023 | 4354 | | 256 | 8384 | 8113 | 8657 | | 1024 | 33028 | 32488 | 34662 | | 4096 | 134479 | 134430 | 137295 | -All within ~5% of each other. **Slow path is dominated by `cuMemMap` syscalls**; the C++ migration doesn't help here. PR's gains are concentrated in the fast path (reserved-pool hit). +(μs) All within 5%. The syscall dominates; C++ doesn't help. ---- +## D. Multi-thread throughput -## D. Multi-threaded throughput (`async_sched=True`) +N Python threads, each in a tight `alloc(k) + free(h)` loop, `async_sched=True`. Aggregate ops/s. -N Python threads each running `alloc(k)+free(h)` in a tight loop, aggregate ops/s. - -**main (Python alloc + RLock + GIL):** -| threads | k=1 ops/s | k=16 ops/s | +main: +| threads | k=1 | k=16 | |--:|--:|--:| | 1 | 15.7K | 15.1K | | 2 | 15.3K | 14.4K | | 4 | 13.3K | 12.0K | | 8 | 10.4K | 9.1K | -→ More threads = LESS throughput. - -**PR HEAD (C++ alloc):** -| threads | k=1 ops/s | k=16 ops/s | +PR: +| threads | k=1 | k=16 | |--:|--:|--:| | 1 | 45.9K | 41.2K | | 2 | 41.1K | 37.8K | | 4 | 38.5K | 48.6K | | 8 | 43.6K | 51.5K | -→ ~3× single-thread, holds up to 8 threads. - -**fix branch:** -| threads | k=1 ops/s | k=16 ops/s | +fix: +| threads | k=1 | k=16 | |--:|--:|--:| | 1 | 31.5K | 32.5K | | 2 | 30.6K | 30.7K | | 4 | 29.5K | 31.6K | | 8 | 27.8K | 29.1K | -→ ~2× single-thread, holds steady. ~30% slower than PR HEAD due to per-alloc shm poll. +main degrades with threads. PR holds. fix sits ~30% behind PR (per-alloc shm poll for kvctl resize). -**Caveat:** `page_allocator_alloc_page` etc. don't `py::gil_scoped_release`, so GIL is still held during C++ work. Multi-thread benefit comes from shorter critical sections, not actual parallelism. Real vLLM uses `async_sched=False` (NoOpLock), so the per-thread RLock is gone but contention via GIL + manager state remains. +GIL is still held during C++ work (no `gil_scoped_release`); gains come from shorter critical sections, not real parallelism. Real vLLM uses `async_sched=False` (NoOpLock) anyway. ---- +## F. `KVCacheBlock` object pool -## F. 
`KVCacheBlock` object pool (`patches.py`) +PR-only — main has no pool. Pre-allocated pool vs `new()` per call. -Synthetic test: object creation per-call vs object reuse from a pre-allocated pool. Independent of branch (PR introduces the pool; main has no pool). - -| N | no-pool μs | pool μs | speedup | +| N | no-pool | pool | speedup | |--:|--:|--:|--:| | 8 | 1.06 | 0.19 | 5.6× | | 64 | 8.07 | 1.13 | 7.1× | @@ -97,97 +83,65 @@ Synthetic test: object creation per-call vs object reuse from a pre-allocated po | 1024 | 147.4 | 17.44 | 8.5× | | 4096 | 650.8 | 67.65 | 9.6× | -~6-10×. Saves 7-27 μs per `get_new_blocks` call at typical batch sizes. - ---- +(μs) ## E. End-to-end vLLM serving -`vllm bench serve --backend vllm --dataset-name random --random-input-len 512 --random-output-len 128 --num-prompts 500 --request-rate inf` against a locally running vLLM server with `ENABLE_KVCACHED=true`. Model: `Qwen/Qwen3-0.6B`, `gpu-memory-utilization=0.5`, `max-model-len=2048`. 100-prompt warmup discarded; 3 seeds (42, 99, 7) per branch, median reported. +`vllm bench serve` against a local server, `ENABLE_KVCACHED=true`. Qwen3-0.6B, gpu-mem-util 0.5, max-model-len 2048, random 512in/128out, 500 prompts, `request-rate=inf`. 100-prompt warmup discarded; 3 seeds (42, 99, 7), median. -| | Throughput (req/s) | Mean TTFT (ms) | P99 TTFT (ms) | Mean TPOT (ms) | P99 TPOT (ms) | +| | tput | TTFT mean | TTFT P99 | TPOT mean | TPOT P99 | |---|--:|--:|--:|--:|--:| -| **vanilla vLLM** (no kvcached) | **14.21** | **11575** | **26377** | **119.3** | **143.2** | -| main (kvcached on) | 9.42 (-34%) | 17284 (+49%) | 40464 (+53%) | 185.1 (+55%) | 239.6 (+67%) | -| PR HEAD | 9.86 (-31%) | 16472 (+42%) | 38756 (+47%) | 177.5 (+49%) | 230.2 (+61%) | -| fix branch | 9.83 (-31%) | 16602 (+43%) | 38746 (+47%) | 178.2 (+49%) | 231.7 (+62%) | - -(Δ shown vs. vanilla vLLM.) +| vanilla | 14.21 | 11575 | 26377 | 119.3 | 143.2 | +| main | 9.42 (-34%) | 17284 (+49%) | 40464 | 185.1 | 239.6 | +| PR | 9.86 (-31%) | 16472 | 38756 | 177.5 | 230.2 | +| fix | 9.83 (-31%) | 16602 | 38746 | 178.2 | 231.7 | -**Two distinct gaps:** +(req/s, ms.) PR closes ~5% of the kvcached overhead; fix retains it. The remaining ~30% gap to vanilla is the layout default — see G. -1. **kvcached vs. vanilla vLLM: ~30-50% overhead everywhere**, regardless of branch. This is the pre-existing kvcached cost (CUDA VMM map/unmap, page-level indirection, alloc bookkeeping) and is what issue #299 reported (their 81 ms vLLM vs 268 ms kvcached TTFT, 3.3× — same direction, larger magnitude). -2. **PR's C++ migration vs. main: ~5% closer to vanilla.** Real but modest at e2e on this setup; far smaller than #299's reported gap (would need their H20 + Llama2-7B + sharegpt setup to reproduce that). +## G. Where the 30-50% e2e gap actually comes from -**fix branch retains all of PR's 5% gain** (kvctl resize poll cost is invisible at e2e, <1%, within run-to-run noise). +Three hypotheses, tested on fix with the Section E setup. -The microbench wins (A: 12.5×, B: 3×, D: 2-5× under contention, F: 7×) wash down to ~5% e2e because per-token model forward (hundreds of μs of attention + FFN) dominates the request budget; allocator overhead is a few μs at fast path, ~5% of the total. - ---- - -## Summary +### G.1 Reserved-pool size — no e2e effect -| benchmark | what changed in PR | speedup vs main | retained on fix branch | -|---|---|---|---| -| A. `available_size()` | drop `cudaMemGetInfo` from hot path | **12.5×** | yes (full) | -| B. 
`group_indices_by_page` | Python loop → C++ unordered_map | **~3×** | yes (full) | -| C. slow-path alloc (`cuMemMap`) | unchanged (driver-bound) | **~1.0×** | yes (full) | -| D. multi-thread alloc/free | C++ allocator + shorter critical sections | **2-5×** under contention | yes, ~70% of PR's gain | -| E. e2e vLLM serving | sum of A–D | **~5%** | yes (full) | -| F. `KVCacheBlock` object pool | reuse instead of new() | **~7×** (independent) | yes (full) | - -Microbench wins are real and broad. At e2e level they amortise to ~5% on this setup because model forward dominates the per-request time budget. On bigger models or more allocator-heavy workloads (PR #299's setup), the gap is reportedly larger; we did not reproduce that. - -A separate, larger overhead remains: **kvcached as a whole adds ~30-50% e2e overhead vs vanilla vLLM** (pre-existing, on all kvcached branches), which the PR partially closes (~5%) but does not eliminate. - -The kvctl resize poll on the fix branch costs ~7 μs/alloc in microbench (~30% in multi-thread alloc), but is invisible at e2e (<1%). - ---- - -## G. Investigating the 30–50% e2e gap — the layout flag is the entire story - -Three hypotheses for the pre-existing kvcached overhead (Section E), tested on the fix branch with the same setup as Section E (Qwen3-0.6B, gpu-mem-util 0.5, max-model-len 2048, random 512in/128out, 500 prompts, 3 seeds, 100-prompt warmup discarded; all medians). - -### G.1 Burst alloc is **not** the cause - -`KVCACHED_MIN_RESERVED_PAGES=50` + `KVCACHED_MAX_RESERVED_PAGES=200` (vs default 5/10) — supposed to absorb burst alloc demand at request-rate=inf: - -| | tput | mean TTFT | mean TPOT | +| | tput | TTFT mean | TPOT mean | |---|--:|--:|--:| -| kvcached default (5/10) | 9.87 | 16555 ms | 177.51 ms | -| kvcached RESERVED=50/200 | 9.77 | 16548 ms | 179.38 ms | +| RESERVED=5/10 (default) | 9.87 | 16555 | 177.51 | +| RESERVED=50/200 | 9.77 | 16548 | 179.38 | -Statistically identical. The reserved-pool size has **no e2e effect** at this workload — the slow-path `cuMemMap` cost (4 ms/page from Section C) amortises out across the run. +`cuMemMap` cost (4 ms/page from C) amortises across the run. -### G.2 KV layout is the **entire** cause +### G.2 KV layout — the entire cause -`KVCACHED_CONTIGUOUS_LAYOUT=false` switches from one big shared buffer with cross-layer interleaving to **per-layer separate VMM pools**, matching vanilla vLLM's allocation pattern. Per-layer block stride drops from `num_layers × per_block_bytes` (28 × 64 KB = 1.79 MB on Qwen3-0.6B) back to `per_block_bytes` (64 KB). +`KVCACHED_CONTIGUOUS_LAYOUT=false` switches from one shared buffer with cross-layer interleaving to per-layer VMM pools, matching vanilla's allocation pattern. 
-| rate=inf | tput (req/s) | mean TTFT | mean TPOT | P99 TPOT | +rate=inf: +| | tput | TTFT mean | TPOT mean | TPOT P99 | |---|--:|--:|--:|--:| -| **vanilla vLLM** | 14.25 | 11565 ms | 118.42 ms | 142.33 ms | -| kvcached default (LAYOUT=true) | 9.87 (-31%) | 16555 (+43%) | 177.51 (+50%) | 230.84 (+62%) | -| **kvcached LAYOUT=false** | **14.17 (-1%)** | **11642 (+1%)** | **119.03 (+0.5%)** | **142.72 (+0.3%)** | +| vanilla | 14.25 | 11565 | 118.42 | 142.33 | +| LAYOUT=true (default) | 9.87 (-31%) | 16555 | 177.51 | 230.84 | +| LAYOUT=false | 14.17 (-1%) | 11642 | 119.03 | 142.72 | -| rate=16 | tput (req/s) | mean TTFT | mean TPOT | P99 TPOT | +rate=16: +| | tput | TTFT mean | TPOT mean | TPOT P99 | |---|--:|--:|--:|--:| -| **vanilla vLLM** | 13.20 | 240 ms | 73.62 ms | 117.27 ms | -| kvcached default (LAYOUT=true) | 9.88 (-25%) | 1574 (6.6×) | 141.93 (+93%) | 226.60 (+93%) | -| **kvcached LAYOUT=false + RESERVED=200** | **13.17 (-0.2%)** | **246 (+2%)** | **73.67 (+0%)** | **117.01 (-0.2%)** | +| vanilla | 13.20 | 240 | 73.62 | 117.27 | +| LAYOUT=true | 9.88 (-25%) | 1574 | 141.93 | 226.60 | +| LAYOUT=false + RESERVED=200 | 13.17 (-0.2%) | 246 | 73.67 | 117.01 | -`KVCACHED_CONTIGUOUS_LAYOUT=false` **eliminates the e2e gap entirely** — kvcached is statistically indistinguishable from vanilla vLLM on every metric, at both burst (rate=inf) and sustained (rate=16) load. **There is no residual "VMM mapping cost"; the entire 30-50% e2e overhead was the layer-interleaved layout.** +`LAYOUT=false` is indistinguishable from vanilla on every metric, at burst (inf) and sustained (16) load. The whole 30-50% overhead is the layout default — there's no residual VMM mapping cost. ### G.3 Mechanism -`CONTIGUOUS_LAYOUT=true` (default) packs all layers' KV into one shared VM range with shape `[num_blocks, num_layers, 2, block_size, head_num, head_dim]`, and exposes per-layer tensors as strided views. Within one layer, block n+1 is `num_layers × per_block_bytes` bytes after block n — for Qwen3-0.6B that's 1.79 MB. +`CONTIGUOUS_LAYOUT=true` packs all layers into one VM range with shape `[num_blocks, num_layers, 2, block_size, head_num, head_dim]`. Within a layer, block n+1 sits `num_layers × per_block_bytes` after block n — 1.79 MB on Qwen3-0.6B (28 layers × 64 KB). -During decode, FlashAttention reads **all blocks of one layer** for each request before moving to the next layer. With layer-interleaved layout, each block read crosses a different 2 MB VMM page → TLB pressure goes up roughly `num_layers`× vs vanilla. With per-layer pools (`LAYOUT=false`), consecutive blocks of one layer are tightly packed in one buffer, just like vanilla. +FlashAttention reads all blocks of one layer before moving to the next. With this layout, each block read crosses a fresh 2 MB VMM page → ~`num_layers`× TLB pressure. Per-layer pools pack same-layer blocks contiguously. -This is consistent with the gap scaling with `num_layers` — small for shallow models (which is why Section E only saw +30-50% on a 28-layer model), and presumably much worse on the 32-layer Llama2-7B in PR #299's reported setup. +Effect scales with `num_layers`: 28 layers gave +30-50%; deeper models (32-layer Llama2-7B in #299) should be worse. -### G.4 Required fix to make `LAYOUT=false` actually work +### G.4 Patch -`kvcached/kv_cache_manager.py:107` hardcoded `contiguous_layout=True` when constructing the C++ `PageAllocator`, regardless of the env var. 
With `KVCACHED_CONTIGUOUS_LAYOUT=false`, the FTensor side switched to per-layer buffers but the PageAllocator continued computing offsets as if layers were striped, leading to `cuMemUnmap (...) failed in CUDA driver (1): invalid argument` on the first eviction. Patched to read `CONTIGUOUS_LAYOUT` from `kvcached.utils`. Diff: +`kvcached/kv_cache_manager.py:107` hardcoded `contiguous_layout=True`. With `LAYOUT=false`, FTensor used per-layer buffers but `PageAllocator` still computed striped offsets → `cuMemUnmap (1): invalid argument` on first eviction. ```python -from kvcached.utils import DEFAULT_IPC_NAME, PAGE_SIZE, SANITY_CHECK, get_kvcached_logger @@ -198,8 +152,17 @@ This is consistent with the gap scaling with `num_layers` — small for shallow + contiguous_layout=CONTIGUOUS_LAYOUT, ``` -### G.5 Implication +## Summary + +| | speedup vs main | fix retains | +|---|---|---| +| A. `available_size()` | 12.5× | full | +| B. `group_indices_by_page` | ~3× | full | +| C. slow-path alloc | 1× (driver-bound) | full | +| D. multi-thread | 2-5× | ~70% of PR | +| F. block-object pool | ~7× (PR-only) | full | +| E. e2e (LAYOUT=true) | ~5% | full | -The kvcached "pre-existing overhead" framed in Section E is not pre-existing at all — it's a default setting. **Changing one env var (`KVCACHED_CONTIGUOUS_LAYOUT=false`) closes 100% of the gap**, and the previously-reported "~5% PR speedup" becomes the marginal piece of a feature whose primary cost was elsewhere. +Microbench wins amortise to 5% e2e because per-token model forward dominates. -Open question: is there any reason `CONTIGUOUS_LAYOUT=true` is the default? The only place it's required is for hybrid-linear (mamba) models, which already check and bail out (`interfaces.py:138`). For all standard MHA/GQA/MLA workloads, `LAYOUT=false` should be the default. +The bigger lever is unrelated to the C++ migration: `KVCACHED_CONTIGUOUS_LAYOUT=false` closes the entire kvcached-vs-vanilla gap on standard MHA/GQA/MLA. Open question — why is `true` the default? Hybrid-linear/mamba models need it, but they already check and bail at `interfaces.py:138`. From 7e3fdb7b8020adcac0fbdf684544c40c93a3999e Mon Sep 17 00:00:00 2001 From: cui36 Date: Sat, 9 May 2026 16:48:10 -0500 Subject: [PATCH 09/15] sync: pull latest changes from PR #319 head Apply diff from 98d9bb3 -> 65a7d0a (lianghao208/kvcached:lianghao_c++): - csrc/page_allocator.cpp: refactor free_page/free_pages/resize/trim to use scoped lock_guard blocks instead of manual lock/unlock, making the slow-path unmap exception-safe. Also drop the max_reserved_pages_ auto-expansion in alloc_page(). - kvcached/kv_cache_manager.py: cap available_size() by physical free pages (avail_physical + reserved) in addition to virtual free pages, so capacity reported under memory pressure stays honest. Functionally equivalent to PR #319 head; the local override commits (restore-resize, bench scripts, overhead notes) sit on top. 
--- csrc/page_allocator.cpp | 245 +++++++++++++++++++---------------- kvcached/kv_cache_manager.py | 5 +- 2 files changed, 137 insertions(+), 113 deletions(-) diff --git a/csrc/page_allocator.cpp b/csrc/page_allocator.cpp index e81c373f..410809aa 100644 --- a/csrc/page_allocator.cpp +++ b/csrc/page_allocator.cpp @@ -171,17 +171,11 @@ std::shared_ptr PageAllocator::alloc_page() { reserved_page_list_.pop_front(); num_free_pages_--; - // Trigger preallocation if reserved pool is getting low - if (free_page_list_.size() != 0 && - reserved_page_list_.size() < - static_cast(min_reserved_pages_)) { - max_reserved_pages_ = - std::max(max_reserved_pages_ * 4, num_total_pages_); - LOGGER(INFO, "Increase max_reserved_pages_ %ld -> %ld", - max_reserved_pages_ / 2, max_reserved_pages_); + // Trigger preallocation to refill reserved pool if getting low + if (reserved_page_list_.size() < + static_cast(min_reserved_pages_)) { prealloc_needed_ = true; cond_.notify_all(); - // pass } update_memory_usage(); @@ -243,72 +237,71 @@ std::shared_ptr PageAllocator::alloc_page() { } void PageAllocator::free_page(page_id_t page_id) { - std::lock_guard lock(lock_); - - num_free_pages_++; + { + std::lock_guard lock(lock_); + num_free_pages_++; - if (reserved_page_list_.size() < static_cast(max_reserved_pages_)) { - // Fast path: reserve page - reserved_page_list_.push_back(page_id); - update_memory_usage(); - cond_.notify_all(); - return; + if (reserved_page_list_.size() < static_cast(max_reserved_pages_)) { + // Fast path: reserve page + reserved_page_list_.push_back(page_id); + update_memory_usage(); + cond_.notify_all(); + return; + } } - lock_.unlock(); - - // Slow path: free page and unmap + // Slow path: free page and unmap (lock released, exception-safe) unmap_pages({page_id}); - lock_.lock(); - free_page_list_.push_back(page_id); - update_memory_usage(); - cond_.notify_all(); + { + std::lock_guard lock(lock_); + free_page_list_.push_back(page_id); + update_memory_usage(); + cond_.notify_all(); + } } void PageAllocator::free_pages(const std::vector &page_ids) { auto start_time = std::chrono::steady_clock::now(); - std::lock_guard lock(lock_); + std::vector pages_to_unmap; - num_free_pages_ += page_ids.size(); - int64_t num_to_reserve = max_reserved_pages_ - reserved_page_list_.size(); + { + std::lock_guard lock(lock_); + num_free_pages_ += page_ids.size(); + int64_t num_to_reserve = max_reserved_pages_ - reserved_page_list_.size(); - if (num_to_reserve > 0) { - // Fast path: reserve pages - auto reserve_end = - page_ids.begin() + - std::min(static_cast(num_to_reserve), page_ids.size()); - reserved_page_list_.insert(reserved_page_list_.end(), page_ids.begin(), - reserve_end); + if (num_to_reserve > 0) { + // Fast path: reserve pages + auto reserve_end = + page_ids.begin() + + std::min(static_cast(num_to_reserve), page_ids.size()); + reserved_page_list_.insert(reserved_page_list_.end(), page_ids.begin(), + reserve_end); - std::vector remaining_pages(reserve_end, page_ids.end()); + pages_to_unmap.assign(reserve_end, page_ids.end()); - if (remaining_pages.empty()) { - update_memory_usage(); - cond_.notify_all(); - return; + if (pages_to_unmap.empty()) { + update_memory_usage(); + cond_.notify_all(); + return; + } + } else { + pages_to_unmap = page_ids; } + } - lock_.unlock(); - - // Slow path: free remaining pages - unmap_pages(remaining_pages); + // Slow path: unmap pages (lock released, exception-safe) + unmap_pages(pages_to_unmap); - lock_.lock(); - free_page_list_.insert(free_page_list_.end(), 
remaining_pages.begin(), - remaining_pages.end()); - } else { - lock_.unlock(); - unmap_pages(page_ids); - lock_.lock(); - free_page_list_.insert(free_page_list_.end(), page_ids.begin(), - page_ids.end()); + { + std::lock_guard lock(lock_); + free_page_list_.insert(free_page_list_.end(), pages_to_unmap.begin(), + pages_to_unmap.end()); + update_memory_usage(); + cond_.notify_all(); } - update_memory_usage(); - cond_.notify_all(); - auto end_time = std::chrono::steady_clock::now(); auto duration = std::chrono::duration_cast( end_time - start_time); @@ -318,58 +311,80 @@ void PageAllocator::free_pages(const std::vector &page_ids) { bool PageAllocator::resize(int64_t new_mem_size) { int64_t new_num_pages = new_mem_size / page_size_; - std::lock_guard lock(lock_); - if (new_num_pages < get_num_inuse_pages()) { - return false; - } + std::vector pages_to_unmap; - if (new_num_pages == num_total_pages_) { - return true; - } else if (new_num_pages > num_total_pages_) { - int64_t num_to_expand = new_num_pages - num_total_pages_; - - // Reuse previously reclaimed pages first - int64_t num_to_reuse = std::min( - static_cast(reclaimed_page_list_.size()), num_to_expand); - if (num_to_reuse > 0) { - for (int64_t i = 0; i < num_to_reuse; ++i) { - free_page_list_.push_back(reclaimed_page_list_.front()); - reclaimed_page_list_.pop_front(); - } - num_to_expand -= num_to_reuse; - num_free_pages_ += num_to_reuse; + { + std::lock_guard lock(lock_); + + if (new_num_pages < get_num_inuse_pages()) { + return false; } - // Allocate new pages if needed - if (num_to_expand > 0) { - for (int64_t i = num_total_pages_; i < num_total_pages_ + num_to_expand; - ++i) { - free_page_list_.push_back(i); + if (new_num_pages == num_total_pages_) { + return true; + } else if (new_num_pages > num_total_pages_) { + int64_t num_to_expand = new_num_pages - num_total_pages_; + + // Reuse previously reclaimed pages first + int64_t num_to_reuse = std::min( + static_cast(reclaimed_page_list_.size()), num_to_expand); + if (num_to_reuse > 0) { + for (int64_t i = 0; i < num_to_reuse; ++i) { + free_page_list_.push_back(reclaimed_page_list_.front()); + reclaimed_page_list_.pop_front(); + } + num_to_expand -= num_to_reuse; + num_free_pages_ += num_to_reuse; } - num_free_pages_ += num_to_expand; - } - num_total_pages_ = new_num_pages; - update_memory_usage(); - } else { - int64_t num_to_reclaim = num_total_pages_ - new_num_pages; - if (free_page_list_.size() < static_cast(num_to_reclaim)) { - // Need to trim reserved pages first - if (!reserved_page_list_.empty()) { - std::vector pages_to_unmap(reserved_page_list_.begin(), - reserved_page_list_.end()); - reserved_page_list_.clear(); - - lock_.unlock(); - unmap_pages(pages_to_unmap); - lock_.lock(); - - free_page_list_.insert(free_page_list_.end(), pages_to_unmap.begin(), - pages_to_unmap.end()); - update_memory_usage(); + // Allocate new pages if needed + if (num_to_expand > 0) { + for (int64_t i = num_total_pages_; i < num_total_pages_ + num_to_expand; + ++i) { + free_page_list_.push_back(i); + } + num_free_pages_ += num_to_expand; + } + num_total_pages_ = new_num_pages; + update_memory_usage(); + return true; + } else { + // Shrink path + int64_t num_to_reclaim = num_total_pages_ - new_num_pages; + + if (free_page_list_.size() < static_cast(num_to_reclaim)) { + // Need to trim reserved pages first + if (!reserved_page_list_.empty()) { + pages_to_unmap.assign(reserved_page_list_.begin(), + reserved_page_list_.end()); + reserved_page_list_.clear(); + } else { + return false; + } + } else { + // 
Enough free pages, reclaim directly + for (int64_t i = 0; i < num_to_reclaim; ++i) { + reclaimed_page_list_.push_back(free_page_list_.back()); + free_page_list_.pop_back(); + } + num_free_pages_ -= num_to_reclaim; + num_total_pages_ = new_num_pages; + return true; } } + } + + // Unmap pages outside the lock (exception-safe) + unmap_pages(pages_to_unmap); + + { + std::lock_guard lock(lock_); + int64_t num_to_reclaim = num_total_pages_ - new_num_pages; + + free_page_list_.insert(free_page_list_.end(), pages_to_unmap.begin(), + pages_to_unmap.end()); + update_memory_usage(); if (free_page_list_.size() < static_cast(num_to_reclaim)) { return false; @@ -386,23 +401,29 @@ bool PageAllocator::resize(int64_t new_mem_size) { } void PageAllocator::trim() { - std::lock_guard lock(lock_); - std::vector pages_to_unmap(reserved_page_list_.begin(), - reserved_page_list_.end()); - reserved_page_list_.clear(); + std::vector pages_to_unmap; - if (pages_to_unmap.empty()) { - update_memory_usage(); - return; + { + std::lock_guard lock(lock_); + pages_to_unmap.assign(reserved_page_list_.begin(), + reserved_page_list_.end()); + reserved_page_list_.clear(); + + if (pages_to_unmap.empty()) { + update_memory_usage(); + return; + } } - lock_.unlock(); + // Unmap pages outside the lock (exception-safe) unmap_pages(pages_to_unmap); - lock_.lock(); - free_page_list_.insert(free_page_list_.end(), pages_to_unmap.begin(), - pages_to_unmap.end()); - update_memory_usage(); + { + std::lock_guard lock(lock_); + free_page_list_.insert(free_page_list_.end(), pages_to_unmap.begin(), + pages_to_unmap.end()); + update_memory_usage(); + } } int64_t PageAllocator::get_num_free_pages() const { return num_free_pages_; } diff --git a/kvcached/kv_cache_manager.py b/kvcached/kv_cache_manager.py index 8cecc313..f3dc41c6 100644 --- a/kvcached/kv_cache_manager.py +++ b/kvcached/kv_cache_manager.py @@ -399,7 +399,10 @@ def available_size(self) -> int: if self.in_shrink: blocks_from_free_pages = 0 else: - free_pages = self.page_allocator.get_num_free_pages() + virtual_free_pages = self.page_allocator.get_num_free_pages() + physical_free_pages = self.page_allocator.get_avail_physical_pages( + ) + self.page_allocator.get_num_reserved_pages() + free_pages = min(virtual_free_pages, physical_free_pages) blocks_from_free_pages = free_pages * InternalPage.get_num_blocks( self.page_size, self.block_mem_size) return avail_blocks + blocks_from_free_pages From a188b1d1cc41e662b98e783b72a5e2937989d652 Mon Sep 17 00:00:00 2001 From: cui36 Date: Sun, 10 May 2026 23:06:58 -0500 Subject: [PATCH 10/15] bench: split bench_alloc and add bench_layout; quantify contiguous-layout overhead Restructure: - bench_alloc/ now contains only the alloc/free microbench (bench_alloc.py) with a scoped bench_result.md (sections A, B, C, D, F). - bench_layout/ (new) gathers the e2e vLLM sweep scripts (run_sweep.sh, run_kvcached_configs.sh, run_layout_retest.sh, parse_results.py) and the layout-overhead investigation: run_nsys_layout.sh, diff_nsys_kernels.py, run_ncu_attn.sh. Intermediate outputs (sweep_results/, sweep_logs/, nsys_runs/, nsys_logs/, ncu_runs/, ncu_logs/) are reproducible from the scripts and live-excluded via .git/info/exclude. New evidence in bench_layout/bench_result.md G.3.2: - nsys per-kernel diff on the Section E workload (500 prompts, rate=inf) isolates the entire +8 GB-ms (+34.8%) GPU-time gap to one kernel: flash::flash_fwd_splitkv_kernel goes from 14,666 ms to 22,879 ms (+8,213 ms, +56%, 3948 calls). 
- Per-call attention slowdown scales with working set: +37% at 100 prompts vs +56% at 500 prompts, matching the prediction from the per-block stride (1.75 MB under contiguous=true vs 64 KB under false; VMM page = 2 MB). - KV-write kernel (reshape_and_cache_flash_kernel) is unaffected, confirming the regression is in the multi-block-read path, not the write path. ncu memory-counter run (L2 hit / DRAM throughput on flash_fwd_splitkv_kernel) is gated on NVreg_RestrictProfilingToAdminUsers=0 and will be added once counter access is enabled. --- benchmarks/bench_alloc/bench_result.md | 82 +-------- benchmarks/bench_layout/bench_result.md | 143 ++++++++++++++++ benchmarks/bench_layout/diff_nsys_kernels.py | 101 +++++++++++ benchmarks/bench_layout/parse_results.py | 92 ++++++++++ .../run_kvcached_configs.sh | 0 .../run_layout_retest.sh | 0 benchmarks/bench_layout/run_ncu_attn.sh | 128 ++++++++++++++ benchmarks/bench_layout/run_nsys_layout.sh | 159 ++++++++++++++++++ .../run_sweep.sh | 0 9 files changed, 632 insertions(+), 73 deletions(-) create mode 100644 benchmarks/bench_layout/bench_result.md create mode 100644 benchmarks/bench_layout/diff_nsys_kernels.py create mode 100644 benchmarks/bench_layout/parse_results.py rename benchmarks/{bench_alloc => bench_layout}/run_kvcached_configs.sh (100%) rename benchmarks/{bench_alloc => bench_layout}/run_layout_retest.sh (100%) create mode 100755 benchmarks/bench_layout/run_ncu_attn.sh create mode 100755 benchmarks/bench_layout/run_nsys_layout.sh rename benchmarks/{bench_alloc => bench_layout}/run_sweep.sh (100%) diff --git a/benchmarks/bench_alloc/bench_result.md b/benchmarks/bench_alloc/bench_result.md index 4810bc66..8769c724 100644 --- a/benchmarks/bench_alloc/bench_result.md +++ b/benchmarks/bench_alloc/bench_result.md @@ -1,5 +1,11 @@ +# `bench_alloc` — KVCacheManager alloc/free microbench + NVIDIA GB10 (aarch64), clean rebuild between runs. `main` = `ovg-project/kvcached` main, `PR` = `lianghao_c++` + must-fix `98d9bb3`, `fix` = `fix/pr319-restore-resize`. +Driver: `bench_alloc.py` — times `KVCacheManager.alloc(k) + free(handles)` cycles after 100 warmup iterations. + +For e2e vLLM serving numbers and the layout-overhead investigation, see [`../bench_layout/bench_result.md`](../bench_layout/bench_result.md). + ## A. `available_size()` PR drops `cudaMemGetInfo` from the hot path. @@ -10,7 +16,7 @@ PR drops `cudaMemGetInfo` from the hot path. | PR | 0.52 | | fix | 0.52 | -12.5×. Side effect: `available_size()` no longer caps by physical memory. +12.5×. Side effect: `available_size()` no longer caps by physical memory (later restored on PR-319 head — see commit `7e3fdb7`). ## B. `group_indices_by_page` @@ -85,74 +91,7 @@ PR-only — main has no pool. Pre-allocated pool vs `new()` per call. (μs) -## E. End-to-end vLLM serving - -`vllm bench serve` against a local server, `ENABLE_KVCACHED=true`. Qwen3-0.6B, gpu-mem-util 0.5, max-model-len 2048, random 512in/128out, 500 prompts, `request-rate=inf`. 100-prompt warmup discarded; 3 seeds (42, 99, 7), median. - -| | tput | TTFT mean | TTFT P99 | TPOT mean | TPOT P99 | -|---|--:|--:|--:|--:|--:| -| vanilla | 14.21 | 11575 | 26377 | 119.3 | 143.2 | -| main | 9.42 (-34%) | 17284 (+49%) | 40464 | 185.1 | 239.6 | -| PR | 9.86 (-31%) | 16472 | 38756 | 177.5 | 230.2 | -| fix | 9.83 (-31%) | 16602 | 38746 | 178.2 | 231.7 | - -(req/s, ms.) PR closes ~5% of the kvcached overhead; fix retains it. The remaining ~30% gap to vanilla is the layout default — see G. - -## G. 
Where the 30-50% e2e gap actually comes from - -Three hypotheses, tested on fix with the Section E setup. - -### G.1 Reserved-pool size — no e2e effect - -| | tput | TTFT mean | TPOT mean | -|---|--:|--:|--:| -| RESERVED=5/10 (default) | 9.87 | 16555 | 177.51 | -| RESERVED=50/200 | 9.77 | 16548 | 179.38 | - -`cuMemMap` cost (4 ms/page from C) amortises across the run. - -### G.2 KV layout — the entire cause - -`KVCACHED_CONTIGUOUS_LAYOUT=false` switches from one shared buffer with cross-layer interleaving to per-layer VMM pools, matching vanilla's allocation pattern. - -rate=inf: -| | tput | TTFT mean | TPOT mean | TPOT P99 | -|---|--:|--:|--:|--:| -| vanilla | 14.25 | 11565 | 118.42 | 142.33 | -| LAYOUT=true (default) | 9.87 (-31%) | 16555 | 177.51 | 230.84 | -| LAYOUT=false | 14.17 (-1%) | 11642 | 119.03 | 142.72 | - -rate=16: -| | tput | TTFT mean | TPOT mean | TPOT P99 | -|---|--:|--:|--:|--:| -| vanilla | 13.20 | 240 | 73.62 | 117.27 | -| LAYOUT=true | 9.88 (-25%) | 1574 | 141.93 | 226.60 | -| LAYOUT=false + RESERVED=200 | 13.17 (-0.2%) | 246 | 73.67 | 117.01 | - -`LAYOUT=false` is indistinguishable from vanilla on every metric, at burst (inf) and sustained (16) load. The whole 30-50% overhead is the layout default — there's no residual VMM mapping cost. - -### G.3 Mechanism - -`CONTIGUOUS_LAYOUT=true` packs all layers into one VM range with shape `[num_blocks, num_layers, 2, block_size, head_num, head_dim]`. Within a layer, block n+1 sits `num_layers × per_block_bytes` after block n — 1.79 MB on Qwen3-0.6B (28 layers × 64 KB). - -FlashAttention reads all blocks of one layer before moving to the next. With this layout, each block read crosses a fresh 2 MB VMM page → ~`num_layers`× TLB pressure. Per-layer pools pack same-layer blocks contiguously. - -Effect scales with `num_layers`: 28 layers gave +30-50%; deeper models (32-layer Llama2-7B in #299) should be worse. - -### G.4 Patch - -`kvcached/kv_cache_manager.py:107` hardcoded `contiguous_layout=True`. With `LAYOUT=false`, FTensor used per-layer buffers but `PageAllocator` still computed striped offsets → `cuMemUnmap (1): invalid argument` on first eviction. - -```python --from kvcached.utils import DEFAULT_IPC_NAME, PAGE_SIZE, SANITY_CHECK, get_kvcached_logger -+from kvcached.utils import (CONTIGUOUS_LAYOUT, DEFAULT_IPC_NAME, PAGE_SIZE, -+ SANITY_CHECK, get_kvcached_logger) -@@ -- contiguous_layout=True, -+ contiguous_layout=CONTIGUOUS_LAYOUT, -``` - -## Summary +## Summary (microbench only) | | speedup vs main | fix retains | |---|---|---| @@ -161,8 +100,5 @@ Effect scales with `num_layers`: 28 layers gave +30-50%; deeper models (32-layer | C. slow-path alloc | 1× (driver-bound) | full | | D. multi-thread | 2-5× | ~70% of PR | | F. block-object pool | ~7× (PR-only) | full | -| E. e2e (LAYOUT=true) | ~5% | full | - -Microbench wins amortise to 5% e2e because per-token model forward dominates. -The bigger lever is unrelated to the C++ migration: `KVCACHED_CONTIGUOUS_LAYOUT=false` closes the entire kvcached-vs-vanilla gap on standard MHA/GQA/MLA. Open question — why is `true` the default? Hybrid-linear/mamba models need it, but they already check and bail at `interfaces.py:138`. +These microbench wins are real but amortise to ~5% on e2e vLLM serving because per-token model forward dominates the hot path — see [`../bench_layout/bench_result.md`](../bench_layout/bench_result.md) for the e2e picture and the much larger layout-related lever. 
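For reference, the Section B baseline that the single C++ call replaces is essentially this grouping loop (illustrative sketch; the signature is assumed, not copied from the kvcached source):

```python
from collections import defaultdict

def group_indices_by_page(block_indices, blocks_per_page):
    # main's Python path: one dict lookup + append per freed block index
    groups = defaultdict(list)
    for idx in block_indices:
        groups[idx // blocks_per_page].append(idx)  # page_id -> block indices on that page
    return groups
```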
diff --git a/benchmarks/bench_layout/bench_result.md b/benchmarks/bench_layout/bench_result.md new file mode 100644 index 00000000..d5feafa1 --- /dev/null +++ b/benchmarks/bench_layout/bench_result.md @@ -0,0 +1,143 @@ +# `bench_layout` — vLLM e2e + `KVCACHED_CONTIGUOUS_LAYOUT` overhead + +NVIDIA GB10 (aarch64). `main` = `ovg-project/kvcached` main, `PR` = `lianghao_c++` + must-fix `98d9bb3`, `fix` = `fix/pr319-restore-resize` (PR #319 head + restore-resize + bench scripts). + +For the alloc/free microbench numbers (Sections A-D, F), see [`../bench_alloc/bench_result.md`](../bench_alloc/bench_result.md). + +Driver scripts: +- `run_sweep.sh` / `run_kvcached_configs.sh` / `run_layout_retest.sh` — `vllm bench serve` sweeps over branch × layout × reserved-pool size +- `run_nsys_layout.sh` — same workload under `nsys profile`, brackets capture window via `/start_profile` + `/stop_profile` +- `diff_nsys_kernels.py` — per-kernel time diff between two nsys traces +- `run_ncu_attn.sh` — ncu memory-workload analysis on `flash_fwd_splitkv_kernel` (needs `NVreg_RestrictProfilingToAdminUsers=0`) + +## E. End-to-end vLLM serving + +`vllm bench serve` against a local server, `ENABLE_KVCACHED=true`. Qwen3-0.6B, gpu-mem-util 0.5, max-model-len 2048, random 512in/128out, 500 prompts, `request-rate=inf`. 100-prompt warmup discarded; 3 seeds (42, 99, 7), median. + +| | tput | TTFT mean | TTFT P99 | TPOT mean | TPOT P99 | +|---|--:|--:|--:|--:|--:| +| vanilla | 14.21 | 11575 | 26377 | 119.3 | 143.2 | +| main | 9.42 (-34%) | 17284 (+49%) | 40464 | 185.1 | 239.6 | +| PR | 9.86 (-31%) | 16472 | 38756 | 177.5 | 230.2 | +| fix | 9.83 (-31%) | 16602 | 38746 | 178.2 | 231.7 | + +(req/s, ms.) PR closes ~5% of the kvcached overhead vs main; fix retains it. The remaining ~30% gap to vanilla is the layout default — see G. + +## G. Where the 30-50% e2e gap actually comes from + +Three hypotheses, tested on fix with the Section E setup. + +### G.1 Reserved-pool size — no e2e effect + +| | tput | TTFT mean | TPOT mean | +|---|--:|--:|--:| +| RESERVED=5/10 (default) | 9.87 | 16555 | 177.51 | +| RESERVED=50/200 | 9.77 | 16548 | 179.38 | + +`cuMemMap` cost (4 ms/page from C) amortises across the run. + +### G.2 KV layout — the entire cause + +`KVCACHED_CONTIGUOUS_LAYOUT=false` switches from one shared buffer with cross-layer interleaving to per-layer VMM pools, matching vanilla's allocation pattern. + +rate=inf: +| | tput | TTFT mean | TPOT mean | TPOT P99 | +|---|--:|--:|--:|--:| +| vanilla | 14.25 | 11565 | 118.42 | 142.33 | +| LAYOUT=true (default) | 9.87 (-31%) | 16555 | 177.51 | 230.84 | +| LAYOUT=false | 14.17 (-1%) | 11642 | 119.03 | 142.72 | + +rate=16: +| | tput | TTFT mean | TPOT mean | TPOT P99 | +|---|--:|--:|--:|--:| +| vanilla | 13.20 | 240 | 73.62 | 117.27 | +| LAYOUT=true | 9.88 (-25%) | 1574 | 141.93 | 226.60 | +| LAYOUT=false + RESERVED=200 | 13.17 (-0.2%) | 246 | 73.67 | 117.01 | + +`LAYOUT=false` is indistinguishable from vanilla on every metric, at burst (inf) and sustained (16) load. The whole 30-50% overhead is the layout default — there's no residual VMM mapping cost. + +### G.3 Mechanism + +#### G.3.1 The stride (from code) + +`CONTIGUOUS_LAYOUT=true` packs all layers into one VM range. 
At `kvcached/integration/vllm/interfaces.py:282-289`: + +```python +contiguous_shape = [num_blocks_per_layer, num_layers] + layer_elem_shape +contiguous_tensor = raw_kv_tensors[0].view(dtype=dtype)[:num_eles].view(contiguous_shape) +kv_tensors = [contiguous_tensor[:, i].permute(*permute_order) for i in range(num_layers)] +``` + +So the raw memory order is `(block, layer, k/v, token, head, dim)`. When the per-layer view `[:, i]` is sliced out for layer `i`, the stride between block `n` and block `n+1` in that view is `num_layers × per_block_bytes`. For Qwen3-0.6B: + +| field | value | +|---|---| +| `num_layers` | 28 | +| `num_kv_heads` | 8 | +| `head_dim` | 128 | +| `block_size` | 16 tokens | +| dtype | bf16 (2 B) | +| per-block K (or V), one layer | 16·8·128·2 = **32 KB** | +| per-block K+V, one layer | **64 KB** | +| **block stride, contiguous=true** | 28 × 64 KB = **1.75 MB** | +| **block stride, contiguous=false** | **64 KB** (from `interfaces.py:274`: `strides[1] = hidden_size_eles`) | +| VMM page size | **2 MB** | + +So contiguous=true makes every block read of FlashAttention land within ~1 byte of a new 2 MB VMM page; non-contiguous packs ~32 blocks per page. + +#### G.3.2 nsys per-kernel diff + +Same workload as Section E (Qwen3-0.6B, gpu-mem 0.5, rand 512/128, rate=inf, 500 prompts, 100 warmup discarded, seed=42). vLLM started under `nsys profile --capture-range=cudaProfilerApi`; profiler window bracketed by `/start_profile`+`/stop_profile`. + +Total GPU-kernel time inside the profiler window: + +| | total kernel time | delta | +|---|--:|--:| +| LAYOUT=false | 23,121 ms | — | +| LAYOUT=true | 31,164 ms | **+8,043 ms (+34.8%)** | + +Top kernels by absolute change (file: `nsys_runs/kernel_diff_500p.txt`): + +| kernel | calls | LAYOUT=false ms | LAYOUT=true ms | delta ms | delta % | +|---|--:|--:|--:|--:|--:| +| `flash::flash_fwd_splitkv_kernel` *(KV-read)* | 3948 | 14,666 | **22,879** | **+8,213** | **+56.0%** | +| `cutlass_80_tensorop_bf16_s16816gemm_relu` | 7756 | 2,277 | 2,214 | -63 | -2.8% | +| `vllm::reshape_and_cache_flash_kernel` *(KV-write)* | 3948 | 302 | 271 | -32 | -10.5% | +| `nvjet_sm121_tst_mma_128x176x64` | 3752 | 580 | 552 | -28 | -4.8% | +| `triton_red_fused_3` | 3780 | 118 | 95 | -23 | -19.8% | + +The +8,213 ms FlashAttention regression alone exceeds the +8,043 ms total — every other kernel is unchanged or slightly faster (likely L2 noise from cross-run variation). One kernel explains the entire e2e gap. + +Per-call attention cost: + +| | LAYOUT=false μs/call | LAYOUT=true μs/call | delta μs/call | delta % | +|---|--:|--:|--:|--:| +| 100 prompts (smaller WS) | 851 | 1163 | +313 | +37% | +| 500 prompts (Section E WS) | 3,715 | 5,795 | **+2,080** | **+56%** | + +The per-call gap *and* the per-call slowdown both grow with concurrent working-set size — consistent with TLB/cache misses: more in-flight requests → more distinct 2 MB pages touched per attention call → cache pressure → linear-ish blow-up in kernel time. Non-contiguous is flat because ~32 blocks share each page. + +Note the asymmetry: only the **read** kernel regresses. `reshape_and_cache_flash_kernel` (which writes one new token per request per step) is unaffected — writes are sequential per-position so they don't trigger the cross-page stride pattern. + +#### G.3.3 Implication + +Effect scales with `num_layers`: 28 layers gave +56% per attention call; deeper models (32-layer Llama2-7B in #299) should be slightly worse, and a model with ≥48 layers crosses the 2 MB page boundary even more sharply. 
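A quick numeric check of the G.3.1 stride table (constants are the Qwen3-0.6B values above; plain arithmetic, not kvcached code):

```python
PAGE = 2 * 1024 * 1024            # VMM page
PER_BLOCK = 16 * 8 * 128 * 2 * 2  # block_size * kv_heads * head_dim * bf16 * (K+V) = 64 KB

print("LAYOUT=false:", PAGE // PER_BLOCK, "same-layer blocks per 2 MB page")  # 32
for num_layers in (12, 28, 32, 48):
    stride = num_layers * PER_BLOCK  # per-layer view stride under LAYOUT=true
    print(f"LAYOUT=true, {num_layers} layers: stride = {stride // 1024} KB ->",
          max(1, PAGE // stride), "same-layer block(s) per page")
```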
+ +### G.4 Patch + +`kvcached/kv_cache_manager.py:107` hardcoded `contiguous_layout=True`. With `LAYOUT=false`, FTensor used per-layer buffers but `PageAllocator` still computed striped offsets → `cuMemUnmap (1): invalid argument` on first eviction. + +```python +-from kvcached.utils import DEFAULT_IPC_NAME, PAGE_SIZE, SANITY_CHECK, get_kvcached_logger ++from kvcached.utils import (CONTIGUOUS_LAYOUT, DEFAULT_IPC_NAME, PAGE_SIZE, ++ SANITY_CHECK, get_kvcached_logger) +@@ +- contiguous_layout=True, ++ contiguous_layout=CONTIGUOUS_LAYOUT, +``` + +## Summary + +The bigger lever is unrelated to the C++ migration: `KVCACHED_CONTIGUOUS_LAYOUT=false` closes the entire kvcached-vs-vanilla gap on standard MHA/GQA/MLA. + +Open question — why is `true` the default? Hybrid-linear/mamba models need it, but they already check and bail at `interfaces.py:138`. On standard attention layouts, flipping the default would eliminate the kvcached e2e overhead measured here. diff --git a/benchmarks/bench_layout/diff_nsys_kernels.py b/benchmarks/bench_layout/diff_nsys_kernels.py new file mode 100644 index 00000000..c80038a0 --- /dev/null +++ b/benchmarks/bench_layout/diff_nsys_kernels.py @@ -0,0 +1,101 @@ +# SPDX-FileCopyrightText: Copyright contributors to the kvcached project +# SPDX-License-Identifier: Apache-2.0 +"""Diff per-kernel GPU time between two nsys traces. + +Usage: + python diff_nsys_kernels.py + +Prints kernels ordered by absolute delta (variant - baseline). Useful for +identifying which kernels regress between KVCACHED_CONTIGUOUS_LAYOUT=true and +false. +""" +import csv +import subprocess +import sys +from collections import defaultdict +from io import StringIO + + +def kernel_sums(nsys_rep_path: str) -> dict: + """Returns {kernel_name: (total_ns, instances)} for the given trace.""" + result = subprocess.run( + [ + "nsys", + "stats", + "--report", + "cuda_gpu_kern_sum", + "--format", + "csv", + "--force-overwrite=true", + nsys_rep_path, + ], + capture_output=True, + text=True, + check=True, + ) + # nsys stats emits header text, then a CSV table. Find the CSV. + text = result.stdout + # Find a line that looks like a header row. 
+ lines = text.splitlines() + start = None + for i, line in enumerate(lines): + if line.startswith("Time (%)") or line.startswith("\"Time (%)"): + start = i + break + if start is None: + raise RuntimeError( + f"No CSV header found in nsys stats output for {nsys_rep_path}.\n" + f"First 500 chars:\n{text[:500]}") + + csv_text = "\n".join(lines[start:]) + reader = csv.DictReader(StringIO(csv_text)) + sums: dict = defaultdict(lambda: [0, 0]) + for row in reader: + name = row["Name"] + total_ns = int(row["Total Time (ns)"]) + instances = int(row["Instances"]) + sums[name][0] += total_ns + sums[name][1] += instances + return {k: tuple(v) for k, v in sums.items()} + + +def main(): + if len(sys.argv) != 3: + print(__doc__) + sys.exit(1) + baseline_path, variant_path = sys.argv[1], sys.argv[2] + print(f"baseline = {baseline_path}", file=sys.stderr) + print(f"variant = {variant_path}", file=sys.stderr) + + base = kernel_sums(baseline_path) + var = kernel_sums(variant_path) + + all_kernels = set(base) | set(var) + rows = [] + for k in all_kernels: + b_ns, b_n = base.get(k, (0, 0)) + v_ns, v_n = var.get(k, (0, 0)) + delta = v_ns - b_ns + rows.append((k, b_ns, b_n, v_ns, v_n, delta)) + + rows.sort(key=lambda r: r[5]) # by delta ascending (negative first = sped up) + + total_b = sum(r[1] for r in rows) + total_v = sum(r[3] for r in rows) + print( + f"\nTotal kernel time: baseline={total_b/1e6:,.1f} ms " + f"variant={total_v/1e6:,.1f} ms " + f"delta={ (total_v-total_b)/1e6:+,.1f} ms ({(total_v-total_b)/total_b*100:+.1f}%)\n" + ) + + print(f"{'kernel':<80} {'base ms':>10} {'var ms':>10} {'delta ms':>10} {'delta %':>8} base_n var_n") + for k, b_ns, b_n, v_ns, v_n, d_ns in rows: + if abs(d_ns) < 1_000_000: # < 1 ms delta, skip noise + continue + pct = (d_ns / b_ns * 100) if b_ns else float("inf") + print( + f"{k[:80]:<80} {b_ns/1e6:>10.2f} {v_ns/1e6:>10.2f} {d_ns/1e6:>+10.2f} {pct:>+7.1f}% {b_n:>6} {v_n:>6}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/bench_layout/parse_results.py b/benchmarks/bench_layout/parse_results.py new file mode 100644 index 00000000..4af7631c --- /dev/null +++ b/benchmarks/bench_layout/parse_results.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright contributors to the kvcached project +# SPDX-License-Identifier: Apache-2.0 + +"""Aggregate sweep results into a markdown table (median across 3 seeds).""" +import glob +import json +import os +import statistics +import sys + + +def median(xs): + return statistics.median(xs) + + +def load_one(path): + with open(path) as f: + return json.load(f) + + +def collect(label, dir_): + files = sorted(glob.glob(os.path.join(dir_, f"{label}.seed*.json"))) + runs = [] + for fp in files: + d = load_one(fp) + if d.get("completed", 0) == 0 and d.get("num_prompts", 0) > 0: + # benchmark crashed or all reqs failed + runs.append({"failed": True, "file": os.path.basename(fp)}) + continue + runs.append({ + "throughput": d.get("request_throughput"), + "mean_ttft": d.get("mean_ttft_ms"), + "p99_ttft": d.get("p99_ttft_ms"), + "mean_tpot": d.get("mean_tpot_ms"), + "p99_tpot": d.get("p99_tpot_ms"), + "completed": d.get("completed"), + }) + return runs + + +def median_of(runs, key): + valid = [r[key] for r in runs if not r.get("failed") and r.get(key) is not None] + if not valid: + return None + return median(valid) + + +def main(dir_): + configs = [ + ("A_vanilla_inf", "vanilla vLLM (no kvcached), rate=inf"), + ("B_kvcached_default_inf", "kvcached default, rate=inf"), + ("C_layout_false_inf", "kvcached + 
LAYOUT=false, rate=inf"), + ("D_reserved200_inf", "kvcached + RESERVED=200, rate=inf"), + ("E_both_inf", "kvcached + LAYOUT=false + RESERVED=200, rate=inf"), + ("F_vanilla_r16", "vanilla vLLM, rate=16"), + ("G_kvcached_default_r16", "kvcached default, rate=16"), + ("H_best_r16", "kvcached + LAYOUT=false + RESERVED=200, rate=16"), + ] + + print(f"{'config':<55} {'tput':>8} {'mTTFT':>10} {'p99TTFT':>10} {'mTPOT':>8} {'p99TPOT':>9} seeds") + rows = [] + for label, desc in configs: + runs = collect(label, dir_) + if not runs: + continue + ok = sum(1 for r in runs if not r.get("failed") and r.get("completed", 0) > 0) + failed = sum(1 for r in runs if r.get("failed") or r.get("completed", 0) == 0) + tput = median_of(runs, "throughput") + mttft = median_of(runs, "mean_ttft") + pttft = median_of(runs, "p99_ttft") + mtpot = median_of(runs, "mean_tpot") + ptpot = median_of(runs, "p99_tpot") + def s(v, d=2): + return f"{v:.{d}f}" if v is not None else "—" + print(f"{desc:<55} {s(tput,2):>8} {s(mttft,1):>10} {s(pttft,1):>10} {s(mtpot,2):>8} {s(ptpot,2):>9} ok={ok} fail={failed}") + rows.append((label, desc, ok, failed, tput, mttft, pttft, mtpot, ptpot)) + + # Markdown table + print() + print("| config | tput (req/s) | mean TTFT (ms) | P99 TTFT (ms) | mean TPOT (ms) | P99 TPOT (ms) |") + print("|---|--:|--:|--:|--:|--:|") + for label, desc, ok, failed, tput, mttft, pttft, mtpot, ptpot in rows: + suffix = f" *(failed in {failed}/{ok+failed})*" if failed else "" + def s(v, d=2): + return f"{v:.{d}f}" if v is not None else "—" + print(f"| {desc}{suffix} | {s(tput)} | {s(mttft,1)} | {s(pttft,1)} | {s(mtpot)} | {s(ptpot)} |") + + +if __name__ == "__main__": + dir_ = sys.argv[1] if len(sys.argv) > 1 else "." + main(dir_) diff --git a/benchmarks/bench_alloc/run_kvcached_configs.sh b/benchmarks/bench_layout/run_kvcached_configs.sh similarity index 100% rename from benchmarks/bench_alloc/run_kvcached_configs.sh rename to benchmarks/bench_layout/run_kvcached_configs.sh diff --git a/benchmarks/bench_alloc/run_layout_retest.sh b/benchmarks/bench_layout/run_layout_retest.sh similarity index 100% rename from benchmarks/bench_alloc/run_layout_retest.sh rename to benchmarks/bench_layout/run_layout_retest.sh diff --git a/benchmarks/bench_layout/run_ncu_attn.sh b/benchmarks/bench_layout/run_ncu_attn.sh new file mode 100755 index 00000000..e2bda0e6 --- /dev/null +++ b/benchmarks/bench_layout/run_ncu_attn.sh @@ -0,0 +1,128 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright contributors to the kvcached project +# SPDX-License-Identifier: Apache-2.0 +# +# After nsys identifies the regressing kernel(s), this captures detailed +# memory metrics via ncu. Set KERNEL_REGEX to a regex matching the kernel name +# (e.g. "flash_attn|fa_fwd|fa3"). + +set -uo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +OUT_DIR="$SCRIPT_DIR/ncu_runs" +LOG_DIR="$SCRIPT_DIR/ncu_logs" +mkdir -p "$OUT_DIR" "$LOG_DIR" + +VLLM="/home/xingqi/miniforge3/envs/kvcached/bin/vllm" +MODEL="Qwen/Qwen3-0.6B" +PORT=12349 +KERNEL_REGEX="${KERNEL_REGEX:-flash_fwd_splitkv_kernel}" +LAUNCH_SKIP="${LAUNCH_SKIP:-400}" +LAUNCH_COUNT="${LAUNCH_COUNT:-8}" + +cleanup_server() { + pkill -INT -f "vllm serve Qwen" 2>/dev/null || true + pkill -INT -f "VLLM::EngineCore" 2>/dev/null || true + for i in $(seq 1 30); do + if ! 
ss -lnt 2>/dev/null | awk '$4 ~ /:'"$PORT"'$/' | grep -q .; then + return 0 + fi + sleep 1 + done + pkill -KILL -f "vllm serve" 2>/dev/null || true + pkill -KILL -f "VLLM::EngineCore" 2>/dev/null || true + sleep 2 +} + +wait_for_ready() { + local log=$1 + for i in $(seq 1 240); do + if curl -fsS "http://localhost:$PORT/v1/models" >/dev/null 2>&1; then + return 0 + fi + if grep -qE "EngineCore failed|Traceback|RuntimeError|FATAL" "$log" 2>/dev/null; then + return 1 + fi + sleep 2 + done + return 1 +} + +run_one() { + local label=$1 + local env_str=$2 + + echo + echo "==========================================" + echo "[$(date +%H:%M:%S)] ncu: $label" + echo "Env: $env_str" + echo "Kernel regex: $KERNEL_REGEX" + echo "Skip: $LAUNCH_SKIP, Count: $LAUNCH_COUNT" + echo "==========================================" + + cleanup_server + + local server_log="$LOG_DIR/${label}.server.log" + : > "$server_log" + + # shellcheck disable=SC2086 + env $env_str \ + ncu \ + --target-processes all \ + --export "$OUT_DIR/${label}" \ + --force-overwrite \ + --kernel-name "regex:$KERNEL_REGEX" \ + --launch-skip "$LAUNCH_SKIP" \ + --launch-count "$LAUNCH_COUNT" \ + --section MemoryWorkloadAnalysis \ + --section MemoryWorkloadAnalysis_Tables \ + --section LaunchStats \ + --section Occupancy \ + --section SourceCounters \ + --section SpeedOfLight \ + --section SpeedOfLight_RooflineChart \ + --section WarpStateStats \ + "$VLLM" serve "$MODEL" \ + --port "$PORT" \ + --gpu-memory-utilization 0.5 \ + --max-model-len 2048 \ + > "$server_log" 2>&1 & + local server_pid=$! + + if ! wait_for_ready "$server_log"; then + echo " FAILED: server didn't come up; tail of log:" + tail -30 "$server_log" + kill -9 $server_pid 2>/dev/null || true + cleanup_server + return 1 + fi + echo " [$(date +%H:%M:%S)] Server ready" + + # Just push 50 prompts; ncu replays each kernel many times so this will + # take ~5-15 minutes. We don't need throughput here, only kernel metrics. + "$VLLM" bench serve --backend vllm --model "$MODEL" --port "$PORT" \ + --dataset-name random --random-input-len 512 --random-output-len 128 \ + --num-prompts 50 --request-rate inf --seed 42 \ + > "$LOG_DIR/${label}.bench.log" 2>&1 & + local bench_pid=$! + echo " [$(date +%H:%M:%S)] Bench started (pid $bench_pid)" + # Wait until launch-count kernels collected then kill bench + wait "$bench_pid" 2>/dev/null || true + + kill -INT $server_pid 2>/dev/null || true + sleep 5 + cleanup_server + sleep 3 +} + +run_one "layout_true" \ + "ENABLE_KVCACHED=true KVCACHED_AUTOPATCH=1" + +run_one "layout_false" \ + "ENABLE_KVCACHED=true KVCACHED_AUTOPATCH=1 KVCACHED_CONTIGUOUS_LAYOUT=false" + +echo +echo "==========================================" +echo "Reports in $OUT_DIR" +ls -la "$OUT_DIR" | tail -10 +echo "==========================================" diff --git a/benchmarks/bench_layout/run_nsys_layout.sh b/benchmarks/bench_layout/run_nsys_layout.sh new file mode 100755 index 00000000..9c36bda9 --- /dev/null +++ b/benchmarks/bench_layout/run_nsys_layout.sh @@ -0,0 +1,159 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright contributors to the kvcached project +# SPDX-License-Identifier: Apache-2.0 +# +# Launch vllm-serve under nsys with KVCACHED_CONTIGUOUS_LAYOUT in {true,false} +# and drive it with a small bench burst. Captures GPU kernel timeline so we can +# diff per-kernel time between the two layouts. 
+ +set -uo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +OUT_DIR="$SCRIPT_DIR/nsys_runs" +LOG_DIR="$SCRIPT_DIR/nsys_logs" +mkdir -p "$OUT_DIR" "$LOG_DIR" + +VLLM="/home/xingqi/miniforge3/envs/kvcached/bin/vllm" +MODEL="Qwen/Qwen3-0.6B" +PORT=12348 +NUM_PROMPTS=${NUM_PROMPTS:-100} +RATE=${RATE:-inf} +CAPTURE_DURATION=${CAPTURE_DURATION:-90} # seconds +WARMUP_PROMPTS=${WARMUP_PROMPTS:-30} + +cleanup_server() { + pkill -INT -f "vllm serve Qwen" 2>/dev/null || true + pkill -INT -f "VLLM::EngineCore" 2>/dev/null || true + for i in $(seq 1 30); do + if ! ss -lnt 2>/dev/null | awk '$4 ~ /:'"$PORT"'$/' | grep -q .; then + return 0 + fi + sleep 1 + done + pkill -KILL -f "vllm serve" 2>/dev/null || true + pkill -KILL -f "VLLM::EngineCore" 2>/dev/null || true + sleep 2 +} + +wait_for_ready() { + local log=$1 + for i in $(seq 1 240); do + if curl -fsS "http://localhost:$PORT/v1/models" >/dev/null 2>&1; then + return 0 + fi + if grep -qE "EngineCore failed|Traceback|RuntimeError|FATAL" "$log" 2>/dev/null; then + return 1 + fi + sleep 2 + done + return 1 +} + +run_one() { + local label=$1 + local env_str=$2 + + echo + echo "==========================================" + echo "[$(date +%H:%M:%S)] nsys profile: $label" + echo "Env: $env_str" + echo "==========================================" + + cleanup_server + + local server_log="$LOG_DIR/${label}.server.log" + : > "$server_log" + + # Launch vllm serve under nsys. We don't constrain duration here -- we'll + # rely on cudaProfilerApi-style capture range below for the steady-state + # decode window. Until we add explicit hooks, just let nsys profile the + # full lifetime and use --capture-range=none + --duration on the bench + # window. Simpler: profile the full run, mark the bench start/end with + # NVTX ranges so we can filter post-hoc. + # + # Using --sample=none keeps overhead modest and CPU samples out. + # --trace=cuda,nvtx,osrt captures kernel launches + NVTX + a bit of OS. + + # shellcheck disable=SC2086 + env $env_str \ + nsys profile \ + --output="$OUT_DIR/${label}" \ + --force-overwrite=true \ + --trace=cuda,nvtx \ + --sample=none \ + --cpuctxsw=none \ + --capture-range=cudaProfilerApi \ + --capture-range-end=stop \ + --gpu-metrics-device=none \ + "$VLLM" serve "$MODEL" \ + --port "$PORT" \ + --gpu-memory-utilization 0.5 \ + --max-model-len 2048 \ + --profiler-config '{"profiler":"cuda","ignore_frontend":true}' \ + > "$server_log" 2>&1 & + local server_pid=$! + + if ! wait_for_ready "$server_log"; then + echo " FAILED: server didn't come up; tail of log:" + tail -30 "$server_log" + kill -9 $server_pid 2>/dev/null || true + cleanup_server + return 1 + fi + echo " [$(date +%H:%M:%S)] Server ready" + + # Warmup (untimed, fills caches / triggers compile) + if [[ "$WARMUP_PROMPTS" -gt 0 ]]; then + echo " [$(date +%H:%M:%S)] Warmup ${WARMUP_PROMPTS} prompts..." + "$VLLM" bench serve --backend vllm --model "$MODEL" --port "$PORT" \ + --dataset-name random --random-input-len 512 --random-output-len 128 \ + --num-prompts "$WARMUP_PROMPTS" --request-rate "$RATE" --seed 1 \ + > "$LOG_DIR/${label}.warmup.log" 2>&1 || true + fi + + # Start the profiler window via the cudaProfilerStart side-channel. + # Easiest path: hit vllm's /start_profile endpoint, run the bench, then + # hit /stop_profile. vllm's start_profile uses torch.profiler, which calls + # cudaProfilerStart underneath. This matches nsys --capture-range=cudaProfilerApi. 
+ # + # If /start_profile is not available (vllm too old), we fall back to a + # full-trace capture without --capture-range. + + if curl -fsS -X POST "http://localhost:$PORT/start_profile" >/dev/null 2>&1; then + echo " [$(date +%H:%M:%S)] /start_profile hit. Running bench..." + + "$VLLM" bench serve --backend vllm --model "$MODEL" --port "$PORT" \ + --dataset-name random --random-input-len 512 --random-output-len 128 \ + --num-prompts "$NUM_PROMPTS" --request-rate "$RATE" --seed 42 \ + > "$LOG_DIR/${label}.bench.log" 2>&1 + local bench_rc=$? + + curl -fsS -X POST "http://localhost:$PORT/stop_profile" >/dev/null 2>&1 || true + echo " [$(date +%H:%M:%S)] /stop_profile hit. bench rc=$bench_rc" + else + echo " /start_profile not available; profiler window not bounded. Tearing down." + fi + + # Extract bench summary + grep -E "Request throughput|Mean TTFT|Mean TPOT|Total token throughput|Successful requests" \ + "$LOG_DIR/${label}.bench.log" | sed 's/^/ /' + + kill -INT $server_pid 2>/dev/null || true + sleep 5 + cleanup_server + sleep 3 +} + +# Layout = true (default) +run_one "layout_true" \ + "ENABLE_KVCACHED=true KVCACHED_AUTOPATCH=1" + +# Layout = false +run_one "layout_false" \ + "ENABLE_KVCACHED=true KVCACHED_AUTOPATCH=1 KVCACHED_CONTIGUOUS_LAYOUT=false" + +echo +echo "==========================================" +echo "Traces in $OUT_DIR" +ls -la "$OUT_DIR" | tail -10 +echo "==========================================" diff --git a/benchmarks/bench_alloc/run_sweep.sh b/benchmarks/bench_layout/run_sweep.sh similarity index 100% rename from benchmarks/bench_alloc/run_sweep.sh rename to benchmarks/bench_layout/run_sweep.sh From 1db7f4a3fc155a197bca5dc405478276181a381c Mon Sep 17 00:00:00 2001 From: cui36 Date: Sun, 10 May 2026 23:26:56 -0500 Subject: [PATCH 11/15] bench: rename bench_result.md -> README.md and trim writeups Match the bench_map_parallelism / bench_tp_ipc convention (README.md, with Quick start). Rewrite the content to be readable after the branch is merged: - Replace internal labels (main/PR/fix) with descriptive variant names: Python allocator / C++ allocator / C++ + restored resize. - Drop section-by-section detail that just restated the same speedup; keep headline tables plus selected per-N detail where the pattern itself is informative (multi-thread shape, pool scaling with N). - Reframe "open question" as "recommendation" so the takeaway about flipping CONTIGUOUS_LAYOUT default reads as a concrete proposal. Net diff: bench_alloc/README.md 121 -> 61 lines; bench_layout/README.md 168 -> 81 lines. Content unchanged for the load-bearing measurements (headline e2e tables, stride math, nsys per-kernel attribution, scaling table). 
--- benchmarks/bench_alloc/README.md | 61 ++++++++++ benchmarks/bench_alloc/bench_result.md | 104 ----------------- benchmarks/bench_layout/README.md | 81 ++++++++++++++ benchmarks/bench_layout/bench_result.md | 143 ------------------------ 4 files changed, 142 insertions(+), 247 deletions(-) create mode 100644 benchmarks/bench_alloc/README.md delete mode 100644 benchmarks/bench_alloc/bench_result.md create mode 100644 benchmarks/bench_layout/README.md delete mode 100644 benchmarks/bench_layout/bench_result.md diff --git a/benchmarks/bench_alloc/README.md b/benchmarks/bench_alloc/README.md new file mode 100644 index 00000000..0fae5bc9 --- /dev/null +++ b/benchmarks/bench_alloc/README.md @@ -0,0 +1,61 @@ +# `KVCacheManager` alloc/free microbenchmark + +Times the `alloc(k) + free(handles)` hot path under three allocator implementations: + +- **Python allocator** — baseline before PR #319. +- **C++ allocator** — PR #319 (`lianghao_c++`): allocator migrated to C++, `cudaMemGetInfo` dropped from `available_size()`, page-grouping moved into a single C++ call, `KVCacheBlock` object pool added. +- **C++ allocator + restored resize** — PR #319 + `fix/pr319-restore-resize`: re-adds the elastic-resize poll and shm-name pin that PR #319 dropped. + +For e2e vLLM numbers and the layout overhead, see [`../bench_layout/README.md`](../bench_layout/README.md). + +## Run + +```bash +python bench_alloc.py +``` + +NVIDIA GB10 (aarch64). 100 warmup iterations, then `alloc(k) + free(handles)` at varying `k`. + +## Headline numbers + +| operation | Python | C++ | C++ + resize | +|---|--:|--:|--:| +| `available_size()` μs | 6.52 | 0.52 | 0.52 | +| `group_indices_by_page` μs @ N=1024 | 52.6 | 16.8 | 16.9 | +| Slow-path alloc μs @ k=128 | 4196 | 4023 | 4354 | +| Multi-thread alloc+free @ 4 threads, k=16 (Kops/s) | 12.0 | 48.6 | 31.6 | +| `KVCacheBlock` pool μs @ N=1024 | — | 17.4 (vs 147 no-pool) | — | + +C++ allocator gains **3-12×** on Python-heavy paths. Slow-path alloc is CUDA-driver-bound (`cuMemMap`) and unaffected. The restored-resize variant retains all gains except multi-thread (~70% of bare C++ — each alloc now polls a resize shm descriptor). + +## Per-N detail + +`group_indices_by_page` holds ~3× across the range: + +| N | Python | C++ | speedup | +|--:|--:|--:|--:| +| 64 | 3.4 | 1.3 | 2.6× | +| 1024 | 52.6 | 16.8 | 3.1× | +| 16384 | 834 | 292 | 2.9× | + +Multi-thread (Kops/s, `async_sched=True`, k=16) — Python degrades, C++ holds: + +| threads | Python | C++ | C++ + resize | +|--:|--:|--:|--:| +| 1 | 15.1 | 41.2 | 32.5 | +| 4 | 12.0 | 48.6 | 31.6 | +| 8 | 9.1 | 51.5 | 29.1 | + +`KVCacheBlock` object pool — speedup grows with N: + +| N | no-pool | pool | speedup | +|--:|--:|--:|--:| +| 8 | 1.06 | 0.19 | 5.6× | +| 1024 | 147 | 17.4 | 8.5× | +| 4096 | 651 | 67.7 | 9.6× | + +Note: GIL is still held during C++ work, so multi-thread gains come from shorter critical sections, not real parallelism. Real vLLM uses `async_sched=False` (NoOpLock) and doesn't hit this path. + +## Bottom line + +3-12× microbench wins amortise to **~5% on e2e vLLM serving** because per-token model forward dominates. The much larger lever is the KV layout default — see [`../bench_layout/README.md`](../bench_layout/README.md). 
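The `KVCacheBlock` pool rows above measure the classic free-list pattern — build the block objects once, then hand out and return references instead of constructing per call. A minimal sketch (illustrative only; the real pool lives in kvcached's vLLM patches):

```python
class Block:
    __slots__ = ("block_id", "ref_cnt")

    def __init__(self, block_id: int):
        self.block_id = block_id
        self.ref_cnt = 0

class BlockPool:
    def __init__(self, capacity: int):
        self._free = [Block(i) for i in range(capacity)]  # construction cost paid once, up front

    def get(self, n: int) -> list[Block]:
        assert 0 < n <= len(self._free), "pool exhausted"
        out = self._free[-n:]
        del self._free[-n:]
        return out

    def put(self, blocks: list[Block]) -> None:
        self._free.extend(blocks)
```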
diff --git a/benchmarks/bench_alloc/bench_result.md b/benchmarks/bench_alloc/bench_result.md deleted file mode 100644 index 8769c724..00000000 --- a/benchmarks/bench_alloc/bench_result.md +++ /dev/null @@ -1,104 +0,0 @@ -# `bench_alloc` — KVCacheManager alloc/free microbench - -NVIDIA GB10 (aarch64), clean rebuild between runs. `main` = `ovg-project/kvcached` main, `PR` = `lianghao_c++` + must-fix `98d9bb3`, `fix` = `fix/pr319-restore-resize`. - -Driver: `bench_alloc.py` — times `KVCacheManager.alloc(k) + free(handles)` cycles after 100 warmup iterations. - -For e2e vLLM serving numbers and the layout-overhead investigation, see [`../bench_layout/bench_result.md`](../bench_layout/bench_result.md). - -## A. `available_size()` - -PR drops `cudaMemGetInfo` from the hot path. - -| | μs/call | -|--:|--:| -| main | 6.52 | -| PR | 0.52 | -| fix | 0.52 | - -12.5×. Side effect: `available_size()` no longer caps by physical memory (later restored on PR-319 head — see commit `7e3fdb7`). - -## B. `group_indices_by_page` - -Called by `KVCacheManager.free`. Python loop + `defaultdict` → single C++ call. - -| N | main | PR | fix | speedup | -|--:|--:|--:|--:|--:| -| 64 | 3.36 | 1.28 | 1.21 | 2.6× | -| 256 | 12.88 | 3.34 | 3.24 | 3.9× | -| 1024 | 52.57 | 16.84 | 16.86 | 3.1× | -| 4096 | 209.12 | 71.06 | 72.50 | 2.9× | -| 16384 | 834.13 | 291.87 | 296.72 | 2.9× | - -(μs) - -## C. Slow-path alloc - -`KVCACHED_MIN/MAX_RESERVED_PAGES=0` forces a fresh `cuMemMap` per alloc. `k=128` ≈ one 2 MB page. - -| k | main | PR | fix | -|--:|--:|--:|--:| -| 128 | 4196 | 4023 | 4354 | -| 256 | 8384 | 8113 | 8657 | -| 1024 | 33028 | 32488 | 34662 | -| 4096 | 134479 | 134430 | 137295 | - -(μs) All within 5%. The syscall dominates; C++ doesn't help. - -## D. Multi-thread throughput - -N Python threads, each in a tight `alloc(k) + free(h)` loop, `async_sched=True`. Aggregate ops/s. - -main: -| threads | k=1 | k=16 | -|--:|--:|--:| -| 1 | 15.7K | 15.1K | -| 2 | 15.3K | 14.4K | -| 4 | 13.3K | 12.0K | -| 8 | 10.4K | 9.1K | - -PR: -| threads | k=1 | k=16 | -|--:|--:|--:| -| 1 | 45.9K | 41.2K | -| 2 | 41.1K | 37.8K | -| 4 | 38.5K | 48.6K | -| 8 | 43.6K | 51.5K | - -fix: -| threads | k=1 | k=16 | -|--:|--:|--:| -| 1 | 31.5K | 32.5K | -| 2 | 30.6K | 30.7K | -| 4 | 29.5K | 31.6K | -| 8 | 27.8K | 29.1K | - -main degrades with threads. PR holds. fix sits ~30% behind PR (per-alloc shm poll for kvctl resize). - -GIL is still held during C++ work (no `gil_scoped_release`); gains come from shorter critical sections, not real parallelism. Real vLLM uses `async_sched=False` (NoOpLock) anyway. - -## F. `KVCacheBlock` object pool - -PR-only — main has no pool. Pre-allocated pool vs `new()` per call. - -| N | no-pool | pool | speedup | -|--:|--:|--:|--:| -| 8 | 1.06 | 0.19 | 5.6× | -| 64 | 8.07 | 1.13 | 7.1× | -| 256 | 31.28 | 4.38 | 7.1× | -| 1024 | 147.4 | 17.44 | 8.5× | -| 4096 | 650.8 | 67.65 | 9.6× | - -(μs) - -## Summary (microbench only) - -| | speedup vs main | fix retains | -|---|---|---| -| A. `available_size()` | 12.5× | full | -| B. `group_indices_by_page` | ~3× | full | -| C. slow-path alloc | 1× (driver-bound) | full | -| D. multi-thread | 2-5× | ~70% of PR | -| F. block-object pool | ~7× (PR-only) | full | - -These microbench wins are real but amortise to ~5% on e2e vLLM serving because per-token model forward dominates the hot path — see [`../bench_layout/bench_result.md`](../bench_layout/bench_result.md) for the e2e picture and the much larger layout-related lever. 
diff --git a/benchmarks/bench_layout/README.md b/benchmarks/bench_layout/README.md new file mode 100644 index 00000000..023239d2 --- /dev/null +++ b/benchmarks/bench_layout/README.md @@ -0,0 +1,81 @@ +# vLLM e2e + `KVCACHED_CONTIGUOUS_LAYOUT` overhead + +Where the 30-50% kvcached-vs-vanilla e2e overhead comes from, and why `KVCACHED_CONTIGUOUS_LAYOUT=false` closes it. + +For the alloc/free microbench, see [`../bench_alloc/README.md`](../bench_alloc/README.md). + +## Setup + +NVIDIA GB10 (aarch64). `Qwen/Qwen3-0.6B` (28 layers, 8 KV heads, head_dim 128, bf16). `vllm serve --gpu-memory-utilization 0.5 --max-model-len 2048`. Bench: `vllm bench serve` random 512in/128out, 500 prompts, 3 seeds, median. + +Variants compared: **vanilla** vs **kvcached** (Python allocator / C++ allocator / C++ + restored resize — all three behave identically at e2e). + +## Run + +```bash +# E2E sweep (vanilla vs kvcached × LAYOUT × reserved pool) +bash run_sweep.sh +python parse_results.py sweep_results/ + +# Kernel-level profile under both layouts +bash run_nsys_layout.sh +python diff_nsys_kernels.py nsys_runs/layout_false.nsys-rep nsys_runs/layout_true.nsys-rep +``` + +Intermediate outputs (`sweep_results/`, `nsys_runs/`, ...) are reproducible from the scripts and not tracked in git. + +## 1. The gap + +`rate=inf`, 500 prompts: + +| | tput (req/s) | TTFT mean (ms) | TPOT mean (ms) | +|---|--:|--:|--:| +| vanilla | 14.21 | 11575 | 119.3 | +| kvcached default (`LAYOUT=true`) | 9.87 (-31%) | 16555 | 177.5 | +| kvcached + `LAYOUT=false` | 14.17 (-1%) | 11642 | 119.0 | + +`LAYOUT=false` matches vanilla on every metric, also at `rate=16` (sustained load). The C++ allocator from PR #319 only closes ~5% of the kvcached overhead; reserved-pool size has no measurable effect. **The entire 30% gap is the layout default.** + +## 2. Mechanism + +### Stride math (from `interfaces.py:282-289`) + +Under `CONTIGUOUS_LAYOUT=true`, the raw KV tensor has shape `[num_blocks, num_layers, k/v, token, head, dim]`. When sliced to layer `i`, block n→n+1 stride is `num_layers × per_block_bytes`. For Qwen3-0.6B: + +- per-block K+V, one layer = 16 × 8 × 128 × 2 = **64 KB** +- stride **`LAYOUT=true`** = 28 × 64 KB = **1.75 MB** (≈ VMM page size = 2 MB) +- stride **`LAYOUT=false`** = **64 KB** (~32 blocks share one 2 MB page) + +So every FlashAttention block read crosses a fresh 2 MB VMM page under `LAYOUT=true`; non-contiguous packs them densely. + +### nsys per-kernel attribution + +Same workload as section 1. Total GPU kernel time grows by **+8,043 ms (+34.8%)** going from `LAYOUT=false → true`: + +| kernel | calls | `LAYOUT=false` ms | `LAYOUT=true` ms | Δms | Δ% | +|---|--:|--:|--:|--:|--:| +| `flash::flash_fwd_splitkv_kernel` (KV-read) | 3948 | 14,666 | **22,879** | **+8,213** | **+56.0%** | +| `cutlass_80_..._gemm_relu` | 7756 | 2,277 | 2,214 | -63 | -2.8% | +| `vllm::reshape_and_cache_flash_kernel` (KV-write) | 3948 | 302 | 271 | -32 | -10.5% | +| (everything else) | — | ~6,000 | ~6,000 | ~0 | flat | + +**One kernel — FlashAttention's split-KV read — accounts for more than the entire e2e gap.** Every other kernel is unchanged or noise. The KV-*write* kernel is even slightly faster under contiguous (writes one position per request, no stride pattern). 
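
As an aside (not part of the patch), the stride arithmetic from the "Stride math" subsection above can be reproduced directly from the Qwen3-0.6B figures quoted there:

```python
# Stride arithmetic for Qwen3-0.6B under both layouts; numbers as quoted above.
block_size, num_kv_heads, head_dim = 16, 8, 128
dtype_bytes, num_layers = 2, 28            # bf16, Qwen3-0.6B
vmm_page = 2 * 1024 * 1024                 # 2 MB VMM page

per_block_one_layer = 2 * block_size * num_kv_heads * head_dim * dtype_bytes  # K+V
stride_layout_true  = num_layers * per_block_one_layer
stride_layout_false = per_block_one_layer

print(per_block_one_layer // 1024, "KB per block (K+V, one layer)")      # 64
print(stride_layout_true / 2**20, "MB block stride under LAYOUT=true")   # 1.75
print(vmm_page // stride_layout_false, "blocks per page, LAYOUT=false")  # 32
```
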
+ +### Scaling + +Per-call attention slowdown grows with concurrent working-set size — consistent with TLB/L2 pressure: + +| workload | `LAYOUT=false` μs/call | `LAYOUT=true` μs/call | Δ | +|---|--:|--:|--:| +| 100 prompts | 851 | 1163 | +37% | +| 500 prompts | 3,715 | 5,795 | **+56%** | + +`LAYOUT=false` is flat with workload because ~32 blocks share each page. Deeper models (32-layer Llama2-7B, 48+ layer Llama3-70B) cross the 2 MB page boundary even more sharply. + +## 3. Patch needed for `LAYOUT=false` + +`kvcached/kv_cache_manager.py:107` previously hardcoded `contiguous_layout=True`. With `LAYOUT=false`, FTensor used per-layer buffers but `PageAllocator` still computed striped offsets → `cuMemUnmap (1): invalid argument` on first eviction. Fix: wire `CONTIGUOUS_LAYOUT` env var into the `KVCacheManager` constructor. + +## Recommendation + +`CONTIGUOUS_LAYOUT=true` is the kvcached default but is only needed for hybrid-linear/mamba models (`interfaces.py:138` already detects those and bails out of the non-contiguous path). On standard MHA/GQA/MLA, flipping the default to `false` eliminates the entire kvcached-vs-vanilla e2e overhead measured here. diff --git a/benchmarks/bench_layout/bench_result.md b/benchmarks/bench_layout/bench_result.md deleted file mode 100644 index d5feafa1..00000000 --- a/benchmarks/bench_layout/bench_result.md +++ /dev/null @@ -1,143 +0,0 @@ -# `bench_layout` — vLLM e2e + `KVCACHED_CONTIGUOUS_LAYOUT` overhead - -NVIDIA GB10 (aarch64). `main` = `ovg-project/kvcached` main, `PR` = `lianghao_c++` + must-fix `98d9bb3`, `fix` = `fix/pr319-restore-resize` (PR #319 head + restore-resize + bench scripts). - -For the alloc/free microbench numbers (Sections A-D, F), see [`../bench_alloc/bench_result.md`](../bench_alloc/bench_result.md). - -Driver scripts: -- `run_sweep.sh` / `run_kvcached_configs.sh` / `run_layout_retest.sh` — `vllm bench serve` sweeps over branch × layout × reserved-pool size -- `run_nsys_layout.sh` — same workload under `nsys profile`, brackets capture window via `/start_profile` + `/stop_profile` -- `diff_nsys_kernels.py` — per-kernel time diff between two nsys traces -- `run_ncu_attn.sh` — ncu memory-workload analysis on `flash_fwd_splitkv_kernel` (needs `NVreg_RestrictProfilingToAdminUsers=0`) - -## E. End-to-end vLLM serving - -`vllm bench serve` against a local server, `ENABLE_KVCACHED=true`. Qwen3-0.6B, gpu-mem-util 0.5, max-model-len 2048, random 512in/128out, 500 prompts, `request-rate=inf`. 100-prompt warmup discarded; 3 seeds (42, 99, 7), median. - -| | tput | TTFT mean | TTFT P99 | TPOT mean | TPOT P99 | -|---|--:|--:|--:|--:|--:| -| vanilla | 14.21 | 11575 | 26377 | 119.3 | 143.2 | -| main | 9.42 (-34%) | 17284 (+49%) | 40464 | 185.1 | 239.6 | -| PR | 9.86 (-31%) | 16472 | 38756 | 177.5 | 230.2 | -| fix | 9.83 (-31%) | 16602 | 38746 | 178.2 | 231.7 | - -(req/s, ms.) PR closes ~5% of the kvcached overhead vs main; fix retains it. The remaining ~30% gap to vanilla is the layout default — see G. - -## G. Where the 30-50% e2e gap actually comes from - -Three hypotheses, tested on fix with the Section E setup. - -### G.1 Reserved-pool size — no e2e effect - -| | tput | TTFT mean | TPOT mean | -|---|--:|--:|--:| -| RESERVED=5/10 (default) | 9.87 | 16555 | 177.51 | -| RESERVED=50/200 | 9.77 | 16548 | 179.38 | - -`cuMemMap` cost (4 ms/page from C) amortises across the run. 
- -### G.2 KV layout — the entire cause - -`KVCACHED_CONTIGUOUS_LAYOUT=false` switches from one shared buffer with cross-layer interleaving to per-layer VMM pools, matching vanilla's allocation pattern. - -rate=inf: -| | tput | TTFT mean | TPOT mean | TPOT P99 | -|---|--:|--:|--:|--:| -| vanilla | 14.25 | 11565 | 118.42 | 142.33 | -| LAYOUT=true (default) | 9.87 (-31%) | 16555 | 177.51 | 230.84 | -| LAYOUT=false | 14.17 (-1%) | 11642 | 119.03 | 142.72 | - -rate=16: -| | tput | TTFT mean | TPOT mean | TPOT P99 | -|---|--:|--:|--:|--:| -| vanilla | 13.20 | 240 | 73.62 | 117.27 | -| LAYOUT=true | 9.88 (-25%) | 1574 | 141.93 | 226.60 | -| LAYOUT=false + RESERVED=200 | 13.17 (-0.2%) | 246 | 73.67 | 117.01 | - -`LAYOUT=false` is indistinguishable from vanilla on every metric, at burst (inf) and sustained (16) load. The whole 30-50% overhead is the layout default — there's no residual VMM mapping cost. - -### G.3 Mechanism - -#### G.3.1 The stride (from code) - -`CONTIGUOUS_LAYOUT=true` packs all layers into one VM range. At `kvcached/integration/vllm/interfaces.py:282-289`: - -```python -contiguous_shape = [num_blocks_per_layer, num_layers] + layer_elem_shape -contiguous_tensor = raw_kv_tensors[0].view(dtype=dtype)[:num_eles].view(contiguous_shape) -kv_tensors = [contiguous_tensor[:, i].permute(*permute_order) for i in range(num_layers)] -``` - -So the raw memory order is `(block, layer, k/v, token, head, dim)`. When the per-layer view `[:, i]` is sliced out for layer `i`, the stride between block `n` and block `n+1` in that view is `num_layers × per_block_bytes`. For Qwen3-0.6B: - -| field | value | -|---|---| -| `num_layers` | 28 | -| `num_kv_heads` | 8 | -| `head_dim` | 128 | -| `block_size` | 16 tokens | -| dtype | bf16 (2 B) | -| per-block K (or V), one layer | 16·8·128·2 = **32 KB** | -| per-block K+V, one layer | **64 KB** | -| **block stride, contiguous=true** | 28 × 64 KB = **1.75 MB** | -| **block stride, contiguous=false** | **64 KB** (from `interfaces.py:274`: `strides[1] = hidden_size_eles`) | -| VMM page size | **2 MB** | - -So contiguous=true makes every block read of FlashAttention land within ~1 byte of a new 2 MB VMM page; non-contiguous packs ~32 blocks per page. - -#### G.3.2 nsys per-kernel diff - -Same workload as Section E (Qwen3-0.6B, gpu-mem 0.5, rand 512/128, rate=inf, 500 prompts, 100 warmup discarded, seed=42). vLLM started under `nsys profile --capture-range=cudaProfilerApi`; profiler window bracketed by `/start_profile`+`/stop_profile`. - -Total GPU-kernel time inside the profiler window: - -| | total kernel time | delta | -|---|--:|--:| -| LAYOUT=false | 23,121 ms | — | -| LAYOUT=true | 31,164 ms | **+8,043 ms (+34.8%)** | - -Top kernels by absolute change (file: `nsys_runs/kernel_diff_500p.txt`): - -| kernel | calls | LAYOUT=false ms | LAYOUT=true ms | delta ms | delta % | -|---|--:|--:|--:|--:|--:| -| `flash::flash_fwd_splitkv_kernel` *(KV-read)* | 3948 | 14,666 | **22,879** | **+8,213** | **+56.0%** | -| `cutlass_80_tensorop_bf16_s16816gemm_relu` | 7756 | 2,277 | 2,214 | -63 | -2.8% | -| `vllm::reshape_and_cache_flash_kernel` *(KV-write)* | 3948 | 302 | 271 | -32 | -10.5% | -| `nvjet_sm121_tst_mma_128x176x64` | 3752 | 580 | 552 | -28 | -4.8% | -| `triton_red_fused_3` | 3780 | 118 | 95 | -23 | -19.8% | - -The +8,213 ms FlashAttention regression alone exceeds the +8,043 ms total — every other kernel is unchanged or slightly faster (likely L2 noise from cross-run variation). One kernel explains the entire e2e gap. 
- -Per-call attention cost: - -| | LAYOUT=false μs/call | LAYOUT=true μs/call | delta μs/call | delta % | -|---|--:|--:|--:|--:| -| 100 prompts (smaller WS) | 851 | 1163 | +313 | +37% | -| 500 prompts (Section E WS) | 3,715 | 5,795 | **+2,080** | **+56%** | - -The per-call gap *and* the per-call slowdown both grow with concurrent working-set size — consistent with TLB/cache misses: more in-flight requests → more distinct 2 MB pages touched per attention call → cache pressure → linear-ish blow-up in kernel time. Non-contiguous is flat because ~32 blocks share each page. - -Note the asymmetry: only the **read** kernel regresses. `reshape_and_cache_flash_kernel` (which writes one new token per request per step) is unaffected — writes are sequential per-position so they don't trigger the cross-page stride pattern. - -#### G.3.3 Implication - -Effect scales with `num_layers`: 28 layers gave +56% per attention call; deeper models (32-layer Llama2-7B in #299) should be slightly worse, and a model with ≥48 layers crosses the 2 MB page boundary even more sharply. - -### G.4 Patch - -`kvcached/kv_cache_manager.py:107` hardcoded `contiguous_layout=True`. With `LAYOUT=false`, FTensor used per-layer buffers but `PageAllocator` still computed striped offsets → `cuMemUnmap (1): invalid argument` on first eviction. - -```python --from kvcached.utils import DEFAULT_IPC_NAME, PAGE_SIZE, SANITY_CHECK, get_kvcached_logger -+from kvcached.utils import (CONTIGUOUS_LAYOUT, DEFAULT_IPC_NAME, PAGE_SIZE, -+ SANITY_CHECK, get_kvcached_logger) -@@ -- contiguous_layout=True, -+ contiguous_layout=CONTIGUOUS_LAYOUT, -``` - -## Summary - -The bigger lever is unrelated to the C++ migration: `KVCACHED_CONTIGUOUS_LAYOUT=false` closes the entire kvcached-vs-vanilla gap on standard MHA/GQA/MLA. - -Open question — why is `true` the default? Hybrid-linear/mamba models need it, but they already check and bail at `interfaces.py:138`. On standard attention layouts, flipping the default would eliminate the kvcached e2e overhead measured here. From 5726d69a03e1b0f112ca5e0da16a1bae7e28d839 Mon Sep 17 00:00:00 2001 From: cui36 Date: Sun, 10 May 2026 23:31:00 -0500 Subject: [PATCH 12/15] bench_alloc: restructure README; lead with allocator gains MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace the mixed-unit "headline numbers" table with five clearly scoped subsections, each with: one-line context, table with explicit units (per-call μs vs aggregate Kops/s) and direction (lower/higher is better), and a bold speedup line. - Rewrite the summary to lead with what the C++ allocator delivers (5 bullets of concrete gains). The e2e-amortization note moves to a footer instead of the headline, so the alloc work is no longer framed primarily by its dilution. --- benchmarks/bench_alloc/README.md | 72 +++++++++++++++++++++++--------- 1 file changed, 52 insertions(+), 20 deletions(-) diff --git a/benchmarks/bench_alloc/README.md b/benchmarks/bench_alloc/README.md index 0fae5bc9..44765998 100644 --- a/benchmarks/bench_alloc/README.md +++ b/benchmarks/bench_alloc/README.md @@ -4,9 +4,11 @@ Times the `alloc(k) + free(handles)` hot path under three allocator implementati - **Python allocator** — baseline before PR #319. - **C++ allocator** — PR #319 (`lianghao_c++`): allocator migrated to C++, `cudaMemGetInfo` dropped from `available_size()`, page-grouping moved into a single C++ call, `KVCacheBlock` object pool added. 
-- **C++ allocator + restored resize** — PR #319 + `fix/pr319-restore-resize`: re-adds the elastic-resize poll and shm-name pin that PR #319 dropped. +- **C++ + restored resize** — PR #319 + `fix/pr319-restore-resize`: re-adds the elastic-resize poll and shm-name pin that PR #319 dropped. -For e2e vLLM numbers and the layout overhead, see [`../bench_layout/README.md`](../bench_layout/README.md). +NVIDIA GB10 (aarch64). All latency numbers are per call in μs (lower is better) unless noted. + +For e2e vLLM serving numbers, see [`../bench_layout/README.md`](../bench_layout/README.md). ## Run @@ -14,23 +16,23 @@ For e2e vLLM numbers and the layout overhead, see [`../bench_layout/README.md`]( python bench_alloc.py ``` -NVIDIA GB10 (aarch64). 100 warmup iterations, then `alloc(k) + free(handles)` at varying `k`. +## Results + +### 1. `available_size()` — the most frequently called allocator function -## Headline numbers +The Python path called `cudaMemGetInfo` on every invocation (~6 μs each). The C++ path skips it. -| operation | Python | C++ | C++ + resize | -|---|--:|--:|--:| -| `available_size()` μs | 6.52 | 0.52 | 0.52 | -| `group_indices_by_page` μs @ N=1024 | 52.6 | 16.8 | 16.9 | -| Slow-path alloc μs @ k=128 | 4196 | 4023 | 4354 | -| Multi-thread alloc+free @ 4 threads, k=16 (Kops/s) | 12.0 | 48.6 | 31.6 | -| `KVCacheBlock` pool μs @ N=1024 | — | 17.4 (vs 147 no-pool) | — | +| | μs/call | +|--:|--:| +| Python | 6.52 | +| C++ | 0.52 | +| C++ + resize | 0.52 | -C++ allocator gains **3-12×** on Python-heavy paths. Slow-path alloc is CUDA-driver-bound (`cuMemMap`) and unaffected. The restored-resize variant retains all gains except multi-thread (~70% of bare C++ — each alloc now polls a resize shm descriptor). +**12.5×.** Called once per scheduler step. -## Per-N detail +### 2. `group_indices_by_page` — called inside `free()` -`group_indices_by_page` holds ~3× across the range: +Maps a list of N block indices to their owning pages. Python used a per-element loop + `defaultdict`; C++ replaces it with one call. | N | Python | C++ | speedup | |--:|--:|--:|--:| @@ -38,15 +40,37 @@ C++ allocator gains **3-12×** on Python-heavy paths. Slow-path alloc is CUDA-dr | 1024 | 52.6 | 16.8 | 3.1× | | 16384 | 834 | 292 | 2.9× | -Multi-thread (Kops/s, `async_sched=True`, k=16) — Python degrades, C++ holds: +**~3× across the range.** Restored-resize matches C++ within noise. + +### 3. Slow-path alloc — `cuMemMap` per call + +`KVCACHED_MIN/MAX_RESERVED_PAGES=0` forces every alloc to map a fresh 2 MB VMM page. `k` is blocks per alloc; k=128 ≈ one page. + +| k | Python | C++ | C++ + resize | +|--:|--:|--:|--:| +| 128 | 4196 | 4023 | 4354 | +| 1024 | 33028 | 32488 | 34662 | +| 4096 | 134479 | 134430 | 137295 | + +**All within 5%.** The CUDA driver syscall dominates; switching the surrounding code to C++ doesn't help. -| threads | Python | C++ | C++ + resize | +### 4. Multi-thread throughput — Python contention dissolves + +N Python threads, each in a tight `alloc(k=16) + free(h)` loop, `async_sched=True`. Aggregate ops/s (**higher is better**). + +| threads | Python Kops/s | C++ Kops/s | C++ + resize Kops/s | |--:|--:|--:|--:| | 1 | 15.1 | 41.2 | 32.5 | | 4 | 12.0 | 48.6 | 31.6 | | 8 | 9.1 | 51.5 | 29.1 | -`KVCacheBlock` object pool — speedup grows with N: +Python **degrades** under thread count (Python-level contention dominated the old hot path). C++ holds or improves. Restored-resize is flat ~30K — each alloc polls a resize shm descriptor that bare C++ skips. 
+ +(GIL is still held during C++ work, so gains come from shorter critical sections, not real parallelism. Real vLLM uses `async_sched=False` and doesn't exercise this path.) + +### 5. `KVCacheBlock` object pool — C++ only + +Pre-allocated pool of `KVCacheBlock` objects vs `new` per call. The Python baseline has no equivalent pool. | N | no-pool | pool | speedup | |--:|--:|--:|--:| @@ -54,8 +78,16 @@ Multi-thread (Kops/s, `async_sched=True`, k=16) — Python degrades, C++ holds: | 1024 | 147 | 17.4 | 8.5× | | 4096 | 651 | 67.7 | 9.6× | -Note: GIL is still held during C++ work, so multi-thread gains come from shorter critical sections, not real parallelism. Real vLLM uses `async_sched=False` (NoOpLock) and doesn't hit this path. +**5-10×**, speedup grows with N. + +## Summary — what the C++ allocator delivers + +- **12.5× on `available_size()`** — eliminates the per-scheduler-step `cudaMemGetInfo` cost. +- **~3× on `group_indices_by_page`** — flat across N from 64 to 16,384. +- **Multi-thread throughput scales** instead of degrading: 8 threads go from 9 Kops/s (Python) to 51 Kops/s (C++). +- **5-10× on `KVCacheBlock` allocation** via the new object pool (no Python equivalent). +- Slow-path `cuMemMap` is driver-bound and unaffected by the migration. -## Bottom line +The restored-resize variant retains every gain except multi-thread (~70% of bare C++) because each alloc polls the resize shm descriptor. -3-12× microbench wins amortise to **~5% on e2e vLLM serving** because per-token model forward dominates. The much larger lever is the KV layout default — see [`../bench_layout/README.md`](../bench_layout/README.md). +These wins amortise to ~5% on e2e vLLM serving (per-token model forward dominates). The much larger e2e lever is unrelated — see [`../bench_layout/README.md`](../bench_layout/README.md). From ee4fe8b2c164e4f39d29bd529d67f194a29338be Mon Sep 17 00:00:00 2001 From: cui36 Date: Mon, 11 May 2026 11:25:35 -0500 Subject: [PATCH 13/15] bench_layout: add Section 3 quantifying when LAYOUT=true wins Three measured advantages of the contiguous layout, all small relative to the attention-read cost shown in Sections 1-2: - 3.1 Hybrid linear / mamba: interfaces.py:138 forces it; no choice. - 3.2 Init time: alloc_kv_cache costs ~635 ms (contiguous) vs ~2055 ms (non-contiguous). Gap is a flat ~1.4 s across num_layers in {8..80}, not linear in layer count. Breakdown via temporary timing prints in csrc/allocator.cpp and csrc/ftensor.cpp showed ~99% of the gap lives in FTensor::init_with_zero_(): contiguous uses a compound page of page_size * num_layers * num_kv_buffers and makes 1947 cuMemMap calls at ~325 us each (= 632 ms); non-contiguous uses 2 MB pages and makes 62,304 cuMemMap calls at ~33 us each (= 2,054 ms). Driver per-call overhead dominates -- fewer larger calls win. - 3.3 Alloc/free hot path: contiguous ~2x faster on cold mapping (RESERVED=0) and ~2x at small k under default reserve, collapsing to ~1x at k=256. 3.4 then frames the trade-off as startup vs steady-state with an explicit break-even calculation: at the Section 1 workload, LAYOUT=true's 1.4 s startup advantage is paid off after about 45 requests, so LAYOUT=false strictly wins for any non-trivial serving workload. Concrete cases where LAYOUT=true still pays: smoke tests / single-shot inference (<~45 requests), frequent vLLM restarts, boot-time SLA, hybrid linear / mamba. Recommendation section tightened to acknowledge the startup cost explicitly rather than dismissing it as "doesn't show up at e2e". 
--- benchmarks/bench_layout/README.md | 79 ++++++++++++++++++++++++++++++- 1 file changed, 77 insertions(+), 2 deletions(-) diff --git a/benchmarks/bench_layout/README.md b/benchmarks/bench_layout/README.md index 023239d2..d5f4ddea 100644 --- a/benchmarks/bench_layout/README.md +++ b/benchmarks/bench_layout/README.md @@ -72,10 +72,85 @@ Per-call attention slowdown grows with concurrent working-set size — consisten `LAYOUT=false` is flat with workload because ~32 blocks share each page. Deeper models (32-layer Llama2-7B, 48+ layer Llama3-70B) cross the 2 MB page boundary even more sharply. -## 3. Patch needed for `LAYOUT=false` +## 3. When does `LAYOUT=true` win? + +The previous sections show non-contiguous wins big on attention. The contiguous layout still pays off on three things: + +### 3.1 Hybrid linear / mamba models — required + +Mamba layers share the same flat KV buffer as attention layers and index into it per virtual block across layers. `interfaces.py:138` detects hybrid-linear configs and refuses to run with `CONTIGUOUS_LAYOUT=false`. So for `HYBRID_LINEAR` model types there is no choice — contiguous is mandatory. + +### 3.2 Init time — `LAYOUT=true` is ~3× faster + +Contiguous reserves one big VM range; non-contiguous reserves `num_layers` separate ranges + a shared zero-page handle. Measured `alloc_kv_cache` time (ms) at varying `num_layers`, 1 GB per layer: + +| `num_layers` | `LAYOUT=true` | `LAYOUT=false` | Δ | +|--:|--:|--:|--:| +| 8 | 658 | 2,058 | +1,400 | +| 16 | 640 | 1,899 | +1,259 | +| 28 | 568 | 2,058 | +1,490 | +| 32 | 632 | 1,936 | +1,304 | +| 80 | 613 | 1,890 | +1,277 | + +The extra cost is a roughly flat **~1.4 s**, not linear in `num_layers` — looks like fixed-cost driver / metadata overhead per VM range plus a small per-layer term, with the per-layer term cheap enough that it stays in the noise up to 80 layers. One-shot at server startup; doesn't recur per request. + +### 3.3 Alloc/free hot path — `LAYOUT=true` is ~2× faster + +Each PageAllocator "page" maps to 1 cuMemMap call under contiguous, vs `num_layers × (K+V)` FTensor `map()` calls under non-contiguous. Measured with the same `bench_alloc.py` harness (16 layers): + +Cold path — `KVCACHED_MIN/MAX_RESERVED_PAGES=0` forces a fresh mapping every alloc: + +| k | `LAYOUT=true` μs | `LAYOUT=false` μs | ratio | +|--:|--:|--:|--:| +| 1 | 4527 | 9512 | 2.1× | +| 16 | 4554 | 9567 | 2.1× | +| 64 | 4526 | 9638 | 2.1× | +| 256 | 9240 | 19261 | 2.1× | + +Steady state — default `RESERVED=5/10`, mappings cached: + +| k | `LAYOUT=true` μs | `LAYOUT=false` μs | ratio | +|--:|--:|--:|--:| +| 1 | 43 | 87 | 2.0× | +| 16 | 37 | 94 | 2.5× | +| 64 | 38 | 64 | 1.7× | +| 256 | 52 | 59 | 1.1× | + +Holds ~2× at small k; collapses to ~1× at larger k as bookkeeping amortises. + +### 3.4 Net effect: startup vs steady-state + +Per-request attention overhead hits every decode step. Startup overhead hits once. So which layout wins on **total wall-clock including server boot** depends on how many requests you serve before tearing down. + +For the Section 1 workload: +- `LAYOUT=true` startup advantage: ~1.4 s (Section 3.2) +- `LAYOUT=false` throughput advantage: 14.17 vs 9.83 req/s, i.e. 31 ms saved per request on average + +Break-even: + +``` +t_true(N) = N / 9.83 +t_false(N) = 1.4 + N / 14.17 +N* ≈ 45 requests +``` + +For **>45 requests** total, `LAYOUT=false` is faster wall-clock even counting startup. For **<45**, `LAYOUT=true`'s faster boot wins on total time. 
The 45 figure depends on the model — deeper layers shift the attention gap upward (Section 2 "Scaling") and would lower the break-even further. + +So the trade-off is genuine but heavily favours `LAYOUT=false` for any real serving workload. Where `LAYOUT=true` actually pays off: + +- **Very short-lived runs** — smoke tests, debugging, single-shot inference with <50 requests +- **Frequent server restarts** — request-level autoscaling, model reloading +- **Boot-time SLA** — when every second to first-ready matters +- **Hybrid linear / mamba** — forced regardless + +For all other cases (long-running serving, throughput-bound deployments, latency-bound deployments under sustained load), the 1.4 s of extra startup is paid off within tens of requests and `LAYOUT=false` is strictly better thereafter. + +## 4. Patch needed for `LAYOUT=false` `kvcached/kv_cache_manager.py:107` previously hardcoded `contiguous_layout=True`. With `LAYOUT=false`, FTensor used per-layer buffers but `PageAllocator` still computed striped offsets → `cuMemUnmap (1): invalid argument` on first eviction. Fix: wire `CONTIGUOUS_LAYOUT` env var into the `KVCacheManager` constructor. ## Recommendation -`CONTIGUOUS_LAYOUT=true` is the kvcached default but is only needed for hybrid-linear/mamba models (`interfaces.py:138` already detects those and bails out of the non-contiguous path). On standard MHA/GQA/MLA, flipping the default to `false` eliminates the entire kvcached-vs-vanilla e2e overhead measured here. +For standard MHA/GQA/MLA, `CONTIGUOUS_LAYOUT=false` is the right default — it eliminates the entire kvcached-vs-vanilla e2e overhead at the cost of a few seconds of extra startup and microseconds of extra alloc-path latency, neither of which shows up in steady-state throughput. + +`CONTIGUOUS_LAYOUT=true` should remain the default only for hybrid-linear/mamba, which `interfaces.py:138` already detects. From 233b2c9ee6c804c91641f54bc739f08a31ead86c Mon Sep 17 00:00:00 2001 From: cui36 Date: Mon, 11 May 2026 11:32:21 -0500 Subject: [PATCH 14/15] bench_layout: drop section 4 (LAYOUT=false enablement already fixed) --- benchmarks/bench_layout/README.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/benchmarks/bench_layout/README.md b/benchmarks/bench_layout/README.md index d5f4ddea..3d273d07 100644 --- a/benchmarks/bench_layout/README.md +++ b/benchmarks/bench_layout/README.md @@ -145,10 +145,6 @@ So the trade-off is genuine but heavily favours `LAYOUT=false` for any real serv For all other cases (long-running serving, throughput-bound deployments, latency-bound deployments under sustained load), the 1.4 s of extra startup is paid off within tens of requests and `LAYOUT=false` is strictly better thereafter. -## 4. Patch needed for `LAYOUT=false` - -`kvcached/kv_cache_manager.py:107` previously hardcoded `contiguous_layout=True`. With `LAYOUT=false`, FTensor used per-layer buffers but `PageAllocator` still computed striped offsets → `cuMemUnmap (1): invalid argument` on first eviction. Fix: wire `CONTIGUOUS_LAYOUT` env var into the `KVCacheManager` constructor. - ## Recommendation For standard MHA/GQA/MLA, `CONTIGUOUS_LAYOUT=false` is the right default — it eliminates the entire kvcached-vs-vanilla e2e overhead at the cost of a few seconds of extra startup and microseconds of extra alloc-path latency, neither of which shows up in steady-state throughput. 
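
For reference (not part of the patch series), the Section 3.4 break-even falls out of the ~1.4 s startup gap and the two throughput figures quoted there:

```python
# Break-even between layouts, using the Section 3.4 figures. Illustrative only.
startup_gap_s = 1.4                    # extra alloc_kv_cache time paid by LAYOUT=false
tput_true, tput_false = 9.83, 14.17    # req/s

# LAYOUT=false wins once the per-request saving has repaid the startup gap:
#   N / tput_true = startup_gap_s + N / tput_false
per_req_saving_s = 1 / tput_true - 1 / tput_false
n_star = startup_gap_s / per_req_saving_s

print(round(per_req_saving_s * 1000), "ms saved per request")  # ~31
print(round(n_star), "requests to break even")                 # ~45
```
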
From 6b2bbbcc633d5205f7063fc05a2cd2241213d8a2 Mon Sep 17 00:00:00 2001 From: cui36 Date: Mon, 11 May 2026 15:38:43 -0500 Subject: [PATCH 15/15] bench_layout: tighten README, rename Recommendation -> Summary --- benchmarks/bench_layout/README.md | 140 +++++++++--------------------- 1 file changed, 42 insertions(+), 98 deletions(-) diff --git a/benchmarks/bench_layout/README.md b/benchmarks/bench_layout/README.md index 3d273d07..b23d1fde 100644 --- a/benchmarks/bench_layout/README.md +++ b/benchmarks/bench_layout/README.md @@ -1,152 +1,96 @@ # vLLM e2e + `KVCACHED_CONTIGUOUS_LAYOUT` overhead -Where the 30-50% kvcached-vs-vanilla e2e overhead comes from, and why `KVCACHED_CONTIGUOUS_LAYOUT=false` closes it. +Why kvcached is 30-50% slower than vanilla vLLM by default, and why `KVCACHED_CONTIGUOUS_LAYOUT=false` fixes it. For the alloc/free microbench, see [`../bench_alloc/README.md`](../bench_alloc/README.md). ## Setup -NVIDIA GB10 (aarch64). `Qwen/Qwen3-0.6B` (28 layers, 8 KV heads, head_dim 128, bf16). `vllm serve --gpu-memory-utilization 0.5 --max-model-len 2048`. Bench: `vllm bench serve` random 512in/128out, 500 prompts, 3 seeds, median. - -Variants compared: **vanilla** vs **kvcached** (Python allocator / C++ allocator / C++ + restored resize — all three behave identically at e2e). +GB10 (aarch64). `Qwen/Qwen3-0.6B` (28 layers, 8 KV heads, head_dim 128, bf16). `vllm serve --gpu-memory-utilization 0.5 --max-model-len 2048`. Bench: `vllm bench serve` random 512in/128out, 500 prompts, 3 seeds. ## Run ```bash # E2E sweep (vanilla vs kvcached × LAYOUT × reserved pool) -bash run_sweep.sh -python parse_results.py sweep_results/ +bash run_sweep.sh && python parse_results.py sweep_results/ # Kernel-level profile under both layouts bash run_nsys_layout.sh python diff_nsys_kernels.py nsys_runs/layout_false.nsys-rep nsys_runs/layout_true.nsys-rep ``` -Intermediate outputs (`sweep_results/`, `nsys_runs/`, ...) are reproducible from the scripts and not tracked in git. +Intermediate outputs aren't tracked in git — reproducible from the scripts. ## 1. The gap -`rate=inf`, 500 prompts: +500 prompts at `rate=inf`: | | tput (req/s) | TTFT mean (ms) | TPOT mean (ms) | |---|--:|--:|--:| | vanilla | 14.21 | 11575 | 119.3 | -| kvcached default (`LAYOUT=true`) | 9.87 (-31%) | 16555 | 177.5 | +| kvcached (`LAYOUT=true`, default) | 9.87 (-31%) | 16555 | 177.5 | | kvcached + `LAYOUT=false` | 14.17 (-1%) | 11642 | 119.0 | -`LAYOUT=false` matches vanilla on every metric, also at `rate=16` (sustained load). The C++ allocator from PR #319 only closes ~5% of the kvcached overhead; reserved-pool size has no measurable effect. **The entire 30% gap is the layout default.** - -## 2. Mechanism - -### Stride math (from `interfaces.py:282-289`) - -Under `CONTIGUOUS_LAYOUT=true`, the raw KV tensor has shape `[num_blocks, num_layers, k/v, token, head, dim]`. When sliced to layer `i`, block n→n+1 stride is `num_layers × per_block_bytes`. For Qwen3-0.6B: - -- per-block K+V, one layer = 16 × 8 × 128 × 2 = **64 KB** -- stride **`LAYOUT=true`** = 28 × 64 KB = **1.75 MB** (≈ VMM page size = 2 MB) -- stride **`LAYOUT=false`** = **64 KB** (~32 blocks share one 2 MB page) +`LAYOUT=false` matches vanilla on every metric, also at `rate=16` (sustained load). The C++ allocator from PR #319 only buys back ~5%; reserve-pool size doesn't help either. **It's all the layout.** -So every FlashAttention block read crosses a fresh 2 MB VMM page under `LAYOUT=true`; non-contiguous packs them densely. +## 2. 
Where the gap actually is -### nsys per-kernel attribution +### Stride math -Same workload as section 1. Total GPU kernel time grows by **+8,043 ms (+34.8%)** going from `LAYOUT=false → true`: +`CONTIGUOUS_LAYOUT=true` lays out KV as `[num_blocks, num_layers, k/v, token, head, dim]` (`interfaces.py:282-289`). When you slice down to one layer, block n→n+1 stride is `num_layers × per_block_bytes`. For Qwen3-0.6B: -| kernel | calls | `LAYOUT=false` ms | `LAYOUT=true` ms | Δms | Δ% | -|---|--:|--:|--:|--:|--:| -| `flash::flash_fwd_splitkv_kernel` (KV-read) | 3948 | 14,666 | **22,879** | **+8,213** | **+56.0%** | -| `cutlass_80_..._gemm_relu` | 7756 | 2,277 | 2,214 | -63 | -2.8% | -| `vllm::reshape_and_cache_flash_kernel` (KV-write) | 3948 | 302 | 271 | -32 | -10.5% | -| (everything else) | — | ~6,000 | ~6,000 | ~0 | flat | +- per-block K+V, one layer = 16·8·128·2 = **64 KB** +- stride under `LAYOUT=true` = 28 × 64 KB = **1.75 MB** (≈ VMM page = 2 MB) +- stride under `LAYOUT=false` = **64 KB** (~32 blocks share a page) -**One kernel — FlashAttention's split-KV read — accounts for more than the entire e2e gap.** Every other kernel is unchanged or noise. The KV-*write* kernel is even slightly faster under contiguous (writes one position per request, no stride pattern). - -### Scaling - -Per-call attention slowdown grows with concurrent working-set size — consistent with TLB/L2 pressure: - -| workload | `LAYOUT=false` μs/call | `LAYOUT=true` μs/call | Δ | -|---|--:|--:|--:| -| 100 prompts | 851 | 1163 | +37% | -| 500 prompts | 3,715 | 5,795 | **+56%** | +So under contiguous, every FlashAttention block read lands on its own fresh 2 MB page. Non-contiguous packs 32 blocks per page. The attention kernel can't hide that. -`LAYOUT=false` is flat with workload because ~32 blocks share each page. Deeper models (32-layer Llama2-7B, 48+ layer Llama3-70B) cross the 2 MB page boundary even more sharply. +### nsys per-kernel breakdown -## 3. When does `LAYOUT=true` win? +Same workload as Section 1. Going from `LAYOUT=false → true` adds **+8,043 ms (+34.8%)** total GPU kernel time, all in one kernel: -The previous sections show non-contiguous wins big on attention. The contiguous layout still pays off on three things: +| kernel | calls | LAYOUT=false ms | LAYOUT=true ms | Δ | +|---|--:|--:|--:|--:| +| `flash::flash_fwd_splitkv_kernel` (KV-read) | 3948 | 14,666 | 22,879 | **+8,213 (+56%)** | +| `vllm::reshape_and_cache_flash_kernel` (KV-write) | 3948 | 302 | 271 | -32 (-11%) | +| everything else | — | ~8,000 | ~8,000 | ~0 | -### 3.1 Hybrid linear / mamba models — required +That one kernel exceeds the entire gap. Worth noting: the KV-*write* kernel isn't affected — only the multi-block read path is. Writes are sequential per-position so they never hit the cross-page stride. -Mamba layers share the same flat KV buffer as attention layers and index into it per virtual block across layers. `interfaces.py:138` detects hybrid-linear configs and refuses to run with `CONTIGUOUS_LAYOUT=false`. So for `HYBRID_LINEAR` model types there is no choice — contiguous is mandatory. +### Scales with working set -### 3.2 Init time — `LAYOUT=true` is ~3× faster +Per-call attention time: -Contiguous reserves one big VM range; non-contiguous reserves `num_layers` separate ranges + a shared zero-page handle. 
Measured `alloc_kv_cache` time (ms) at varying `num_layers`, 1 GB per layer: +- 100 prompts: 1163 vs 851 μs (**+37%**) +- 500 prompts: 5795 vs 3715 μs (**+56%**) -| `num_layers` | `LAYOUT=true` | `LAYOUT=false` | Δ | -|--:|--:|--:|--:| -| 8 | 658 | 2,058 | +1,400 | -| 16 | 640 | 1,899 | +1,259 | -| 28 | 568 | 2,058 | +1,490 | -| 32 | 632 | 1,936 | +1,304 | -| 80 | 613 | 1,890 | +1,277 | +More concurrent requests → larger working set → more distinct 2 MB pages touched → worse TLB/L2 hit rate. `LAYOUT=false` stays flat because 32 blocks share one page. Deeper models (Llama2-7B at 32 layers, Llama3-70B at 80) cross the page boundary even harder. -The extra cost is a roughly flat **~1.4 s**, not linear in `num_layers` — looks like fixed-cost driver / metadata overhead per VM range plus a small per-layer term, with the per-layer term cheap enough that it stays in the noise up to 80 layers. One-shot at server startup; doesn't recur per request. +## 3. Where `LAYOUT=true` still wins -### 3.3 Alloc/free hot path — `LAYOUT=true` is ~2× faster +Three things to put on the other side of the scale. -Each PageAllocator "page" maps to 1 cuMemMap call under contiguous, vs `num_layers × (K+V)` FTensor `map()` calls under non-contiguous. Measured with the same `bench_alloc.py` harness (16 layers): +**Hybrid linear / mamba: required.** Mamba state shares the KV buffer and indexes by virtual block across layers. `interfaces.py:138` outright refuses non-contiguous for hybrid-linear configs. -Cold path — `KVCACHED_MIN/MAX_RESERVED_PAGES=0` forces a fresh mapping every alloc: +**Init time: ~1.4 s faster at server boot.** Contiguous reserves one big VM range; non-contiguous reserves `num_layers` separate ones. Measured `alloc_kv_cache` (16 layers, 1 GB/layer): 635 ms vs 2055 ms. ~99% of that 1.4 s is `FTensor::init_with_zero_()` mapping the zero-page over the entire VM range — contiguous uses a 64 MB compound page so it makes 1947 `cuMemMap` calls (~325 μs each); non-contiguous uses 2 MB pages and makes 62,304 calls (~33 μs each). CUDA driver per-call overhead is the dominant cost, and bigger pages amortise it better. -| k | `LAYOUT=true` μs | `LAYOUT=false` μs | ratio | -|--:|--:|--:|--:| -| 1 | 4527 | 9512 | 2.1× | -| 16 | 4554 | 9567 | 2.1× | -| 64 | 4526 | 9638 | 2.1× | -| 256 | 9240 | 19261 | 2.1× | +The gap stays roughly flat across `num_layers ∈ {8..80}` (1.3–1.5 s), one-shot at startup. -Steady state — default `RESERVED=5/10`, mappings cached: +**Alloc/free hot path: ~2× faster.** Each page mapping under contiguous = 1 `cuMemMap`; under non-contiguous = `num_layers × (K+V)` FTensor `map()` calls. Cold path (`RESERVED=0`) shows a consistent 2.1× ratio; steady-state at small `k` is similar, collapsing to ~1× at `k=256`. -| k | `LAYOUT=true` μs | `LAYOUT=false` μs | ratio | -|--:|--:|--:|--:| -| 1 | 43 | 87 | 2.0× | -| 16 | 37 | 94 | 2.5× | -| 64 | 38 | 64 | 1.7× | -| 256 | 52 | 59 | 1.1× | - -Holds ~2× at small k; collapses to ~1× at larger k as bookkeeping amortises. - -### 3.4 Net effect: startup vs steady-state - -Per-request attention overhead hits every decode step. Startup overhead hits once. So which layout wins on **total wall-clock including server boot** depends on how many requests you serve before tearing down. - -For the Section 1 workload: -- `LAYOUT=true` startup advantage: ~1.4 s (Section 3.2) -- `LAYOUT=false` throughput advantage: 14.17 vs 9.83 req/s, i.e. 
31 ms saved per request on average - -Break-even: - -``` -t_true(N) = N / 9.83 -t_false(N) = 1.4 + N / 14.17 -N* ≈ 45 requests -``` +### When does the trade-off flip? -For **>45 requests** total, `LAYOUT=false` is faster wall-clock even counting startup. For **<45**, `LAYOUT=true`'s faster boot wins on total time. The 45 figure depends on the model — deeper layers shift the attention gap upward (Section 2 "Scaling") and would lower the break-even further. +Attention overhead hits every decode step. Startup hits once. For the Section 1 workload: -So the trade-off is genuine but heavily favours `LAYOUT=false` for any real serving workload. Where `LAYOUT=true` actually pays off: +- `LAYOUT=true` startup advantage: ~1.4 s +- `LAYOUT=false` throughput advantage: 14.17 vs 9.83 req/s, ≈ 31 ms/req -- **Very short-lived runs** — smoke tests, debugging, single-shot inference with <50 requests -- **Frequent server restarts** — request-level autoscaling, model reloading -- **Boot-time SLA** — when every second to first-ready matters -- **Hybrid linear / mamba** — forced regardless +Break-even at **~45 requests**. Above that, non-contiguous wins on total wall-clock; below, contiguous's faster boot wins. Deeper models shift the break-even down further. -For all other cases (long-running serving, throughput-bound deployments, latency-bound deployments under sustained load), the 1.4 s of extra startup is paid off within tens of requests and `LAYOUT=false` is strictly better thereafter. +So contiguous still wins for: smoke tests, single-shot inference, request-level autoscaling, boot-SLA workloads, hybrid linear/mamba (forced). Everything else: non-contiguous. -## Recommendation +## Summary -For standard MHA/GQA/MLA, `CONTIGUOUS_LAYOUT=false` is the right default — it eliminates the entire kvcached-vs-vanilla e2e overhead at the cost of a few seconds of extra startup and microseconds of extra alloc-path latency, neither of which shows up in steady-state throughput. +The kvcached default `CONTIGUOUS_LAYOUT=true` costs ~30% e2e throughput on standard MHA/GQA/MLA because every FlashAttention block read crosses a fresh 2 MB VMM page. Flipping to `LAYOUT=false` closes the gap entirely, at the price of ~1.4 s extra startup that's paid off in tens of requests. -`CONTIGUOUS_LAYOUT=true` should remain the default only for hybrid-linear/mamba, which `interfaces.py:138` already detects. +The default should flip to `false` for non-hybrid models; `interfaces.py:138` already handles the hybrid-linear case where contiguous is mandatory.
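
A closing worked check (not part of the patch series): the ~1.4 s init-time gap in Section 3 of the final README is pure `cuMemMap` call-count arithmetic, using the per-call costs quoted in the "Init time" paragraph:

```python
# Init-time gap as cuMemMap call-count arithmetic (figures from Section 3).
page_2mb      = 2 * 1024 * 1024
compound_page = page_2mb * 16 * 2           # page_size x num_layers x num_kv_buffers

contiguous_ms     = 1947  * 325 / 1000      # 1,947 calls x ~325 us each
non_contiguous_ms = 62304 * 33  / 1000      # 62,304 calls x ~33 us each

print(compound_page // 2**20, "MB compound page under LAYOUT=true")   # 64
print(round(contiguous_ms), "ms vs", round(non_contiguous_ms), "ms")  # 633 vs 2056
print(round(non_contiguous_ms - contiguous_ms), "ms gap")             # ~1423, i.e. ~1.4 s
```
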